xlog.c 287.2 KB
Newer Older
1
/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.432 2010/08/26 19:23:41 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <ctype.h>
T
Tom Lane 已提交
18
#include <signal.h>
19
#include <time.h>
20
#include <fcntl.h>
21
#include <sys/stat.h>
22
#include <sys/time.h>
23 24
#include <sys/wait.h>
#include <unistd.h>
25

26
#include "access/clog.h"
27
#include "access/multixact.h"
28
#include "access/subtrans.h"
29
#include "access/transam.h"
30
#include "access/tuptoaster.h"
31
#include "access/twophase.h"
32
#include "access/xact.h"
33
#include "access/xlog_internal.h"
34
#include "access/xlogutils.h"
35
#include "catalog/catversion.h"
T
Tom Lane 已提交
36
#include "catalog/pg_control.h"
37
#include "catalog/pg_database.h"
38 39
#include "catalog/pg_type.h"
#include "funcapi.h"
40
#include "libpq/pqsignal.h"
41
#include "miscadmin.h"
42
#include "pgstat.h"
43
#include "postmaster/bgwriter.h"
44 45
#include "replication/walreceiver.h"
#include "replication/walsender.h"
46
#include "storage/bufmgr.h"
47
#include "storage/fd.h"
48
#include "storage/ipc.h"
49
#include "storage/pmsignal.h"
50
#include "storage/procarray.h"
51
#include "storage/smgr.h"
52
#include "storage/spin.h"
53
#include "utils/builtins.h"
54
#include "utils/guc.h"
55
#include "utils/ps_status.h"
56
#include "utils/relmapper.h"
57
#include "pg_trace.h"
58

59

60 61
/* File path names (all relative to $PGDATA) */
#define BACKUP_LABEL_FILE		"backup_label"
62
#define BACKUP_LABEL_OLD		"backup_label.old"
63 64 65 66
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"


T
Tom Lane 已提交
67 68
/* User-settable parameters */
int			CheckPointSegments = 3;
69
int			wal_keep_segments = 0;
V
Vadim B. Mikheev 已提交
70
int			XLOGbuffers = 8;
71
int			XLogArchiveTimeout = 0;
72
bool		XLogArchiveMode = false;
73
char	   *XLogArchiveCommand = NULL;
74
bool		EnableHotStandby = false;
75
bool		fullPageWrites = true;
76
bool		log_checkpoints = false;
77
int			sync_method = DEFAULT_SYNC_METHOD;
78
int			wal_level = WAL_LEVEL_MINIMAL;
T
Tom Lane 已提交
79

80 81 82 83
#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

84
/*
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
 * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
 */
#define XLOGfileslop	(2*CheckPointSegments + 1)

97 98 99
/*
 * GUC support
 */
100 101 102 103 104 105 106
/*
 * Table pairing each accepted wal_level spelling with its WAL_LEVEL_*
 * constant.  The {NULL, 0, false} entry terminates the list.
 * NOTE(review): presumably consumed by the GUC machinery for the wal_level
 * variable; the third member is false for every entry and its meaning is
 * defined by struct config_enum_entry elsewhere -- confirm before changing.
 */
const struct config_enum_entry wal_level_options[] = {
	{"minimal", WAL_LEVEL_MINIMAL, false},
	{"archive", WAL_LEVEL_ARCHIVE, false},
	{"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
	{NULL, 0, false}
};

107
const struct config_enum_entry sync_method_options[] = {
108
	{"fsync", SYNC_METHOD_FSYNC, false},
109
#ifdef HAVE_FSYNC_WRITETHROUGH
110
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
111 112
#endif
#ifdef HAVE_FDATASYNC
113
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
114 115
#endif
#ifdef OPEN_SYNC_FLAG
116
	{"open_sync", SYNC_METHOD_OPEN, false},
117 118
#endif
#ifdef OPEN_DATASYNC_FLAG
119
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
120
#endif
121
	{NULL, 0, false}
122
};
T
Tom Lane 已提交
123

124 125 126 127 128 129 130
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the background writer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

T
Tom Lane 已提交
131
/*
132 133
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
T
Tom Lane 已提交
134
 */
135
TimeLineID	ThisTimeLineID = 0;
V
WAL  
Vadim B. Mikheev 已提交
136

137
/*
138
 * Are we doing recovery from XLOG?
139
 *
140 141 142 143 144
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
145 146
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
147
 */
T
Tom Lane 已提交
148
bool		InRecovery = false;
B
Bruce Momjian 已提交
149

150
/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
B
Bruce Momjian 已提交
151
HotStandbyState standbyState = STANDBY_DISABLED;
152

B
Bruce Momjian 已提交
153
static XLogRecPtr LastRec;
154

155 156
/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
157
 * known, need to check the shared state".
158 159 160
 */
static bool LocalRecoveryInProgress = true;

161 162 163 164 165 166
/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
B
Bruce Momjian 已提交
167
 * is not in progress.	But we can also force the value for special cases.
168 169 170 171 172 173 174 175
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;

176
/* Was the last xlog file restored from archive, or local? */
B
Bruce Momjian 已提交
177
static bool restoredFromArchive = false;
178

179
/* options taken from recovery.conf for archive recovery */
180
static char *recoveryRestoreCommand = NULL;
181
static char *recoveryEndCommand = NULL;
182
static char *archiveCleanupCommand = NULL;
183
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
184
static bool recoveryTargetInclusive = true;
B
Bruce Momjian 已提交
185
static TransactionId recoveryTargetXid;
186
static TimestampTz recoveryTargetTime;
187

188 189 190
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyMode = false;
static char *PrimaryConnInfo = NULL;
191
static char *TriggerFile = NULL;
192

193
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
B
Bruce Momjian 已提交
194
static TransactionId recoveryStopXid;
195
static TimestampTz recoveryStopTime;
B
Bruce Momjian 已提交
196
static bool recoveryStopAfter;
197 198 199 200 201 202 203 204 205 206 207 208 209

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 * its known parents, newest first (so recoveryTargetTLI is always the
B
Bruce Momjian 已提交
210
 * first list member).	Only these TLIs are expected to be seen in the WAL
211 212 213 214 215 216 217 218 219
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
B
Bruce Momjian 已提交
220 221 222
static TimeLineID recoveryTargetTLI;
static List *expectedTLIs;
static TimeLineID curFileTLI;
223

T
Tom Lane 已提交
224 225
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
226 227 228 229
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
T
Tom Lane 已提交
230 231
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
232

233
XLogRecPtr	XactLastRecEnd = {0, 0};
234

T
Tom Lane 已提交
235 236 237
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
238
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
239
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
B
Bruce Momjian 已提交
240
 * hold the Insert lock).  See XLogInsert for details.	We are also allowed
241
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
242 243
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
T
Tom Lane 已提交
244
 */
245
static XLogRecPtr RedoRecPtr;
246

247 248 249 250 251 252 253 254 255 256 257 258
/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = {0, 0};

T
Tom Lane 已提交
259 260 261 262 263 264 265 266 267
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
268
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
269 270 271
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
272 273 274 275
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
276 277 278
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
279 280
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
281 282 283
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
284
 * but is updated when convenient.	Again, it exists for the convenience of
285
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
286 287 288 289 290 291 292 293 294 295
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
296 297 298 299 300 301 302 303 304 305 306 307 308
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
309
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
310 311
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the bgwriter, this is just pro forma).
312
 *
T
Tom Lane 已提交
313 314
 *----------
 */
315

T
Tom Lane 已提交
316
typedef struct XLogwrtRqst
317
{
T
Tom Lane 已提交
318 319
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
320
} XLogwrtRqst;
321

322 323 324 325 326 327
/*
 * Completed write/flush positions: the byte positions the log has already
 * been written out and flushed up to.  Same layout as XLogwrtRqst, which
 * carries the requested positions.
 */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

T
Tom Lane 已提交
328 329 330
/*
 * Shared state data for XLogInsert.
 */
331 332
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
333 334
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
335
	int			curridx;		/* current block index in cache */
B
Bruce Momjian 已提交
336 337 338
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
339
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
340 341
} XLogCtlInsert;

T
Tom Lane 已提交
342 343 344
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
345 346
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
347 348
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	int			curridx;		/* cache index of next block to write */
349
	pg_time_t	lastSegSwitchTime;		/* time of last xlog segment switch */
350 351
} XLogCtlWrite;

T
Tom Lane 已提交
352 353 354
/*
 * Total shared-memory state for XLOG.
 */
355 356
typedef struct XLogCtlData
{
357
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
358
	XLogCtlInsert Insert;
359

T
Tom Lane 已提交
360
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
361 362
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
363 364
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
365
	XLogRecPtr	asyncXactLSN; /* LSN of newest async commit/abort */
B
Bruce Momjian 已提交
366
	uint32		lastRemovedLog; /* latest removed/recycled XLOG segment */
367
	uint32		lastRemovedSeg;
368

369
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
370 371
	XLogCtlWrite Write;

T
Tom Lane 已提交
372
	/*
B
Bruce Momjian 已提交
373 374 375
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  Permission to read/write the pages
	 * and xlblocks values depends on WALInsertLock and WALWriteLock.
T
Tom Lane 已提交
376
	 */
B
Bruce Momjian 已提交
377
	char	   *pages;			/* buffers for unwritten XLOG pages */
378
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
379
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
380
	TimeLineID	ThisTimeLineID;
381
	TimeLineID	RecoveryTargetTLI;
B
Bruce Momjian 已提交
382

383
	/*
384
	 * archiveCleanupCommand is read from recovery.conf but needs to be in
385 386
	 * shared memory so that the bgwriter process can access it.
	 */
387
	char		archiveCleanupCommand[MAXPGPATH];
T
Tom Lane 已提交
388

389 390
	/*
	 * SharedRecoveryInProgress indicates if we're still in crash or archive
391
	 * recovery.  Protected by info_lck.
392 393 394 395
	 */
	bool		SharedRecoveryInProgress;

	/*
396 397
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * Used by the background writer when it wants to create a restartpoint.
398 399 400 401 402 403 404 405
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;

	/* end+1 of the last record replayed (or being replayed) */
	XLogRecPtr	replayEndRecPtr;
406 407
	/* end+1 of the last record replayed */
	XLogRecPtr	recoveryLastRecPtr;
408 409
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;
410

411
	slock_t		info_lck;		/* locks shared variables shown above */
412 413
} XLogCtlData;

414
static XLogCtlData *XLogCtl = NULL;
415

416
/*
T
Tom Lane 已提交
417
 * We maintain an image of pg_control in shared memory.
418
 */
419
static ControlFileData *ControlFile = NULL;
420

T
Tom Lane 已提交
421 422 423 424 425
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
426

T
Tom Lane 已提交
427 428
/* Free space remaining in the current xlog page buffer */
/*
 * Insert must point to an object with currpos and currpage members; the
 * argument is expanded twice, so it must be free of side effects.
 */
#define INSERT_FREESPACE(Insert)  \
	(XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * The result is written into recptr: xlogid is copied from
 * XLogCtl->xlblocks[curridx], and xrecoff is that entry's xrecoff minus the
 * free space left in the current page.  recptr and curridx are each expanded
 * twice, so pass side-effect-free expressions.  Reads the file-level XLogCtl
 * pointer directly.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)

/*
 * Index of the buffer before idx, wrapping from 0 back to
 * XLogCtl->XLogCacheBlck.  idx is expanded twice.
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/*
 * Index of the buffer after idx, wrapping from XLogCtl->XLogCacheBlck
 * back to 0.  idx is expanded twice.
 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
444

T
Tom Lane 已提交
445 446 447 448
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
449
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
450

451 452 453 454 455 456 457 458 459
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  These are chosen so that they can be OR'd together
 * in a bitmask state variable.
 */
#define XLOG_FROM_ARCHIVE		(1<<0)	/* Restored using restore_command */
#define XLOG_FROM_PG_XLOG		(1<<1)	/* Existing file in pg_xlog */
#define XLOG_FROM_STREAM		(1<<2)	/* Streamed from master */

T
Tom Lane 已提交
460 461 462 463 464 465 466 467 468 469
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
470

T
Tom Lane 已提交
471 472 473 474
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
475
 * will be just past that page. readLen indicates how much of the current
476 477
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
T
Tom Lane 已提交
478
 */
479 480 481 482
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
483
static uint32 readLen = 0;
B
Bruce Momjian 已提交
484
static int	readSource = 0;		/* XLOG_FROM_* code */
B
Bruce Momjian 已提交
485

486 487 488 489
/*
 * Keeps track of which sources we've tried to read the current WAL
 * record from and failed.
 */
B
Bruce Momjian 已提交
490
static int	failedSources = 0;	/* OR of XLOG_FROM_* codes */
491 492 493 494 495 496 497 498

/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.)
 */
static TimestampTz XLogReceiptTime = 0;
B
Bruce Momjian 已提交
499
static int	XLogReceiptSource = 0;		/* XLOG_FROM_* code */
B
Bruce Momjian 已提交
500

501
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
T
Tom Lane 已提交
502
static char *readBuf = NULL;
B
Bruce Momjian 已提交
503

504 505 506 507
/* Buffer for current ReadRecord result (expandable) */
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;

T
Tom Lane 已提交
508
/* State information for XLOG reading */
B
Bruce Momjian 已提交
509 510
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
511
static TimeLineID lastPageTLI = 0;
512

513 514 515
static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
516
static bool reachedMinRecoveryPoint = false;
517

V
WAL  
Vadim B. Mikheev 已提交
518 519
static bool InRedo = false;

520 521 522
/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

523 524 525 526 527 528 529 530 531 532 533 534
/*
 * Information logged when we detect a change in one of the parameters
 * important for Hot Standby.
 */
/*
 * One int per tracked parameter.
 * NOTE(review): each member presumably holds the value of the
 * like-named configuration variable at the time the change was logged --
 * confirm against the code that fills this struct in.
 */
typedef struct xl_parameter_change
{
	int			MaxConnections;
	int			max_prepared_xacts;
	int			max_locks_per_xact;
	int			wal_level;		/* presumably one of the WAL_LEVEL_* values --
								 * confirm */
} xl_parameter_change;

535
/*
536
 * Flags set by interrupt handlers for later service in the redo loop.
537
 */
538
static volatile sig_atomic_t got_SIGHUP = false;
539
static volatile sig_atomic_t shutdown_requested = false;
540

541 542
/*
 * Flag set when executing a restore command, to tell SIGTERM signal handler
543
 * that it's safe to just proc_exit.
544 545 546
 */
static volatile sig_atomic_t in_restore_command = false;

547

548 549
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
550 551
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
552 553
static void XLogArchiveCleanup(const char *xlog);
static void readRecoveryCommandFile(void);
554
static void exitArchiveRecovery(TimeLineID endTLI,
B
Bruce Momjian 已提交
555
					uint32 endLogId, uint32 endLogSeg);
556
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
557 558
static void SetLatestXTime(TimestampTz xtime);
static TimestampTz GetLatestXTime(void);
559 560
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
561
static void LocalSetXLogInsertAllowed(void);
562
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
T
Tom Lane 已提交
563

564
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
B
Bruce Momjian 已提交
565
				XLogRecPtr *lsn, BkpBlock *bkpb);
566
static bool AdvanceXLInsertBuffer(bool new_segment);
567
static bool XLogCheckpointNeeded(uint32 logid, uint32 logseg);
568
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
569 570
static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
571
					   bool use_lock);
B
Bruce Momjian 已提交
572
static int XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
573
			 int source, bool notexistOk);
B
Bruce Momjian 已提交
574
static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
575
				   int sources);
576 577
static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess);
B
Bruce Momjian 已提交
578
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
B
Bruce Momjian 已提交
579
static void XLogFileClose(void);
580
static bool RestoreArchivedFile(char *path, const char *xlogfname,
B
Bruce Momjian 已提交
581
					const char *recovername, off_t expectedSize);
582 583
static void ExecuteRecoveryCommand(char *command, char *commandName,
					   bool failOnerror);
584 585
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
586
static void UpdateLastRemovedPtr(char *filename);
587
static void ValidateXLOGDirectoryStructure(void);
588
static void CleanupBackupHistory(void);
589
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
590
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
591
static void CheckRecoveryConsistency(void);
592
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
593
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
594 595 596 597
static List *readTimeLineHistory(TimeLineID targetTLI);
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
B
Bruce Momjian 已提交
598 599
					 TimeLineID endTLI,
					 uint32 endLogId, uint32 endLogSeg);
T
Tom Lane 已提交
600 601
static void WriteControlFile(void);
static void ReadControlFile(void);
602
static char *str_time(pg_time_t tnow);
603
static bool CheckForStandbyTrigger(void);
604

605
#ifdef WAL_DEBUG
606
static void xlog_outrec(StringInfo buf, XLogRecord *record);
607
#endif
608
static void pg_start_backup_callback(int code, Datum arg);
609
static bool read_backup_label(XLogRecPtr *checkPointLoc);
610
static void rm_redo_error_callback(void *arg);
611
static int	get_sync_bit(int method);
T
Tom Lane 已提交
612 613 614 615 616


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
617
 * the rdata chain (see xlog.h for notes about rdata).
T
Tom Lane 已提交
618 619 620 621 622 623 624 625 626 627 628
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
629
XLogRecPtr
630
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
631
{
B
Bruce Momjian 已提交
632 633
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
634
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
635 636 637
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
638
	int			curridx;
B
Bruce Momjian 已提交
639 640 641 642 643
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
644 645 646 647
	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
	pg_crc32	rdata_crc;
B
Bruce Momjian 已提交
648 649 650 651
	uint32		len,
				write_len;
	unsigned	i;
	bool		updrqst;
652
	bool		doPageWrites;
653
	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
V
Vadim B. Mikheev 已提交
654

655
	/* cross-check on whether we should be here or not */
656 657
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");
658

659
	/* info's high bits are reserved for use by me */
V
Vadim B. Mikheev 已提交
660
	if (info & XLR_INFO_MASK)
661
		elog(PANIC, "invalid xlog info mask %02X", info);
V
Vadim B. Mikheev 已提交
662

663 664
	TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

T
Tom Lane 已提交
665
	/*
B
Bruce Momjian 已提交
666 667
	 * In bootstrap mode, we don't actually log anything but XLOG resources;
	 * return a phony record pointer.
T
Tom Lane 已提交
668
	 */
V
Vadim B. Mikheev 已提交
669
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
670 671
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
672
		RecPtr.xrecoff = SizeOfXLogLongPHD;		/* start of 1st chkpt record */
673
		return RecPtr;
V
WAL  
Vadim B. Mikheev 已提交
674 675
	}

T
Tom Lane 已提交
676
	/*
677
	 * Here we scan the rdata chain, determine which buffers must be backed
T
Tom Lane 已提交
678
	 * up, and compute the CRC values for the data.  Note that the record
B
Bruce Momjian 已提交
679 680 681 682
	 * header isn't added into the CRC initially since we don't know the final
	 * length or info bits quite yet.  Thus, the CRC will represent the CRC of
	 * the whole record in the order "rdata, then backup blocks, then record
	 * header".
T
Tom Lane 已提交
683
	 *
684 685 686 687 688
	 * We may have to loop back to here if a race condition is detected below.
	 * We could prevent the race by doing all this work while holding the
	 * insert lock, but it seems better to avoid doing CRC calculations while
	 * holding the lock.  This means we have to be careful about modifying the
	 * rdata chain until we know we aren't going to loop back again.  The only
B
Bruce Momjian 已提交
689 690 691 692 693
	 * change we allow ourselves to make earlier is to set rdt->data = NULL in
	 * chain items we have decided we will have to back up the whole buffer
	 * for.  This is OK because we will certainly decide the same thing again
	 * for those items if we do it over; doing it here saves an extra pass
	 * over the chain later.
T
Tom Lane 已提交
694
	 */
695
begin:;
T
Tom Lane 已提交
696 697 698 699 700 701
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

702 703 704 705 706 707 708 709
	/*
	 * Decide if we need to do full-page writes in this XLOG record: true if
	 * full_page_writes is on or we have a PITR request for it.  Since we
	 * don't yet have the insert lock, forcePageWrites could change under us,
	 * but we'll recheck it once we have the lock.
	 */
	doPageWrites = fullPageWrites || Insert->forcePageWrites;

710
	INIT_CRC32(rdata_crc);
T
Tom Lane 已提交
711
	len = 0;
B
Bruce Momjian 已提交
712
	for (rdt = rdata;;)
713 714 715
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
716
			/* Simple data, just include it */
717
			len += rdt->len;
718
			COMP_CRC32(rdata_crc, rdt->data, rdt->len);
719
		}
T
Tom Lane 已提交
720
		else
721
		{
T
Tom Lane 已提交
722 723
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
724
			{
T
Tom Lane 已提交
725
				if (rdt->buffer == dtbuf[i])
726
				{
727
					/* Buffer already referenced by earlier chain item */
T
Tom Lane 已提交
728 729 730 731 732
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
733
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
734 735
					}
					break;
736
				}
T
Tom Lane 已提交
737
				if (dtbuf[i] == InvalidBuffer)
738
				{
T
Tom Lane 已提交
739 740
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
741 742
					if (XLogCheckBuffer(rdt, doPageWrites,
										&(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
T
Tom Lane 已提交
743 744 745 746 747 748 749
					{
						dtbuf_bkp[i] = true;
						rdt->data = NULL;
					}
					else if (rdt->data)
					{
						len += rdt->len;
750
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
751 752
					}
					break;
753 754
				}
			}
T
Tom Lane 已提交
755
			if (i >= XLR_MAX_BKP_BLOCKS)
756
				elog(PANIC, "can backup at most %d blocks per xlog record",
T
Tom Lane 已提交
757
					 XLR_MAX_BKP_BLOCKS);
758
		}
759
		/* Break out of loop when rdt points to last chain item */
760 761 762 763 764
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797
	/*
	 * Now add the backup block headers and data into the CRC
	 */
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (dtbuf_bkp[i])
		{
			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
			char	   *page;

			COMP_CRC32(rdata_crc,
					   (char *) bkpb,
					   sizeof(BkpBlock));
			page = (char *) BufferGetBlock(dtbuf[i]);
			if (bkpb->hole_length == 0)
			{
				COMP_CRC32(rdata_crc,
						   page,
						   BLCKSZ);
			}
			else
			{
				/* must skip the hole */
				COMP_CRC32(rdata_crc,
						   page,
						   bkpb->hole_offset);
				COMP_CRC32(rdata_crc,
						   page + (bkpb->hole_offset + bkpb->hole_length),
						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
			}
		}
	}

T
Tom Lane 已提交
798
	/*
799 800
	 * NOTE: We disallow len == 0 because it provides a useful bit of extra
	 * error checking in ReadRecord.  This means that all callers of
B
Bruce Momjian 已提交
801 802 803
	 * XLogInsert must supply at least some not-in-a-buffer data.  However, we
	 * make an exception for XLOG SWITCH records because we don't want them to
	 * ever cross a segment boundary.
T
Tom Lane 已提交
804
	 */
805
	if (len == 0 && !isLogSwitch)
806
		elog(PANIC, "invalid xlog record length %u", len);
807

808
	START_CRIT_SECTION();
809

810 811 812
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
813
	/*
B
Bruce Momjian 已提交
814 815 816
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to go
	 * back and recompute everything.  This can only happen just after a
	 * checkpoint, so it's better to be slow in this case and fast otherwise.
817 818
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
B
Bruce Momjian 已提交
819 820
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.
T
Tom Lane 已提交
821 822
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
823
	{
T
Tom Lane 已提交
824 825 826
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

827
		if (doPageWrites)
828
		{
829
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
T
Tom Lane 已提交
830
			{
831 832 833 834 835 836 837 838 839 840 841 842 843
				if (dtbuf[i] == InvalidBuffer)
					continue;
				if (dtbuf_bkp[i] == false &&
					XLByteLE(dtbuf_lsn[i], RedoRecPtr))
				{
					/*
					 * Oops, this buffer now needs to be backed up, but we
					 * didn't think so above.  Start over.
					 */
					LWLockRelease(WALInsertLock);
					END_CRIT_SECTION();
					goto begin;
				}
T
Tom Lane 已提交
844
			}
845 846 847
		}
	}

848
	/*
B
Bruce Momjian 已提交
849 850 851 852
	 * Also check to see if forcePageWrites was just turned on; if we weren't
	 * already doing full-page writes then go back and recompute. (If it was
	 * just turned off, we could recompute the record without full pages, but
	 * we choose not to bother.)
853 854 855 856 857 858 859 860 861
	 */
	if (Insert->forcePageWrites && !doPageWrites)
	{
		/* Oops, must redo it with full-page data */
		LWLockRelease(WALInsertLock);
		END_CRIT_SECTION();
		goto begin;
	}

T
Tom Lane 已提交
862
	/*
B
Bruce Momjian 已提交
863 864 865 866
	 * Make additional rdata chain entries for the backup blocks, so that we
	 * don't need to special-case them in the write loop.  Note that we have
	 * now irrevocably changed the input rdata chain.  At the exit of this
	 * loop, write_len includes the backup block data.
T
Tom Lane 已提交
867
	 *
868 869 870
	 * Also set the appropriate info bits to show which buffers were backed
	 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
	 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
T
Tom Lane 已提交
871 872 873
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
874
	{
875 876 877
		BkpBlock   *bkpb;
		char	   *page;

878
		if (!dtbuf_bkp[i])
879 880
			continue;

T
Tom Lane 已提交
881
		info |= XLR_SET_BKP_BLOCK(i);
882

883 884 885 886 887
		bkpb = &(dtbuf_xlg[i]);
		page = (char *) BufferGetBlock(dtbuf[i]);

		rdt->next = &(dtbuf_rdt1[i]);
		rdt = rdt->next;
888

889 890
		rdt->data = (char *) bkpb;
		rdt->len = sizeof(BkpBlock);
T
Tom Lane 已提交
891
		write_len += sizeof(BkpBlock);
892

893 894
		rdt->next = &(dtbuf_rdt2[i]);
		rdt = rdt->next;
895

896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
		if (bkpb->hole_length == 0)
		{
			rdt->data = page;
			rdt->len = BLCKSZ;
			write_len += BLCKSZ;
			rdt->next = NULL;
		}
		else
		{
			/* must skip the hole */
			rdt->data = page;
			rdt->len = bkpb->hole_offset;
			write_len += bkpb->hole_offset;

			rdt->next = &(dtbuf_rdt3[i]);
			rdt = rdt->next;

			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
			write_len += rdt->len;
			rdt->next = NULL;
		}
918 919
	}

920 921 922 923 924 925 926
	/*
	 * If we backed up any full blocks and online backup is not in progress,
	 * mark the backup blocks as removable.  This allows the WAL archiver to
	 * know whether it is safe to compress archived WAL data by transforming
	 * full-block records into the non-full-block format.
	 *
	 * Note: we could just set the flag whenever !forcePageWrites, but
B
Bruce Momjian 已提交
927 928
	 * defining it like this leaves the info bit free for some potential other
	 * use in records without any backup blocks.
929 930 931 932
	 */
	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
		info |= XLR_BKP_REMOVABLE;

933
	/*
934
	 * If there isn't enough space on the current XLOG page for a record
B
Bruce Momjian 已提交
935
	 * header, advance to the next page (leaving the unused space as zeroes).
936
	 */
T
Tom Lane 已提交
937 938
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
939 940
	if (freespace < SizeOfXLogRecord)
	{
941
		updrqst = AdvanceXLInsertBuffer(false);
942 943 944
		freespace = INSERT_FREESPACE(Insert);
	}

945
	/* Compute record's XLOG location */
T
Tom Lane 已提交
946
	curridx = Insert->curridx;
947 948 949
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
B
Bruce Momjian 已提交
950 951 952 953 954
	 * If the record is an XLOG_SWITCH, and we are exactly at the start of a
	 * segment, we need not insert it (and don't want to because we'd like
	 * consecutive switch requests to be no-ops).  Instead, make sure
	 * everything is written and flushed through the end of the prior segment,
	 * and return the prior segment's end address.
955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985
	 */
	if (isLogSwitch &&
		(RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
	{
		/* We can release insert lock immediately */
		LWLockRelease(WALInsertLock);

		RecPtr.xrecoff -= SizeOfXLogLongPHD;
		if (RecPtr.xrecoff == 0)
		{
			/* crossing a logid boundary */
			RecPtr.xlogid -= 1;
			RecPtr.xrecoff = XLogFileSize;
		}

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(RecPtr, LogwrtResult.Flush))
		{
			XLogwrtRqst FlushRqst;

			FlushRqst.Write = RecPtr;
			FlushRqst.Flush = RecPtr;
			XLogWrite(FlushRqst, false, false);
		}
		LWLockRelease(WALWriteLock);

		END_CRIT_SECTION();

		return RecPtr;
	}
T
Tom Lane 已提交
986

987 988
	/* Insert record header */

989
	record = (XLogRecord *) Insert->currpos;
990
	record->xl_prev = Insert->PrevRecord;
991
	record->xl_xid = GetCurrentTransactionIdIfAny();
992
	record->xl_tot_len = SizeOfXLogRecord + write_len;
T
Tom Lane 已提交
993
	record->xl_len = len;		/* doesn't include backup blocks */
994
	record->xl_info = info;
995
	record->xl_rmid = rmid;
996

997 998 999 1000
	/* Now we can finish computing the record's CRC */
	COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(rdata_crc);
1001 1002
	record->xl_crc = rdata_crc;

1003
#ifdef WAL_DEBUG
V
WAL  
Vadim B. Mikheev 已提交
1004 1005
	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
1006
		StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
1007

1008
		initStringInfo(&buf);
1009 1010
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 RecPtr.xlogid, RecPtr.xrecoff);
1011
		xlog_outrec(&buf, record);
1012
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
1013
		{
1014 1015
			appendStringInfo(&buf, " - ");
			RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
1016
		}
1017 1018
		elog(LOG, "%s", buf.data);
		pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
1019
	}
1020
#endif
V
WAL  
Vadim B. Mikheev 已提交
1021

T
Tom Lane 已提交
1022 1023 1024 1025
	/* Record begin of record in appropriate places */
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

1026
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
1027
	freespace -= SizeOfXLogRecord;
1028

T
Tom Lane 已提交
1029 1030 1031 1032
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
1033
	{
1034 1035 1036 1037
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
1038
		{
1039 1040 1041 1042 1043
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
1044
				write_len -= freespace;
1045 1046 1047 1048 1049
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
1050
				write_len -= rdata->len;
1051 1052 1053 1054
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
1055 1056
		}

1057
		/* Use next buffer */
1058
		updrqst = AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
1059 1060 1061 1062 1063 1064
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
1065
		freespace = INSERT_FREESPACE(Insert);
1066
	}
1067

T
Tom Lane 已提交
1068 1069
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
1070
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
1071
	freespace = INSERT_FREESPACE(Insert);
1072

V
Vadim B. Mikheev 已提交
1073
	/*
B
Bruce Momjian 已提交
1074 1075
	 * The recptr I return is the beginning of the *next* record. This will be
	 * stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
1076
	 */
T
Tom Lane 已提交
1077
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
1078

1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092
	/*
	 * If the record is an XLOG_SWITCH, we must now write and flush all the
	 * existing data, and then forcibly advance to the start of the next
	 * segment.  It's not good to do this I/O while holding the insert lock,
	 * but there seems too much risk of confusion if we try to release the
	 * lock sooner.  Fortunately xlog switch needn't be a high-performance
	 * operation anyway...
	 */
	if (isLogSwitch)
	{
		XLogCtlWrite *Write = &XLogCtl->Write;
		XLogwrtRqst FlushRqst;
		XLogRecPtr	OldSegEnd;

1093 1094
		TRACE_POSTGRESQL_XLOG_SWITCH();

1095 1096 1097
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

		/*
B
Bruce Momjian 已提交
1098 1099
		 * Flush through the end of the page containing XLOG_SWITCH, and
		 * perform end-of-segment actions (eg, notifying archiver).
1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
		 */
		WriteRqst = XLogCtl->xlblocks[curridx];
		FlushRqst.Write = WriteRqst;
		FlushRqst.Flush = WriteRqst;
		XLogWrite(FlushRqst, false, true);

		/* Set up the next buffer as first page of next segment */
		/* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
		(void) AdvanceXLInsertBuffer(true);

		/* There should be no unwritten data */
		curridx = Insert->curridx;
		Assert(curridx == Write->curridx);

		/* Compute end address of old segment */
		OldSegEnd = XLogCtl->xlblocks[curridx];
		OldSegEnd.xrecoff -= XLOG_BLCKSZ;
		if (OldSegEnd.xrecoff == 0)
		{
			/* crossing a logid boundary */
			OldSegEnd.xlogid -= 1;
			OldSegEnd.xrecoff = XLogFileSize;
		}

		/* Make it look like we've written and synced all of old segment */
		LogwrtResult.Write = OldSegEnd;
		LogwrtResult.Flush = OldSegEnd;

		/*
		 * Update shared-memory status --- this code should match XLogWrite
		 */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->LogwrtResult = LogwrtResult;
			if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
				xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
			if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
				xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
			SpinLockRelease(&xlogctl->info_lck);
		}

		Write->LogwrtResult = LogwrtResult;

		LWLockRelease(WALWriteLock);

		updrqst = false;		/* done already */
	}
1150
	else
1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
	{
		/* normal case, ie not xlog switch */

		/* Need to update shared LogwrtRqst if some block was filled up */
		if (freespace < SizeOfXLogRecord)
		{
			/* curridx is filled and available for writing out */
			updrqst = true;
		}
		else
		{
			/* if updrqst already set, write through end of previous buf */
			curridx = PrevBufIdx(curridx);
		}
		WriteRqst = XLogCtl->xlblocks[curridx];
	}
1167

1168
	LWLockRelease(WALInsertLock);
1169 1170 1171

	if (updrqst)
	{
1172 1173 1174
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1175
		SpinLockAcquire(&xlogctl->info_lck);
T
Tom Lane 已提交
1176
		/* advance global request to include new block(s) */
1177 1178
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
1179
		/* update local result copy while I have the chance */
1180
		LogwrtResult = xlogctl->LogwrtResult;
1181
		SpinLockRelease(&xlogctl->info_lck);
1182 1183
	}

1184
	XactLastRecEnd = RecPtr;
1185

1186
	END_CRIT_SECTION();
1187

1188
	return RecPtr;
1189
}
1190

1191
/*
1192 1193 1194
 * Determine whether the buffer referenced by an XLogRecData item has to
 * be backed up, and if so fill a BkpBlock struct for it.  In any case
 * save the buffer's LSN at *lsn.
1195
 */
1196
static bool
1197
XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1198
				XLogRecPtr *lsn, BkpBlock *bkpb)
1199
{
1200
	Page		page;
1201

1202
	page = BufferGetPage(rdata->buffer);
1203 1204

	/*
B
Bruce Momjian 已提交
1205 1206 1207
	 * XXX We assume page LSN is first data on *every* page that can be passed
	 * to XLogInsert, whether it otherwise has the standard page layout or
	 * not.
1208
	 */
1209
	*lsn = PageGetLSN(page);
1210

1211
	if (doPageWrites &&
1212
		XLByteLE(PageGetLSN(page), RedoRecPtr))
1213
	{
1214 1215 1216
		/*
		 * The page needs to be backed up, so set up *bkpb
		 */
1217
		BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1218

1219 1220 1221
		if (rdata->buffer_std)
		{
			/* Assume we can omit data between pd_lower and pd_upper */
1222 1223
			uint16		lower = ((PageHeader) page)->pd_lower;
			uint16		upper = ((PageHeader) page)->pd_upper;
1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244

			if (lower >= SizeOfPageHeaderData &&
				upper > lower &&
				upper <= BLCKSZ)
			{
				bkpb->hole_offset = lower;
				bkpb->hole_length = upper - lower;
			}
			else
			{
				/* No "hole" to compress out */
				bkpb->hole_offset = 0;
				bkpb->hole_length = 0;
			}
		}
		else
		{
			/* Not a standard page header, don't try to eliminate "hole" */
			bkpb->hole_offset = 0;
			bkpb->hole_length = 0;
		}
1245

1246
		return true;			/* buffer requires backup */
1247
	}
1248 1249

	return false;				/* buffer does not need to be backed up */
1250 1251
}

1252 1253 1254 1255 1256 1257
/*
 * XLogArchiveNotify
 *
 * Create an archive notification file
 *
 * The name of the notification file is the message that will be picked up
1258
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
1259
 * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1260
 * then when complete, rename it to 0000000100000001000000C6.done
1261 1262 1263 1264 1265
 */
static void
XLogArchiveNotify(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
B
Bruce Momjian 已提交
1266
	FILE	   *fd;
1267 1268 1269 1270

	/* insert an otherwise empty file called <XLOG>.ready */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	fd = AllocateFile(archiveStatusPath, "w");
B
Bruce Momjian 已提交
1271 1272
	if (fd == NULL)
	{
1273 1274 1275 1276 1277 1278
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}
B
Bruce Momjian 已提交
1279 1280
	if (FreeFile(fd))
	{
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}

	/* Notify archiver that it's got something to do */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}

/*
 * Convenience routine to notify using log/seg representation of filename
 *
 * Builds the segment's file name from ThisTimeLineID, log and seg, then
 * hands it to XLogArchiveNotify.
 */
static void
XLogArchiveNotifySeg(uint32 log, uint32 seg)
{
	char		xlog[MAXFNAMELEN];

	XLogFileName(xlog, ThisTimeLineID, log, seg);
	XLogArchiveNotify(xlog);
}

/*
1306
 * XLogArchiveCheckDone
1307
 *
1308 1309 1310 1311
 * This is called when we are ready to delete or recycle an old XLOG segment
 * file or backup history file.  If it is okay to delete it then return true.
 * If it is not time to delete it, make sure a .ready file exists, and return
 * false.
1312 1313
 *
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1314 1315 1316 1317
 * then return false; else create <XLOG>.ready and return false.
 *
 * The reason we do things this way is so that if the original attempt to
 * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1318 1319
 */
static bool
1320
XLogArchiveCheckDone(const char *xlog)
1321 1322 1323 1324
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

1325 1326 1327 1328 1329
	/* Always deletable if archiving is off */
	if (!XLogArchivingActive())
		return true;

	/* First check for .done --- this means archiver is done with it */
1330 1331 1332 1333 1334 1335 1336
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
B
Bruce Momjian 已提交
1337
		return false;
1338 1339 1340 1341 1342 1343 1344

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Retry creation of the .ready file */
1345
	XLogArchiveNotify(xlog);
1346 1347 1348
	return false;
}

1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380
/*
 * XLogArchiveIsBusy
 *
 * Check to see if an XLOG segment file is still unarchived.
 * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 * the first place we aren't chartered to recreate the .ready file, and
 * in the second place we should consider that if the file is already gone
 * then it's not busy.  (This check is needed to handle the race condition
 * that a checkpoint already deleted the no-longer-needed file.)
 *
 * Returns false if <XLOG>.done exists or the segment file itself is
 * missing (ENOENT); returns true if <XLOG>.ready exists or if neither
 * status file is found while the segment file is still present.
 */
static bool
XLogArchiveIsBusy(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

	/* First check for .done --- this means archiver is done with it */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/*
	 * Check to see if the WAL file has been removed by checkpoint, which
	 * implies it has already been archived, and explains why we can't see a
	 * status file for it.
	 */
	snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
	if (stat(archiveStatusPath, &stat_buf) != 0 &&
		errno == ENOENT)
		return false;

	return true;
}

1393 1394 1395
/*
 * XLogArchiveCleanup
 *
1396
 * Cleanup archive notification file(s) for a particular xlog segment
1397 1398 1399 1400
 */
static void
XLogArchiveCleanup(const char *xlog)
{
B
Bruce Momjian 已提交
1401
	char		archiveStatusPath[MAXPGPATH];
1402

1403
	/* Remove the .done file */
1404 1405 1406
	StatusFilePath(archiveStatusPath, xlog, ".done");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1407 1408 1409 1410 1411

	/* Remove the .ready file if present --- normally it shouldn't be */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1412 1413
}

T
Tom Lane 已提交
1414 1415 1416 1417
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
1418 1419 1420 1421
 * If new_segment is TRUE then we set up the next buffer page as the first
 * page of the next xlog segment file, possibly but not usually the next
 * consecutive file page.
 *
T
Tom Lane 已提交
1422
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
1423
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
1424 1425 1426
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
1427
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
1428 1429
 */
static bool
1430
AdvanceXLInsertBuffer(bool new_segment)
1431
{
T
Tom Lane 已提交
1432 1433
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
1434
	int			nextidx = NextBufIdx(Insert->curridx);
T
Tom Lane 已提交
1435 1436 1437
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
1438 1439
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
1440

T
Tom Lane 已提交
1441 1442 1443
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
1444

T
Tom Lane 已提交
1445
	/*
B
Bruce Momjian 已提交
1446 1447 1448
	 * Get ending-offset of the buffer page we need to replace (this may be
	 * zero if the buffer hasn't been used yet).  Fall through if it's already
	 * written out.
T
Tom Lane 已提交
1449 1450 1451 1452 1453 1454
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
1455

T
Tom Lane 已提交
1456
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1457

1458
		/* Before waiting, get info_lck and update LogwrtResult */
1459 1460 1461 1462
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

1463
			SpinLockAcquire(&xlogctl->info_lck);
1464 1465 1466
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
1467
			SpinLockRelease(&xlogctl->info_lck);
1468
		}
1469 1470 1471 1472 1473 1474 1475 1476 1477

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
1478
		{
1479 1480 1481 1482
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1483
			{
1484 1485 1486
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
1487
			}
1488
			else
T
Tom Lane 已提交
1489 1490
			{
				/*
B
Bruce Momjian 已提交
1491 1492
				 * Have to write buffers while holding insert lock. This is
				 * not good, so only write as much as we absolutely must.
T
Tom Lane 已提交
1493
				 */
1494
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
T
Tom Lane 已提交
1495 1496 1497
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
1498
				XLogWrite(WriteRqst, false, false);
1499
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
1500
				Insert->LogwrtResult = LogwrtResult;
1501
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1502 1503 1504 1505
			}
		}
	}

T
Tom Lane 已提交
1506
	/*
B
Bruce Momjian 已提交
1507 1508
	 * Now the next buffer slot is free and we can set it up to be the next
	 * output page.
T
Tom Lane 已提交
1509
	 */
1510
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1511 1512 1513 1514 1515 1516 1517 1518

	if (new_segment)
	{
		/* force it to a segment start point */
		NewPageEndPtr.xrecoff += XLogSegSize - 1;
		NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
	}

1519
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
1520
	{
T
Tom Lane 已提交
1521
		/* crossing a logid boundary */
1522
		NewPageEndPtr.xlogid += 1;
1523
		NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1524
	}
T
Tom Lane 已提交
1525
	else
1526
		NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1527
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1528
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
B
Bruce Momjian 已提交
1529

T
Tom Lane 已提交
1530
	Insert->curridx = nextidx;
1531
	Insert->currpage = NewPage;
B
Bruce Momjian 已提交
1532 1533

	Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
B
Bruce Momjian 已提交
1534

T
Tom Lane 已提交
1535
	/*
B
Bruce Momjian 已提交
1536 1537
	 * Be sure to re-zero the buffer so that bytes beyond what we've written
	 * will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
1538
	 */
1539
	MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1540

1541 1542 1543
	/*
	 * Fill the new page's header
	 */
B
Bruce Momjian 已提交
1544 1545
	NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

1546
	/* NewPage->xlp_info = 0; */	/* done by memset */
B
Bruce Momjian 已提交
1547 1548
	NewPage   ->xlp_tli = ThisTimeLineID;
	NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1549
	NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
T
Tom Lane 已提交
1550

1551
	/*
1552
	 * If first page of an XLOG segment file, make it a long header.
1553 1554 1555
	 */
	if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
	{
1556
		XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1557

1558 1559
		NewLongPage->xlp_sysid = ControlFile->system_identifier;
		NewLongPage->xlp_seg_size = XLogSegSize;
1560
		NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
B
Bruce Momjian 已提交
1561 1562 1563
		NewPage   ->xlp_info |= XLP_LONG_HEADER;

		Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1564 1565
	}

T
Tom Lane 已提交
1566
	return update_needed;
1567 1568
}

1569 1570 1571
/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
1572 1573 1574
 * logid/logseg indicate a log file that has just been filled up (or read
 * during recovery). We measure the distance from RedoRecPtr to logid/logseg
 * and see if that exceeds CheckPointSegments.
1575 1576 1577 1578
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
1579
XLogCheckpointNeeded(uint32 logid, uint32 logseg)
1580 1581
{
	/*
1582 1583
	 * A straight computation of segment number could overflow 32 bits. Rather
	 * than assuming we have working 64-bit arithmetic, we compare the
B
Bruce Momjian 已提交
1584 1585
	 * highest-order bits separately, and force a checkpoint immediately when
	 * they change.
1586 1587 1588 1589 1590 1591 1592 1593 1594
	 */
	uint32		old_segno,
				new_segno;
	uint32		old_highbits,
				new_highbits;

	old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
		(RedoRecPtr.xrecoff / XLogSegSize);
	old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1595 1596
	new_segno = (logid % XLogSegSize) * XLogSegsPerFile + logseg;
	new_highbits = logid / XLogSegSize;
1597
	if (new_highbits != old_highbits ||
B
Bruce Momjian 已提交
1598
		new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1599 1600 1601 1602
		return true;
	return false;
}

T
Tom Lane 已提交
1603 1604 1605
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
1606 1607 1608 1609 1610
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
1611 1612 1613 1614 1615 1616
 * If xlog_switch == TRUE, we are intending an xlog segment switch, so
 * perform end-of-segment actions after writing the last page, even if
 * it's not physically the end of its segment.  (NB: this will work properly
 * only if caller specifies WriteRqst == page-end and flexible == false,
 * and there is some data to write.)
 *
1617
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
1618
 */
1619
static void
1620
XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1621
{
1622
	XLogCtlWrite *Write = &XLogCtl->Write;
T
Tom Lane 已提交
1623
	bool		ispartialpage;
1624
	bool		last_iteration;
1625
	bool		finishing_seg;
1626
	bool		use_existent;
1627 1628 1629 1630
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;
1631

1632 1633 1634
	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

B
Bruce Momjian 已提交
1635
	/*
B
Bruce Momjian 已提交
1636
	 * Update local LogwrtResult (caller probably did this already, but...)
B
Bruce Momjian 已提交
1637
	 */
T
Tom Lane 已提交
1638 1639
	LogwrtResult = Write->LogwrtResult;

1640 1641 1642
	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
B
Bruce Momjian 已提交
1643 1644 1645 1646 1647
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
1648 1649 1650 1651 1652 1653 1654 1655 1656
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  We advance Write->curridx only after successfully
	 * writing pages.  (Right now, this refinement is useless since we are
B
Bruce Momjian 已提交
1657 1658
	 * going to PANIC if any error occurs anyway; but someday it may come in
	 * useful.)
1659 1660
	 */
	curridx = Write->curridx;
B
 
Bruce Momjian 已提交
1661

T
Tom Lane 已提交
1662
	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1663
	{
1664
		/*
B
Bruce Momjian 已提交
1665 1666 1667
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
1668
		 */
1669
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1670
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1671
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1672 1673
				 XLogCtl->xlblocks[curridx].xlogid,
				 XLogCtl->xlblocks[curridx].xrecoff);
1674

T
Tom Lane 已提交
1675
		/* Advance LogwrtResult.Write to end of current buffer page */
1676
		LogwrtResult.Write = XLogCtl->xlblocks[curridx];
T
Tom Lane 已提交
1677 1678 1679
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1680
		{
T
Tom Lane 已提交
1681
			/*
1682 1683
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
T
Tom Lane 已提交
1684
			 */
1685
			Assert(npages == 0);
T
Tom Lane 已提交
1686
			if (openLogFile >= 0)
1687
				XLogFileClose();
T
Tom Lane 已提交
1688 1689
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1690 1691 1692 1693
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1694
			openLogOff = 0;
1695 1696
		}

1697
		/* Make sure we have the current logfile open */
T
Tom Lane 已提交
1698
		if (openLogFile < 0)
1699
		{
T
Tom Lane 已提交
1700
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1701
			openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
1702
			openLogOff = 0;
1703 1704
		}

1705 1706 1707 1708 1709
		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
1710
			startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1711 1712
		}
		npages++;
1713

T
Tom Lane 已提交
1714
		/*
B
Bruce Momjian 已提交
1715 1716 1717 1718
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
T
Tom Lane 已提交
1719
		 */
1720 1721
		last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);

1722
		finishing_seg = !ispartialpage &&
1723
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1724

1725
		if (last_iteration ||
1726 1727
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
T
Tom Lane 已提交
1728
		{
1729 1730
			char	   *from;
			Size		nbytes;
1731

1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744
			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %u, "
									"segment %u to offset %u: %m",
									openLogId, openLogSeg, startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
1745 1746
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
1747 1748 1749 1750 1751 1752 1753 1754 1755
			errno = 0;
			if (write(openLogFile, from, nbytes) != nbytes)
			{
				/* if write didn't set errno, assume no disk space */
				if (errno == 0)
					errno = ENOSPC;
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not write to log file %u, segment %u "
P
Peter Eisentraut 已提交
1756
								"at offset %u, length %lu: %m",
1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772
								openLogId, openLogSeg,
								openLogOff, (unsigned long) nbytes)));
			}

			/* Update state for write */
			openLogOff += nbytes;
			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
1773 1774 1775
			 * We also do this if this is the last page written for an xlog
			 * switch.
			 *
1776
			 * This is also the right place to notify the Archiver that the
B
Bruce Momjian 已提交
1777
			 * segment is ready to copy to archival storage, and to update the
1778 1779 1780
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
1781
			 */
1782
			if (finishing_seg || (xlog_switch && last_iteration))
1783
			{
1784
				issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
B
Bruce Momjian 已提交
1785
				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */
1786 1787 1788

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogId, openLogSeg);
1789

1790
				Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1791 1792

				/*
1793
				 * Signal bgwriter to start a checkpoint if we've consumed too
1794
				 * much xlog since the last one.  For speed, we first check
B
Bruce Momjian 已提交
1795 1796 1797
				 * using the local copy of RedoRecPtr, which might be out of
				 * date; if it looks like a checkpoint is needed, forcibly
				 * update RedoRecPtr and recheck.
1798
				 */
1799
				if (IsUnderPostmaster &&
1800
					XLogCheckpointNeeded(openLogId, openLogSeg))
1801
				{
1802
					(void) GetRedoRecPtr();
1803
					if (XLogCheckpointNeeded(openLogId, openLogSeg))
1804
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1805
				}
1806
			}
T
Tom Lane 已提交
1807
		}
1808

T
Tom Lane 已提交
1809 1810 1811 1812 1813 1814
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
1815 1816 1817 1818 1819
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
1820
	}
1821 1822 1823

	Assert(npages == 0);
	Assert(curridx == Write->curridx);
1824

T
Tom Lane 已提交
1825 1826 1827 1828 1829
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1830
	{
T
Tom Lane 已提交
1831
		/*
B
Bruce Momjian 已提交
1832 1833 1834
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.	However, we do not need to
		 * fsync more than one file.
T
Tom Lane 已提交
1835
		 */
1836 1837
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
T
Tom Lane 已提交
1838
		{
1839
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1840
				!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1841
				XLogFileClose();
1842 1843 1844
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1845
				openLogFile = XLogFileOpen(openLogId, openLogSeg);
1846 1847
				openLogOff = 0;
			}
1848
			issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
T
Tom Lane 已提交
1849 1850
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1851 1852
	}

T
Tom Lane 已提交
1853 1854 1855
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1856
	 * We make sure that the shared 'request' values do not fall behind the
B
Bruce Momjian 已提交
1857 1858
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
T
Tom Lane 已提交
1859
	 */
1860 1861 1862 1863
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1864
		SpinLockAcquire(&xlogctl->info_lck);
1865 1866 1867 1868 1869
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1870
		SpinLockRelease(&xlogctl->info_lck);
1871
	}
1872

T
Tom Lane 已提交
1873 1874 1875
	Write->LogwrtResult = LogwrtResult;
}

1876
/*
1877 1878
 * Record the LSN for an asynchronous transaction commit/abort.
 * (This should not be called for for synchronous commits.)
1879 1880
 */
void
1881
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1882 1883 1884 1885 1886
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
1887 1888
	if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
		xlogctl->asyncXactLSN = asyncXactLSN;
1889 1890 1891
	SpinLockRelease(&xlogctl->info_lck);
}

1892 1893 1894 1895
/*
 * Advance minRecoveryPoint in the control file.
 *
 * Should we crash during recovery, this point must be reached again before
 * the database is consistent.
 *
 * When 'force' is true the 'lsn' argument is ignored.  Otherwise
 * minRecoveryPoint is updated only if it is not already greater than or
 * equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Cheap tests first, using only our local copies of the variables */
	if (!updateMinRecoveryPoint)
		return;
	if (!force && XLByteLE(lsn, minRecoveryPoint))
		return;

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* refresh the local copy from the control file */
	minRecoveryPoint = ControlFile->minRecoveryPoint;

	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
	{
		/*
		 * An invalid minRecoveryPoint means all of the WAL must be
		 * recovered, i.e. this is crash recovery.  The control file's value
		 * is never modified in that case, so future checks can be
		 * short-circuited too.
		 */
		updateMinRecoveryPoint = false;
	}
	else if (force || XLByteLT(minRecoveryPoint, lsn))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *shared = XLogCtl;
		XLogRecPtr	replayEnd;

		/*
		 * Rather than advancing just to 'lsn', go all the way to the last
		 * record being replayed: 'lsn' would suffice for correctness, but
		 * this keeps control file updates infrequent, and it means the
		 * 'force' case needs no valid 'lsn'.
		 *
		 * It also matters because the passed 'lsn' could be bogus, i.e. past
		 * the end of available WAL, if the caller took it from a corrupted
		 * heap page.  Accepting such a value as the min recovery point would
		 * prevent us from coming up at all; instead we only log a warning
		 * and continue with recovery.  (See also the comments about corrupt
		 * LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&shared->info_lck);
		replayEnd = shared->replayEndRecPtr;
		SpinLockRelease(&shared->info_lck);

		if (!force && XLByteLT(replayEnd, lsn))
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 lsn.xlogid, lsn.xrecoff,
				 replayEnd.xlogid, replayEnd.xrecoff);

		/* write the control file only if the point actually moves forward */
		if (XLByteLT(ControlFile->minRecoveryPoint, replayEnd))
		{
			ControlFile->minRecoveryPoint = replayEnd;
			UpdateControlFile();
			minRecoveryPoint = replayEnd;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X",
							minRecoveryPoint.xlogid,
							minRecoveryPoint.xrecoff)));
		}
	}

	LWLockRelease(ControlFileLock);
}

T
Tom Lane 已提交
1964 1965 1966
/*
 * Make sure all XLOG data through the given position is flushed to disk.
 *
 * NOTE: the main difference from XLogWrite is that WALWriteLock is not
 * already held here, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	upto;
	XLogwrtRqst rqst;

	/*
	 * During REDO we are reading WAL, not writing it, so instead of trying
	 * to flush we update minRecoveryPoint.  The test is XLogInsertAllowed()
	 * rather than InRecovery because the bgwriter must behave this way too,
	 * and because the bgwriter should indeed flush when it writes the
	 * end-of-recovery checkpoint.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Nothing to do if our local state already shows it flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/*
	 * fsync is usually horribly expensive, so piggyback as much data as
	 * possible on each one: if more data has been entered into the xlog
	 * buffer, write and fsync that too, making the final LogwrtResult.Flush
	 * as large as possible.  That gives some chance of avoiding another
	 * fsync immediately afterwards.
	 */

	/* start from the given target; it may be raised below */
	upto = record;

	/* read the shared request/result state and update local state */
	SpinLockAcquire(&xlogctl->info_lck);
	if (XLByteLT(upto, xlogctl->LogwrtRqst.Write))
		upto = xlogctl->LogwrtRqst.Write;
	LogwrtResult = xlogctl->LogwrtResult;
	SpinLockRelease(&xlogctl->info_lck);

	/* anything left to do? */
	if (XLByteLT(LogwrtResult.Flush, record))
	{
		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;

		if (XLByteLT(LogwrtResult.Flush, record))
		{
			/* unless we can see the insert position, flush only to 'record' */
			rqst.Flush = record;

			/* try to write/flush later additions to XLOG as well */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *Insert = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(Insert);

				/* end of the current insert page ... */
				upto = XLogCtl->xlblocks[Insert->curridx];

				/* ... backed up over its free space, unless buffer is full */
				if (freespace >= SizeOfXLogRecord)
					upto.xrecoff -= freespace;

				LWLockRelease(WALInsertLock);
				rqst.Flush = upto;
			}
			rqst.Write = upto;

			XLogWrite(rqst, false, false);
		}

		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * If the request point still has not been flushed, something is wrong;
	 * most likely the requested flush point is past end of XLOG.  This has
	 * been seen to occur when a disk page has a corrupted LSN.
	 *
	 * This used to be a PANIC condition, but that hurts the system's
	 * robustness rather than helping it: we do not want to take down the
	 * whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4,
	 * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
	 * problem; the only time we can reach here during recovery is while
	 * flushing the end-of-recovery checkpoint record, and we don't expect
	 * that to have a bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will
	 * not force a restart for a bad LSN on a data page.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

2091 2092 2093 2094 2095 2096
/*
 * Flush xlog, but without specifying exactly where to flush to.
 *
 * We normally flush only completed blocks; but if there is nothing to do on
 * that basis, we check for unflushed async commits in the current incomplete
 * block, and flush through the latest one of those.  Thus, if async commits
B
Bruce Momjian 已提交
2097
 * are not being used, we will flush complete blocks only.	We can guarantee
2098
 * that async commits reach disk after at most three cycles; normally only
B
Bruce Momjian 已提交
2099
 * one or two.	(We allow XLogWrite to write "flexibly", meaning it can stop
2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111
 * at the end of the buffer ring; this makes a difference only with very high
 * load or long wal_writer_delay, but imposes one extra cycle for the worst
 * case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 */
void
XLogBackgroundFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
	bool		flexible = true;

2112 2113 2114 2115
	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return;

2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135
	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* back off to last completed page boundary */
	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

2136
		SpinLockAcquire(&xlogctl->info_lck);
2137
		WriteRqstPtr = xlogctl->asyncXactLSN;
2138
		SpinLockRelease(&xlogctl->info_lck);
2139 2140 2141
		flexible = false;		/* ensure it all gets written */
	}

2142
	/*
B
Bruce Momjian 已提交
2143 2144 2145
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
2146
	 */
2147
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2148
	{
B
Bruce Momjian 已提交
2149 2150
		if (openLogFile >= 0)
		{
2151 2152 2153 2154 2155
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
			{
				XLogFileClose();
			}
		}
2156
		return;
2157
	}
2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
			 WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/* now wait for the write lock */
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->Write.LogwrtResult;
	if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		XLogwrtRqst WriteRqst;

		WriteRqst.Write = WriteRqstPtr;
		WriteRqst.Flush = WriteRqstPtr;
		XLogWrite(WriteRqst, flexible, false);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();
}

2185 2186 2187 2188 2189 2190 2191 2192 2193
/*
 * Report whether XLOG data still needs flushing to reach the given position.
 *
 * Returns true if a flush is still needed.  (Someone else may already be in
 * the process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * During recovery WAL is not flushed; minRecoveryPoint is updated
	 * instead.  So "needs flush" is taken to mean that minRecoveryPoint
	 * would need to be updated.
	 */
	if (RecoveryInProgress())
	{
		/* cheap test against our local copies first */
		if (!updateMinRecoveryPoint || XLByteLE(record, minRecoveryPoint))
			return false;

		/*
		 * Refresh the local copy of minRecoveryPoint.  If the lock is busy,
		 * don't wait for it: just answer with a conservative guess.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		LWLockRelease(ControlFileLock);

		/*
		 * An invalid minRecoveryPoint means all of the WAL must be
		 * recovered, i.e. this is crash recovery.  The control file's value
		 * is never modified in that case, so future checks can be
		 * short-circuited here too.
		 */
		if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
			updateMinRecoveryPoint = false;

		/* test again with the refreshed values */
		return updateMinRecoveryPoint && XLByteLT(minRecoveryPoint, record);
	}

	/* Not in recovery: quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	/* read the shared LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *shared = XLogCtl;

		SpinLockAcquire(&shared->info_lck);
		LogwrtResult = shared->LogwrtResult;
		SpinLockRelease(&shared->info_lck);
	}

	/* a flush is needed only if 'record' is still beyond the flush point */
	return XLByteLT(LogwrtResult.Flush, record);
}

T
Tom Lane 已提交
2251 2252 2253
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
2254 2255 2256
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
2257
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
2258 2259
 * file was used.
 *
2260
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2261
 * place.  This should be TRUE except during bootstrap log creation.  The
2262
 * caller must *not* hold the lock at call.
2263
 *
T
Tom Lane 已提交
2264
 * Returns FD of opened file.
2265 2266 2267 2268 2269
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
T
Tom Lane 已提交
2270
 */
2271
int
2272 2273
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
2274
{
2275
	char		path[MAXPGPATH];
2276
	char		tmppath[MAXPGPATH];
2277
	char	   *zbuffer;
2278 2279 2280
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
2281
	int			fd;
2282
	int			nbytes;
2283

2284
	XLogFilePath(path, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
2285 2286

	/*
B
Bruce Momjian 已提交
2287
	 * Try to use existent file (checkpoint maker may have created it already)
V
Vadim B. Mikheev 已提交
2288
	 */
2289
	if (*use_existent)
V
Vadim B. Mikheev 已提交
2290
	{
2291
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2292
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
2293 2294 2295
		if (fd < 0)
		{
			if (errno != ENOENT)
2296
				ereport(ERROR,
2297
						(errcode_for_file_access(),
2298
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2299
								path, log, seg)));
V
Vadim B. Mikheev 已提交
2300 2301
		}
		else
2302
			return fd;
V
Vadim B. Mikheev 已提交
2303 2304
	}

2305
	/*
B
Bruce Momjian 已提交
2306 2307 2308 2309
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
2310
	 */
2311 2312
	elog(DEBUG2, "creating and filling new WAL file");

2313
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2314 2315

	unlink(tmppath);
2316

2317
	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
2318
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
2319
					   S_IRUSR | S_IWUSR);
2320
	if (fd < 0)
2321
		ereport(ERROR,
2322
				(errcode_for_file_access(),
2323
				 errmsg("could not create file \"%s\": %m", tmppath)));
2324

2325
	/*
B
Bruce Momjian 已提交
2326 2327 2328 2329 2330 2331 2332
	 * Zero-fill the file.	We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
2333 2334 2335 2336
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
2337
	 */
2338 2339
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2340
	{
2341
		errno = 0;
2342
		if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
T
Tom Lane 已提交
2343
		{
B
Bruce Momjian 已提交
2344
			int			save_errno = errno;
T
Tom Lane 已提交
2345

B
Bruce Momjian 已提交
2346
			/*
B
Bruce Momjian 已提交
2347
			 * If we fail to make the file, delete it to release disk space
B
Bruce Momjian 已提交
2348
			 */
2349
			unlink(tmppath);
2350 2351
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
2352

2353
			ereport(ERROR,
2354
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2355
					 errmsg("could not write to file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2356
		}
2357
	}
2358
	pfree(zbuffer);
2359

2360
	if (pg_fsync(fd) != 0)
2361
		ereport(ERROR,
2362
				(errcode_for_file_access(),
2363
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
2364

2365
	if (close(fd))
2366
		ereport(ERROR,
2367 2368
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2369

2370
	/*
2371 2372
	 * Now move the segment into place with its final name.
	 *
2373
	 * If caller didn't want to use a pre-existing file, get rid of any
B
Bruce Momjian 已提交
2374 2375 2376
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
2377
	 */
2378 2379 2380 2381 2382
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
2383 2384
								use_lock))
	{
2385 2386 2387 2388 2389
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
2390 2391 2392 2393 2394 2395 2396
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
2397
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2398 2399
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
2400
		ereport(ERROR,
2401
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2402 2403
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2404

2405 2406
	elog(DEBUG2, "done creating and filling new WAL file");

2407
	return fd;
2408 2409
}

2410 2411 2412 2413 2414 2415 2416 2417 2418
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
 *
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 *		a different timeline)
 *
 * The source segment is copied XLOG_BLCKSZ bytes at a time into a
 * per-process temp file (XLOGDIR "/xlogtemp.<pid>"), which is fsync'd and
 * closed before being installed under its final name.  A short read, or any
 * open/read/write/fsync/close failure, is reported with ereport(ERROR).
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as careful as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
XLogFileCopy(uint32 log, uint32 seg,
			 TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		buffer[XLOG_BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srclog, srcseg);
	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.
	 */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	/* get rid of any leftover temp file; O_EXCL below would reject it */
	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Do the data copying.
	 */
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
	{
		int			nread;
		int			nwritten;

		/* clear errno so a short read can be told apart from a failed one */
		errno = 0;
		nread = (int) read(srcfd, buffer, sizeof(buffer));
		if (nread != (int) sizeof(buffer))
		{
			if (errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			else
				ereport(ERROR,
						(errmsg("not enough data in file \"%s\"", path)));
		}

		errno = 0;
		nwritten = (int) write(fd, buffer, sizeof(buffer));
		if (nwritten != (int) sizeof(buffer))
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}

	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	close(srcfd);

	/*
	 * Now move the segment into place with its final name.  With
	 * find_free = false the target slot is forced, so a false result can
	 * only mean the link/rename itself failed.
	 */
	if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
		elog(ERROR, "InstallXLogFileSegment should not have failed");
}

2511 2512 2513 2514 2515 2516
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
2517 2518 2519
 * *log, *seg: identify segment to install as (or first possible target).
 * When find_free is TRUE, these are modified on return to indicate the
 * actual installation location or last segment searched.
2520 2521 2522 2523 2524 2525 2526
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
2527 2528 2529 2530
 * *max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  On return, reduced
 * by the number of slots skipped over.  (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
2531
 *
2532
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2533
 * place.  This should be TRUE except during bootstrap log creation.  The
2534
 * caller must *not* hold the lock at call.
2535
 *
2536 2537 2538
 * Returns TRUE if the file was installed successfully.  FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
2539 2540
 */
static bool
2541 2542
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
2543 2544 2545
					   bool use_lock)
{
	char		path[MAXPGPATH];
2546
	struct stat stat_buf;
2547

2548
	XLogFilePath(path, ThisTimeLineID, *log, *seg);
2549 2550 2551 2552 2553

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
2554
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2555

2556 2557 2558 2559 2560
	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		unlink(path);
	}
2561 2562
	else
	{
2563
		/* Find a free slot to put it in */
2564
		while (stat(path, &stat_buf) == 0)
2565
		{
2566
			if (*max_advance <= 0)
2567 2568 2569
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
2570
					LWLockRelease(ControlFileLock);
2571 2572
				return false;
			}
2573 2574 2575
			NextLogSeg(*log, *seg);
			(*max_advance)--;
			XLogFilePath(path, ThisTimeLineID, *log, *seg);
2576 2577 2578 2579 2580 2581 2582
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
2583
	 */
2584
#if HAVE_WORKING_LINK
2585
	if (link(tmppath, path) < 0)
2586 2587 2588 2589
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2590
				(errcode_for_file_access(),
2591
				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2592
						tmppath, path, *log, *seg)));
2593 2594
		return false;
	}
2595
	unlink(tmppath);
2596
#else
2597
	if (rename(tmppath, path) < 0)
2598
	{
2599 2600 2601
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2602
				(errcode_for_file_access(),
2603
				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2604
						tmppath, path, *log, *seg)));
2605
		return false;
2606
	}
2607
#endif
V
Vadim B. Mikheev 已提交
2608

2609
	if (use_lock)
2610
		LWLockRelease(ControlFileLock);
2611

2612
	return true;
2613 2614
}

T
Tom Lane 已提交
2615
/*
2616
 * Open a pre-existing logfile segment for writing.
T
Tom Lane 已提交
2617
 */
2618
int
2619
XLogFileOpen(uint32 log, uint32 seg)
2620
{
2621 2622
	char		path[MAXPGPATH];
	int			fd;
2623

2624
	XLogFilePath(path, ThisTimeLineID, log, seg);
2625

2626
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2627
					   S_IRUSR | S_IWUSR);
2628
	if (fd < 0)
2629 2630
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2631 2632
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2633 2634 2635 2636 2637 2638

	return fd;
}

/*
 * Open a logfile segment for reading (during recovery).
2639
 *
2640
 * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2641
 * Otherwise, it's assumed to be already available in pg_xlog.
2642 2643
 */
static int
2644
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
2645
			 int source, bool notfoundOk)
2646 2647
{
	char		xlogfname[MAXFNAMELEN];
2648
	char		activitymsg[MAXFNAMELEN + 16];
2649
	char		path[MAXPGPATH];
2650
	int			fd;
2651

B
Bruce Momjian 已提交
2652
	XLogFileName(xlogfname, tli, log, seg);
2653

2654
	switch (source)
B
Bruce Momjian 已提交
2655
	{
2656 2657 2658 2659 2660
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg, false);
2661

2662 2663 2664 2665 2666 2667 2668 2669
			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
													  "RECOVERYXLOG",
													  XLogSegSize);
			if (!restoredFromArchive)
				return -1;
			break;

		case XLOG_FROM_PG_XLOG:
2670
		case XLOG_FROM_STREAM:
2671 2672 2673 2674 2675 2676
			XLogFilePath(path, tli, log, seg);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
B
Bruce Momjian 已提交
2677
	}
2678

B
Bruce Momjian 已提交
2679 2680 2681 2682 2683
	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;
2684

B
Bruce Momjian 已提交
2685 2686 2687 2688 2689
		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg, false);

2690
		/* Track source of data in assorted state variables */
2691
		readSource = source;
2692 2693 2694 2695 2696
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

B
Bruce Momjian 已提交
2697 2698 2699 2700 2701 2702 2703 2704
		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
	return -1;
2705 2706 2707 2708 2709 2710 2711 2712
}

/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLIs.
 */
static int
2713
XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLIs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.	We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 */
	foreach(cell, expectedTLIs)
	{
		TimeLineID	tli = (TimeLineID) lfirst_int(cell);

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

2736 2737 2738 2739 2740 2741 2742 2743 2744
		if (sources & XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
			{
				elog(DEBUG1, "got WAL segment from archive");
				return fd;
			}
		}
2745

2746
		if (sources & XLOG_FROM_PG_XLOG)
2747
		{
2748
			fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
2749 2750 2751
			if (fd != -1)
				return fd;
		}
2752 2753 2754 2755 2756 2757 2758
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, log, seg);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
B
Bruce Momjian 已提交
2759 2760
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2761
	return -1;
2762 2763
}

2764 2765 2766 2767 2768 2769 2770 2771 2772
/*
 * Close the current logfile segment for writing.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
2773
	 * WAL segment files will not be re-read in normal operation, so we advise
2774
	 * the OS to release any cached pages.	But do not do so if WAL archiving
B
Bruce Momjian 已提交
2775 2776
	 * or streaming is active, because archiver and walsender process could
	 * use the cache to read the WAL segment.
2777
	 */
2778
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2779
	if (!XLogIsNeeded())
2780
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2781
#endif
2782

2783 2784
	if (close(openLogFile))
		ereport(PANIC,
B
Bruce Momjian 已提交
2785 2786 2787
				(errcode_for_file_access(),
				 errmsg("could not close log file %u, segment %u: %m",
						openLogId, openLogSeg)));
2788 2789 2790
	openLogFile = -1;
}

2791
/*
2792
 * Attempt to retrieve the specified file from off-line archival storage.
2793
 * If successful, fill "path" with its complete path (note that this will be
2794 2795
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
2796
 *
2797 2798 2799
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
2800 2801 2802 2803
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
2804
 */
2805 2806
static bool
RestoreArchivedFile(char *path, const char *xlogfname,
2807
					const char *recovername, off_t expectedSize)
2808
{
B
Bruce Momjian 已提交
2809 2810
	char		xlogpath[MAXPGPATH];
	char		xlogRestoreCmd[MAXPGPATH];
2811
	char		lastRestartPointFname[MAXPGPATH];
B
Bruce Momjian 已提交
2812 2813
	char	   *dp;
	char	   *endp;
2814
	const char *sp;
B
Bruce Momjian 已提交
2815
	int			rc;
2816
	bool		signaled;
2817
	struct stat stat_buf;
B
Bruce Momjian 已提交
2818 2819
	uint32		restartLog;
	uint32		restartSeg;
2820

2821
	/* In standby mode, restore_command might not be supplied */
2822
	if (recoveryRestoreCommand == NULL)
2823 2824
		goto not_available;

2825
	/*
B
Bruce Momjian 已提交
2826 2827 2828 2829
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
2830
	 *
B
Bruce Momjian 已提交
2831
	 * We could try to optimize this slightly by checking the local copy
B
Bruce Momjian 已提交
2832 2833 2834 2835
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
2836
	 *
2837 2838 2839
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
2840
	 *
2841 2842 2843 2844
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the recoveryRestoreCommand may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fallback to the segments remaining in current XLOGDIR later. The
B
Bruce Momjian 已提交
2845 2846
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
2847
	 */
2848
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2849 2850

	/*
2851
	 * Make sure there is no existing file named recovername.
2852 2853 2854 2855 2856 2857
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
2858
					 errmsg("could not stat file \"%s\": %m",
2859 2860 2861 2862 2863 2864 2865
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
2866
					 errmsg("could not remove file \"%s\": %m",
2867 2868 2869
							xlogpath)));
	}

2870 2871
	/*
	 * Calculate the archive file cutoff point for use during log shipping
2872 2873
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
2874 2875
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
2876 2877
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
2878 2879 2880
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
2881 2882 2883 2884
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
2885 2886 2887 2888 2889 2890 2891 2892 2893
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
		/* we shouldn't need anything earlier than last restart point */
2894
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2895 2896 2897 2898
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912
	/*
	 * construct the command to be executed
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
2913
					/* %p: relative path of target file */
2914
					sp++;
B
Bruce Momjian 已提交
2915
					StrNCpy(dp, xlogpath, endp - dp);
2916
					make_native_path(dp);
2917 2918 2919 2920 2921
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
B
Bruce Momjian 已提交
2922
					StrNCpy(dp, xlogfname, endp - dp);
2923 2924
					dp += strlen(dp);
					break;
2925 2926 2927 2928 2929 2930
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
B
Bruce Momjian 已提交
2953
			(errmsg_internal("executing restore command \"%s\"",
2954 2955
							 xlogRestoreCmd)));

2956 2957
	/*
	 * Set in_restore_command to tell the signal handler that we should exit
2958
	 * right away on SIGTERM. We know that we're at a safe point to do that.
2959 2960 2961 2962 2963
	 * Check if we had already received the signal, so that we don't miss a
	 * shutdown request received just before this.
	 */
	in_restore_command = true;
	if (shutdown_requested)
2964
		proc_exit(1);
2965

2966
	/*
2967
	 * Copy xlog from archival storage to XLOGDIR
2968 2969
	 */
	rc = system(xlogRestoreCmd);
2970 2971 2972

	in_restore_command = false;

2973 2974
	if (rc == 0)
	{
2975 2976 2977 2978 2979 2980 2981
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2982
			{
B
Bruce Momjian 已提交
2983
				int			elevel;
2984 2985 2986 2987 2988 2989 2990

				/*
				 * If we find a partial file in standby mode, we assume it's
				 * because it's just being copied to the archive, and keep
				 * trying.
				 *
				 * Otherwise treat a wrong-sized file as FATAL to ensure the
B
Bruce Momjian 已提交
2991
				 * DBA would notice it, but is that too strong? We could try
2992 2993
				 * to plow ahead with a local copy of the file ... but the
				 * problem is that there probably isn't one, and we'd
B
Bruce Momjian 已提交
2994 2995
				 * incorrectly conclude we've reached the end of WAL and we're
				 * done recovering ...
2996 2997 2998 2999 3000 3001
				 */
				if (StandbyMode && stat_buf.st_size < expectedSize)
					elevel = DEBUG1;
				else
					elevel = FATAL;
				ereport(elevel,
3002 3003 3004 3005
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
3006 3007
				return false;
			}
3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
P
Peter Eisentraut 已提交
3023
						 errmsg("could not stat file \"%s\": %m",
3024
								xlogpath)));
3025 3026 3027 3028
		}
	}

	/*
3029
	 * Remember, we rollforward UNTIL the restore fails so failure here is
B
Bruce Momjian 已提交
3030
	 * just part of the process... that makes it difficult to determine
B
Bruce Momjian 已提交
3031 3032 3033
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
3034 3035
	 *
	 * However, if the failure was due to any sort of signal, it's best to
B
Bruce Momjian 已提交
3036 3037 3038
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
3039
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3040 3041 3042 3043 3044
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
3045 3046 3047
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
3048
	 * unexpectedly.
3049
	 *
B
Bruce Momjian 已提交
3050 3051 3052 3053
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
3054
	 */
3055
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3056
		proc_exit(1);
3057

3058 3059 3060
	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
B
Bruce Momjian 已提交
3061 3062
		(errmsg("could not restore file \"%s\" from archive: return code %d",
				xlogfname, rc)));
3063

3064
not_available:
B
Bruce Momjian 已提交
3065

3066
	/*
B
Bruce Momjian 已提交
3067 3068
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
3069
	 *
B
Bruce Momjian 已提交
3070 3071
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
3072
	 */
3073
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3074
	return false;
3075 3076
}

3077
/*
3078 3079 3080 3081 3082 3083 3084
 * Attempt to execute an external shell command during recovery.
 *
 * 'command' is the shell command to be executed, 'commandName' is a
 * human-readable name describing the command emitted in the logs. If
 * 'failonSignal' is true and the command is killed by a signal, a FATAL
 * error is thrown. Otherwise a WARNING is emitted.
 *
3085
 * This is currently used for restore_end_command and archive_cleanup_command.
3086 3087
 */
static void
3088
ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
3089
{
3090
	char		xlogRecoveryCmd[MAXPGPATH];
3091 3092 3093 3094 3095 3096 3097 3098 3099
	char		lastRestartPointFname[MAXPGPATH];
	char	   *dp;
	char	   *endp;
	const char *sp;
	int			rc;
	bool		signaled;
	uint32		restartLog;
	uint32		restartSeg;

3100
	Assert(command && commandName);
3101 3102 3103

	/*
	 * Calculate the archive file cutoff point for use during log shipping
3104 3105
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
3106
	 */
3107 3108 3109 3110 3111 3112 3113
	LWLockAcquire(ControlFileLock, LW_SHARED);
	XLByteToSeg(ControlFile->checkPointCopy.redo,
				restartLog, restartSeg);
	XLogFileName(lastRestartPointFname,
				 ControlFile->checkPointCopy.ThisTimeLineID,
				 restartLog, restartSeg);
	LWLockRelease(ControlFileLock);
3114 3115 3116 3117

	/*
	 * construct the command to be executed
	 */
3118 3119
	dp = xlogRecoveryCmd;
	endp = xlogRecoveryCmd + MAXPGPATH - 1;
3120 3121
	*endp = '\0';

3122
	for (sp = command; *sp; sp++)
3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
3156
			(errmsg_internal("executing %s \"%s\"", commandName, command)));
3157 3158

	/*
T
Tom Lane 已提交
3159
	 * execute the constructed command
3160
	 */
3161
	rc = system(xlogRecoveryCmd);
3162 3163 3164 3165
	if (rc != 0)
	{
		/*
		 * If the failure was due to any sort of signal, it's best to punt and
3166
		 * abort recovery. See also detailed comments on signals in
3167 3168 3169 3170
		 * RestoreArchivedFile().
		 */
		signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

3171 3172 3173 3174 3175 3176 3177
		/*
		 * translator: First %s represents a recovery.conf parameter name like
		 * "recovery_end_command", and the 2nd is the value of that parameter.
		 */
		ereport((signaled && failOnSignal) ? FATAL : WARNING,
				(errmsg("%s \"%s\": return code %d", commandName,
						command, rc)));
3178 3179 3180
	}
}

V
Vadim B. Mikheev 已提交
3181
/*
3182 3183 3184 3185 3186 3187 3188 3189
 * Preallocate log files beyond the specified log endpoint.
 *
 * XXX this is currently extremely conservative, since it forces only one
 * future log segment to exist, and even that only if we are 75% done with
 * the current one.  This is only appropriate for very low-WAL-volume systems.
 * High-volume systems will be OK once they've built up a sufficient set of
 * recycled log segments, but the startup transient is likely to include
 * a lot of segment creations by foreground processes, which is not so good.
T
Tom Lane 已提交
3190
 */
3191
static void
T
Tom Lane 已提交
3192 3193 3194 3195 3196
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
	int			lf;
3197
	bool		use_existent;
T
Tom Lane 已提交
3198 3199

	XLByteToPrevSeg(endptr, _logId, _logSeg);
B
Bruce Momjian 已提交
3200
	if ((endptr.xrecoff - 1) % XLogSegSize >=
B
Bruce Momjian 已提交
3201
		(uint32) (0.75 * XLogSegSize))
T
Tom Lane 已提交
3202 3203
	{
		NextLogSeg(_logId, _logSeg);
3204 3205
		use_existent = true;
		lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
T
Tom Lane 已提交
3206
		close(lf);
3207
		if (!use_existent)
3208
			CheckpointStats.ckpt_segs_added++;
T
Tom Lane 已提交
3209 3210 3211
	}
}

3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227
/*
 * Report the log/seg of the most recently removed or recycled WAL segment.
 *
 * The pair is read from shared memory under info_lck and handed back through
 * *log and *seg.  Both are reported as 0 if no WAL segments have been
 * removed since startup.
 */
void
XLogGetLastRemoved(uint32 *log, uint32 *seg)
{
	/* access shared state via a volatile pointer to prevent rearrangement */
	volatile XLogCtlData *shared = XLogCtl;
	uint32		removedLog;
	uint32		removedSeg;

	/* take a consistent snapshot of both fields */
	SpinLockAcquire(&shared->info_lck);
	removedLog = shared->lastRemovedLog;
	removedSeg = shared->lastRemovedSeg;
	SpinLockRelease(&shared->info_lck);

	*log = removedLog;
	*seg = removedSeg;
}

3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252
/*
 * Update the last removed log/seg pointer in shared memory, to reflect
 * that the given XLOG file has been removed.
 */
static void
UpdateLastRemovedPtr(char *filename)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	uint32		tli,
				log,
				seg;

	XLogFromFileName(filename, &tli, &log, &seg);

	SpinLockAcquire(&xlogctl->info_lck);
	if (log > xlogctl->lastRemovedLog ||
		(log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
	{
		xlogctl->lastRemovedLog = log;
		xlogctl->lastRemovedSeg = seg;
	}
	SpinLockRelease(&xlogctl->info_lck);
}

T
Tom Lane 已提交
3253
/*
3254
 * Recycle or remove all log files older or equal to passed log/seg#
3255 3256 3257
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
3258 3259
 */
static void
3260
RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
3261
{
3262 3263
	uint32		endlogId;
	uint32		endlogSeg;
3264
	int			max_advance;
B
Bruce Momjian 已提交
3265 3266
	DIR		   *xldir;
	struct dirent *xlde;
3267
	char		lastoff[MAXFNAMELEN];
B
Bruce Momjian 已提交
3268
	char		path[MAXPGPATH];
B
Bruce Momjian 已提交
3269

3270 3271 3272
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
3273
	struct stat statbuf;
3274

3275 3276
	elog(DEBUG2, "removing WAL segments older than log file %u, segment %u",
		 log, seg);
V
Vadim B. Mikheev 已提交
3277

3278 3279 3280 3281
	/*
	 * Initialize info about where to try to recycle to.  We allow recycling
	 * segments up to XLOGfileslop segments beyond the current XLOG location.
	 */
3282
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3283
	max_advance = XLOGfileslop;
V
Vadim B. Mikheev 已提交
3284

3285
	xldir = AllocateDir(XLOGDIR);
V
Vadim B. Mikheev 已提交
3286
	if (xldir == NULL)
3287
		ereport(ERROR,
3288
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3289 3290
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
V
Vadim B. Mikheev 已提交
3291

3292
	XLogFileName(lastoff, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
3293

3294
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
V
Vadim B. Mikheev 已提交
3295
	{
3296
		/*
3297
		 * We ignore the timeline part of the XLOG segment identifiers in
B
Bruce Momjian 已提交
3298 3299 3300 3301 3302
		 * deciding whether a segment is still needed.	This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
3303
		 *
B
Bruce Momjian 已提交
3304 3305
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
3306
		 */
3307 3308 3309
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
V
Vadim B. Mikheev 已提交
3310
		{
3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321
			/*
			 * Normally we don't delete old XLOG files during recovery to
			 * avoid accidentally deleting a file that looks stale due to a
			 * bug or hardware issue, but in fact contains important data.
			 * During streaming recovery, however, we will eventually fill the
			 * disk if we never clean up, so we have to. That's not an issue
			 * with file-based archive recovery because in that case we
			 * restore one XLOG file at a time, on-demand, and with a
			 * different filename that can't be confused with regular XLOG
			 * files.
			 */
3322
			if (WalRcvInProgress() || XLogArchiveCheckDone(xlde->d_name))
3323
			{
3324
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3325

3326 3327 3328
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

3329
				/*
B
Bruce Momjian 已提交
3330
				 * Before deleting the file, see if it can be recycled as a
3331 3332 3333
				 * future log segment. Only recycle normal files, pg_standby
				 * for example can create symbolic links pointing to a
				 * separate archive directory.
3334
				 */
3335 3336 3337
				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
					InstallXLogFileSegment(&endlogId, &endlogSeg, path,
										   true, &max_advance, true))
3338
				{
3339
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3340 3341
							(errmsg("recycled transaction log file \"%s\"",
									xlde->d_name)));
3342
					CheckpointStats.ckpt_segs_recycled++;
3343 3344 3345 3346 3347 3348
					/* Needn't recheck that slot on future iterations */
					if (max_advance > 0)
					{
						NextLogSeg(endlogId, endlogSeg);
						max_advance--;
					}
3349 3350 3351 3352
				}
				else
				{
					/* No need for any more future segments... */
B
Bruce Momjian 已提交
3353
					int			rc;
3354

3355
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3356 3357
							(errmsg("removing transaction log file \"%s\"",
									xlde->d_name)));
3358 3359

#ifdef WIN32
B
Bruce Momjian 已提交
3360

3361 3362 3363 3364
					/*
					 * On Windows, if another process (e.g another backend)
					 * holds the file open in FILE_SHARE_DELETE mode, unlink
					 * will succeed, but the file will still show up in
B
Bruce Momjian 已提交
3365 3366 3367 3368
					 * directory listing until the last handle is closed. To
					 * avoid confusing the lingering deleted file for a live
					 * WAL file that needs to be archived, rename it before
					 * deleting it.
3369 3370 3371 3372 3373 3374 3375
					 *
					 * If another process holds the file open without
					 * FILE_SHARE_DELETE flag, rename will fail. We'll try
					 * again at the next checkpoint.
					 */
					snprintf(newpath, MAXPGPATH, "%s.deleted", path);
					if (rename(path, newpath) != 0)
3376 3377
					{
						ereport(LOG,
3378
								(errcode_for_file_access(),
3379
								 errmsg("could not rename old transaction log file \"%s\": %m",
3380
										path)));
3381 3382
						continue;
					}
3383 3384 3385 3386 3387
					rc = unlink(newpath);
#else
					rc = unlink(path);
#endif
					if (rc != 0)
3388 3389
					{
						ereport(LOG,
3390 3391 3392
								(errcode_for_file_access(),
								 errmsg("could not remove old transaction log file \"%s\": %m",
										path)));
3393 3394
						continue;
					}
3395
					CheckpointStats.ckpt_segs_removed++;
3396
				}
3397 3398

				XLogArchiveCleanup(xlde->d_name);
3399
			}
V
Vadim B. Mikheev 已提交
3400 3401
		}
	}
B
Bruce Momjian 已提交
3402

3403
	FreeDir(xldir);
V
Vadim B. Mikheev 已提交
3404 3405
}

3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422
/*
 * Verify whether pg_xlog and pg_xlog/archive_status exist.
 * If the latter does not exist, recreate it.
 *
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a cluster
 * copy for PITR purposes but omitted pg_xlog from the copy.
 *
 * We could also recreate pg_xlog if it doesn't exist, but a deliberate
 * policy decision was made not to.  It is fairly common for pg_xlog to be
 * a symlink, and if that was the DBA's intent then automatically making a
 * plain directory would result in degraded performance with no notice.
 */
static void
ValidateXLOGDirectoryStructure(void)
{
	char		path[MAXPGPATH];
3423
	struct stat stat_buf;
3424 3425 3426 3427

	/* Check for pg_xlog; if it doesn't exist, error out */
	if (stat(XLOGDIR, &stat_buf) != 0 ||
		!S_ISDIR(stat_buf.st_mode))
3428
		ereport(FATAL,
3429 3430 3431 3432 3433 3434 3435 3436 3437
				(errmsg("required WAL directory \"%s\" does not exist",
						XLOGDIR)));

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	if (stat(path, &stat_buf) == 0)
	{
		/* Check for weird cases where it exists but isn't a directory */
		if (!S_ISDIR(stat_buf.st_mode))
3438
			ereport(FATAL,
3439 3440 3441 3442 3443 3444 3445 3446
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
	}
	else
	{
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
		if (mkdir(path, 0700) < 0)
3447
			ereport(FATAL,
3448 3449 3450 3451 3452
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
	}
}

3453
/*
3454 3455 3456
 * Remove previous backup history files.  This also retries creation of
 * .ready files for any backup history files for which XLogArchiveNotify
 * failed earlier.
3457 3458
 */
static void
3459
CleanupBackupHistory(void)
3460 3461 3462 3463 3464
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		path[MAXPGPATH];

3465
	xldir = AllocateDir(XLOGDIR);
3466 3467 3468
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3469 3470
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
3471

3472
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3473 3474 3475 3476 3477 3478
	{
		if (strlen(xlde->d_name) > 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
				   ".backup") == 0)
		{
3479
			if (XLogArchiveCheckDone(xlde->d_name))
3480 3481
			{
				ereport(DEBUG2,
B
Bruce Momjian 已提交
3482 3483
				(errmsg("removing transaction log backup history file \"%s\"",
						xlde->d_name)));
3484
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3485 3486 3487 3488 3489 3490 3491 3492 3493
				unlink(path);
				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

	FreeDir(xldir);
}

T
Tom Lane 已提交
3494 3495 3496 3497
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
3498 3499 3500 3501 3502 3503 3504 3505 3506
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
3507 3508
 *
 * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3509 3510 3511 3512 3513
 * Otherwise, a normal exclusive lock is used.	During crash recovery, that's
 * just pro forma because there can't be any regular backends in the system,
 * but in hot standby mode the distinction is important. The 'cleanup'
 * argument applies to all backup blocks in the WAL record, that suffices for
 * now.
T
Tom Lane 已提交
3514
 */
3515 3516
void
RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3517 3518 3519 3520 3521 3522 3523
{
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

3524 3525 3526
	if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
		return;

B
Bruce Momjian 已提交
3527
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
3528
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3529
	{
T
Tom Lane 已提交
3530
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3531 3532
			continue;

3533
		memcpy(&bkpb, blk, sizeof(BkpBlock));
3534 3535
		blk += sizeof(BkpBlock);

3536 3537
		buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
										RBM_ZERO);
3538
		Assert(BufferIsValid(buffer));
3539 3540 3541 3542 3543
		if (cleanup)
			LockBufferForCleanup(buffer);
		else
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

3544
		page = (Page) BufferGetPage(buffer);
3545

3546
		if (bkpb.hole_length == 0)
3547
		{
3548 3549 3550 3551 3552 3553 3554 3555 3556 3557
			memcpy((char *) page, blk, BLCKSZ);
		}
		else
		{
			/* must zero-fill the hole */
			MemSet((char *) page, 0, BLCKSZ);
			memcpy((char *) page, blk, bkpb.hole_offset);
			memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
				   blk + bkpb.hole_offset,
				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3558 3559
		}

3560 3561
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
3562 3563
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
3564

3565
		blk += BLCKSZ - bkpb.hole_length;
3566 3567 3568
	}
}

T
Tom Lane 已提交
3569 3570 3571 3572 3573 3574 3575
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
3576 3577 3578
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
3579
	pg_crc32	crc;
3580 3581
	int			i;
	uint32		len = record->xl_len;
3582
	BkpBlock	bkpb;
3583 3584
	char	   *blk;

3585 3586 3587
	/* First the rmgr data */
	INIT_CRC32(crc);
	COMP_CRC32(crc, XLogRecGetData(record), len);
3588

3589
	/* Add in the backup blocks, if any */
B
Bruce Momjian 已提交
3590
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
3591
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3592
	{
B
Bruce Momjian 已提交
3593
		uint32		blen;
3594

T
Tom Lane 已提交
3595
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3596 3597
			continue;

3598 3599
		memcpy(&bkpb, blk, sizeof(BkpBlock));
		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3600
		{
3601
			ereport(emode_for_corrupt_record(emode, recptr),
3602 3603 3604
					(errmsg("incorrect hole size in record at %X/%X",
							recptr.xlogid, recptr.xrecoff)));
			return false;
3605
		}
3606 3607 3608 3609 3610 3611 3612 3613
		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
		COMP_CRC32(crc, blk, blen);
		blk += blen;
	}

	/* Check that xl_tot_len agrees with our calculation */
	if (blk != (char *) record + record->xl_tot_len)
	{
3614
		ereport(emode_for_corrupt_record(emode, recptr),
3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626
				(errmsg("incorrect total length in record at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	/* Finally include the record header */
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);

	if (!EQ_CRC32(record->xl_crc, crc))
	{
3627
		ereport(emode_for_corrupt_record(emode, recptr),
B
Bruce Momjian 已提交
3628 3629
		(errmsg("incorrect resource manager data checksum in record at %X/%X",
				recptr.xlogid, recptr.xrecoff)));
3630
		return false;
3631 3632
	}

3633
	return true;
3634 3635
}

T
Tom Lane 已提交
3636 3637 3638 3639 3640 3641
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
3642
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3643
 * (emode must be either PANIC, LOG)
T
Tom Lane 已提交
3644
 *
3645 3646
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
T
Tom Lane 已提交
3647
 */
3648
static XLogRecord *
3649
ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3650
{
3651
	XLogRecord *record;
3652
	char	   *buffer;
3653
	XLogRecPtr	tmpRecPtr = EndRecPtr;
3654
	bool		randAccess = false;
T
Tom Lane 已提交
3655 3656
	uint32		len,
				total_len;
3657 3658
	uint32		targetRecOff;
	uint32		pageHeaderSize;
T
Tom Lane 已提交
3659 3660 3661 3662

	if (readBuf == NULL)
	{
		/*
B
Bruce Momjian 已提交
3663 3664 3665 3666 3667
		 * First time through, permanently allocate readBuf.  We do it this
		 * way, rather than just making a static array, for two reasons: (1)
		 * no need to waste the storage in most instantiations of the backend;
		 * (2) a static char array isn't guaranteed to have any particular
		 * alignment, whereas malloc() will provide MAXALIGN'd storage.
T
Tom Lane 已提交
3668
		 */
3669
		readBuf = (char *) malloc(XLOG_BLCKSZ);
T
Tom Lane 已提交
3670 3671
		Assert(readBuf != NULL);
	}
3672

T
Tom Lane 已提交
3673
	if (RecPtr == NULL)
3674
	{
3675
		RecPtr = &tmpRecPtr;
3676 3677

		/*
B
Bruce Momjian 已提交
3678 3679
		 * Align recptr to next page if no more records can fit on the current
		 * page.
3680
		 */
3681 3682
		if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
		{
3683
			NextLogPage(tmpRecPtr);
3684 3685
			/* We will account for page header size below */
		}
3686 3687 3688 3689 3690 3691

		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
3692 3693 3694 3695 3696 3697 3698
	}
	else
	{
		if (!XRecOffIsValid(RecPtr->xrecoff))
			ereport(PANIC,
					(errmsg("invalid record offset at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
B
Bruce Momjian 已提交
3699

3700
		/*
B
Bruce Momjian 已提交
3701 3702 3703 3704 3705
		 * Since we are going to a random position in WAL, forget any prior
		 * state about what timeline we were in, and allow it to be any
		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
		 * to go backwards (but we can't reset that variable right here, since
		 * we might not change files at all).
3706 3707 3708
		 */
		lastPageTLI = 0;		/* see comment in ValidXLOGHeader */
		randAccess = true;		/* allow curFileTLI to go backwards too */
3709 3710
	}

3711 3712 3713
	/* This is the first try to read this page. */
	failedSources = 0;
retry:
3714 3715 3716
	/* Read the page containing the record */
	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
		return NULL;
3717

3718
	pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3719
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3720 3721 3722
	if (targetRecOff == 0)
	{
		/*
B
Bruce Momjian 已提交
3723 3724 3725
		 * Can only get here in the continuing-from-prev-page case, because
		 * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
		 * to skip over the new page's header.
3726 3727 3728 3729 3730 3731
		 */
		tmpRecPtr.xrecoff += pageHeaderSize;
		targetRecOff = pageHeaderSize;
	}
	else if (targetRecOff < pageHeaderSize)
	{
3732
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3733 3734 3735 3736
				(errmsg("invalid record offset at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
T
Tom Lane 已提交
3737
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3738
		targetRecOff == pageHeaderSize)
3739
	{
3740
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3741 3742
				(errmsg("contrecord is requested by %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3743 3744
		goto next_record_is_invalid;
	}
3745
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3746

T
Tom Lane 已提交
3747
	/*
B
Bruce Momjian 已提交
3748 3749
	 * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
	 * required.
T
Tom Lane 已提交
3750
	 */
3751 3752 3753 3754
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		if (record->xl_len != 0)
		{
3755
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3756 3757 3758 3759 3760 3761
					(errmsg("invalid xlog switch record at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else if (record->xl_len == 0)
3762
	{
3763
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3764 3765
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3766 3767
		goto next_record_is_invalid;
	}
3768 3769 3770 3771
	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
	{
3772
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3773 3774 3775 3776
				(errmsg("invalid record length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
3777 3778
	if (record->xl_rmid > RM_MAX_ID)
	{
3779
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3780
				(errmsg("invalid resource manager ID %u at %X/%X",
B
Bruce Momjian 已提交
3781
						record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3782 3783
		goto next_record_is_invalid;
	}
3784 3785 3786
	if (randAccess)
	{
		/*
B
Bruce Momjian 已提交
3787 3788
		 * We can't exactly verify the prev-link, but surely it should be less
		 * than the record's own address.
3789 3790 3791
		 */
		if (!XLByteLT(record->xl_prev, *RecPtr))
		{
3792
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3793 3794 3795 3796 3797 3798 3799 3800 3801
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else
	{
		/*
B
Bruce Momjian 已提交
3802 3803 3804
		 * Record's prev-link should exactly match our previous location. This
		 * check guards against torn WAL pages where a stale but valid-looking
		 * WAL record starts on a sector boundary.
3805 3806 3807
		 */
		if (!XLByteEQ(record->xl_prev, ReadRecPtr))
		{
3808
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3809 3810 3811 3812 3813 3814
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
B
Bruce Momjian 已提交
3815

T
Tom Lane 已提交
3816
	/*
B
Bruce Momjian 已提交
3817
	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3818 3819 3820 3821
	 * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
	 * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
	 * enough for all "normal" records, but very large commit or abort records
	 * might need more space.)
T
Tom Lane 已提交
3822
	 */
3823
	total_len = record->xl_tot_len;
3824
	if (total_len > readRecordBufSize)
3825
	{
3826 3827
		uint32		newSize = total_len;

3828 3829
		newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
		newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3830 3831 3832 3833 3834 3835 3836
		if (readRecordBuf)
			free(readRecordBuf);
		readRecordBuf = (char *) malloc(newSize);
		if (!readRecordBuf)
		{
			readRecordBufSize = 0;
			/* We treat this as a "bogus data" condition */
3837
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3838 3839 3840 3841 3842
					(errmsg("record length %u at %X/%X too long",
							total_len, RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
		readRecordBufSize = newSize;
3843
	}
3844 3845

	buffer = readRecordBuf;
3846
	len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
T
Tom Lane 已提交
3847
	if (total_len > len)
3848
	{
T
Tom Lane 已提交
3849 3850
		/* Need to reassemble record */
		XLogContRecord *contrecord;
3851
		XLogRecPtr	pagelsn;
B
Bruce Momjian 已提交
3852
		uint32		gotlen = len;
3853

3854 3855 3856 3857
		/* Initialize pagelsn to the beginning of the page this record is on */
		pagelsn = *RecPtr;
		pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;

T
Tom Lane 已提交
3858
		memcpy(buffer, record, len);
3859
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
3860
		buffer += len;
3861
		for (;;)
3862
		{
3863 3864 3865
			/* Calculate pointer to beginning of next page */
			pagelsn.xrecoff += XLOG_BLCKSZ;
			if (pagelsn.xrecoff >= XLogFileSize)
3866
			{
3867 3868
				(pagelsn.xlogid)++;
				pagelsn.xrecoff = 0;
3869
			}
3870 3871 3872
			/* Wait for the next page to become available */
			if (!XLogPageRead(&pagelsn, emode, false, false))
				return NULL;
3873

3874
			/* Check that the continuation record looks valid */
T
Tom Lane 已提交
3875
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3876
			{
3877
				ereport(emode_for_corrupt_record(emode, *RecPtr),
3878 3879
						(errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
								readId, readSeg, readOff)));
3880 3881
				goto next_record_is_invalid;
			}
3882 3883
			pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
			contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
B
Bruce Momjian 已提交
3884
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
3885
				total_len != (contrecord->xl_rem_len + gotlen))
3886
			{
3887
				ereport(emode_for_corrupt_record(emode, *RecPtr),
3888 3889 3890
						(errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
								contrecord->xl_rem_len,
								readId, readSeg, readOff)));
3891 3892
				goto next_record_is_invalid;
			}
3893
			len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
T
Tom Lane 已提交
3894
			if (contrecord->xl_rem_len > len)
3895
			{
B
Bruce Momjian 已提交
3896
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
3897 3898 3899 3900 3901 3902 3903 3904
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
3905
		if (!RecordIsValid(record, *RecPtr, emode))
T
Tom Lane 已提交
3906
			goto next_record_is_invalid;
3907
		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
T
Tom Lane 已提交
3908 3909
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3910 3911
			pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3912

T
Tom Lane 已提交
3913
		ReadRecPtr = *RecPtr;
3914
		/* needn't worry about XLOG SWITCH, it can't cross page boundaries */
T
Tom Lane 已提交
3915
		return record;
3916 3917
	}

T
Tom Lane 已提交
3918
	/* Record does not cross a page boundary */
3919
	if (!RecordIsValid(record, *RecPtr, emode))
T
Tom Lane 已提交
3920 3921 3922
		goto next_record_is_invalid;
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3923

T
Tom Lane 已提交
3924 3925
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
B
Bruce Momjian 已提交
3926

3927 3928 3929 3930 3931 3932 3933 3934
	/*
	 * Special processing if it's an XLOG SWITCH record
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		/* Pretend it extends to end of segment */
		EndRecPtr.xrecoff += XLogSegSize - 1;
		EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
B
Bruce Momjian 已提交
3935

3936
		/*
B
Bruce Momjian 已提交
3937 3938 3939
		 * Pretend that readBuf contains the last page of the segment. This is
		 * just to avoid Assert failure in StartupXLOG if XLOG ends with this
		 * segment.
3940 3941 3942
		 */
		readOff = XLogSegSize - XLOG_BLCKSZ;
	}
T
Tom Lane 已提交
3943
	return (XLogRecord *) buffer;
3944

3945 3946 3947
next_record_is_invalid:
	failedSources |= readSource;

3948 3949 3950 3951 3952
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
3953 3954 3955 3956 3957 3958

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return NULL;
3959 3960
}

3961 3962 3963 3964
/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
B
Bruce Momjian 已提交
3965
 * ReadRecord.	It's not intended for use from anywhere else.
3966 3967
 */
static bool
3968
ValidXLOGHeader(XLogPageHeader hdr, int emode)
3969
{
3970 3971
	XLogRecPtr	recaddr;

3972 3973 3974
	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;

3975 3976
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
3977
		ereport(emode_for_corrupt_record(emode, recaddr),
3978 3979
				(errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
						hdr->xlp_magic, readId, readSeg, readOff)));
3980 3981 3982 3983
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
3984
		ereport(emode_for_corrupt_record(emode, recaddr),
3985 3986
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
3987 3988
		return false;
	}
3989
	if (hdr->xlp_info & XLP_LONG_HEADER)
3990
	{
3991
		XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
B
Bruce Momjian 已提交
3992

3993
		if (longhdr->xlp_sysid != ControlFile->system_identifier)
3994
		{
3995 3996
			char		fhdrident_str[32];
			char		sysident_str[32];
3997

3998
			/*
B
Bruce Momjian 已提交
3999 4000
			 * Format sysids separately to keep platform-dependent format code
			 * out of the translatable message string.
4001 4002 4003 4004 4005
			 */
			snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
					 longhdr->xlp_sysid);
			snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
					 ControlFile->system_identifier);
4006
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4007 4008
					(errmsg("WAL file is from different database system"),
					 errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
B
Bruce Momjian 已提交
4009
							   fhdrident_str, sysident_str)));
4010 4011 4012 4013
			return false;
		}
		if (longhdr->xlp_seg_size != XLogSegSize)
		{
4014
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4015
					(errmsg("WAL file is from different database system"),
B
Bruce Momjian 已提交
4016
					 errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
4017 4018
			return false;
		}
4019 4020
		if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
		{
4021
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4022
					(errmsg("WAL file is from different database system"),
4023 4024 4025
					 errdetail("Incorrect XLOG_BLCKSZ in page header.")));
			return false;
		}
4026
	}
4027 4028 4029
	else if (readOff == 0)
	{
		/* hmm, first page of file doesn't have a long header? */
4030
		ereport(emode_for_corrupt_record(emode, recaddr),
4031 4032 4033 4034 4035
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}

4036 4037
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
4038
		ereport(emode_for_corrupt_record(emode, recaddr),
4039
				(errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
B
Bruce Momjian 已提交
4040
						hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
4041 4042 4043 4044 4045 4046 4047 4048 4049
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Check page TLI is one of the expected values.
	 */
	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
	{
4050
		ereport(emode_for_corrupt_record(emode, recaddr),
4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061
				(errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
						hdr->xlp_tli,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Since child timelines are always assigned a TLI greater than their
	 * immediate parent's TLI, we should never see TLI go backwards across
	 * successive pages of a consistent WAL sequence.
	 *
B
Bruce Momjian 已提交
4062 4063 4064
	 * Of course this check should only be applied when advancing sequentially
	 * across pages; therefore ReadRecord resets lastPageTLI to zero when
	 * going to a random page.
4065 4066 4067
	 */
	if (hdr->xlp_tli < lastPageTLI)
	{
4068
		ereport(emode_for_corrupt_record(emode, recaddr),
4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081
				(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
						hdr->xlp_tli, lastPageTLI,
						readId, readSeg, readOff)));
		return false;
	}
	lastPageTLI = hdr->xlp_tli;
	return true;
}

/*
 * Try to read a timeline's history file.
 *
 * On success, the result is the list of component TLIs: the given TLI first,
 * followed by its ancestor TLIs.  If the history file cannot be found, the
 * timeline is assumed to have no parents and a one-element list holding just
 * targetTLI is returned.
 *
 * Any other failure to open the file, a line that does not start with a
 * number, or TLIs that are not in increasing order are reported at FATAL
 * level.
 */
static List *
readTimeLineHistory(TimeLineID targetTLI)
{
	List	   *tlis = NIL;
	char		path[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		line[MAXPGPATH];
	FILE	   *histfile;

	/* Timeline 1 never has a history file, so don't bother looking */
	if (targetTLI == 1)
		return list_make1_int((int) targetTLI);

	if (!InArchiveRecovery)
	{
		TLHistoryFilePath(path, targetTLI);
	}
	else
	{
		TLHistoryFileName(histfname, targetTLI);
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
	}

	histfile = AllocateFile(path, "r");
	if (histfile == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		/* No such file: treat the timeline as having no parents */
		return list_make1_int((int) targetTLI);
	}

	/*
	 * Walk through the file one line at a time.
	 */
	while (fgets(line, sizeof(line), histfile) != NULL)
	{
		char	   *field = line;
		char	   *endptr;
		TimeLineID	tli;

		/* Step over leading whitespace; ignore blank and # comment lines */
		while (*field != '\0' && isspace((unsigned char) *field))
			field++;
		if (*field == '\0' || *field == '#')
			continue;

		/* The first field of the line has to be a numeric timeline ID */
		tli = (TimeLineID) strtoul(field, &endptr, 0);
		if (endptr == field)
			ereport(FATAL,
					(errmsg("syntax error in history file: %s", line),
					 errhint("Expected a numeric timeline ID.")));

		/* Each entry must be greater than the one before it */
		if (tlis != NIL &&
			tli <= (TimeLineID) linitial_int(tlis))
			ereport(FATAL,
					(errmsg("invalid data in history file: %s", line),
				   errhint("Timeline IDs must be in increasing sequence.")));

		/* Prepend, so the newest entry ends up at the head of the list */
		tlis = lcons_int((int) tli, tlis);

		/* whatever follows the ID on the line is ignored */
	}

	FreeFile(histfile);

	/* The target itself must be newer than everything listed in its file */
	if (tlis != NIL &&
		targetTLI <= (TimeLineID) linitial_int(tlis))
		ereport(FATAL,
				(errmsg("invalid data in history file \"%s\"", path),
			errhint("Timeline IDs must be less than child timeline's ID.")));

	tlis = lcons_int((int) targetTLI, tlis);

	ereport(DEBUG3,
			(errmsg_internal("history of timeline %u is %s",
							 targetTLI, nodeToString(tlis))));

	return tlis;
}

/*
 * Probe whether a timeline history file exists for the given timeline ID.
 *
 * Returns true if the file could be opened for reading, false if it does
 * not exist (timeline 1 is always reported as having none).  Any other
 * failure to open the file is reported at FATAL level.
 */
static bool
existsTimeLineHistory(TimeLineID probeTLI)
{
	char		path[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	FILE	   *probe;

	/* Timeline 1 never has a history file, so don't bother looking */
	if (probeTLI == 1)
		return false;

	if (!InArchiveRecovery)
	{
		TLHistoryFilePath(path, probeTLI);
	}
	else
	{
		TLHistoryFileName(histfname, probeTLI);
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
	}

	probe = AllocateFile(path, "r");
	if (probe == NULL)
	{
		/* Only "no such file" counts as a clean negative answer */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		return false;
	}

	FreeFile(probe);
	return true;
}

/*
 * Find the newest existing timeline, assuming that startTLI exists.
 *
 * Note: while this is somewhat heuristic, it does positively guarantee
 * that (result + 1) is not a known timeline, and therefore it should
 * be safe to assign that ID to a new timeline.
 */
static TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
	TimeLineID	newestTLI = startTLI;
	TimeLineID	probeTLI = startTLI + 1;

	/*
	 * Simply keep probing successive timeline IDs for a history file and
	 * stop at the first one that has none.  XXX is it useful to allow gaps
	 * in the sequence?
	 */
	while (existsTimeLineHistory(probeTLI))
	{
		newestTLI = probeTLI;	/* this one exists, try the next */
		probeTLI++;
	}

	return newestTLI;
}

/*
 * Create a new timeline history file.
 *
 *	newTLI: ID of the new timeline
 *	parentTLI: ID of its immediate parent
 *	endTLI et al: ID of the last used WAL file, for annotation purposes
 *
 * The file is built under a temporary name: the parent's history file (if
 * one exists) is copied verbatim, then one line describing this timeline
 * switch is appended.  The temp file is fsync'd and closed before being
 * linked or renamed to its final name, and finally the archiver is notified.
 * All failures are reported with ereport(ERROR).
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		xlogfname[MAXFNAMELEN];
	char		buffer[BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;

	Assert(newTLI > parentTLI); /* else bad selection of newTLI */

	/*
	 * Write into a temp file name.
	 */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	/* get rid of any leftover file of the same name; O_EXCL below needs it */
	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * If a history file exists for the parent, copy it verbatim
	 */
	if (InArchiveRecovery)
	{
		TLHistoryFileName(histfname, parentTLI);
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
	}
	else
		TLHistoryFilePath(path, parentTLI);

	srcfd = BasicOpenFile(path, O_RDONLY, 0);
	if (srcfd < 0)
	{
		if (errno != ENOENT)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		/* Not there, so assume parent has no parents */
	}
	else
	{
		for (;;)
		{
			errno = 0;
			nbytes = (int) read(srcfd, buffer, sizeof(buffer));
			if (nbytes < 0 || errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			if (nbytes == 0)
				break;			/* end of parent file */
			errno = 0;
			if ((int) write(fd, buffer, nbytes) != nbytes)
			{
				int			save_errno = errno;

				/*
				 * If we fail to make the file, delete it to release disk
				 * space
				 */
				unlink(tmppath);

				/*
				 * if write didn't set errno, assume problem is no disk space
				 */
				errno = save_errno ? save_errno : ENOSPC;

				ereport(ERROR,
						(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
			}
		}
		close(srcfd);
	}

	/*
	 * Append one line with the details of this timeline split.
	 *
	 * If we did have a parent file, insert an extra newline just in case the
	 * parent file failed to end with one.
	 */
	XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);

	/*
	 * Write comment to history file to explain why and where timeline
	 * changed. Comment varies according to the recovery target used.
	 *
	 * The three cases are mutually exclusive and must form a single
	 * if / else if / else chain: without the "else" on the second test, the
	 * final branch would overwrite the line just formatted for an XID target
	 * with "no recovery target specified".
	 */
	if (recoveryTarget == RECOVERY_TARGET_XID)
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\t%s transaction %u\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname,
				 recoveryStopAfter ? "after" : "before",
				 recoveryStopXid);
	else if (recoveryTarget == RECOVERY_TARGET_TIME)
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\t%s %s\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname,
				 recoveryStopAfter ? "after" : "before",
				 timestamptz_to_str(recoveryStopTime));
	else
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\tno recovery target specified\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname);

	nbytes = strlen(buffer);
	errno = 0;
	if ((int) write(fd, buffer, nbytes) != nbytes)
	{
		int			save_errno = errno;

		/*
		 * If we fail to make the file, delete it to release disk space
		 */
		unlink(tmppath);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));


	/*
	 * Now move the completed history file into place with its final name.
	 */
	TLHistoryFilePath(path, newTLI);

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK
	if (link(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						tmppath, path)));
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
#endif

	/* The history file can be archived immediately. */
	TLHistoryFileName(histfname, newTLI);
	XLogArchiveNotify(histfname);
}

/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
4456
	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */
4457 4458

	/*
T
Tom Lane 已提交
4459
	 * Initialize version and compatibility-check fields
4460
	 */
T
Tom Lane 已提交
4461 4462
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4463 4464 4465 4466

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

4467 4468
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
4469
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4470
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4471 4472

	ControlFile->nameDataLen = NAMEDATALEN;
4473
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4474

4475 4476
	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

4477
#ifdef HAVE_INT64_TIMESTAMP
4478
	ControlFile->enableIntTimes = true;
4479
#else
4480
	ControlFile->enableIntTimes = false;
4481
#endif
4482 4483
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4484

T
Tom Lane 已提交
4485
	/* Contents are protected with a CRC */
4486 4487 4488 4489 4490
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
T
Tom Lane 已提交
4491

4492
	/*
4493 4494 4495 4496 4497
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
4498
	 */
4499 4500
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4501

4502
	memset(buffer, 0, PG_CONTROL_SIZE);
4503 4504
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

4505 4506
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4507
					   S_IRUSR | S_IWUSR);
4508
	if (fd < 0)
4509 4510 4511
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create control file \"%s\": %m",
4512
						XLOG_CONTROL_FILE)));
4513

4514
	errno = 0;
4515
	if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4516 4517 4518 4519
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4520 4521
		ereport(PANIC,
				(errcode_for_file_access(),
4522
				 errmsg("could not write to control file: %m")));
4523
	}
4524

4525
	if (pg_fsync(fd) != 0)
4526 4527
		ereport(PANIC,
				(errcode_for_file_access(),
4528
				 errmsg("could not fsync control file: %m")));
4529

4530 4531 4532 4533
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4534 4535 4536 4537 4538
}

static void
ReadControlFile(void)
{
4539
	pg_crc32	crc;
4540 4541 4542 4543 4544
	int			fd;

	/*
	 * Read data...
	 */
4545 4546 4547
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4548
	if (fd < 0)
4549 4550 4551
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4552
						XLOG_CONTROL_FILE)));
4553 4554

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4555 4556
		ereport(PANIC,
				(errcode_for_file_access(),
4557
				 errmsg("could not read from control file: %m")));
4558 4559 4560

	close(fd);

T
Tom Lane 已提交
4561
	/*
B
Bruce Momjian 已提交
4562 4563 4564 4565
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
T
Tom Lane 已提交
4566
	 */
4567 4568 4569 4570 4571

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4572 4573
		 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
			ControlFile->pg_control_version, ControlFile->pg_control_version,
4574 4575 4576
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

T
Tom Lane 已提交
4577
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4578 4579 4580
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
B
Bruce Momjian 已提交
4581 4582
				  " but the server was compiled with PG_CONTROL_VERSION %d.",
						ControlFile->pg_control_version, PG_CONTROL_VERSION),
4583
				 errhint("It looks like you need to initdb.")));
4584

T
Tom Lane 已提交
4585
	/* Now check the CRC. */
4586 4587 4588 4589 4590
	INIT_CRC32(crc);
	COMP_CRC32(crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(crc);
4591

4592
	if (!EQ_CRC32(crc, ControlFile->crc))
4593
		ereport(FATAL,
4594
				(errmsg("incorrect checksum in control file")));
4595

4596
	/*
4597
	 * Do compatibility checking immediately.  If the database isn't
4598 4599
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
4600
	 */
T
Tom Lane 已提交
4601
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4602 4603 4604
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
B
Bruce Momjian 已提交
4605 4606
				  " but the server was compiled with CATALOG_VERSION_NO %d.",
						ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4607
				 errhint("It looks like you need to initdb.")));
4608 4609 4610
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4611 4612 4613 4614
		   errdetail("The database cluster was initialized with MAXALIGN %d,"
					 " but the server was compiled with MAXALIGN %d.",
					 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
4615 4616 4617
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
P
Peter Eisentraut 已提交
4618
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4619
				 errhint("It looks like you need to initdb.")));
4620
	if (ControlFile->blcksz != BLCKSZ)
4621 4622
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4623 4624 4625 4626
			 errdetail("The database cluster was initialized with BLCKSZ %d,"
					   " but the server was compiled with BLCKSZ %d.",
					   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
4627
	if (ControlFile->relseg_size != RELSEG_SIZE)
4628 4629
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4630 4631 4632 4633
		errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
				  " but the server was compiled with RELSEG_SIZE %d.",
				  ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
4634 4635 4636
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4637 4638 4639
		errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
				  " but the server was compiled with XLOG_BLCKSZ %d.",
				  ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4640
				 errhint("It looks like you need to recompile or initdb.")));
4641 4642 4643 4644
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
B
Bruce Momjian 已提交
4645
					   " but the server was compiled with XLOG_SEG_SIZE %d.",
4646
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
B
Bruce Momjian 已提交
4647
				 errhint("It looks like you need to recompile or initdb.")));
4648
	if (ControlFile->nameDataLen != NAMEDATALEN)
4649 4650
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4651 4652 4653 4654
		errdetail("The database cluster was initialized with NAMEDATALEN %d,"
				  " but the server was compiled with NAMEDATALEN %d.",
				  ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
4655
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4656 4657
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4658
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
B
Bruce Momjian 已提交
4659
					  " but the server was compiled with INDEX_MAX_KEYS %d.",
4660
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
B
Bruce Momjian 已提交
4661
				 errhint("It looks like you need to recompile or initdb.")));
4662 4663 4664 4665
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
B
Bruce Momjian 已提交
4666 4667
				" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
			  ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4668
				 errhint("It looks like you need to recompile or initdb.")));
4669 4670

#ifdef HAVE_INT64_TIMESTAMP
4671
	if (ControlFile->enableIntTimes != true)
4672 4673 4674
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4675 4676
				  " but the server was compiled with HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4677
#else
4678
	if (ControlFile->enableIntTimes != false)
4679 4680 4681
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4682 4683
			   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4684 4685
#endif

4686 4687 4688 4689 4690
#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4691
					  " but the server was compiled with USE_FLOAT4_BYVAL."),
4692 4693 4694 4695 4696
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4697 4698
		errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
				  " but the server was compiled without USE_FLOAT4_BYVAL."),
4699 4700 4701 4702 4703 4704 4705 4706
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4707
					  " but the server was compiled with USE_FLOAT8_BYVAL."),
4708 4709 4710 4711 4712
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4713 4714
		errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
				  " but the server was compiled without USE_FLOAT8_BYVAL."),
4715 4716
				 errhint("It looks like you need to recompile or initdb.")));
#endif
4717 4718
}

4719
void
4720
UpdateControlFile(void)
4721
{
4722
	int			fd;
4723

4724 4725 4726 4727 4728
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
4729

4730 4731 4732
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4733
	if (fd < 0)
4734 4735 4736
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4737
						XLOG_CONTROL_FILE)));
4738

4739
	errno = 0;
4740
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4741 4742 4743 4744
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4745 4746
		ereport(PANIC,
				(errcode_for_file_access(),
4747
				 errmsg("could not write to control file: %m")));
4748
	}
4749

4750
	if (pg_fsync(fd) != 0)
4751 4752
		ereport(PANIC,
				(errcode_for_file_access(),
4753
				 errmsg("could not fsync control file: %m")));
4754

4755 4756 4757 4758
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4759 4760
}

4761 4762 4763 4764 4765 4766 4767 4768 4769 4770
/*
 * Return the system identifier stored in the shared-memory copy of
 * pg_control.  *ControlFile must already be set up (asserted).
 */
uint64
GetSystemIdentifier(void)
{
	uint64		sysid;

	Assert(ControlFile != NULL);
	sysid = ControlFile->system_identifier;

	return sysid;
}

4771
/*
T
Tom Lane 已提交
4772
 * Initialization of shared memory for XLOG
4773
 */
4774
Size
4775
XLOGShmemSize(void)
4776
{
4777
	Size		size;
4778

4779 4780 4781 4782 4783 4784 4785
	/* XLogCtl */
	size = sizeof(XLogCtlData);
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, ALIGNOF_XLOG_BUFFER);
	/* and the buffers themselves */
4786
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4787 4788

	/*
B
Bruce Momjian 已提交
4789 4790 4791
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
4792 4793 4794
	 */

	return size;
4795 4796 4797 4798 4799
}

void
XLOGShmemInit(void)
{
4800 4801
	bool		foundCFile,
				foundXLog;
4802
	char	   *allocptr;
4803

4804
	ControlFile = (ControlFileData *)
4805
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4806 4807
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4808

4809
	if (foundCFile || foundXLog)
4810 4811
	{
		/* both should be present or neither */
4812
		Assert(foundCFile && foundXLog);
4813 4814
		return;
	}
4815

T
Tom Lane 已提交
4816
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
4817

T
Tom Lane 已提交
4818
	/*
B
Bruce Momjian 已提交
4819 4820 4821
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
T
Tom Lane 已提交
4822
	 */
4823 4824
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
T
Tom Lane 已提交
4825
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4826
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
B
Bruce Momjian 已提交
4827

T
Tom Lane 已提交
4828
	/*
4829
	 * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
T
Tom Lane 已提交
4830
	 */
4831 4832
	allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
	XLogCtl->pages = allocptr;
4833
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
T
Tom Lane 已提交
4834 4835

	/*
B
Bruce Momjian 已提交
4836 4837
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
T
Tom Lane 已提交
4838 4839
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4840
	XLogCtl->SharedRecoveryInProgress = true;
T
Tom Lane 已提交
4841
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4842
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
4843

4844
	/*
B
Bruce Momjian 已提交
4845 4846 4847
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
4848 4849 4850
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
4851 4852 4853
}

/*
T
Tom Lane 已提交
4854 4855
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
4856 4857
 */
void
T
Tom Lane 已提交
4858
BootStrapXLOG(void)
4859
{
4860
	CheckPoint	checkPoint;
T
Tom Lane 已提交
4861 4862
	char	   *buffer;
	XLogPageHeader page;
4863
	XLogLongPageHeader longpage;
4864
	XLogRecord *record;
B
Bruce Momjian 已提交
4865
	bool		use_existent;
4866 4867
	uint64		sysidentifier;
	struct timeval tv;
4868
	pg_crc32	crc;
4869

4870
	/*
B
Bruce Momjian 已提交
4871 4872 4873 4874 4875 4876 4877 4878 4879 4880
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half is the XOR
	 * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
	 * unnecessarily if "uint64" is really only 32 bits wide.  A person
	 * knowing this encoding can determine the initialization time of the
	 * installation, which could perhaps be useful sometimes.
4881 4882 4883 4884 4885
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);

4886 4887 4888
	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

4889
	/* page buffer must be aligned suitably for O_DIRECT */
4890
	buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4891
	page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4892
	memset(page, 0, XLOG_BLCKSZ);
T
Tom Lane 已提交
4893

4894
	/* Set up information for the initial checkpoint record */
4895
	checkPoint.redo.xlogid = 0;
4896 4897
	checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
4898
	checkPoint.nextXidEpoch = 0;
4899
	checkPoint.nextXid = FirstNormalTransactionId;
4900
	checkPoint.nextOid = FirstBootstrapObjectId;
4901
	checkPoint.nextMulti = FirstMultiXactId;
4902
	checkPoint.nextMultiOffset = 0;
4903 4904
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
4905
	checkPoint.time = (pg_time_t) time(NULL);
4906
	checkPoint.oldestActiveXid = InvalidTransactionId;
4907

4908 4909 4910
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
4911
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4912
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4913

4914
	/* Set up the XLOG page header */
4915
	page->xlp_magic = XLOG_PAGE_MAGIC;
4916 4917
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
4918 4919
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
4920 4921 4922
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = XLogSegSize;
4923
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4924 4925

	/* Insert the initial checkpoint record */
4926
	record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4927
	record->xl_prev.xlogid = 0;
4928
	record->xl_prev.xrecoff = 0;
4929
	record->xl_xid = InvalidTransactionId;
4930
	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4931
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
4932
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4933
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
4934
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4935

4936 4937 4938 4939 4940
	INIT_CRC32(crc);
	COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);
4941 4942
	record->xl_crc = crc;

4943
	/* Create first XLOG segment file */
4944 4945
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
4946

4947
	/* Write the first page with the initial record */
4948
	errno = 0;
4949
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4950 4951 4952 4953
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4954 4955
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4956
			  errmsg("could not write bootstrap transaction log file: %m")));
4957
	}
4958

T
Tom Lane 已提交
4959
	if (pg_fsync(openLogFile) != 0)
4960 4961
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4962
			  errmsg("could not fsync bootstrap transaction log file: %m")));
4963

4964 4965 4966
	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4967
			  errmsg("could not close bootstrap transaction log file: %m")));
4968

T
Tom Lane 已提交
4969
	openLogFile = -1;
4970

4971 4972
	/* Now create pg_control */

4973
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
4974
	/* Initialize pg_control status fields */
4975
	ControlFile->system_identifier = sysidentifier;
T
Tom Lane 已提交
4976 4977
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
4978
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
4979
	ControlFile->checkPointCopy = checkPoint;
4980 4981 4982 4983 4984 4985 4986

	/* Set important parameter values for use when replaying WAL */
	ControlFile->MaxConnections = MaxConnections;
	ControlFile->max_prepared_xacts = max_prepared_xacts;
	ControlFile->max_locks_per_xact = max_locks_per_xact;
	ControlFile->wal_level = wal_level;

4987
	/* some additional ControlFile fields are set in WriteControlFile() */
4988

4989
	WriteControlFile();
4990 4991 4992

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
4993
	BootStrapSUBTRANS();
4994
	BootStrapMultiXact();
4995

4996
	pfree(buffer);
4997 4998
}

4999
static char *
5000
str_time(pg_time_t tnow)
5001
{
5002
	static char buf[128];
5003

5004 5005 5006
	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));
5007

5008
	return buf;
5009 5010
}

5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033
/*
 * Parse one line from recovery.conf. 'cmdline' is the raw line from the
 * file. If the line is parsed successfully, returns true, false indicates
 * syntax error. On success, *key_p and *value_p are set to the parameter
 * name and value on the line, respectively. If the line is an empty line,
 * consisting entirely of whitespace and comments, function returns true
 * and *keyp_p and *value_p are set to NULL.
 *
 * The pointers returned in *key_p and *value_p point to an internal buffer
 * that is valid only until the next call of parseRecoveryCommandFile().
 */
static bool
parseRecoveryCommandFileLine(char *cmdline, char **key_p, char **value_p)
{
	char	   *ptr;
	char	   *bufp;
	char	   *key;
	char	   *value;
	static char *buf = NULL;

	*key_p = *value_p = NULL;

	/*
B
Bruce Momjian 已提交
5034 5035
	 * Allocate the buffer on first use. It's used to hold both the parameter
	 * name and value.
5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080
	 */
	if (buf == NULL)
		buf = malloc(MAXPGPATH + 1);
	bufp = buf;

	/* Skip any whitespace at the beginning of line */
	for (ptr = cmdline; *ptr; ptr++)
	{
		if (!isspace((unsigned char) *ptr))
			break;
	}
	/* Ignore empty lines */
	if (*ptr == '\0' || *ptr == '#')
		return true;

	/* Read the parameter name */
	key = bufp;
	while (*ptr && !isspace((unsigned char) *ptr) &&
		   *ptr != '=' && *ptr != '\'')
		*(bufp++) = *(ptr++);
	*(bufp++) = '\0';

	/* Skip to the beginning quote of the parameter value */
	ptr = strchr(ptr, '\'');
	if (!ptr)
		return false;
	ptr++;

	/* Read the parameter value to *bufp. Collapse any '' escapes as we go. */
	value = bufp;
	for (;;)
	{
		if (*ptr == '\'')
		{
			ptr++;
			if (*ptr == '\'')
				*(bufp++) = '\'';
			else
			{
				/* end of parameter */
				*bufp = '\0';
				break;
			}
		}
		else if (*ptr == '\0')
B
Bruce Momjian 已提交
5081
			return false;		/* unterminated quoted string */
5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104
		else
			*(bufp++) = *ptr;

		ptr++;
	}
	*(bufp++) = '\0';

	/* Check that there's no garbage after the value */
	while (*ptr)
	{
		if (*ptr == '#')
			break;
		if (!isspace((unsigned char) *ptr))
			return false;
		ptr++;
	}

	/* Success! */
	*key_p = key;
	*value_p = value;
	return true;
}

5105 5106
/*
 * See if there is a recovery command file (recovery.conf), and if so
5107
 * read in parameters for archive recovery and XLOG streaming.
5108 5109 5110 5111 5112 5113 5114 5115
 *
 * XXX longer term intention is to expand this to
 * cater for additional parameters and controls
 * possibly use a flex lexer similar to the GUC one
 */
static void
readRecoveryCommandFile(void)
{
B
Bruce Momjian 已提交
5116 5117 5118 5119 5120 5121
	FILE	   *fd;
	char		cmdline[MAXPGPATH];
	TimeLineID	rtli = 0;
	bool		rtliGiven = false;
	bool		syntaxError = false;

5122
	fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5123 5124 5125 5126 5127
	if (fd == NULL)
	{
		if (errno == ENOENT)
			return;				/* not there, so no archive recovery */
		ereport(FATAL,
B
Bruce Momjian 已提交
5128
				(errcode_for_file_access(),
5129
				 errmsg("could not open recovery command file \"%s\": %m",
5130
						RECOVERY_COMMAND_FILE)));
5131 5132
	}

B
Bruce Momjian 已提交
5133 5134 5135
	/*
	 * Parse the file...
	 */
5136
	while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
5137
	{
B
Bruce Momjian 已提交
5138 5139
		char	   *tok1;
		char	   *tok2;
5140

5141
		if (!parseRecoveryCommandFileLine(cmdline, &tok1, &tok2))
5142 5143 5144 5145
		{
			syntaxError = true;
			break;
		}
5146 5147
		if (tok1 == NULL)
			continue;
5148

B
Bruce Momjian 已提交
5149 5150
		if (strcmp(tok1, "restore_command") == 0)
		{
5151
			recoveryRestoreCommand = pstrdup(tok2);
5152
			ereport(DEBUG2,
5153
					(errmsg("restore_command = '%s'",
5154 5155
							recoveryRestoreCommand)));
		}
5156 5157 5158
		else if (strcmp(tok1, "recovery_end_command") == 0)
		{
			recoveryEndCommand = pstrdup(tok2);
5159
			ereport(DEBUG2,
5160 5161 5162
					(errmsg("recovery_end_command = '%s'",
							recoveryEndCommand)));
		}
5163
		else if (strcmp(tok1, "archive_cleanup_command") == 0)
5164
		{
5165
			archiveCleanupCommand = pstrdup(tok2);
5166
			ereport(DEBUG2,
5167 5168
					(errmsg("archive_cleanup_command = '%s'",
							archiveCleanupCommand)));
5169
		}
B
Bruce Momjian 已提交
5170 5171
		else if (strcmp(tok1, "recovery_target_timeline") == 0)
		{
5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184
			rtliGiven = true;
			if (strcmp(tok2, "latest") == 0)
				rtli = 0;
			else
			{
				errno = 0;
				rtli = (TimeLineID) strtoul(tok2, NULL, 0);
				if (errno == EINVAL || errno == ERANGE)
					ereport(FATAL,
							(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
									tok2)));
			}
			if (rtli)
5185
				ereport(DEBUG2,
5186 5187
						(errmsg("recovery_target_timeline = %u", rtli)));
			else
5188
				ereport(DEBUG2,
5189 5190
						(errmsg("recovery_target_timeline = latest")));
		}
B
Bruce Momjian 已提交
5191 5192
		else if (strcmp(tok1, "recovery_target_xid") == 0)
		{
5193 5194 5195 5196
			errno = 0;
			recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				ereport(FATAL,
B
Bruce Momjian 已提交
5197 5198
				 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
						 tok2)));
5199
			ereport(DEBUG2,
5200 5201
					(errmsg("recovery_target_xid = %u",
							recoveryTargetXid)));
5202
			recoveryTarget = RECOVERY_TARGET_XID;
5203
		}
B
Bruce Momjian 已提交
5204 5205
		else if (strcmp(tok1, "recovery_target_time") == 0)
		{
5206 5207 5208 5209
			/*
			 * if recovery_target_xid specified, then this overrides
			 * recovery_target_time
			 */
5210
			if (recoveryTarget == RECOVERY_TARGET_XID)
5211
				continue;
5212
			recoveryTarget = RECOVERY_TARGET_TIME;
B
Bruce Momjian 已提交
5213

5214
			/*
5215
			 * Convert the time string given by the user to TimestampTz form.
5216
			 */
5217 5218
			recoveryTargetTime =
				DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
B
Bruce Momjian 已提交
5219
														CStringGetDatum(tok2),
5220 5221
												ObjectIdGetDatum(InvalidOid),
														Int32GetDatum(-1)));
5222
			ereport(DEBUG2,
5223
					(errmsg("recovery_target_time = '%s'",
5224
							timestamptz_to_str(recoveryTargetTime))));
5225
		}
B
Bruce Momjian 已提交
5226 5227
		else if (strcmp(tok1, "recovery_target_inclusive") == 0)
		{
5228 5229 5230
			/*
			 * does nothing if a recovery_target is not also set
			 */
5231
			if (!parse_bool(tok2, &recoveryTargetInclusive))
5232 5233
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5234
						 errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
5235
			ereport(DEBUG2,
5236 5237
					(errmsg("recovery_target_inclusive = %s", tok2)));
		}
5238 5239 5240 5241 5242
		else if (strcmp(tok1, "standby_mode") == 0)
		{
			if (!parse_bool(tok2, &StandbyMode))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5243
						 errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
5244
			ereport(DEBUG2,
5245 5246 5247 5248 5249
					(errmsg("standby_mode = '%s'", tok2)));
		}
		else if (strcmp(tok1, "primary_conninfo") == 0)
		{
			PrimaryConnInfo = pstrdup(tok2);
5250
			ereport(DEBUG2,
5251 5252 5253 5254 5255 5256
					(errmsg("primary_conninfo = '%s'",
							PrimaryConnInfo)));
		}
		else if (strcmp(tok1, "trigger_file") == 0)
		{
			TriggerFile = pstrdup(tok2);
5257
			ereport(DEBUG2,
5258 5259 5260
					(errmsg("trigger_file = '%s'",
							TriggerFile)));
		}
5261 5262 5263 5264 5265 5266 5267 5268
		else
			ereport(FATAL,
					(errmsg("unrecognized recovery parameter \"%s\"",
							tok1)));
	}

	FreeFile(fd);

B
Bruce Momjian 已提交
5269 5270
	if (syntaxError)
		ereport(FATAL,
5271 5272
				(errmsg("syntax error in recovery command file: %s",
						cmdline),
B
Bruce Momjian 已提交
5273
			  errhint("Lines should have the format parameter = 'value'.")));
5274

5275 5276 5277 5278 5279 5280
	/*
	 * Check for compulsory parameters
	 */
	if (StandbyMode)
	{
		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5281
			ereport(WARNING,
5282
					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5283 5284
							RECOVERY_COMMAND_FILE),
					 errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5285 5286 5287 5288 5289
	}
	else
	{
		if (recoveryRestoreCommand == NULL)
			ereport(FATAL,
5290
					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5291 5292
							RECOVERY_COMMAND_FILE)));
	}
5293

5294 5295 5296
	/* Enable fetching from archive recovery area */
	InArchiveRecovery = true;

5297
	/*
B
Bruce Momjian 已提交
5298 5299 5300 5301
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.	We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
5302
	 */
5303 5304 5305 5306 5307 5308 5309
	if (rtliGiven)
	{
		if (rtli)
		{
			/* Timeline 1 does not have a history file, all else should */
			if (rtli != 1 && !existsTimeLineHistory(rtli))
				ereport(FATAL,
5310
						(errmsg("recovery target timeline %u does not exist",
B
Bruce Momjian 已提交
5311
								rtli)));
5312 5313 5314 5315 5316 5317 5318 5319
			recoveryTargetTLI = rtli;
		}
		else
		{
			/* We start the "latest" search from pg_control's timeline */
			recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
		}
	}
5320 5321 5322 5323 5324 5325
}

/*
 * Exit archive-recovery state
 */
static void
5326
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5327
{
B
Bruce Momjian 已提交
5328 5329
	char		recoveryPath[MAXPGPATH];
	char		xlogpath[MAXPGPATH];
5330
	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
5331 5332

	/*
5333
	 * We are no longer in archive recovery state.
5334 5335 5336
	 */
	InArchiveRecovery = false;

5337 5338 5339 5340 5341
	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

5342
	/*
B
Bruce Momjian 已提交
5343 5344
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
5345
	 */
5346 5347 5348 5349 5350
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
5351 5352

	/*
B
Bruce Momjian 已提交
5353 5354 5355 5356 5357 5358 5359
	 * If the segment was fetched from archival storage, we want to replace
	 * the existing xlog segment (if any) with the archival version.  This is
	 * because whatever is in XLOGDIR is very possibly older than what we have
	 * from the archives, since it could have come from restoring a PGDATA
	 * backup.	In any case, the archival version certainly is more
	 * descriptive of what our current database state is, because that is what
	 * we replayed from.
5360
	 *
5361 5362
	 * Note that if we are establishing a new timeline, ThisTimeLineID is
	 * already set to the new value, and so we will create a new file instead
5363 5364
	 * of overwriting any existing file.  (This is, in fact, always the case
	 * at present.)
5365
	 */
5366
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5367
	XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5368 5369 5370 5371 5372 5373 5374 5375 5376 5377

	if (restoredFromArchive)
	{
		ereport(DEBUG3,
				(errmsg_internal("moving last restored xlog to \"%s\"",
								 xlogpath)));
		unlink(xlogpath);		/* might or might not exist */
		if (rename(recoveryPath, xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
5378
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
5379 5380 5381 5382 5383 5384 5385 5386 5387 5388
							recoveryPath, xlogpath)));
		/* XXX might we need to fix permissions on the file? */
	}
	else
	{
		/*
		 * If the latest segment is not archival, but there's still a
		 * RECOVERYXLOG laying about, get rid of it.
		 */
		unlink(recoveryPath);	/* ignore any error */
B
Bruce Momjian 已提交
5389

5390
		/*
B
Bruce Momjian 已提交
5391 5392 5393
		 * If we are establishing a new timeline, we have to copy data from
		 * the last WAL segment of the old timeline to create a starting WAL
		 * segment for the new timeline.
5394 5395 5396 5397
		 *
		 * Notify the archiver that the last WAL segment of the old timeline
		 * is ready to copy to archival storage. Otherwise, it is not archived
		 * for a while.
5398 5399
		 */
		if (endTLI != ThisTimeLineID)
5400
		{
5401 5402
			XLogFileCopy(endLogId, endLogSeg,
						 endTLI, endLogId, endLogSeg);
5403 5404 5405 5406 5407 5408 5409

			if (XLogArchivingActive())
			{
				XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
				XLogArchiveNotify(xlogpath);
			}
		}
5410 5411 5412
	}

	/*
B
Bruce Momjian 已提交
5413 5414
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
5415
	 */
5416 5417
	XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
	XLogArchiveCleanup(xlogpath);
5418

5419
	/* Get rid of any remaining recovered timeline-history file, too */
5420
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
B
Bruce Momjian 已提交
5421
	unlink(recoveryPath);		/* ignore any error */
5422 5423

	/*
B
Bruce Momjian 已提交
5424 5425
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
5426
	 */
5427 5428
	unlink(RECOVERY_COMMAND_DONE);
	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5429 5430
		ereport(FATAL,
				(errcode_for_file_access(),
5431
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
5432
						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443

	ereport(LOG,
			(errmsg("archive recovery complete")));
}

/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG at or after the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 * *includeThis is set TRUE if we should apply this record before stopping.
 * *includeThis is not touched when we return FALSE.
 *
 * We also pass the timestamp of the latest applied COMMIT/ABORT record to
 * SetLatestXTime(), for logging purposes.
 * Also, some information is saved in recoveryStopXid et al for use in
 * annotating the new timeline's history file.
 */
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
	uint8		xactOp;
	bool		isCommit;
	bool		stop;
	TimestampTz endTime;

	/* We only consider stopping at COMMIT or ABORT records */
	if (record->xl_rmid != RM_XACT_ID)
		return false;

	xactOp = record->xl_info & ~XLR_INFO_MASK;
	switch (xactOp)
	{
		case XLOG_XACT_COMMIT:
			endTime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
			isCommit = true;
			break;

		case XLOG_XACT_ABORT:
			endTime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
			isCommit = false;
			break;

		default:
			return false;
	}

	/* With no PITR target at all, just remember the time and carry on */
	if (recoveryTarget == RECOVERY_TARGET_UNSET)
	{
		SetLatestXTime(endTime);
		return false;
	}

	if (recoveryTarget == RECOVERY_TARGET_XID)
	{
		/*
		 * Only one transaction end record can carry this exact transaction
		 * id.
		 *
		 * The xid test MUST be for equality only: xids are handed out in the
		 * order transactions start, not the order they complete, so a
		 * higher-numbered xid will complete before a lower-numbered one
		 * about half the time.
		 */
		stop = (record->xl_xid == recoveryTargetXid);
		if (stop)
			*includeThis = recoveryTargetInclusive;
	}
	else
	{
		/*
		 * Many transactions can share one commit time.  When inclusive we
		 * stop after the last of them (at the first later record); when
		 * exclusive we stop at the first of them.  Either way the record we
		 * stop at is not applied.
		 */
		stop = recoveryTargetInclusive ?
			(endTime > recoveryTargetTime) :
			(endTime >= recoveryTargetTime);
		if (stop)
			*includeThis = false;
	}

	/* Not stopping: this record will be applied, so note its timestamp */
	if (!stop)
	{
		SetLatestXTime(endTime);
		return false;
	}

	/* Remember where and why we stopped */
	recoveryStopXid = record->xl_xid;
	recoveryStopTime = endTime;
	recoveryStopAfter = *includeThis;

	if (isCommit && recoveryStopAfter)
		ereport(LOG,
				(errmsg("recovery stopping after commit of transaction %u, time %s",
						recoveryStopXid,
						timestamptz_to_str(recoveryStopTime))));
	else if (isCommit)
		ereport(LOG,
				(errmsg("recovery stopping before commit of transaction %u, time %s",
						recoveryStopXid,
						timestamptz_to_str(recoveryStopTime))));
	else if (recoveryStopAfter)
		ereport(LOG,
				(errmsg("recovery stopping after abort of transaction %u, time %s",
						recoveryStopXid,
						timestamptz_to_str(recoveryStopTime))));
	else
		ereport(LOG,
				(errmsg("recovery stopping before abort of transaction %u, time %s",
						recoveryStopXid,
						timestamptz_to_str(recoveryStopTime))));

	/* The stopping record's time counts only if the record gets applied */
	if (recoveryStopAfter)
		SetLatestXTime(endTime);

	return true;
}

5557
/*
5558 5559 5560 5561 5562
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the bgwriter.
5563
 */
5564 5565
static void
SetLatestXTime(TimestampTz xtime)
5566
{
5567 5568 5569 5570 5571 5572
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->recoveryLastXTime = xtime;
	SpinLockRelease(&xlogctl->info_lck);
5573 5574 5575
}

/*
5576
 * Fetch timestamp of latest processed commit/abort record.
5577
 */
5578
static TimestampTz
5579
GetLatestXTime(void)
5580 5581 5582
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
5583
	TimestampTz xtime;
5584 5585

	SpinLockAcquire(&xlogctl->info_lck);
5586
	xtime = xlogctl->recoveryLastXTime;
5587 5588
	SpinLockRelease(&xlogctl->info_lck);

5589 5590 5591 5592 5593 5594 5595 5596 5597 5598
	return xtime;
}

/*
 * SQL-callable function: returns a bool giving the current recovery mode,
 * a global state, as reported by RecoveryInProgress().
 */
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
	bool		recovering = RecoveryInProgress();

	PG_RETURN_BOOL(recovering);
}

5601 5602 5603 5604 5605 5606 5607 5608
/*
 * Report when the current chunk of XLOG data was received, and where it
 * came from.
 *
 * *rtime is set to the time of receipt; *fromStream is set to true if the
 * data was received from streaming replication, and to false otherwise.
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * The relevant state is not exported to shared memory, so this must be
	 * executed in the startup process.
	 */
	Assert(InRecovery);

	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
	*rtime = XLogReceiptTime;
}

5618
/*
 * Raise an ERROR if currValue, this server's setting of the integer
 * parameter param_name, is lower than minValue.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * Every argument is parenthesized in the expansion so that an expression
 * argument containing a low-precedence operator is compared as a whole.
 * currValue and minValue are expanded more than once, so they must not have
 * side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
5633 5634 5635 5636 5637 5638

/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 */
static void
5639
CheckRequiredParameterValues(void)
5640
{
5641
	/*
B
Bruce Momjian 已提交
5642 5643
	 * For archive recovery, the WAL must be generated with at least 'archive'
	 * wal_level.
5644 5645 5646 5647
	 */
	if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
	{
		ereport(WARNING,
5648 5649
				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5650
	}
5651

5652
	/*
B
Bruce Momjian 已提交
5653 5654
	 * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
	 * we must have at least as many backend slots as the primary.
5655
	 */
5656
	if (InArchiveRecovery && EnableHotStandby)
5657 5658 5659
	{
		if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
			ereport(ERROR,
5660 5661
					(errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
					 errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5662

5663 5664
		/* We ignore autovacuum_max_workers when we make this test. */
		RecoveryRequiresIntParameter("max_connections",
5665 5666
									 MaxConnections,
									 ControlFile->MaxConnections);
5667
		RecoveryRequiresIntParameter("max_prepared_xacts",
5668 5669
									 max_prepared_xacts,
									 ControlFile->max_prepared_xacts);
5670
		RecoveryRequiresIntParameter("max_locks_per_xact",
5671 5672
									 max_locks_per_xact,
									 ControlFile->max_locks_per_xact);
5673
	}
5674 5675
}

5676
/*
T
Tom Lane 已提交
5677
 * This must be called ONCE during postmaster or standalone-backend startup
5678 5679
 */
void
T
Tom Lane 已提交
5680
StartupXLOG(void)
5681
{
5682 5683
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
5684
	bool		wasShutdown;
5685
	bool		reachedStopPoint = false;
5686
	bool		haveBackupLabel = false;
5687
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
5688 5689
				checkPointLoc,
				EndOfLog;
5690 5691
	uint32		endLogId;
	uint32		endLogSeg;
5692
	XLogRecord *record;
5693
	uint32		freespace;
5694
	TransactionId oldestActiveXID;
5695

5696
	/*
5697 5698
	 * Read control file and check XLOG status looks valid.
	 *
5699 5700
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
5701
	 */
5702
	ReadControlFile();
5703

5704
	if (ControlFile->state < DB_SHUTDOWNED ||
5705
		ControlFile->state > DB_IN_PRODUCTION ||
5706
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5707 5708
		ereport(FATAL,
				(errmsg("control file contains invalid data")));
5709 5710

	if (ControlFile->state == DB_SHUTDOWNED)
5711 5712 5713
		ereport(LOG,
				(errmsg("database system was shut down at %s",
						str_time(ControlFile->time))));
5714 5715 5716 5717
	else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
		ereport(LOG,
				(errmsg("database system was shut down in recovery at %s",
						str_time(ControlFile->time))));
5718
	else if (ControlFile->state == DB_SHUTDOWNING)
5719
		ereport(LOG,
5720
				(errmsg("database system shutdown was interrupted; last known up at %s",
5721
						str_time(ControlFile->time))));
5722
	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5723
		ereport(LOG,
B
Bruce Momjian 已提交
5724 5725 5726 5727
		   (errmsg("database system was interrupted while in recovery at %s",
				   str_time(ControlFile->time)),
			errhint("This probably means that some data is corrupted and"
					" you will have to use the last backup for recovery.")));
5728 5729
	else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
		ereport(LOG,
B
Bruce Momjian 已提交
5730 5731
				(errmsg("database system was interrupted while in recovery at log time %s",
						str_time(ControlFile->checkPointCopy.time)),
5732
				 errhint("If this has occurred more than once some data might be corrupted"
B
Bruce Momjian 已提交
5733
			  " and you might need to choose an earlier recovery target.")));
5734
	else if (ControlFile->state == DB_IN_PRODUCTION)
5735
		ereport(LOG,
B
Bruce Momjian 已提交
5736 5737
			  (errmsg("database system was interrupted; last known up at %s",
					  str_time(ControlFile->time))));
5738

5739 5740
	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
5741
	if (ControlFile->state != DB_SHUTDOWNED)
5742
		pg_usleep(60000000L);
5743 5744
#endif

5745 5746
	/*
	 * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5747 5748
	 * someone has performed a copy for PITR, these directories may have been
	 * excluded and need to be re-created.
5749 5750 5751
	 */
	ValidateXLOGDirectoryStructure();

5752
	/*
B
Bruce Momjian 已提交
5753 5754 5755 5756 5757 5758
	 * Clear out any old relcache cache files.	This is *necessary* if we do
	 * any WAL replay, since that would probably result in the cache files
	 * being out of sync with database reality.  In theory we could leave them
	 * in place if the database had been cleanly shut down, but it seems
	 * safest to just remove them always and let them be rebuilt during the
	 * first backend startup.
5759 5760 5761
	 */
	RelationCacheInitFileRemove();

5762
	/*
B
Bruce Momjian 已提交
5763 5764
	 * Initialize on the assumption we want to recover to the same timeline
	 * that's active according to pg_control.
5765 5766 5767
	 */
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

5768
	/*
B
Bruce Momjian 已提交
5769 5770
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
5771 5772 5773
	 */
	readRecoveryCommandFile();

5774 5775 5776
	/* Now we can determine the list of expected TLIs */
	expectedTLIs = readTimeLineHistory(recoveryTargetTLI);

5777 5778 5779 5780 5781 5782
	/*
	 * If pg_control's timeline is not in expectedTLIs, then we cannot
	 * proceed: the backup is not part of the history of the requested
	 * timeline.
	 */
	if (!list_member_int(expectedTLIs,
B
Bruce Momjian 已提交
5783
						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
5784 5785 5786 5787 5788
		ereport(FATAL,
				(errmsg("requested timeline %u is not a child of database system timeline %u",
						recoveryTargetTLI,
						ControlFile->checkPointCopy.ThisTimeLineID)));

5789
	/*
B
Bruce Momjian 已提交
5790 5791 5792
	 * Save the selected recovery target timeline ID and
	 * archive_cleanup_command in shared memory so that other processes can
	 * see them
5793
	 */
5794
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
5795 5796 5797
	strncpy(XLogCtl->archiveCleanupCommand,
			archiveCleanupCommand ? archiveCleanupCommand : "",
			sizeof(XLogCtl->archiveCleanupCommand));
5798

5799 5800 5801 5802 5803 5804 5805
	if (InArchiveRecovery)
	{
		if (StandbyMode)
			ereport(LOG,
					(errmsg("entering standby mode")));
		else if (recoveryTarget == RECOVERY_TARGET_XID)
			ereport(LOG,
B
Bruce Momjian 已提交
5806 5807
					(errmsg("starting point-in-time recovery to XID %u",
							recoveryTargetXid)));
5808 5809 5810 5811 5812 5813 5814 5815 5816
		else if (recoveryTarget == RECOVERY_TARGET_TIME)
			ereport(LOG,
					(errmsg("starting point-in-time recovery to %s",
							timestamptz_to_str(recoveryTargetTime))));
		else
			ereport(LOG,
					(errmsg("starting archive recovery")));
	}

5817
	if (read_backup_label(&checkPointLoc))
T
Tom Lane 已提交
5818
	{
5819
		/*
B
Bruce Momjian 已提交
5820 5821
		 * When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control.
5822
		 */
5823
		record = ReadCheckpointRecord(checkPointLoc, 0);
5824 5825
		if (record != NULL)
		{
5826
			ereport(DEBUG1,
5827
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
5828
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5829 5830 5831 5832 5833
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
		{
			ereport(PANIC,
B
Bruce Momjian 已提交
5834 5835
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5836
		}
5837 5838
		/* set flag to delete it later */
		haveBackupLabel = true;
T
Tom Lane 已提交
5839 5840 5841
	}
	else
	{
5842
		/*
B
Bruce Momjian 已提交
5843 5844
		 * Get the last valid checkpoint record.  If the latest one according
		 * to pg_control is broken, try the next-to-last one.
5845 5846
		 */
		checkPointLoc = ControlFile->checkPoint;
5847
		RedoStartLSN = ControlFile->checkPointCopy.redo;
5848
		record = ReadCheckpointRecord(checkPointLoc, 1);
T
Tom Lane 已提交
5849 5850
		if (record != NULL)
		{
5851
			ereport(DEBUG1,
5852
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
5853
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
T
Tom Lane 已提交
5854
		}
5855
		else if (StandbyMode)
5856 5857 5858 5859 5860 5861 5862 5863
		{
			/*
			 * The last valid checkpoint record required for a streaming
			 * recovery exists in neither standby nor the primary.
			 */
			ereport(PANIC,
					(errmsg("could not locate a valid checkpoint record")));
		}
T
Tom Lane 已提交
5864
		else
5865 5866
		{
			checkPointLoc = ControlFile->prevCheckPoint;
5867
			record = ReadCheckpointRecord(checkPointLoc, 2);
5868 5869 5870
			if (record != NULL)
			{
				ereport(LOG,
B
Bruce Momjian 已提交
5871 5872 5873
						(errmsg("using previous checkpoint record at %X/%X",
							  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
				InRecovery = true;		/* force recovery even if SHUTDOWNED */
5874 5875 5876
			}
			else
				ereport(PANIC,
B
Bruce Momjian 已提交
5877
					 (errmsg("could not locate a valid checkpoint record")));
5878
		}
T
Tom Lane 已提交
5879
	}
5880

T
Tom Lane 已提交
5881 5882 5883
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5884

5885
	ereport(DEBUG1,
B
Bruce Momjian 已提交
5886 5887 5888
			(errmsg("redo record is at %X/%X; shutdown %s",
					checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
					wasShutdown ? "TRUE" : "FALSE")));
5889
	ereport(DEBUG1,
5890 5891 5892
			(errmsg("next transaction ID: %u/%u; next OID: %u",
					checkPoint.nextXidEpoch, checkPoint.nextXid,
					checkPoint.nextOid)));
5893
	ereport(DEBUG1,
5894 5895
			(errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
					checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5896 5897 5898
	ereport(DEBUG1,
			(errmsg("oldest unfrozen transaction ID: %u, in database %u",
					checkPoint.oldestXid, checkPoint.oldestXidDB)));
5899
	if (!TransactionIdIsNormal(checkPoint.nextXid))
5900
		ereport(PANIC,
5901
				(errmsg("invalid next transaction ID")));
5902 5903 5904

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
5905
	ShmemVariableCache->oidCount = 0;
5906
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5907
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5908

5909
	/*
B
Bruce Momjian 已提交
5910 5911 5912
	 * We must replay WAL entries using the same TimeLineID they were created
	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
	 * also xlog_redo()).
5913
	 */
5914
	ThisTimeLineID = checkPoint.ThisTimeLineID;
5915

5916
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
5917

5918
	if (XLByteLT(RecPtr, checkPoint.redo))
5919 5920
		ereport(PANIC,
				(errmsg("invalid redo in checkpoint record")));
5921

5922
	/*
B
Bruce Momjian 已提交
5923
	 * Check whether we need to force recovery from WAL.  If it appears to
B
Bruce Momjian 已提交
5924 5925
	 * have been a clean shutdown and we did not have a recovery.conf file,
	 * then assume no recovery needed.
5926
	 */
5927
	if (XLByteLT(checkPoint.redo, RecPtr))
5928
	{
T
Tom Lane 已提交
5929
		if (wasShutdown)
5930
			ereport(PANIC,
B
Bruce Momjian 已提交
5931
					(errmsg("invalid redo record in shutdown checkpoint")));
V
WAL  
Vadim B. Mikheev 已提交
5932
		InRecovery = true;
5933 5934
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
5935
		InRecovery = true;
5936 5937 5938 5939 5940
	else if (InArchiveRecovery)
	{
		/* force recovery due to presence of recovery.conf */
		InRecovery = true;
	}
5941

V
WAL  
Vadim B. Mikheev 已提交
5942
	/* REDO */
5943
	if (InRecovery)
5944
	{
B
Bruce Momjian 已提交
5945
		int			rmid;
B
Bruce Momjian 已提交
5946

5947 5948
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;
5949

5950
		/*
B
Bruce Momjian 已提交
5951 5952 5953 5954
		 * Update pg_control to show that we are recovering and to show the
		 * selected checkpoint as the place we are starting from. We also mark
		 * pg_control with any minimum recovery stop point obtained from a
		 * backup history file.
5955
		 */
5956
		if (InArchiveRecovery)
5957
			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5958
		else
5959
		{
5960
			ereport(LOG,
5961 5962
					(errmsg("database system was not properly shut down; "
							"automatic recovery in progress")));
5963 5964 5965 5966 5967
			ControlFile->state = DB_IN_CRASH_RECOVERY;
		}
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = checkPointLoc;
		ControlFile->checkPointCopy = checkPoint;
5968 5969 5970 5971 5972 5973
		if (InArchiveRecovery)
		{
			/* initialize minRecoveryPoint if not set yet */
			if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
				ControlFile->minRecoveryPoint = checkPoint.redo;
		}
B
Bruce Momjian 已提交
5974

5975 5976 5977 5978 5979 5980
		/*
		 * set backupStartupPoint if we're starting archive recovery from a
		 * base backup
		 */
		if (haveBackupLabel)
			ControlFile->backupStartPoint = checkPoint.redo;
5981
		ControlFile->time = (pg_time_t) time(NULL);
5982
		/* No need to hold ControlFileLock yet, we aren't up far enough */
5983 5984
		UpdateControlFile();

5985
		/* initialize our local copy of minRecoveryPoint */
5986 5987 5988 5989 5990 5991 5992
		minRecoveryPoint = ControlFile->minRecoveryPoint;

		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
		pgstat_reset_all();

5993
		/*
B
Bruce Momjian 已提交
5994 5995 5996 5997 5998 5999
		 * If there was a backup label file, it's done its job and the info
		 * has now been propagated into pg_control.  We must get rid of the
		 * label file so that if we crash during recovery, we'll pick up at
		 * the latest recovery restartpoint instead of going all the way back
		 * to the backup start point.  It seems prudent though to just rename
		 * the file out of the way rather than delete it completely.
6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010
		 */
		if (haveBackupLabel)
		{
			unlink(BACKUP_LABEL_OLD);
			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not rename file \"%s\" to \"%s\": %m",
								BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		}

6011 6012 6013
		/* Check that the GUCs used to generate the WAL allow recovery */
		CheckRequiredParameterValues();

6014
		/*
B
Bruce Momjian 已提交
6015 6016
		 * Initialize for Hot Standby, if enabled. We won't let backends in
		 * yet, not until we've reached the min recovery point specified in
B
Bruce Momjian 已提交
6017
		 * control file and we've established a recovery snapshot from a
6018 6019
		 * running-xacts WAL record.
		 */
6020
		if (InArchiveRecovery && EnableHotStandby)
6021 6022
		{
			TransactionId *xids;
B
Bruce Momjian 已提交
6023
			int			nxids;
6024

6025
			ereport(DEBUG1,
6026
					(errmsg("initializing for hot standby")));
6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041

			InitRecoveryTransactionEnvironment();

			if (wasShutdown)
				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
			else
				oldestActiveXID = checkPoint.oldestActiveXid;
			Assert(TransactionIdIsValid(oldestActiveXID));

			/* Startup commit log and related stuff */
			StartupCLOG();
			StartupSUBTRANS(oldestActiveXID);
			StartupMultiXact();

			ProcArrayInitRecoveryInfo(oldestActiveXID);
6042 6043 6044

			/*
			 * If we're beginning at a shutdown checkpoint, we know that
B
Bruce Momjian 已提交
6045 6046 6047
			 * nothing was running on the master at this point. So fake-up an
			 * empty running-xacts record and use that here and now. Recover
			 * additional standby state for prepared transactions.
6048 6049 6050 6051
			 */
			if (wasShutdown)
			{
				RunningTransactionsData running;
6052
				TransactionId latestCompletedXid;
6053 6054

				/*
B
Bruce Momjian 已提交
6055 6056 6057 6058
				 * Construct a RunningTransactions snapshot representing a
				 * shut down server, with only prepared transactions still
				 * alive. We're never overflowed at this point because all
				 * subxids are listed with their parent prepared transactions.
6059 6060 6061 6062 6063
				 */
				running.xcnt = nxids;
				running.subxid_overflow = false;
				running.nextXid = checkPoint.nextXid;
				running.oldestRunningXid = oldestActiveXID;
6064 6065
				latestCompletedXid = checkPoint.nextXid;
				TransactionIdRetreat(latestCompletedXid);
6066
				Assert(TransactionIdIsNormal(latestCompletedXid));
6067
				running.latestCompletedXid = latestCompletedXid;
6068 6069 6070 6071 6072 6073
				running.xids = xids;

				ProcArrayApplyRecoveryInfo(&running);

				StandbyRecoverPreparedTransactions(false);
			}
6074 6075
		}

6076
		/* Initialize resource managers */
6077 6078 6079 6080 6081 6082
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

6083
		/*
6084 6085
		 * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
		 * recoveryLastXTime.
6086 6087
		 *
		 * This is slightly confusing if we're starting from an online
B
Bruce Momjian 已提交
6088 6089 6090 6091 6092 6093
		 * checkpoint; we've just read and replayed the checkpoint record, but
		 * we're going to start replay from its redo pointer, which precedes
		 * the location of the checkpoint record itself. So even though the
		 * last record we've replayed is indeed ReadRecPtr, we haven't
		 * replayed all the preceding records yet. That's OK for the current
		 * use of these variables.
6094 6095 6096 6097
		 */
		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->replayEndRecPtr = ReadRecPtr;
		xlogctl->recoveryLastRecPtr = ReadRecPtr;
6098
		xlogctl->recoveryLastXTime = 0;
6099 6100
		SpinLockRelease(&xlogctl->info_lck);

6101 6102 6103
		/* Also ensure XLogReceiptTime has a sane value */
		XLogReceiptTime = GetCurrentTimestamp();

6104
		/*
B
Bruce Momjian 已提交
6105 6106 6107 6108 6109
		 * Let postmaster know we've started redo now, so that it can launch
		 * bgwriter to perform restartpoints.  We don't bother during crash
		 * recovery as restartpoints can only be performed during archive
		 * recovery.  And we'd like to keep crash recovery simple, to avoid
		 * introducing bugs that could affect you when recovering after crash.
6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122
		 *
		 * After this point, we can no longer assume that we're the only
		 * process in addition to postmaster!  Also, fsync requests are
		 * subsequently to be handled by the bgwriter, not locally.
		 */
		if (InArchiveRecovery && IsUnderPostmaster)
		{
			SetForwardFsyncRequests();
			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
			bgwriterLaunched = true;
		}

		/*
B
Bruce Momjian 已提交
6123 6124
		 * Allow read-only connections immediately if we're consistent
		 * already.
6125 6126 6127
		 */
		CheckRecoveryConsistency();

6128
		/*
B
Bruce Momjian 已提交
6129 6130
		 * Find the first record that logically follows the checkpoint --- it
		 * might physically precede it, though.
6131
		 */
6132
		if (XLByteLT(checkPoint.redo, RecPtr))
6133 6134
		{
			/* back up to find the record */
6135
			record = ReadRecord(&(checkPoint.redo), PANIC, false);
6136
		}
B
Bruce Momjian 已提交
6137
		else
6138
		{
6139
			/* just have to read next record after CheckPoint */
6140
			record = ReadRecord(NULL, LOG, false);
6141
		}
6142

T
Tom Lane 已提交
6143
		if (record != NULL)
6144
		{
6145 6146
			bool		recoveryContinue = true;
			bool		recoveryApply = true;
B
Bruce Momjian 已提交
6147
			ErrorContextCallback errcontext;
6148
			TimestampTz xtime;
6149

V
WAL  
Vadim B. Mikheev 已提交
6150
			InRedo = true;
6151

6152 6153 6154
			ereport(LOG,
					(errmsg("redo starts at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6155

6156 6157 6158
			/*
			 * main redo apply loop
			 */
6159 6160
			do
			{
6161
#ifdef WAL_DEBUG
6162
				if (XLOG_DEBUG ||
B
Bruce Momjian 已提交
6163
				 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6164
					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
V
WAL  
Vadim B. Mikheev 已提交
6165
				{
B
Bruce Momjian 已提交
6166
					StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
6167

6168 6169
					initStringInfo(&buf);
					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
6170 6171
									 ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
									 EndRecPtr.xlogid, EndRecPtr.xrecoff);
6172 6173 6174 6175
					xlog_outrec(&buf, record);
					appendStringInfo(&buf, " - ");
					RmgrTable[record->xl_rmid].rm_desc(&buf,
													   record->xl_info,
B
Bruce Momjian 已提交
6176
													 XLogRecGetData(record));
6177 6178
					elog(LOG, "%s", buf.data);
					pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
6179
				}
6180
#endif
V
WAL  
Vadim B. Mikheev 已提交
6181

6182 6183
				/* Handle interrupt signals of startup process */
				HandleStartupProcInterrupts();
6184

6185 6186
				/* Allow read-only connections if we're consistent now */
				CheckRecoveryConsistency();
6187

6188 6189 6190 6191 6192
				/*
				 * Have we reached our recovery target?
				 */
				if (recoveryStopsHere(record, &recoveryApply))
				{
B
Bruce Momjian 已提交
6193
					reachedStopPoint = true;	/* see below */
6194 6195 6196 6197 6198
					recoveryContinue = false;
					if (!recoveryApply)
						break;
				}

6199 6200 6201 6202 6203 6204
				/* Setup error traceback support for ereport() */
				errcontext.callback = rm_redo_error_callback;
				errcontext.arg = (void *) record;
				errcontext.previous = error_context_stack;
				error_context_stack = &errcontext;

6205 6206
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
B
Bruce Momjian 已提交
6207
												 ShmemVariableCache->nextXid))
6208 6209 6210 6211 6212
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}

6213
				/*
6214 6215
				 * Update shared replayEndRecPtr before replaying this record,
				 * so that XLogFlush will update minRecoveryPoint correctly.
6216 6217 6218 6219 6220
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->replayEndRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);

B
Bruce Momjian 已提交
6221 6222 6223 6224
				/*
				 * If we are attempting to enter Hot Standby mode, process
				 * XIDs we see
				 */
6225 6226
				if (standbyState >= STANDBY_INITIALIZED &&
					TransactionIdIsValid(record->xl_xid))
6227 6228
					RecordKnownAssignedTransactionIds(record->xl_xid);

6229
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6230

6231 6232 6233
				/* Pop the error context stack */
				error_context_stack = errcontext.previous;

6234 6235 6236 6237 6238 6239 6240 6241
				/*
				 * Update shared recoveryLastRecPtr after this record has been
				 * replayed.
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->recoveryLastRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);

6242 6243
				LastRec = ReadRecPtr;

6244
				record = ReadRecord(NULL, LOG, false);
6245
			} while (record != NULL && recoveryContinue);
B
Bruce Momjian 已提交
6246

6247 6248 6249 6250
			/*
			 * end of main redo apply loop
			 */

6251 6252 6253
			ereport(LOG,
					(errmsg("redo done at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6254 6255
			xtime = GetLatestXTime();
			if (xtime)
6256
				ereport(LOG,
B
Bruce Momjian 已提交
6257
					 (errmsg("last completed transaction was at log time %s",
6258
							 timestamptz_to_str(xtime))));
V
WAL  
Vadim B. Mikheev 已提交
6259
			InRedo = false;
6260 6261
		}
		else
6262 6263
		{
			/* there are no WAL records following the checkpoint */
6264 6265
			ereport(LOG,
					(errmsg("redo is not required")));
6266
		}
V
WAL  
Vadim B. Mikheev 已提交
6267 6268
	}

6269 6270 6271 6272 6273 6274 6275 6276 6277 6278
	/*
	 * If we launched a WAL receiver, it should be gone by now. It would
	 * trample over the startup checkpoint and subsequent records if it were
	 * still alive,
	 * so be extra sure that it's gone.
	 */
	if (WalRcvInProgress())
		elog(PANIC, "wal receiver still active");

	/*
	 * We are now done reading the xlog from stream. Turn off streaming
B
Bruce Momjian 已提交
6279 6280
	 * recovery to force fetching the files (which would be required at end of
	 * recovery, e.g., timeline history file) from archive or pg_xlog.
6281
	 */
6282
	StandbyMode = false;
6283

T
Tom Lane 已提交
6284
	/*
B
Bruce Momjian 已提交
6285 6286
	 * Re-fetch the last valid or last applied record, so we can identify the
	 * exact endpoint of what we consider the valid portion of WAL.
T
Tom Lane 已提交
6287
	 */
6288
	record = ReadRecord(&LastRec, PANIC, false);
T
Tom Lane 已提交
6289
	EndOfLog = EndRecPtr;
6290 6291
	XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);

6292 6293
	/*
	 * Complain if we did not roll forward far enough to render the backup
6294 6295 6296 6297
	 * dump consistent.  Note: it is indeed okay to look at the local variable
	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
	 * advanced beyond the WAL we processed.
6298
	 */
6299 6300 6301
	if (InArchiveRecovery &&
		(XLByteLT(EndOfLog, minRecoveryPoint) ||
		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6302
	{
6303
		if (reachedStopPoint)	/* stopped because of stop request */
6304
			ereport(FATAL,
6305
					(errmsg("requested recovery stop point is before consistent recovery point")));
B
Bruce Momjian 已提交
6306
		else	/* ran off end of WAL */
6307
			ereport(FATAL,
6308
					(errmsg("WAL ends before consistent recovery point")));
6309 6310
	}

6311 6312 6313
	/*
	 * Consider whether we need to assign a new timeline ID.
	 *
B
Bruce Momjian 已提交
6314 6315
	 * If we are doing an archive recovery, we always assign a new ID.	This
	 * handles a couple of issues.	If we stopped short of the end of WAL
6316 6317
	 * during recovery, then we are clearly generating a new timeline and must
	 * assign it a unique new ID.  Even if we ran to the end, modifying the
B
Bruce Momjian 已提交
6318 6319
	 * current last segment is problematic because it may result in trying to
	 * overwrite an already-archived copy of that segment, and we encourage
6320 6321 6322 6323
	 * DBAs to make their archive_commands reject that.  We can dodge the
	 * problem by making the new active segment have a new timeline ID.
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
6324
	 */
6325
	if (InArchiveRecovery)
6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336
	{
		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
		ereport(LOG,
				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
							 curFileTLI, endLogId, endLogSeg);
	}

	/* Save the selected TimeLineID in shared memory, too */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;

6337
	/*
B
Bruce Momjian 已提交
6338 6339 6340 6341
	 * We are now done reading the old WAL.  Turn off archive fetching if it
	 * was active, and make a writable copy of the last WAL segment. (Note
	 * that we also have a copy of the last block of the old WAL in readBuf;
	 * we will use that below.)
6342 6343
	 */
	if (InArchiveRecovery)
6344
		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6345 6346 6347 6348 6349 6350 6351 6352

	/*
	 * Prepare to write WAL starting at EndOfLog position, and init xlog
	 * buffer cache using the block containing the last record from the
	 * previous incarnation.
	 */
	openLogId = endLogId;
	openLogSeg = endLogSeg;
6353
	openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
6354
	openLogOff = 0;
V
WAL  
Vadim B. Mikheev 已提交
6355
	Insert = &XLogCtl->Insert;
6356
	Insert->PrevRecord = LastRec;
6357 6358
	XLogCtl->xlblocks[0].xlogid = openLogId;
	XLogCtl->xlblocks[0].xrecoff =
6359
		((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
B
Bruce Momjian 已提交
6360 6361

	/*
B
Bruce Momjian 已提交
6362 6363 6364
	 * Tricky point here: readBuf contains the *last* block that the LastRec
	 * record spans, not the one it starts in.	The last block is indeed the
	 * one we want to use.
T
Tom Lane 已提交
6365
	 */
6366 6367
	Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
	memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6368
	Insert->currpos = (char *) Insert->currpage +
6369
		(EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
6370

T
Tom Lane 已提交
6371
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
6372

T
Tom Lane 已提交
6373 6374 6375
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
6376

T
Tom Lane 已提交
6377 6378
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
6379

6380 6381 6382 6383 6384 6385 6386 6387 6388 6389
	freespace = INSERT_FREESPACE(Insert);
	if (freespace > 0)
	{
		/* Make sure rest of page is zero */
		MemSet(Insert->currpos, 0, freespace);
		XLogCtl->Write.curridx = 0;
	}
	else
	{
		/*
B
Bruce Momjian 已提交
6390 6391
		 * Whenever Write.LogwrtResult points to exactly the end of a page,
		 * Write.curridx must point to the *next* page (see XLogWrite()).
6392
		 *
B
Bruce Momjian 已提交
6393
		 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
B
Bruce Momjian 已提交
6394
		 * this is sufficient.	The first actual attempt to insert a log
6395
		 * record will advance the insert state.
6396 6397 6398 6399
		 */
		XLogCtl->Write.curridx = NextBufIdx(0);
	}

6400
	/* Pre-scan prepared transactions to find out the range of XIDs present */
6401
	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6402

V
WAL  
Vadim B. Mikheev 已提交
6403
	if (InRecovery)
6404
	{
B
Bruce Momjian 已提交
6405
		int			rmid;
6406

6407 6408 6409 6410 6411 6412 6413
		/*
		 * Resource managers might need to write WAL records, eg, to record
		 * index cleanup actions.  So temporarily enable XLogInsertAllowed in
		 * this process only.
		 */
		LocalSetXLogInsertAllowed();

6414 6415 6416 6417 6418 6419 6420 6421 6422
		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

6423 6424 6425
		/* Disallow XLogInsert again */
		LocalXLogInsertAllowed = -1;

6426 6427 6428 6429 6430 6431
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

T
Tom Lane 已提交
6432
		/*
6433
		 * Perform a checkpoint to update all our recovery activity to disk.
6434
		 *
6435 6436 6437 6438 6439
		 * Note that we write a shutdown checkpoint rather than an on-line
		 * one. This is not particularly critical, but since we may be
		 * assigning a new TLI, using a shutdown checkpoint allows us to have
		 * the rule that TLI only changes in shutdown checkpoints, which
		 * allows some extra error checking in xlog_redo.
T
Tom Lane 已提交
6440
		 */
6441 6442 6443 6444 6445 6446
		if (bgwriterLaunched)
			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
							  CHECKPOINT_IMMEDIATE |
							  CHECKPOINT_WAIT);
		else
			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6447

T
Tom Lane 已提交
6448 6449 6450
		/*
		 * And finally, execute the recovery_end_command, if any.
		 */
6451
		if (recoveryEndCommand)
6452 6453 6454
			ExecuteRecoveryCommand(recoveryEndCommand,
								   "recovery_end_command",
								   true);
6455
	}
6456

T
Tom Lane 已提交
6457 6458 6459
	/*
	 * Preallocate additional log files, if wanted.
	 */
6460
	PreallocXlogFiles(EndOfLog);
6461

6462 6463 6464
	/*
	 * Okay, we're officially UP.
	 */
V
WAL  
Vadim B. Mikheev 已提交
6465
	InRecovery = false;
6466

6467
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6468
	ControlFile->state = DB_IN_PRODUCTION;
6469
	ControlFile->time = (pg_time_t) time(NULL);
6470
	UpdateControlFile();
6471
	LWLockRelease(ControlFileLock);
6472

6473
	/* start the archive_timeout timer running */
6474
	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6475

6476 6477 6478 6479
	/* initialize shared-memory copy of latest checkpoint XID/epoch */
	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;

6480 6481 6482 6483
	/* also initialize latestCompletedXid, to nextXid - 1 */
	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);

6484
	/*
B
Bruce Momjian 已提交
6485 6486
	 * Start up the commit log and related stuff, too. In hot standby mode we
	 * did this already before WAL replay.
6487 6488 6489 6490 6491 6492 6493
	 */
	if (standbyState == STANDBY_DISABLED)
	{
		StartupCLOG();
		StartupSUBTRANS(oldestActiveXID);
		StartupMultiXact();
	}
6494

6495 6496 6497
	/* Reload shared-memory state for prepared transactions */
	RecoverPreparedTransactions();

6498 6499 6500 6501 6502 6503 6504
	/*
	 * Shutdown the recovery environment. This must occur after
	 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
	 */
	if (standbyState != STANDBY_DISABLED)
		ShutdownRecoveryTransactionEnvironment();

T
Tom Lane 已提交
6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}
6516 6517 6518 6519 6520 6521
	if (readRecordBuf)
	{
		free(readRecordBuf);
		readRecordBuf = NULL;
		readRecordBufSize = 0;
	}
6522

6523 6524 6525 6526 6527 6528 6529
	/*
	 * If any of the critical GUCs have changed, log them before we allow
	 * backends to write WAL.
	 */
	LocalSetXLogInsertAllowed();
	XLogReportParameters();

6530
	/*
B
Bruce Momjian 已提交
6531
	 * All done.  Allow backends to write WAL.	(Although the bool flag is
6532 6533 6534
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
6535
	 */
6536 6537 6538 6539 6540 6541 6542 6543
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}
6544 6545
}

6546 6547 6548 6549 6550 6551 6552 6553
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 */
static void
CheckRecoveryConsistency(void)
{
B
Bruce Momjian 已提交
6554
	static bool backendsAllowed = false;
6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569

	/*
	 * Have we passed our safe starting point?
	 */
	if (!reachedMinRecoveryPoint &&
		XLByteLE(minRecoveryPoint, EndRecPtr) &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		reachedMinRecoveryPoint = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						EndRecPtr.xlogid, EndRecPtr.xrecoff)));
	}

	/*
B
Bruce Momjian 已提交
6570 6571 6572
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
6573 6574 6575 6576 6577 6578 6579
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!backendsAllowed &&
		reachedMinRecoveryPoint &&
		IsUnderPostmaster)
	{
		backendsAllowed = true;
6580
		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6581 6582 6583
	}
}

6584 6585 6586
/*
 * Is the system still in recovery?
 *
6587 6588 6589
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
6590 6591 6592 6593 6594 6595 6596
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 */
bool
RecoveryInProgress(void)
{
	/*
B
Bruce Momjian 已提交
6597 6598 6599
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
6600 6601 6602 6603 6604 6605 6606 6607
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

6608 6609
		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
6610
		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6611
		SpinLockRelease(&xlogctl->info_lck);
6612 6613

		/*
6614
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6615
		 * is finished. InitPostgres() relies upon this behaviour to ensure
B
Bruce Momjian 已提交
6616
		 * that InitXLOGAccess() is called at backend startup.	(If you change
6617
		 * this, see also LocalSetXLogInsertAllowed.)
6618 6619 6620 6621 6622 6623
		 */
		if (!LocalRecoveryInProgress)
			InitXLOGAccess();

		return LocalRecoveryInProgress;
	}
T
Tom Lane 已提交
6624 6625
}

6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636
/*
 * Report whether this process may insert new WAL records.
 *
 * Normally the answer is essentially !RecoveryInProgress(), but
 * LocalXLogInsertAllowed can force "true" or "false" for the current
 * process regardless of the global state.  A negative value means "not
 * forced; go check".
 */
bool
XLogInsertAllowed(void)
{
	if (LocalXLogInsertAllowed < 0)
	{
		/* No forced answer: it depends on whether recovery is still on */
		if (RecoveryInProgress())
			return false;

		/*
		 * Recovery is over.  Latch "unconditionally true" so later calls
		 * take the fast path below without re-checking.
		 */
		LocalXLogInsertAllowed = 1;
		return true;
	}

	/*
	 * "Unconditionally true" or "unconditionally false": just return it.
	 * This is the normal fast path once recovery is known to be done.
	 */
	return (bool) LocalXLogInsertAllowed;
}

/*
 * Force XLogInsertAllowed() to return true, in the current process only.
 *
 * The caller must currently be in the "undecided" state (-1); this is
 * checked by an assertion.  It is permitted to set LocalXLogInsertAllowed
 * back to -1 afterwards, and to call this function again after doing so.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/*
	 * Do the same initialization RecoveryInProgress() performs when it
	 * notices that recovery has ended.
	 */
	InitXLOGAccess();
}

6674 6675
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
6676 6677 6678
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
6679
 */
T
Tom Lane 已提交
6680
static XLogRecord *
6681
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
T
Tom Lane 已提交
6682 6683 6684 6685 6686
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
6687 6688 6689 6690
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6691
				(errmsg("invalid primary checkpoint link in control file")));
6692 6693 6694 6695 6696 6697 6698
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
6699
				   (errmsg("invalid checkpoint link in backup_label file")));
6700 6701
				break;
		}
T
Tom Lane 已提交
6702 6703 6704
		return NULL;
	}

6705
	record = ReadRecord(&RecPtr, LOG, true);
T
Tom Lane 已提交
6706 6707 6708

	if (record == NULL)
	{
6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6724 6725 6726 6727
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
6740
				(errmsg("invalid resource manager ID in checkpoint record")));
6741 6742
				break;
		}
T
Tom Lane 已提交
6743 6744 6745 6746 6747
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
6748 6749 6750 6751
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6752
				   (errmsg("invalid xl_info in primary checkpoint record")));
6753 6754 6755
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
6756
				 (errmsg("invalid xl_info in secondary checkpoint record")));
6757 6758 6759 6760 6761 6762
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6763 6764
		return NULL;
	}
6765 6766
	if (record->xl_len != sizeof(CheckPoint) ||
		record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
T
Tom Lane 已提交
6767
	{
6768 6769 6770 6771
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6772
					(errmsg("invalid length of primary checkpoint record")));
6773 6774 6775
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
6776
				  (errmsg("invalid length of secondary checkpoint record")));
6777 6778 6779 6780 6781 6782
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6783 6784 6785
		return NULL;
	}
	return record;
6786 6787
}

V
WAL  
Vadim B. Mikheev 已提交
6788
/*
6789 6790
 * This must be called during startup of a backend process, except that
 * it need not be called in a standalone backend (which does StartupXLOG
6791
 * instead).  We need to initialize the local copies of ThisTimeLineID and
6792 6793
 * RedoRecPtr.
 *
6794
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6795
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6796
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
V
WAL  
Vadim B. Mikheev 已提交
6797 6798
 */
void
6799
InitXLOGAccess(void)
V
WAL  
Vadim B. Mikheev 已提交
6800
{
6801 6802
	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
6803
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6804

6805 6806
	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
6807 6808 6809 6810 6811 6812 6813 6814
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
6815 6816
GetRedoRecPtr(void)
{
6817 6818 6819
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

6820
	SpinLockAcquire(&xlogctl->info_lck);
6821 6822
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6823
	SpinLockRelease(&xlogctl->info_lck);
6824 6825

	return RedoRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
6826 6827
}

6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841
/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to acquire WALInsertLock which can be quite
 * heavily contended, and an approximation is enough for the current
 * usage of this function.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
B
Bruce Momjian 已提交
6842
	XLogRecPtr	recptr;
6843 6844 6845 6846 6847 6848 6849 6850

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtRqst.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

6851
/*
6852 6853
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
6854 6855
 */
XLogRecPtr
6856
GetFlushRecPtr(void)
6857 6858 6859 6860 6861 6862
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
6863
	recptr = xlogctl->LogwrtResult.Flush;
6864 6865 6866 6867 6868
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

6869 6870 6871
/*
 * Get the time of the last xlog segment switch
 */
6872
pg_time_t
6873 6874
GetLastSegSwitchTime(void)
{
6875
	pg_time_t	result;
6876 6877 6878 6879 6880 6881 6882 6883 6884

	/* Need WALWriteLock, but shared lock is sufficient */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->Write.lastSegSwitchTime;
	LWLockRelease(WALWriteLock);

	return result;
}

6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895
/*
 * GetNextXidAndEpoch - fetch the current nextXid and the epoch it is in
 *
 * Exported for code that would like to have 64-bit XIDs.  We don't really
 * support such things, but all XIDs within the system can be presumed
 * "close to" the result, and thus the epoch associated with them can be
 * determined.
 *
 * *xid receives the current nextXid; *epoch receives its epoch.
 */
void
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *shared = XLogCtl;
	uint32		epochAtCkpt;
	TransactionId xidAtCkpt;
	TransactionId currentXid;

	/* Must read checkpoint info first, else have race condition */
	SpinLockAcquire(&shared->info_lck);
	epochAtCkpt = shared->ckptXidEpoch;
	xidAtCkpt = shared->ckptXid;
	SpinLockRelease(&shared->info_lck);

	/* Now fetch current nextXid */
	currentXid = ReadNewTransactionId();

	*xid = currentXid;

	/*
	 * nextXid is certainly logically later than the checkpoint's XID.  So
	 * if it is numerically less, it must have wrapped into the next epoch.
	 */
	*epoch = (currentXid < xidAtCkpt) ? epochAtCkpt + 1 : epochAtCkpt;
}

6925 6926 6927 6928 6929 6930 6931 6932 6933 6934
/*
 * GetRecoveryTargetTLI - return the recovery target timeline ID stored in
 * shared memory
 */
TimeLineID
GetRecoveryTargetTLI(void)
{
	TimeLineID	targetTLI;

	/* RecoveryTargetTLI doesn't change, so no lock is needed to copy it */
	targetTLI = XLogCtl->RecoveryTargetTLI;

	return targetTLI;
}

6935
/*
T
Tom Lane 已提交
6936
 * This must be called ONCE during postmaster or standalone-backend shutdown
6937 6938
 */
void
6939
ShutdownXLOG(int code, Datum arg)
6940
{
6941 6942
	ereport(LOG,
			(errmsg("shutting down")));
6943

6944 6945 6946
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
6947 6948 6949 6950 6951 6952 6953 6954 6955 6956
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();

6957
		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6958
	}
6959
	ShutdownCLOG();
6960
	ShutdownSUBTRANS();
6961
	ShutdownMultiXact();
6962

6963 6964
	ereport(LOG,
			(errmsg("database system is shut down")));
6965 6966
}

6967
/*
6968 6969 6970
 * Log start of a checkpoint.
 */
static void
6971
LogCheckpointStart(int flags, bool restartpoint)
6972
{
6973
	const char *msg;
6974 6975

	/*
6976 6977
	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
	 * the main message, but what about all the flags?
6978 6979
	 */
	if (restartpoint)
6980
		msg = "restartpoint starting:%s%s%s%s%s%s%s";
6981
	else
6982
		msg = "checkpoint starting:%s%s%s%s%s%s%s";
6983 6984

	elog(LOG, msg,
6985
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6986
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6987 6988 6989 6990 6991 6992 6993
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}

6994
/*
6995 6996 6997
 * Log end of a checkpoint.
 */
static void
6998
LogCheckpointEnd(bool restartpoint)
6999
{
B
Bruce Momjian 已提交
7000 7001 7002 7003 7004 7005
	long		write_secs,
				sync_secs,
				total_secs;
	int			write_usecs,
				sync_usecs,
				total_usecs;
7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	TimestampDifference(CheckpointStats.ckpt_start_t,
						CheckpointStats.ckpt_end_t,
						&total_secs, &total_usecs);

	TimestampDifference(CheckpointStats.ckpt_write_t,
						CheckpointStats.ckpt_sync_t,
						&write_secs, &write_usecs);

	TimestampDifference(CheckpointStats.ckpt_sync_t,
						CheckpointStats.ckpt_sync_end_t,
						&sync_secs, &sync_usecs);

7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040
	if (restartpoint)
		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
	else
		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
			 "%d transaction log file(s) added, %d removed, %d recycled; "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
7041 7042
}

T
Tom Lane 已提交
7043 7044
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
7045
 *
7046 7047
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7048
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7049
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7050
 *		ignoring checkpoint_completion_target parameter.
7051
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
7052
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7053
 *		CHECKPOINT_END_OF_RECOVERY).
7054
 *
7055
 * Note: flags contains other bits, of interest here only for logging purposes.
7056 7057
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
T
Tom Lane 已提交
7058
 */
7059
void
7060
CreateCheckPoint(int flags)
7061
{
7062
	bool		shutdown;
7063 7064 7065
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
7066
	XLogRecData rdata;
7067
	uint32		freespace;
V
Vadim B. Mikheev 已提交
7068 7069
	uint32		_logId;
	uint32		_logSeg;
7070 7071
	TransactionId *inCommitXids;
	int			nInCommit;
V
Vadim B. Mikheev 已提交
7072

7073 7074 7075 7076
	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
7077
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7078 7079 7080
		shutdown = true;
	else
		shutdown = false;
7081

7082 7083 7084
	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");
7085

7086 7087 7088 7089 7090 7091 7092 7093
	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

7094 7095 7096 7097 7098 7099 7100 7101 7102 7103
	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

7104 7105 7106
	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
7107 7108
	START_CRIT_SECTION();

7109 7110
	if (shutdown)
	{
7111
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7112
		ControlFile->state = DB_SHUTDOWNING;
7113
		ControlFile->time = (pg_time_t) time(NULL);
7114
		UpdateControlFile();
7115
		LWLockRelease(ControlFileLock);
7116
	}
T
Tom Lane 已提交
7117

7118
	/*
B
Bruce Momjian 已提交
7119 7120 7121
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
7122 7123 7124 7125
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
7126
	MemSet(&checkPoint, 0, sizeof(checkPoint));
7127
	checkPoint.time = (pg_time_t) time(NULL);
7128

7129
	/*
7130 7131
	 * We must hold WALInsertLock while examining insert state to determine
	 * the checkpoint REDO pointer.
7132
	 */
7133
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
7134 7135

	/*
B
Bruce Momjian 已提交
7136 7137 7138 7139 7140 7141 7142 7143
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
7144
	 *
7145 7146 7147 7148
	 * We have to make two tests to determine that nothing has happened since
	 * the start of the last checkpoint: current insertion point must match
	 * the end of the last checkpoint record, and its redo pointer must point
	 * to itself.
T
Tom Lane 已提交
7149
	 */
7150 7151
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
T
Tom Lane 已提交
7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
7164 7165
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
7166 7167 7168 7169 7170
			END_CRIT_SECTION();
			return;
		}
	}

7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181
	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;

T
Tom Lane 已提交
7182 7183 7184
	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
B
Bruce Momjian 已提交
7185 7186 7187 7188
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
T
Tom Lane 已提交
7189 7190
	 */
	freespace = INSERT_FREESPACE(Insert);
7191 7192
	if (freespace < SizeOfXLogRecord)
	{
7193
		(void) AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
7194
		/* OK to ignore update return flag, since we will do flush anyway */
7195
		freespace = INSERT_FREESPACE(Insert);
7196
	}
T
Tom Lane 已提交
7197
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
7198

T
Tom Lane 已提交
7199
	/*
B
Bruce Momjian 已提交
7200 7201
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding the insert lock AND the info_lck.
7202
	 *
B
Bruce Momjian 已提交
7203
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
B
Bruce Momjian 已提交
7204 7205 7206 7207 7208
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
T
Tom Lane 已提交
7209
	 */
7210 7211 7212 7213
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

7214
		SpinLockAcquire(&xlogctl->info_lck);
7215
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
7216
		SpinLockRelease(&xlogctl->info_lck);
7217
	}
B
Bruce Momjian 已提交
7218

T
Tom Lane 已提交
7219
	/*
7220 7221
	 * Now we can release WAL insert lock, allowing other xacts to proceed
	 * while we are flushing disk buffers.
T
Tom Lane 已提交
7222
	 */
7223
	LWLockRelease(WALInsertLock);
7224

7225
	/*
B
Bruce Momjian 已提交
7226 7227
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
7228 7229
	 */
	if (log_checkpoints)
7230
		LogCheckpointStart(flags, false);
7231

7232 7233
	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248
	/*
	 * Before flushing data, we must wait for any transactions that are
	 * currently in their commit critical sections.  If an xact inserted its
	 * commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released WALInsertLock, this test is a bit fuzzy:
	 * it is possible that we will wait for xacts we didn't really need to
	 * wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold locks longer than necessary.
	 * (In fact, the whole reason we have this issue is that xact.c does
	 * commit record XLOG insertion and clog update as two separate steps
B
Bruce Momjian 已提交
7249 7250
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
7251
	 *
B
Bruce Momjian 已提交
7252 7253
	 * A transaction that has not yet set inCommit when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
7254 7255 7256 7257 7258 7259 7260
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	nInCommit = GetTransactionsInCommit(&inCommitXids);
	if (nInCommit > 0)
	{
B
Bruce Momjian 已提交
7261 7262 7263
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
7264 7265 7266 7267
		} while (HaveTransactionsInCommit(inCommitXids, nInCommit));
	}
	pfree(inCommitXids);

7268 7269 7270
	/*
	 * Get the other info we need for the checkpoint record.
	 */
7271
	LWLockAcquire(XidGenLock, LW_SHARED);
7272
	checkPoint.nextXid = ShmemVariableCache->nextXid;
7273 7274
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7275
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
7276

7277 7278 7279 7280 7281
	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

7282
	LWLockAcquire(OidGenLock, LW_SHARED);
7283
	checkPoint.nextOid = ShmemVariableCache->nextOid;
7284 7285
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
7286
	LWLockRelease(OidGenLock);
7287

7288 7289 7290
	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset);
7291

T
Tom Lane 已提交
7292
	/*
B
Bruce Momjian 已提交
7293 7294
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
7295
	 *
7296 7297
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
7298
	 * panic. Accordingly, exit critical section while doing it.
T
Tom Lane 已提交
7299
	 */
7300 7301
	END_CRIT_SECTION();

7302
	CheckPointGuts(checkPoint.redo, flags);
7303

7304
	/*
B
Bruce Momjian 已提交
7305 7306 7307
	 * Take a snapshot of running transactions and write this to WAL. This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required. Skip, if this info disabled.
7308 7309 7310 7311 7312 7313 7314
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 *
	 * Update checkPoint.nextXid since we have a later value
	 */
	if (!shutdown && XLogStandbyInfoActive())
B
Bruce Momjian 已提交
7315
		LogStandbySnapshot(&checkPoint.oldestActiveXid, &checkPoint.nextXid);
7316 7317 7318
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

7319 7320
	START_CRIT_SECTION();

T
Tom Lane 已提交
7321 7322 7323
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
B
Bruce Momjian 已提交
7324
	rdata.data = (char *) (&checkPoint);
7325
	rdata.len = sizeof(checkPoint);
7326
	rdata.buffer = InvalidBuffer;
7327 7328
	rdata.next = NULL;

T
Tom Lane 已提交
7329 7330 7331 7332 7333 7334
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
7335

7336
	/*
B
Bruce Momjian 已提交
7337 7338 7339 7340
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
	 * overwritten at next startup.  No-one should even try, this just allows
	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
	 * to just temporarily disable writing until the system has exited
7341 7342 7343 7344 7345
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
B
Bruce Momjian 已提交
7346
			LocalXLogInsertAllowed = -1;		/* return to "check" state */
7347
		else
B
Bruce Momjian 已提交
7348
			LocalXLogInsertAllowed = 0; /* never again write WAL */
7349 7350
	}

T
Tom Lane 已提交
7351
	/*
B
Bruce Momjian 已提交
7352 7353
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
T
Tom Lane 已提交
7354 7355
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
7356 7357
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));
7358

T
Tom Lane 已提交
7359
	/*
7360 7361
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
T
Tom Lane 已提交
7362
	 */
7363
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
7364

T
Tom Lane 已提交
7365 7366 7367
	/*
	 * Update the control file.
	 */
7368
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7369 7370
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
7371 7372 7373
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
7374
	ControlFile->time = (pg_time_t) time(NULL);
7375 7376
	/* crash recovery should always recover to the end of WAL */
	MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
7377
	UpdateControlFile();
7378
	LWLockRelease(ControlFileLock);
7379

7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390
	/* Update shared-memory copy of checkpoint XID/epoch */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
		xlogctl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

7391
	/*
B
Bruce Momjian 已提交
7392
	 * We are now done with critical updates; no need for system panic if we
7393
	 * have trouble while fooling with old log segments.
7394 7395 7396
	 */
	END_CRIT_SECTION();

7397 7398 7399 7400 7401
	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

V
Vadim B. Mikheev 已提交
7402
	/*
7403 7404
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint or the standbys in XLOG streaming).
7405
	 */
7406
	if (_logId || _logSeg)
7407
	{
7408 7409
		/*
		 * Calculate the last segment that we need to retain because of
B
Bruce Momjian 已提交
7410 7411
		 * wal_keep_segments, by subtracting wal_keep_segments from the new
		 * checkpoint location.
7412
		 */
7413
		if (wal_keep_segments > 0)
7414
		{
7415 7416 7417 7418 7419 7420 7421
			uint32		log;
			uint32		seg;
			int			d_log;
			int			d_seg;

			XLByteToSeg(recptr, log, seg);

7422 7423
			d_seg = wal_keep_segments % XLogSegsPerFile;
			d_log = wal_keep_segments / XLogSegsPerFile;
7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440
			if (seg < d_seg)
			{
				d_log += 1;
				seg = seg - d_seg + XLogSegsPerFile;
			}
			else
				seg = seg - d_seg;
			/* avoid underflow, don't go below (0,1) */
			if (log < d_log || (log == d_log && seg == 0))
			{
				log = 0;
				seg = 1;
			}
			else
				log = log - d_log;

			/* don't delete WAL segments newer than the calculated segment */
7441 7442
			if (log < _logId || (log == _logId && seg < _logSeg))
			{
B
Bruce Momjian 已提交
7443 7444
				_logId = log;
				_logSeg = seg;
7445 7446 7447
			}
		}

T
Tom Lane 已提交
7448
		PrevLogSeg(_logId, _logSeg);
7449
		RemoveOldXlogFiles(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
7450 7451
	}

T
Tom Lane 已提交
7452
	/*
7453 7454
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
T
Tom Lane 已提交
7455 7456
	 */
	if (!shutdown)
7457
		PreallocXlogFiles(recptr);
T
Tom Lane 已提交
7458

7459
	/*
B
Bruce Momjian 已提交
7460 7461 7462 7463 7464
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
7465
	 */
7466
	if (!RecoveryInProgress())
7467
		TruncateSUBTRANS(GetOldestXmin(true, false));
7468

7469 7470
	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
7471
		LogCheckpointEnd(false);
7472

7473 7474 7475 7476 7477
	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);
7478

7479
	LWLockRelease(CheckpointLock);
7480
}
V
WAL  
Vadim B. Mikheev 已提交
7481

7482 7483 7484 7485 7486 7487 7488
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
7489
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7490 7491 7492 7493
{
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
7494
	CheckPointRelationMap();
B
Bruce Momjian 已提交
7495
	CheckPointBuffers(flags);	/* performs all required fsyncs */
7496 7497 7498 7499 7500
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}

/*
7501 7502 7503 7504 7505 7506 7507 7508
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the bgwriter, while this one will be executed by the startup
 * process.)
7509 7510 7511 7512
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
B
Bruce Momjian 已提交
7513
	int			rmid;
7514

7515 7516
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527

	/*
	 * Is it safe to checkpoint?  We must ask each of the resource managers
	 * whether they have any partial state information that might prevent a
	 * correct restart from this point.  If so, we skip this opportunity, but
	 * return at the next checkpoint record for another try.
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
			if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7528
			{
7529
				elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
7530 7531 7532
					 rmid,
					 checkPoint->redo.xlogid,
					 checkPoint->redo.xrecoff);
7533
				return;
7534
			}
7535 7536 7537
	}

	/*
7538 7539
	 * Copy the checkpoint record to shared memory, so that bgwriter can use
	 * it the next time it wants to perform a restartpoint.
7540 7541 7542 7543 7544 7545 7546 7547
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
}

/*
7548 7549
 * Establish a restartpoint if possible.
 *
7550 7551 7552 7553 7554
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
7555
 * a restartpoint if we have replayed a safe checkpoint record since last
7556 7557 7558 7559 7560
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
7561 7562
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
7563 7564
	uint32		_logId;
	uint32		_logSeg;
B
Bruce Momjian 已提交
7565
	TimestampTz xtime;
7566

7567 7568 7569 7570 7571 7572 7573 7574 7575
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

7576
	/* Get a local copy of the last safe checkpoint record. */
7577 7578 7579 7580 7581
	SpinLockAcquire(&xlogctl->info_lck);
	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);

7582
	/*
7583 7584 7585 7586 7587 7588
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
7589
			  (errmsg("skipping restartpoint, recovery has already ended")));
7590 7591 7592 7593 7594 7595 7596 7597 7598
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
B
Bruce Momjian 已提交
7599 7600 7601 7602
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
7603 7604
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
7605 7606
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
7607
	 */
7608 7609 7610
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
	{
7611 7612
		XLogRecPtr	InvalidXLogRecPtr = {0, 0};

7613 7614
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
7615
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
7616 7617

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7618 7619 7620 7621 7622 7623 7624 7625
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
7626 7627 7628 7629
		LWLockRelease(CheckpointLock);
		return false;
	}

7630
	/*
B
Bruce Momjian 已提交
7631 7632 7633
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds checkpoint_segments.
7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644
	 *
	 * You need to hold WALInsertLock and info_lck to update it, although
	 * during recovery acquiring WALInsertLock is just pro forma, because
	 * there is no other processes updating Insert.RedoRecPtr.
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&xlogctl->info_lck);
	LWLockRelease(WALInsertLock);

7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656
	if (log_checkpoints)
	{
		/*
		 * Prepare to accumulate statistics.
		 */
		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

		LogCheckpointStart(flags, true);
	}

	CheckPointGuts(lastCheckPoint.redo, flags);
7657

7658 7659 7660 7661 7662 7663
	/*
	 * Select point at which we can truncate the xlog, which we base on the
	 * prior checkpoint's earliest info.
	 */
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

7664
	/*
7665 7666
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
B
Bruce Momjian 已提交
7667 7668
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
7669
	 */
7670
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7671 7672 7673 7674 7675 7676 7677
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
7678 7679
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7680 7681
		UpdateControlFile();
	}
7682
	LWLockRelease(ControlFileLock);
7683

7684 7685 7686 7687 7688 7689 7690
	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
	 * growing full. We don't need do this during normal recovery, but during
	 * streaming recovery we have to or the disk will eventually fill up from
	 * old log files streamed from master.
	 */
7691
	if (WalRcvInProgress() && (_logId || _logSeg))
7692 7693 7694 7695
	{
		XLogRecPtr	endptr;

		/* Get the current (or recent) end of xlog */
7696
		endptr = GetWalRcvWriteRecPtr(NULL);
7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707

		PrevLogSeg(_logId, _logSeg);
		RemoveOldXlogFiles(_logId, _logSeg, endptr);

		/*
		 * Make more log segments if needed.  (Do this after recycling old log
		 * segments, since that may supply some of the needed files.)
		 */
		PreallocXlogFiles(endptr);
	}

7708
	/*
7709 7710 7711
	 * Currently, there is no need to truncate pg_subtrans during recovery. If
	 * we did do that, we will need to have called StartupSUBTRANS() already
	 * and then TruncateSUBTRANS() would go here.
7712 7713 7714 7715 7716 7717
	 */

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(true);

7718
	xtime = GetLatestXTime();
7719
	ereport((log_checkpoints ? LOG : DEBUG2),
7720 7721
			(errmsg("recovery restart point at %X/%X",
					lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff),
B
Bruce Momjian 已提交
7722 7723
		   xtime ? errdetail("last completed transaction was at log time %s",
							 timestamptz_to_str(xtime)) : 0));
7724 7725

	LWLockRelease(CheckpointLock);
7726 7727

	/*
7728
	 * Finally, execute archive_cleanup_command, if any.
7729
	 */
7730 7731 7732
	if (XLogCtl->archiveCleanupCommand[0])
		ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
							   "archive_cleanup_command",
7733 7734
							   false);

7735
	return true;
7736 7737
}

T
Tom Lane 已提交
7738 7739 7740
/*
 * Write a NEXTOID log record
 */
7741 7742 7743
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
7744
	XLogRecData rdata;
7745

B
Bruce Momjian 已提交
7746
	rdata.data = (char *) (&nextOid);
7747
	rdata.len = sizeof(Oid);
7748
	rdata.buffer = InvalidBuffer;
7749 7750
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
B
Bruce Momjian 已提交
7751

7752 7753
	/*
	 * We need not flush the NEXTOID record immediately, because any of the
B
Bruce Momjian 已提交
7754 7755 7756 7757 7758
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update that would have its own XLOG record that must follow the NEXTOID
	 * record.	Therefore, the standard buffer LSN interlock applied to those
	 * records will ensure no such OID reaches disk before the NEXTOID record
	 * does.
7759 7760
	 *
	 * Note, however, that the above statement only covers state "within" the
B
Bruce Momjian 已提交
7761 7762
	 * database.  When we use a generated OID as a file or directory name, we
	 * are in a sense violating the basic WAL rule, because that filesystem
7763
	 * change may reach disk before the NEXTOID WAL record does.  The impact
B
Bruce Momjian 已提交
7764 7765 7766 7767 7768
	 * of this is that if a database crash occurs immediately afterward, we
	 * might after restart re-generate the same OID and find that it conflicts
	 * with the leftover file or directory.  But since for safety's sake we
	 * always loop until finding a nonconflicting filename, this poses no real
	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7769 7770 7771
	 */
}

7772 7773 7774 7775 7776 7777 7778 7779 7780 7781
/*
 * Write an XLOG SWITCH record.
 *
 * Here we just blindly issue an XLogInsert request for the record.
 * All the magic happens inside XLogInsert.
 *
 * The return value is either the end+1 address of the switch record,
 * or the end+1 address of the prior segment if we did not need to
 * write a switch record because we are already at segment start.
 */
7782
XLogRecPtr
7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798
RequestXLogSwitch(void)
{
	XLogRecPtr	RecPtr;
	XLogRecData rdata;

	/* XLOG SWITCH, alone among xlog record types, has no data */
	rdata.buffer = InvalidBuffer;
	rdata.data = NULL;
	rdata.len = 0;
	rdata.next = NULL;

	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);

	return RecPtr;
}

7799
/*
7800 7801
 * Check if any of the GUC parameters that are critical for hot standby
 * have changed, and update the value in pg_control file if necessary.
7802
 */
7803 7804
static void
XLogReportParameters(void)
7805
{
7806 7807 7808
	if (wal_level != ControlFile->wal_level ||
		MaxConnections != ControlFile->MaxConnections ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
7809
		max_locks_per_xact != ControlFile->max_locks_per_xact)
7810 7811
	{
		/*
B
Bruce Momjian 已提交
7812 7813 7814 7815 7816
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway. We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
7817 7818 7819 7820 7821
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			XLogRecData rdata;
			xl_parameter_change xlrec;
7822

7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834
			xlrec.MaxConnections = MaxConnections;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;

			rdata.buffer = InvalidBuffer;
			rdata.data = (char *) &xlrec;
			rdata.len = sizeof(xlrec);
			rdata.next = NULL;

			XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
		}
7835

7836 7837 7838 7839 7840 7841
		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		UpdateControlFile();
	}
7842 7843
}

T
Tom Lane 已提交
7844 7845
/*
 * XLOG resource manager's routines
7846 7847
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
7848
 * not all record types are related to control file updates.
T
Tom Lane 已提交
7849
 */
V
WAL  
Vadim B. Mikheev 已提交
7850 7851 7852
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
B
Bruce Momjian 已提交
7853
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
7854

7855 7856 7857
	/* Backup blocks are not used in xlog records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

7858
	if (info == XLOG_NEXTOID)
7859
	{
B
Bruce Momjian 已提交
7860
		Oid			nextOid;
7861 7862 7863

		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < nextOid)
T
Tom Lane 已提交
7864
		{
7865
			ShmemVariableCache->nextOid = nextOid;
T
Tom Lane 已提交
7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877
			ShmemVariableCache->oidCount = 0;
		}
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
7878 7879
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);
7880
		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
B
Bruce Momjian 已提交
7881

7882
		/*
B
Bruce Momjian 已提交
7883 7884 7885
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was cancelled and the end-of-backup record will
		 * never arrive.
7886 7887 7888 7889 7890 7891
		 */
		if (InArchiveRecovery &&
			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
			ereport(ERROR,
					(errmsg("online backup was cancelled, recovery cannot continue")));

7892
		/*
B
Bruce Momjian 已提交
7893 7894 7895 7896
		 * If we see a shutdown checkpoint, we know that nothing was running
		 * on the master at this point. So fake-up an empty running-xacts
		 * record and use that here and now. Recover additional standby state
		 * for prepared transactions.
7897
		 */
7898 7899
		if (standbyState >= STANDBY_INITIALIZED)
		{
7900 7901 7902
			TransactionId *xids;
			int			nxids;
			TransactionId oldestActiveXID;
7903
			TransactionId latestCompletedXid;
7904 7905 7906 7907
			RunningTransactionsData running;

			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

7908
			/*
7909
			 * Construct a RunningTransactions snapshot representing a shut
B
Bruce Momjian 已提交
7910 7911 7912
			 * down server, with only prepared transactions still alive. We're
			 * never overflowed at this point because all subxids are listed
			 * with their parent prepared transactions.
7913
			 */
7914 7915 7916 7917
			running.xcnt = nxids;
			running.subxid_overflow = false;
			running.nextXid = checkPoint.nextXid;
			running.oldestRunningXid = oldestActiveXID;
7918 7919
			latestCompletedXid = checkPoint.nextXid;
			TransactionIdRetreat(latestCompletedXid);
7920
			Assert(TransactionIdIsNormal(latestCompletedXid));
7921
			running.latestCompletedXid = latestCompletedXid;
7922 7923 7924 7925 7926
			running.xids = xids;

			ProcArrayApplyRecoveryInfo(&running);

			StandbyRecoverPreparedTransactions(true);
7927 7928
		}

7929 7930 7931 7932
		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

7933
		/*
B
Bruce Momjian 已提交
7934
		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
7935 7936 7937 7938 7939 7940 7941 7942
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
		{
			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
				!list_member_int(expectedTLIs,
								 (int) checkPoint.ThisTimeLineID))
				ereport(PANIC,
						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7943 7944 7945
								checkPoint.ThisTimeLineID, ThisTimeLineID)));
			/* Following WAL records should be run with new TLI */
			ThisTimeLineID = checkPoint.ThisTimeLineID;
7946
		}
7947 7948

		RecoveryRestartPoint(&checkPoint);
T
Tom Lane 已提交
7949 7950 7951 7952 7953 7954
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7955
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
7956 7957
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
T
Tom Lane 已提交
7958 7959 7960 7961 7962 7963
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		if (ShmemVariableCache->nextOid < checkPoint.nextOid)
		{
			ShmemVariableCache->nextOid = checkPoint.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
7964 7965
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);
7966 7967
		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
								  checkPoint.oldestXid))
7968 7969
			SetTransactionIdLimit(checkPoint.oldestXid,
								  checkPoint.oldestXidDB);
7970 7971 7972 7973 7974

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

7975 7976
		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7977
			ereport(PANIC,
7978 7979
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));
7980 7981

		RecoveryRestartPoint(&checkPoint);
7982
	}
7983 7984 7985 7986
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
7987 7988 7989 7990
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
7991 7992 7993
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;
B
Bruce Momjian 已提交
7994

7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017
		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done. The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
				ControlFile->minRecoveryPoint = lsn;
			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
8018
	else if (info == XLOG_PARAMETER_CHANGE)
8019
	{
8020 8021 8022 8023 8024
		xl_parameter_change xlrec;

		/* Update our copy of the parameters in pg_control */
		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

8025
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8026 8027 8028 8029
		ControlFile->MaxConnections = xlrec.MaxConnections;
		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
		ControlFile->wal_level = xlrec.wal_level;
B
Bruce Momjian 已提交
8030

8031
		/*
B
Bruce Momjian 已提交
8032 8033 8034 8035 8036 8037
		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
		 * recover back up to this point before allowing hot standby again.
		 * This is particularly important if wal_level was set to 'archive'
		 * before, and is now 'hot_standby', to ensure you don't run queries
		 * against the WAL preceding the wal_level change. Same applies to
		 * decreasing max_* settings.
8038 8039 8040 8041 8042 8043 8044 8045
		 */
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		if ((minRecoveryPoint.xlogid != 0 || minRecoveryPoint.xrecoff != 0)
			&& XLByteLT(minRecoveryPoint, lsn))
		{
			ControlFile->minRecoveryPoint = lsn;
		}

8046
		UpdateControlFile();
8047
		LWLockRelease(ControlFileLock);
8048 8049 8050

		/* Check to see if any changes to max_connections give problems */
		CheckRequiredParameterValues();
8051
	}
V
WAL  
Vadim B. Mikheev 已提交
8052
}
B
Bruce Momjian 已提交
8053

V
WAL  
Vadim B. Mikheev 已提交
8054
void
8055
xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
8056
{
B
Bruce Momjian 已提交
8057
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
8058

T
Tom Lane 已提交
8059 8060
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
8061
	{
B
Bruce Momjian 已提交
8062 8063
		CheckPoint *checkpoint = (CheckPoint *) rec;

8064
		appendStringInfo(buf, "checkpoint: redo %X/%X; "
8065
						 "tli %u; xid %u/%u; oid %u; multi %u; offset %u; "
8066
						 "oldest xid %u in DB %u; oldest running xid %u; %s",
B
Bruce Momjian 已提交
8067 8068 8069 8070 8071 8072
						 checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
						 checkpoint->ThisTimeLineID,
						 checkpoint->nextXidEpoch, checkpoint->nextXid,
						 checkpoint->nextOid,
						 checkpoint->nextMulti,
						 checkpoint->nextMultiOffset,
8073 8074
						 checkpoint->oldestXid,
						 checkpoint->oldestXidDB,
8075
						 checkpoint->oldestActiveXid,
B
Bruce Momjian 已提交
8076
				 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
8077
	}
8078 8079 8080 8081
	else if (info == XLOG_NOOP)
	{
		appendStringInfo(buf, "xlog no-op");
	}
8082 8083
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
8084
		Oid			nextOid;
8085 8086

		memcpy(&nextOid, rec, sizeof(Oid));
8087
		appendStringInfo(buf, "nextOid: %u", nextOid);
8088
	}
8089 8090 8091 8092
	else if (info == XLOG_SWITCH)
	{
		appendStringInfo(buf, "xlog switch");
	}
8093 8094
	else if (info == XLOG_BACKUP_END)
	{
B
Bruce Momjian 已提交
8095
		XLogRecPtr	startpoint;
8096 8097 8098 8099 8100

		memcpy(&startpoint, rec, sizeof(XLogRecPtr));
		appendStringInfo(buf, "backup end: %X/%X",
						 startpoint.xlogid, startpoint.xrecoff);
	}
8101
	else if (info == XLOG_PARAMETER_CHANGE)
8102
	{
8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118
		xl_parameter_change xlrec;
		const char *wal_level_str;
		const struct config_enum_entry *entry;

		memcpy(&xlrec, rec, sizeof(xl_parameter_change));

		/* Find a string representation for wal_level */
		wal_level_str = "?";
		for (entry = wal_level_options; entry->name; entry++)
		{
			if (entry->val == xlrec.wal_level)
			{
				wal_level_str = entry->name;
				break;
			}
		}
8119

8120 8121 8122 8123 8124
		appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
						 xlrec.MaxConnections,
						 xlrec.max_prepared_xacts,
						 xlrec.max_locks_per_xact,
						 wal_level_str);
8125
	}
V
WAL  
Vadim B. Mikheev 已提交
8126
	else
8127
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
8128 8129
}

8130
#ifdef WAL_DEBUG

/*
 * xlog_outrec: append a summary of a WAL record's header to 'buf'.
 *
 * The output lists the previous-record pointer, the xid, the data length,
 * one "bkpbN" marker (1-based) for every backup block flagged in xl_info,
 * and finally the name of the resource manager that owns the record.
 * Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			i;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	appendStringInfo(buf, "; len %u",
					 record->xl_len);

	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(i))
			appendStringInfo(buf, "; bkpb%d", i + 1);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
8153 8154 8155


/*
8156 8157
 * Return the (possible) sync flag used for opening a file, depending on the
 * value of the GUC wal_sync_method.
8158
 */
8159 8160
static int
get_sync_bit(int method)
8161
{
B
Bruce Momjian 已提交
8162
	int			o_direct_flag = 0;
8163

8164 8165 8166
	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;
8167

8168 8169 8170
	/*
	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
	 * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
B
Bruce Momjian 已提交
8171 8172 8173 8174 8175
	 * disabled, otherwise the archive command or walsender process will read
	 * the WAL soon after writing it, which is guaranteed to cause a physical
	 * read if we bypassed the kernel cache. We also skip the
	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
	 * reason.
8176 8177 8178 8179 8180 8181 8182 8183 8184
	 *
	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
	 * written by walreceiver is normally read by the startup process soon
	 * after its written. Also, walreceiver performs unaligned writes, which
	 * don't work with O_DIRECT, so it is required for correctness too.
	 */
	if (!XLogIsNeeded() && !am_walreceiver)
		o_direct_flag = PG_O_DIRECT;

8185
	switch (method)
8186
	{
8187 8188 8189 8190 8191 8192
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
8193 8194 8195
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
8196
			return 0;
8197
#ifdef OPEN_SYNC_FLAG
8198
		case SYNC_METHOD_OPEN:
8199
			return OPEN_SYNC_FLAG | o_direct_flag;
8200 8201
#endif
#ifdef OPEN_DATASYNC_FLAG
8202
		case SYNC_METHOD_OPEN_DSYNC:
8203
			return OPEN_DATASYNC_FLAG | o_direct_flag;
8204
#endif
8205
		default:
8206 8207
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
8208
			return 0;			/* silence warning */
8209
	}
8210
}
8211

8212 8213 8214 8215 8216 8217
/*
 * GUC support
 *
 * assign_xlog_sync_method: assign hook run when wal_sync_method is changed.
 *
 * 'new_sync_method' is the proposed value.  When 'doit' is false nothing is
 * done and the value is simply accepted.  'source' is not consulted here.
 * Always returns true; a failed fsync of the open segment raises PANIC.
 */
bool
assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
{
	if (!doit)
		return true;

	if (sync_method != new_sync_method)
	{
		/*
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new flag
		 * bit) at next use.
		 */
		if (openLogFile >= 0)
		{
			if (pg_fsync(openLogFile) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync log file %u, segment %u: %m",
								openLogId, openLogSeg)));
			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
				XLogFileClose();
		}
	}

	return true;
}


/*
8246 8247 8248 8249
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
8250
 */
8251 8252
void
issue_xlog_fsync(int fd, uint32 log, uint32 seg)
8253 8254 8255
{
	switch (sync_method)
	{
8256
		case SYNC_METHOD_FSYNC:
8257
			if (pg_fsync_no_writethrough(fd) != 0)
8258 8259
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8260
						 errmsg("could not fsync log file %u, segment %u: %m",
8261
								log, seg)));
8262
			break;
8263 8264
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
8265
			if (pg_fsync_writethrough(fd) != 0)
8266 8267
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8268
						 errmsg("could not fsync write-through log file %u, segment %u: %m",
8269
								log, seg)));
8270 8271
			break;
#endif
8272 8273
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
8274
			if (pg_fdatasync(fd) != 0)
8275 8276
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8277
					errmsg("could not fdatasync log file %u, segment %u: %m",
8278
						   log, seg)));
8279 8280 8281
			break;
#endif
		case SYNC_METHOD_OPEN:
8282
		case SYNC_METHOD_OPEN_DSYNC:
8283 8284 8285
			/* write synced it already */
			break;
		default:
8286
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8287 8288 8289
			break;
	}
}
8290 8291 8292 8293 8294 8295 8296 8297 8298


/*
 * pg_start_backup: set up for taking an on-line backup dump
 *
 * Essentially what this does is to create a backup label file in $PGDATA,
 * where it will be archived as part of the backup dump.  The label file
 * contains the user-supplied label string (typically this would be used
 * to tell where the backup dump will be stored) and the starting time and
8299
 * starting WAL location for the dump.
8300 8301 8302 8303 8304
 */
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
	text	   *backupid = PG_GETARG_TEXT_P(0);
8305
	bool		fast = PG_GETARG_BOOL(1);
8306
	char	   *backupidstr;
8307
	XLogRecPtr	checkpointloc;
8308
	XLogRecPtr	startpoint;
8309
	pg_time_t	stamp_time;
8310 8311 8312 8313 8314 8315 8316
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	uint32		_logId;
	uint32		_logSeg;
	struct stat stat_buf;
	FILE	   *fp;

B
Bruce Momjian 已提交
8317
	if (!superuser())
8318 8319
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8320
				 errmsg("must be superuser to run a backup")));
8321

8322 8323 8324 8325 8326 8327
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8328
	if (!XLogIsNeeded())
8329 8330
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
B
Bruce Momjian 已提交
8331
			  errmsg("WAL level not sufficient for making an online backup"),
8332
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8333

8334
	backupidstr = text_to_cstring(backupid);
B
Bruce Momjian 已提交
8335

8336
	/*
8337 8338 8339
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
B
Bruce Momjian 已提交
8340 8341 8342 8343 8344 8345 8346 8347 8348
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.	This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
8349
	 *
B
Bruce Momjian 已提交
8350 8351
	 * We must hold WALInsertLock to change the value of forcePageWrites, to
	 * ensure adequate interlocking against XLogInsert().
8352
	 */
8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (XLogCtl->Insert.forcePageWrites)
	{
		LWLockRelease(WALInsertLock);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("a backup is already in progress"),
				 errhint("Run pg_stop_backup() and try again.")));
	}
	XLogCtl->Insert.forcePageWrites = true;
	LWLockRelease(WALInsertLock);
B
Bruce Momjian 已提交
8364

8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377
	/*
	 * Force an XLOG file switch before the checkpoint, to ensure that the WAL
	 * segment the checkpoint is written to doesn't contain pages with old
	 * timeline IDs. That would otherwise happen if you called
	 * pg_start_backup() right after restoring from a PITR archive: the first
	 * WAL segment containing the startup checkpoint has pages in the
	 * beginning with the old timeline ID. That can cause trouble at recovery:
	 * we won't have a history file covering the old timeline if pg_xlog
	 * directory was not included in the base backup and the WAL archive was
	 * cleared too before starting the backup.
	 */
	RequestXLogSwitch();

8378 8379
	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
8380 8381
	{
		/*
B
Bruce Momjian 已提交
8382
		 * Force a CHECKPOINT.	Aside from being necessary to prevent torn
8383 8384 8385
		 * page problems, this guarantees that two successive backup runs will
		 * have different checkpoint positions and hence different history
		 * file names, even if nothing happened in between.
8386
		 *
8387 8388
		 * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
		 * fast = true).  Otherwise this can take awhile.
8389
		 */
8390 8391
		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
						  (fast ? CHECKPOINT_IMMEDIATE : 0));
8392

8393 8394 8395 8396 8397
		/*
		 * Now we need to fetch the checkpoint record location, and also its
		 * REDO pointer.  The oldest point in WAL that would be needed to
		 * restore starting from the checkpoint is precisely the REDO pointer.
		 */
8398
		LWLockAcquire(ControlFileLock, LW_SHARED);
8399 8400 8401
		checkpointloc = ControlFile->checkPoint;
		startpoint = ControlFile->checkPointCopy.redo;
		LWLockRelease(ControlFileLock);
B
Bruce Momjian 已提交
8402

8403 8404
		XLByteToSeg(startpoint, _logId, _logSeg);
		XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
B
Bruce Momjian 已提交
8405

8406 8407 8408 8409 8410
		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436

		/*
		 * Check for existing backup label --- implies a backup is already
		 * running.  (XXX given that we checked forcePageWrites above, maybe
		 * it would be OK to just unlink any such label file?)
		 */
		if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
		}
		else
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
							 BACKUP_LABEL_FILE)));

		/*
		 * Okay, write the file
		 */
		fp = AllocateFile(BACKUP_LABEL_FILE, "w");
		if (!fp)
8437 8438
			ereport(ERROR,
					(errcode_for_file_access(),
8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450
					 errmsg("could not create file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
				startpoint.xlogid, startpoint.xrecoff, xlogfilename);
		fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
				checkpointloc.xlogid, checkpointloc.xrecoff);
		fprintf(fp, "START TIME: %s\n", strfbuf);
		fprintf(fp, "LABEL: %s\n", backupidstr);
		if (fflush(fp) || ferror(fp) || FreeFile(fp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
8451
							BACKUP_LABEL_FILE)));
8452
	}
8453
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
B
Bruce Momjian 已提交
8454

8455
	/*
8456
	 * We're done.  As a convenience, return the starting WAL location.
8457 8458 8459
	 */
	snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
			 startpoint.xlogid, startpoint.xrecoff);
8460
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
8461 8462
}

8463 8464 8465 8466 8467 8468 8469 8470 8471 8472
/*
 * Error cleanup callback for pg_start_backup
 *
 * Clears the shared forcePageWrites flag under WALInsertLock, undoing the
 * "backup active" marking made by pg_start_backup.  Neither 'code' nor
 * 'arg' is used.
 */
static void
pg_start_backup_callback(int code, Datum arg)
{
	/* Turn off forcePageWrites on failure */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	XLogCtl->Insert.forcePageWrites = false;
	LWLockRelease(WALInsertLock);
}

8473 8474 8475
/*
 * pg_stop_backup: finish taking an on-line backup dump
 *
8476 8477 8478 8479
 * We write an end-of-backup WAL record, and remove the backup label file
 * created by pg_start_backup, creating a backup history file in pg_xlog
 * instead (whence it will immediately be archived). The backup history file
 * contains the same info found in the label file, plus the backup-end time
8480
 * and WAL location. Before 9.0, the backup-end time was read from the backup
8481 8482 8483
 * history file at the beginning of archive recovery, but we now use the WAL
 * record for that and the file is for informational and debug purposes only.
 *
8484
 * Note: different from CancelBackup which just cancels online backup mode.
8485 8486 8487 8488 8489 8490
 */
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
B
Bruce Momjian 已提交
8491
	XLogRecData rdata;
8492
	pg_time_t	stamp_time;
8493
	char		strfbuf[128];
8494
	char		histfilepath[MAXPGPATH];
8495 8496
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
8497 8498
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
8499 8500 8501 8502 8503 8504
	uint32		_logId;
	uint32		_logSeg;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			ich;
8505 8506
	int			seconds_before_warning;
	int			waits = 0;
8507
	bool		reported_waiting = false;
8508

B
Bruce Momjian 已提交
8509
	if (!superuser())
8510 8511 8512
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to run a backup"))));
B
Bruce Momjian 已提交
8513

8514 8515 8516 8517 8518 8519
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8520
	if (!XLogIsNeeded())
8521 8522
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
B
Bruce Momjian 已提交
8523
			  errmsg("WAL level not sufficient for making an online backup"),
8524
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8525

8526
	/*
8527
	 * OK to clear forcePageWrites
8528 8529
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8530
	XLogCtl->Insert.forcePageWrites = false;
8531 8532 8533 8534 8535
	LWLockRelease(WALInsertLock);

	/*
	 * Open the existing label file
	 */
8536
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8537 8538 8539 8540 8541 8542
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
8543
							BACKUP_LABEL_FILE)));
8544 8545 8546 8547
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("a backup is not in progress")));
	}
B
Bruce Momjian 已提交
8548

8549
	/*
B
Bruce Momjian 已提交
8550 8551
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
8552 8553 8554 8555 8556 8557
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8558
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
8559

8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574
	/*
	 * Write the backup-end xlog record
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

8575
	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
8576 8577 8578 8579 8580 8581 8582 8583
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

8584 8585 8586 8587
	/*
	 * Write the backup history file
	 */
	XLByteToSeg(startpoint, _logId, _logSeg);
8588
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
8589
						  startpoint.xrecoff % XLogSegSize);
8590
	fp = AllocateFile(histfilepath, "w");
8591 8592 8593 8594
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
8595
						histfilepath)));
8596 8597 8598 8599
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
8600
	/* transfer remaining lines from label to history file */
8601 8602 8603 8604 8605 8606 8607
	while ((ich = fgetc(lfp)) != EOF)
		fputc(ich, fp);
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
8608
						histfilepath)));
B
Bruce Momjian 已提交
8609

8610 8611 8612 8613 8614 8615 8616
	/*
	 * Close and remove the backup label file
	 */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
8617 8618
						BACKUP_LABEL_FILE)));
	if (unlink(BACKUP_LABEL_FILE) != 0)
8619 8620 8621
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
8622
						BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
8623

8624
	/*
B
Bruce Momjian 已提交
8625 8626 8627
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
8628
	 */
8629
	CleanupBackupHistory();
B
Bruce Momjian 已提交
8630

8631
	/*
8632
	 * If archiving is enabled, wait for all the required WAL files to be
B
Bruce Momjian 已提交
8633 8634 8635 8636 8637 8638
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_segments set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_xlog with script. We have no
	 * knowledge of those mechanisms, so it's up to the user to ensure that he
	 * gets all the required WAL.
8639 8640
	 *
	 * We wait until both the last WAL file filled during backup and the
B
Bruce Momjian 已提交
8641 8642 8643
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
8644
	 *
8645 8646
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
B
Bruce Momjian 已提交
8647 8648
	 * wish to wait, you can set statement_timeout.  Also, some notices are
	 * issued to clue in anyone who might be doing this interactively.
8649
	 */
8650 8651
	if (XLogArchivingActive())
	{
B
Bruce Momjian 已提交
8652 8653
		XLByteToPrevSeg(stoppoint, _logId, _logSeg);
		XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
8654

B
Bruce Momjian 已提交
8655 8656 8657
		XLByteToSeg(startpoint, _logId, _logSeg);
		BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
							  startpoint.xrecoff % XLogSegSize);
8658

B
Bruce Momjian 已提交
8659 8660
		seconds_before_warning = 60;
		waits = 0;
8661

B
Bruce Momjian 已提交
8662 8663
		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
8664
		{
B
Bruce Momjian 已提交
8665
			CHECK_FOR_INTERRUPTS();
8666

B
Bruce Momjian 已提交
8667 8668 8669 8670 8671 8672
			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}
8673

B
Bruce Momjian 已提交
8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685
			pg_usleep(1000000L);

			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly.  "
								 "pg_stop_backup can be cancelled safely, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
8686 8687
		}

B
Bruce Momjian 已提交
8688 8689
		ereport(NOTICE,
				(errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
8690 8691 8692 8693
	}
	else
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
8694

8695
	/*
8696
	 * We're done.  As a convenience, return the ending WAL location.
8697 8698 8699
	 */
	snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
			 stoppoint.xlogid, stoppoint.xrecoff);
8700
	PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
8701
}
8702

8703 8704 8705 8706 8707 8708
/*
 * pg_switch_xlog: switch to next xlog file
 */
Datum
pg_switch_xlog(PG_FUNCTION_ARGS)
{
B
Bruce Momjian 已提交
8709
	XLogRecPtr	switchpoint;
8710 8711 8712 8713 8714
	char		location[MAXFNAMELEN];

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
B
Bruce Momjian 已提交
8715
			 (errmsg("must be superuser to switch transaction log files"))));
8716

8717 8718 8719 8720 8721 8722
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8723 8724 8725 8726 8727 8728 8729
	switchpoint = RequestXLogSwitch();

	/*
	 * As a convenience, return the WAL location of the switch record
	 */
	snprintf(location, sizeof(location), "%X/%X",
			 switchpoint.xlogid, switchpoint.xrecoff);
8730
	PG_RETURN_TEXT_P(cstring_to_text(location));
8731 8732 8733
}

/*
8734 8735 8736 8737 8738
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
8739 8740 8741
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS)
8742 8743 8744
{
	char		location[MAXFNAMELEN];

8745 8746 8747 8748 8749 8750
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762
	/* Make sure we have an up-to-date local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	snprintf(location, sizeof(location), "%X/%X",
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
8763
	PG_RETURN_TEXT_P(cstring_to_text(location));
8764 8765 8766 8767 8768 8769 8770 8771 8772
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
8773 8774 8775 8776 8777
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecPtr	current_recptr;
	char		location[MAXFNAMELEN];

8778 8779 8780 8781 8782 8783
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8784 8785 8786 8787 8788 8789 8790 8791 8792
	/*
	 * Get the current end-of-WAL position ... shared lock is sufficient
	 */
	LWLockAcquire(WALInsertLock, LW_SHARED);
	INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
	LWLockRelease(WALInsertLock);

	snprintf(location, sizeof(location), "%X/%X",
			 current_recptr.xlogid, current_recptr.xrecoff);
8793
	PG_RETURN_TEXT_P(cstring_to_text(location));
8794 8795
}

8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807
/*
 * Report the last WAL receive location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is guaranteed to be received
 * and synced to disk by walreceiver.
 */
Datum
pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
{
	XLogRecPtr	recptr;
	char		location[MAXFNAMELEN];

8808
	recptr = GetWalRcvWriteRecPtr(NULL);
8809

8810 8811 8812
	if (recptr.xlogid == 0 && recptr.xrecoff == 0)
		PG_RETURN_NULL();

8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835
	snprintf(location, sizeof(location), "%X/%X",
			 recptr.xlogid, recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Report the last WAL replay location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to read-only
 * connections during recovery.
 */
Datum
pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;
	char		location[MAXFNAMELEN];

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->recoveryLastRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

8836 8837 8838
	if (recptr.xlogid == 0 && recptr.xrecoff == 0)
		PG_RETURN_NULL();

8839 8840 8841 8842 8843
	snprintf(location, sizeof(location), "%X/%X",
			 recptr.xlogid, recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863
/*
 * Compute an xlog file name and decimal byte offset given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 *
 * Note that a location exactly at a segment boundary is taken to be in
 * the previous segment.  This is usually the right thing, since the
 * expected usage is to determine which xlog file(s) are ready to archive.
 */
Datum
pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	uint32		xrecoff;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];
B
Bruce Momjian 已提交
8864 8865 8866 8867 8868
	Datum		values[2];
	bool		isnull[2];
	TupleDesc	resultTupleDesc;
	HeapTuple	resultHeapTuple;
	Datum		result;
8869

8870 8871 8872 8873 8874 8875
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("pg_xlogfile_name_offset() cannot be executed during recovery.")));

8876 8877 8878
	/*
	 * Read input and parse
	 */
8879
	locationstr = text_to_cstring(location);
8880 8881 8882 8883

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
8884
				 errmsg("could not parse transaction log location \"%s\"",
8885 8886 8887 8888 8889
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

8890
	/*
B
Bruce Momjian 已提交
8891 8892
	 * Construct a tuple descriptor for the result row.  This must match this
	 * function's pg_proc entry!
8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904
	 */
	resultTupleDesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
					   INT4OID, -1, 0);

	resultTupleDesc = BlessTupleDesc(resultTupleDesc);

	/*
	 * xlogfilename
	 */
8905 8906 8907
	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

8908
	values[0] = CStringGetTextDatum(xlogfilename);
8909 8910 8911 8912 8913
	isnull[0] = false;

	/*
	 * offset
	 */
8914 8915
	xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;

8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926
	values[1] = UInt32GetDatum(xrecoff);
	isnull[1] = false;

	/*
	 * Tuple jam: Having first prepared your Datums, then squash together
	 */
	resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);

	result = HeapTupleGetDatum(resultHeapTuple);

	PG_RETURN_DATUM(result);
8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944
}

/*
 * Compute an xlog file name given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 */
Datum
pg_xlogfile_name(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];

8945 8946 8947 8948
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
B
Bruce Momjian 已提交
8949
		 errhint("pg_xlogfile_name() cannot be executed during recovery.")));
8950

8951
	locationstr = text_to_cstring(location);
8952 8953 8954 8955

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
8956
				 errmsg("could not parse transaction log location \"%s\"",
8957 8958 8959 8960 8961 8962 8963 8964
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

8965
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
8966 8967
}

8968 8969 8970 8971 8972
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
B
Bruce Momjian 已提交
8973
 * identified by the label file, NOT what pg_control says.	This avoids the
8974 8975 8976 8977 8978
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found (and fills the checkpoint
8979 8980
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns FALSE if not.
8981 8982
 */
static bool
8983
read_backup_label(XLogRecPtr *checkPointLoc)
8984 8985 8986 8987 8988 8989 8990 8991 8992
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;

	/*
	 * See if label file is present
	 */
8993
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8994 8995 8996 8997 8998 8999
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
9000
							BACKUP_LABEL_FILE)));
9001 9002
		return false;			/* it's not there, all is fine */
	}
B
Bruce Momjian 已提交
9003

9004
	/*
B
Bruce Momjian 已提交
9005 9006 9007
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
9008 9009
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9010
			   &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
9011 9012 9013
			   startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9014
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9015 9016 9017 9018 9019
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
			   &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9020
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9021 9022 9023 9024
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
9025
						BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
9026

9027 9028 9029
	return true;
}

9030 9031 9032 9033 9034 9035
/*
 * Error context callback for errors occurring during rm_redo().
 */
static void
rm_redo_error_callback(void *arg)
{
B
Bruce Momjian 已提交
9036 9037
	XLogRecord *record = (XLogRecord *) arg;
	StringInfoData buf;
9038 9039

	initStringInfo(&buf);
9040 9041
	RmgrTable[record->xl_rmid].rm_desc(&buf,
									   record->xl_info,
9042 9043 9044 9045 9046 9047 9048 9049
									   XLogRecGetData(record));

	/* don't bother emitting empty description */
	if (buf.len > 0)
		errcontext("xlog redo %s", buf.data);

	pfree(buf.data);
}
9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086

/*
 * BackupInProgress: check if online backup mode is active
 *
 * This is done by checking for existence of the "backup_label" file.
 */
bool
BackupInProgress(void)
{
	struct stat stat_buf;

	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}

/*
 * CancelBackup: rename the "backup_label" file to cancel backup mode
 *
 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
 * Note that this will render an online backup in progress useless.
 * To correctly finish an online backup, pg_stop_backup must be called.
 *
 * Nothing happens when stat() on the label file fails.  A failed rename is
 * reported as a WARNING, a successful one is logged at LOG level.
 */
void
CancelBackup(void)
{
	struct stat label_stat;

	/* no label file, nothing to cancel */
	if (stat(BACKUP_LABEL_FILE, &label_stat) < 0)
		return;

	/*
	 * Clear out a leftover file from a previously cancelled backup, if any.
	 * The result is deliberately not checked.
	 */
	unlink(BACKUP_LABEL_OLD);

	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not cancelled"),
				 errdetail("Could not rename \"%s\" to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	ereport(LOG,
			(errmsg("online backup mode cancelled"),
			 errdetail("\"%s\" was renamed to \"%s\".",
					   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}

9100
/* ------------------------------------------------------
 *	Startup Process main entry point and signal handlers
 * ------------------------------------------------------
 */

/*
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm, so we need to stop what we're doing
 * and exit: block signals, discard the registered exit callbacks, and
 * leave with status 2.
 */
static void
startupproc_quickdie(SIGNAL_ARGS)
{
	PG_SETMASK(&BlockSig);

	/*
	 * proc_exit() callbacks must NOT run: we are here because shared memory
	 * may be corrupted, so attempting to clean up our transaction is
	 * unwanted.  Since an atexit callback now exists to stop third-party
	 * code from breaking things by calling exit() directly, the callbacks
	 * have to be reset explicitly for this to work as intended.
	 */
	on_exit_reset();

	/*
	 * Exit with status 2 rather than 0.  That forces the postmaster into a
	 * system reset cycle if a DBA manually sends SIGQUIT to a random
	 * backend, which is needed precisely because we do not clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should make the postmaster treat this as a crash anyway, but there is
	 * no harm in being doubly sure.)
	 */
	exit(2);
}


9138 9139 9140 9141 9142 9143 9144
/*
 * SIGHUP: set flag to re-read config file at next convenient time.
 *
 * The handler only records the request; HandleStartupProcInterrupts() is
 * what notices got_SIGHUP and calls ProcessConfigFile().
 */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

9145 9146 9147 9148 9149
/* SIGTERM: set flag to abort redo and exit */
static void
StartupProcShutdownHandler(SIGNAL_ARGS)
{
	if (in_restore_command)
9150
		proc_exit(1);
9151 9152 9153 9154
	else
		shutdown_requested = true;
}

9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166
/* Handle SIGHUP and SIGTERM signals of startup process */
void
HandleStartupProcInterrupts(void)
{
	/*
	 * Check if we were requested to re-read config file.
	 */
	if (got_SIGHUP)
	{
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
	}
B
Bruce Momjian 已提交
9167

9168 9169 9170 9171 9172
	/*
	 * Check if we were requested to exit without finishing recovery.
	 */
	if (shutdown_requested)
		proc_exit(1);
9173 9174 9175 9176 9177 9178 9179

	/*
	 * Emergency bailout if postmaster has died.  This is to avoid the
	 * necessity for manual cleanup of all postmaster children.
	 */
	if (IsUnderPostmaster && !PostmasterIsAlive(true))
		exit(1);
9180 9181
}

9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195
/* Main entry point for startup process */
void
StartupProcessMain(void)
{
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
9196 9197 9198 9199 9200
	 * Properly accept or ignore signals the postmaster might send us.
	 *
	 * Note: ideally we'd not enable handle_standby_sig_alarm unless actually
	 * doing hot standby, but we don't know that yet.  Rely on it to not do
	 * anything if it shouldn't.
9201
	 */
9202 9203
	pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
	pqsignal(SIGINT, SIG_IGN);	/* ignore query cancel */
B
Bruce Momjian 已提交
9204 9205
	pqsignal(SIGTERM, StartupProcShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);	/* hard crash time */
9206
	if (EnableHotStandby)
B
Bruce Momjian 已提交
9207 9208
		pqsignal(SIGALRM, handle_standby_sig_alarm);	/* ignored unless
														 * InHotStandby */
9209 9210
	else
		pqsignal(SIGALRM, SIG_IGN);
9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

9229
	StartupXLOG();
9230

9231
	/*
9232 9233
	 * Exit normally. Exit code 0 tells postmaster that we completed recovery
	 * successfully.
9234
	 */
9235 9236
	proc_exit(0);
}
9237 9238 9239

/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns true if the page is read successfully.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in archive recovery, or in standby mode when a
 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
 * false in those situations; on higher log levels the ereport() won't
 * return.
 *
 * 'fetching_ckpt' indicates that RecPtr points to the initial checkpoint
 * location; in that case streaming is requested from RedoStartLSN rather
 * than from RecPtr.
 *
 * 'randAccess' causes curFileTLI to be reset to 0 before a segment is
 * looked up in the archive or pg_xlog.
 *
 * In standby mode, if after a successful return of XLogPageRead() the
 * caller finds the record it's interested in to be broken, it should
 * ereport the error with the level determined by
 * emode_for_corrupt_record(), and then set "failedSources |= readSource"
 * and call XLogPageRead() again with the same arguments. This lets
 * XLogPageRead() try fetching the record from another source, or sleep
 * and retry.
 *
 * On failure (and when a standby trigger is detected) the current segment
 * file is closed and readFile, readLen and readSource are reset.
 */
static bool
XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess)
{
	/* Last write position obtained from GetWalRcvWriteRecPtr(), kept across calls */
	static XLogRecPtr receivedUpto = {0, 0};
	bool		switched_segment = false;
	uint32		targetPageOff;
	uint32		targetRecOff;
	uint32		targetId;
	uint32		targetSeg;
	/* Time of the last "all sources failed" event, used to throttle retries */
	static pg_time_t last_fail_time = 0;

	XLByteToSeg(*RecPtr, targetId, targetSeg);
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;

	/* Fast exit if we have read the record in the current buffer already */
	if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
		targetPageOff == readOff && targetRecOff < readLen)
		return true;

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
	{
		/*
		 * Signal bgwriter to start a restartpoint if we've replayed too much
		 * xlog since the last one.
		 */
		if (StandbyMode && bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readId, readSeg))
			{
				/*
				 * NOTE(review): the result is discarded; presumably this
				 * call refreshes state consulted by XLogCheckpointNeeded()
				 * before the re-check -- confirm against GetRedoRecPtr().
				 */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readId, readSeg))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(*RecPtr, readId, readSeg);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
	{
		if (StandbyMode)
		{
			/*
			 * In standby mode, wait for the requested record to become
			 * available, either via restore_command succeeding to restore the
			 * segment, or via walreceiver having streamed the record.
			 */
			for (;;)
			{
				if (WalRcvInProgress())
				{
					bool		havedata;

					/*
					 * If we find an invalid record in the WAL streamed from
					 * master, something is seriously wrong. There's little
					 * chance that the problem will just go away, but PANIC is
					 * not good for availability either, especially in hot
					 * standby mode. Disconnect, and retry from
					 * archive/pg_xlog again. The WAL in the archive should be
					 * identical to what was streamed, so it's unlikely that
					 * it helps, but one can hope...
					 */
					if (failedSources & XLOG_FROM_STREAM)
					{
						ShutdownWalRcv();
						continue;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle.  When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (XLByteLT(*RecPtr, receivedUpto))
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
						if (XLByteLT(*RecPtr, receivedUpto))
						{
							havedata = true;
							if (!XLByteLT(*RecPtr, latestChunkStart))
								XLogReceiptTime = GetCurrentTimestamp();
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already.  Use XLOG_FROM_STREAM so that
						 * source info is set correctly and XLogReceiptTime
						 * isn't changed.
						 */
						if (readFile < 0)
						{
							readFile =
								XLogFileRead(readId, readSeg, PANIC,
											 recoveryTargetTLI,
											 XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
							switched_segment = true;
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
						}
						break;
					}

					/*
					 * Data not here yet, so check for trigger then sleep.
					 */
					if (CheckForStandbyTrigger())
						goto triggered;

					/*
					 * When streaming is active, we want to react quickly when
					 * the next WAL record arrives, so sleep only a bit.
					 */
					pg_usleep(100000L); /* 100ms */
				}
				else
				{
					int			sources;
					pg_time_t	now;
					pg_time_t	elapsed;

					/*
					 * Until walreceiver manages to reconnect, poll the
					 * archive.
					 */
					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}
					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;

					/*
					 * Try to restore the file from archive, or read an
					 * existing file from pg_xlog.
					 */
					sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
					if (!(sources & ~failedSources))
					{
						/*
						 * We've exhausted all options for retrieving the
						 * file. Retry ...
						 */
						failedSources = 0;

						/*
						 * ... but sleep first if it hasn't been long since
						 * last attempt.
						 *
						 * If the system clock was stepped backwards since the
						 * last failure, the raw difference is negative; treat
						 * that as "no time elapsed" so the sleep never
						 * exceeds the intended 5 seconds.
						 */
						now = (pg_time_t) time(NULL);
						elapsed = now - last_fail_time;
						if (elapsed < 0)
							elapsed = 0;
						if (elapsed < 5)
						{
							pg_usleep(1000000L * (5 - elapsed));
							now = (pg_time_t) time(NULL);
						}
						last_fail_time = now;

						/*
						 * If primary_conninfo is set, launch walreceiver to
						 * try to stream the missing WAL, before retrying to
						 * restore from archive/pg_xlog.
						 *
						 * If fetching_ckpt is TRUE, RecPtr points to the
						 * initial checkpoint location. In that case, we use
						 * RedoStartLSN as the streaming start position
						 * instead of RecPtr, so that when we later jump
						 * backwards to start redo at RedoStartLSN, we will
						 * have the logs streamed already.
						 */
						if (PrimaryConnInfo)
						{
							RequestXLogStreaming(
									  fetching_ckpt ? RedoStartLSN : *RecPtr,
												 PrimaryConnInfo);
							continue;
						}
					}
					/* Don't try to read from a source that just failed */
					sources &= ~failedSources;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
												  sources);
					switched_segment = true;
					if (readFile >= 0)
						break;

					/*
					 * Nope, not found in archive and/or pg_xlog.
					 */
					failedSources |= sources;

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from archive and pg_xlog before failover.
					 */
					if (CheckForStandbyTrigger())
						goto triggered;
				}

				/*
				 * This possibly-long loop needs to handle interrupts of
				 * startup process.
				 */
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In archive or crash recovery. */
			if (readFile < 0)
			{
				int			sources;

				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				sources = XLOG_FROM_PG_XLOG;
				if (InArchiveRecovery)
					sources |= XLOG_FROM_ARCHIVE;

				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
											  sources);
				switched_segment = true;
				if (readFile < 0)
					return false;
			}
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			readLen = XLOG_BLCKSZ;
		}
		else
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
		/*
		 * Whenever switching to a new WAL segment, we read the first page of
		 * the file and validate its header, even if that's not where the
		 * target record is.  This is so that we can check the additional
		 * identification info that is present in the first page's "long"
		 * header.
		 */
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode_for_corrupt_record(emode, *RecPtr),
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
			goto next_record_is_invalid;
	}

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		 errmsg("could not read from log file %u, segment %u, offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
		goto next_record_is_invalid;

	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

next_record_is_invalid:
	/* Remember that the current source gave us bad data */
	failedSources |= readSource;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return false;

triggered:
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	return false;
}

9624 9625 9626 9627 9628
/*
 * Determine what log level should be used to report a corrupt WAL record
 * in the current WAL page, previously read by XLogPageRead().
 *
 * 'emode' is the error mode that would be used to report a file-not-found
B
Bruce Momjian 已提交
9629
 * or legitimate end-of-WAL situation.	 Generally, we use it as-is, but if
9630
 * we're retrying the exact same record that we've tried previously, only
B
Bruce Momjian 已提交
9631
 * complain the first time to keep the noise down.	However, we only do when
9632 9633 9634
 * reading from pg_xlog, because we don't expect any invalid records in archive
 * or in records streamed from master. Files in the archive should be complete,
 * and we should never hit the end of WAL because we stop and wait for more WAL
B
Bruce Momjian 已提交
9635
 * to arrive before replaying it.
9636 9637 9638 9639 9640
 *
 * NOTE: This function remembers the RecPtr value it was last called with,
 * to suppress repeated messages about the same record. Only call this when
 * you are about to ereport(), or you might cause a later message to be
 * erroneously suppressed.
9641 9642
 */
static int
9643
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
9644
{
9645 9646
	static XLogRecPtr lastComplaint = {0, 0};

9647
	if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
9648 9649 9650 9651 9652 9653
	{
		if (XLByteEQ(RecPtr, lastComplaint))
			emode = DEBUG1;
		else
			lastComplaint = RecPtr;
	}
9654 9655 9656
	return emode;
}

9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679
/*
 * Check to see if the trigger file exists. If it does, request postmaster
 * to shut down walreceiver, wait for it to exit (via ShutdownWalRcv()),
 * remove the trigger file, and return true.
 *
 * Returns false if no trigger file is configured (TriggerFile is NULL) or
 * if stat() on it does not succeed.
 *
 * Failure to remove the trigger file is reported at LOG level but does not
 * change the result: the trigger has been seen, so we still return true.
 */
static bool
CheckForStandbyTrigger(void)
{
	struct stat stat_buf;

	if (TriggerFile == NULL)
		return false;

	if (stat(TriggerFile, &stat_buf) != 0)
		return false;

	ereport(LOG,
			(errmsg("trigger file found: %s", TriggerFile)));
	ShutdownWalRcv();

	/*
	 * Don't silently ignore a failed removal: a leftover trigger file is
	 * worth telling the administrator about.
	 */
	if (unlink(TriggerFile) != 0)
		ereport(LOG,
				(errmsg("could not remove trigger file \"%s\": %m",
						TriggerFile)));

	return true;
}