/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.345 2009/06/26 20:29:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <ctype.h>
#include <signal.h>
#include <time.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

#include "access/clog.h"
#include "access/multixact.h"
#include "access/distributedlog.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogdefs.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/catversion.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_control.h"
#include "catalog/pg_type.h"
#include "catalog/pg_database.h"
#include "catalog/pg_tablespace.h"
#include "executor/spi.h"
#include "funcapi.h"
#include "libpq/pqsignal.h"
#include "libpq/hba.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
#include "storage/bufpage.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
#include "utils/nabstime.h"
#include "utils/faultinjector.h"
#include "utils/flatfiles.h"
#include "utils/guc.h"
#include "utils/ps_status.h"
#include "pg_trace.h"
#include "utils/catcache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/pg_crc.h"
#include "utils/ps_status.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/backendid.h"
#include "storage/sinvaladt.h"

#include "cdb/cdbtm.h"
#include "cdb/cdbvars.h"
#include "cdb/cdbmirroredflatfile.h"
#include "utils/resscheduler.h"
#include "utils/snapmgr.h"

extern uint32 bootstrap_data_checksum_version;

/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"
#define PROMOTE_SIGNAL_FILE "promote"


/* User-settable parameters */
int			CheckPointSegments = 3;
int			XLOGbuffers = 8;
int			XLogArchiveTimeout = 0;
bool		XLogArchiveMode = false;
char	   *XLogArchiveCommand = NULL;
bool		fullPageWrites = true;
char   *wal_consistency_checking_string = NULL;
bool   *wal_consistency_checking = NULL;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

/*
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
 * 2*CheckPointSegments+1.	Under normal conditions, a checkpoint will free
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
 */
#define XLOGfileslop	(2*CheckPointSegments + 1)
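
/*
 * For example, with the default CheckPointSegments = 3 this works out to
 * 2*3 + 1 = 7: up to seven no-longer-needed segment files may be kept
 * around (renamed as future segments) before old files start being
 * deleted outright.
 */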

bool am_startup = false;

/*
 * GUC support
 */
const struct config_enum_entry sync_method_options[] = {
	{"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
	{"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
	{NULL, 0, false}
};
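
/*
 * For example, setting "wal_sync_method = fdatasync" in postgresql.conf is
 * resolved through this table to SYNC_METHOD_FDATASYNC (offered only on
 * platforms that define HAVE_FDATASYNC).
 */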

/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the background writer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
 */
TimeLineID	ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress().
 */
bool		InRecovery = false;

/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
 * known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* options taken from recovery.conf */
#ifdef NOT_USED
static char *recoveryRestoreCommand = NULL;
#endif
static char *recoveryEndCommand = NULL;
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static TimestampTz recoveryLastXTime = 0;

static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;

/* options taken from recovery.conf for XLOG streaming */
static bool StandbyModeRequested = false;
static char *PrimaryConnInfo = NULL;

/* are we currently in standby mode? */
bool StandbyMode = false;

/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static bool recoveryStopAfter;

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 * its known parents, newest first (so recoveryTargetTLI is always the
 * first list member).	Only these TLIs are expected to be seen in the WAL
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
static TimeLineID recoveryTargetTLI;
List *expectedTLIs;
static TimeLineID curFileTLI;

/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};

XLogRecPtr	XactLastRecEnd = {0, 0};

static uint32 ProcLastRecTotalLen = 0;

static uint32 ProcLastRecDataLen = 0;

static XLogRecPtr InvalidXLogRecPtr = {0, 0};

/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).	We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold the Insert lock).  See XLogInsert for details.	We are also allowed
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = {0, 0};

/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 * We do a lot of pushups to minimize the amount of access to lockable
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 * but is updated when convenient.	Again, it exists for the convenience of
 * code that is already holding WALInsertLock but not the other locks.
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the bgwriter, this is just pro forma).
 *
 *----------
 */

typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;

typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

/*
 * Shared state data for XLogInsert.
 */
typedef struct XLogCtlInsert
{
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	int			curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */

	/*
	 * exclusiveBackup is true if a backup started with pg_start_backup() is
	 * in progress, and nonExclusiveBackups is a counter indicating the number
	 * of streaming base backups currently in progress. forcePageWrites is set
	 * to true when either of these is non-zero. lastBackupStart is the latest
	 * checkpoint redo location used as a starting point for an online backup.
	 */
	bool		exclusiveBackup;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;
} XLogCtlInsert;

/*
 * Shared state data for XLogWrite/XLogFlush.
 */
typedef struct XLogCtlWrite
{
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	int			curridx;		/* cache index of next block to write */
	pg_time_t	lastSegSwitchTime;		/* time of last xlog segment switch */
} XLogCtlWrite;

/*
 * Total shared-memory state for XLOG.
 */
typedef struct XLogCtlData
{
	/* Protected by WALInsertLock: */
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
	uint32		lastRemovedLog; /* latest removed/recycled XLOG segment */
	uint32		lastRemovedSeg;

	/* Protected by WALWriteLock: */
	XLogCtlWrite Write;

	/* Protected by ChangeTrackingTransitionLock. */
	XLogRecPtr	lastChangeTrackingEndLoc;
								/*
								 * End + 1 of the last XLOG record inserted and
 								 * (possible) change tracked.
 								 */

	/* Resynchronize */
	bool		sendingResynchronizeTransitionMsg;
	slock_t		resynchronize_lck;		/* locks shared variables shown above */

	/*
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  Permission to read/write the pages
	 * and xlblocks values depends on WALInsertLock and WALWriteLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
	TimeLineID	ThisTimeLineID;

	/*
	 * SharedRecoveryInProgress indicates if we're still in crash or archive
	 * recovery.  Protected by info_lck.
	 */
	bool		SharedRecoveryInProgress;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * the standby's dbid when it runs.  Used in mmxlog to emit standby filepath.
	 * Protected by info_lck
	 */
	int16		standbyDbid;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * Used by the background writer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * Save the location of the last checkpoint record to enable suppressing
	 * unnecessary checkpoint records -- when no new xlog has been written
	 * since the last one.
	 */
	bool 		haveLastCheckpointLoc;
	XLogRecPtr	lastCheckpointLoc;
	XLogRecPtr	lastCheckpointEndLoc;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	XLogRecPtr	replayEndRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */

	/* current effective recovery target timeline */
	TimeLineID	RecoveryTargetTLI;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;

	/*
	 * Save the redo range used in Pass 1 recovery so it can be used in subsequent passes.
	 */
	bool		multipleRecoveryPassesNeeded;
	XLogRecPtr	pass1StartLoc;
	XLogRecPtr	pass1LastLoc;
	XLogRecPtr	pass1LastCheckpointLoc;

	/*=================Pass 4 PersistentTable-Cat verification================*/
	/*If true integrity checks will be performed in Pass4.*/
	bool		integrityCheckNeeded;

	/*
	 * Currently set database and tablespace to be verified for database specific
	 * PT-Cat verification in Pass4. These fields also act as implicit flags
	 * which indicate if there are any more databases to perform
	 * PT-Cat verification checks on.
	 */
	Oid			currentDatabaseToVerify;
	Oid			tablespaceOfCurrentDatabaseToVerify;

	/*Indicates if pass4 PT-Cat verification checks passed*/
	bool		pass4_PTCatVerificationPassed;
	/*==========Pass 4 PersistentTable-Cat verification End===================*/

} XLogCtlData;

static XLogCtlData *XLogCtl = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;

typedef struct ControlFileWatch
{
	bool		watcherInitialized;
	XLogRecPtr	current_checkPointLoc;		/* current last check point record ptr */
	XLogRecPtr	current_prevCheckPointLoc;  /* current previous check point record ptr */
	XLogRecPtr	current_checkPointCopy_redo;
								/* current checkpointCopy value for
								 * next RecPtr available when we began to
								 * create CheckPoint (i.e. REDO start point) */

} ControlFileWatch;


/*
 * We keep the watcher in shared memory.
 */
static ControlFileWatch *ControlFileWatcher = NULL;

/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */

/* Free space remaining in the current xlog page buffer */
#define INSERT_FREESPACE(Insert)  \
	(XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/* Construct XLogRecPtr value for current insertion point */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)

#define PrevBufIdx(idx)		\
		(((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))

#define NextBufIdx(idx)		\
		(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
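
/*
 * For example, if XLogCtl->xlblocks[curridx].xrecoff is 0x10000 (i.e. end+1
 * of the cached page) and INSERT_FREESPACE(Insert) is 0x100 bytes, then
 * INSERT_RECPTR yields xrecoff 0xFF00 as the start of the next record.
 */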

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};

/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  These are chosen so that they can be OR'd together
 * in a bitmask state variable.
 */
#define XLOG_FROM_ARCHIVE		(1<<0)	/* Restored using restore_command */
#define XLOG_FROM_PG_XLOG		(1<<1)	/* Existing file in pg_xlog */
#define XLOG_FROM_STREAM		(1<<2)	/* Streamed from master */
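
/*
 * For example, failedSources == (XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG)
 * records that both the archive and pg_xlog have already been tried, and
 * failed, for the WAL record currently being read.
 */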

/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static MirroredFlatFileOpen	mirroredLogFileOpen = MirroredFlatFileOpen_Init;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;

/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.  readLen indicates how much of the current
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
 */
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static int	readSource = 0;		/* XLOG_FROM_* code */

/*
 * Keeps track of which sources we've tried to read the current WAL
 * record from and failed.
 */
static int	failedSources = 0;	/* OR of XLOG_FROM_* codes */

/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.)
 */
static TimestampTz XLogReceiptTime = 0;
static int	XLogReceiptSource = 0;		/* XLOG_FROM_* code */

/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
static char *readBuf = NULL;

/* Buffer for current ReadRecord result (expandable) */
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;

/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
static XLogRecord *nextRecord = NULL;
static TimeLineID lastPageTLI = 0;
static TimeLineID lastSegmentTLI = 0;

static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;

static bool InRedo = false;

/*
 * Flags set by interrupt handlers for later service in the redo loop.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;

/*
 * Flag set when executing a restore command, to tell SIGTERM signal handler
 * that it's safe to just proc_exit.
 */
static volatile sig_atomic_t in_restore_command = false;


static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
static void XLogArchiveCleanup(const char *xlog);
static void exitArchiveRecovery(TimeLineID endTLI,
					uint32 endLogId, uint32 endLogSeg);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static void LocalSetXLogInsertAllowed(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);

static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
							bool wal_check_consistency_enabled,
							XLogRecPtr *lsn, BkpBlock *bkpb);
static void RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
				char *blk, bool get_cleanup_lock, bool keep_buffer);

static bool AdvanceXLInsertBuffer(bool new_segment);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
static void XLogFileInit(
			 MirroredFlatFileOpen *mirroredOpen,
			 uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
					   bool use_lock, char *tmpsimpleFileName);
static void XLogFileClose(void);
static void XLogFileOpen(
				MirroredFlatFileOpen *mirroredOpen,
				uint32 log,
				uint32 seg);

static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess);

#ifdef NOT_USED
static bool RestoreArchivedFile(char *path, const char *xlogfname,
					const char *recovername, off_t expectedSize);
static void ExecuteRecoveryEndCommand(void);
#endif
static void PreallocXlogFiles(XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
#ifdef NOT_USED
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
#endif
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly);

typedef struct CheckpointExtendedRecord
{
	TMGXACT_CHECKPOINT	*dtxCheckpoint;
	uint32				dtxCheckpointLen;
	prepared_transaction_agg_state  *ptas;
} CheckpointExtendedRecord;

static void UnpackCheckPointRecord(XLogRecord *record,
								   CheckpointExtendedRecord *ckptExtended);
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI,
					 uint32 endLogId, uint32 endLogSeg);
static void ControlFileWatcherSaveInitial(void);
static void ControlFileWatcherCheckForChange(void);
static bool XLogGetWriteAndFlushedLoc(XLogRecPtr *writeLoc, XLogRecPtr *flushedLoc);
static XLogRecPtr XLogInsert_Internal(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId headerXid);
static void WriteControlFile(void);
static void ReadControlFile(void);

static char *str_time(pg_time_t tnow);

static void xlog_outrec(StringInfo buf, XLogRecord *record);
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired);
static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* New functions added for WAL replication */
static void SetCurrentChunkStartTime(TimestampTz xtime);
static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources);
static void XLogProcessCheckpointRecord(XLogRecord *rec, XLogRecPtr loc);

typedef struct RedoErrorCallBack
{
	XLogRecPtr	location;

	XLogRecord 	*record;
} RedoErrorCallBack;

void HandleStartupProcInterrupts(void);
static bool CheckForStandbyTrigger(void);

static void GetXLogCleanUpTo(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg);
static void checkXLogConsistency(XLogRecord *record, XLogRecPtr EndRecPtr);

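/*
 * Lay out an XLOG record header plus all of its rdata chunks in one
 * contiguous palloc'd buffer.  (Used by the Debug_xlog_insert_print path in
 * XLogInsert_Internal, where rm_desc wants a contiguous record image.)
 */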
static char *XLogContiguousCopy(
	XLogRecord 		*record,

	XLogRecData 	*rdata)
{
	XLogRecData *rdt;
	int32 len;
	char *buffer;

	rdt = rdata;
	len = SizeOfXLogRecord;
	while (rdt != NULL)
	{
		if (rdt->data != NULL)
		{
			len += rdt->len;
		}
		rdt = rdt->next;
	}

	buffer = (char*)palloc(len);

	memcpy(buffer, record, SizeOfXLogRecord);
	rdt = rdata;
	len = SizeOfXLogRecord;
	while (rdt != NULL)
	{
		if (rdt->data != NULL)
		{
			memcpy(&buffer[len], rdt->data, rdt->len);
			len += rdt->len;
		}
		rdt = rdt->next;
	}

	return buffer;
}

/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata chain (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
XLogRecPtr
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
{
	return XLogInsert_Internal(rmid, info, rdata, GetCurrentTransactionIdIfAny());
}

XLogRecPtr
XLogInsert_OverrideXid(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId overrideXid)
{
	return XLogInsert_Internal(rmid, info, rdata, overrideXid);
}


static XLogRecPtr
XLogInsert_Internal(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId headerXid)
{

	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
	XLogContRecord *contrecord;
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	int			curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
	pg_crc32	rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
	bool		updrqst;
	bool		doPageWrites;
	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
	uint8       extended_info = 0;

    /* Safety check in case our assumption is ever broken. */
	/* NOTE: This is slightly modified from the one in xact.c -- the test for */
 	/* NOTE: seqXlogWrite is omitted... */
	/* NOTE: some local-only changes are OK */
 	if (Gp_role == GP_ROLE_EXECUTE && !Gp_is_writer)
 	{
 		/*
 	     * we better only do really minor things on the reader that result
 	     * in writing to the xlog here at commit.  for now sequences
 	     * should be the only one
 	     */
		if (DistributedTransactionContext == DTX_CONTEXT_LOCAL_ONLY)
		{
			/* MPP-1687: readers may under some circumstances extend the CLOG
			 * rmid == RM_CLOG_ID and info having CLOG_ZEROPAGE set */
			elog(LOG, "Reader qExec committing LOCAL_ONLY changes. (%d %d)", rmid, info);
		}
		else
		{
			/*
			 * We are allowing the QE Reader to write to support error tables.
			 */
			elog(DEBUG1, "Reader qExec writing changes. (%d %d)", rmid, info);
#ifdef nothing
			ereport(ERROR,
					(errmsg("Reader qExec had local changes to commit! (rmid = %u)",
							rmid),
					 errdetail("A Reader qExec tried to commit local changes.  "
							   "Only the single Writer qExec can do so. "),
					 errhint("This is most likely the result of a feature being turned "
							 "on that violates the single WRITER principle")));
#endif
		}
 	}

	/* GPDB_84_MERGE_FIXME: This cross-check was added in upstream, but it's failing
	 * in Startup pass 2. Disable it for now. */
#if 0
	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");
#endif

	/* info's high bits are reserved for use by me */
	if (info & XLR_INFO_MASK)
		elog(PANIC, "invalid xlog info mask %02X", info);

	TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

	/*
	 * In bootstrap mode, we don't actually log anything but XLOG resources;
	 * return a phony record pointer.
	 */
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
	{
		RecPtr.xlogid = 0;
		RecPtr.xrecoff = SizeOfXLogLongPHD;		/* start of 1st chkpt record */
		return RecPtr;
	}

	/*
	 * Enforce consistency checks for this record if user is looking for
	 * it. Do this at the beginning of this routine to give the
	 * possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY
	 * directly for a record.
	 */
	if (wal_consistency_checking[rmid])
		extended_info |= XLR_CHECK_CONSISTENCY;

	/*
	 * Here we scan the rdata chain, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC initially since we don't know the final
	 * length or info bits quite yet.  Thus, the CRC will represent the CRC of
	 * the whole record in the order "rdata, then backup blocks, then record
	 * header".
	 *
	 * We may have to loop back to here if a race condition is detected below.
	 * We could prevent the race by doing all this work while holding the
	 * insert lock, but it seems better to avoid doing CRC calculations while
	 * holding the lock.  This means we have to be careful about modifying the
	 * rdata chain until we know we aren't going to loop back again.  The only
	 * change we allow ourselves to make earlier is to set rdt->data = NULL in
	 * chain items we have decided we will have to back up the whole buffer
	 * for.  This is OK because we will certainly decide the same thing again
	 * for those items if we do it over; doing it here saves an extra pass
	 * over the chain later.
	 */
begin:;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

	/*
	 * Decide if we need to do full-page writes in this XLOG record: true if
	 * full_page_writes is on or we have a PITR request for it.  Since we
	 * don't yet have the insert lock, forcePageWrites could change under us,
	 * but we'll recheck it once we have the lock.
	 */
	doPageWrites = fullPageWrites || Insert->forcePageWrites;

	INIT_CRC32C(rdata_crc);
	len = 0;
	for (rdt = rdata;;)
	{
		if (rdt->buffer == InvalidBuffer)
		{
			/* Simple data, just include it */
			len += rdt->len;
			COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
		}
		else
		{
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
			{
				if (rdt->buffer == dtbuf[i])
				{
					/*
					 * Buffer already referenced by an earlier chain item.  If
					 * its backup image will be applied at redo, this data is
					 * redundant and can be dropped; but if the block is backed
					 * up only for the consistency check, the original data
					 * must still be included in the record.
					 */
					if (dtbuf_bkp[i] && (dtbuf_xlg[i].block_info & BLOCK_APPLY))
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
					}
					break;
				}
				if (dtbuf[i] == InvalidBuffer)
				{
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;

					if (doPageWrites && XLogCheckBuffer(rdt, true,
										(extended_info & XLR_CHECK_CONSISTENCY) != 0,
										&(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
					{
						dtbuf_bkp[i] = true;

						if (dtbuf_xlg[i].block_info & BLOCK_APPLY)
							rdt->data = NULL;
						else
						{
							if (rdt->data)
							{
								len += rdt->len;
								COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
							}
						}
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
					}
					break;
				}
			}
			if (i >= XLR_MAX_BKP_BLOCKS)
				elog(PANIC, "can backup at most %d blocks per xlog record",
					 XLR_MAX_BKP_BLOCKS);
		}
		/* Break out of loop when rdt points to last chain item */
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

	/*
	 * Now add the backup block headers and data into the CRC
	 */
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (dtbuf_bkp[i])
		{
			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
			char	   *page;

			COMP_CRC32C(rdata_crc,
					   (char *) bkpb,
					   sizeof(BkpBlock));
			page = (char *) BufferGetBlock(dtbuf[i]);
			if (bkpb->hole_length == 0)
			{
				COMP_CRC32C(rdata_crc, page, BLCKSZ);
			}
			else
			{
				/* must skip the hole */
				COMP_CRC32C(rdata_crc, page, bkpb->hole_offset);
				COMP_CRC32C(rdata_crc,
						   page + (bkpb->hole_offset + bkpb->hole_length),
						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
			}
		}
	}

	/*
	 * NOTE: We disallow len == 0 because it provides a useful bit of extra
	 * error checking in ReadRecord.  This means that all callers of
	 * XLogInsert must supply at least some not-in-a-buffer data.  However, we
	 * make an exception for XLOG SWITCH records because we don't want them to
	 * ever cross a segment boundary.
	 */
	if (len == 0 && !isLogSwitch)
		elog(PANIC, "invalid xlog record length %u", len);

	START_CRIT_SECTION();

	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to go
	 * back and recompute everything.  This can only happen just after a
	 * checkpoint, so it's better to be slow in this case and fast otherwise.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
	{
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		if (doPageWrites)
		{
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
			{
				if (dtbuf[i] == InvalidBuffer)
					continue;
				if (dtbuf_bkp[i] == false &&
					XLByteLE(dtbuf_lsn[i], RedoRecPtr))
				{
					/*
					 * Oops, this buffer now needs to be backed up, but we
					 * didn't think so above.  Start over.
					 */
					LWLockRelease(WALInsertLock);

					END_CRIT_SECTION();
					goto begin;
				}
			}
		}
	}

	/*
	 * Also check to see if forcePageWrites was just turned on; if we weren't
	 * already doing full-page writes then go back and recompute. (If it was
	 * just turned off, we could recompute the record without full pages, but
	 * we choose not to bother.)
	 */
	if (Insert->forcePageWrites && !doPageWrites)
	{
		/* Oops, must redo it with full-page data */
		LWLockRelease(WALInsertLock);
		END_CRIT_SECTION();
		goto begin;
	}

	/*
	 * Make additional rdata chain entries for the backup blocks, so that we
	 * don't need to special-case them in the write loop.  Note that we have
	 * now irrevocably changed the input rdata chain.  At the exit of this
	 * loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
	 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
	 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		BkpBlock   *bkpb;
		char	   *page;

		if (!dtbuf_bkp[i])
			continue;

		info |= XLR_SET_BKP_BLOCK(i);

		bkpb = &(dtbuf_xlg[i]);
		page = (char *) BufferGetBlock(dtbuf[i]);

		rdt->next = &(dtbuf_rdt1[i]);
		rdt = rdt->next;

		rdt->data = (char *) bkpb;
		rdt->len = sizeof(BkpBlock);
		write_len += sizeof(BkpBlock);

		rdt->next = &(dtbuf_rdt2[i]);
		rdt = rdt->next;

		if (bkpb->hole_length == 0)
		{
			rdt->data = page;
			rdt->len = BLCKSZ;
			write_len += BLCKSZ;
			rdt->next = NULL;
		}
		else
		{
			/* must skip the hole */
			rdt->data = page;
			rdt->len = bkpb->hole_offset;
			write_len += bkpb->hole_offset;

			rdt->next = &(dtbuf_rdt3[i]);
			rdt = rdt->next;

			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
			write_len += rdt->len;
			rdt->next = NULL;
		}
	}

	/*
	 * If we backed up any full blocks and online backup is not in progress,
	 * mark the backup blocks as removable.  This allows the WAL archiver to
	 * know whether it is safe to compress archived WAL data by transforming
	 * full-block records into the non-full-block format.
	 *
	 * Note: we could just set the flag whenever !forcePageWrites, but
	 * defining it like this leaves the info bit free for some potential other
	 * use in records without any backup blocks.
	 */
	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
		info |= XLR_BKP_REMOVABLE;

	/*
	 * If there isn't enough space on the current XLOG page for a record
	 * header, advance to the next page (leaving the unused space as zeroes).
	 */
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
	if (freespace < SizeOfXLogRecord)
	{
		updrqst = AdvanceXLInsertBuffer(false);
		freespace = INSERT_FREESPACE(Insert);
	}

	/* Compute record's XLOG location */
	curridx = Insert->curridx;
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
	 * If the record is an XLOG_SWITCH, and we are exactly at the start of a
	 * segment, we need not insert it (and don't want to because we'd like
	 * consecutive switch requests to be no-ops).  Instead, make sure
	 * everything is written and flushed through the end of the prior segment,
	 * and return the prior segment's end address.
	 */
	if (isLogSwitch &&
		(RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
	{
		/* We can release insert lock immediately */
		LWLockRelease(WALInsertLock);

		RecPtr.xrecoff -= SizeOfXLogLongPHD;
		if (RecPtr.xrecoff == 0)
		{
			/* crossing a logid boundary */
			RecPtr.xlogid -= 1;
			RecPtr.xrecoff = XLogFileSize;
		}

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(RecPtr, LogwrtResult.Flush))
		{
			XLogwrtRqst FlushRqst;

			FlushRqst.Write = RecPtr;
			FlushRqst.Flush = RecPtr;
			XLogWrite(FlushRqst, false, false);
		}
		LWLockRelease(WALWriteLock);

		END_CRIT_SECTION();

		return RecPtr;
	}

	/* Insert record header */

	record = (XLogRecord *) Insert->currpos;
	record->xl_prev = Insert->PrevRecord;
	record->xl_xid = headerXid;
	record->xl_tot_len = SizeOfXLogRecord + write_len;
	record->xl_len = len;		/* doesn't include backup blocks */
	record->xl_info = info;
	record->xl_rmid = rmid;
	record->xl_extended_info = extended_info;

	/* Now we can finish computing the record's CRC */
	COMP_CRC32C(rdata_crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32C(rdata_crc);
	record->xl_crc = rdata_crc;

	/* Record begin of record in appropriate places */
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

	ProcLastRecTotalLen = record->xl_tot_len;
	ProcLastRecDataLen = write_len;

	Insert->currpos += SizeOfXLogRecord;
	freespace -= SizeOfXLogRecord;

	if (Debug_xlog_insert_print)
	{
		StringInfoData buf;
		char *contiguousCopy;

		initStringInfo(&buf);
		appendStringInfo(&buf, "XLOG INSERT @ %s, total length %u, data length %u: ",
						 XLogLocationToString(&RecPtr),
						 ProcLastRecTotalLen,
						 ProcLastRecDataLen);
		xlog_outrec(&buf, record);

		contiguousCopy = XLogContiguousCopy(record, rdata);
		appendStringInfo(&buf, " - ");
		RmgrTable[record->xl_rmid].rm_desc(&buf, RecPtr, (XLogRecord*)contiguousCopy);
		pfree(contiguousCopy);

		elog(LOG, "%s", buf.data);
		pfree(buf.data);
	}

	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
	{
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
		{
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
				write_len -= freespace;
			}
			else
			{
				/* enough room to write whole data. do it. */
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
				write_len -= rdata->len;
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
		}

		/* Use next buffer */
		updrqst = AdvanceXLInsertBuffer(false);
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = INSERT_FREESPACE(Insert);
	}

	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
	freespace = INSERT_FREESPACE(Insert);

	/*
	 * The recptr I return is the beginning of the *next* record. This will be
	 * stored as LSN for changed data pages...
	 */
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
	 * If the record is an XLOG_SWITCH, we must now write and flush all the
	 * existing data, and then forcibly advance to the start of the next
	 * segment.  It's not good to do this I/O while holding the insert lock,
	 * but there seems too much risk of confusion if we try to release the
	 * lock sooner.  Fortunately xlog switch needn't be a high-performance
	 * operation anyway...
	 */
	if (isLogSwitch)
	{
		XLogCtlWrite *Write = &XLogCtl->Write;
		XLogwrtRqst FlushRqst;
		XLogRecPtr	OldSegEnd;

		TRACE_POSTGRESQL_XLOG_SWITCH();

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

		/*
		 * Flush through the end of the page containing XLOG_SWITCH, and
		 * perform end-of-segment actions (eg, notifying archiver).
		 */
		WriteRqst = XLogCtl->xlblocks[curridx];
		FlushRqst.Write = WriteRqst;
		FlushRqst.Flush = WriteRqst;
		XLogWrite(FlushRqst, false, true);

		/* Set up the next buffer as first page of next segment */
		/* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
		(void) AdvanceXLInsertBuffer(true);

		/* There should be no unwritten data */
		curridx = Insert->curridx;
		Assert(curridx == Write->curridx);

		/* Compute end address of old segment */
		OldSegEnd = XLogCtl->xlblocks[curridx];
		OldSegEnd.xrecoff -= XLOG_BLCKSZ;
		if (OldSegEnd.xrecoff == 0)
		{
			/* crossing a logid boundary */
			OldSegEnd.xlogid -= 1;
			OldSegEnd.xrecoff = XLogFileSize;
		}

		/* Make it look like we've written and synced all of old segment */
		LogwrtResult.Write = OldSegEnd;
		LogwrtResult.Flush = OldSegEnd;

		/*
		 * Update shared-memory status --- this code should match XLogWrite
		 */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->LogwrtResult = LogwrtResult;
			if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
				xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
			if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
				xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
			SpinLockRelease(&xlogctl->info_lck);
		}

		Write->LogwrtResult = LogwrtResult;

		LWLockRelease(WALWriteLock);

		updrqst = false;		/* done already */
	}
	else
	{
		/* normal case, ie not xlog switch */

		/* Need to update shared LogwrtRqst if some block was filled up */
		if (freespace < SizeOfXLogRecord)
		{
			/* curridx is filled and available for writing out */
			updrqst = true;
		}
		else
		{
			/* if updrqst already set, write through end of previous buf */
			curridx = PrevBufIdx(curridx);
		}
		WriteRqst = XLogCtl->xlblocks[curridx];
	}

	LWLockRelease(WALInsertLock);

	if (updrqst)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		/* advance global request to include new block(s) */
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
		/* update local result copy while I have the chance */
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	XactLastRecEnd = RecPtr;

	END_CRIT_SECTION();

	return RecPtr;
}

XLogRecPtr
XLogLastInsertBeginLoc(void)
{
	return ProcLastRecPtr;
}

XLogRecPtr
XLogLastInsertEndLoc(void)
{
	return XactLastRecEnd;
}

XLogRecPtr
XLogLastChangeTrackedLoc(void)
{
	return XLogCtl->lastChangeTrackingEndLoc;
}

uint32
XLogLastInsertTotalLen(void)
{
	return ProcLastRecTotalLen;
}

uint32
XLogLastInsertDataLen(void)
{
	return ProcLastRecDataLen;
}

/*
 * Determine whether the buffer referenced by an XLogRecData item has to
 * be backed up, and if so fill a BkpBlock struct for it.  In any case
 * save the buffer's LSN at *lsn.
 */
static bool
XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
				bool wal_check_consistency_enabled,
				XLogRecPtr *lsn, BkpBlock *bkpb)
{
	Page		page;
	bool needs_backup;

	page = BufferGetPage(rdata->buffer);

	/*
	 * We assume page LSN is first data on *every* page that can be passed
	 * to XLogInsert, whether it has the standard page layout or not. We
	 * don't need to take the buffer header lock for PageGetLSN if we hold
	 * an exclusive lock on the page and/or the relation.
	 */
	if (holdsExclusiveLock)
		*lsn = PageGetLSN(page);
	else
		*lsn = BufferGetLSNAtomic(rdata->buffer);

	needs_backup = XLByteLE(((PageHeader) page)->pd_lsn, RedoRecPtr);
1507 1508

	if (needs_backup || wal_check_consistency_enabled)
1509
	{
1510 1511 1512
		/*
		 * The page needs to be backed up, so set up *bkpb
		 */
1513
		BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524
		bkpb->block_info = 0;

		/*
		 * If WAL consistency checking is enabled for the
		 * resource manager of this WAL record, a full-page
		 * image is included in the record for the block
		 * modified. During redo, the full-page is replayed
		 * only if block_apply is set.
		 */
		if (needs_backup)
			bkpb->block_info |= BLOCK_APPLY;

		if (rdata->buffer_std)
		{
			/* Assume we can omit data between pd_lower and pd_upper */
			uint16		lower = ((PageHeader) page)->pd_lower;
			uint16		upper = ((PageHeader) page)->pd_upper;

			if (lower >= SizeOfPageHeaderData &&
				upper > lower &&
				upper <= BLCKSZ)
			{
				bkpb->hole_offset = lower;
				bkpb->hole_length = upper - lower;
			}
			else
			{
				/* No "hole" to compress out */
				bkpb->hole_offset = 0;
				bkpb->hole_length = 0;
			}
		}
		else
		{
			/* Not a standard page header, don't try to eliminate "hole" */
			bkpb->hole_offset = 0;
			bkpb->hole_length = 0;
		}

		return true;			/* buffer requires backup */
	}

	return false;				/* buffer does not need to be backed up */
}
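
/*
 * Illustrative note (not part of the original sources): for a page with the
 * standard layout, the backup image omits the "hole" between pd_lower and
 * pd_upper, so what goes into the WAL record is roughly
 *
 *     bytes [0, hole_offset) + bytes [hole_offset + hole_length, BLCKSZ)
 *
 * i.e. BLCKSZ - hole_length bytes.  On the redo side the page is rebuilt by
 * copying the first chunk, zero-filling hole_length bytes, and copying the
 * remainder (see RestoreBkpBlocks).
 */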

/*
 * XLogArchiveNotify
 *
 * Create an archive notification file
 *
 * The name of the notification file is the message that will be picked up
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
 * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
 * then when complete, rename it to 0000000100000001000000C6.done
 */
static void
XLogArchiveNotify(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	FILE	   *fd;

	/* insert an otherwise empty file called <XLOG>.ready */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	fd = AllocateFile(archiveStatusPath, "w");
	if (fd == NULL)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}
	if (FreeFile(fd))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}

	/* Notify archiver that it's got something to do */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}

/*
 * Convenience routine to notify using log/seg representation of filename
 */
static void
XLogArchiveNotifySeg(uint32 log, uint32 seg)
{
	char		xlog[MAXFNAMELEN];

	XLogFileName(xlog, ThisTimeLineID, log, seg);
	XLogArchiveNotify(xlog);
}

/*
 * XLogArchiveCheckDone
 *
 * This is called when we are ready to delete or recycle an old XLOG segment
 * file or backup history file.  If it is okay to delete it then return true.
 * If it is not time to delete it, make sure a .ready file exists, and return
 * false.
 *
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
 * then return false; else create <XLOG>.ready and return false.
 *
 * The reason we do things this way is so that if the original attempt to
 * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
 */
static bool
XLogArchiveCheckDone(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

	/* Always deletable if archiving is off */
	if (!XLogArchivingActive())
		return true;

	/* First check for .done --- this means archiver is done with it */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Retry creation of the .ready file */
	XLogArchiveNotify(xlog);
	return false;
}
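
/*
 * Illustrative sketch (not part of the original code): the archive status
 * protocol moves a segment through two marker files under
 * XLOGDIR/archive_status/, e.g. for segment 000000010000000000000003:
 *
 *     000000010000000000000003.ready   created by XLogArchiveNotify()
 *     000000010000000000000003.done    left behind once the archiver has
 *                                      run archive_command successfully
 *
 * XLogArchiveCheckDone() treats ".done present" as "safe to recycle", and
 * re-creates the .ready file if neither marker exists, so a lost
 * notification is retried at the next checkpoint.
 */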

/*
 * XLogArchiveIsBusy
 *
 * Check to see if an XLOG segment file is still unarchived.
 * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 * the first place we aren't chartered to recreate the .ready file, and
 * in the second place we should consider that if the file is already gone
 * then it's not busy.  (This check is needed to handle the race condition
 * that a checkpoint already deleted the no-longer-needed file.)
 */
static bool
XLogArchiveIsBusy(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

	/* First check for .done --- this means archiver is done with it */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/*
	 * Check to see if the WAL file has been removed by checkpoint, which
	 * implies it has already been archived, and explains why we can't see a
	 * status file for it.
	 */
	snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
	if (stat(archiveStatusPath, &stat_buf) != 0 &&
		errno == ENOENT)
		return false;

	return true;
}

/*
 * XLogArchiveCleanup
 *
 * Cleanup archive notification file(s) for a particular xlog segment
 */
static void
XLogArchiveCleanup(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];

	/* Remove the .done file */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	unlink(archiveStatusPath);
	/* should we complain about failure? */

	/* Remove the .ready file if present --- normally it shouldn't be */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
}

/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * If new_segment is TRUE then we set up the next buffer page as the first
 * page of the next xlog segment file, possibly but not usually the next
 * consecutive file page.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
 * just-filled page.  If we can do this for free (without an extra lock),
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
 * Must be called with WALInsertLock held.
 */
static bool
AdvanceXLInsertBuffer(bool new_segment)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	int			nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;

	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;

	/*
	 * Get ending-offset of the buffer page we need to replace (this may be
	 * zero if the buffer hasn't been used yet).  Fall through if it's already
	 * written out.
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;

		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];

		/* Before waiting, get info_lck and update LogwrtResult */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
			SpinLockRelease(&xlogctl->info_lck);
		}

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
		{
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
			{
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
			}
			else
			{
				/*
				 * Have to write buffers while holding insert lock. This is
				 * not good, so only write as much as we absolutely must.
				 */
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst, false, false);
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
			}
		}
	}

	/*
	 * Now the next buffer slot is free and we can set it up to be the next
	 * output page.
	 */
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];

	if (new_segment)
	{
		/* force it to a segment start point */
		NewPageEndPtr.xrecoff += XLogSegSize - 1;
		NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
	}

	if (NewPageEndPtr.xrecoff >= XLogFileSize)
	{
		/* crossing a logid boundary */
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
	}
	else
		NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

	Insert->curridx = nextidx;
	Insert->currpage = NewPage;

	Insert->currpos = ((char *) NewPage) + SizeOfXLogShortPHD;

	/*
	 * Be sure to re-zero the buffer so that bytes beyond what we've written
	 * will look like zeroes and not valid XLOG records...
	 */
	MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

	/*
	 * Fill the new page's header
	 */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;

	/* NewPage->xlp_info = 0; */	/* done by memset */
	NewPage->xlp_tli = ThisTimeLineID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;

	/*
	 * If first page of an XLOG segment file, make it a long header.
	 */
	if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
	{
		XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

		NewLongPage->xlp_sysid = ControlFile->system_identifier;
		NewLongPage->xlp_seg_size = XLogSegSize;
		NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
		NewPage->xlp_info |= XLP_LONG_HEADER;

		Insert->currpos = ((char *) NewPage) + SizeOfXLogLongPHD;
	}

	return update_needed;
}

/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
 * Caller must have just finished filling the open log file (so that
 * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
 * to the open log file and see if that exceeds CheckPointSegments.
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
XLogCheckpointNeeded(void)
{
	/*
	 * A straight computation of segment number could overflow 32 bits. Rather
	 * than assuming we have working 64-bit arithmetic, we compare the
	 * highest-order bits separately, and force a checkpoint immediately when
	 * they change.
	 */
	uint32		old_segno,
				new_segno;
	uint32		old_highbits,
				new_highbits;

	old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
		(RedoRecPtr.xrecoff / XLogSegSize);
	old_highbits = RedoRecPtr.xlogid / XLogSegSize;
	new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
	new_highbits = openLogId / XLogSegSize;
	if (new_highbits != old_highbits ||
		new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
		return true;
	return false;
}
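
/*
 * Worked example (illustrative, not from the original sources): with the
 * default CheckPointSegments = 3, the test above asks for a checkpoint once
 * the open segment is at least CheckPointSegments - 1 = 2 segments past the
 * segment containing RedoRecPtr, i.e. after roughly 32MB of additional WAL
 * with the standard 16MB segment size.
 */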

/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
 * If xlog_switch == TRUE, we are intending an xlog segment switch, so
 * perform end-of-segment actions after writing the last page, even if
 * it's not physically the end of its segment.  (NB: this will work properly
 * only if caller specifies WriteRqst == page-end and flexible == false,
 * and there is some data to write.)
 *
 * Must be called with WALWriteLock held.
 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
{
	XLogCtlWrite *Write = &XLogCtl->Write;
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = Write->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  We advance Write->curridx only after successfully
	 * writing pages.  (Right now, this refinement is useless since we are
	 * going to PANIC if any error occurs anyway; but someday it may come in
	 * useful.)
	 */
	curridx = Write->curridx;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
				 XLogCtl->xlblocks[curridx].xlogid,
				 XLogCtl->xlblocks[curridx].xrecoff);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

			/* create/use new log file */
			use_existent = true;

			XLogFileInit(
					&mirroredLogFileOpen,
					openLogId, openLogSeg,
					&use_existent, true);
			openLogOff = 0;
		}

		/* Make sure we have the current logfile open */
		if (!MirroredFlatFile_IsActive(&mirroredLogFileOpen))
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			XLogFileOpen(
					&mirroredLogFileOpen,
					openLogId,
					openLogSeg);
			openLogOff = 0;
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;

			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;

			/*
			 * The following code is a sanity check to try to catch the issue
			 * described in MPP-12611.
			 */
			if (!IsBootstrapProcessingMode())
			{
				char		simpleFileName[MAXPGPATH];

				XLogFileName(simpleFileName, ThisTimeLineID, openLogId, openLogSeg);
				if (strcmp(simpleFileName, mirroredLogFileOpen.simpleFileName) != 0)
				{
					ereport(PANIC,
							(errmsg_internal("Expected xlog file name does not match current open xlog file name. "
											 "Expected file = %s, open file = %s, "
											 "WriteRqst.Write = %s, WriteRqst.Flush = %s",
											 simpleFileName,
											 mirroredLogFileOpen.simpleFileName,
											 XLogLocationToString(&(WriteRqst.Write)),
											 XLogLocationToString(&(WriteRqst.Flush)))));
				}
			}

			if (MirroredFlatFile_Write(
							&mirroredLogFileOpen,
							openLogOff,
							from,
							nbytes,
							/* suppressError */ true))
			{
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not write to log file %u, segment %u "
								"at offset %u, length %lu: %m",
								openLogId, openLogSeg,
								openLogOff, (unsigned long) nbytes)));
			}

			/* Update state for write */
			openLogOff += nbytes;
			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * We also do this if this is the last page written for an xlog
			 * switch.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg || (xlog_switch && last_iteration))
			{
				if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
					MirroredFlatFile_Flush(
									&mirroredLogFileOpen,
									/* suppressError */ false);

				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
					 "XLogWrite (#1): flush loc %s; write loc %s",
					 XLogLocationToString_Long(&LogwrtResult.Flush),
					 XLogLocationToString2_Long(&LogwrtResult.Write));

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogId, openLogSeg);

				Write->lastSegSwitchTime = (pg_time_t) time(NULL);

				/*
				 * Signal bgwriter to start a checkpoint if we've consumed too
				 * much xlog since the last one.  For speed, we first check
				 * using the local copy of RedoRecPtr, which might be out of
				 * date; if it looks like a checkpoint is needed, forcibly
				 * update RedoRecPtr and recheck.
				 */
				if (IsUnderPostmaster &&
					XLogCheckpointNeeded())
				{
					if (Debug_print_qd_mirroring)
						elog(LOG, "time for a checkpoint, signaling bgwriter");
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded())
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	Assert(npages == 0);
	Assert(curridx == Write->curridx);

	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen) &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
				XLogFileClose();
			if (!MirroredFlatFile_IsActive(&mirroredLogFileOpen))
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				XLogFileOpen(
						&mirroredLogFileOpen,
						openLogId,
						openLogSeg);
				openLogOff = 0;
			}
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
				MirroredFlatFile_Flush(
								&mirroredLogFileOpen,
								/* suppressError */ false);

			elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
				 "XLogWrite (#2): flush loc %s; write loc %s",
				 XLogLocationToString_Long(&LogwrtResult.Flush),
				 XLogLocationToString2_Long(&LogwrtResult.Write));
		}

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&xlogctl->info_lck);
	}

	Write->LogwrtResult = LogwrtResult;
}
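
/*
 * Illustrative note (not part of the original sources): the usual division of
 * labour around XLogWrite is
 *
 *     XLogInsert()            fills buffer pages, bumps shared LogwrtRqst
 *     XLogFlush(recptr)       takes WALWriteLock and calls XLogWrite with a
 *                             flush request covering recptr
 *     XLogBackgroundFlush()   the walwriter's periodic, "flexible" write
 *
 * so LogwrtResult only ever advances while WALWriteLock is held, and the
 * shared request pointers tell the next writer how far it is worth going.
 */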

/*
 * Record the LSN for an asynchronous transaction commit.
 * (This should not be called for aborts, nor for synchronous commits.)
 */
void
XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
		xlogctl->asyncCommitLSN = asyncCommitLSN;
	SpinLockRelease(&xlogctl->info_lck);
}

/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
 * database is consistent.
 *
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
 * is only updated if it's not already greater than or equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
		return;

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too.
	 */
	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
		updateMinRecoveryPoint = false;
	else if (force || XLByteLT(minRecoveryPoint, lsn))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;
		XLogRecPtr	newMinRecoveryPoint;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL,
		 * if the caller got it from a corrupted heap page.  Accepting such
		 * a value as the min recovery point would prevent us from coming up
		 * at all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&xlogctl->info_lck);
		newMinRecoveryPoint = xlogctl->replayEndRecPtr;
		SpinLockRelease(&xlogctl->info_lck);

		if (!force && XLByteLT(newMinRecoveryPoint, lsn))
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 lsn.xlogid, lsn.xrecoff,
				 newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);

		/* update control file */
		if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			UpdateControlFile();
			minRecoveryPoint = newMinRecoveryPoint;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X",
						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
		}
	}
	LWLockRelease(ControlFileLock);
}
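
/*
 * Illustrative note (not part of the original sources): minRecoveryPoint is
 * what makes a restart during archive recovery safe.  If we crash after data
 * pages beyond the last checkpoint have already been flushed, the next
 * recovery attempt must replay WAL at least up to
 * ControlFile->minRecoveryPoint before the cluster can be considered
 * consistent again.
 */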

/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead.
	 * We test XLogInsertAllowed(), not InRecovery, because we need the
	 * bgwriter to act this way too, and because when the bgwriter tries
	 * to write the end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	if (Debug_print_qd_mirroring)
		elog(LOG, "xlog flush request %s; write %s; flush %s",
			 XLogLocationToString(&record),
			 XLogLocationToString2(&LogwrtResult.Write),
			 XLogLocationToString3(&LogwrtResult.Flush));

	/* Quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
			WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* done already? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			/* try to write/flush later additions to XLOG as well */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *Insert = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(Insert);

				if (freespace < SizeOfXLogRecord)		/* buffer is full */
					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
				else
				{
					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
					WriteRqstPtr.xrecoff -= freespace;
				}
				LWLockRelease(WALInsertLock);
				WriteRqst.Write = WriteRqstPtr;
				WriteRqst.Flush = WriteRqstPtr;
			}
			else
			{
				WriteRqst.Write = WriteRqstPtr;
				WriteRqst.Flush = record;
			}
			XLogWrite(WriteRqst, false, false);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4,
	 * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
	 * problem; the only time we can reach here during recovery is while
	 * flushing the end-of-recovery checkpoint record, and we don't expect
	 * that to have a bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will
	 * not force a restart for a bad LSN on a data page.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}
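
/*
 * Usage sketch (illustrative, not part of the original sources): a typical
 * synchronous commit path does something like
 *
 *     recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
 *     XLogFlush(recptr);
 *
 * i.e. it records the insert's end location and then insists that WAL is
 * durable at least up to that point before reporting the commit to the
 * client.
 */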

/*
 * TODO: This is just for the matter of WAL receiver build.  We cannot
 * expose MirroredFlatFileOpen in xlog.h.
 */
int
XLogFileInitExt(uint32 log, uint32 seg, bool *use_existent, bool use_lock)
{
	MirroredFlatFileOpen mirroredOpen;

	XLogFileInit(&mirroredOpen, log, seg, use_existent, use_lock);
	return mirroredOpen.primaryFile;
}

/*
 * Flush xlog, but without specifying exactly where to flush to.
 *
 * We normally flush only completed blocks; but if there is nothing to do on
 * that basis, we check for unflushed async commits in the current incomplete
 * block, and flush through the latest one of those.  Thus, if async commits
 * are not being used, we will flush complete blocks only.  We can guarantee
 * that async commits reach disk after at most three cycles; normally only
 * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
 * at the end of the buffer ring; this makes a difference only with very high
 * load or long wal_writer_delay, but imposes one extra cycle for the worst
 * case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 */
void
XLogBackgroundFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
	bool		flexible = true;

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return;

	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* back off to last completed page boundary */
	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		WriteRqstPtr = xlogctl->asyncCommitLSN;
		SpinLockRelease(&xlogctl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done. Just need to check if we
	 * are holding an open file handle to a logfile that's no longer
	 * in use, preventing the file from being deleted.
	 */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
			{
				XLogFileClose();
			}
		}
		return;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
			 WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/* now wait for the write lock */
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->Write.LogwrtResult;
	if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		XLogwrtRqst WriteRqst;

		WriteRqst.Write = WriteRqstPtr;
		WriteRqst.Flush = WriteRqstPtr;
		XLogWrite(WriteRqst, flexible, false);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();
}
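
/*
 * Illustrative note (not part of the original sources): because this routine
 * passes flexible = true for ordinary background writes, XLogWrite may stop
 * at a convenient buffer boundary instead of honouring the exact request;
 * only the async-commit case above forces the full distance.  That keeps the
 * walwriter cheap while still bounding how long an asynchronous commit can
 * sit in the WAL buffers.
 */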

/*
 * Flush any previous asynchronously-committed transactions' commit records.
 *
 * NOTE: it is unwise to assume that this provides any strong guarantees.
 * In particular, because of the inexact LSN bookkeeping used by clog.c,
 * we cannot assume that hint bits will be settable for these transactions.
 */
void
XLogAsyncCommitFlush(void)
{
	XLogRecPtr	WriteRqstPtr;

	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/* There are no asynchronously committed transactions during recovery */
	if (RecoveryInProgress())
		return;

	SpinLockAcquire(&xlogctl->info_lck);
	WriteRqstPtr = xlogctl->asyncCommitLSN;
	SpinLockRelease(&xlogctl->info_lck);

	XLogFlush(WriteRqstPtr);
}

/*
 * Test whether XLOG data has been flushed up to (at least) the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/* Quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* check again */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	return true;
}
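
/*
 * Usage sketch (illustrative, not part of the original sources): a caller
 * that would otherwise call XLogFlush() unconditionally can use this as a
 * cheap pre-check, e.g.
 *
 *     if (XLogNeedsFlush(lsn))
 *         XLogFlush(lsn);
 *
 * avoiding WALWriteLock traffic when the WAL is already known durable.
 */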

/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
 * pre-existing file will be deleted).  On return, TRUE if a pre-existing
 * file was used.
 *
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * On return, *mirroredOpen has been opened on the target segment.
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
 */
static void
XLogFileInit(
	MirroredFlatFileOpen *mirroredOpen,
	uint32 log, uint32 seg,
	bool *use_existent, bool use_lock)
{
	char		simpleFileName[MAXPGPATH];
	char		tmpsimple[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	MirroredFlatFileOpen tmpMirroredOpen;
	char	   *zbuffer;
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
	int			nbytes;
	char	   *xlogDir = NULL;

	XLogFileName(simpleFileName, ThisTimeLineID, log, seg);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		if (MirroredFlatFile_Open(
							mirroredOpen,
							XLOGDIR,
							simpleFileName,
							O_RDWR | PG_BINARY | get_sync_bit(sync_method),
							S_IRUSR | S_IWUSR,
							/* suppressError */ true,
							/* atomic operation */ false,
							/* isMirrorRecovery */ false))
		{
			char		path[MAXPGPATH];

			XLogFilePath(path, ThisTimeLineID, log, seg);

			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
								path, log, seg)));
		}
		else
			return;
	}

	/*
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
	 */
	elog(DEBUG2, "creating and filling new WAL file");

	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);

	if (snprintf(tmpsimple, MAXPGPATH, "xlogtemp.%d", (int) getpid()) > MAXPGPATH)
	{
		ereport(ERROR,
				(errmsg("could not generate filename xlogtemp.%d", (int) getpid())));
	}

	if (snprintf(tmppath, MAXPGPATH, "%s/%s", xlogDir, tmpsimple) > MAXPGPATH)
	{
		ereport(ERROR,
				(errmsg("could not generate filename %s/%s", xlogDir, tmpsimple)));
	}

	MirroredFlatFile_Drop(
						  XLOGDIR,
						  tmpsimple,
						  /* suppressError */ true,
						  /* isMirrorRecovery */ false);

	/* do not use get_sync_bit here --- want to fsync only at end of fill */
	MirroredFlatFile_Open(
						&tmpMirroredOpen,
						XLOGDIR,
						tmpsimple,
						O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					    S_IRUSR | S_IWUSR,
					    /* suppressError */ false,
						/* atomic operation */ false,
						/* isMirrorRecovery */ false);

	/*
	 * Zero-fill the file.  We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.  Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
	 */
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
	{
		errno = 0;
		if (MirroredFlatFile_Append(
							&tmpMirroredOpen,
							zbuffer,
							XLOG_BLCKSZ,
							/* suppressError */ true))
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			MirroredFlatFile_Drop(
							XLOGDIR,
							tmpsimple,
							/* suppressError */ false,
							/* isMirrorRecovery */ false);

			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}
	pfree(zbuffer);

	MirroredFlatFile_Flush(
				&tmpMirroredOpen,
				/* suppressError */ false);

	MirroredFlatFile_Close(&tmpMirroredOpen);

	/*
	 * Now move the segment into place with its final name.
	 *
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
	 */
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
								use_lock, tmpsimple))
	{
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
		MirroredFlatFile_Drop(
						XLOGDIR,
						tmpsimple,
						/* suppressError */ false,
						/* isMirrorRecovery */ false);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	MirroredFlatFile_Open(
						mirroredOpen,
						XLOGDIR,
						simpleFileName,
						O_RDWR | PG_BINARY | get_sync_bit(sync_method),
						S_IRUSR | S_IWUSR,
						/* suppressError */ false,
						/* atomic operation */ false,
						/* isMirrorRecovery */ false);

	pfree(xlogDir);

	elog(DEBUG2, "done creating and filling new WAL file");
}
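
/*
 * Illustrative note (not part of the original sources): the create path above
 * follows the usual "build aside, then install" pattern: zero-fill
 * xlogtemp.<pid>, fsync it, and only then link/rename it into place under its
 * final segment name (via InstallXLogFileSegment).  A crash in the middle
 * therefore leaves at worst a stray temp file, never a partially-initialized
 * segment under a live WAL file name.
 */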

/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
 *
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 *		a different timeline)
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
XLogFileCopy(uint32 log, uint32 seg,
			 TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		buffer[XLOG_BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;
	char	   *xlogDir = NULL;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srclog, srcseg);
	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.
	 */
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
				 xlogDir, (int) getpid()) > MAXPGPATH)
		ereport(ERROR,
				(errmsg("could not generate filename %s/xlogtemp.%d",
						xlogDir, (int) getpid())));
	pfree(xlogDir);
	unlink(tmppath);

	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
		 "Master Mirroring: copying xlog file '%s' to '%s'",
		 path, tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Do the data copying.
	 */
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
	{
		errno = 0;
		if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			if (errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			else
				ereport(ERROR,
						(errmsg("not enough data in file \"%s\"", path)));
		}
		errno = 0;
		if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}

	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	close(srcfd);

	/*
	 * Now move the segment into place with its final name.
	 */
	if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false, NULL))
		elog(ERROR, "InstallXLogFileSegment should not have failed");
}

/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * *log, *seg: identify segment to install as (or first possible target).
 * When find_free is TRUE, these are modified on return to indicate the
 * actual installation location or last segment searched.
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * *max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  On return, reduced
 * by the number of slots skipped over.  (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
 *
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns TRUE if the file was installed successfully.  FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
 */
static bool
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
					   bool use_lock, char *tmpsimpleFileName)
{
	char		path[MAXPGPATH];
	char		simpleFileName[MAXPGPATH];
	struct stat stat_buf;
	int			retval = 0;

	errno = 0;

	XLogFileName(simpleFileName, ThisTimeLineID, *log, *seg);

	XLogFilePath(path, ThisTimeLineID, *log, *seg);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		if (tmpsimpleFileName) {

			MirroredFlatFile_Drop(
								  XLOGDIR,
								  simpleFileName,
								  /* suppressError */ true,
								  /* isMirrorRecovery */ false);
		} else {
			unlink(path);
		}
	}
	else
	{
		/* Find a free slot to put it in */
		while (stat(path, &stat_buf) == 0)
		{
			if (*max_advance <= 0)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			NextLogSeg(*log, *seg);
			(*max_advance)--;

			XLogFileName(simpleFileName, ThisTimeLineID, *log, *seg);
			XLogFilePath(path, ThisTimeLineID, *log, *seg);
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK

	if (tmpsimpleFileName) {
		retval = MirroredFlatFile_Rename(
										 XLOGDIR,
										 /* old name */ tmpsimpleFileName,
										 /* new name */ simpleFileName,
										 /* can exist? */ false,
										 /* isMirrorRecovery */ false);
	} else {
		retval = link(tmppath, path);
	}

	if (retval < 0)
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
						tmppath, path, *log, *seg)));
		return false;
	}

	if (tmpsimpleFileName) {

		MirroredFlatFile_Drop(
						  XLOGDIR,
						  tmpsimpleFileName,
						  /* suppressError */ true,
3050
						  /* isMirrorRecovery */ false);
3051 3052 3053 3054
	} else {
		unlink(tmppath);
	}

3055
#else
3056 3057 3058 3059 3060 3061
	if (tmpsimpleFileName) {
		retval = MirroredFlatFile_Rename(
						  XLOGDIR,
						  /* old name */ tmpsimpleFileName,
						  /* new name */ simpleFileName,
						  /* can exist */ false,
3062
						  /* isMirrorRecovery */ false);
3063 3064 3065 3066 3067
	} else {
		retval = rename(tmppath, path);
	}

	if (retval < 0)
3068
	{
3069 3070 3071
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
3072
				(errcode_for_file_access(),
3073
				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
3074
						tmppath, path, *log, *seg)));
3075
		return false;
3076
	}
3077
#endif
V
Vadim B. Mikheev 已提交
3078

3079
	if (use_lock)
3080
		LWLockRelease(ControlFileLock);
3081

3082
	return true;
3083 3084
}
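
#ifdef NOT_USED
/*
 * Illustrative sketch only (never compiled or called): the typical
 * recycle-time call pattern for InstallXLogFileSegment(), modelled on the
 * call made from RemoveOldXlogFiles() further below.  The helper name and
 * its arguments are hypothetical; InstallXLogFileSegment() and XLOGfileslop
 * are the real symbols.
 */
static bool
ExampleRecycleOldSegment(uint32 endlogId, uint32 endlogSeg, char *oldpath,
						 char *oldSimpleFileName)
{
	int			max_advance = XLOGfileslop;

	/*
	 * Reinstall the old file as a future segment at or after
	 * (endlogId, endlogSeg), searching at most max_advance slots and
	 * taking ControlFileLock while the file is moved into place.
	 */
	return InstallXLogFileSegment(&endlogId, &endlogSeg, oldpath,
								  true /* find_free */ ,
								  &max_advance,
								  true /* use_lock */ ,
								  oldSimpleFileName);
}
#endif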

/*
 * Open a pre-existing logfile segment for writing.
 */
static void
XLogFileOpen(
	MirroredFlatFileOpen *mirroredOpen,
	uint32 log,
	uint32 seg)
{
	char		simpleFileName[MAXPGPATH];

	XLogFileName(simpleFileName, ThisTimeLineID, log, seg);

	if (MirroredFlatFile_Open(
					mirroredOpen,
					XLOGDIR,
					simpleFileName,
					O_RDWR | PG_BINARY | get_sync_bit(sync_method),
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
					/* isMirrorRecovery */ false))
	{
		char		path[MAXPGPATH];

		XLogFileName(path, ThisTimeLineID, log, seg);

		ereport(PANIC,
				(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
	}
}

/*
 * Close the current logfile segment for writing.
 */
static void
XLogFileClose(void)
{
	Assert(MirroredFlatFile_IsActive(&mirroredLogFileOpen));

	/*
	 * WAL segment files will not be re-read in normal operation, so we advise
	 * the OS to release any cached pages.	But do not do so if WAL archiving
	 * is active, because the archiver process could use the cache to read the
	 * WAL segment.
	 */
	/* GPDB_84_MERGE_FIXME: Disabled for now, because I'm not sure if this
	 * would make sense with file replication. Like with WAL replication, you
	 * don't want to DONTNEED the file, if it's just about to be read by
	 * mirroring, to be transmitted to the mirror.
	 */
#ifdef NOT_USED
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	if (!XLogIsNeeded())
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
#endif
	MirroredFlatFile_Close(&mirroredLogFileOpen);
}

#if 0 /* GPDB doesn't make use of this function */
/*
 * Attempt to retrieve the specified file from off-line archival storage.
 * If successful, fill "path" with its complete path (note that this will be
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
 *
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
 */
static bool
RestoreArchivedFile(char *path, const char *xlogfname,
					const char *recovername, off_t expectedSize)
{
	char		xlogpath[MAXPGPATH];
	char		xlogRestoreCmd[MAXPGPATH];
	char		lastRestartPointFname[MAXPGPATH];
	char	   *dp;
	char	   *endp;
	const char *sp;
	int			rc;
	bool		signaled;
	struct stat stat_buf;
	uint32		restartLog;
	uint32		restartSeg;

	/*
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
	 *
	 * We could try to optimize this slightly by checking the local copy
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
	 *
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
	 *
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the recoveryRestoreCommand may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fall back to the segments remaining in current XLOGDIR later. The
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
	 */
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);

	/*
	 * Make sure there is no existing file named recovername.
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m",
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							xlogpath)));
	}

	/*
	 * Calculate the archive file cutoff point for use during log shipping
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
		/* we shouldn't need anything earlier than last restart point */
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

	/*
	 * construct the command to be executed
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
					/* %p: relative path of target file */
					sp++;
					StrNCpy(dp, xlogpath, endp - dp);
					make_native_path(dp);
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
					StrNCpy(dp, xlogfname, endp - dp);
					dp += strlen(dp);
					break;
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing restore command \"%s\"",
							 xlogRestoreCmd)));

	/*
	 * Set in_restore_command to tell the signal handler that we should exit
	 * right away on SIGTERM. We know that we're at a safe point to do that.
	 * Check if we had already received the signal, so that we don't miss a
	 * shutdown request received just before this.
	 */
	in_restore_command = true;
	if (shutdown_requested)
		proc_exit(1);

	/*
	 * Copy xlog from archival storage to XLOGDIR
	 */
	rc = system(xlogRestoreCmd);

	in_restore_command = false;

	if (rc == 0)
	{
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 *
		 * XXX I made wrong-size a fatal error to ensure the DBA would notice
		 * it, but is that too strong?	We could try to plow ahead with a
		 * local copy of the file ... but the problem is that there probably
		 * isn't one, and we'd incorrectly conclude we've reached the end of
		 * WAL and we're done recovering ...
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
				ereport(FATAL,
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								xlogpath)));
		}
	}

	/*
	 * Remember, we rollforward UNTIL the restore fails so failure here is
	 * just part of the process... that makes it difficult to determine
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
	 *
	 * However, if the failure was due to any sort of signal, it's best to
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
	 * unexpectedly.
	 *
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
	 */
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
		proc_exit(1);

	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
		(errmsg("could not restore file \"%s\" from archive: return code %d",
				xlogfname, rc)));

	/*
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
	 *
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
	 */
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	return false;
}
#endif
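
/*
 * For reference: the %-escape handling in RestoreArchivedFile() above is what
 * supports a recovery configuration such as (an illustrative example, not a
 * GPDB-specific recommendation):
 *
 *		restore_command = 'cp /mnt/server/archivedir/%f "%p"'
 *
 * where %f expands to the requested WAL file name, %p to the relative path to
 * restore it to, and %r to the file name of the last restart point (useful
 * for archive cleanup tools).
 */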

#ifdef NOT_USED
/*
 * Attempt to execute the recovery_end_command.
 */
static void
ExecuteRecoveryEndCommand(void)
{
	char		xlogRecoveryEndCmd[MAXPGPATH];
	char		lastRestartPointFname[MAXPGPATH];
	char	   *dp;
	char	   *endp;
	const char *sp;
	int			rc;
	bool		signaled;
	uint32		restartLog;
	uint32		restartSeg;

	Assert(recoveryEndCommand);

	/*
	 * Calculate the archive file cutoff point for use during log shipping
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

	/*
	 * construct the command to be executed
	 */
	dp = xlogRecoveryEndCmd;
	endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryEndCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing recovery end command \"%s\"",
							 xlogRecoveryEndCmd)));

	/*
	 * execute the constructed command
	 */
	rc = system(xlogRecoveryEndCmd);
	if (rc != 0)
	{
		/*
		 * If the failure was due to any sort of signal, it's best to punt and
		 * abort recovery. See also detailed comments on signals in
		 * RestoreArchivedFile().
		 */
		signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

		ereport(signaled ? FATAL : WARNING,
				(errmsg("recovery_end_command \"%s\": return code %d",
						xlogRecoveryEndCmd, rc)));
	}
}
#endif

/*
 * Preallocate log files beyond the specified log endpoint.
 *
 * XXX this is currently extremely conservative, since it forces only one
 * future log segment to exist, and even that only if we are 75% done with
 * the current one.  This is only appropriate for very low-WAL-volume systems.
 * High-volume systems will be OK once they've built up a sufficient set of
 * recycled log segments, but the startup transient is likely to include
 * a lot of segment creations by foreground processes, which is not so good.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;

	MirroredFlatFileOpen	mirroredOpen;

	bool		use_existent;

	XLByteToPrevSeg(endptr, _logId, _logSeg);
	if ((endptr.xrecoff - 1) % XLogSegSize >=
		(uint32) (0.75 * XLogSegSize))
	{
		NextLogSeg(_logId, _logSeg);
		use_existent = true;
		XLogFileInit(
			&mirroredOpen,
			_logId, _logSeg, &use_existent, true);
		MirroredFlatFile_Close(&mirroredOpen);
		if (!use_existent)
			CheckpointStats.ckpt_segs_added++;
	}
}
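
/*
 * Worked example of the 75% rule in PreallocXlogFiles(), assuming the default
 * 16 MB XLogSegSize: nothing is preallocated until the endpoint is at least
 * 12 MB into the current segment; past that point exactly one segment beyond
 * the current one is created (or an already-existing file is reused, in which
 * case ckpt_segs_added is not bumped).
 */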

/*
 * Get the log/seg of the latest removed or recycled WAL segment.
 * Returns 0/0 if no WAL segments have been removed since startup.
 */
void
XLogGetLastRemoved(uint32 *log, uint32 *seg)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	*log = xlogctl->lastRemovedLog;
	*seg = xlogctl->lastRemovedSeg;
	SpinLockRelease(&xlogctl->info_lck);
}

/*
 * Update the last removed log/seg pointer in shared memory, to reflect
 * that the given XLOG file has been removed.
 */
static void
UpdateLastRemovedPtr(char *filename)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	uint32		tli,
				log,
				seg;

	XLogFromFileName(filename, &tli, &log, &seg);

	SpinLockAcquire(&xlogctl->info_lck);
	if (log > xlogctl->lastRemovedLog ||
		(log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
	{
		xlogctl->lastRemovedLog = log;
		xlogctl->lastRemovedSeg = seg;
	}
	SpinLockRelease(&xlogctl->info_lck);
}

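#ifdef NOT_USED
/*
 * Illustrative sketch only (never compiled): one way a caller could use
 * XLogGetLastRemoved() to check whether a given segment can still be expected
 * in pg_xlog.  The helper itself is hypothetical; 0/0 means nothing has been
 * removed since startup.
 */
static bool
ExampleSegmentNotYetRemoved(uint32 log, uint32 seg)
{
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;

	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);

	return (lastRemovedLog == 0 && lastRemovedSeg == 0) ||
		log > lastRemovedLog ||
		(log == lastRemovedLog && seg > lastRemovedSeg);
}
#endif
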
/*
 * Recycle or remove all log files older or equal to passed log/seg#
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 */
static void
RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
{
	uint32		endlogId;
	uint32		endlogSeg;
	int			max_advance;
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[MAXFNAMELEN];
	char		path[MAXPGPATH];
	char	   *xlogDir = NULL;

#ifdef WIN32
	char		newpath[MAXPGPATH];
	char		newfilename[MAXPGPATH];
#endif
	struct stat statbuf;

	/*
	 * Initialize info about where to try to recycle to.  We allow recycling
	 * segments up to XLOGfileslop segments beyond the current XLOG location.
	 */
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
	max_advance = XLOGfileslop;

	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	xldir = AllocateDir(xlogDir);
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open transaction log directory \"%s\": %m",
						xlogDir)));

	XLogFileName(lastoff, ThisTimeLineID, log, seg);

	while ((xlde = ReadDir(xldir, xlogDir)) != NULL)
	{
		/*
		 * We ignore the timeline part of the XLOG segment identifiers in
		 * deciding whether a segment is still needed.	This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
		 *
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
		 */
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
		{
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, xlde->d_name) > MAXPGPATH)
				{
					ereport(ERROR, (errmsg("cannot generate filename %s/%s", xlogDir, xlde->d_name)));
				}

				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

				/*
				 * Before deleting the file, see if it can be recycled as a
				 * future log segment. Only recycle normal files, pg_standby
				 * for example can create symbolic links pointing to a
				 * separate archive directory.
				 */
				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
					InstallXLogFileSegment(&endlogId, &endlogSeg, path,
										   true, &max_advance, true, xlde->d_name))
				{
					ereport(DEBUG2,
							(errmsg("recycled transaction log file \"%s\"",
									xlde->d_name)));
					CheckpointStats.ckpt_segs_recycled++;
					/* Needn't recheck that slot on future iterations */
					if (max_advance > 0)
					{
						NextLogSeg(endlogId, endlogSeg);
						max_advance--;
					}
				}
				else
				{
					/* No need for any more future segments... */
					int			rc = 0;

					ereport(DEBUG2,
							(errmsg("removing transaction log file \"%s\"",
									xlde->d_name)));

#ifdef WIN32
					/*
					 * On Windows, if another process (e.g another backend)
					 * holds the file open in FILE_SHARE_DELETE mode, unlink
					 * will succeed, but the file will still show up in
					 * directory listing until the last handle is closed.
					 * To avoid confusing the lingering deleted file for a
					 * live WAL file that needs to be archived, rename it
					 * before deleting it.
					 *
					 * If another process holds the file open without
					 * FILE_SHARE_DELETE flag, rename will fail. We'll try
					 * again at the next checkpoint.
					 */
					snprintf(newpath, MAXPGPATH, "%s.deleted", path);
					if (rename(path, newpath) != 0)
					{
						ereport(LOG,
								(errcode_for_file_access(),
								 errmsg("could not rename old transaction log file \"%s\": %m",
										path)));
						continue;
					}
					snprintf(newfilename, MAXPGPATH, "%s.deleted", xlde->d_name);
					rc = MirroredFlatFile_Drop(
										  XLOGDIR,
										  newfilename,
										  /* suppressError */ true,
										  /* isMirrorRecovery */ false);
#else
					rc = MirroredFlatFile_Drop(
										  XLOGDIR,
										  xlde->d_name,
										  /* suppressError */ true,
										  /* isMirrorRecovery */ false);
#endif

					if (rc != 0)
					{
						ereport(LOG,
								(errcode_for_file_access(),
								 errmsg("could not remove old transaction log file \"%s\": %m",
										path)));
						continue;
					}

					CheckpointStats.ckpt_segs_removed++;
				}

				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

	FreeDir(xldir);
	pfree(xlogDir);
}
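
/*
 * Note on the filename tests in RemoveOldXlogFiles(): a WAL segment name is
 * 24 hex digits, e.g. "000000010000000A00000003", made up of an 8-digit
 * timeline ID, an 8-digit log ID and an 8-digit segment number.  Comparing
 * from d_name + 8 deliberately ignores the timeline part, per the comment in
 * the loop above.
 */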

/*
 * Print log files in the system log.
 *
 */
void
XLogPrintLogNames(void)
{
	DIR		   *xldir;
	struct dirent *xlde;
	int count = 0;
	char *xlogDir = NULL;

	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	xldir = AllocateDir(xlogDir);
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open transaction log directory \"%s\": %m",
						xlogDir)));

	while ((xlde = ReadDir(xldir, xlogDir)) != NULL)
	{
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24)
		{
			elog(LOG,"found log file \"%s\"",
				 xlde->d_name);
			count++;
		}
	}

	FreeDir(xldir);
	pfree(xlogDir);

	elog(LOG,"%d files found", count);
}

/*
 * Remove previous backup history files.  This also retries creation of
 * .ready files for any backup history files for which XLogArchiveNotify
 * failed earlier.
 */
static void
CleanupBackupHistory(void)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		path[MAXPGPATH];
	char	   *xlogDir = NULL;

	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	xldir = AllocateDir(xlogDir);
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open transaction log directory \"%s\": %m",
						xlogDir)));

	while ((xlde = ReadDir(xldir, xlogDir)) != NULL)
	{
		if (strlen(xlde->d_name) > 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
				   ".backup") == 0)
		{
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				ereport(DEBUG2,
				(errmsg("removing transaction log backup history file \"%s\"",
						xlde->d_name)));
				if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, xlde->d_name) > MAXPGPATH)
				{
					elog(LOG, "CleanupBackupHistory: Cannot generate filename %s/%s", xlogDir, xlde->d_name);
				}
				unlink(path);
				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

	pfree(xlogDir);
	FreeDir(xldir);
}
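
/*
 * For the suffix test above: backup history files are named after the WAL
 * segment they start in plus the starting offset, e.g. (illustrative)
 * "000000010000000A00000003.00000020.backup", which is why a matching name is
 * longer than 24 characters and ends in ".backup".
 */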

/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
 *
 * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
 * Otherwise, a normal exclusive lock is used.	At the moment, that's just
 * pro forma, because there can't be any regular backends in the system
 * during recovery.  The 'cleanup' argument applies to all backup blocks
 * in the WAL record, that suffices for now.
 */
void
RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
{
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

	blk = (char *) XLogRecGetData(record) + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;

		memcpy(&bkpb, blk, sizeof(BkpBlock));
		blk += sizeof(BkpBlock);

		/* get_cleanup_lock is ignored in GPDB */
		RestoreBackupBlockContents(lsn, bkpb, blk, false, false);

		blk += BLCKSZ - bkpb.hole_length;
	}
}

/*
 * Workhorse for RestoreBackupBlock usable without an xlog record
 *
 * Restores a full-page image from BkpBlock and a data pointer.
 */
static void
RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
						   bool get_cleanup_lock, bool keep_buffer)
{
	Buffer		buffer;
	Page		page;

	if (! (bkpb.block_info & BLOCK_APPLY))
		return;

	buffer = XLogReadBuffer(bkpb.node, bkpb.block, true);
	Assert(BufferIsValid(buffer));
#if 0 /* upstream merge */
	if (get_cleanup_lock)
		LockBufferForCleanup(buffer);
	else
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
#endif

	page = (Page) BufferGetPage(buffer);

	if (bkpb.hole_length == 0)
	{
		memcpy((char *) page, blk, BLCKSZ);
	}
	else
	{
		memcpy((char *) page, blk, bkpb.hole_offset);
		/* must zero-fill the hole */
		MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
		memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
			   blk + bkpb.hole_offset,
			   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
	}

	/*
	 * The checksum value on this page is currently invalid. We don't
	 * need to reset it here since it will be set before being written.
	 */

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);

	if (!keep_buffer)
		UnlockReleaseBuffer(buffer);

	return;
}
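
/*
 * Layout note for RestoreBackupBlockContents(): a backup block image is
 * stored in WAL with its "hole" (normally the free space between pd_lower and
 * pd_upper) squeezed out, i.e. the stored payload is
 *
 *		[0, hole_offset) ++ [hole_offset + hole_length, BLCKSZ)
 *
 * so the restore above copies the two pieces back around a zero-filled hole.
 */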

bool
IsBkpBlockApplied(XLogRecord *record, uint8 block_id)
{
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

	Assert(block_id < XLR_MAX_BKP_BLOCKS);

	blk = (char *) XLogRecGetData(record) + record->xl_len;
	for (i = 0; i <= block_id; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;

		memcpy(&bkpb, blk, sizeof(BkpBlock));
		blk += sizeof(BkpBlock);

		if (i == block_id)
			return (bkpb.block_info & BLOCK_APPLY) != 0;

		blk += BLCKSZ - bkpb.hole_length;
	}

	return false;
}

/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	pg_crc32	crc;
	int			i;
	uint32		len = record->xl_len;
	BkpBlock	bkpb;
	char	   *blk;

	/*
	 * Calculate the crc using the new fast crc32c algorithm
	 */

	/* First the rmgr data */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, XLogRecGetData(record), len);

	/* Add in the backup blocks, if any */
	blk = (char *) XLogRecGetData(record) + len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		uint32		blen;

		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;

		memcpy(&bkpb, blk, sizeof(BkpBlock));
		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
		{
			ereport(emode,
					(errmsg("incorrect hole size in record at %X/%X",
							recptr.xlogid, recptr.xrecoff)));
			return false;
		}
		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
		COMP_CRC32C(crc, blk, blen);
		blk += blen;
	}

	/* Check that xl_tot_len agrees with our calculation */
	if (blk != (char *) record + record->xl_tot_len)
	{
		ereport(emode,
				(errmsg("incorrect total length in record at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	/* Finally include the record header */
	COMP_CRC32C(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(record->xl_crc, crc))
	{
		ereport(emode,
		(errmsg("incorrect resource manager data checksum in record at %X/%X",
				recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	return true;
}
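
/*
 * To summarize what the stored CRC verified above covers, in the order it is
 * computed: first the rmgr-specific data (xl_len bytes), then each included
 * backup block (its BkpBlock header plus the hole-compressed page image), and
 * finally the fixed record header itself minus the xl_crc field.  A mismatch
 * against record->xl_crc therefore flags corruption anywhere in the record.
 */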

/*
 * Verify whether pg_xlog exists
 *
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a
 * copy but omitted pg_xlog from the copy.
 */
static void
ValidateXLOGDirectoryStructure(void)
{
	char		path[MAXPGPATH];
	char	   *fullpath;
	struct stat stat_buf;

	fullpath = makeRelativeToTxnFilespace(XLOGDIR);

	/* Check for pg_xlog; if it doesn't exist, error out */
	if (stat(fullpath, &stat_buf) != 0 ||
		!S_ISDIR(stat_buf.st_mode))
		ereport(FATAL,
				(errmsg("required WAL directory \"%s\" does not exist",
						XLOGDIR)));

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	fullpath = makeRelativeToTxnFilespace(path);
	if (stat(fullpath, &stat_buf) == 0)
	{
		/* Check for weird cases where it exists but isn't a directory */
		if (!S_ISDIR(stat_buf.st_mode))
			ereport(FATAL,
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
	}
	else
	{
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
		if (mkdir(fullpath, 0700) < 0)
			ereport(FATAL,
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
	}
}

/*
 * Open a logfile segment for reading (during recovery).
 * It's assumed to be already available in pg_xlog.
 */
static int
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
			 int source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, log, seg);

	switch (source)
	{
		case XLOG_FROM_PG_XLOG:
		case XLOG_FROM_STREAM:
			XLogFilePath(path, tli, log, seg);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
	}

	elogif(debug_xlog_record_read, LOG,
		   "xlog file read -- File read request with log %u, seg %u,"
		   "tli %u, source = %s, notfoundok = %s",
		   log, seg, (uint32) tli,
		   source == XLOG_FROM_PG_XLOG ? "xlog" : "stream",
		   notfoundOk ? "true" : "false");

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;

		/*
		 * Report recovery progress in PS display, if we are in
		 * startup process.  There are more cases like Filerep recovery
		 * and Prepare phase where we don't want to report it.
		 */
		if (am_startup)
		{
			snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
					 xlogfname);
			set_ps_display(activitymsg, false);
		}

		/* Track source of data in assorted state variables */
		readSource = source;
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

		elogif(debug_xlog_record_read, LOG,
			   "xlog file read -- Read file %s (log %u, seg %u)",
			   path, log, seg);

		return fd;
	}

	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));

	elogif(debug_xlog_record_read, LOG,
		   "xlog file read -- Couldn't read file %s (log %u, seg %u)",
		   path, log, seg);
	return -1;
}


/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLIs.
 */
static int
XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLIs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.	We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 */
	foreach(cell, expectedTLIs)
	{
		TimeLineID	tli = (TimeLineID) lfirst_int(cell);

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		if (sources & XLOG_FROM_PG_XLOG)
		{
			elogif(debug_xlog_record_read, LOG,
				   "xlog file read (tli) -- requesting a file read (log %u, seg %u)"
				   "with currenttli %d ", log, seg, curFileTLI);

			fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
			if (fd != -1)
				return fd;
		}
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, log, seg);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
	return -1;
}


/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns true if the page is read successfully.
 *
 * This is responsible for waiting for the requested WAL record to arrive in
 * standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in standby mode when a trigger file is found.
 * If set to WARNING or below, XLogPageRead() returns false in those situations
 * on higher log levels the ereport() won't return.
 *
 * In standby mode, this only returns false if promotion has been triggered.
 * Otherwise it keeps sleeping and retrying indefinitely.
 */
static bool
XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess)
{
	static XLogRecPtr receivedUpto = {0, 0};
	bool		switched_segment = false;
	uint32		targetPageOff;
	uint32		targetRecOff;
	uint32		targetId;
	uint32		targetSeg;
	static pg_time_t last_fail_time = 0;

	XLByteToSeg(*RecPtr, targetId, targetSeg);
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;

	/* Fast exit if we have read the record in the current buffer already */
	if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
		targetPageOff == readOff && targetRecOff < readLen)
	{
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- Requested record %X/%X (targetlogid %u,"
			   "targetset %u, targetpageoff %u, targetrecoff %u) already"
			   "exists in current read buffer",
			   RecPtr->xlogid, RecPtr->xrecoff,
			   targetId, targetSeg, targetPageOff, targetRecOff);

		return true;
	}

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
	{
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- Requested record %X/%X does not exist in"
			   "current read xlog file (readlog %u, readseg %u)",
			   RecPtr->xlogid, RecPtr->xrecoff, readId, readSeg);

		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(*RecPtr, readId, readSeg);

	elogif(debug_xlog_record_read, LOG,
		   "xlog page read -- Requested record %X/%X has targetlogid %u, "
		   "targetseg %u, targetpageoff %u, targetrecoff %u",
		   RecPtr->xlogid, RecPtr->xrecoff,
		   targetId, targetSeg, targetPageOff, targetRecOff);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
	{
		if (StandbyMode)
		{
			/*
			 * In standby mode, wait for the requested record to become
			 * available, via WAL receiver having streamed the record.
			 */
			for (;;)
			{
				if (WalRcvInProgress())
				{
					bool		havedata;

					/*
					 * If we find an invalid record in the WAL streamed from
					 * master, something is seriously wrong. There's little
					 * chance that the problem will just go away, but PANIC is
					 * not good for availability. Disconnect, and retry from
					 * pg_xlog again (That may spawn the Wal receiver again!).
					 * XXX
					 */
					if (failedSources & XLOG_FROM_STREAM)
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- Xlog from stream is a failed"
							   "source, hence requesting walreceiver shutdown.");

						ShutdownWalRcv();
						continue;
					}

					/*
					 * WAL receiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle.  When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (XLByteLT(*RecPtr, receivedUpto))
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
						if (XLByteLT(*RecPtr, receivedUpto))
						{
							havedata = true;
							if (!XLByteLT(*RecPtr, latestChunkStart))
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}

					if (havedata)
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- There is enough xlog data to be "
							   "read (receivedupto %X/%X, requestedrec %X/%X)",
							   receivedUpto.xlogid, receivedUpto.xrecoff,
							   RecPtr->xlogid, RecPtr->xrecoff);

						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already.  Use XLOG_FROM_STREAM so that
						 * source info is set correctly and XLogReceiptTime
						 * isn't changed.
						 */
						if (readFile < 0)
						{
							readFile =
								XLogFileRead(readId, readSeg, PANIC,
											 recoveryTargetTLI,
											 XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
							switched_segment = true;
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
						}
						break;
					}

					/*
					 * Data not here yet, so check for trigger then sleep for
					 * five seconds like in the WAL file polling case below.
					 */
					if (CheckForStandbyTrigger())
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- Standby trigger was activated");

						goto retry;
					}

					elogif(debug_xlog_record_read, LOG,
						   "xlog page read -- No xlog data to read as of now. "
						   "Will Wait on latch till some event occurs");

					/*
					 * Wait for more WAL to arrive, or timeout to be reached
					 */
					WaitLatch(&XLogCtl->recoveryWakeupLatch,
							  WL_LATCH_SET | WL_TIMEOUT,
							  5000L);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
				}
				else
				{
					int			sources;
					pg_time_t	now;

					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}

					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;

					/* Read an existing file from pg_xlog. */
					sources = XLOG_FROM_PG_XLOG;
					if (!(sources & ~failedSources))
					{
						/*
						 * Check if we have been asked to be promoted. If yes,
						 * no use of requesting a new WAL receiver
						 */
						if (CheckForStandbyTrigger())
							goto triggered;

						/*
						 * We've exhausted all options for retrieving the
						 * file. Retry.
						 */
						failedSources = 0;

						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- All read sources have failed. So, retry.");

						/*
						 * If it hasn't been long since last attempt, sleep to
						 * avoid busy-waiting.
						 */
						now = (pg_time_t) time(NULL);
						if ((now - last_fail_time) < 5)
						{
							pg_usleep(1000000L * (5 - (now - last_fail_time)));
							now = (pg_time_t) time(NULL);
						}
						last_fail_time = now;

						/*
						 * If primary_conninfo is set, launch walreceiver to
						 * try to stream the missing WAL.
						 *
						 * If fetching_ckpt is TRUE, RecPtr points to the
						 * initial checkpoint location. In that case, we use
						 * RedoStartLSN as the streaming start position
						 * instead of RecPtr, so that when we later jump
						 * backwards to start redo at RedoStartLSN, we will
						 * have the logs streamed already.
						 */
						if (PrimaryConnInfo)
						{
							RequestXLogStreaming(
									  fetching_ckpt ? RedoStartLSN : *RecPtr,
												 PrimaryConnInfo);
							continue;
						}
					}
					/* Don't try to read from a source that just failed */
					sources &= ~failedSources;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
												  sources);
					switched_segment = true;
					if (readFile >= 0)
						break;

					/*
					 * Nope, not found in pg_xlog.
					 */
					failedSources |= sources;

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from pg_xlog before failover.
					 */
					if (CheckForStandbyTrigger())
						goto triggered;
				}

				/*
				 * This possibly-long loop needs to handle interrupts of
				 * startup process.
				 */
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In crash recovery. */
			if (readFile < 0)
			{
				int			sources;

				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				sources = XLOG_FROM_PG_XLOG;

				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
											sources);
				switched_segment = true;
				if (readFile < 0)
					return false;
			}
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			readLen = XLOG_BLCKSZ;
		}
		else
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
		/*
		 * Whenever switching to a new WAL segment, we read the first page of
		 * the file and validate its header, even if that's not where the
		 * target record is.  This is so that we can check the additional
		 * identification info that is present in the first page's "long"
		 * header.
		 */
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
		{
			ereport(emode,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
	}

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode,
				(errcode_for_file_access(),
		 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode,
				(errcode_for_file_access(),
		 errmsg("could not read from log file %u, segment %u, offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, false))
	{
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- xlog page header invalid");
		goto next_record_is_invalid;
	}

	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

next_record_is_invalid:

	elogif(debug_xlog_record_read, LOG,
		   "xlog page read -- next record is invalid.");

	failedSources |= readSource;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return false;

triggered:
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	return false;
}

/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG.)
 *
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
 */
XLogRecord *
XLogReadRecord(XLogRecPtr *RecPtr, bool fetching_ckpt, int emode)
{
	XLogRecord *record;
	char	   *buffer;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
	bool		randAccess = false;
	uint32		len,
				total_len;
	uint32		targetRecOff;
	uint32		pageHeaderSize;

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it this
		 * way, rather than just making a static array, for two reasons: (1)
		 * no need to waste the storage in most instantiations of the backend;
		 * (2) a static char array isn't guaranteed to have any particular
		 * alignment, whereas malloc() will provide MAXALIGN'd storage.
		 */
		readBuf = (char *) malloc(XLOG_BLCKSZ);
		if(!readBuf)
			ereport(PANIC, (errmsg("Cannot allocate memory for read log record. Out of Memory")));
	}

	if (RecPtr == NULL)
	{
		RecPtr = &tmpRecPtr;

		/*
		 * RecPtr is pointing to end+1 of the previous WAL record. We must
		 * advance it if necessary to where the next record starts.  First,
		 * align to next page if no more records can fit on the current page.
		 */
		if (nextRecord == NULL)
		{
			/* align old recptr to next page */
			if (RecPtr->xrecoff % XLOG_BLCKSZ != 0)
				RecPtr->xrecoff += (XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ);
			if (RecPtr->xrecoff >= XLogFileSize)
			{
				(RecPtr->xlogid)++;
				RecPtr->xrecoff = 0;
			}
		}
		/* We will account for page header size below */
	}
	else
	{
		/*
		 * In this case, the passed-in record pointer should already be
		 * pointing to a valid record starting position.
		 */
		if (!XRecOffIsValid(RecPtr->xrecoff))
			ereport(PANIC,
					(errmsg("invalid record offset at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));

		/*
		 * Since we are going to a random position in WAL, forget any prior
		 * state about what timeline we were in, and allow it to be any
		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
		 * to go backwards (but we can't reset that variable right here, since
		 * we might not change files at all).
		 */
		lastPageTLI = 0;		/* see comment in ValidXLOGHeader */
		lastSegmentTLI = 0;
		randAccess = true;		/* allow curFileTLI to go backwards too */
	}

	/* This is the first try to read this page. */
	failedSources = 0;
retry:
	/* Read the page containing the record */
	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
	{
		/*
		 * In standby mode, XLogPageRead returning false means that promotion
		 * has been triggered.
		 */
		if (StandbyMode)
			return NULL;
		else
			goto next_record_is_invalid;
	}

	/* At this point XLogPageRead() has loaded the page containing the record into readBuf. */
	pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
	if (targetRecOff == 0)
	{
		/*
		 * At page start, so skip over page header.  The Assert checks that
		 * we're not scribbling on caller's record pointer; it's OK because we
		 * can only get here in the continuing-from-prev-record case, since
		 * XRecOffIsValid rejected the zero-page-offset case otherwise.
		 */
		Assert(RecPtr == &tmpRecPtr);
		RecPtr->xrecoff += pageHeaderSize;
		targetRecOff = pageHeaderSize;
	}
	else if (targetRecOff < pageHeaderSize)
	{
		ereport(emode,
				(errmsg("invalid record offset at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
		targetRecOff == pageHeaderSize)
	{
		ereport(emode,
				(errmsg("contrecord is requested by %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);

	/*
	 * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
	 * required.
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		if (record->xl_len != 0)
		{
			ereport(emode,
					(errmsg("invalid xlog switch record at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else if (record->xl_len == 0)
	{
		ereport(emode,
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
	{
		ereport(emode,
				(errmsg("invalid record length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
		ereport(emode,
				(errmsg("invalid resource manager ID %u at %X/%X",
						record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (randAccess)
	{
		/*
		 * We can't exactly verify the prev-link, but surely it should be less
		 * than the record's own address.
		 */
		if (!XLByteLT(record->xl_prev, *RecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else
	{
		/*
		 * Record's prev-link should exactly match our previous location. This
		 * check guards against torn WAL pages where a stale but valid-looking
		 * WAL record starts on a sector boundary.
		 */
		if (!XLByteEQ(record->xl_prev, ReadRecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}

	/*
	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
	 * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
	 * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
	 * enough for all "normal" records, but very large commit or abort records
	 * might need more space.)
	 */
	total_len = record->xl_tot_len;
	if (total_len > readRecordBufSize)
	{
		uint32		newSize = total_len;

		newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
		newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
		if (readRecordBuf)
			free(readRecordBuf);
		readRecordBuf = (char *) malloc(newSize);
		if (!readRecordBuf)
		{
			readRecordBufSize = 0;
			ereport(emode,
					(errmsg("cannot allocate %u bytes for record at %X/%X",
							newSize, RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
		readRecordBufSize = newSize;
	}

	buffer = readRecordBuf;
	nextRecord = NULL;
	len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
	if (total_len > len)
	{
		/* Need to reassemble record */
		XLogContRecord *contrecord;
		XLogRecPtr	pagelsn;
		uint32		gotlen = len;

		/* Initialize pagelsn to the beginning of the page this record is on */
		pagelsn = *RecPtr;
		pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;

		memcpy(buffer, record, len);
		record = (XLogRecord *) buffer;
		buffer += len;
		for (;;)
		{
			/* Calculate pointer to beginning of next page */
			pagelsn.xrecoff += XLOG_BLCKSZ;
			if (pagelsn.xrecoff >= XLogFileSize)
			{
				(pagelsn.xlogid)++;
				pagelsn.xrecoff = 0;
			}
			/* Wait for the next page to become available */
			if (!XLogPageRead(&pagelsn, emode, false, false))
			{
				/*
				 * In standby-mode, XLogPageRead returning false means that
				 * promotion has been triggered.
				 */
				if (StandbyMode)
					return NULL;
				else
					goto next_record_is_invalid;
			}

			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
			{
				ereport(emode,
						(errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
								readId, readSeg, readOff)));
				goto next_record_is_invalid;
			}
			pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
			contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
			if (contrecord->xl_rem_len == 0 ||
				total_len != (contrecord->xl_rem_len + gotlen))
			{
				ereport(emode,
						(errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
								contrecord->xl_rem_len,
								readId, readSeg, readOff)));
				goto next_record_is_invalid;
			}
			len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
			{
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
		if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
		{
			nextRecord = (XLogRecord *) ((char *) contrecord +
					MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
			pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		/* needn't worry about XLOG SWITCH, it can't cross page boundaries */
		return record;
	}

	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);

	/*
	 * Special processing if it's an XLOG SWITCH record
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		/* Pretend it extends to end of segment */
		EndRecPtr.xrecoff += XLogSegSize - 1;
		EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
		nextRecord = NULL;		/* definitely not on same page */

		/*
		 * Pretend that readBuf contains the last page of the segment. This is
		 * just to avoid Assert failure in StartupXLOG if XLOG ends with this
		 * segment.
		 */
		readOff = XLogSegSize - XLOG_BLCKSZ;
	}

	elogif(debug_xlog_record_read, LOG,
		   "xlog read record -- Read record %X/%X successfully with endrecptr %X/%X",
		   ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
		   EndRecPtr.xlogid, EndRecPtr.xrecoff);

	return (XLogRecord *) buffer;

next_record_is_invalid:

	elogif(debug_xlog_record_read, LOG,
		   "xlog record read -- next record is invalid.");

	failedSources |= readSource;

	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	nextRecord = NULL;

	/* In standby-mode, keep trying */
	if (StandbyMode && !CheckForStandbyTrigger())
		goto retry;
	else
		return NULL;
}
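
/*
 * Minimal usage sketch for XLogReadRecord(), guarded by NOT_USED so it is
 * never compiled.  The function name, start location and emode below are
 * illustrative assumptions; the real callers are StartupXLOG and the other
 * recovery paths in this file.
 */
#ifdef NOT_USED
static void
XLogReadRecordUsageSketch(XLogRecPtr startLoc)
{
	XLogRecord *record;

	/* position explicitly on a known record address once... */
	record = XLogReadRecord(&startLoc, false, LOG);

	/* ...then pass NULL to keep reading from EndRecPtr */
	while (record != NULL)
	{
		/* a real caller would dispatch the record to its resource manager here */
		record = XLogReadRecord(NULL, false, LOG);
	}
}
#endif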

/*
 * Close, re-set and clean all the necessary resources used during reading
 * XLog records.
 */
void
XLogCloseReadRecord(void)
{
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	else
		Assert(readFile == -1);

	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	if (readRecordBuf)
	{
		free(readRecordBuf);
		readRecordBuf = NULL;
	}

	readId = 0;
	readSeg = 0;
	readOff = 0;
	readLen = 0;
	readRecordBufSize = 0;
	nextRecord = NULL;

	memset(&ReadRecPtr, 0, sizeof(XLogRecPtr));
	memset(&EndRecPtr, 0, sizeof(XLogRecPtr));

	elog((Debug_print_qd_mirroring ? LOG : DEBUG1), "close read record");
}

/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
 * ReadRecord.  It's not intended for use from anywhere else.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly)
{
	XLogRecPtr	recaddr;

	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		ereport(emode,
				(errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
						hdr->xlp_magic, readId, readSeg, readOff)));
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}
	if (hdr->xlp_info & XLP_LONG_HEADER)
	{
		XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;

		if (longhdr->xlp_sysid != ControlFile->system_identifier)
		{
			char		fhdrident_str[32];
			char		sysident_str[32];

			/*
			 * Format sysids separately to keep platform-dependent format code
			 * out of the translatable message string.
			 */
			snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
					 longhdr->xlp_sysid);
			snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
					 ControlFile->system_identifier);
			ereport(emode,
					(errmsg("WAL file is from different system"),
					 errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
							   fhdrident_str, sysident_str)));
			return false;
		}
		if (longhdr->xlp_seg_size != XLogSegSize)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
					 errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
			return false;
		}
		if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
					 errdetail("Incorrect XLOG_BLCKSZ in page header.")));
			return false;
		}
	}
	else if (readOff == 0)
	{
		/* hmm, first page of file doesn't have a long header? */
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}

	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
		ereport(emode,
				(errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
						hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Check page TLI is one of the expected values.
	 */
	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
	{
		ereport(emode,
				(errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
						hdr->xlp_tli,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Since child timelines are always assigned a TLI greater than their
	 * immediate parent's TLI, we should never see TLI go backwards across
	 * successive pages of a consistent WAL sequence.
	 *
	 * Of course this check should only be applied when advancing sequentially
	 * across pages; therefore ReadRecord resets lastPageTLI and
	 * lastSegmentTLI to zero when going to a random page.
	 *
	 * Sometimes we re-open a segment that's already been partially replayed.
	 * In that case we cannot perform the normal TLI check: if there is a
	 * timeline switch within the segment, the first page has a smaller TLI
	 * than later pages following the timeline switch, and we might've read
	 * them already. As a weaker test, we still check that it's not smaller
	 * than the TLI we last saw at the beginning of a segment. Pass
	 * segmentonly = true when re-validating the first page like that, and the
	 * page you're actually interested in comes later.
	 */
	if (hdr->xlp_tli < (segmentonly ? lastSegmentTLI : lastPageTLI))
	{
		ereport(emode,
				(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
						hdr->xlp_tli, lastPageTLI,
						readId, readSeg, readOff)));
		return false;
	}
	lastPageTLI = hdr->xlp_tli;
	if (readOff == 0)
		lastSegmentTLI = hdr->xlp_tli;

	return true;
}
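
/*
 * Illustrative note: only the first page of each segment (readOff == 0) is
 * expected to carry the long header, so the xlp_sysid/xlp_seg_size/
 * xlp_xlog_blcksz checks above run once per segment; every later page carries
 * the short header and is validated against pageaddr and timeline only.
 */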

/*
 * Try to read a timeline's history file.
 *
 * If successful, return the list of component TLIs (the given TLI followed by
 * its ancestor TLIs).  If we can't find the history file, assume that the
 * timeline has no parents, and return a list of just the specified timeline
 * ID.
 */
List *
XLogReadTimeLineHistory(TimeLineID targetTLI)
{
	List	   *result;
	char		path[MAXPGPATH];
	char		fline[MAXPGPATH];
	FILE	   *fd;

	/* Timeline 1 does not have a history file, so no need to check */
	if (targetTLI == 1)
		return list_make1_int((int) targetTLI);

	TLHistoryFilePath(path, targetTLI);

	fd = AllocateFile(path, "r");
	if (fd == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		/* Not there, so assume no parents */
		return list_make1_int((int) targetTLI);
	}

	result = NIL;

	/*
	 * Parse the file...
	 */
	while (fgets(fline, sizeof(fline), fd) != NULL)
	{
		/* skip leading whitespace and check for # comment */
		char	   *ptr;
		char	   *endptr;
		TimeLineID	tli;

		for (ptr = fline; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
				break;
		}
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* expect a numeric timeline ID as first field of line */
		tli = (TimeLineID) strtoul(ptr, &endptr, 0);
		if (endptr == ptr)
			ereport(FATAL,
					(errmsg("syntax error in history file: %s", fline),
					 errhint("Expected a numeric timeline ID.")));

		if (result &&
			tli <= (TimeLineID) linitial_int(result))
			ereport(FATAL,
					(errmsg("invalid data in history file: %s", fline),
				   errhint("Timeline IDs must be in increasing sequence.")));

		/* Build list with newest item first */
		result = lcons_int((int) tli, result);

		/* we ignore the remainder of each line */
	}

	FreeFile(fd);

	if (result &&
		targetTLI <= (TimeLineID) linitial_int(result))
		ereport(FATAL,
				(errmsg("invalid data in history file \"%s\"", path),
			errhint("Timeline IDs must be less than child timeline's ID.")));

	result = lcons_int((int) targetTLI, result);

	ereport(DEBUG3,
			(errmsg_internal("history of timeline %u is %s",
							 targetTLI, nodeToString(result))));

	return result;
}
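
/*
 * Worked example: for targetTLI = 3 with a history file listing parents 1 and
 * 2, the returned list is (3, 2, 1) -- the requested timeline first, followed
 * by its ancestors from newest to oldest.
 */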

/*
 * Probe whether a timeline history file exists for the given timeline ID
 */
static bool
existsTimeLineHistory(TimeLineID probeTLI)
{
	char		path[MAXPGPATH];
	FILE	   *fd;

	TLHistoryFilePath(path, probeTLI);

	fd = AllocateFile(path, "r");
	if (fd != NULL)
	{
		FreeFile(fd);
		return true;
	}
	else
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		return false;
	}
}

/*
 * Find the newest existing timeline, assuming that startTLI exists.
 *
 * Note: while this is somewhat heuristic, it does positively guarantee
 * that (result + 1) is not a known timeline, and therefore it should
 * be safe to assign that ID to a new timeline.
 */
static TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
	TimeLineID	newestTLI;
	TimeLineID	probeTLI;

	/*
	 * The algorithm is just to probe for the existence of timeline history
	 * files.  XXX is it useful to allow gaps in the sequence?
	 */
	newestTLI = startTLI;

	for (probeTLI = startTLI + 1;; probeTLI++)
	{
		if (existsTimeLineHistory(probeTLI))
		{
			newestTLI = probeTLI;		/* probeTLI exists */
		}
		else
		{
			/* doesn't exist, assume we're done */
			break;
		}
	}

	return newestTLI;
}
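
/*
 * Example: if history files exist for timelines up through 5,
 * findNewestTimeLine(1) probes 2, 3, ... and returns 5, so 6 is guaranteed not
 * to be a known timeline and can safely be assigned to a new one.
 */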

/*
 * Create a new timeline history file.
 *
 *	newTLI: ID of the new timeline
 *	parentTLI: ID of its immediate parent
 *	endTLI et al: ID of the last used WAL file, for annotation purposes
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		xlogfname[MAXFNAMELEN];
	char		buffer[BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;
	char	   *xlogDir = NULL;

	Assert(newTLI > parentTLI); /* else bad selection of newTLI */

	/*
	 * Write into a temp file name.
	 */
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", xlogDir, (int) getpid()) > MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate filename %s/xlogtemp.%d", xlogDir, (int) getpid())));
	}
	pfree(xlogDir);
	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	TLHistoryFilePath(path, parentTLI);

	srcfd = BasicOpenFile(path, O_RDONLY, 0);
	if (srcfd < 0)
	{
		if (errno != ENOENT)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		/* Not there, so assume parent has no parents */
	}
	else
	{
		for (;;)
		{
			errno = 0;
			nbytes = (int) read(srcfd, buffer, sizeof(buffer));
			if (nbytes < 0 || errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			if (nbytes == 0)
				break;
			errno = 0;
			if ((int) write(fd, buffer, nbytes) != nbytes)
			{
				int			save_errno = errno;

				/*
				 * If we fail to make the file, delete it to release disk
				 * space
				 */
				unlink(tmppath);

				/*
				 * if write didn't set errno, assume problem is no disk space
				 */
				errno = save_errno ? save_errno : ENOSPC;

				ereport(ERROR,
						(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
			}
		}
		close(srcfd);
	}

	/*
	 * Append one line with the details of this timeline split.
	 *
	 * If we did have a parent file, insert an extra newline just in case the
	 * parent file failed to end with one.
	 */
	XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);

	snprintf(buffer, sizeof(buffer),
			 "%s%u\t%s\t%s transaction %u at %s\n",
			 (srcfd < 0) ? "" : "\n",
			 parentTLI,
			 xlogfname,
			 recoveryStopAfter ? "after" : "before",
			 recoveryStopXid,
			 timestamptz_to_str(recoveryStopTime));

	nbytes = strlen(buffer);
	errno = 0;
	if ((int) write(fd, buffer, nbytes) != nbytes)
	{
		int			save_errno = errno;

		/*
		 * If we fail to make the file, delete it to release disk space
		 */
		unlink(tmppath);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));


	/*
	 * Now move the completed history file into place with its final name.
	 */
	TLHistoryFilePath(path, newTLI);

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK
	if (link(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						tmppath, path)));
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
#endif

	/* The history file can be archived immediately. */
	TLHistoryFileName(histfname, newTLI);
	XLogArchiveNotify(histfname);
}
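
/*
 * Illustrative example (values are made up): a history file written here for
 * new timeline 2 would end with a line such as
 *
 *		1	000000010000000000000003	before transaction 1234 at 2009-06-26 20:29:04 GMT
 *
 * i.e. parent TLI, last WAL segment used, and the recovery stop condition,
 * matching the snprintf() format above.
 */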

static void
ControlFileWatcherSaveInitial(void)
{
	ControlFileWatcher->current_checkPointLoc = ControlFile->checkPoint;
	ControlFileWatcher->current_prevCheckPointLoc = ControlFile->prevCheckPoint;
	ControlFileWatcher->current_checkPointCopy_redo = ControlFile->checkPointCopy.redo;

	if (Debug_print_control_checkpoints)
		elog(LOG,"pg_control checkpoint: initial values (checkpoint loc %s, previous loc %s, copy's redo loc %s)",
			 XLogLocationToString_Long(&ControlFile->checkPoint),
			 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
			 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo));

	ControlFileWatcher->watcherInitialized = true;
}

static void
ControlFileWatcherCheckForChange(void)
{
	XLogRecPtr  writeLoc;
	XLogRecPtr  flushedLoc;

	if (!XLByteEQ(ControlFileWatcher->current_checkPointLoc,ControlFile->checkPoint) ||
		!XLByteEQ(ControlFileWatcher->current_prevCheckPointLoc,ControlFile->prevCheckPoint) ||
		!XLByteEQ(ControlFileWatcher->current_checkPointCopy_redo,ControlFile->checkPointCopy.redo))
	{
		ControlFileWatcher->current_checkPointLoc = ControlFile->checkPoint;
		ControlFileWatcher->current_prevCheckPointLoc = ControlFile->prevCheckPoint;
		ControlFileWatcher->current_checkPointCopy_redo = ControlFile->checkPointCopy.redo;

		if (XLogGetWriteAndFlushedLoc(&writeLoc, &flushedLoc))
		{
			bool problem = XLByteLE(flushedLoc,ControlFile->checkPoint);
			if (problem)
				elog(PANIC,"Checkpoint location %s for pg_control file is not flushed (write loc %s, flushed loc is %s)",
				     XLogLocationToString_Long(&ControlFile->checkPoint),
				     XLogLocationToString2_Long(&writeLoc),
				     XLogLocationToString3_Long(&flushedLoc));

			if (Debug_print_control_checkpoints)
				elog(LOG,"pg_control checkpoint: change (checkpoint loc %s, previous loc %s, copy's redo loc %s, write loc %s, flushed loc %s)",
					 XLogLocationToString_Long(&ControlFile->checkPoint),
					 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
					 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo),
					 XLogLocationToString4_Long(&writeLoc),
					 XLogLocationToString5_Long(&flushedLoc));
		}
		else
		{
			if (Debug_print_control_checkpoints)
				elog(LOG,"pg_control checkpoint: change (checkpoint loc %s, previous loc %s, copy's redo loc %s)",
					 XLogLocationToString_Long(&ControlFile->checkPoint),
					 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
					 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo));
		}
	}
}
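
/*
 * The check above enforces a basic durability invariant: any checkpoint
 * location recorded in pg_control must already be flushed to XLOG.  If the
 * flushed location were still at or before the recorded checkpoint, pg_control
 * would point at WAL that might not survive a crash, hence the PANIC.
 */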

/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
	MirroredFlatFileOpen	mirroredOpen;

	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */

	/*
	 * Initialize version and compatibility-check fields
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

#ifdef HAVE_INT64_TIMESTAMP
	ControlFile->enableIntTimes = true;
#else
	ControlFile->enableIntTimes = false;
#endif
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/* Contents are protected with a CRC */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");

	memset(buffer, 0, PG_CONTROL_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	MirroredFlatFile_Open(
					&mirroredOpen,
					XLOG_CONTROL_FILE_SUBDIR,
					XLOG_CONTROL_FILE_SIMPLE,
					O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
					/* isMirrorRecovery */ false);

	MirroredFlatFile_Write(
					&mirroredOpen,
					0,
					buffer,
					PG_CONTROL_SIZE,
					/* suppressError */ false);

	MirroredFlatFile_Flush(
					&mirroredOpen,
					/* suppressError */ false);

	MirroredFlatFile_Close(&mirroredOpen);

	ControlFileWatcherSaveInitial();
}

static void
ReadControlFile(void)
{
	pg_crc32	crc;
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not read from control file: %m")));

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
		 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
			ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
				  " but the server was compiled with PG_CONTROL_VERSION %d.",
						ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
				  " but the server was compiled with CATALOG_VERSION_NO %d.",
						ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		   errdetail("The database cluster was initialized with MAXALIGN %d,"
					 " but the server was compiled with MAXALIGN %d.",
					 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
			 errdetail("The database cluster was initialized with BLCKSZ %d,"
					   " but the server was compiled with BLCKSZ %d.",
					   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
				  " but the server was compiled with RELSEG_SIZE %d.",
				  ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
				  " but the server was compiled with XLOG_BLCKSZ %d.",
				  ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
					   " but the server was compiled with XLOG_SEG_SIZE %d.",
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		errdetail("The database cluster was initialized with NAMEDATALEN %d,"
				  " but the server was compiled with NAMEDATALEN %d.",
				  ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
					  " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
				" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
			  ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef HAVE_INT64_TIMESTAMP
	if (ControlFile->enableIntTimes != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
				  " but the server was compiled with HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->enableIntTimes != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
			   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
					  " but the server was compiled with USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
				  " but the server was compiled without USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
					  " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
		errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
				  " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);

	if (!ControlFileWatcher->watcherInitialized)
	{
		ControlFileWatcherSaveInitial();
	}
	else
	{
		ControlFileWatcherCheckForChange();
	}
}
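
/*
 * Worked example of the byte-ordering heuristic near the top of
 * ReadControlFile(): a small version value such as 0x0000034B, read with the
 * opposite byte order, appears as 0x4B030000.  Its low 16 bits are zero while
 * the high 16 bits are not, so "version % 65536 == 0 && version / 65536 != 0"
 * catches it and the error can hint at mismatched byte ordering instead of
 * reporting only a CRC failure.
 */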

static bool
XLogGetWriteAndFlushedLoc(XLogRecPtr *writeLoc, XLogRecPtr *flushedLoc)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	*writeLoc = xlogctl->LogwrtResult.Write;
	*flushedLoc = xlogctl->LogwrtResult.Flush;
	SpinLockRelease(&xlogctl->info_lck);

	return (writeLoc->xlogid != 0 || writeLoc->xrecoff != 0);
}

/*
 * Very specific purpose routine for FileRep that flushes out XLOG records from the
 * XLOG memory cache to disk.
 */
void
XLogFileRepFlushCache(
	XLogRecPtr	*lastChangeTrackingEndLoc)
{
	/*
	 * We hold the ChangeTrackingTransitionLock EXCLUSIVE, thus the lastChangeTrackingEndLoc
	 * value is the previous location -- the one we want.
	 *
	 * Since the lock is acquired after ALL WRITES and FSYNCS in XLogInsert_Internal,
	 * we know this flush is safe (i.e. will not hang) and will push out all XLOG records we
	 * want to see in the next call to ChangeTracking_CreateInitialFromPreviousCheckpoint.
	 */

	*lastChangeTrackingEndLoc = XLogCtl->lastChangeTrackingEndLoc;

	XLogFlush(*lastChangeTrackingEndLoc);
}

void
UpdateControlFile(void)
{
	MirroredFlatFileOpen	mirroredOpen;

	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	MirroredFlatFile_Open(
					&mirroredOpen,
					XLOG_CONTROL_FILE_SUBDIR,
					XLOG_CONTROL_FILE_SIMPLE,
					O_RDWR | PG_BINARY,
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
					/* isMirrorRecovery */ false);

	MirroredFlatFile_Write(
					&mirroredOpen,
					0,
					ControlFile,
					PG_CONTROL_SIZE,
					/* suppressError */ false);

	MirroredFlatFile_Flush(
					&mirroredOpen,
					/* suppressError */ false);

	MirroredFlatFile_Close(&mirroredOpen);

	Assert (ControlFileWatcher->watcherInitialized);

	ControlFileWatcherCheckForChange();
}

/*
 * Returns the unique system identifier from control file.
 */
uint64
GetSystemIdentifier(void)
{
	Assert(ControlFile != NULL);
	return ControlFile->system_identifier;
}

/*
 * Initialization of shared memory for XLOG
 */
Size
XLOGShmemSize(void)
{
	Size		size;

	/* XLogCtl */
	size = sizeof(XLogCtlData);
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, ALIGNOF_XLOG_BUFFER);
	/* and the buffers themselves */
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));

	/*
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
	 */

	/*
	 * Similarly, we don't count the ControlFileWatch struct, for the same
	 * reasons.
	 */

	return size;
}
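
/*
 * The size computed above mirrors the layout carved out in XLOGShmemInit()
 * below: the XLogCtlData struct, then the xlblocks array of XLOGbuffers
 * XLogRecPtrs, then padding up to an ALIGNOF_XLOG_BUFFER boundary, then the
 * XLOG_BLCKSZ-sized page buffers themselves.
 */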

void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog,
				foundCFileWatcher;
	char	   *allocptr;

	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
	ControlFileWatcher = (ControlFileWatch *)
		ShmemInitStruct("Control File Watcher", sizeof(ControlFileWatch), &foundCFileWatcher);
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	if (foundCFile || foundXLog || foundCFileWatcher)
	{
		/* all three should be present, or none of them */
		Assert(foundCFile && foundXLog && foundCFileWatcher);
		return;
	}

	memset(XLogCtl, 0, sizeof(XLogCtlData));

	XLogCtl->pass4_PTCatVerificationPassed = true;

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;

	/*
	 * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
	 */
	allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryInProgress = true;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
	SpinLockInit(&XLogCtl->info_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

	XLogCtl->haveLastCheckpointLoc = false;
	memset(&XLogCtl->lastCheckpointLoc, 0, sizeof(XLogRecPtr));
	memset(&XLogCtl->lastCheckpointEndLoc, 0, sizeof(XLogRecPtr));

	/*
	 * Initialize the shared memory by the parameter given to postmaster.
	 * GpStandbyDbid could be inconsistent with the catalog if the postmaster
	 * is given wrong id, but there is no chance to check it in this early
	 * stage of startup, and this is how we have been doing historically.
	 */
	XLogCtl->standbyDbid = GpStandbyDbid;

	SpinLockInit(&XLogCtl->resynchronize_lck);
}
/**
 * This should be called when we are sure that it is safe to try to read the control file and BEFORE
 *  we have launched any child processes that need access to collation and ctype data.
 *
 * It is not safe to read the control file on a mirror because it may not be synchronized
 */
void
XLogStartupInit(void)
{
	/*
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
}

/*
 * Are checksums enabled for data pages?
 */
bool
DataChecksumsEnabled(void)
{
	Assert(ControlFile != NULL);
	return (ControlFile->data_checksum_version > 0);
}

/*
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
 */
void
BootStrapXLOG(void)
{
	CheckPoint	checkPoint;
	char	   *buffer;
	XLogPageHeader page;
	XLogLongPageHeader longpage;
	XLogRecord *record;
	bool		use_existent;
	uint64		sysidentifier;
	struct timeval tv;
	pg_crc32	crc;

	/*
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half is the XOR
	 * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
	 * unnecessarily if "uint64" is really only 32 bits wide.  A person
	 * knowing this encoding can determine the initialization time of the
	 * installation, which could perhaps be useful sometimes.
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);

	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

	/* page buffer must be aligned suitably for O_DIRECT */
	buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
	page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
	memset(page, 0, XLOG_BLCKSZ);

	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
	checkPoint.nextXidEpoch = 0;
	checkPoint.nextXid = FirstNormalTransactionId;
	checkPoint.nextOid = FirstBootstrapObjectId;
	checkPoint.nextRelfilenode = FirstNormalObjectId;
	checkPoint.nextMulti = FirstMultiXactId;
	checkPoint.nextMultiOffset = 0;
	checkPoint.time = (pg_time_t) time(NULL);

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
	ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
	ShmemVariableCache->relfilenodeCount = 0;
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);

	/* Set up the XLOG page header */
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = XLogSegSize;
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = XLogSegSize;
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

	/* Insert the initial checkpoint record */
	record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
	record->xl_xid = InvalidTransactionId;
	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
	record->xl_len = sizeof(checkPoint);
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
	record->xl_rmid = RM_XLOG_ID;
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));

	INIT_CRC32C(crc);
	COMP_CRC32C(crc, &checkPoint, sizeof(checkPoint));
	COMP_CRC32C(crc, (char *) record + sizeof(pg_crc32),
				SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32C(crc);

	record->xl_crc = crc;

	/* Create first XLOG segment file */
	use_existent = false;
	XLogFileInit(
		&mirroredLogFileOpen,
		0, 1, &use_existent, false);

	/* Write the first page with the initial record */
	errno = 0;
	if (MirroredFlatFile_Append(
			&mirroredLogFileOpen,
			page,
			XLOG_BLCKSZ,
			/* suppressError */ true))
	{
		ereport(PANIC,
				(errcode_for_file_access(),
			  errmsg("could not write bootstrap transaction log file: %m")));
	}

	if (MirroredFlatFile_Flush(
			&mirroredLogFileOpen,
			/* suppressError */ true))
		ereport(PANIC,
				(errcode_for_file_access(),
			  errmsg("could not fsync bootstrap transaction log file: %m")));

	MirroredFlatFile_Close(
			&mirroredLogFileOpen);

	/* Now create pg_control */

	memset(ControlFile, 0, sizeof(ControlFileData));
	/* Initialize pg_control status fields */
	ControlFile->system_identifier = sysidentifier;
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->data_checksum_version = bootstrap_data_checksum_version;

	/* some additional ControlFile fields are set in WriteControlFile() */

	WriteControlFile();

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
	BootStrapSUBTRANS();
	BootStrapMultiXact();
	DistributedLog_BootStrap();

	pfree(buffer);
}
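
/*
 * Note on the values chosen above: the very first checkpoint record lands at
 * 0/(XLogSegSize + SizeOfXLogLongPHD), i.e. at the start of segment 1 right
 * after its long page header, and segment 0 is deliberately left unused so
 * that 0/0 can mean "before any valid WAL segment".
 */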

static char *
str_time(pg_time_t tnow)
{
	static char buf[128];

	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));

	return buf;
}

/*
 * See if there is a recovery command file (recovery.conf), and if so
 * read in parameters for recovery in standby mode.
 *
 * XXX longer term intention is to expand this to
 * cater for additional parameters and controls
 * possibly use a flex lexer similar to the GUC one
 */
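/*
 * Illustrative example of a file this parser accepts (values are made up);
 * note that parameter values must be single-quoted:
 *
 *		standby_mode = 'on'
 *		primary_conninfo = 'host=primary port=5432 user=replication'
 *		recovery_target_timeline = 'latest'
 */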
void
XLogReadRecoveryCommandFile(int emode)
{
	FILE	   *fd;
	char		cmdline[MAXPGPATH];
	TimeLineID	rtli = 0;
	bool		rtliGiven = false;
	bool		syntaxError = false;

	fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
	if (fd == NULL)
	{
		if (errno == ENOENT)
			return;				/* not there, so no recovery in standby mode */
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not open recovery command file \"%s\": %m",
						RECOVERY_COMMAND_FILE)));
	}

	ereport(emode,
			(errmsg("found recovery.conf file, checking parameters "
					"for recovery in standby mode")));
	/*
	 * Parse the file...
	 */
	while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
	{
		/* skip leading whitespace and check for # comment */
		char	   *ptr;
		char	   *tok1;
		char	   *tok2;

		for (ptr = cmdline; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
				break;
		}
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* identify the quoted parameter value */
		tok1 = strtok(ptr, "'");
		if (!tok1)
		{
			syntaxError = true;
			break;
		}
B
Bruce Momjian 已提交
6273
		tok2 = strtok(NULL, "'");
6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286
		if (!tok2)
		{
			syntaxError = true;
			break;
		}
		/* reparse to get just the parameter name */
		tok1 = strtok(ptr, " \t=");
		if (!tok1)
		{
			syntaxError = true;
			break;
		}

		if (strcmp(tok1, "primary_conninfo") == 0)
		{
			PrimaryConnInfo = pstrdup(tok2);
			ereport(emode,
					(errmsg("primary_conninfo = \"%s\"",
							PrimaryConnInfo)));
		}
		else if (strcmp(tok1, "recovery_end_command") == 0)
		{
			recoveryEndCommand = pstrdup(tok2);
			ereport(LOG,
					(errmsg("recovery_end_command = '%s'",
							recoveryEndCommand)));
		}
		else if (strcmp(tok1, "recovery_target_timeline") == 0)
		{
			rtliGiven = true;
			if (strcmp(tok2, "latest") == 0)
				rtli = 0;
			else
			{
				errno = 0;
				rtli = (TimeLineID) strtoul(tok2, NULL, 0);
				if (errno == EINVAL || errno == ERANGE)
					ereport(FATAL,
							(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
									tok2)));
			}
			if (rtli)
				ereport(LOG,
						(errmsg("recovery_target_timeline = %u", rtli)));
			else
				ereport(LOG,
						(errmsg("recovery_target_timeline = latest")));
		}
		else if (strcmp(tok1, "recovery_target_xid") == 0)
		{
			errno = 0;
			recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				ereport(FATAL,
				 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
						 tok2)));
			ereport(LOG,
					(errmsg("recovery_target_xid = %u",
							recoveryTargetXid)));
			recoveryTarget = true;
			recoveryTargetExact = true;
		}
		else if (strcmp(tok1, "recovery_target_time") == 0)
		{
			/*
			 * if recovery_target_xid specified, then this overrides
			 * recovery_target_time
			 */
			if (recoveryTargetExact)
				continue;
			recoveryTarget = true;
			recoveryTargetExact = false;

			/*
			 * Convert the time string given by the user to TimestampTz form.
			 */
			recoveryTargetTime =
				DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
														CStringGetDatum(tok2),
												ObjectIdGetDatum(InvalidOid),
														Int32GetDatum(-1)));
			ereport(LOG,
					(errmsg("recovery_target_time = '%s'",
							timestamptz_to_str(recoveryTargetTime))));
		}
		else if (strcmp(tok1, "recovery_target_inclusive") == 0)
		{
			/*
			 * does nothing if a recovery_target is not also set
			 */
			if (!parse_bool(tok2, &recoveryTargetInclusive))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
			ereport(LOG,
					(errmsg("recovery_target_inclusive = %s", tok2)));
		}
		else if (strcmp(tok1, "standby_mode") == 0)
		{
			if (!parse_bool(tok2, &StandbyModeRequested))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"standby_mode\" requires a Boolean value")));
		}
		else
			ereport(FATAL,
					(errmsg("unrecognized recovery parameter \"%s\"",
							tok1)));
	}

	FreeFile(fd);

	if (syntaxError)
		ereport(FATAL,
				(errmsg("syntax error in recovery command file: %s",
						cmdline),
			  errhint("Lines should have the format parameter = 'value'.")));

	/*
	 * Check for compulsory parameters
	 */
	if (StandbyModeRequested)
	{
		if (PrimaryConnInfo == NULL)
			ereport(FATAL,
					(errmsg("recovery command file \"%s\" primary_conninfo not specified",
							RECOVERY_COMMAND_FILE),
					 errhint("The database server in standby mode needs primary_conninfo to connect to the primary.")));
	}
	else
	{
		/* Currently, standby mode request is a must if recovery.conf file exists */
		ereport(FATAL,
				(errmsg("recovery command file \"%s\" request for standby mode not specified",
						RECOVERY_COMMAND_FILE)));
	}
}

/*
 * Exit archive-recovery state
 */
static void
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
B
	char		xlogpath[MAXPGPATH];
6420
	char		*xlogDir = NULL;
6421
	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
6422 6423

	/*
6424
	 * We are no longer in archive recovery state.
6425 6426 6427
	 */
	InArchiveRecovery = false;

6428 6429 6430 6431
	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
6432 6433

	/*
	 * We should have the ending log segment currently open.  Verify, and then
	 * close it (to avoid problems on Windows with trying to rename or delete
	 * an open file).
6437 6438 6439 6440 6441 6442 6443 6444 6445
	 */
	Assert(readFile >= 0);
	Assert(readId == endLogId);
	Assert(readSeg == endLogSeg);

	close(readFile);
	readFile = -1;

	/*
	 * If the segment was fetched from archival storage, we want to replace
	 * the existing xlog segment (if any) with the archival version.  This is
	 * because whatever is in XLOGDIR is very possibly older than what we have
	 * from the archives, since it could have come from restoring a PGDATA
	 * backup.	In any case, the archival version certainly is more
	 * descriptive of what our current database state is, because that is what
	 * we replayed from.
6453
	 *
6454 6455
	 * Note that if we are establishing a new timeline, ThisTimeLineID is
	 * already set to the new value, and so we will create a new file instead
6456 6457
	 * of overwriting any existing file.  (This is, in fact, always the case
	 * at present.)
6458
	 */
6459 6460 6461 6462 6463
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYXLOG", xlogDir) > MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate path %s/RECOVERYXLOG", xlogDir)));	
	}
6464
	XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
6465 6466 6467 6468 6469 6470 6471 6472 6473 6474

	if (restoredFromArchive)
	{
		ereport(DEBUG3,
				(errmsg_internal("moving last restored xlog to \"%s\"",
								 xlogpath)));
		unlink(xlogpath);		/* might or might not exist */
		if (rename(recoveryPath, xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
6475
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
6476 6477 6478 6479 6480 6481 6482 6483 6484 6485
							recoveryPath, xlogpath)));
		/* XXX might we need to fix permissions on the file? */
	}
	else
	{
		/*
		 * If the latest segment is not archival, but there's still a
		 * RECOVERYXLOG laying about, get rid of it.
		 */
		unlink(recoveryPath);	/* ignore any error */

6487
		/*
		 * If we are establishing a new timeline, we have to copy data from
		 * the last WAL segment of the old timeline to create a starting WAL
		 * segment for the new timeline.
6491 6492 6493 6494
		 *
		 * Notify the archiver that the last WAL segment of the old timeline
		 * is ready to copy to archival storage. Otherwise, it is not archived
		 * for a while.
6495 6496
		 */
		if (endTLI != ThisTimeLineID)
6497
		{
6498 6499
			XLogFileCopy(endLogId, endLogSeg,
						 endTLI, endLogId, endLogSeg);
6500 6501 6502 6503 6504 6505 6506

			if (XLogArchivingActive())
			{
				XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
				XLogArchiveNotify(xlogpath);
			}
		}
6507 6508 6509
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
6512
	 */
6513 6514
	XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
	XLogArchiveCleanup(xlogpath);
6515

6516
	/* Get rid of any remaining recovered timeline-history file, too */
6517 6518 6519 6520
	if (snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYHISTORY", xlogDir) > MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate path %s/RECOVERYHISTORY", xlogDir)));
	}
	unlink(recoveryPath);		/* ignore any error */
6522 6523

	/*
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
6526
	 */
6527 6528
	unlink(RECOVERY_COMMAND_DONE);
	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
6529 6530
		ereport(FATAL,
				(errcode_for_file_access(),
6531
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
6532
						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
6533

6534
	pfree(xlogDir);
}

/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG at or after the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 * *includeThis is set TRUE if we should apply this record before stopping.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT record
 * in recoveryLastXTime, for logging purposes.
 * Also, some information is saved in recoveryStopXid et al for use in
 * annotating the new timeline's history file.
 */
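/*
 * Illustrative example (editorial, not part of the original source): with
 * recovery_target_time = '2009-06-01 12:00:00' and commit records stamped
 * 11:59:00, 12:00:00 and 12:01:00, the inclusive mode below stops when it
 * sees the 12:01:00 record (so the 12:00:00 commit is still applied), while
 * the exclusive mode stops at the 12:00:00 record itself without applying
 * it.  The timestamps are placeholders; the comparison rules come from the
 * code below.
 */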
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
	bool		stopsHere;
	uint8		record_info;
	TimestampTz recordXtime;
6555 6556 6557 6558 6559 6560 6561

	/* We only consider stopping at COMMIT or ABORT records */
	if (record->xl_rmid != RM_XACT_ID)
		return false;
	record_info = record->xl_info & ~XLR_INFO_MASK;
	if (record_info == XLOG_XACT_COMMIT)
	{
		xl_xact_commit *recordXactCommitData;
6563 6564

		recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
6565
		recordXtime = recordXactCommitData->xact_time;
6566 6567 6568
	}
	else if (record_info == XLOG_XACT_ABORT)
	{
		xl_xact_abort *recordXactAbortData;
6570 6571

		recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
6572
		recordXtime = recordXactAbortData->xact_time;
6573 6574 6575 6576
	}
	else
		return false;

6577 6578
	/* Do we have a PITR target at all? */
	if (!recoveryTarget)
6579 6580
	{
		recoveryLastXTime = recordXtime;
6581
		return false;
6582
	}
6583

6584 6585 6586
	if (recoveryTargetExact)
	{
		/*
		 * there can be only one transaction end record with this exact
		 * transactionid
6589
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
6594 6595 6596 6597 6598 6599 6600 6601
		 */
		stopsHere = (record->xl_xid == recoveryTargetXid);
		if (stopsHere)
			*includeThis = recoveryTargetInclusive;
	}
	else
	{
		/*
		 * there can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
		if (stopsHere)
			*includeThis = false;
	}

	if (stopsHere)
	{
6616 6617 6618 6619
		recoveryStopXid = record->xl_xid;
		recoveryStopTime = recordXtime;
		recoveryStopAfter = *includeThis;

6620 6621
		if (record_info == XLOG_XACT_COMMIT)
		{
6622
			if (recoveryStopAfter)
6623 6624
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
6625 6626
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
6627 6628 6629
			else
				ereport(LOG,
						(errmsg("recovery stopping before commit of transaction %u, time %s",
6630 6631
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
6632 6633 6634
		}
		else
		{
6635
			if (recoveryStopAfter)
6636 6637
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
6638 6639
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
6640 6641 6642
			else
				ereport(LOG,
						(errmsg("recovery stopping before abort of transaction %u, time %s",
6643 6644
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
6645
		}
6646

6647 6648
		if (recoveryStopAfter)
			recoveryLastXTime = recordXtime;
6649
	}
6650 6651
	else
		recoveryLastXTime = recordXtime;
6652 6653 6654 6655

	return stopsHere;
}

/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
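/*
 * Illustrative usage (editorial): a reader of this value is expected to take
 * the same spinlock, mirroring the writer below, e.g.
 *
 *     SpinLockAcquire(&xlogctl->info_lck);
 *     xtime = xlogctl->currentChunkStartTime;
 *     SpinLockRelease(&xlogctl->info_lck);
 */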
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->currentChunkStartTime = xtime;
	SpinLockRelease(&xlogctl->info_lck);
}

static void
printEndOfXLogFile(XLogRecPtr	*loc)
{
	uint32 seg = loc->xrecoff / XLogSegSize;

	XLogRecPtr roundedDownLoc;

	XLogRecord *record;
	XLogRecPtr	LastRec;

	/*
	 * Go back to the beginning of the log file and read forward to find
	 * the end of the transaction log.
	 */
	roundedDownLoc.xlogid = loc->xlogid;
	roundedDownLoc.xrecoff = (seg * XLogSegSize) + SizeOfXLogLongPHD;

	XLogCloseReadRecord();

	record = XLogReadRecord(&roundedDownLoc, false, LOG);
	if (record == NULL)
	{
		elog(LOG,"Couldn't read transaction log file (logid %d, seg %d)",
			 loc->xlogid, seg);
		return;
	}

	do
	{
		LastRec = ReadRecPtr;

		record = XLogReadRecord(NULL, false, DEBUG5);
	} while (record != NULL);

	record = XLogReadRecord(&LastRec, false, ERROR);

	elog(LOG,"found end of transaction log file %s",
		 XLogLocationToString_Long(&EndRecPtr));

	XLogCloseReadRecord();
}

static void
StartupXLOG_InProduction(bool bgwriterLaunched)
{
}

static void
ApplyStartupRedo(
	XLogRecPtr		*beginLoc,

	XLogRecPtr		*lsn,

	XLogRecord		*record)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	RedoErrorCallBack redoErrorCallBack;

	ErrorContextCallback errcontext;

	/* Setup error traceback support for ereport() */
	redoErrorCallBack.location = *beginLoc;
	redoErrorCallBack.record = record;

	errcontext.callback = rm_redo_error_callback;
	errcontext.arg = (void *) &redoErrorCallBack;
	errcontext.previous = error_context_stack;
	error_context_stack = &errcontext;

	/* nextXid must be beyond record's xid */
	if (TransactionIdFollowsOrEquals(record->xl_xid,
									 ShmemVariableCache->nextXid))
	{
		ShmemVariableCache->nextXid = record->xl_xid;
		TransactionIdAdvance(ShmemVariableCache->nextXid);
	}

	/*
	 * Update shared replayEndRecPtr before replaying this record,
	 * so that XLogFlush will update minRecoveryPoint correctly.
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->replayEndRecPtr = EndRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	RmgrTable[record->xl_rmid].rm_redo(*beginLoc, *lsn, record);

	/*
	 * After redo, check whether the backup pages associated with
	 * the WAL record are consistent with the existing pages. This
	 * check is done only if consistency check is enabled for this
	 * record.
	 */
	if ((record->xl_extended_info & XLR_CHECK_CONSISTENCY) != 0)
		checkXLogConsistency(record, *lsn);

	/* Pop the error context stack */
	error_context_stack = errcontext.previous;

}

/*
 * Process passed checkpoint record either during normal recovery or
 * in standby mode.
 *
 * If in standby mode, master mirroring information stored by the checkpoint
 * record is processed as well.
 */
static void
XLogProcessCheckpointRecord(XLogRecord *rec, XLogRecPtr loc)
{
	CheckpointExtendedRecord ckptExtended;

	UnpackCheckPointRecord(rec, &ckptExtended);

	if (ckptExtended.dtxCheckpoint)
	{
		/* Handle the DTX information. */
		UtilityModeFindOrCreateDtmRedoFile();
		redoDtxCheckPoint(ckptExtended.dtxCheckpoint);
		UtilityModeCloseDtmRedoFile();
	}
}


/*
 * This must be called ONCE during postmaster or standalone-backend startup
 */
void
StartupXLOG(void)
{
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
	bool		wasShutdown;
	bool		reachedStopPoint = false;
	bool		haveBackupLabel = false;
	XLogRecPtr	RecPtr,
				LastRec,
				checkPointLoc,
				EndOfLog;
	uint32		endLogId;
	uint32		endLogSeg;
	XLogRecord *record;
	uint32		freespace;
	bool		multipleRecoveryPassesNeeded = false;
	bool		backupEndRequired = false;
6819
	bool		bgwriterLaunched = false;
6820 6821 6822 6823 6824 6825 6826 6827 6828

	/*
	 * Read control file and check XLOG status looks valid.
	 *
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
	 */
	ReadControlFile();

6829
	if (ControlFile->state < DB_SHUTDOWNED ||
6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("control file contains invalid data")));

	if (ControlFile->state == DB_SHUTDOWNED)
		ereport(LOG,
				(errmsg("database system was shut down at %s",
						str_time(ControlFile->time))));
	else if (ControlFile->state == DB_SHUTDOWNING)
		ereport(LOG,
6841
				(errmsg("database system shutdown was interrupted; last known up at %s",
6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872
						str_time(ControlFile->time))));
	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
		ereport(LOG,
		   (errmsg("database system was interrupted while in recovery at %s",
				   str_time(ControlFile->time)),
			errhint("This probably means that some data is corrupted and"
					" you will have to use the last backup for recovery."),
			errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_MODE)
		ereport(LOG,
				(errmsg("database system was interrupted while in standby mode at  %s",
						str_time(ControlFile->checkPointCopy.time)),
						errhint("This probably means something unexpected happened either"
								" during replay at standby or receipt of XLog from primary."),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_PROMOTED)
		ereport(LOG,
				(errmsg("database system was interrupted after standby was promoted at %s",
						str_time(ControlFile->checkPointCopy.time)),
				 errhint("If this has occurred more than once something unexpected is happening"
				" after standby has been promoted"),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_NEW_TLI_SET)
		ereport(LOG,
				(errmsg("database system was interrupted post new TLI was setup on standby promotion at %s",
						str_time(ControlFile->checkPointCopy.time)),
						 errhint("If this has occurred more than once something unexpected is happening"
						" after standby has been promoted and new TLI has been set"),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_PRODUCTION)
		ereport(LOG,
6873
				(errmsg("database system was interrupted; last known up at %s",
6874 6875 6876 6877 6878 6879 6880 6881
						str_time(ControlFile->time))));

	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
	if (ControlFile->state != DB_SHUTDOWNED)
		pg_usleep(60000000L);
#endif

	/*
	 * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
	 * someone has performed a copy for PITR, these directories may have been
	 * excluded and need to be re-created.
	 */
	ValidateXLOGDirectoryStructure();

	/*
	 * Clear out any old relcache cache files.	This is *necessary* if we do
	 * any WAL replay, since that would probably result in the cache files
	 * being out of sync with database reality.  In theory we could leave them
	 * in place if the database had been cleanly shut down, but it seems
	 * safest to just remove them always and let them be rebuilt during the
	 * first backend startup.
	 */
	RelationCacheInitFileRemove();

	/*
	 * Initialize on the assumption we want to recover to the same timeline
	 * that's active according to pg_control.
	 */
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

	/*
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
	 */
	XLogReadRecoveryCommandFile(LOG);
6910

6911
	if (StandbyModeRequested)
	{
6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956
		Assert(ControlFile->state != DB_IN_CRASH_RECOVERY
				&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET);

		/*
		 * If the standby was promoted (last time) and recovery.conf
		 * is still found this time with standby mode request,
		 * it means the standby crashed post promotion but before recovery.conf
		 * cleanup. Hence, it is not considered a standby request this time.
		 */
		if (ControlFile->state == DB_IN_STANDBY_PROMOTED)
			StandbyModeRequested = false;
	}

	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(recoveryTargetTLI);

	/*
	 * If pg_control's timeline is not in expectedTLIs, then we cannot
	 * proceed: the backup is not part of the history of the requested
	 * timeline.
	 */
	if (!list_member_int(expectedTLIs,
						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
		ereport(FATAL,
				(errmsg("requested timeline %u is not a child of database system timeline %u",
						recoveryTargetTLI,
						ControlFile->checkPointCopy.ThisTimeLineID)));
	/*
	 * Save the selected recovery target timeline ID in shared memory so that
	 * other processes can see them
	 */
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;

	if (StandbyModeRequested)
		ereport(LOG,
				(errmsg("entering standby mode")));

	/*
	 * Take ownership of the wakeup latch if we're going to sleep during
	 * recovery.
	 */
	if (StandbyModeRequested)
		OwnLatch(&XLogCtl->recoveryWakeupLatch);

6957 6958 6959 6960 6961 6962 6963
	/*
	 * Allocate pages dedicated to WAL consistency checks, those had better
	 * be aligned.
	 */
	replay_image_masked = (char *) palloc(BLCKSZ);
	master_image_masked = (char *) palloc(BLCKSZ);

6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979
	if (read_backup_label(&checkPointLoc, &backupEndRequired))
	{
		/*
		 * Currently, it is assumed that a backup file exists iff a base backup
		 * has been performed and then the recovery.conf file is generated, thus
		 * standby mode has to be requested
		 */
		if (!StandbyModeRequested)
			ereport(FATAL,
					(errmsg("Found backup_label file without any standby mode request")));

		/* Activate recovery in standby mode */
		StandbyMode = true;

		Assert(backupEndRequired);

6980
		/*
		 * When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control.
6983
		 */
6984
		record = ReadCheckpointRecord(checkPointLoc, 0);
6985 6986
		if (record != NULL)
		{
6987 6988
			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
			wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6989
			ereport(DEBUG1,
6990
					(errmsg("checkpoint record is at %X/%X",
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6992
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
6993 6994

			/*
6995 6996 6997 6998
			 * Make sure that REDO location exists. This may not be
			 * the case if there was a crash during an online backup,
			 * which left a backup_label around that references a WAL
			 * segment that's already been archived.
6999 7000 7001 7002 7003 7004 7005 7006
			 */
			if (XLByteLT(checkPoint.redo, checkPointLoc))
			{
				if (!XLogReadRecord(&(checkPoint.redo), false, LOG))
					ereport(FATAL,
							(errmsg("could not find redo location referenced by checkpoint record"),
							 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
			}
7007 7008 7009
		}
		else
		{
7010
			ereport(FATAL,
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
7013
			wasShutdown = false; /* keep compiler quiet */
7014
		}
7015 7016
		/* set flag to delete it later */
		haveBackupLabel = true;
	}
	else
	{
7020 7021 7022 7023 7024 7025
		if (StandbyModeRequested)
		{
			/* Activate recovery in standby mode */
			StandbyMode = true;
		}

7026
		/*
		 * Get the last valid checkpoint record.  If the latest one according
		 * to pg_control is broken, try the next-to-last one.
7029 7030
		 */
		checkPointLoc = ControlFile->checkPoint;
7031 7032
		RedoStartLSN = ControlFile->checkPointCopy.redo;

7033
		record = ReadCheckpointRecord(checkPointLoc, 1);
		if (record != NULL)
		{
7036
			ereport(DEBUG1,
7037
					(errmsg("checkpoint record is at %X/%X",
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
		}
7040 7041 7042 7043 7044 7045 7046 7047 7048
		else if (StandbyMode)
		{
			/*
			 * The last valid checkpoint record required for a streaming
			 * recovery exists in neither standby nor the primary.
			 */
			ereport(PANIC,
					(errmsg("could not locate a valid checkpoint record")));
		}
		else
7050
		{
7051 7052
			printEndOfXLogFile(&checkPointLoc);

7053
			checkPointLoc = ControlFile->prevCheckPoint;
7054
			record = ReadCheckpointRecord(checkPointLoc, 2);
7055 7056 7057
			if (record != NULL)
			{
				ereport(LOG,
						(errmsg("using previous checkpoint record at %X/%X",
							  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
				InRecovery = true;		/* force recovery even if SHUTDOWNED */
7061 7062
			}
			else
7063 7064
			{
				printEndOfXLogFile(&checkPointLoc);
7065
				ereport(PANIC,
					 (errmsg("could not locate a valid checkpoint record")));
7067
			}
7068
		}
7069 7070
		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
	}
7072

	LastRec = RecPtr = checkPointLoc;
7074 7075 7076 7077 7078 7079 7080 7081 7082
	XLogCtl->pass1LastCheckpointLoc = checkPointLoc;

	/*
	 * Currently, standby mode (WAL based replication support) is not provided
	 * to segments.
	 * Hence it's okay to do the following only once on the segments as there
	 * will be only one checkpoint to be analyzed.
	 */
	if (GpIdentity.segindex != MASTER_CONTENT_ID)
7083 7084 7085 7086 7087 7088
	{
		CheckpointExtendedRecord ckptExtended;
		UnpackCheckPointRecord(record, &ckptExtended);
		if (ckptExtended.ptas)
			SetupCheckpointPreparedTransactionList(ckptExtended.ptas);
	}
7089 7090 7091 7092 7093 7094

	/*
	 * Find Xacts that are distributed committed from the checkpoint record and
	 * store them such that they can utilized later during DTM recovery.
	 */
	XLogProcessCheckpointRecord(record, checkPointLoc);
7095

7096
	ereport(DEBUG1,
			(errmsg("redo record is at %X/%X; shutdown %s",
					checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
					wasShutdown ? "TRUE" : "FALSE")));
7100
	ereport(DEBUG1,
7101
			(errmsg("next transaction ID: %u/%u; next OID: %u; next relfilenode: %u",
7102
					checkPoint.nextXidEpoch, checkPoint.nextXid,
7103
					checkPoint.nextOid, checkPoint.nextRelfilenode)));
7104
	ereport(DEBUG1,
7105 7106
			(errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
					checkPoint.nextMulti, checkPoint.nextMultiOffset)));
7107

7108
	if (!TransactionIdIsNormal(checkPoint.nextXid))
7109
		ereport(PANIC,
7110
				(errmsg("invalid next transaction ID")));
7111

7112
	/* initialize shared memory variables from the checkpoint record */
7113 7114
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
7115
	ShmemVariableCache->oidCount = 0;
7116 7117
	ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
	ShmemVariableCache->relfilenodeCount = 0;
7118
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
7119 7120
	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
	XLogCtl->ckptXid = checkPoint.nextXid;
7121

7122
	/*
	 * We must replay WAL entries using the same TimeLineID they were created
	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
	 * also xlog_redo()).
7126
	 */
7127
	ThisTimeLineID = checkPoint.ThisTimeLineID;
7128

7129
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

7131
	if (XLByteLT(RecPtr, checkPoint.redo))
7132 7133
		ereport(PANIC,
				(errmsg("invalid redo in checkpoint record")));
7134

7135
	/*
	 * Check whether we need to force recovery from WAL.  If it appears to
	 * have been a clean shutdown and we did not have a recovery.conf file,
	 * then assume no recovery needed.
7139
	 */
7140
	if (XLByteLT(checkPoint.redo, RecPtr))
7141
	{
		if (wasShutdown)
7143
			ereport(PANIC,
					(errmsg("invalid redo record in shutdown checkpoint")));
		InRecovery = true;
7146
	}
7147
	else if (StandbyModeRequested)
7148 7149
	{
		/* force recovery due to presence of recovery.conf */
7150 7151
		ereport(LOG,
				(errmsg("setting recovery standby mode active")));
7152 7153
		InRecovery = true;
	}
7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167
	else if (ControlFile->state != DB_SHUTDOWNED)
		InRecovery = true;

	if (InRecovery && !IsUnderPostmaster)
	{
		ereport(FATAL,
				(errmsg("Database must be shutdown cleanly when using single backend start")));
	}

	if (InRecovery && gp_before_persistence_work)
	{
		ereport(FATAL,
				(errmsg("Database must be shutdown cleanly when using gp_before_persistence_work = on")));
	}
7168

7169
	/* Recovery from xlog */
7170
	if (InRecovery)
7171
	{
		int			rmid;
7173

7174 7175 7176
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

7177
		/*
		 * Update pg_control to show that we are recovering and to show the
		 * selected checkpoint as the place we are starting from. We also mark
7180
		 * pg_control with any minimum recovery stop point
7181
		 */
7182
		if (StandbyMode)
7183
		{
7184
			ereport(LOG,
7185 7186
					(errmsg("recovery in standby mode in progress")));
			ControlFile->state = DB_IN_STANDBY_MODE;
7187
		}
7188
		else
7189
		{
7190
			ereport(LOG,
7191 7192
					(errmsg("database system was not properly shut down; "
							"automatic recovery in progress")));
7193 7194 7195 7196

			if (ControlFile->state != DB_IN_STANDBY_PROMOTED
				&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
				ControlFile->state = DB_IN_CRASH_RECOVERY;
7197
		}
7198

7199 7200 7201
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = checkPointLoc;
		ControlFile->checkPointCopy = checkPoint;
7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217

		if (StandbyMode)
		{
			/* initialize minRecoveryPoint if not set yet */
			if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
				ControlFile->minRecoveryPoint = checkPoint.redo;
		}

		/* Set backupStartPoint if we're starting recovery from a base backup. */
		if (haveBackupLabel)
		{
			Assert(ControlFile->state == DB_IN_STANDBY_MODE);
			ControlFile->backupStartPoint = checkPoint.redo;
			ControlFile->backupEndRequired = backupEndRequired;
		}

7218
		ControlFile->time = (pg_time_t) time(NULL);
7219
		/* No need to hold ControlFileLock yet, we aren't up far enough */
7220 7221
		UpdateControlFile();

7222 7223
		/* initialize our local copy of minRecoveryPoint */
		minRecoveryPoint = ControlFile->minRecoveryPoint;
7224

7225 7226 7227
		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
7228 7229
		pgstat_reset_all();

7230
		/*
		 * If there was a backup label file, it's done its job and the info
		 * has now been propagated into pg_control.  We must get rid of the
		 * label file so that if we crash during recovery, we'll pick up at
		 * the latest recovery restartpoint instead of going all the way back
		 * to the backup start point.  It seems prudent though to just rename
		 * the file out of the way rather than delete it completely.
7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247
		 */
		if (haveBackupLabel)
		{
			unlink(BACKUP_LABEL_OLD);
			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not rename file \"%s\" to \"%s\": %m",
								BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		}

7248 7249
		UtilityModeFindOrCreateDtmRedoFile();

7250
		/* Initialize resource managers */
7251 7252 7253 7254 7255 7256
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

7257
		/*
7258 7259 7260
		 * Initialize shared variables for tracking progress of WAL replay,
		 * as if we had just replayed the record before the REDO location
		 * (or the checkpoint record itself, if it's a shutdown checkpoint).
7261 7262
		 */
		SpinLockAcquire(&xlogctl->info_lck);
7263 7264 7265 7266 7267
		if (XLByteLT(checkPoint.redo, RecPtr))
			xlogctl->replayEndRecPtr = checkPoint.redo;
		else
			xlogctl->replayEndRecPtr = EndRecPtr;
		xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
7268 7269 7270 7271 7272 7273 7274 7275
		xlogctl->currentChunkStartTime = 0;
		SpinLockRelease(&xlogctl->info_lck);

		/* Also ensure XLogReceiptTime has a sane value */
		XLogReceiptTime = GetCurrentTimestamp();

		/*
		 * Find the first record that logically follows the checkpoint --- it
B
7277
		 */
7278
		if (XLByteLT(checkPoint.redo, RecPtr))
7279 7280
		{
			/* back up to find the record */
7281
			record = XLogReadRecord(&(checkPoint.redo), false, PANIC);
7282
		}
B
7284
		{
7285
			/* just have to read next record after CheckPoint */
7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301
			record = XLogReadRecord(NULL, false, LOG);
		}

		/*
		 * In case where its not a clean shutdown but it doesn't have a record
		 * following the checkpoint record, just proceed with the Pass 2, 3, 4
		 * to clear any inconsistent entries in Persistent Tables without
		 * doing the whole redo loop below.
		 */
		if (record == NULL)	
		{
			/*
			 * There are no WAL records following the checkpoint
			 */
			ereport(LOG,
					(errmsg("no record for redo after checkpoint, skip redo and proceed for recovery pass")));
7302
		}
7303

7304 7305 7306
		/*
		 * main redo apply loop, executed if we have record after checkpoint
		 */
T
Tom Lane 已提交
7307
		if (record != NULL)
7308
		{
7309 7310
			bool		recoveryContinue = true;
			bool		recoveryApply = true;
7311
			bool		reachedMinRecoveryPoint = false;
7312

7313
			InRedo = true;
7314

7315 7316
			if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
				ereport(LOG,
7317 7318
						(errmsg("redo starts at %X/%X",
								ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
7319 7320 7321 7322 7323
			else
				ereport(LOG,
						(errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
								ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
7324

7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343
			/*
			 * Let postmaster know we've started redo now, so that it can
			 * launch bgwriter to perform restartpoints.  We don't bother
			 * during crash recovery as restartpoints can only be performed
			 * during archive recovery.  And we'd like to keep crash recovery
			 * simple, to avoid introducing bugs that could you from
			 * recovering after crash.
			 *
			 * After this point, we can no longer assume that we're the only
			 * process in addition to postmaster!  Also, fsync requests are
			 * subsequently to be handled by the bgwriter, not locally.
			 */
			if (InArchiveRecovery && IsUnderPostmaster)
			{
				SetForwardFsyncRequests();
				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
				bgwriterLaunched = true;
			}

7344 7345 7346
			/*
			 * main redo apply loop
			 */
7347 7348
			do
			{
7349
				HandleStartupProcInterrupts();

7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383
				/*
				 * Check if we were requested to re-read config file.
				 */
				if (got_SIGHUP)
				{
					got_SIGHUP = false;
					ProcessConfigFile(PGC_SIGHUP);
				}

				/*
				 * Check if we were requested to exit without finishing
				 * recovery.
				 */
				if (shutdown_requested)
					proc_exit(1);

				/*
				 * Have we passed our safe starting point? If so, we can tell
				 * postmaster that the database is consistent now.
				 */
				if (!reachedMinRecoveryPoint &&
					XLByteLT(minRecoveryPoint, EndRecPtr))
				{
					reachedMinRecoveryPoint = true;
					if (InArchiveRecovery)
					{
						ereport(LOG,
							  (errmsg("consistent recovery state reached")));
						if (IsUnderPostmaster)
							SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
					}
				}

7384 7385 7386 7387 7388
				/*
				 * Have we reached our recovery target?
				 */
				if (recoveryStopsHere(record, &recoveryApply))
				{
					reachedStopPoint = true;	/* see below */
7390 7391 7392 7393 7394
					recoveryContinue = false;
					if (!recoveryApply)
						break;
				}

				/*
				 * See if this record is a checkpoint; if so, unpack it to
				 * find distributed committed Xacts.  There is no need to
				 * unpack checkpoints in crash recovery mode.
				 */
				uint8 xlogRecInfo = record->xl_info & ~XLR_INFO_MASK;

				if (IsStandbyMode() &&
					record->xl_rmid == RM_XLOG_ID &&
					(xlogRecInfo == XLOG_CHECKPOINT_SHUTDOWN
					 || xlogRecInfo == XLOG_CHECKPOINT_ONLINE))
7406
				{
7407 7408 7409
					XLogProcessCheckpointRecord(record, ReadRecPtr);
					memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
				}
7410

7411 7412 7413 7414 7415 7416 7417
				/*
				 * Update shared replayEndRecPtr before replaying this record,
				 * so that XLogFlush will update minRecoveryPoint correctly.
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->replayEndRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);
7418

7419
				ApplyStartupRedo(&ReadRecPtr, &EndRecPtr, record);
7420

7421 7422 7423 7424 7425 7426 7427
				/*
				 * Update lastReplayedEndRecPtr after this record has been
				 * successfully replayed.
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->lastReplayedEndRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);
7428

7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441
				/*
				 * GPDB_84_MERGE_FIXME: Create restartpoints aggressively.
				 *
				 * In PostgreSQL, the bgwriter creates restartpoints during archive
				 * recovery at its own leisure. In GPDB, with WAL replication based
				 * mirroring, that was tripping the gp_replica_check checks, because
				 * it bypasses the shared buffer cache and reads directly from disk.
				 * For now, restore the old behavior, before the upstream change
				 * to start bgwriter during archive recovery, and create a
				 * restartpoint immediately after replaying a checkpoint record.
				 */
				{
					uint8 xlogRecInfo = record->xl_info & ~XLR_INFO_MASK;
7442

7443 7444 7445
					if (record->xl_rmid == RM_XLOG_ID &&
						(xlogRecInfo == XLOG_CHECKPOINT_SHUTDOWN ||
						 xlogRecInfo == XLOG_CHECKPOINT_ONLINE))
7446
					{
7447 7448 7449 7450
						if (bgwriterLaunched)
							RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
						else
							CreateRestartPoint(CHECKPOINT_IMMEDIATE);
7451
					}
7452
				}
7453 7454 7455

				LastRec = ReadRecPtr;

7456 7457
				record = XLogReadRecord(NULL, false, LOG);
			} while (record != NULL && recoveryContinue);
7458

7459 7460 7461
			ereport(LOG,
					(errmsg("redo done at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
7462

7463 7464
			if (recoveryLastXTime)
				ereport(LOG,
					 (errmsg("last completed transaction was at log time %s",
							 timestamptz_to_str(recoveryLastXTime))));
			InRedo = false;
7468
		}
7469 7470 7471
		/*
		 * end of main redo apply loop
		 */
	}

	/*
	 * Kill WAL receiver, if it's still running, before we continue to write
	 * the startup checkpoint record. If it were still alive when we start
	 * writing WAL, it could overwrite the checkpoint and subsequent records.
	 */
	ShutdownWalRcv();

	/*
7482 7483 7484 7485 7486 7487 7488 7489
	 * We don't need the latch anymore. It's not strictly necessary to disown
	 * it, but let's do it for the sake of tidiness.
	 */
	if (StandbyModeRequested)
		DisownLatch(&XLogCtl->recoveryWakeupLatch);

	/*
	 * We are now done reading the xlog from stream.
7490
	 */
7491
	if (StandbyMode)
7492
	{
7493 7494 7495 7496 7497
		Assert(ControlFile->state == DB_IN_STANDBY_MODE);
		StandbyMode = false;

		/* Transition to promoted mode */
		ControlFile->state = DB_IN_STANDBY_PROMOTED;
7498
		ControlFile->time = (pg_time_t) time(NULL);
7499
		UpdateControlFile();
7500 7501
	}

7502
	/*
7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514
	 * Re-fetch the last valid or last applied record, so we can identify the
	 * exact endpoint of what we consider the valid portion of WAL.
	 */
	record = XLogReadRecord(&LastRec, false, PANIC);
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);

	elog(LOG,"end of transaction log location is %s",
		 XLogLocationToString(&EndOfLog));

	/*
	 * Complain if we did not roll forward far enough to render the backup
7515 7516 7517 7518
	 * dump consistent.  Note: it is indeed okay to look at the local variable
	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
	 * advanced beyond the WAL we processed.
7519
	 */
7520 7521 7522
	if (InRecovery &&
		(XLByteLT(EndOfLog, ControlFile->minRecoveryPoint) ||
		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7523
	{
7524 7525 7526
		if (reachedStopPoint)
		{
			/* stopped because of stop request */
7527
			ereport(FATAL,
7528
					(errmsg("requested recovery stop point is before consistent recovery point")));
7529
		}
7530

7531 7532 7533 7534 7535 7536 7537 7538 7539
		/*
		 * Ran off end of WAL before reaching end-of-backup WAL record, or
		 * minRecoveryPoint. That's usually a bad sign, indicating that you
		 * tried to recover from an online backup but never called
		 * pg_stop_backup(), or you didn't archive all the WAL up to that
		 * point. However, this also happens in crash recovery, if the system
		 * crashes while an online backup is in progress. We must not treat
		 * that as an error, or the database will refuse to start up.
		 */
7540
		// WALREP_FIXME: But we should probably do this check in standby mode, too
7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554
		if (StandbyModeRequested || ControlFile->backupEndRequired)
		{
			if (ControlFile->backupEndRequired)
				ereport(FATAL,
						(errmsg("WAL ends before end of online backup"),
						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
				ereport(FATAL,
						(errmsg("WAL ends before end of online backup"),
						 errhint("Online backup should be complete, and all WAL up to that point must be available at recovery.")));
			else
				ereport(FATAL,
					  (errmsg("WAL ends before consistent recovery point")));
		}
7555 7556
	}

7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581
	/*
	 * Consider whether we need to assign a new timeline ID.
	 *
	 * If we are doing an archive recovery, we always assign a new ID.	This
	 * handles a couple of issues.	If we stopped short of the end of WAL
	 * during recovery, then we are clearly generating a new timeline and must
	 * assign it a unique new ID.  Even if we ran to the end, modifying the
	 * current last segment is problematic because it may result in trying to
	 * overwrite an already-archived copy of that segment, and we encourage
	 * DBAs to make their archive_commands reject that.  We can dodge the
	 * problem by making the new active segment have a new timeline ID.
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
	 *
	 * GPDB: Greenplum doesn't support archive recovery.
	 */
	if (InArchiveRecovery)
	{
		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
		ereport(LOG,
				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
							 curFileTLI, endLogId, endLogSeg);
	}

7582 7583 7584
	/* Save the selected TimeLineID in shared memory, too */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;

7585 7586 7587 7588 7589 7590 7591 7592 7593
	/*
	 * We are now done reading the old WAL.  Turn off archive fetching if it
	 * was active, and make a writable copy of the last WAL segment. (Note
	 * that we also have a copy of the last block of the old WAL in readBuf;
	 * we will use that below.)
	 */
	if (InArchiveRecovery)
		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);

7594 7595 7596 7597 7598 7599 7600
	/*
	 * Prepare to write WAL starting at EndOfLog position, and init xlog
	 * buffer cache using the block containing the last record from the
	 * previous incarnation.
	 */
	openLogId = endLogId;
	openLogSeg = endLogSeg;
	XLogFileOpen(
			&mirroredLogFileOpen,
			openLogId,
			openLogSeg);
	openLogOff = 0;
	Insert = &XLogCtl->Insert;
	Insert->PrevRecord = LastRec;
	XLogCtl->xlblocks[0].xlogid = openLogId;
	XLogCtl->xlblocks[0].xrecoff =
		((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
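	/*
	 * Illustrative arithmetic (editorial, assuming the default XLOG_BLCKSZ of
	 * 8192): an EndOfLog.xrecoff of 12345 yields ((12345 - 1) / 8192 + 1) *
	 * 8192 = 16384, i.e. the end boundary of the page containing EndOfLog;
	 * an xrecoff already on a page boundary maps to itself.
	 */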

	/*
	 * Tricky point here: readBuf contains the *last* block that the LastRec
	 * record spans, not the one it starts in.	The last block is indeed the
	 * one we want to use.
	 */
	Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
	memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
	Insert->currpos = (char *) Insert->currpage +
		(EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
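	/*
	 * Continuing the illustrative numbers above (editorial): 12345 + 8192 -
	 * 16384 = 4153, which equals 12345 % 8192, so currpos ends up pointing at
	 * EndOfLog's byte offset within the copied page.
	 */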

	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;

	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;

	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;

	freespace = INSERT_FREESPACE(Insert);
	if (freespace > 0)
	{
		/* Make sure rest of page is zero */
		MemSet(Insert->currpos, 0, freespace);
		XLogCtl->Write.curridx = 0;
	}
	else
	{
		/*
		 * Whenever Write.LogwrtResult points to exactly the end of a page,
		 * Write.curridx must point to the *next* page (see XLogWrite()).
7643
		 *
		 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
		 * this is sufficient.	The first actual attempt to insert a log
7646
		 * record will advance the insert state.
7647 7648 7649 7650
		 */
		XLogCtl->Write.curridx = NextBufIdx(0);
	}

	if (InRecovery)
7652
	{
7653 7654 7655
		/*
		 * Close down Recovery for Startup PASS 1.
		 */
		int			rmid;
7657

7658 7659 7660 7661 7662 7663 7664
		/*
		 * Resource managers might need to write WAL records, eg, to record
		 * index cleanup actions.  So temporarily enable XLogInsertAllowed in
		 * this process only.
		 */
		LocalSetXLogInsertAllowed();

7665 7666 7667 7668 7669 7670 7671 7672 7673
		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

7674 7675 7676
		/* Disallow XLogInsert again */
		LocalXLogInsertAllowed = -1;

7677 7678 7679 7680 7681 7682
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

7683 7684 7685 7686 7687
		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
		pgstat_reset_all();

		/*
7689
		 * Perform a checkpoint to update all our recovery activity to disk.
7690
		 *
7691 7692 7693 7694 7695
		 * Note that we write a shutdown checkpoint rather than an on-line
		 * one. This is not particularly critical, but since we may be
		 * assigning a new TLI, using a shutdown checkpoint allows us to have
		 * the rule that TLI only changes in shutdown checkpoints, which
		 * allows some extra error checking in xlog_redo.
		 */
7697 7698 7699 7700 7701 7702
		if (bgwriterLaunched)
			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
							  CHECKPOINT_IMMEDIATE |
							  CHECKPOINT_WAIT);
		else
			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7703

7704
		UtilityModeCloseDtmRedoFile();
7705
	}
7706

	/*
	 * Preallocate additional log files, if wanted.
	 */
7710
	PreallocXlogFiles(EndOfLog);
7711

7712
	/*
7713
	 * Okay, we're officially UP.
7714
	 */
	InRecovery = false;
7716

7717
	/* start the archive_timeout timer running */
7718
	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
7719

7720 7721 7722 7723
	/* initialize shared-memory copy of latest checkpoint XID/epoch */
	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;

7724
	TransactionId oldestActiveXID;

7726 7727
	/* Pre-scan prepared transactions to find out the range of XIDs present */
	oldestActiveXID = PrescanPreparedTransactions();
7728

7729
	elog(LOG, "Oldest active transaction from prepared transactions %u", oldestActiveXID);
7730

7731
	/*
7732 7733 7734 7735 7736 7737 7738 7739
	 * Initialize TransactionXmin to current oldestActiveXID, generally
	 * initialized during GetSnapshotData(). This is to avoid situations where
	 * scanning pg_authid or other tables mostly in BuildFlatFiles() below via
	 * SnapshotNow may try to chase down pg_subtrans for older "sub-committed"
	 * transaction, file corresponding to which may not and is not supposed to
	 * exist. Setting this here will avoid calling SubTransGetParent() in
	 * TransactionIdDidCommit() for older XIDs. Also, set RecentGlobalXmin
	 * since Heap access method functions needs it to have good value as well.
7740
	 */
7741
	TransactionXmin = RecentGlobalXmin = oldestActiveXID;
T
7743 7744 7745 7746 7747 7748 7749
	/* Start up the commit log and related stuff, too */
	StartupCLOG();
	StartupSUBTRANS(oldestActiveXID);
	StartupMultiXact();
	DistributedLog_Startup(
						oldestActiveXID,
						ShmemVariableCache->nextXid);
7750

7751 7752 7753 7754 7755 7756
	/* also initialize latestCompletedXid, to nextXid - 1 */
	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
	elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
		ShmemVariableCache->latestCompletedXid,
		ShmemVariableCache->nextXid);
T
7758 7759
	/* Reload shared-memory state for prepared transactions */
	RecoverPreparedTransactions();
B
T
7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777
	 * Perform a checkpoint to update all our recovery activity to disk.
	 *
	 * Note that we write a shutdown checkpoint rather than an on-line
	 * one. This is not particularly critical, but since we may be
	 * assigning a new TLI, using a shutdown checkpoint allows us to have
	 * the rule that TLI only changes in shutdown checkpoints, which
	 * allows some extra error checking in xlog_redo.
	 *
	 * Note that - Creation of shutdown checkpoint changes the state in pg_control.
	 * If that happens when we are standby who was recently promoted, the
	 * state in pg_control indicating promotion phases (e.g. DB_IN_STANDBY_PROMOTION,
	 * DB_INSTANDBY_NEW_TLI_SET) before the checkpoint creation will get
	 * overwritten posing a problem for further flow. Hence, CreateCheckpoint()
	 * has an ungly hack to avoid this situation and thus we avoid change of
	 * pg_control state just in this special situation. CreateCheckpoint() also
	 * has a comment referring this.
T
7779 7780 7781 7782 7783 7784
	if (bgwriterLaunched)
		RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
						  CHECKPOINT_IMMEDIATE |
						  CHECKPOINT_WAIT);
	else
		CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7785

7786
#ifdef NOT_USED
T
7788
	 * And finally, execute the recovery_end_command, if any.
T
7790 7791 7792 7793 7794
	if (recoveryEndCommand)
		ExecuteRecoveryCommand(recoveryEndCommand,
							   "recovery_end_command",
							   true);
#endif
7795

T
7797 7798 7799
	 * If this system was a standby which was promoted (or whose catalog is not
	 * yet updated after promote), we delay going into actual production till Pass4.
	 * Pass4 updates the catalog to comply with the standby promotion changes.
T
7801 7802 7803
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_STANDBY_PROMOTED
		|| ControlFile->state == DB_IN_STANDBY_NEW_TLI_SET)
7804
	{
7805 7806 7807 7808
		ControlFile->state = DB_IN_STANDBY_NEW_TLI_SET;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		ereport(LOG, (errmsg("database system is almost ready")));
	}
	else
	{
		ControlFile->state = DB_IN_PRODUCTION;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		ereport(LOG, (errmsg("database system is ready")));
	}
	LWLockRelease(ControlFileLock);

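	/* Report the server version string (and assert-checking status) in the log. */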
	{
		char version[512];

		strcpy(version, PG_VERSION_STR " compiled on " __DATE__ " " __TIME__);

#ifdef USE_ASSERT_CHECKING
		strcat(version, " (with assert checking)");
#endif
		ereport(LOG,(errmsg("%s", version)));

	}

	/*
	 * All done.  Allow backends to write WAL.	(Although the bool flag is
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
	 */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}

	if (!IsUnderPostmaster)
	{
		Assert(!multipleRecoveryPassesNeeded);

		StartupXLOG_InProduction(bgwriterLaunched);

		ereport(LOG,
				(errmsg("Finished single backend startup")));
	}
	else
	{
		XLogCtl->multipleRecoveryPassesNeeded = multipleRecoveryPassesNeeded;

		if (!gp_startup_integrity_checks)
		{
			ereport(LOG,
					(errmsg("Integrity checks will be skipped because gp_startup_integrity_checks = off")));
		}
		else
		{
			XLogCtl->integrityCheckNeeded = true;
		}

		if (!XLogCtl->multipleRecoveryPassesNeeded)
		{
			StartupXLOG_InProduction(bgwriterLaunched);

			ereport(LOG,
					(errmsg("Finished normal startup for clean shutdown case")));

		}
		else
		{
			ereport(LOG,
					(errmsg("Finished startup pass 1.  Proceeding to startup crash recovery passes 2 and 3.")));
		}
	}

	XLogCloseReadRecord();
}

/*
 * Determine the recovery redo start location from the pg_control file.
 *
 *    1) Only uses information from the pg_control file.
 *    2) This simplified routine does not examine the offline recovery file or
 *       the online backup labels, etc.
 *    3) This routine is a heavily reduced version of StartupXLOG.
 *    4) IMPORTANT NOTE: This routine sets global variables that establish
 *       the timeline context necessary to do ReadRecord.  The ThisTimeLineID
 *       and expectedTLIs globals are set.
 *
 */
void
XLogGetRecoveryStart(char *callerStr, char *reasonStr, XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint)
{
	CheckPoint	checkPoint;
	XLogRecPtr	checkPointLoc;
	XLogRecord *record;
	bool previous;
	XLogRecPtr checkPointLSN;

	Assert(redoCheckPointLoc != NULL);
	Assert(redoCheckPoint != NULL);

	ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
			(errmsg("%s: determine restart location %s",
			 callerStr, reasonStr)));

	XLogCloseReadRecord();

	if (Debug_print_qd_mirroring)
	{
		XLogPrintLogNames();
	}

	/*
	 * Read control file and verify XLOG status looks valid.
	 *
	 */
	ReadControlFile();

	if (ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("%s: control file contains invalid data", callerStr)));

	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
	checkPointLoc = ControlFile->checkPoint;
	ThisTimeLineID = ControlFile->checkPointCopy.ThisTimeLineID;

	/*
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
	 */
	XLogReadRecoveryCommandFile(DEBUG5);

	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(ThisTimeLineID);

	record = ReadCheckpointRecord(checkPointLoc, 1);
	if (record != NULL)
	{
		previous = false;
		ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
				(errmsg("%s: checkpoint record is at %s (LSN %s)",
						callerStr,
						XLogLocationToString(&checkPointLoc),
						XLogLocationToString2(&EndRecPtr))));
	}
	else
	{
		previous = true;
		checkPointLoc = ControlFile->prevCheckPoint;
		record = ReadCheckpointRecord(checkPointLoc, 2);
		if (record != NULL)
		{
			ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
					(errmsg("%s: using previous checkpoint record at %s (LSN %s)",
						    callerStr,
							XLogLocationToString(&checkPointLoc),
						    XLogLocationToString2(&EndRecPtr))));
		}
		else
		{
			ereport(ERROR,
				 (errmsg("%s: could not locate a valid checkpoint record", callerStr)));
		}
	}

	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	checkPointLSN = EndRecPtr;

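	/*
	 * Sanity-check the redo pointer against the checkpoint location.  For a
	 * shutdown-style checkpoint the redo pointer equals the checkpoint record
	 * itself; a redo pointer past the checkpoint record is corrupt; otherwise
	 * redo precedes the checkpoint, and the first record at the redo location
	 * must be readable.
	 */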
	if (XLByteEQ(checkPointLoc,checkPoint.redo))
	{
		{
			elog(LOG,
				 "control file has restart '%s' and redo start checkpoint at location(lsn) '%s(%s)' ",
				 (previous ? "previous " : ""),
				 XLogLocationToString3(&checkPointLoc),
				 XLogLocationToString4(&checkPointLSN));
		}
	}
 	else if (XLByteLT(checkPointLoc, checkPoint.redo))
	{
		ereport(ERROR,
				(errmsg("%s: invalid redo in checkpoint record", callerStr)));
	}
	else
	{
		XLogRecord *record;

		record = XLogReadRecord(&checkPoint.redo, false, LOG);
		if (record == NULL)
		{
			ereport(ERROR,
			 (errmsg("%s: first redo record before checkpoint not found at %s",
					 callerStr, XLogLocationToString(&checkPoint.redo))));
		}

		{
			elog(LOG,
				 "control file has restart '%s' checkpoint at location(lsn) '%s(%s)', redo starts at location(lsn) '%s(%s)' ",
				 (previous ? "previous " : ""),
				 XLogLocationToString3(&checkPointLoc),
				 XLogLocationToString4(&checkPointLSN),
				 XLogLocationToString(&checkPoint.redo),
				 XLogLocationToString2(&EndRecPtr));
		}
	}

	XLogCloseReadRecord();

	*redoCheckPointLoc = checkPointLoc;
	*redoCheckPoint = checkPoint;

}

/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
		SpinLockRelease(&xlogctl->info_lck);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup.	(If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
			InitXLOGAccess();

		return LocalRecoveryInProgress;
	}
}

/*
 * Is this process allowed to insert new WAL records?
 *
 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
 * But we also have provisions for forcing the result "true" or "false"
 * within specific processes regardless of the global state.
 */
bool
XLogInsertAllowed(void)
{
	/*
	 * If value is "unconditionally true" or "unconditionally false",
	 * just return it.  This provides the normal fast path once recovery
	 * is known done.
	 */
	if (LocalXLogInsertAllowed >= 0)
		return (bool) LocalXLogInsertAllowed;

	/*
	 * Else, must check to see if we're still in recovery.
	 */
	if (RecoveryInProgress())
		return false;

	/*
	 * On exit from recovery, reset to "unconditionally true", since there
	 * is no need to keep checking.
	 */
	LocalXLogInsertAllowed = 1;
	return true;
}

/*
 * Make XLogInsertAllowed() return true in the current process only.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}

/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
 */
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
{
	XLogRecord *record;
	bool sizeOk;
	uint32 delta_xl_tot_len;		/* delta of total len of entire record */
	uint32 delta_xl_len;			/* delta of total len of rmgr data */

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
				(errmsg("invalid primary checkpoint link in control file")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
				   (errmsg("invalid checkpoint link in backup_label file")));
				break;
		}
		return NULL;
	}

	/*
	 * Set fetching_ckpt to true here, so that XLogReadRecord()
	 * uses RedoStartLSN as the start replication location used
	 * by WAL receiver (when StandbyMode is on). See comments
	 * for fetching_ckpt in XLogReadPage()
	 */
	record = XLogReadRecord(&RecPtr, true /* fetching_checkpoint */, LOG);

	if (record == NULL)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
				(errmsg("invalid resource manager ID in checkpoint record at location %s",
				        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
				   (errmsg("invalid xl_info in primary checkpoint record at location %s",
				           XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
				 (errmsg("invalid xl_info in secondary checkpoint record at location %s",
				         XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}

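	/*
	 * Validate the record length.  A bare CheckPoint payload is allowed;
	 * otherwise xl_len and xl_tot_len must both exceed the bare sizes by the
	 * same amount, i.e. the extra (GPDB-specific) payload must be accounted
	 * for consistently in both lengths.
	 */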
	sizeOk = false;
	if (record->xl_len == sizeof(CheckPoint) &&
		record->xl_tot_len == SizeOfXLogRecord + sizeof(CheckPoint))
	{
		sizeOk = true;
	}
	else if (record->xl_len > sizeof(CheckPoint) &&
		record->xl_tot_len > SizeOfXLogRecord + sizeof(CheckPoint))
	{
		delta_xl_len = record->xl_len - sizeof(CheckPoint);
		delta_xl_tot_len = record->xl_tot_len - (SizeOfXLogRecord + sizeof(CheckPoint));

		if (delta_xl_len == delta_xl_tot_len)
		{
			sizeOk = true;
		}
	}

	if (!sizeOk)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
					(errmsg("invalid length of primary checkpoint at location %s",
					        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
				  (errmsg("invalid length of secondary checkpoint record at location %s",
				          XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	return record;
}

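/*
 * Break a checkpoint record into its components.
 *
 * The record body starts with the fixed-size CheckPoint struct; when more
 * data follows, it is the distributed transaction (DTX) checkpoint
 * information, optionally followed by the prepared transaction aggregate
 * state (ptas).  Pointers into the record are returned in *ckptExtended;
 * absent pieces are returned as NULL/0.
 */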
static void
UnpackCheckPointRecord(
	XLogRecord			*record,
	CheckpointExtendedRecord *ckptExtended)
{
	char *current_record_ptr;
	int remainderLen;

	if (record->xl_len == sizeof(CheckPoint))
	{
		/* Special (for bootstrap, xlog switch, maybe others) */
		ckptExtended->dtxCheckpoint = NULL;
		ckptExtended->dtxCheckpointLen = 0;
		ckptExtended->ptas = NULL;
		return;
	}

	/* Normal checkpoint Record */
	Assert(record->xl_len > sizeof(CheckPoint));

	current_record_ptr = ((char*)XLogRecGetData(record)) + sizeof(CheckPoint);
	remainderLen = record->xl_len - sizeof(CheckPoint);

	/* Start of distributed transaction information */
	ckptExtended->dtxCheckpoint = (TMGXACT_CHECKPOINT *)current_record_ptr;
	ckptExtended->dtxCheckpointLen =
		TMGXACT_CHECKPOINT_BYTES((ckptExtended->dtxCheckpoint)->committedCount);

	/*
	 * The master mirror checkpoint (mmxlog) and prepared transaction aggregate state (ptas) will be skipped
	 * when gp_before_filespace_setup is ON.
	 */
	if (remainderLen > ckptExtended->dtxCheckpointLen)
	{
		current_record_ptr = current_record_ptr + ckptExtended->dtxCheckpointLen;
		remainderLen -= ckptExtended->dtxCheckpointLen;

		/* Finally, point to prepared transaction information */
		ckptExtended->ptas = (prepared_transaction_agg_state *) current_record_ptr;
		Assert(remainderLen == PREPARED_TRANSACTION_CHECKPOINT_BYTES(ckptExtended->ptas->count));
	}
	else
	{
		Assert(remainderLen == ckptExtended->dtxCheckpointLen);
		ckptExtended->ptas = NULL;
	}
}

/*
 * This must be called during startup of a backend process, except that
 * it need not be called in a standalone backend (which does StartupXLOG
 * instead).  We need to initialize the local copies of ThisTimeLineID and
 * RedoRecPtr.
 *
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 */
void
InitXLOGAccess(void)
{
	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
	/* GPDB_84_MERGE_FIXME: Disabled, because FTS process was tripping it.
	 * This assertion was added by the merge, so I suspect it's been wrong
	 * all along, but we haven't noticed. */
#if 0
	Assert(ThisTimeLineID != 0);
#endif

	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
GetRedoRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	return RedoRecPtr;
}

/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to acquire WALInsertLock which can be quite
 * heavily contended, and an approximation is enough for the current
 * usage of this function.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtRqst.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtResult.Flush;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

/*
 * Get the time of the last xlog segment switch
 */
pg_time_t
GetLastSegSwitchTime(void)
{
	pg_time_t	result;

	/* Need WALWriteLock, but shared lock is sufficient */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->Write.lastSegSwitchTime;
	LWLockRelease(WALWriteLock);

	return result;
}

/*
 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
 *
 * This is exported for use by code that would like to have 64-bit XIDs.
 * We don't really support such things, but all XIDs within the system
 * can be presumed "close to" the result, and thus the epoch associated
 * with them can be determined.
 */
void
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
{
	uint32		ckptXidEpoch;
	TransactionId ckptXid;
	TransactionId nextXid;

	/* Must read checkpoint info first, else have race condition */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		ckptXidEpoch = xlogctl->ckptXidEpoch;
		ckptXid = xlogctl->ckptXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* Now fetch current nextXid */
	nextXid = ReadNewTransactionId();

	/*
	 * nextXid is certainly logically later than ckptXid.  So if it's
	 * numerically less, it must have wrapped into the next epoch.
	 */
	if (nextXid < ckptXid)
		ckptXidEpoch++;
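
	/*
	 * Worked example (hypothetical numbers): if the last checkpoint recorded
	 * ckptXid = 4000000000 and nextXid has since wrapped around to 1000, then
	 * nextXid < ckptXid and the epoch reported below is one higher than the
	 * epoch recorded at the checkpoint.
	 */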

	*xid = nextXid;
	*epoch = ckptXidEpoch;
}

/*
 * This must be called ONCE during postmaster or standalone-backend shutdown
 */
void
ShutdownXLOG(int code __attribute__((unused)) , Datum arg __attribute__((unused)) )
{
	ereport(LOG,
			(errmsg("shutting down")));

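	/*
	 * While still in recovery we can only establish a restartpoint; a real
	 * shutdown checkpoint is written once recovery has finished.
	 */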
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();
8495

8496 8497
		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}
8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564
	ShutdownCLOG();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
	DistributedLog_Shutdown();

	ereport(LOG,
			(errmsg("database system is shut down"),
					errSendAlert(true)));
}

/*
 * Calculate the last segment that we need to retain because of
 * keep_wal_segments, by subtracting keep_wal_segments from the passed
 * xlog location
 */
static void
CheckKeepWalSegments(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg)
{
	uint32	log;
	uint32	seg;
	uint32	keep_log;
	uint32	keep_seg;

	if (keep_wal_segments <= 0)
		return;

	XLByteToSeg(recptr, log, seg);

	keep_seg = keep_wal_segments % XLogSegsPerFile;
	keep_log = keep_wal_segments / XLogSegsPerFile;
	ereport(DEBUG1,
			(errmsg("%s: Input %d %d (Keep %d %d) (current %d %d)",
					PG_FUNCNAME_MACRO, *_logId, *_logSeg, keep_log,
					keep_seg, log, seg)));
	if (seg < keep_seg)
	{
		keep_log += 1;
		seg = seg - keep_seg + XLogSegsPerFile;
	}
	else
	{
		seg = seg - keep_seg;
	}

	/* Avoid underflow, don't go below (0,1) */
	if (log < keep_log || (log == keep_log && seg == 0))
	{
		log = 0;
		seg = 1;
	}
	else
	{
		log = log - keep_log;
	}
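
	/*
	 * Worked example (hypothetical numbers; with the default 16MB segment
	 * size XLogSegsPerFile is 255): keep_wal_segments = 10 and recptr in
	 * (log 5, seg 3) give keep_log = 0 and keep_seg = 10; since 3 < 10 we
	 * borrow one log file, ending up at (log 4, seg 248).
	 */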

	/* check not to delete WAL segments newer than the calculated segment */
	if (log < *_logId || (log == *_logId && seg < *_logSeg))
	{
		*_logId = log;
		*_logSeg = seg;
	}

	ereport(DEBUG1,
			(errmsg("%s: Output %d %d",
					PG_FUNCNAME_MACRO, *_logId, *_logSeg)));
}

/*
 * Log start of a checkpoint.
 */
static void
LogCheckpointStart(int flags, bool restartpoint)
{
	const char *msg;

	/*
	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
	 * the main message, but what about all the flags?
	 */
	if (restartpoint)
		msg = "restartpoint starting:%s%s%s%s%s%s%s";
	else
		msg = "checkpoint starting:%s%s%s%s%s%s%s";

	elog(LOG, msg,
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}

/*
 * Log end of a checkpoint.
 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_secs,
				sync_secs,
				total_secs;
	int			write_usecs,
				sync_usecs,
				total_usecs;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	TimestampDifference(CheckpointStats.ckpt_start_t,
						CheckpointStats.ckpt_end_t,
						&total_secs, &total_usecs);

	TimestampDifference(CheckpointStats.ckpt_write_t,
						CheckpointStats.ckpt_sync_t,
						&write_secs, &write_usecs);

	TimestampDifference(CheckpointStats.ckpt_sync_t,
						CheckpointStats.ckpt_sync_end_t,
						&sync_secs, &sync_usecs);

	if (restartpoint)
		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
	else
		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
			 "%d transaction log file(s) added, %d removed, %d recycled; "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
}

/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *		ignoring checkpoint_completion_target parameter.
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *		CHECKPOINT_END_OF_RECOVERY).
 *
 * Note: flags contains other bits, of interest here only for logging purposes.
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
 */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecData rdata[6];
	char* 		dtxCheckPointInfo;
	int			dtxCheckPointInfoSize;
	uint32		freespace;
	uint32		_logId;
	uint32		_logSeg;
	VirtualTransactionId *vxids;
	int     	nvxids;
	bool		resync_to_sync_transition;

	resync_to_sync_transition = (flags & CHECKPOINT_RESYNC_TO_INSYNC_TRANSITION) != 0;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;

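	/* GPDB: skip the shutdown checkpoint entirely if the control file still says DB_STARTUP. */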
	if (shutdown && ControlFile->state == DB_STARTUP)
	{
		return;
	}

#ifdef FAULT_INJECTOR
	/* During resync, the checkpoint has to complete; otherwise the segment cannot transition into the Sync state */
	if (! resync_to_sync_transition)
	{
		if (FaultInjector_InjectFaultIfSet(
										   Checkpoint,
										   DDLNotSpecified,
										   "" /* databaseName */,
										   "" /* tableName */) == FaultInjectorTypeSkip)
			return;  // skip checkpoint
	}
#endif

	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (resync_to_sync_transition)
	{
		/* database transitions to suspended state, IO activity on the segment is suspended */
		primaryMirrorSetIOSuspended(TRUE);

		SIMPLE_FAULT_INJECTOR(FileRepTransitionToInSyncBeforeCheckpoint);
	}
	else
	{
		/*
		 * Normal case.
		 */
	}

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	if (shutdown)
	{
		/*
		 * This is an ugly fix to disallow changing the pg_control
		 * state, for standby promotion continuity.
		 *
		 * Refer to StartupXLOG_InProduction() for more details.
		 */
		if (ControlFile->state != DB_IN_STANDBY_PROMOTED
			&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
		{
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNING;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
	}

	/*
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
	MemSet(&checkPoint, 0, sizeof(checkPoint));
	checkPoint.time = (pg_time_t) time(NULL);

	/*
	 * The WRITE_PERSISTENT_STATE_ORDERED_LOCK gets these locks:
	 *    MirroredLock SHARED, and
	 *    PersistentObjLock EXCLUSIVE.
8778
	 * as well as set MyProc->inCommit = true.
8779 8780 8781 8782 8783 8784 8785 8786 8787
	 *
	 * The READ_PERSISTENT_STATE_ORDERED_LOCK gets this lock:
	 *    PersistentObjLock SHARED.
	 *
	 * They do this to prevent Persistent object changes during checkpoint and
	 * prevent persistent object reads while writing.  And acquire the MirroredLock
	 * at a level that blocks DDL during FileRep statechanges...
	 */

	/*
	 * We must hold WALInsertLock while examining insert state to determine
	 * the checkpoint REDO pointer.
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	/*
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
	 *
	 * We have to make two tests to determine that nothing has happened since
	 * the start of the last checkpoint: current insertion point must match
	 * the end of the last checkpoint record, and its redo pointer must point
	 * to itself.
	 */
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
#ifdef originalCheckpointChecking
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
#else
		/*
		 * GP: Modified since the checkpoint record is not fixed length,
		 * so we keep track of the last checkpoint locations (beginning and
		 * end) and use those values for comparison.
		 */
		if (XLogCtl->haveLastCheckpointLoc &&
			XLByteEQ(XLogCtl->lastCheckpointLoc,ControlFile->checkPoint) &&
			XLByteEQ(curInsert,XLogCtl->lastCheckpointEndLoc) &&
			XLByteEQ(ControlFile->checkPoint,ControlFile->checkPointCopy.redo))
#endif
		{
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);

			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(Insert);
	if (freespace < SizeOfXLogRecord)
	{
		(void) AdvanceXLInsertBuffer(false);
		/* OK to ignore update return flag, since we will do flush anyway */
		freespace = INSERT_FREESPACE(Insert);
	}
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding the insert lock AND the info_lck.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/*
	 * Now we can release WAL insert lock, allowing other xacts to proceed
	 * while we are flushing disk buffers.
	 */
	LWLockRelease(WALInsertLock);

	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
		LogCheckpointStart(flags, false);

	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

	/*
	 * Before flushing data, we must wait for any transactions that are
	 * currently in their commit critical sections.  If an xact inserted its
	 * commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released WALInsertLock, this test is a bit fuzzy:
	 * it is possible that we will wait for xacts we didn't really need to
	 * wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold locks longer than necessary.
	 * (In fact, the whole reason we have this issue is that xact.c does
	 * commit record XLOG insertion and clog update as two separate steps
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
	 *
	 * A transaction that has not yet set inCommit when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
	{
		do
		{
			/*
			 * GPDB needs to AbsorbFsyncRequests() here to avoid deadlock when
			 * fsync request queue is full while backend is in commit and
			 * performing ForgetRelationFsyncRequests() or
			 * ForgetDatabaseFsyncRequests(). Since for GPDB the mdlink
			 * happens through persistent tables cleanup, during which
			 * inCommit flag is set to avoid checkpoint from happening.
			 * PostgreSQL doesn't need this as ForgetRelationFsyncRequests()
			 * or ForgetDatabaseFsyncRequests() are not under inCommit=true.
			 */
			AbsorbFsyncRequests();
			pg_usleep(10000L);	/* wait for 10 msec */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
	}
	pfree(vxids);

	/*
	 * Get the other info we need for the checkpoint record.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	LWLockRelease(XidGenLock);

	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

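	/*
	 * Record nextOid.  For an online checkpoint, also count the OIDs still
	 * cached in shared memory as consumed, so they are not handed out again
	 * if we recover from this checkpoint.
	 */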
	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

	LWLockAcquire(RelfilenodeGenLock, LW_SHARED);
	checkPoint.nextRelfilenode = ShmemVariableCache->nextRelfilenode;
	if (!shutdown)
		checkPoint.nextRelfilenode += ShmemVariableCache->relfilenodeCount;
	LWLockRelease(RelfilenodeGenLock);

	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic. Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

	CheckPointGuts(checkPoint.redo, flags);

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020
	 *
	 * Here is the locking order and scope:
	 *
	 * getDtxCheckPointInfoAndLock (i.e. shmControlLock)
	 * 	READ_PERSISTENT_STATE_ORDERED_LOCK (i.e. PersistentObjLock)
	 * 		mmxlog_append_checkpoint_data
	 * 		XLogInsert
	 * 	READ_PERSISTENT_STATE_ORDERED_UNLOCK
	 * freeDtxCheckPointInfoAndUnlock
	 * XLogFlush
	 *
	 * We get the PersistentObjLock to prevent Persistent Object writers as
	 * we collect the Master Mirroring information from mmxlog_append_checkpoint_data()
	 * until finally after the checkpoint record is inserted into the XLOG to prevent the
	 * persistent information from changing.
	 *
	 * For example, if we don't hold the PersistentObjLock across mmxlog_append_checkpoint_data()
	 * and XLogInsert(), another xlog activity like a drop tablespace could happen in between, which
	 * might cause wrong behavior when the master standby replays the checkpoint record.
	 *
	 * The master standby replays (mmxlog_read_checkpoint_data) the mmxlog information stored in the
	 * checkpoint record to recreate persistent objects like filespaces, tablespaces, database
	 * directories, etc. If those objects were dropped after the checkpoint collected the persistent
	 * object information, but before the checkpoint record was written to XLOG, then the standby
	 * replay would first drop the object based on the mmxlog record, then recreate it based on the
	 * checkpoint record. That would leave behind directories already dropped on the master, breaking
	 * the consistency between the master and the standby.
	 */

	getDtxCheckPointInfoAndLock(&dtxCheckPointInfo, &dtxCheckPointInfoSize);

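	/*
	 * Assemble the checkpoint record: the fixed CheckPoint struct first,
	 * followed by the distributed transaction checkpoint information and the
	 * prepared transaction aggregate state.  This is the layout that
	 * UnpackCheckPointRecord() expects to find at replay.
	 */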
	rdata[0].data = (char *) (&checkPoint);
	rdata[0].len = sizeof(checkPoint);
	rdata[0].buffer = InvalidBuffer;
	rdata[0].next = &(rdata[1]);

	rdata[1].data = (char *) dtxCheckPointInfo;
	rdata[1].len = dtxCheckPointInfoSize;
	rdata[1].buffer = InvalidBuffer;
	rdata[1].next = NULL;

	prepared_transaction_agg_state *p = NULL;

	getTwoPhasePreparedTransactionData(&p, "CreateCheckPoint");
	rdata[5].data = (char*)p;
	rdata[5].buffer = InvalidBuffer;
	rdata[5].len = PREPARED_TRANSACTION_CHECKPOINT_BYTES(p->count);
	rdata[4].next = &(rdata[5]);
	rdata[5].next = NULL;

	/*
	 * Need to save the oldest prepared transaction XLogRecPtr for use later.
	 * It is not sufficient to just save the pointer because we may remove the
	 * space after it is written in XLogInsert.
	 */
	XLogRecPtr *ptrd_oldest_ptr = NULL;
	XLogRecPtr ptrd_oldest;

	memset(&ptrd_oldest, 0, sizeof(ptrd_oldest));

	ptrd_oldest_ptr = getTwoPhaseOldestPreparedTransactionXLogRecPtr(&rdata[5]);

	if (ptrd_oldest_ptr != NULL)
		memcpy(&ptrd_oldest, ptrd_oldest_ptr, sizeof(ptrd_oldest));

	recptr = XLogInsert(RM_XLOG_ID,
			            shutdown ? XLOG_CHECKPOINT_SHUTDOWN : XLOG_CHECKPOINT_ONLINE,
			            rdata);

	freeDtxCheckPointInfoAndUnlock(dtxCheckPointInfo, dtxCheckPointInfoSize, &recptr);

	XLogFlush(recptr);

	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will
	 * be overwritten at next startup.  No-one should even try, this just
	 * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
	 * we want to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0;		/* never again write WAL */
	}

	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));

	/*
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info or the oldest prepared transaction xlog record's info.
	 */
	if (ptrd_oldest_ptr != NULL && XLByteLE(ptrd_oldest, ControlFile->checkPointCopy.redo))
		XLByteToSeg(ptrd_oldest, _logId, _logSeg);
	else
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
	{
		/*
		 * Ugly fix to dis-allow changing pg_control state
		 * for standby promotion continuity
		 */
		if (ControlFile->state != DB_IN_STANDBY_PROMOTED
			&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
			ControlFile->state = DB_SHUTDOWNED;
	}

	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	/* crash recovery should always recover to the end of WAL */
	MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
	ControlFile->time = (pg_time_t) time(NULL);

	/*
	 * Save the last checkpoint position.
	 */
	XLogCtl->haveLastCheckpointLoc = true;
	XLogCtl->lastCheckpointLoc = ProcLastRecPtr;
	XLogCtl->lastCheckpointEndLoc = XactLastRecEnd;

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
		xlogctl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/*
	 * We are now done with critical updates; no need for system panic if we
	 * have trouble while fooling with old log segments.
	 */
	END_CRIT_SECTION();

	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint).
	 */
	if (gp_keep_all_xlog == false && (_logId || _logSeg))
	{
		GetXLogCleanUpTo(recptr, &_logId, &_logSeg);

		PrevLogSeg(_logId, _logSeg);
		RemoveOldXlogFiles(_logId, _logSeg, recptr);
	}

	/*
	 * Make more log segments if needed.  (Do this after deleting offline log
	 * segments, to avoid having peak disk space usage higher than necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
	 */
	if (!RecoveryInProgress())
		TruncateSUBTRANS(GetOldestXmin(true, false));

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(false);

	if (resync_to_sync_transition)
	{
		RequestXLogSwitch();

		UpdateControlFile();

		/* database is resumed */
		primaryMirrorSetIOSuspended(FALSE);
	}
	else
	{
		/*
		 * Normal case.
		 */
	}

	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);

	LWLockRelease(CheckpointLock);
}

/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	DistributedLog_CheckPoint();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}

/*
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the bgwriter, while this one will be executed by the startup
 * process.)
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	int			rmid;

	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Is it safe to checkpoint?  We must ask each of the resource managers
	 * whether they have any partial state information that might prevent a
	 * correct restart from this point.  If so, we skip this opportunity, but
	 * return at the next checkpoint record for another try.
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
			if (!(RmgrTable[rmid].rm_safe_restartpoint()))
			{
				elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
					 rmid,
					 checkPoint->redo.xlogid,
					 checkPoint->redo.xrecoff);
				return;
			}
	}

	/* Update the shared RedoRecPtr */
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->Insert.RedoRecPtr = checkPoint->redo;
	SpinLockRelease(&xlogctl->info_lck);

	/*
	 * Copy the checkpoint record to shared memory, so that bgwriter can use
	 * it the next time it wants to perform a restartpoint.
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
}

/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since last
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
	uint32		_logId = 0;
	uint32		_logSeg = 0;

	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&xlogctl->info_lck);
	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);

	if (IsStandbyMode())
	{
		/*
		 * Select point at which we can truncate the log, which we base on the
		 * prior checkpoint's earliest info.
		*/
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
	}

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
			  (errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when we get hot standby capability, it would be rather
	 * weird if the database opened up for read-only connections at a
	 * point-in-time before the last shutdown. Such time travel is still
	 * possible in case of immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
9349
	 */
9350 9351 9352 9353
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
	{
		XLogRecPtr	InvalidXLogRecPtr = {0, 0};
9354

9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		LWLockRelease(CheckpointLock);
		return false;
	}

	if (log_checkpoints)
	{
		/*
		 * Prepare to accumulate statistics.
		 */
		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

		LogCheckpointStart(flags, true);
	}
9374

9375 9376 9377 9378 9379 9380 9381
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if
	 * somehow we get here after the end-of-recovery checkpoint.
9382 9383 9384 9385 9386 9387 9388 9389
	 *
	 * GPDB allows replay to also change the control file during
	 * DB_IN_STANDBY_MODE so that mirror can be restarted from the latest
	 * checkpoint location. This will save the recovery time of mirror, and also
	 * allow mirror to remove already replayed xlogs.
	 *
	 * FIXME: need to consider consolidating the DB_IN_ARCHIVE_RECOVERY (upstream)
	 * and DB_IN_STANDBY_MODE (GPDB only)
9390 9391
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9392 9393
	if ((ControlFile->state == DB_IN_ARCHIVE_RECOVERY
		     || ControlFile->state == DB_IN_STANDBY_MODE) &&
9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414
		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Currently, there is no need to truncate pg_subtrans during recovery. If
	 * we did do that, we will need to have called StartupSUBTRANS() already
	 * and then TruncateSUBTRANS() would go here.
	 */

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(true);

	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));

	/* XXX this is currently BROKEN because we are in the wrong process */
	if (recoveryLastXTime)
		ereport((log_checkpoints ? LOG : DEBUG2),
				(errmsg("last completed transaction was at log time %s",
						timestamptz_to_str(recoveryLastXTime))));

	elog((Debug_print_qd_mirroring ? LOG : DEBUG1), "RecoveryRestartPoint: checkpoint copy redo location %s, previous checkpoint location %s",
		 XLogLocationToString(&ControlFile->checkPointCopy.redo),
		 XLogLocationToString2(&ControlFile->prevCheckPoint));

	if (IsStandbyMode())
	{
		/*
		 * Delete offline log files (those no longer needed even for previous
		 * checkpoint).
		 */
		if (gp_keep_all_xlog == false && (_logId || _logSeg))
		{
			XLogRecPtr endptr;

			/* Get the current (or recent) end of xlog */
			endptr = GetStandbyFlushRecPtr(NULL);

			PrevLogSeg(_logId, _logSeg);
			RemoveOldXlogFiles(_logId, _logSeg, endptr);
		}
	}

	LWLockRelease(CheckpointLock);
	return true;
}

/*
 * Write a NEXTOID log record
 */
void
XLogPutNextOid(Oid nextOid)
{
	XLogRecData rdata;

	rdata.data = (char *) (&nextOid);
	rdata.len = sizeof(Oid);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);

	/*
	 * We need not flush the NEXTOID record immediately, because any of the
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update that would have its own XLOG record that must follow the NEXTOID
	 * record.	Therefore, the standard buffer LSN interlock applied to those
	 * records will ensure no such OID reaches disk before the NEXTOID record
	 * does.
	 *
	 * Note, however, that the above statement only covers state "within" the
	 * database.  When we use a generated OID as a file or directory name, we
	 * are in a sense violating the basic WAL rule, because that filesystem
	 * change may reach disk before the NEXTOID WAL record does.  The impact
	 * of this is that if a database crash occurs immediately afterward, we
	 * might after restart re-generate the same OID and find that it conflicts
	 * with the leftover file or directory.  But since for safety's sake we
	 * always loop until finding a nonconflicting filename, this poses no real
	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
	 */
}

/*
 * Write a NEXTRELFILENODE log record similar to XLogPutNextOid
 */
void
XLogPutNextRelfilenode(Oid nextRelfilenode)
{
	XLogRecData rdata;

	rdata.data = (char *) (&nextRelfilenode);
	rdata.len = sizeof(Oid);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTRELFILENODE, &rdata);
}

/*
 * Write an XLOG SWITCH record.
 *
 * Here we just blindly issue an XLogInsert request for the record.
 * All the magic happens inside XLogInsert.
 *
 * The return value is either the end+1 address of the switch record,
 * or the end+1 address of the prior segment if we did not need to
 * write a switch record because we are already at segment start.
 */
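/*
 * Within this file, RequestXLogSwitch() is called by pg_switch_xlog() and by
 * do_pg_start_backup()/do_pg_stop_backup(), so that the WAL required by a
 * backup ends up in completed segments that the archiver can handle.
 */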
XLogRecPtr
RequestXLogSwitch(void)
{
	XLogRecPtr	RecPtr;
	XLogRecData rdata;

	/* XLOG SWITCH, alone among xlog record types, has no data */
	rdata.buffer = InvalidBuffer;
	rdata.data = NULL;
	rdata.len = 0;
	rdata.next = NULL;

	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);

	return RecPtr;
}

/*
 * Write a backup block if needed when we are setting a hint. Note that
 * this may be called for a variety of page types, not just heaps.
 *
 * Callable while holding just share lock on the buffer content.
 *
 * We can't use the plain backup block mechanism, since that relies on the
 * buffer being exclusively locked.  Because some modifications (setting LSN,
 * hint bits) are allowed in a share-locked buffer, the plain mechanism could
 * lead to WAL checksum failures.  So instead we copy the page and insert the
 * copied data as normal record data.
 *
 * We only need to do something if page has not yet been full page written in
 * this checkpoint round. The LSN of the inserted wal record is returned if we
 * had to write, InvalidXLogRecPtr otherwise.
 *
 * It is possible that multiple concurrent backends could attempt to write WAL
 * records. In that case, multiple copies of the same block would be recorded
 * in separate WAL records by different backends, though that is still OK from
 * a correctness perspective.
 *
 * Note that this only works for buffers that fit the standard page model,
 * i.e. those for which buffer_std == true
 */
XLogRecPtr
XLogSaveBufferForHint(Buffer buffer, Relation relation)
{
	XLogRecPtr recptr = InvalidXLogRecPtr;
	XLogRecPtr lsn;
	XLogRecData rdata[2];
	BkpBlock	bkpb;

	/*
	 * Ensure no checkpoint can change our view of RedoRecPtr.
	 */
	Assert(MyProc->inCommit);

	/*
	 * Update RedoRecPtr so XLogCheckBuffer can make the right decision
	 */
	GetRedoRecPtr();

	/*
	 * Setup phony rdata element for use within XLogCheckBuffer only.
	 * We reuse and reset rdata for any actual WAL record insert.
	 */
	rdata[0].buffer = buffer;
	rdata[0].buffer_std = true;

	/*
	 * Check buffer while not holding an exclusive lock.
	 */
	if (XLogCheckBuffer(rdata, false, false, &lsn, &bkpb))
	{
		char copied_buffer[BLCKSZ];
		char *origdata = (char *) BufferGetBlock(buffer);

		/*
		 * Copy the buffer so we don't have to worry about concurrent hint bit
		 * or LSN updates. We assume pd_lower/upper cannot be changed without
		 * an exclusive lock, so the copied contents are not racy.
		 */
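		/*
		 * The logged image omits the page's "hole": XLogCheckBuffer filled in
		 * bkpb.hole_offset/hole_length for this standard page, and the two
		 * copies below skip exactly that range.
		 */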
		memcpy(copied_buffer, origdata, bkpb.hole_offset);
		memcpy(copied_buffer + bkpb.hole_offset,
			   origdata + bkpb.hole_offset + bkpb.hole_length,
			   BLCKSZ - bkpb.hole_offset - bkpb.hole_length);

		/*
		 * Header for backup block.
		 */
		rdata[0].data = (char *) &bkpb;
		rdata[0].len = sizeof(BkpBlock);
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		/*
		 * Save copy of the buffer.
		 */
		rdata[1].data = copied_buffer;
		rdata[1].len = BLCKSZ - bkpb.hole_length;
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
	}

	return recptr;
}

/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 */
void
xlog_redo(XLogRecPtr beginLoc __attribute__((unused)), XLogRecPtr lsn __attribute__((unused)), XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/* Backup blocks are not used in xlog records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		/*
		 * We used to try to take the maximum of ShmemVariableCache->nextOid
		 * and the recorded nextOid, but that fails if the OID counter wraps
		 * around.  Since no OID allocation should be happening during replay
		 * anyway, better to just believe the record exactly.
		 */
		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		ShmemVariableCache->nextOid = nextOid;
		ShmemVariableCache->oidCount = 0;
	}
	else if (info == XLOG_NEXTRELFILENODE)
	{
		Oid			nextRelfilenode;

		memcpy(&nextRelfilenode, XLogRecGetData(record), sizeof(Oid));
		ShmemVariableCache->nextRelfilenode = nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);

		/*
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was canceled and the end-of-backup record will
		 * never arrive.
		 */
		if (StandbyMode &&
			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
			ereport(PANIC,
			(errmsg("online backup was canceled, recovery cannot continue")));

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

		/* Update shared-memory copy of checkpoint XID/epoch */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
			xlogctl->ckptXid = checkPoint.nextXid;
			SpinLockRelease(&xlogctl->info_lck);
		}

		/*
		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
		{
			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
				!list_member_int(expectedTLIs,
								 (int) checkPoint.ThisTimeLineID))
				ereport(PANIC,
						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
								checkPoint.ThisTimeLineID, ThisTimeLineID)));
			/* Following WAL records should be run with new TLI */
			ThisTimeLineID = checkPoint.ThisTimeLineID;
		}

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		/* ... but still treat OID counter as exact */
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

		/* Update shared-memory copy of checkpoint XID/epoch */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
			xlogctl->ckptXid = checkPoint.nextXid;
			SpinLockRelease(&xlogctl->info_lck);
		}

		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_HINT)
	{
		char *data;
		BkpBlock bkpb;

		/*
		 * Hint bit records contain a backup block stored "inline" in the normal
		 * data since the locking when writing hint records isn't sufficient to
		 * use the normal backup block mechanism, which assumes exclusive lock
		 * on the buffer supplied.
		 *
		 * Since the only change in these backup blocks is hint bits, no
		 * recovery conflicts are generated.
		 *
		 * This also means there is no corresponding API call for this, so an
		 * smgr implementation has no need to implement anything; nothing is
		 * needed in md.c etc.
		 */
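		/*
		 * The record body was laid out by XLogSaveBufferForHint() above: a
		 * BkpBlock header followed by the page image with the hole described
		 * by hole_offset/hole_length removed.
		 */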
		data = XLogRecGetData(record);
		memcpy(&bkpb, data, sizeof(BkpBlock));
		data += sizeof(BkpBlock);

		RestoreBackupBlockContents(lsn, bkpb, data, false, false);
	}
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;

		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
				ControlFile->minRecoveryPoint = lsn;
			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
			ControlFile->backupEndRequired = false;
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
}

void
xlog_desc(StringInfo buf, XLogRecPtr beginLoc, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
	char		*rec = XLogRecGetData(record);

	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint *checkpoint = (CheckPoint *) rec;

		CheckpointExtendedRecord ckptExtended;

		appendStringInfo(buf, "checkpoint: redo %X/%X; "
						 "tli %u; xid %u/%u; oid %u; relfilenode %u; multi %u; offset %u; %s",
						 checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
						 checkpoint->ThisTimeLineID,
						 checkpoint->nextXidEpoch, checkpoint->nextXid,
						 checkpoint->nextOid,
						 checkpoint->nextRelfilenode,
						 checkpoint->nextMulti,
						 checkpoint->nextMultiOffset,
				 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");

		UnpackCheckPointRecord(record, &ckptExtended);

		if (ckptExtended.dtxCheckpointLen > 0)
		{
			appendStringInfo(buf,
				 ", checkpoint record data length = %u, DTX committed count %d, DTX data length %u",
							 record->xl_len,
							 ckptExtended.dtxCheckpoint->committedCount,
							 ckptExtended.dtxCheckpointLen);
			if (ckptExtended.ptas != NULL)
				appendStringInfo(buf,
								 ", prepared transaction agg state count = %d",
								 ckptExtended.ptas->count);
		}
	}
	else if (info == XLOG_NOOP)
	{
		appendStringInfo(buf, "xlog no-op");
	}
	else if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		memcpy(&nextOid, rec, sizeof(Oid));
		appendStringInfo(buf, "nextOid: %u", nextOid);
	}
	else if (info == XLOG_HINT)
	{
		BkpBlock *bkpb = (BkpBlock *) rec;
		appendStringInfo(buf, "page hint: %u/%u/%u block %u",
						 bkpb->node.spcNode,
						 bkpb->node.dbNode,
						 bkpb->node.relNode,
						 bkpb->block);
	}
	else if (info == XLOG_NEXTRELFILENODE)
	{
		Oid			nextRelfilenode;

		memcpy(&nextRelfilenode, rec, sizeof(Oid));
		appendStringInfo(buf, "nextRelfilenode: %u", nextRelfilenode);
	}
	else if (info == XLOG_SWITCH)
	{
		appendStringInfo(buf, "xlog switch");
	}
	else
		appendStringInfo(buf, "UNKNOWN");
}

static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			i;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(i))
			appendStringInfo(buf, "; bkpb%d", i + 1);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}


/*
 * Return the (possible) sync flag used for opening a file, depending on the
 * value of the GUC wal_sync_method.
 */
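/*
 * For example, with wal_sync_method = fdatasync the segment file is opened
 * with no extra flags and issue_xlog_fsync() later calls pg_fdatasync(),
 * whereas with wal_sync_method = open_sync (on platforms that define
 * OPEN_SYNC_FLAG) the file is opened with that flag and no separate sync
 * call is needed afterwards.
 */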
static int
get_sync_bit(int method)
{
	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;

	/*
	 * The walreceiver process receives xlog data from the walsender process
	 * and must write it as soon as it arrives, in whatever amount it arrives.
	 * Since that amount cannot be guaranteed to be OS/FS block-size aligned,
	 * O_DIRECT must never be used for these writes.  Likewise, because the
	 * code does not expect O_DIRECT for walreceiver xlog writes, the buffer
	 * used for them is not guaranteed to be OS/FS block-size aligned either.
	 */
	if (MyAuxProcType == WalReceiverProcess)
		return 0;

	switch (method)
	{
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
			return 0;
#ifdef OPEN_SYNC_FLAG
		case SYNC_METHOD_OPEN:
			return OPEN_SYNC_FLAG;
#endif
#ifdef OPEN_DATASYNC_FLAG
		case SYNC_METHOD_OPEN_DSYNC:
			return OPEN_DATASYNC_FLAG;
#endif
		default:
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
			return 0;			/* silence warning */
	}
}

/*
 * GUC support
 */
bool
assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source pg_attribute_unused() )
{
	if (!doit)
		return true;

	if (sync_method != new_sync_method)
	{
		/*
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new flag
		 * bit) at next use.
		 */
		if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
		{
			if (MirroredFlatFile_Flush(
								&mirroredLogFileOpen,
								/* suppressError */ true))
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync log file %u, segment %u: %m",
								openLogId, openLogSeg)));
			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
				XLogFileClose();
		}
	}

	return true;
}

/*
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
 */
void
issue_xlog_fsync(int fd, uint32 log, uint32 seg)
{
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			if (pg_fsync_no_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync log file %u, segment %u: %m",
								log, seg)));
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			if (pg_fsync_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync write-through log file %u, segment %u: %m",
								log, seg)));
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
					errmsg("could not fdatasync log file %u, segment %u: %m",
						   log, seg)));
			break;
#endif
		case SYNC_METHOD_OPEN:
//		case SYNC_METHOD_OPEN_DSYNC:
			/* write synced it already */
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}
}

/*
 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
 * function. It creates the necessary starting checkpoint and constructs the
 * backup label file.
 *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
 * backup is started with pg_start_backup(), and there can be only one active
 * at a time. The backup label file of an exclusive backup is written to
 * $PGDATA/backup_label, and it is removed by pg_stop_backup().
 *
 * A non-exclusive backup is used for the streaming base backups (see
 * src/backend/replication/basebackup.c). The difference to exclusive backups
 * is that the backup label file is not written to disk. Instead, its would-be
 * contents are returned in *labelfile, and the caller is responsible for
 * including it in the backup archive as 'backup_label'. There can be many
 * non-exclusive backups active at the same time, and they don't conflict
 * with an exclusive backup either.
 *
 * Every successfully started non-exclusive backup must be stopped by calling
 * do_pg_stop_backup() or do_pg_abort_backup().
 */
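/*
 * A minimal exclusive-backup session as seen from SQL looks like this (label
 * and copy step are illustrative):
 *
 *		SELECT pg_start_backup('nightly', false);
 *		-- copy the data directory with an external tool
 *		SELECT pg_stop_backup();
 */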
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	checkpointloc;
	XLogRecPtr	startpoint;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	uint32		_logId;
	uint32		_logSeg;
	struct stat stat_buf;
	FILE	   *fp;
	StringInfoData labelfbuf;

	/* base backup in recovery mode not currently supported */
	backup_started_in_recovery = false;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
		   errmsg("must be superuser or replication role to run a backup")));

	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

	if (!XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

	/*
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.	This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
	 *
	 * Note that forcePageWrites has no effect during an online backup from
	 * the standby.
	 *
	 * We must hold WALInsertLock to change the value of forcePageWrites, to
	 * ensure adequate interlocking against XLogInsert().
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
	{
		if (XLogCtl->Insert.exclusiveBackup)
		{
			LWLockRelease(WALInsertLock);
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackup = true;
	}
	else
		XLogCtl->Insert.nonExclusiveBackups++;
	XLogCtl->Insert.forcePageWrites = true;
	LWLockRelease(WALInsertLock);

	/*
	 * Force an XLOG file switch before the checkpoint, to ensure that the WAL
	 * segment the checkpoint is written to doesn't contain pages with old
	 * timeline IDs. That would otherwise happen if you called
	 * pg_start_backup() right after restoring from a PITR archive: the first
	 * WAL segment containing the startup checkpoint has pages in the
	 * beginning with the old timeline ID. That can cause trouble at recovery:
	 * we won't have a history file covering the old timeline if pg_xlog
	 * directory was not included in the base backup and the WAL archive was
	 * cleared too before starting the backup.
	 */
	RequestXLogSwitch();

	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
	{
		bool		gotUniqueStartpoint = false;

		/*
		 * Force an XLOG file switch before the checkpoint, to ensure that the
		 * WAL segment the checkpoint is written to doesn't contain pages with
		 * old timeline IDs.  That would otherwise happen if you called
		 * pg_start_backup() right after restoring from a PITR archive: the
		 * first WAL segment containing the startup checkpoint has pages in
		 * the beginning with the old timeline ID.	That can cause trouble at
		 * recovery: we won't have a history file covering the old timeline if
		 * pg_xlog directory was not included in the base backup and the WAL
		 * archive was cleared too before starting the backup.
		 *
		 * This also ensures that we have emitted a WAL page header that has
		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
		 * compress out removable backup blocks, it won't remove any that
		 * occur after this point.
		 *
		 * During recovery, we skip forcing XLOG file switch, which means that
		 * the backup taken during recovery is not available for the special
		 * recovery case described above.
		 */
		if (!backup_started_in_recovery)
			RequestXLogSwitch();

		do
		{
			/*
			 * Force a CHECKPOINT.	Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs
			 * will have different checkpoint positions and hence different
			 * history file names, even if nothing happened in between.
			 *
			 * During recovery, establish a restartpoint if possible. We use
			 * the last restartpoint as the backup starting checkpoint. This
			 * means that two successive backup runs can have same checkpoint
			 * positions.
			 *
			 * Since the fact that we are executing do_pg_start_backup()
			 * during recovery means that checkpointer is running, we can use
			 * RequestCheckpoint() to establish a restartpoint.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
			 * passing fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));

			/*
			 * Now we need to fetch the checkpoint record location, and also
			 * its REDO pointer.  The oldest point in WAL that would be needed
			 * to restore starting from the checkpoint is precisely the REDO
			 * pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			LWLockRelease(ControlFileLock);

			/*
			 * If two base backups are started at the same time (in WAL sender
			 * processes), we need to make sure that they use different
			 * checkpoints as starting locations, because we use the starting
			 * WAL location as a unique identifier for the base backup in the
			 * end-of-backup WAL record and when we write the backup history
			 * file. Perhaps it would be better to generate a separate unique ID
			 * for each backup instead of forcing another checkpoint, but
			 * taking a checkpoint right after another is not that expensive
			 * either because only few buffers have been dirtied yet.
			 */
			LWLockAcquire(WALInsertLock, LW_SHARED);
			if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			LWLockRelease(WALInsertLock);
		} while (!gotUniqueStartpoint);

		XLByteToSeg(startpoint, _logId, _logSeg);
		XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);

		/*
		 * Construct backup label file
		 */
		initStringInfo(&labelfbuf);

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
		appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
						 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
		appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
						 checkpointloc.xlogid, checkpointloc.xrecoff);
		appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
						 exclusive ? "pg_start_backup" : "streamed");
		appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
						 backup_started_in_recovery ? "standby" : "master");
		appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
		appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);

		elogif(debug_basebackup, LOG, "basebackup label file --\n%s", labelfbuf.data);

		/*
		 * Okay, write the file, or return its contents to caller.
		 */
		if (exclusive)
		{
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackup above,
			 * maybe it would be OK to just unlink any such label file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
				fflush(fp) != 0 ||
				pg_fsync(fileno(fp)) != 0 ||
				ferror(fp) ||
				FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			pfree(labelfbuf.data);
		}
		else
			*labelfile = labelfbuf.data;
	}
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

	/*
	 * We're done.  As a convenience, return the starting WAL location.
	 */
	return startpoint;
}

/* Error cleanup callback for pg_start_backup */
static void
pg_start_backup_callback(int code, Datum arg)
{
	bool		exclusive = DatumGetBool(arg);

	/* Update backup counters and forcePageWrites on failure */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
	{
		Assert(XLogCtl->Insert.exclusiveBackup);
		XLogCtl->Insert.exclusiveBackup = false;
	}
	else
	{
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);
}

/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
	XLogRecData rdata;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
	char		backupfrom[20];
	uint32		_logId;
	uint32		_logSeg;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			seconds_before_warning;
	int			waits = 0;
	char	   *remaining;
	char	   *ptr;

	/* Currently backup during recovery not supported */
	backup_started_in_recovery = false;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
		 (errmsg("must be superuser or replication role to run a backup"))));

	if (!XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

	/*
	 * OK to update backup counters and forcePageWrites
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
		XLogCtl->Insert.exclusiveBackup = false;
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);

	if (exclusive)
	{
		/*
		 * Read the existing label file into memory.
		 */
		struct stat statbuf;
		int			r;

		if (stat(BACKUP_LABEL_FILE, &statbuf))
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is not in progress")));
		}

		lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
		if (!lfp)
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		}
		labelfile = palloc(statbuf.st_size + 1);
		r = fread(labelfile, statbuf.st_size, 1, lfp);
		labelfile[statbuf.st_size] = '\0';

		/*
		 * Close and remove the backup label file
		 */
		if (r != 1 || ferror(lfp) || FreeFile(lfp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		if (unlink(BACKUP_LABEL_FILE) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							BACKUP_LABEL_FILE)));
	}

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * Write the backup-end xlog record
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);

	elog(LOG, "Basebackup stop point is at %X/%X.",
			   stoppoint.xlogid, stoppoint.xrecoff);

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

	/*
	 * Write the backup history file
	 */
	XLByteToSeg(startpoint, _logId, _logSeg);
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
						  startpoint.xrecoff % XLogSegSize);
	fp = AllocateFile(histfilepath, "w");
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						histfilepath)));
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
	/* transfer remaining lines from label to history file */
	fprintf(fp, "%s", remaining);
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
						histfilepath)));

	/*
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
	 */
	CleanupBackupHistory();

	/*
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required
	 * WAL needs to be transported via streaming replication (hopefully
	 * with wal_keep_segments set high enough), or some more exotic
	 * mechanism like polling and copying files from pg_xlog with script.
	 * We have no knowledge of those mechanisms, so it's up to the user to
	 * ensure that he gets all the required WAL.
	 *
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic
	 * sorting property of the WAL files ensures any earlier WAL files are
	 * safely archived as well.
	 *
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, you can set statement_timeout.
	 */
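	/*
	 * The loop below polls XLogArchiveIsBusy() roughly once per second; a
	 * warning is logged after 60 seconds and then at doubling intervals.
	 */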
	if (XLogArchivingActive())
	{
	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
	XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);

	XLByteToSeg(startpoint, _logId, _logSeg);
	BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
						  startpoint.xrecoff % XLogSegSize);

	seconds_before_warning = 60;
	waits = 0;

	while (XLogArchiveIsBusy(lastxlogfilename) ||
		   XLogArchiveIsBusy(histfilename))
	{
		CHECK_FOR_INTERRUPTS();

		pg_usleep(1000000L);

		if (++waits >= seconds_before_warning)
		{
			seconds_before_warning *= 2;		/* This wraps in >10 years... */
			ereport(WARNING,
					(errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
							waits)));
		}
	}
	}
	else
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

	/*
	 * We're done.  As a convenience, return the ending WAL location.
	 */
	return stoppoint;
}

/*
 * do_pg_abort_backup: abort a running backup
 *
 * This does just the most basic steps of do_pg_stop_backup(), by taking the
 * system out of backup mode, thus making it much safer to call from
 * an error handler.
 *
 * NB: This is only for aborting a non-exclusive backup that doesn't write
 * backup_label. A backup started with pg_start_backup() needs to be finished
 * with pg_stop_backup().
 */
void
do_pg_abort_backup(void)
{
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
	XLogCtl->Insert.nonExclusiveBackups--;

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);
}


/*
 * pg_switch_xlog: switch to next xlog file
 */
Datum
pg_switch_xlog(PG_FUNCTION_ARGS)
{
	XLogRecPtr	switchpoint;
	char		location[MAXFNAMELEN];

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
			 (errmsg("must be superuser to switch transaction log files"))));

	switchpoint = RequestXLogSwitch();

	/*
	 * As a convenience, return the WAL location of the switch record
	 */
	snprintf(location, sizeof(location), "%X/%X",
			 switchpoint.xlogid, switchpoint.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS __attribute__((unused)) )
{
	char		location[MAXFNAMELEN];

	/* Make sure we have an up-to-date local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	snprintf(location, sizeof(location), "%X/%X",
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS __attribute__((unused)) )
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecPtr	current_recptr;
	char		location[MAXFNAMELEN];

	/*
	 * Get the current end-of-WAL position ... shared lock is sufficient
	 */
	LWLockAcquire(WALInsertLock, LW_SHARED);
	INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
	LWLockRelease(WALInsertLock);

	snprintf(location, sizeof(location), "%X/%X",
			 current_recptr.xlogid, current_recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Compute an xlog file name and decimal byte offset given a WAL location,
 * such as is returned by pg_stop_backup() or pg_switch_xlog().
 *
 * Note that a location exactly at a segment boundary is taken to be in
 * the previous segment.  This is usually the right thing, since the
 * expected usage is to determine which xlog file(s) are ready to archive.
 */
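/*
 * For example, assuming the default 16 MB segment size and timeline 1, the
 * location '0/A000058' yields file name "00000001000000000000000A" and
 * decimal offset 88 (0x58).
 */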
Datum
pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	uint32		xrecoff;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];
	Datum		values[2];
	bool		isnull[2];
	TupleDesc	resultTupleDesc;
	HeapTuple	resultHeapTuple;
	Datum		result;

	/*
	 * Read input and parse
	 */
	locationstr = text_to_cstring(location);

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not parse transaction log location \"%s\"",
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	/*
	 * Construct a tuple descriptor for the result row.  This must match this
	 * function's pg_proc entry!
	 */
	resultTupleDesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
					   INT4OID, -1, 0);

	resultTupleDesc = BlessTupleDesc(resultTupleDesc);

	/*
	 * xlogfilename
	 */
	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

	values[0] = CStringGetTextDatum(xlogfilename);
	isnull[0] = false;

	/*
	 * offset
	 */
	xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;

	values[1] = UInt32GetDatum(xrecoff);
	isnull[1] = false;

	/*
	 * Tuple jam: Having first prepared your Datums, then squash together
	 */
	resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);

	result = HeapTupleGetDatum(resultHeapTuple);

	PG_RETURN_DATUM(result);
}

/*
 * Compute an xlog file name given a WAL location,
 * such as is returned by pg_stop_backup() or pg_switch_xlog().
 */
Datum
pg_xlogfile_name(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];

	locationstr = text_to_cstring(location);

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not parse transaction log location \"%s\"",
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
}

/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
 * identified by the label file, NOT what pg_control says.	This avoids the
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found (and fills the checkpoint
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns FALSE if not. If this backup_label came from a
 * streamed backup, *backupEndRequired is set to TRUE.
 */
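/*
 * For reference, a label written by do_pg_start_backup() above looks roughly
 * like this (values are illustrative):
 *
 *		START WAL LOCATION: 0/A000058 (file 00000001000000000000000A)
 *		CHECKPOINT LOCATION: 0/A000090
 *		BACKUP METHOD: streamed
 *		BACKUP FROM: master
 *		START TIME: 2009-06-26 20:29:04 PDT
 *		LABEL: nightly
 */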
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];

	*backupEndRequired = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION, CHECKPOINT and BACKUP_METHOD
	 * lines (this code is pretty crude, but we are not expecting any variability
	 * in the file format).
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
			   startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
			   &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		/* Only the streamed backup method is supported */
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
		else
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));

	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		/* Backup from standby is not supported */
		if (strcmp(backupfrom, "master") != 0)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	}

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}

/*
 * Get latest redo apply position.
 *
 * Optionally, returns the current recovery target timeline. Callers not
 * interested in that may pass NULL for targetTLI.
 *
 * Exported to allow WAL receiver to read the pointer directly.
 */
XLogRecPtr
GetXLogReplayRecPtr(TimeLineID *targetTLI)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->lastReplayedEndRecPtr;
	if (targetTLI)
		*targetTLI = xlogctl->RecoveryTargetTLI;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

/*
 * Get current standby flush position, ie, the last WAL position
 * known to be fsync'd to disk in standby.
 *
 * If 'targetTLI' is not NULL, it's set to the current recovery target
 * timeline.
 */
XLogRecPtr
GetStandbyFlushRecPtr(TimeLineID *targetTLI)
{
	XLogRecPtr      receivePtr;
	XLogRecPtr      replayPtr;

	receivePtr = GetWalRcvWriteRecPtr(NULL);
	replayPtr = GetXLogReplayRecPtr(targetTLI);

	if (XLByteLT(receivePtr, replayPtr))
		return replayPtr;
	else
		return receivePtr;
}

/*
 * GetRecoveryTargetTLI - get the current recovery target timeline ID
 */
TimeLineID
GetRecoveryTargetTLI(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	TimeLineID result;

	SpinLockAcquire(&xlogctl->info_lck);
	result = xlogctl->RecoveryTargetTLI;
	SpinLockRelease(&xlogctl->info_lck);

	return result;
}

/*
 * Error context callback for errors occurring during rm_redo().
 */
static void
rm_redo_error_callback(void *arg)
{
	RedoErrorCallBack *redoErrorCallBack = (RedoErrorCallBack*) arg;
	StringInfoData buf;

	initStringInfo(&buf);
	RmgrTable[redoErrorCallBack->record->xl_rmid].rm_desc(
												   &buf,
												   redoErrorCallBack->location,
												   redoErrorCallBack->record);

	/* don't bother emitting empty description */
	if (buf.len > 0)
		errcontext("xlog redo %s", buf.data);

	pfree(buf.data);
}

#if 0 /* GPDB doesn't have online backup */
/*
 * BackupInProgress: check if online backup mode is active
 *
 * This is done by checking for existence of the "backup_label" file.
 */
bool
BackupInProgress(void)
{
	struct stat stat_buf;

	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}

/*
 * CancelBackup: rename the "backup_label" file to cancel backup mode
 *
 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
 * Note that this will render an online backup in progress useless.
 * To correctly finish an online backup, pg_stop_backup must be called.
 */
void
CancelBackup(void)
{
	struct stat stat_buf;

	/* if the file is not there, return */
	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
		return;

	/* remove leftover file from previously cancelled backup if it exists */
	unlink(BACKUP_LABEL_OLD);

	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
	{
		ereport(LOG,
				(errmsg("online backup mode cancelled"),
				 errdetail("\"%s\" was renamed to \"%s\".",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
	}
	else
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not cancelled"),
				 errdetail("Could not rename \"%s\" to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
	}
}
#endif

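/*
 * Format an XLOG location into the caller-supplied buffer.  The long format
 * additionally shows the segment number and the byte offset within the
 * segment.
 */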
static char *
XLogLocationToBuffer(char *buffer, XLogRecPtr *loc, bool longFormat)
{

	if (longFormat)
	{
		uint32 seg = loc->xrecoff / XLogSegSize;
		uint32 offset = loc->xrecoff % XLogSegSize;
		sprintf(buffer,
			    "%X/%X (==> seg %d, offset 0x%X)",
			    loc->xlogid, loc->xrecoff,
			    seg, offset);
	}
	else
		sprintf(buffer,
			    "%X/%X",
			    loc->xlogid, loc->xrecoff);

	return buffer;
}

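/*
 * Rotating static buffers: each XLogLocationToString variant below has its
 * own buffer, so up to five locations can be formatted within a single
 * elog()/printf() call without overwriting one another.
 */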
static char xlogLocationBuffer[50];
static char xlogLocationBuffer2[50];
static char xlogLocationBuffer3[50];
static char xlogLocationBuffer4[50];
static char xlogLocationBuffer5[50];

char *
XLogLocationToString(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString2(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer2, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString3(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer3, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString4(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer4, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString5(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer5, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer, loc, true);
}

char *
XLogLocationToString2_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer2, loc, true);
}

char *
XLogLocationToString3_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer3, loc, true);
}

char *
XLogLocationToString4_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer4, loc, true);
}

char *
XLogLocationToString5_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer5, loc, true);
}


/* ------------------------------------------------------
 *	Startup Process main entry point and signal handlers
 * ------------------------------------------------------
 */

/*
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 */
static void
startupproc_quickdie(SIGNAL_ARGS)
{
	PG_SETMASK(&BlockSig);

	/*
	 * We DO NOT want to run proc_exit() callbacks -- we're here because
	 * shared memory may be corrupted, so we don't want to try to clean up our
	 * transaction.  Just nail the windows shut and get out of town.  Now that
	 * there's an atexit callback to prevent third-party code from breaking
	 * things by calling exit() directly, we have to reset the callbacks
	 * explicitly to make this work as intended.
	 */
	on_exit_reset();

	/*
	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	exit(2);
}

/* SIGUSR2: set flag to finish recovery */
static void
StartupProcTriggerHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	WakeupRecovery();

	errno = save_errno;
}

/* SIGUSR1: let latch facility handle the signal */
static void
StartupProcSigUsr1Handler(SIGNAL_ARGS)
{
	latch_sigusr1_handler();
}

/* SIGHUP: set flag to re-read config file at next convenient time */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	got_SIGHUP = true;
	WakeupRecovery();

	errno = save_errno;
}

/* SIGTERM: set flag to abort redo and exit */
static void
StartupProcShutdownHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

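	/*
	 * If we are currently waiting on a restore_command, exit right away;
	 * otherwise just set the flag and let the main redo loop exit at a
	 * safe point.
	 */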
	if (in_restore_command)
		proc_exit(1);
	else
		shutdown_requested = true;
	WakeupRecovery();

	errno = save_errno;
}

/* Handle SIGHUP and SIGTERM signals of startup process */
void
HandleStartupProcInterrupts(void)
{
	/*
	 * Check if we were requested to re-read config file.
	 */
	if (got_SIGHUP)
	{
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
	}

	/*
	 * Check if we were requested to exit without finishing recovery.
	 */
	if (shutdown_requested)
		proc_exit(1);

	/*
	 * Emergency bailout if postmaster has died.  This is to avoid the
	 * necessity for manual cleanup of all postmaster children.
	 */
	if (IsUnderPostmaster && !PostmasterIsAlive(true))
		exit(1);
}

static void
HandleCrash(SIGNAL_ARGS)
{
	/*
	 * HandleCrash is registered as the signal handler for SIGILL, SIGBUS and
	 * SIGSEGV.  It simply calls the standard handler, which logs the signal
	 * and re-raises it if needed.
	 */
	StandardHandlerForSigillSigsegvSigbus_OnMainThread("a startup process", PASS_SIGNAL_ARGS);
}

/* Main entry point for startup process */
void
StartupProcessMain(int passNum)
{
	am_startup = true;
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 */
	pqsignal(SIGHUP, StartupProcSigHupHandler);	 /* reload config file */
	pqsignal(SIGINT, SIG_IGN);					/* ignore query cancel */
	pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, StartupProcSigUsr1Handler);
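	/* Only the first startup pass treats SIGUSR2 as the finish-recovery trigger. */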
	if (passNum == 1)
		pqsignal(SIGUSR2, StartupProcTriggerHandler);
	else
		pqsignal(SIGUSR2, SIG_IGN);

#ifdef SIGBUS
	pqsignal(SIGBUS, HandleCrash);
#endif
#ifdef SIGILL
	pqsignal(SIGILL, HandleCrash);
#endif
#ifdef SIGSEGV
	pqsignal(SIGSEGV, HandleCrash);
#endif

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	StartupXLOG();

	BuildFlatFiles(false);

	/*
	 * Exit normally. Exit code 0 tells postmaster that we completed
	 * recovery successfully.
	 */
	proc_exit(0);
}

/*
 * This routine recovers the pg_control flat file on the mirror side:
 *		a) it copies the pg_control file from the primary to the mirror
 *		b) the pg_control file on the mirror is overwritten
 *
 * A nonzero return value indicates an error; in that case segmentState
 * will be set to Fault.
 */
int
XLogRecoverMirrorControlFile(void)
{
	MirroredFlatFileOpen	mirroredOpen;
	int						retval = 0;

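	/*
	 * The while (1) below runs at most once; "break" is used so that any
	 * failing step falls straight through to the common return path.
	 */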
	while (1) {

		ReadControlFile();

		retval = MirroredFlatFile_Open(
							  &mirroredOpen,
							  XLOG_CONTROL_FILE_SUBDIR,
							  XLOG_CONTROL_FILE_SIMPLE,
							  O_CREAT | O_RDWR | PG_BINARY,
							  S_IRUSR | S_IWUSR,
							  /* suppressError */ false,
							  /* atomic operation */ false,
11331
							  /* isMirrorRecovery */ TRUE);
11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354
		if (retval != 0)
			break;

		retval = MirroredFlatFile_Write(
							   &mirroredOpen,
							   0,
							   ControlFile,
							   PG_CONTROL_SIZE,
							   /* suppressError */ false);
		if (retval != 0)
			break;

		retval = MirroredFlatFile_Flush(
							   &mirroredOpen,
							   /* suppressError */ false);
		if (retval != 0)
			break;

		MirroredFlatFile_Close(&mirroredOpen);
		break;
	}	/* while (1) */

	return retval;
}

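/*
 * Copy all xlog segment files (recognized by their 24-character hex names)
 * to the mirror.  Returns 0 on success, or the first nonzero error code
 * returned by MirrorFlatFile().
 */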
int
XLogRecoverMirror(void)
{
	DIR		   *cldir;
	struct dirent *clde;
	int			retval = 0;
	char	   *xlogDir = makeRelativeToTxnFilespace(XLOGDIR);

	cldir = AllocateDir(xlogDir);
	while ((clde = ReadDir(cldir, xlogDir)) != NULL)
	{
		if (strlen(clde->d_name) == 24 &&
			strspn(clde->d_name, "0123456789ABCDEF") == 24)
		{
			retval = MirrorFlatFile(XLOGDIR, clde->d_name);

			if (retval != 0)
				break;
		}
	}
	FreeDir(cldir);
	pfree(xlogDir);

	return retval;
}

/*
 * Check to see whether a promote request has arrived.  If so, request the
 * postmaster to shut down walreceiver, wait for it to exit, and return true.
 */
static bool
CheckForStandbyTrigger(void)
{
	static bool triggered = false;

	if (triggered)
		return true;

	if (CheckPromoteSignal(true))
	{
		ereport(LOG,
				(errmsg("received promote request")));
		ShutdownWalRcv();
		triggered = true;
		return true;
	}

	return false;
}

/*
 * Check to see if a promote request has arrived. Should be
 * called by postmaster after receiving SIGUSR1.
 */
bool
CheckPromoteSignal(bool do_unlink)
{
	struct stat stat_buf;

	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
	{
		/*
		 * Since we are in a signal handler, it's not safe to elog. We
		 * silently ignore any error from unlink.
		 */
		if (do_unlink)
			unlink(PROMOTE_SIGNAL_FILE);
		return true;
	}
	return false;
}

/*
 * Put the current standby master dbid in shared memory, where it will
 * be looked up from mmxlog.
 */
void
SetStandbyDbid(int16 dbid)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->standbyDbid = dbid;
	SpinLockRelease(&xlogctl->info_lck);

	/*
	 * Let postmaster know we've changed standby dbid.
	 */
	SendPostmasterSignal(PMSIGNAL_SEGCONFIG_CHANGE);
}

/*
 * Returns current standby dbid.
 */
int16
GetStandbyDbid(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	int16	dbid;

	SpinLockAcquire(&xlogctl->info_lck);
	dbid = xlogctl->standbyDbid;
	SpinLockRelease(&xlogctl->info_lck);

	return dbid;
}

/*
 * True if we are running standby-mode continuous recovery.
 * Note this returns false once recovery has finished, even if we are still
 * on the standby master with a primary master running.  Also, this only
 * works in the startup process, as the StandbyMode flag is not in shared
 * memory.
 */
bool
IsStandbyMode(void)
{
	return StandbyMode;
}

static void
GetXLogCleanUpTo(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg)
{
#ifndef USE_SEGWALREP
	/* Only the master checks this GUC and acts on it */
	if (GpIdentity.segindex == MASTER_CONTENT_ID)
	{
#endif
	/*
	 * See if we have a live WAL sender and whether it has a start xlog
	 * location (from an active basebackup) or a standby fsync location
	 * (from an active standby).  Compare that with the previous checkpoint
	 * location and use the minimum of the two to determine up to what point
	 * we need to keep the xlog segment files.
	 */
	XLogRecPtr xlogCleanUpTo = WalSndCtlGetXLogCleanUpTo();
	if (!XLogRecPtrIsInvalid(xlogCleanUpTo))
	{
		if (XLByteLT(recptr, xlogCleanUpTo))
			xlogCleanUpTo = recptr;
	}
	else
		xlogCleanUpTo = recptr;

	CheckKeepWalSegments(xlogCleanUpTo, _logId, _logSeg);
#ifndef USE_SEGWALREP
	}
#endif
}

/*
 * Checks whether the current buffer page and the backup page stored in the
 * WAL record are consistent.  Before the two pages are compared, masking is
 * applied to ignore certain areas such as hint bits and the unused space
 * between pd_lower and pd_upper, among other things.  This function should
 * be called once WAL replay has been completed for a given record.
 */
static void
checkXLogConsistency(XLogRecord *record, XLogRecPtr EndRecPtr)
{
	RmgrId		rmid = record->xl_rmid;
	char       *blk;

	/* Records with no backup blocks have no need for consistency checks. */
	if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
		return;

	Assert((record->xl_extended_info & XLR_CHECK_CONSISTENCY) != 0);

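	/* Backup blocks, if any, are stored immediately after the record's main data. */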
	blk = (char *) XLogRecGetData(record) + record->xl_len;
	for (int i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		BkpBlock    bkpb;
		Buffer		buf;
		Page		page;
		char       *src_buffer;

		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
		{
			/*
			 * WAL record doesn't contain this block; do nothing.
			 */
			continue;
		}

		memcpy(&bkpb, blk, sizeof(BkpBlock));
		blk += sizeof(BkpBlock);
		src_buffer = blk;
		/* move on to point to next block */
		blk += BLCKSZ - bkpb.hole_length;

		if (bkpb.block_info & BLOCK_APPLY)
		{
			/*
			 * WAL record has already applied the page, so bypass the
			 * consistency check as that would result in comparing the full
			 * page stored in the record with itself.
			 */
			continue;
		}

		/*
		 * Read the contents from the current buffer and store it in a
		 * temporary page.
		 */
		buf = XLogReadBuffer(bkpb.node, bkpb.block, false);
		if (!BufferIsValid(buf))
			continue;

		page = BufferGetPage(buf);

		/*
		 * Take a copy of the local page where WAL has been applied to have a
		 * comparison base before masking it...
		 */
		memcpy(replay_image_masked, page, BLCKSZ);

		/* No need for this page anymore now that a copy has been taken. */
		UnlockReleaseBuffer(buf);

		/*
		 * If the block LSN is already ahead of this WAL record, we can't
		 * expect contents to match.  This can happen if recovery is
		 * restarted.
		 */
		if (XLByteLT(EndRecPtr, PageGetLSN(replay_image_masked)))
			continue;

		/*
		 * Read the contents from the backup copy, stored in WAL record and
		 * store it in a temporary page. There is no need to allocate a new
		 * page here, a local buffer is fine to hold its contents and a mask
		 * can be directly applied on it.
		 */
		if (bkpb.hole_length == 0)
		{
			memcpy((char *) master_image_masked, src_buffer, BLCKSZ);
		}
		else
		{
			/* zero-fill the hole; it gets masked out anyway */
			MemSet((char *) master_image_masked, 0, BLCKSZ);
			memcpy((char *) master_image_masked, src_buffer, bkpb.hole_offset);
			memcpy((char *) master_image_masked + (bkpb.hole_offset + bkpb.hole_length),
				   src_buffer + bkpb.hole_offset,
				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
		}

		/*
		 * If masking function is defined, mask both the master and replay
		 * images
		 */
		if (RmgrTable[rmid].rm_mask != NULL)
		{
			RmgrTable[rmid].rm_mask(replay_image_masked, bkpb.block);
			RmgrTable[rmid].rm_mask(master_image_masked, bkpb.block);
		}

		/* Time to compare the master and replay images. */
		if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
		{
			elog(FATAL,
				 "inconsistent page found, rel %u/%u/%u, blkno %u",
				 bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode,
				 bkpb.block);
		}
		else
		{
			elog(DEBUG1,
				 "Consistent page for rel %u/%u/%u, blkno %u",
				 bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode,
				 bkpb.block);
		}
	}
}

/*
 * Wake up startup process to replay newly arrived WAL, or to notice that
 * failover has been requested.
 */
void
WakeupRecovery(void)
{
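	/* SetLatch is safe to call from a signal handler and from other processes. */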
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}