xlog.c 391.4 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 *
 * xlog.c
4
 *		PostgreSQL transaction log manager
5 6
 *
 *
B
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.345 2009/06/26 20:29:04 tgl Exp $
11 12 13
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <ctype.h>
T
Tom Lane 已提交
18
#include <signal.h>
19
#include <time.h>
20
#include <fcntl.h>
21
#include <sys/stat.h>
22
#include <sys/time.h>
23 24
#include <sys/wait.h>
#include <unistd.h>
25

26
#include "access/clog.h"
27
#include "access/multixact.h"
28
#include "access/distributedlog.h"
29
#include "access/subtrans.h"
30
#include "access/transam.h"
31
#include "access/tuptoaster.h"
32
#include "access/twophase.h"
33
#include "access/xact.h"
34
#include "access/xlog_internal.h"
35
#include "access/xlogmm.h"
36
#include "access/xlogdefs.h"
37
#include "access/xlogutils.h"
38
#include "catalog/catalog.h"
39
#include "catalog/catversion.h"
H
Heikki Linnakangas 已提交
40
#include "catalog/pg_authid.h"
T
Tom Lane 已提交
41
#include "catalog/pg_control.h"
42
#include "catalog/pg_type.h"
43 44 45
#include "catalog/pg_database.h"
#include "catalog/pg_tablespace.h"
#include "executor/spi.h"
46
#include "funcapi.h"
47 48
#include "libpq/pqsignal.h"
#include "libpq/hba.h"
49
#include "miscadmin.h"
50
#include "pgstat.h"
51
#include "postmaster/bgwriter.h"
52
#include "postmaster/postmaster.h"
53
#include "storage/bufpage.h"
54
#include "storage/bufmgr.h"
55
#include "storage/fd.h"
56
#include "storage/ipc.h"
57
#include "storage/latch.h"
58
#include "storage/pmsignal.h"
59
#include "storage/procarray.h"
60
#include "storage/smgr.h"
61
#include "storage/spin.h"
62
#include "utils/builtins.h"
63
#include "utils/nabstime.h"
64 65 66 67 68 69 70 71 72
#include "utils/faultinjector.h"
#include "utils/flatfiles.h"
#include "utils/guc.h"
#include "utils/ps_status.h"
#include "pg_trace.h"
#include "utils/catcache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/pg_crc.h"
73
#include "utils/ps_status.h"
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/backendid.h"
#include "storage/sinvaladt.h"

#include "cdb/cdbtm.h"
#include "cdb/cdbfilerep.h"
#include "cdb/cdbfilerepresyncmanager.h"
#include "cdb/cdbvars.h"
#include "cdb/cdbpersistentrelation.h"
#include "cdb/cdbmirroredflatfile.h"
#include "cdb/cdbpersistentrecovery.h"
#include "cdb/cdbresynchronizechangetracking.h"
#include "cdb/cdbpersistentfilesysobj.h"
#include "cdb/cdbpersistentcheck.h"
89
#include "utils/resscheduler.h"
90
#include "utils/snapmgr.h"
91

92
extern uint32 bootstrap_data_checksum_version;
93

94 95 96
/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"
97
#define PROMOTE_SIGNAL_FILE "promote"
98 99


T
Tom Lane 已提交
100 101
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
102
int			XLOGbuffers = 8;
103
int			XLogArchiveTimeout = 0;
104
bool		XLogArchiveMode = false;
105
char	   *XLogArchiveCommand = NULL;
106
bool		fullPageWrites = true;
107 108
char   *wal_consistency_checking_string = NULL;
bool   *wal_consistency_checking = NULL;
109
bool		log_checkpoints = false;
110
int			sync_method = DEFAULT_SYNC_METHOD;
T
Tom Lane 已提交
111

112 113 114 115
#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

116
/*
117 118 119 120 121
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
B
Bruce Momjian 已提交
122
 * 2*CheckPointSegments+1.	Under normal conditions, a checkpoint will free
123 124 125
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
126 127 128
 */
#define XLOGfileslop	(2*CheckPointSegments + 1)

129
bool am_startup = false;
130

131
/*
132
 * GUC support
133
 */
134
const struct config_enum_entry sync_method_options[] = {
135
	{"fsync", SYNC_METHOD_FSYNC, false},
136
#ifdef HAVE_FSYNC_WRITETHROUGH
137
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
138 139
#endif
#ifdef HAVE_FDATASYNC
140
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
141 142
#endif
#ifdef OPEN_SYNC_FLAG
143
	{"open_sync", SYNC_METHOD_OPEN, false},
144 145
#endif
#ifdef OPEN_DATASYNC_FLAG
146
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
147
#endif
148
	{NULL, 0, false}
149
};
T
Tom Lane 已提交
150

151 152 153 154 155 156 157
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the background writer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

T
Tom Lane 已提交
158
/*
159 160
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
T
Tom Lane 已提交
161
 */
162
TimeLineID	ThisTimeLineID = 0;
V
WAL  
Vadim B. Mikheev 已提交
163

164
/*
165
 * Are we doing recovery from XLOG?
166
 *
167 168 169 170 171
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
172 173
 * process you're running in, use RecoveryInProgress().
 */
T
Tom Lane 已提交
174
bool		InRecovery = false;
B
Bruce Momjian 已提交
175

176 177 178 179 180
/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
 * known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;
B
Bruce Momjian 已提交
181

182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;

197
/* Was the last xlog file restored from archive, or local? */
B
Bruce Momjian 已提交
198
static bool restoredFromArchive = false;
199

200
/* options taken from recovery.conf */
201
#ifdef NOT_USED
202
static char *recoveryRestoreCommand = NULL;
203
#endif
204
static char *recoveryEndCommand = NULL;
205 206 207
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
B
Bruce Momjian 已提交
208
static TransactionId recoveryTargetXid;
209
static TimestampTz recoveryTargetTime;
210
static TimestampTz recoveryLastXTime = 0;
211

212 213 214
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;

215 216 217 218 219 220 221
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyModeRequested = false;
static char *PrimaryConnInfo = NULL;

/* are we currently in standby mode? */
bool StandbyMode = false;

222
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
B
Bruce Momjian 已提交
223
static TransactionId recoveryStopXid;
224
static TimestampTz recoveryStopTime;
B
Bruce Momjian 已提交
225
static bool recoveryStopAfter;
226 227 228 229 230 231 232 233 234 235 236 237 238

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 * its known parents, newest first (so recoveryTargetTLI is always the
B
Bruce Momjian 已提交
239
 * first list member).	Only these TLIs are expected to be seen in the WAL
240 241 242 243 244 245 246 247 248
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
B
Bruce Momjian 已提交
249
static TimeLineID recoveryTargetTLI;
250
List *expectedTLIs;
B
Bruce Momjian 已提交
251
static TimeLineID curFileTLI;
252

T
Tom Lane 已提交
253 254
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
255 256 257 258
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
T
Tom Lane 已提交
259 260
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
261

262
XLogRecPtr	XactLastRecEnd = {0, 0};
263

264 265 266 267
static uint32 ProcLastRecTotalLen = 0;

static uint32 ProcLastRecDataLen = 0;

268 269
static XLogRecPtr InvalidXLogRecPtr = {0, 0};

T
Tom Lane 已提交
270 271 272
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
273
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
274
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
B
Bruce Momjian 已提交
275
 * hold the Insert lock).  See XLogInsert for details.	We are also allowed
276
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
277 278
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
T
Tom Lane 已提交
279
 */
280
static XLogRecPtr RedoRecPtr;
281

282 283 284 285 286 287 288 289 290 291 292 293
/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = {0, 0};

T
Tom Lane 已提交
294 295 296 297 298 299 300 301 302
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
303
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
304 305 306
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
307 308 309 310
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
311 312 313
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
314 315
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
316 317 318
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
319
 * but is updated when convenient.	Again, it exists for the convenience of
320
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
321 322 323 324 325 326 327 328 329 330
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
331 332 333 334 335 336 337 338 339 340 341 342 343
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
344
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
345 346
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the bgwriter, this is just pro forma).
347
 *
T
Tom Lane 已提交
348 349
 *----------
 */
350

T
Tom Lane 已提交
351
typedef struct XLogwrtRqst
352
{
T
Tom Lane 已提交
353 354
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
355
} XLogwrtRqst;
356

357 358 359 360 361 362
/*
 * Result positions: the byte positions the log has already been written out
 * and flushed up to.  Same layout as XLogwrtRqst; declared separately to
 * mark the different role.
 */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

T
Tom Lane 已提交
363 364 365
/*
 * Shared state data for XLogInsert.
 */
366 367
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
368 369
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
370
	int			curridx;		/* current block index in cache */
B
Bruce Momjian 已提交
371 372 373
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
374
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
375 376 377 378 379 380 381 382 383 384 385

	/*
	 * exclusiveBackup is true if a backup started with pg_start_backup() is
	 * in progress, and nonExclusiveBackups is a counter indicating the number
	 * of streaming base backups currently in progress. forcePageWrites is set
	 * to true when either of these is non-zero. lastBackupStart is the latest
	 * checkpoint redo location used as a starting point for an online backup.
	 */
	bool		exclusiveBackup;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;
386 387
} XLogCtlInsert;

T
Tom Lane 已提交
388 389 390
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
391 392
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
393 394
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	int			curridx;		/* cache index of next block to write */
395
	pg_time_t	lastSegSwitchTime;		/* time of last xlog segment switch */
396 397
} XLogCtlWrite;

T
Tom Lane 已提交
398 399 400
/*
 * Total shared-memory state for XLOG.
 */
401 402
typedef struct XLogCtlData
{
403
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
404
	XLogCtlInsert Insert;
405

T
Tom Lane 已提交
406
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
407 408
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
409 410
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
B
Bruce Momjian 已提交
411
	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
412 413
	uint32		lastRemovedLog; /* latest removed/recycled XLOG segment */
	uint32		lastRemovedSeg;
414

415
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
416 417
	XLogCtlWrite Write;

418 419 420 421 422 423 424 425 426 427 428
	/* Protected by ChangeTrackingTransitionLock. */
	XLogRecPtr	lastChangeTrackingEndLoc;
								/*
								 * End + 1 of the last XLOG record inserted and
 								 * (possible) change tracked.
 								 */

	/* Resynchronize */
	bool		sendingResynchronizeTransitionMsg;
	slock_t		resynchronize_lck;		/* locks shared variables shown above */

T
Tom Lane 已提交
429
	/*
B
Bruce Momjian 已提交
430 431 432
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  Permission to read/write the pages
	 * and xlblocks values depends on WALInsertLock and WALWriteLock.
T
Tom Lane 已提交
433
	 */
B
Bruce Momjian 已提交
434
	char	   *pages;			/* buffers for unwritten XLOG pages */
435
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
436
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
437
	TimeLineID	ThisTimeLineID;
T
Tom Lane 已提交
438

439
	/*
440
	 * SharedRecoveryInProgress indicates if we're still in crash or archive
441
	 * recovery.  Protected by info_lck.
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457
	 */
	bool		SharedRecoveryInProgress;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * the standby's dbid when it runs.  Used in mmxlog to emit standby filepath.
	 * Protected by info_lck
	 */
	int16		standbyDbid;

458
	/*
459 460
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * Used by the background writer when it wants to create a restartpoint.
461 462 463 464 465 466
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;

467 468 469 470 471 472 473 474 475 476 477
	/*
	 * Save the location of the last checkpoint record to enable supressing
	 * unnecessary checkpoint records -- when no new xlog has been written
	 * since the last one.
	 */
	bool 		haveLastCheckpointLoc;
	XLogRecPtr	lastCheckpointLoc;
	XLogRecPtr	lastCheckpointEndLoc;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
478 479 480
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
481 482
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
483 484 485
	XLogRecPtr	replayEndRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520

	/* current effective recovery target timeline */
	TimeLineID	RecoveryTargetTLI;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;

	/*
	 * Save the redo range used in Pass 1 recovery so it can be used in subsequent passes.
	 */
	bool		multipleRecoveryPassesNeeded;
	XLogRecPtr	pass1StartLoc;
	XLogRecPtr	pass1LastLoc;
	XLogRecPtr	pass1LastCheckpointLoc;

	/*=================Pass 4 PersistentTable-Cat verification================*/
	/*If true integrity checks will be performed in Pass4.*/
	bool		integrityCheckNeeded;

	/*
	 * Currently set database and tablespace to be verified for database specific
	 * PT-Cat verification in Pass4. These fields also act as implicit flags
	 * PT-Cat which indicate if there are any more databases to perform
	 * PT-Cat verifications checks on.
	 */
	Oid			currentDatabaseToVerify;
	Oid			tablespaceOfCurrentDatabaseToVerify;

	/*Indicates if pass4 PT-Cat verification checks passed*/
	bool		pass4_PTCatVerificationPassed;
	/*==========Pass 4 PersistentTable-Cat verification End===================*/

521 522
} XLogCtlData;

523
static XLogCtlData *XLogCtl = NULL;
524

525
/*
T
Tom Lane 已提交
526
 * We maintain an image of pg_control in shared memory.
527
 */
528
static ControlFileData *ControlFile = NULL;
529

530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
/*
 * Remembered copies of three checkpoint-related locations: the last
 * checkpoint record, the previous checkpoint record, and the REDO start
 * point of the checkpoint copy.
 *
 * NOTE(review): presumably these mirror the matching pg_control fields and
 * are compared against them by ControlFileWatcherCheckForChange() -- confirm
 * against that function.
 */
typedef struct ControlFileWatch
{
	/* presumably set once the initial values have been saved -- confirm */
	bool		watcherInitialized;
	XLogRecPtr	current_checkPointLoc;		/* current last check point record ptr */
	XLogRecPtr	current_prevCheckPointLoc;  /* current previous check point record ptr */
	XLogRecPtr	current_checkPointCopy_redo;
								/* current checkpointCopy value for
								 * next RecPtr available when we began to
								 * create CheckPoint (i.e. REDO start point) */

} ControlFileWatch;


/*
 * We keep the watcher in shared memory.
 */
static ControlFileWatch *ControlFileWatcher = NULL;

T
Tom Lane 已提交
548 549 550 551 552
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
553

T
Tom Lane 已提交
554 555
/*
 * Free space remaining in the current xlog page buffer.
 *
 * Insert is a pointer to the insert state; the result is XLOG_BLCKSZ minus
 * the distance from the start of the current page (currpage) to the current
 * insertion point (currpos).  Insert is evaluated twice.
 */
#define INSERT_FREESPACE(Insert)  \
	(XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * recptr is assigned in place.  XLogCtl->xlblocks[curridx] holds the
 * position just past the end of the page in buffer slot curridx, so the
 * insertion point is that position minus the free space left in the page.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)

/*
 * Buffer slot preceding idx; slot 0 wraps around to the highest allocated
 * slot (XLogCtl->XLogCacheBlck).  idx may be evaluated twice.
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/*
 * Buffer slot following idx; the highest allocated slot
 * (XLogCtl->XLogCacheBlck) wraps around to slot 0.  idx may be evaluated
 * twice.
 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
571

T
Tom Lane 已提交
572 573 574 575
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
576
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
577

578 579 580 581 582 583 584 585 586
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  These are chosen so that they can be OR'd together
 * in a bitmask state variable.
 */
#define XLOG_FROM_ARCHIVE		(1<<0)	/* Restored using restore_command */
#define XLOG_FROM_PG_XLOG		(1<<1)	/* Existing file in pg_xlog */
#define XLOG_FROM_STREAM		(1<<2)	/* Streamed from master */

T
Tom Lane 已提交
587 588 589 590 591 592
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
593
static MirroredFlatFileOpen	mirroredLogFileOpen = MirroredFlatFileOpen_Init;
T
Tom Lane 已提交
594 595 596
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
597

T
Tom Lane 已提交
598 599 600 601
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
602 603 604
 * will be just past that page.  readLen indicates how much of the current
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
T
Tom Lane 已提交
605
 */
606 607 608 609
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626
static uint32 readLen = 0;
static int	readSource = 0;		/* XLOG_FROM_* code */

/*
 * Keeps track of which sources we've tried to read the current WAL
 * record from and failed.
 */
static int	failedSources = 0;	/* OR of XLOG_FROM_* codes */

/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.)
 */
static TimestampTz XLogReceiptTime = 0;
static int	XLogReceiptSource = 0;		/* XLOG_FROM_* code */
B
Bruce Momjian 已提交
627

628
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
T
Tom Lane 已提交
629
static char *readBuf = NULL;
B
Bruce Momjian 已提交
630

631 632 633 634
/* Buffer for current ReadRecord result (expandable) */
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;

T
Tom Lane 已提交
635
/* State information for XLOG reading */
B
Bruce Momjian 已提交
636 637
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
638
static XLogRecord *nextRecord = NULL;
639
static TimeLineID lastPageTLI = 0;
640
static TimeLineID lastSegmentTLI = 0;
641

642 643 644
static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
645

V
WAL  
Vadim B. Mikheev 已提交
646 647
static bool InRedo = false;

648
/*
649
 * Flags set by interrupt handlers for later service in the redo loop.
650 651 652 653 654 655 656 657 658
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;

/*
 * Flag set when executing a restore command, to tell SIGTERM signal handler
 * that it's safe to just proc_exit.
 */
static volatile sig_atomic_t in_restore_command = false;
659 660


661 662
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
663 664
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
665
static void XLogArchiveCleanup(const char *xlog);
666
static void exitArchiveRecovery(TimeLineID endTLI,
B
Bruce Momjian 已提交
667
					uint32 endLogId, uint32 endLogSeg);
668
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
669
static void LocalSetXLogInsertAllowed(void);
670
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
671
static void Checkpoint_RecoveryPass(XLogRecPtr checkPointRedo);
T
Tom Lane 已提交
672

673
static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
674 675 676
							bool wal_check_consistency_enabled,
							XLogRecPtr *lsn, BkpBlock *bkpb);
static void RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
677
				char *blk, bool get_cleanup_lock, bool keep_buffer);
678

679 680
static bool AdvanceXLInsertBuffer(bool new_segment);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
681 682 683
static void XLogFileInit(
			 MirroredFlatFileOpen *mirroredOpen,
			 uint32 log, uint32 seg,
B
Bruce Momjian 已提交
684
			 bool *use_existent, bool use_lock);
685 686
static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
687
					   bool use_lock, char *tmpsimpleFileName);
688
static void XLogFileClose(void);
689 690 691 692 693 694 695 696 697 698 699 700 701
static void XLogFileOpen(
				MirroredFlatFileOpen *mirroredOpen,
				uint32 log,
				uint32 seg);

static bool StartupXLOG_Pass4_CheckIfAnyInDoubtPreparedTransactions(void);
static void StartupXLOG_Pass4_NonDBSpecificPTCatVerification(void);
static void StartupXLOG_Pass4_DBSpecificPTCatVerification(void);
static bool StartupXLOG_Pass4_GetDBForPTCatVerification(void);

static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess);

702
#ifdef NOT_USED
703
static bool RestoreArchivedFile(char *path, const char *xlogfname,
B
Bruce Momjian 已提交
704
					const char *recovername, off_t expectedSize);
705
static void ExecuteRecoveryEndCommand(void);
706
#endif
707
static void PreallocXlogFiles(XLogRecPtr endptr);
708
static void UpdateLastRemovedPtr(char *filename);
709
static void ValidateXLOGDirectoryStructure(void);
710
static void CleanupBackupHistory(void);
711
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
712
#ifdef NOT_USED
713
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
714
#endif
715
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
716 717
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly);

718 719 720 721 722 723 724 725 726 727 728
/*
 * Collection of pointers into, and lengths of, additional sections
 * associated with a checkpoint record.  Passed by pointer to
 * UnpackCheckPointRecord(), which presumably fills it in from the record.
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only -- confirm against UnpackCheckPointRecord().
 */
typedef struct CheckpointExtendedRecord
{
	/* presumably the distributed-transaction checkpoint section and its length */
	TMGXACT_CHECKPOINT	*dtxCheckpoint;
	uint32				dtxCheckpointLen;
	/* presumably the master-mirroring checkpoint section and its length */
	MasterMirrorCheckpointInfo	masterMirroringCheckpoint;
	uint32				masterMirroringCheckpointLen;
	/* presumably aggregate state of prepared transactions */
	prepared_transaction_agg_state  *ptas;
} CheckpointExtendedRecord;

static void UnpackCheckPointRecord(XLogRecord *record,
								   CheckpointExtendedRecord *ckptExtended);
729 730 731
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
B
Bruce Momjian 已提交
732 733
					 TimeLineID endTLI,
					 uint32 endLogId, uint32 endLogSeg);
734 735 736 737
static void ControlFileWatcherSaveInitial(void);
static void ControlFileWatcherCheckForChange(void);
static bool XLogGetWriteAndFlushedLoc(XLogRecPtr *writeLoc, XLogRecPtr *flushedLoc);
static XLogRecPtr XLogInsert_Internal(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId headerXid);
T
Tom Lane 已提交
738 739
static void WriteControlFile(void);
static void ReadControlFile(void);
740

741
static char *str_time(pg_time_t tnow);
742

743
static void xlog_outrec(StringInfo buf, XLogRecord *record);
744 745
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired);
746
static void rm_redo_error_callback(void *arg);
747
static int	get_sync_bit(int method);
748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775

/* New functions added for WAL replication */
static void SetCurrentChunkStartTime(TimestampTz xtime);
static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources);
static void XLogProcessCheckpointRecord(XLogRecord *rec, XLogRecPtr loc);

/*
 * Pairs an XLOG location with a pointer to an XLOG record.
 *
 * NOTE(review): presumably the context argument handed to
 * rm_redo_error_callback() while a record is being replayed -- confirm
 * against the code that installs the callback.
 */
typedef struct RedoErrorCallBack
{
	XLogRecPtr	location;

	XLogRecord 	*record;
} RedoErrorCallBack;

static int XLogGetEof(XLogRecPtr *eofRecPtr);

static	int XLogFillZero(
				 uint32	logId,
				 uint32	seg,
				 uint32	startOffset,
				 uint32	endOffset);

static int XLogReconcileEofInternal(
							 XLogRecPtr	startLocation,
							 XLogRecPtr	endLocation);

void HandleStartupProcInterrupts(void);
static bool CheckForStandbyTrigger(void);

776
static void GetXLogCleanUpTo(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg);
777
static void checkXLogConsistency(XLogRecord *record, XLogRecPtr EndRecPtr);
778

779 780 781 782 783 784 785 786 787 788
/*
 * Build one contiguous, palloc'd image of an XLOG record: the fixed-size
 * record header (SizeOfXLogRecord bytes taken from 'record') followed by
 * the payload of every item in the 'rdata' chain whose data pointer is
 * non-NULL.  Items with data == NULL contribute nothing.
 *
 * The result is allocated with palloc(); the caller is responsible for
 * pfree()ing it.
 */
static char *
XLogContiguousCopy(XLogRecord *record, XLogRecData *rdata)
{
	XLogRecData *rdt;
	uint32		total_len;
	uint32		offset;
	char	   *buffer;

	/* First pass: size the buffer (header plus all non-NULL chunks). */
	total_len = SizeOfXLogRecord;
	for (rdt = rdata; rdt != NULL; rdt = rdt->next)
	{
		if (rdt->data != NULL)
			total_len += rdt->len;
	}

	buffer = (char *) palloc(total_len);

	/* Second pass: header first, then each chunk in chain order. */
	memcpy(buffer, record, SizeOfXLogRecord);
	offset = SizeOfXLogRecord;
	for (rdt = rdata; rdt != NULL; rdt = rdt->next)
	{
		if (rdt->data != NULL)
		{
			memcpy(buffer + offset, rdt->data, rdt->len);
			offset += rdt->len;
		}
	}

	return buffer;
}

/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
820
 * the rdata chain (see xlog.h for notes about rdata).
T
Tom Lane 已提交
821 822 823 824 825 826 827 828 829 830 831
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
832
XLogRecPtr
833
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
834
{
835 836 837 838 839 840 841 842 843 844 845 846 847 848
	return XLogInsert_Internal(rmid, info, rdata, GetCurrentTransactionIdIfAny());
}

/*
 * Same as XLogInsert(), except that the caller supplies the transaction id
 * to be stored in the record header (overrideXid) instead of having it
 * taken from the current transaction.
 */
XLogRecPtr
XLogInsert_OverrideXid(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId overrideXid)
{
	XLogRecPtr	endOfRecord;

	endOfRecord = XLogInsert_Internal(rmid, info, rdata, overrideXid);

	return endOfRecord;
}


/*
 * Workhorse for XLogInsert() and XLogInsert_OverrideXid().
 *
 * Assembles an XLOG record from the rdata chain, appends full-page images
 * of the referenced buffers that need one, copies the result into the
 * shared WAL insert buffers, and hands the record to change tracking.
 * headerXid is stored in the record header's xl_xid field.
 *
 * Returns the XLOG pointer to the end of the record (beginning of the next
 * record), except in two short-circuit cases:
 *	- in bootstrap mode, for any rmgr other than RM_XLOG_ID, nothing is
 *	  logged and a phony pointer is returned;
 *	- an XLOG_SWITCH requested exactly at the start of a segment is not
 *	  inserted; the end address of the prior segment is returned instead.
 *
 * Like XLogInsert(), this scribbles on the XLogRecData structs passed in.
 */
static XLogRecPtr
XLogInsert_Internal(RmgrId rmid, uint8 info, XLogRecData *rdata, TransactionId headerXid)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
	XLogContRecord *contrecord;
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	int			curridx;
	XLogRecData *rdt;
	char	   *rdatabuf = NULL;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
	pg_crc32	rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
	bool		updrqst;
	bool		doPageWrites;
	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
	bool		rdata_iscopy = false;
	uint8		extended_info = 0;

	/* Safety check in case our assumption is ever broken. */
	/* NOTE: This is slightly modified from the one in xact.c -- the test for */
	/* NOTE: seqXlogWrite is omitted... */
	/* NOTE: some local-only changes are OK */
	if (Gp_role == GP_ROLE_EXECUTE && !Gp_is_writer)
	{
		/*
		 * we better only do really minor things on the reader that result
		 * in writing to the xlog here at commit.  for now sequences
		 * should be the only one
		 */
		if (DistributedTransactionContext == DTX_CONTEXT_LOCAL_ONLY)
		{
			/* MPP-1687: readers may under some circumstances extend the CLOG
			 * rmid == RM_CLOG_ID and info having CLOG_ZEROPAGE set */
			elog(LOG, "Reader qExec committing LOCAL_ONLY changes. (%d %d)", rmid, info);
		}
		else
		{
			/*
			 * We are allowing the QE Reader to write to support error tables.
			 */
			elog(DEBUG1, "Reader qExec writing changes. (%d %d)", rmid, info);
#ifdef nothing
			ereport(ERROR,
					(errmsg("Reader qExec had local changes to commit! (rmid = %u)",
							rmid),
					 errdetail("A Reader qExec tried to commit local changes.  "
							   "Only the single Writer qExec can do so. "),
					 errhint("This is most likely the result of a feature being turned "
							 "on that violates the single WRITER principle")));
#endif
		}
	}

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/* info's high bits are reserved for use by me */
	if (info & XLR_INFO_MASK)
		elog(PANIC, "invalid xlog info mask %02X", info);

	TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

	/*
	 * In bootstrap mode, we don't actually log anything but XLOG resources;
	 * return a phony record pointer.
	 */
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
	{
		RecPtr.xlogid = 0;
		RecPtr.xrecoff = SizeOfXLogLongPHD;		/* start of 1st chkpt record */
		return RecPtr;
	}

	/*
	 * Enforce consistency checks for this record if the user is looking for
	 * it.  Do this at the beginning of this routine, before the rdata chain
	 * is scanned, because the flag influences which buffers get a page
	 * image attached below.
	 */
	if (wal_consistency_checking[rmid])
		extended_info |= XLR_CHECK_CONSISTENCY;

	/*
	 * Here we scan the rdata chain, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC initially since we don't know the final
	 * length or info bits quite yet.  Thus, the CRC will represent the CRC of
	 * the whole record in the order "rdata, then backup blocks, then record
	 * header".
	 *
	 * We may have to loop back to here if a race condition is detected below.
	 * We could prevent the race by doing all this work while holding the
	 * insert lock, but it seems better to avoid doing CRC calculations while
	 * holding the lock.  This means we have to be careful about modifying the
	 * rdata chain until we know we aren't going to loop back again.  The only
	 * change we allow ourselves to make earlier is to set rdt->data = NULL in
	 * chain items we have decided we will have to back up the whole buffer
	 * for.  This is OK because we will certainly decide the same thing again
	 * for those items if we do it over; doing it here saves an extra pass
	 * over the chain later.
	 */
begin:;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

	/*
	 * Decide if we need to do full-page writes in this XLOG record: true if
	 * full_page_writes is on or we have a PITR request for it.  Since we
	 * don't yet have the insert lock, forcePageWrites could change under us,
	 * but we'll recheck it once we have the lock.
	 */
	doPageWrites = fullPageWrites || Insert->forcePageWrites;

	INIT_CRC32C(rdata_crc);
	len = 0;
	for (rdt = rdata;;)
	{
		if (rdt->buffer == InvalidBuffer)
		{
			/* Simple data, just include it */
			len += rdt->len;
			COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
		}
		else
		{
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
			{
				if (rdt->buffer == dtbuf[i])
				{
					/*
					 * Buffer already referenced by an earlier chain item.
					 * If its page image will be applied at redo, this
					 * item's data can be dropped.  If the image is there
					 * for consistency checking only, the original data
					 * must still be included.
					 */
					if (dtbuf_bkp[i] && (dtbuf_xlg[i].block_info & BLOCK_APPLY))
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
					}
					break;
				}
				if (dtbuf[i] == InvalidBuffer)
				{
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;

					if (doPageWrites && XLogCheckBuffer(rdt, true,
										(extended_info & XLR_CHECK_CONSISTENCY) != 0,
										&(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
					{
						dtbuf_bkp[i] = true;

						if (dtbuf_xlg[i].block_info & BLOCK_APPLY)
							rdt->data = NULL;
						else
						{
							if (rdt->data)
							{
								len += rdt->len;
								COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
							}
						}
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
					}
					break;
				}
			}
			if (i >= XLR_MAX_BKP_BLOCKS)
				elog(PANIC, "can backup at most %d blocks per xlog record",
					 XLR_MAX_BKP_BLOCKS);
		}
		/* Break out of loop when rdt points to last chain item */
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

	/*
	 * Now add the backup block headers and data into the CRC
	 */
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (dtbuf_bkp[i])
		{
			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
			char	   *page;

			COMP_CRC32C(rdata_crc,
					   (char *) bkpb,
					   sizeof(BkpBlock));
			page = (char *) BufferGetBlock(dtbuf[i]);
			if (bkpb->hole_length == 0)
			{
				COMP_CRC32C(rdata_crc, page, BLCKSZ);
			}
			else
			{
				/* must skip the hole */
				COMP_CRC32C(rdata_crc, page, bkpb->hole_offset);
				COMP_CRC32C(rdata_crc,
						   page + (bkpb->hole_offset + bkpb->hole_length),
						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
			}
		}
	}

	/*
	 * NOTE: We disallow len == 0 because it provides a useful bit of extra
	 * error checking in ReadRecord.  This means that all callers of
	 * XLogInsert must supply at least some not-in-a-buffer data.  However, we
	 * make an exception for XLOG SWITCH records because we don't want them to
	 * ever cross a segment boundary.
	 */
	if (len == 0 && !isLogSwitch)
		elog(PANIC, "invalid xlog record length %u", len);

	START_CRIT_SECTION();

	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to go
	 * back and recompute everything.  This can only happen just after a
	 * checkpoint, so it's better to be slow in this case and fast otherwise.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
	{
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		if (doPageWrites)
		{
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
			{
				if (dtbuf[i] == InvalidBuffer)
					continue;

				/*
				 * Restart if a buffer that was not going to get an
				 * *applied* full-page image now needs one.  That covers
				 * buffers with no image at all, and also buffers whose
				 * image was attached for consistency checking only (no
				 * BLOCK_APPLY): the latter would otherwise be written
				 * without the applied image the new RedoRecPtr requires.
				 * dtbuf_xlg[i] is only examined when dtbuf_bkp[i] is true,
				 * i.e. when XLogCheckBuffer filled it in.
				 */
				if ((!dtbuf_bkp[i] ||
					 !(dtbuf_xlg[i].block_info & BLOCK_APPLY)) &&
					XLByteLE(dtbuf_lsn[i], RedoRecPtr))
				{
					/*
					 * Oops, this buffer now needs to be backed up, but we
					 * didn't think so above.  Start over.
					 */
					LWLockRelease(WALInsertLock);

					END_CRIT_SECTION();
					goto begin;
				}
			}
		}
	}

	/*
	 * Also check to see if forcePageWrites was just turned on; if we weren't
	 * already doing full-page writes then go back and recompute. (If it was
	 * just turned off, we could recompute the record without full pages, but
	 * we choose not to bother.)
	 */
	if (Insert->forcePageWrites && !doPageWrites)
	{
		/* Oops, must redo it with full-page data */
		LWLockRelease(WALInsertLock);
		END_CRIT_SECTION();
		goto begin;
	}

	/*
	 * Make additional rdata chain entries for the backup blocks, so that we
	 * don't need to special-case them in the write loop.  Note that we have
	 * now irrevocably changed the input rdata chain.  At the exit of this
	 * loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
	 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
	 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		BkpBlock   *bkpb;
		char	   *page;

		if (!dtbuf_bkp[i])
			continue;

		info |= XLR_SET_BKP_BLOCK(i);

		bkpb = &(dtbuf_xlg[i]);
		page = (char *) BufferGetBlock(dtbuf[i]);

		/* rdt still points at the last chain item; append after it */
		rdt->next = &(dtbuf_rdt1[i]);
		rdt = rdt->next;

		rdt->data = (char *) bkpb;
		rdt->len = sizeof(BkpBlock);
		write_len += sizeof(BkpBlock);

		rdt->next = &(dtbuf_rdt2[i]);
		rdt = rdt->next;

		if (bkpb->hole_length == 0)
		{
			rdt->data = page;
			rdt->len = BLCKSZ;
			write_len += BLCKSZ;
			rdt->next = NULL;
		}
		else
		{
			/* must skip the hole */
			rdt->data = page;
			rdt->len = bkpb->hole_offset;
			write_len += bkpb->hole_offset;

			rdt->next = &(dtbuf_rdt3[i]);
			rdt = rdt->next;

			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
			write_len += rdt->len;
			rdt->next = NULL;
		}
	}

	/*
	 * If we backed up any full blocks and online backup is not in progress,
	 * mark the backup blocks as removable.  This allows the WAL archiver to
	 * know whether it is safe to compress archived WAL data by transforming
	 * full-block records into the non-full-block format.
	 *
	 * Note: we could just set the flag whenever !forcePageWrites, but
	 * defining it like this leaves the info bit free for some potential other
	 * use in records without any backup blocks.
	 */
	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
		info |= XLR_BKP_REMOVABLE;

	/*
	 * If there isn't enough space on the current XLOG page for a record
	 * header, advance to the next page (leaving the unused space as zeroes).
	 */
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
	if (freespace < SizeOfXLogRecord)
	{
		updrqst = AdvanceXLInsertBuffer(false);
		freespace = INSERT_FREESPACE(Insert);
	}

	/* Compute record's XLOG location */
	curridx = Insert->curridx;
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
	 * If the record is an XLOG_SWITCH, and we are exactly at the start of a
	 * segment, we need not insert it (and don't want to because we'd like
	 * consecutive switch requests to be no-ops).  Instead, make sure
	 * everything is written and flushed through the end of the prior segment,
	 * and return the prior segment's end address.
	 */
	if (isLogSwitch &&
		(RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
	{
		/* We can release insert lock immediately */
		LWLockRelease(WALInsertLock);

		RecPtr.xrecoff -= SizeOfXLogLongPHD;
		if (RecPtr.xrecoff == 0)
		{
			/* crossing a logid boundary */
			RecPtr.xlogid -= 1;
			RecPtr.xrecoff = XLogFileSize;
		}

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(RecPtr, LogwrtResult.Flush))
		{
			XLogwrtRqst FlushRqst;

			FlushRqst.Write = RecPtr;
			FlushRqst.Flush = RecPtr;
			XLogWrite(FlushRqst, false, false);
		}
		LWLockRelease(WALWriteLock);

		END_CRIT_SECTION();

		return RecPtr;
	}

	/* Insert record header */

	record = (XLogRecord *) Insert->currpos;
	record->xl_prev = Insert->PrevRecord;
	record->xl_xid = headerXid;
	record->xl_tot_len = SizeOfXLogRecord + write_len;
	record->xl_len = len;		/* doesn't include backup blocks */
	record->xl_info = info;
	record->xl_rmid = rmid;
	record->xl_extended_info = extended_info;

	/* Now we can finish computing the record's CRC */
	COMP_CRC32C(rdata_crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32C(rdata_crc);
	record->xl_crc = rdata_crc;

	/* Record begin of record in appropriate places */
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

	ProcLastRecTotalLen = record->xl_tot_len;
	ProcLastRecDataLen = write_len;

	Insert->currpos += SizeOfXLogRecord;
	freespace -= SizeOfXLogRecord;

	if (Debug_xlog_insert_print)
	{
		StringInfoData buf;
		char *contiguousCopy;

		initStringInfo(&buf);
		appendStringInfo(&buf, "XLOG INSERT @ %s, total length %u, data length %u: ",
						 XLogLocationToString(&RecPtr),
						 ProcLastRecTotalLen,
						 ProcLastRecDataLen);
		xlog_outrec(&buf, record);

		/* rm_desc wants header and data in one piece, so flatten the chain */
		contiguousCopy = XLogContiguousCopy(record, rdata);
		appendStringInfo(&buf, " - ");
		RmgrTable[record->xl_rmid].rm_desc(&buf, RecPtr, (XLogRecord*)contiguousCopy);
		pfree(contiguousCopy);

		elog(LOG, "%s", buf.data);
		pfree(buf.data);
	}

	/*
	 * Always take a copy of the relevant rdata information in case we
	 * discover below that we are in 'Change Tracking' mode and need to call
	 * ChangeTracking_AddRecordFromXlog().  This must happen before the write
	 * loop, which advances rdata and modifies its data/len fields.
	 */
	rdatabuf = ChangeTracking_CopyRdataBuffers(rdata, rmid, info, &rdata_iscopy);

	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
	{
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
		{
			if (rdata->len > freespace)
			{
				/* fill the rest of this page, then continue on the next */
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
				write_len -= freespace;
			}
			else
			{
				/* enough room to write whole data. do it. */
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
				write_len -= rdata->len;
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
		}

		/* Use next buffer */
		updrqst = AdvanceXLInsertBuffer(false);
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = INSERT_FREESPACE(Insert);
	}

	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
	freespace = INSERT_FREESPACE(Insert);

	/*
	 * The recptr I return is the beginning of the *next* record. This will be
	 * stored as LSN for changed data pages...
	 */
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
	 * If the record is an XLOG_SWITCH, we must now write and flush all the
	 * existing data, and then forcibly advance to the start of the next
	 * segment.  It's not good to do this I/O while holding the insert lock,
	 * but there seems too much risk of confusion if we try to release the
	 * lock sooner.  Fortunately xlog switch needn't be a high-performance
	 * operation anyway...
	 */
	if (isLogSwitch)
	{
		XLogCtlWrite *Write = &XLogCtl->Write;
		XLogwrtRqst FlushRqst;
		XLogRecPtr	OldSegEnd;

		TRACE_POSTGRESQL_XLOG_SWITCH();

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

		/*
		 * Flush through the end of the page containing XLOG_SWITCH, and
		 * perform end-of-segment actions (eg, notifying archiver).
		 */
		WriteRqst = XLogCtl->xlblocks[curridx];
		FlushRqst.Write = WriteRqst;
		FlushRqst.Flush = WriteRqst;
		XLogWrite(FlushRqst, false, true);

		/* Set up the next buffer as first page of next segment */
		/* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
		(void) AdvanceXLInsertBuffer(true);

		/* There should be no unwritten data */
		curridx = Insert->curridx;
		Assert(curridx == Write->curridx);

		/* Compute end address of old segment */
		OldSegEnd = XLogCtl->xlblocks[curridx];
		OldSegEnd.xrecoff -= XLOG_BLCKSZ;
		if (OldSegEnd.xrecoff == 0)
		{
			/* crossing a logid boundary */
			OldSegEnd.xlogid -= 1;
			OldSegEnd.xrecoff = XLogFileSize;
		}

		/* Make it look like we've written and synced all of old segment */
		LogwrtResult.Write = OldSegEnd;
		LogwrtResult.Flush = OldSegEnd;

		/*
		 * Update shared-memory status --- this code should match XLogWrite
		 */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->LogwrtResult = LogwrtResult;
			if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
				xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
			if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
				xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
			SpinLockRelease(&xlogctl->info_lck);
		}

		Write->LogwrtResult = LogwrtResult;

		LWLockRelease(WALWriteLock);

		updrqst = false;		/* done already */
	}
	else
	{
		/* normal case, ie not xlog switch */

		/* Need to update shared LogwrtRqst if some block was filled up */
		if (freespace < SizeOfXLogRecord)
		{
			/* curridx is filled and available for writing out */
			updrqst = true;
		}
		else
		{
			/* if updrqst already set, write through end of previous buf */
			curridx = PrevBufIdx(curridx);
		}
		WriteRqst = XLogCtl->xlblocks[curridx];
	}

	/*
	 * Use this lock to make sure we add Change Tracking records correctly.
	 *
	 * IMPORTANT: Acquiring this lock must be done AFTER ALL WRITE AND FSYNC calls under
	 * WALInsertLock.  Otherwise, the write suspension that occurs as a natural part of
	 * mirror communication loss and fault handling would suspend us and cause a deadlock.
	 *
	 * When this lock is held EXCLUSIVE, we are in transition from 'In Sync' to
	 * 'Change Tracking'.  During that time other processes are initializing the
	 * 'Change Tracking' log with information since the last checkpoint.  Thus, we need to
	 * wait here before we add our information.
	 */
	LWLockAcquire(ChangeTrackingTransitionLock, LW_SHARED);

	if (Debug_print_xlog_relation_change_info && rdatabuf != NULL)
	{
		bool skipIssue;

		skipIssue =
			ChangeTracking_PrintRelationChangeInfo(
												rmid,
												info,
												(void *)rdatabuf,
												&RecPtr,
												/* weAreGeneratingXLogNow */ true,
												/* printSkipIssuesOnly */ Debug_print_xlog_relation_change_info_skip_issues_only);

		if (Debug_print_xlog_relation_change_info_backtrace_skip_issues &&
			skipIssue)
		{
			/* Code for investigating MPP-13909, will be removed as part of the fix */
			elog(WARNING,
				 "ChangeTracking_PrintRelationChangeInfo hang skipIssue %s",
				 (skipIssue ? "true" : "false"));

			/* Hang for 24 hours so the process can be inspected. */
			for (int minute = 0; minute < 24 * 60; minute++)
			{
				pg_usleep(60000000L); /* 60 sec */
			}

			/*
			 * Emit the backtrace before failing the assertion: Insist(0)
			 * is expected not to return, so anything placed after it would
			 * never run.
			 */
			debug_backtrace();
			Insist(0);
		}
	}

	/* if needed, send this record to the changetracker */
	if (ChangeTracking_ShouldTrackChanges() && rdatabuf != NULL)
	{
		ChangeTracking_AddRecordFromXlog(rmid, info, (void *)rdatabuf, &RecPtr);
	}

	/*
	 * Last LSN location has to be tracked also when no mirrors are configured
	 * in order to handle gpaddmirrors correctly
	 */
	XLogCtl->lastChangeTrackingEndLoc = RecPtr;

	/* Release rdatabuf only if ChangeTracking_CopyRdataBuffers flagged it as a copy */
	if (rdata_iscopy)
	{
		if (rdatabuf != NULL)
		{
			pfree(rdatabuf);
			rdatabuf = NULL;
		}
		rdata_iscopy = false;
	}

	LWLockRelease(ChangeTrackingTransitionLock);

	LWLockRelease(WALInsertLock);

	if (updrqst)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		/* advance global request to include new block(s) */
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
		/* update local result copy while I have the chance */
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	XactLastRecEnd = RecPtr;

	END_CRIT_SECTION();

	return RecPtr;
}

/*
 * Report ProcLastRecPtr, which XLogInsert_Internal() sets to the start
 * location of each record it writes into the insert buffers.
 */
XLogRecPtr
XLogLastInsertBeginLoc(void)
{
	XLogRecPtr	beginLoc = ProcLastRecPtr;

	return beginLoc;
}

XLogRecPtr
XLogLastInsertEndLoc(void)
{
1560
	return XactLastRecEnd;
1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580
}

/*
 * Report XLogCtl->lastChangeTrackingEndLoc, the end location that
 * XLogInsert_Internal() stores after offering a record to change tracking.
 *
 * The field is read here without taking any lock.
 */
XLogRecPtr
XLogLastChangeTrackedLoc(void)
{
	XLogRecPtr	trackedLoc = XLogCtl->lastChangeTrackingEndLoc;

	return trackedLoc;
}

/*
 * Report ProcLastRecTotalLen, which XLogInsert_Internal() sets to the
 * xl_tot_len of each record it inserts (record header plus data plus
 * backup blocks).
 */
uint32
XLogLastInsertTotalLen(void)
{
	uint32		totalLen = ProcLastRecTotalLen;

	return totalLen;
}

/*
 * Report ProcLastRecDataLen, which XLogInsert_Internal() sets to the
 * number of bytes written after the record header (rdata payload plus
 * backup block headers and page images).
 */
uint32
XLogLastInsertDataLen(void)
{
	uint32		dataLen = ProcLastRecDataLen;

	return dataLen;
}

1581
/*
1582 1583 1584
 * Determine whether the buffer referenced by an XLogRecData item has to
 * be backed up, and if so fill a BkpBlock struct for it.  In any case
 * save the buffer's LSN at *lsn.
1585
 */
1586
static bool
1587
XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
1588
				bool wal_check_consistency_enabled,
1589
				XLogRecPtr *lsn, BkpBlock *bkpb)
1590
{
1591
	Page		page;
1592
	bool needs_backup;
1593

1594
	page = BufferGetPage(rdata->buffer);
1595 1596

	/*
1597 1598 1599 1600
	 * We assume page LSN is first data on *every* page that can be passed
	 * to XLogInsert, whether it has the standard page layout or not. We
	 * don't need to take the buffer header lock for PageGetLSN if we hold
	 * an exclusive lock on the page and/or the relation.
1601
	 */
1602
	if (holdsExclusiveLock)
1603
		*lsn = PageGetLSN(page);
1604 1605
	else
		*lsn = BufferGetLSNAtomic(rdata->buffer);
1606

1607
	needs_backup = XLByteLE(((PageHeader) page)->pd_lsn, RedoRecPtr);
1608 1609

	if (needs_backup || wal_check_consistency_enabled)
1610
	{
1611 1612 1613
		/*
		 * The page needs to be backed up, so set up *bkpb
		 */
1614
		BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625
		bkpb->block_info = 0;

		/*
		 * If WAL consistency checking is enabled for the
		 * resource manager of this WAL record, a full-page
		 * image is included in the record for the block
		 * modified. During redo, the full-page is replayed
		 * only if block_apply is set.
		 */
		if (needs_backup)
			bkpb->block_info |= BLOCK_APPLY;
1626

1627 1628 1629
		if (rdata->buffer_std)
		{
			/* Assume we can omit data between pd_lower and pd_upper */
1630 1631
			uint16		lower = ((PageHeader) page)->pd_lower;
			uint16		upper = ((PageHeader) page)->pd_upper;
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652

			if (lower >= SizeOfPageHeaderData &&
				upper > lower &&
				upper <= BLCKSZ)
			{
				bkpb->hole_offset = lower;
				bkpb->hole_length = upper - lower;
			}
			else
			{
				/* No "hole" to compress out */
				bkpb->hole_offset = 0;
				bkpb->hole_length = 0;
			}
		}
		else
		{
			/* Not a standard page header, don't try to eliminate "hole" */
			bkpb->hole_offset = 0;
			bkpb->hole_length = 0;
		}
1653

1654
		return true;			/* buffer requires backup */
1655
	}
1656 1657

	return false;				/* buffer does not need to be backed up */
1658 1659
}

1660 1661 1662 1663 1664 1665
/*
 * XLogArchiveNotify
 *
 * Create an archive notification file
 *
 * The name of the notification file is the message that will be picked up
1666
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
1667
 * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1668
 * then when complete, rename it to 0000000100000001000000C6.done
1669 1670 1671 1672 1673
 */
static void
XLogArchiveNotify(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
B
Bruce Momjian 已提交
1674
	FILE	   *fd;
1675 1676 1677 1678

	/* insert an otherwise empty file called <XLOG>.ready */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	fd = AllocateFile(archiveStatusPath, "w");
B
Bruce Momjian 已提交
1679 1680
	if (fd == NULL)
	{
1681 1682 1683 1684 1685 1686
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}
B
Bruce Momjian 已提交
1687 1688
	if (FreeFile(fd))
	{
1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}

	/* Notify archiver that it's got something to do */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}

/*
 * Convenience routine to notify using log/seg representation of filename
 *
 * The segment file name is built for ThisTimeLineID and then passed to
 * XLogArchiveNotify().
 */
static void
XLogArchiveNotifySeg(uint32 log, uint32 seg)
{
	char		xlog[MAXFNAMELEN];

	XLogFileName(xlog, ThisTimeLineID, log, seg);
	XLogArchiveNotify(xlog);
}

/*
1714
 * XLogArchiveCheckDone
1715
 *
1716 1717 1718 1719
 * This is called when we are ready to delete or recycle an old XLOG segment
 * file or backup history file.  If it is okay to delete it then return true.
 * If it is not time to delete it, make sure a .ready file exists, and return
 * false.
1720 1721
 *
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1722 1723 1724 1725
 * then return false; else create <XLOG>.ready and return false.
 *
 * The reason we do things this way is so that if the original attempt to
 * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1726 1727
 */
static bool
1728
XLogArchiveCheckDone(const char *xlog)
1729 1730 1731 1732
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

1733 1734 1735 1736 1737
	/* Always deletable if archiving is off */
	if (!XLogArchivingActive())
		return true;

	/* First check for .done --- this means archiver is done with it */
1738 1739 1740 1741 1742 1743 1744
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
B
Bruce Momjian 已提交
1745
		return false;
1746 1747 1748 1749 1750 1751 1752

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Retry creation of the .ready file */
1753
	XLogArchiveNotify(xlog);
1754 1755 1756
	return false;
}

1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788
/*
 * XLogArchiveIsBusy
 *
 * Report whether an XLOG segment file is still unarchived.
 *
 * This is nearly the inverse of XLogArchiveCheckDone, with two differences:
 * this routine never recreates the .ready file, and a segment file that no
 * longer exists is treated as not busy.  The latter covers the race in which
 * a checkpoint has already deleted the no-longer-needed file.
 */
static bool
XLogArchiveIsBusy(const char *xlog)
{
	char		donePath[MAXPGPATH];
	char		readyPath[MAXPGPATH];
	char		segmentPath[MAXPGPATH];
	struct stat statBuf;

	StatusFilePath(donePath, xlog, ".done");
	StatusFilePath(readyPath, xlog, ".ready");

	/* .done present: the archiver has finished with this file */
	if (stat(donePath, &statBuf) == 0)
		return false;

	/* .ready present: the archiver is still busy with this file */
	if (stat(readyPath, &statBuf) == 0)
		return true;

	/* The archiver may have finished in between, so recheck .done */
	if (stat(donePath, &statBuf) == 0)
		return false;

	/*
	 * No status file is visible.  If the WAL file itself is gone, a
	 * checkpoint removed it, which implies it was already archived.
	 */
	snprintf(segmentPath, MAXPGPATH, XLOGDIR "/%s", xlog);
	if (stat(segmentPath, &statBuf) != 0 && errno == ENOENT)
		return false;

	return true;
}

1801 1802 1803
/*
 * XLogArchiveCleanup
 *
1804
 * Cleanup archive notification file(s) for a particular xlog segment
1805 1806 1807 1808
 */
static void
XLogArchiveCleanup(const char *xlog)
{
B
Bruce Momjian 已提交
1809
	char		archiveStatusPath[MAXPGPATH];
1810

1811
	/* Remove the .done file */
1812 1813 1814
	StatusFilePath(archiveStatusPath, xlog, ".done");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1815 1816 1817 1818 1819

	/* Remove the .ready file if present --- normally it shouldn't be */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1820 1821
}

T
Tom Lane 已提交
1822 1823 1824 1825
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
1826 1827 1828 1829
 * If new_segment is TRUE then we set up the next buffer page as the first
 * page of the next xlog segment file, possibly but not usually the next
 * consecutive file page.
 *
T
Tom Lane 已提交
1830
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
1831
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
1832 1833 1834
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
1835
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
1836 1837
 */
static bool
1838
AdvanceXLInsertBuffer(bool new_segment)
1839
{
T
Tom Lane 已提交
1840 1841
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
1842
	int			nextidx = NextBufIdx(Insert->curridx);
T
Tom Lane 已提交
1843 1844 1845
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
1846 1847
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
1848

T
Tom Lane 已提交
1849 1850 1851
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
1852

T
Tom Lane 已提交
1853
	/*
B
Bruce Momjian 已提交
1854 1855 1856
	 * Get ending-offset of the buffer page we need to replace (this may be
	 * zero if the buffer hasn't been used yet).  Fall through if it's already
	 * written out.
T
Tom Lane 已提交
1857 1858 1859 1860 1861 1862
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
1863

T
Tom Lane 已提交
1864
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1865

1866
		/* Before waiting, get info_lck and update LogwrtResult */
1867 1868 1869 1870
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

1871
			SpinLockAcquire(&xlogctl->info_lck);
1872 1873 1874
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
1875
			SpinLockRelease(&xlogctl->info_lck);
1876
		}
1877 1878 1879 1880 1881 1882 1883 1884 1885

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
1886
		{
1887 1888 1889 1890
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1891
			{
1892 1893 1894
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
1895
			}
1896
			else
T
Tom Lane 已提交
1897 1898
			{
				/*
B
Bruce Momjian 已提交
1899 1900
				 * Have to write buffers while holding insert lock. This is
				 * not good, so only write as much as we absolutely must.
T
Tom Lane 已提交
1901
				 */
1902
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
T
Tom Lane 已提交
1903 1904 1905
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
1906
				XLogWrite(WriteRqst, false, false);
1907
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
1908
				Insert->LogwrtResult = LogwrtResult;
1909
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1910 1911 1912 1913
			}
		}
	}

T
Tom Lane 已提交
1914
	/*
B
Bruce Momjian 已提交
1915 1916
	 * Now the next buffer slot is free and we can set it up to be the next
	 * output page.
T
Tom Lane 已提交
1917
	 */
1918
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1919 1920 1921 1922 1923 1924 1925 1926

	if (new_segment)
	{
		/* force it to a segment start point */
		NewPageEndPtr.xrecoff += XLogSegSize - 1;
		NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
	}

1927
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
1928
	{
T
Tom Lane 已提交
1929
		/* crossing a logid boundary */
1930
		NewPageEndPtr.xlogid += 1;
1931
		NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1932
	}
T
Tom Lane 已提交
1933
	else
1934
		NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1935
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1936
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
B
Bruce Momjian 已提交
1937

T
Tom Lane 已提交
1938
	Insert->curridx = nextidx;
1939
	Insert->currpage = NewPage;
B
Bruce Momjian 已提交
1940 1941

	Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
B
Bruce Momjian 已提交
1942

T
Tom Lane 已提交
1943
	/*
B
Bruce Momjian 已提交
1944 1945
	 * Be sure to re-zero the buffer so that bytes beyond what we've written
	 * will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
1946
	 */
1947
	MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1948

1949 1950 1951
	/*
	 * Fill the new page's header
	 */
B
Bruce Momjian 已提交
1952 1953
	NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

1954
	/* NewPage->xlp_info = 0; */	/* done by memset */
B
Bruce Momjian 已提交
1955 1956
	NewPage   ->xlp_tli = ThisTimeLineID;
	NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1957
	NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
T
Tom Lane 已提交
1958

1959
	/*
1960
	 * If first page of an XLOG segment file, make it a long header.
1961 1962 1963
	 */
	if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
	{
1964
		XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1965

1966 1967
		NewLongPage->xlp_sysid = ControlFile->system_identifier;
		NewLongPage->xlp_seg_size = XLogSegSize;
1968
		NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
B
Bruce Momjian 已提交
1969 1970 1971
		NewPage   ->xlp_info |= XLP_LONG_HEADER;

		Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1972 1973
	}

T
Tom Lane 已提交
1974
	return update_needed;
1975 1976
}

1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
 * Caller must have just finished filling the open log file (so that
 * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
 * to the open log file and see if that exceeds CheckPointSegments.
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
XLogCheckpointNeeded(void)
{
	/*
1990 1991
	 * A straight computation of segment number could overflow 32 bits. Rather
	 * than assuming we have working 64-bit arithmetic, we compare the
B
Bruce Momjian 已提交
1992 1993
	 * highest-order bits separately, and force a checkpoint immediately when
	 * they change.
1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
	 */
	uint32		old_segno,
				new_segno;
	uint32		old_highbits,
				new_highbits;

	old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
		(RedoRecPtr.xrecoff / XLogSegSize);
	old_highbits = RedoRecPtr.xlogid / XLogSegSize;
	new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
	new_highbits = openLogId / XLogSegSize;
	if (new_highbits != old_highbits ||
B
Bruce Momjian 已提交
2006
		new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
2007 2008 2009 2010
		return true;
	return false;
}

T
Tom Lane 已提交
2011 2012 2013
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
2014 2015 2016 2017 2018
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
2019 2020 2021 2022 2023 2024
 * If xlog_switch == TRUE, we are intending an xlog segment switch, so
 * perform end-of-segment actions after writing the last page, even if
 * it's not physically the end of its segment.  (NB: this will work properly
 * only if caller specifies WriteRqst == page-end and flexible == false,
 * and there is some data to write.)
 *
2025
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
2026
 */
2027
static void
2028
XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
2029
{
2030
	XLogCtlWrite *Write = &XLogCtl->Write;
T
Tom Lane 已提交
2031
	bool		ispartialpage;
2032
	bool		last_iteration;
2033
	bool		finishing_seg;
2034
	bool		use_existent;
2035 2036 2037 2038
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;
2039

2040 2041 2042
	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

B
Bruce Momjian 已提交
2043
	/*
B
Bruce Momjian 已提交
2044
	 * Update local LogwrtResult (caller probably did this already, but...)
B
Bruce Momjian 已提交
2045
	 */
T
Tom Lane 已提交
2046 2047
	LogwrtResult = Write->LogwrtResult;

2048 2049 2050
	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
B
Bruce Momjian 已提交
2051 2052 2053 2054 2055
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
2056 2057 2058 2059 2060 2061 2062 2063 2064
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  We advance Write->curridx only after successfully
	 * writing pages.  (Right now, this refinement is useless since we are
B
Bruce Momjian 已提交
2065 2066
	 * going to PANIC if any error occurs anyway; but someday it may come in
	 * useful.)
2067 2068
	 */
	curridx = Write->curridx;
B
 
Bruce Momjian 已提交
2069

T
Tom Lane 已提交
2070
	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
2071
	{
2072
		/*
B
Bruce Momjian 已提交
2073 2074 2075
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
2076
		 */
2077
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
2078
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2079
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2080 2081
				 XLogCtl->xlblocks[curridx].xlogid,
				 XLogCtl->xlblocks[curridx].xrecoff);
2082

T
Tom Lane 已提交
2083
		/* Advance LogwrtResult.Write to end of current buffer page */
2084
		LogwrtResult.Write = XLogCtl->xlblocks[curridx];
T
Tom Lane 已提交
2085 2086 2087
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
2088
		{
T
Tom Lane 已提交
2089
			/*
2090 2091
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
T
Tom Lane 已提交
2092
			 */
2093
			Assert(npages == 0);
2094
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
2095
				XLogFileClose();
T
Tom Lane 已提交
2096 2097
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

2098 2099
			/* create/use new log file */
			use_existent = true;
2100 2101 2102 2103 2104

			XLogFileInit(
					&mirroredLogFileOpen,
					openLogId, openLogSeg,
					&use_existent, true);
T
Tom Lane 已提交
2105
			openLogOff = 0;
2106 2107
		}

2108
		/* Make sure we have the current logfile open */
2109
		if (!MirroredFlatFile_IsActive(&mirroredLogFileOpen))
2110
		{
T
Tom Lane 已提交
2111
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
2112 2113 2114 2115
			XLogFileOpen(
					&mirroredLogFileOpen,
					openLogId,
					openLogSeg);
T
Tom Lane 已提交
2116
			openLogOff = 0;
2117 2118
		}

2119 2120 2121 2122 2123
		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
2124
			startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
2125 2126
		}
		npages++;
2127

T
Tom Lane 已提交
2128
		/*
B
Bruce Momjian 已提交
2129 2130 2131 2132
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
T
Tom Lane 已提交
2133
		 */
2134 2135
		last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);

2136
		finishing_seg = !ispartialpage &&
2137
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
2138

2139
		if (last_iteration ||
2140 2141
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
T
Tom Lane 已提交
2142
		{
2143 2144
			char	   *from;
			Size		nbytes;
2145

2146 2147 2148 2149 2150 2151 2152
			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
2153 2154
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181

			/* The following code is a sanity check to try to catch the issue described in MPP-12611 */
			if (!IsBootstrapProcessingMode())
			  {
			  char   simpleFileName[MAXPGPATH];
			  XLogFileName(simpleFileName, ThisTimeLineID, openLogId, openLogSeg);
                          if (strcmp(simpleFileName, mirroredLogFileOpen.simpleFileName) != 0)
			    {
			      ereport( PANIC
				       , (errmsg_internal("Expected Xlog file name does not match current open xlog file name. \
                                                           Expected file = %s, \
                                                           open file = %s, \
                                                           WriteRqst.Write = %s, \
                                                           WriteRqst.Flush = %s "
							 , simpleFileName
							 , mirroredLogFileOpen.simpleFileName
							 , XLogLocationToString(&(WriteRqst.Write))
							 , XLogLocationToString(&(WriteRqst.Flush)))));
			    }
			  }

			if (MirroredFlatFile_Write(
							&mirroredLogFileOpen,
							openLogOff,
							from,
							nbytes,
							/* suppressError */ true))
2182 2183 2184 2185
			{
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not write to log file %u, segment %u "
P
Peter Eisentraut 已提交
2186
								"at offset %u, length %lu: %m",
2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202
								openLogId, openLogSeg,
								openLogOff, (unsigned long) nbytes)));
			}

			/* Update state for write */
			openLogOff += nbytes;
			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
2203 2204 2205
			 * We also do this if this is the last page written for an xlog
			 * switch.
			 *
2206
			 * This is also the right place to notify the Archiver that the
B
Bruce Momjian 已提交
2207
			 * segment is ready to copy to archival storage, and to update the
2208 2209 2210
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
2211
			 */
2212
			if (finishing_seg || (xlog_switch && last_iteration))
2213
			{
2214 2215 2216 2217 2218 2219 2220 2221 2222 2223
				if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
					MirroredFlatFile_Flush(
									&mirroredLogFileOpen,
									/* suppressError */ false);

				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
					 "XLogWrite (#1): flush loc %s; write loc %s",
					 XLogLocationToString_Long(&LogwrtResult.Flush),
					 XLogLocationToString2_Long(&LogwrtResult.Write));

B
Bruce Momjian 已提交
2224
				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */
2225 2226 2227

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogId, openLogSeg);
2228

2229
				Write->lastSegSwitchTime = (pg_time_t) time(NULL);
2230 2231

				/*
2232
				 * Signal bgwriter to start a checkpoint if we've consumed too
2233
				 * much xlog since the last one.  For speed, we first check
B
Bruce Momjian 已提交
2234 2235 2236
				 * using the local copy of RedoRecPtr, which might be out of
				 * date; if it looks like a checkpoint is needed, forcibly
				 * update RedoRecPtr and recheck.
2237
				 */
2238 2239
				if (IsUnderPostmaster &&
					XLogCheckpointNeeded())
2240
				{
2241 2242
					if (Debug_print_qd_mirroring)
						elog(LOG, "time for a checkpoint, signaling bgwriter");
2243 2244
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded())
2245
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2246
				}
2247
			}
T
Tom Lane 已提交
2248
		}
2249

T
Tom Lane 已提交
2250 2251 2252 2253 2254 2255
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
2256 2257 2258 2259 2260
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
2261
	}
2262 2263 2264

	Assert(npages == 0);
	Assert(curridx == Write->curridx);
2265

T
Tom Lane 已提交
2266 2267 2268 2269 2270
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
2271
	{
T
Tom Lane 已提交
2272
		/*
B
Bruce Momjian 已提交
2273 2274 2275
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.	However, we do not need to
		 * fsync more than one file.
T
Tom Lane 已提交
2276
		 */
2277 2278
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
T
Tom Lane 已提交
2279
		{
2280
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen) &&
B
Bruce Momjian 已提交
2281
				!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
2282
				XLogFileClose();
2283
			if (!MirroredFlatFile_IsActive(&mirroredLogFileOpen))
2284 2285
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
2286 2287 2288 2289
				XLogFileOpen(
						&mirroredLogFileOpen,
						openLogId,
						openLogSeg);
2290 2291
				openLogOff = 0;
			}
2292 2293 2294 2295 2296 2297 2298 2299 2300
			if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
				MirroredFlatFile_Flush(
								&mirroredLogFileOpen,
								/* suppressError */ false);

			elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
				 "XLogWrite (#2): flush loc %s; write loc %s",
				 XLogLocationToString_Long(&LogwrtResult.Flush),
				 XLogLocationToString2_Long(&LogwrtResult.Write));
T
Tom Lane 已提交
2301
		}
2302

T
Tom Lane 已提交
2303
		LogwrtResult.Flush = LogwrtResult.Write;
2304 2305
	}

T
Tom Lane 已提交
2306 2307 2308
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
2309
	 * We make sure that the shared 'request' values do not fall behind the
B
Bruce Momjian 已提交
2310 2311
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
T
Tom Lane 已提交
2312
	 */
2313 2314 2315 2316
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

2317
		SpinLockAcquire(&xlogctl->info_lck);
2318 2319 2320 2321 2322
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
2323
		SpinLockRelease(&xlogctl->info_lck);
2324
	}
2325

T
Tom Lane 已提交
2326 2327 2328
	Write->LogwrtResult = LogwrtResult;
}

2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344
/*
 * Remember the LSN of an asynchronous transaction commit.  The shared value
 * only ever moves forward.
 * (Not to be called for aborts, nor for synchronous commits.)
 */
void
XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *shared = XLogCtl;

	SpinLockAcquire(&shared->info_lck);

	/* keep the stored LSN unless the new one is strictly later */
	if (XLByteLT(shared->asyncCommitLSN, asyncCommitLSN))
		shared->asyncCommitLSN = asyncCommitLSN;

	SpinLockRelease(&shared->info_lck);
}

2345 2346 2347 2348
/*
 * Advance minRecoveryPoint in the control file.
 *
 * If we crash during recovery, this point must be reached again before the
 * database is consistent.
 *
 * With 'force' set, the 'lsn' argument is ignored.  Without it,
 * minRecoveryPoint is updated only if it is not already greater than or
 * equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Cheap test against the process-local copy first */
	if (!updateMinRecoveryPoint)
		return;
	if (!force && XLByteLE(lsn, minRecoveryPoint))
		return;

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* refresh the local copy from the control file */
	minRecoveryPoint = ControlFile->minRecoveryPoint;

	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
	{
		/*
		 * An invalid minRecoveryPoint means all the WAL must be recovered,
		 * i.e., this is crash recovery.  The control file's value is never
		 * modified in that case, so later calls can be short-circuited too.
		 */
		updateMinRecoveryPoint = false;
	}
	else if (force || XLByteLT(minRecoveryPoint, lsn))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *shared = XLogCtl;
		XLogRecPtr	replayEnd;

		/*
		 * To keep control-file updates infrequent, advance all the way to
		 * the end of the record being replayed, although 'lsn' would be
		 * enough for correctness.  This is also why the 'force' case needs
		 * no valid 'lsn'.
		 *
		 * There is a second important reason: the passed 'lsn' could be
		 * bogus, i.e., beyond the end of available WAL, if the caller took
		 * it from a corrupted heap page.  Accepting such a value as the min
		 * recovery point would prevent us from coming up at all.  Instead a
		 * warning is logged and recovery continues.  (See also the comments
		 * about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&shared->info_lck);
		replayEnd = shared->replayEndRecPtr;
		SpinLockRelease(&shared->info_lck);

		if (!force && XLByteLT(replayEnd, lsn))
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 lsn.xlogid, lsn.xrecoff,
				 replayEnd.xlogid, replayEnd.xrecoff);

		/* write the control file only if the value really moves forward */
		if (XLByteLT(ControlFile->minRecoveryPoint, replayEnd))
		{
			ControlFile->minRecoveryPoint = replayEnd;
			UpdateControlFile();
			minRecoveryPoint = replayEnd;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X",
							minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
		}
	}

	LWLockRelease(ControlFileLock);
}

T
Tom Lane 已提交
2417 2418 2419
/*
 * Make sure all XLOG data up to the given position has been flushed to disk.
 *
 * NOTE: the main difference from XLogWrite is that WALWriteLock is not held
 * on entry, and taking it is avoided when possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	writeTarget;
	XLogwrtRqst request;

	/*
	 * During REDO, WAL is being read, not written, so instead of flushing we
	 * update minRecoveryPoint.  The test is XLogInsertAllowed() rather than
	 * InRecovery because the bgwriter must behave this way too, and because
	 * the bgwriter should really flush when it writes the end-of-recovery
	 * checkpoint.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	if (Debug_print_qd_mirroring)
		elog(LOG, "xlog flush request %s; write %s; flush %s",
			 XLogLocationToString(&record),
			 XLogLocationToString2(&LogwrtResult.Write),
			 XLogLocationToString3(&LogwrtResult.Flush));

	/* Nothing to do if the local state already shows it flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/*
	 * fsync is usually very expensive, so each one should carry as much data
	 * as possible: any further data found in the xlog buffer is written and
	 * fsync'd as well, making the final LogwrtResult.Flush as large as it
	 * can be.  That gives some chance of skipping the next fsync.
	 */

	/* start from the requested target; it may be raised below */
	writeTarget = record;

	/* read the shared state and refresh the local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *shared = XLogCtl;

		SpinLockAcquire(&shared->info_lck);
		if (XLByteLT(writeTarget, shared->LogwrtRqst.Write))
			writeTarget = shared->LogwrtRqst.Write;
		LogwrtResult = shared->LogwrtResult;
		SpinLockRelease(&shared->info_lck);
	}

	/* still not flushed far enough? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/* wait for the write lock, then look again */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			/* if the insert lock is free, include later additions too */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *Insert = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(Insert);

				/*
				 * Target the end of the current page; unless the buffer is
				 * full, back off by the unused space at its end.
				 */
				writeTarget = XLogCtl->xlblocks[Insert->curridx];
				if (freespace >= SizeOfXLogRecord)
					writeTarget.xrecoff -= freespace;
				LWLockRelease(WALInsertLock);
				request.Write = writeTarget;
				request.Flush = writeTarget;
			}
			else
			{
				request.Write = writeTarget;
				request.Flush = record;
			}
			XLogWrite(request, false, false);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * Not being flushed to the request point by now indicates a problem;
	 * most likely the requested flush point lies past the end of XLOG.  This
	 * has been seen when a disk page has a corrupted LSN.
	 *
	 * This used to be a PANIC, but that hurts robustness rather than helping
	 * it: corruption on one data page should not take down the whole system.
	 * In particular, meeting the bad page again during recovery would make
	 * it impossible to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4,
	 * bad LSNs met during recovery are UpdateMinRecoveryPoint's problem; the
	 * only way to get here during recovery is while flushing the
	 * end-of-recovery checkpoint record, which is not expected to have a bad
	 * LSN.
	 *
	 * Note that for calls from xact.c the ERROR is promoted to PANIC, since
	 * xact.c calls this routine inside a critical section.  Calls from
	 * bufmgr.c are not within critical sections, so a bad LSN on a data page
	 * does not force a restart.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(ERROR,
		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

2550
/*
 * Wrapper around XLogFileInit that returns the primaryFile member of the
 * opened file's state, so callers do not need the MirroredFlatFileOpen type.
 *
 * TODO: This exists only for the sake of the WAL receiver build, because
 * MirroredFlatFileOpen cannot be exposed in xlog.h.
 */
int
XLogFileInitExt(uint32 log, uint32 seg, bool *use_existent, bool use_lock)
{
	MirroredFlatFileOpen segmentOpen;

	XLogFileInit(&segmentOpen, log, seg, use_existent, use_lock);

	return segmentOpen.primaryFile;
}

2564 2565 2566 2567 2568 2569
/*
 * Flush xlog, but without specifying exactly where to flush to.
 *
 * We normally flush only completed blocks; but if there is nothing to do on
 * that basis, we check for unflushed async commits in the current incomplete
 * block, and flush through the latest one of those.  Thus, if async commits
B
Bruce Momjian 已提交
2570
 * are not being used, we will flush complete blocks only.	We can guarantee
2571
 * that async commits reach disk after at most three cycles; normally only
B
Bruce Momjian 已提交
2572
 * one or two.	(We allow XLogWrite to write "flexibly", meaning it can stop
2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584
 * at the end of the buffer ring; this makes a difference only with very high
 * load or long wal_writer_delay, but imposes one extra cycle for the worst
 * case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 */
void
XLogBackgroundFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
	bool		flexible = true;

2585 2586 2587 2588
	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return;

2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608
	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* back off to last completed page boundary */
	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

2609
		SpinLockAcquire(&xlogctl->info_lck);
2610
		WriteRqstPtr = xlogctl->asyncCommitLSN;
2611
		SpinLockRelease(&xlogctl->info_lck);
2612 2613 2614
		flexible = false;		/* ensure it all gets written */
	}

2615 2616 2617 2618 2619
	/*
	 * If already known flushed, we're done. Just need to check if we
	 * are holding an open file handle to a logfile that's no longer
	 * in use, preventing the file from being deleted.
	 */
2620
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2621
	{
2622 2623
		if (MirroredFlatFile_IsActive(&mirroredLogFileOpen))
		{
2624 2625 2626 2627 2628
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
			{
				XLogFileClose();
			}
		}
2629
		return;
2630
	}
2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
			 WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/* now wait for the write lock */
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->Write.LogwrtResult;
	if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		XLogwrtRqst WriteRqst;

		WriteRqst.Write = WriteRqstPtr;
		WriteRqst.Flush = WriteRqstPtr;
		XLogWrite(WriteRqst, flexible, false);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();
}

2658 2659
/*
 * Flush any previous asynchronously-committed transactions' commit records.
2660 2661 2662 2663
 *
 * NOTE: it is unwise to assume that this provides any strong guarantees.
 * In particular, because of the inexact LSN bookkeeping used by clog.c,
 * we cannot assume that hint bits will be settable for these transactions.
2664 2665 2666 2667 2668
 */
void
XLogAsyncCommitFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
B
Bruce Momjian 已提交
2669

2670 2671 2672
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

2673 2674 2675 2676
	/* There's no asynchronously committed transactions during recovery */
	if (RecoveryInProgress())
		return;

2677 2678 2679 2680 2681 2682 2683
	SpinLockAcquire(&xlogctl->info_lck);
	WriteRqstPtr = xlogctl->asyncCommitLSN;
	SpinLockRelease(&xlogctl->info_lck);

	XLogFlush(WriteRqstPtr);
}

2684 2685 2686 2687 2688 2689 2690 2691 2692
/*
 * Report whether XLOG data still has to be flushed to reach the given
 * position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)  Always returns
 * false during recovery.
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * Cheap exits: XLOG needs no flushing during recovery, and our local
	 * copy of LogwrtResult may already show the position as flushed.
	 */
	if (RecoveryInProgress() || XLByteLE(record, LogwrtResult.Flush))
		return false;

	/* Local copy may be stale; refresh it from shared memory */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *shared = XLogCtl;

		SpinLockAcquire(&shared->info_lck);
		LogwrtResult = shared->LogwrtResult;
		SpinLockRelease(&shared->info_lck);
	}

	/* Decide again using the refreshed flush pointer */
	return !XLByteLE(record, LogwrtResult.Flush);
}

T
Tom Lane 已提交
2718 2719 2720
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
2721 2722 2723
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
2724
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
2725 2726
 * file was used.
 *
2727
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2728
 * place.  This should be TRUE except during bootstrap log creation.  The
2729
 * caller must *not* hold the lock at call.
2730
 *
T
Tom Lane 已提交
2731
 * Returns FD of opened file.
2732 2733 2734 2735 2736
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
T
Tom Lane 已提交
2737
 */
2738 2739 2740 2741 2742
static void
XLogFileInit(
	MirroredFlatFileOpen *mirroredOpen,
	uint32 log, uint32 seg,
	bool *use_existent, bool use_lock)
2743
{
2744 2745
	char		simpleFileName[MAXPGPATH];
	char		tmpsimple[MAXPGPATH];
2746
	char		tmppath[MAXPGPATH];
2747
	MirroredFlatFileOpen tmpMirroredOpen;
2748
	char	   *zbuffer;
2749 2750 2751
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
2752
	int			nbytes;
2753
	char			*xlogDir = NULL;
2754

2755
	XLogFileName(simpleFileName, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
2756 2757

	/*
B
Bruce Momjian 已提交
2758
	 * Try to use existent file (checkpoint maker may have created it already)
V
Vadim B. Mikheev 已提交
2759
	 */
2760
	if (*use_existent)
V
Vadim B. Mikheev 已提交
2761
	{
2762 2763 2764 2765
		if (MirroredFlatFile_Open(
							mirroredOpen,
							XLOGDIR,
							simpleFileName,
2766
							O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2767 2768 2769
						    S_IRUSR | S_IWUSR,
						    /* suppressError */ true,
							/* atomic operation */ false,
2770
							/* isMirrorRecovery */ false))
V
Vadim B. Mikheev 已提交
2771
		{
2772 2773 2774 2775
			char		path[MAXPGPATH];

			XLogFilePath(path, ThisTimeLineID, log, seg);

V
Vadim B. Mikheev 已提交
2776
			if (errno != ENOENT)
2777
				ereport(ERROR,
2778
						(errcode_for_file_access(),
2779
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2780
								path, log, seg)));
V
Vadim B. Mikheev 已提交
2781 2782
		}
		else
2783
			return;
V
Vadim B. Mikheev 已提交
2784 2785
	}

2786
	/*
B
Bruce Momjian 已提交
2787 2788 2789 2790
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
2791
	 */
2792 2793
	elog(DEBUG2, "creating and filling new WAL file");

2794 2795 2796 2797
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
		
	if (snprintf(tmpsimple, MAXPGPATH, "xlogtemp.%d", (int) getpid()) > MAXPGPATH)
	{
2798 2799 2800
		ereport(ERROR,
				(errmsg("could not generate filename xlogtemp.%d", (int)getpid())));
	}
2801

2802 2803
	if (snprintf(tmppath, MAXPGPATH, "%s/%s", xlogDir, tmpsimple) > MAXPGPATH)
	{
2804 2805 2806
		ereport(ERROR,
				(errmsg("could not generate filename %s/%s", xlogDir, tmpsimple)));
	}
2807 2808 2809 2810 2811

	MirroredFlatFile_Drop(
						  XLOGDIR,
						  tmpsimple,
						  /* suppressError */ true,
2812
						  /* isMirrorRecovery */ false);
2813

2814
	/* do not use get_sync_bit here --- want to fsync only at end of fill */
2815 2816 2817 2818 2819 2820 2821 2822
	MirroredFlatFile_Open(
						&tmpMirroredOpen,
						XLOGDIR,
						tmpsimple,
						O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					    S_IRUSR | S_IWUSR,
					    /* suppressError */ false,
						/* atomic operation */ false,
2823
						/* isMirrorRecovery */ false);
2824

2825
	/*
B
Bruce Momjian 已提交
2826 2827 2828 2829 2830 2831 2832
	 * Zero-fill the file.	We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
2833 2834 2835 2836
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
2837
	 */
2838 2839
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2840
	{
2841
		errno = 0;
2842 2843 2844
		if (MirroredFlatFile_Append(
							&tmpMirroredOpen,
							zbuffer,
2845
							XLOG_BLCKSZ,
2846
							/* suppressError */ true))
T
Tom Lane 已提交
2847
		{
B
Bruce Momjian 已提交
2848
			int			save_errno = errno;
T
Tom Lane 已提交
2849

B
Bruce Momjian 已提交
2850
			/*
B
Bruce Momjian 已提交
2851
			 * If we fail to make the file, delete it to release disk space
B
Bruce Momjian 已提交
2852
			 */
2853 2854 2855 2856
			MirroredFlatFile_Drop(
							XLOGDIR,
							tmpsimple,
							/* suppressError */ false,
2857
							/* isMirrorRecovery */ false);
2858

2859 2860
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
2861

2862
			ereport(ERROR,
2863
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2864
					 errmsg("could not write to file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2865
		}
2866
	}
2867
	pfree(zbuffer);
2868

2869 2870 2871
	MirroredFlatFile_Flush(
				&tmpMirroredOpen,
				/* suppressError */ false);
2872

2873
	MirroredFlatFile_Close(&tmpMirroredOpen);
T
Tom Lane 已提交
2874

2875
	/*
2876 2877
	 * Now move the segment into place with its final name.
	 *
2878
	 * If caller didn't want to use a pre-existing file, get rid of any
B
Bruce Momjian 已提交
2879 2880 2881
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
2882
	 */
2883 2884 2885 2886 2887
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
2888
								use_lock, tmpsimple))
2889
	{
2890 2891 2892 2893 2894
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
2895 2896 2897 2898
		MirroredFlatFile_Drop(
						XLOGDIR,
						tmpsimple,
						/* suppressError */ false,
2899
						/* isMirrorRecovery */ false);
2900 2901 2902 2903 2904 2905
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
2906 2907 2908 2909
	MirroredFlatFile_Open(
						mirroredOpen,
						XLOGDIR,
						simpleFileName,
2910
						O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2911 2912 2913
					    S_IRUSR | S_IWUSR,
					    /* suppressError */ false,
						/* atomic operation */ false,
2914
						/* isMirrorRecovery */ false);
2915 2916

	pfree(xlogDir);
2917

2918
	elog(DEBUG2, "done creating and filling new WAL file");
2919 2920
}

2921 2922 2923 2924 2925 2926 2927 2928 2929
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
 *
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 *		a different timeline)
 *
 * The data is copied into a temporary file, fsync'd, and then installed
 * under its final name with InstallXLogFileSegment().
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
XLogFileCopy(uint32 log, uint32 seg,
			 TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		buffer[XLOG_BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;
	int			pathlen;
	char	   *xlogDir = NULL;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srclog, srcseg);
	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.
	 *
	 * snprintf reports the length the full string would have had, so a
	 * result >= the buffer size means the path was truncated; a negative
	 * result is an output error.
	 */
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	pathlen = snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
					   xlogDir, (int) getpid());
	if (pathlen < 0 || pathlen >= MAXPGPATH)
		ereport(ERROR,
				(errmsg("could not generate filename %s/xlogtemp.%d",
						xlogDir, (int) getpid())));
	pfree(xlogDir);

	/* remove any stale temp file so the O_EXCL create below can succeed */
	unlink(tmppath);

	elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirroring: copying xlog file '%s' to '%s'",
		 path, tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Do the data copying.
	 */
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
	{
		/* clear errno so a short read can be told apart from a failed one */
		errno = 0;
		if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			if (errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			else
				ereport(ERROR,
						(errmsg("not enough data in file \"%s\"", path)));
		}
		errno = 0;
		if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			/* capture errno before unlink() can overwrite it */
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}

	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	close(srcfd);

	/*
	 * Now move the segment into place with its final name.  We ask for a
	 * forced install (find_free = false) without taking the lock, and pass
	 * no mirrored temp-file name.
	 */
	if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false, NULL))
		elog(ERROR, "InstallXLogFileSegment should not have failed");
}

3031 3032 3033 3034 3035 3036
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
3037 3038 3039
 * *log, *seg: identify segment to install as (or first possible target).
 * When find_free is TRUE, these are modified on return to indicate the
 * actual installation location or last segment searched.
3040 3041 3042 3043 3044 3045 3046
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
3047 3048 3049 3050
 * *max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  On return, reduced
 * by the number of slots skipped over.  (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
3051
 *
3052
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3053
 * place.  This should be TRUE except during bootstrap log creation.  The
3054
 * caller must *not* hold the lock at call.
3055
 *
3056 3057 3058
 * Returns TRUE if the file was installed successfully.  FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
3059 3060
 */
static bool
3061 3062
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
3063
					   bool use_lock, char* tmpsimpleFileName)
3064 3065
{
	char		path[MAXPGPATH];
3066
	char		simpleFileName[MAXPGPATH];
3067
	struct stat stat_buf;
3068 3069 3070 3071 3072
	int retval = 0;

	errno = 0;

	XLogFileName(simpleFileName, ThisTimeLineID, *log, *seg);
3073

3074
	XLogFilePath(path, ThisTimeLineID, *log, *seg);
3075 3076 3077 3078 3079

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
3080
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3081

3082 3083 3084
	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
3085 3086 3087 3088 3089 3090
		if (tmpsimpleFileName) {

			MirroredFlatFile_Drop(
								  XLOGDIR,
								  simpleFileName,
								  /* suppressError */ true,
3091
								  /* isMirrorRecovery */ false);
3092 3093 3094
		} else {
			unlink(path);
		}
3095
	}
3096 3097
	else
	{
3098
		/* Find a free slot to put it in */
3099
		while (stat(path, &stat_buf) == 0)
3100
		{
3101
			if (*max_advance <= 0)
3102 3103 3104
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
3105
					LWLockRelease(ControlFileLock);
3106 3107
				return false;
			}
3108 3109
			NextLogSeg(*log, *seg);
			(*max_advance)--;
3110 3111

			XLogFileName(simpleFileName, ThisTimeLineID, *log, *seg);
3112
			XLogFilePath(path, ThisTimeLineID, *log, *seg);
3113 3114 3115 3116 3117 3118 3119
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
3120
	 */
3121
#if HAVE_WORKING_LINK
3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134

	if (tmpsimpleFileName) {
		retval = MirroredFlatFile_Rename(
										 XLOGDIR,
										 /* old name */ tmpsimpleFileName,
										 /* new name */ simpleFileName,
										 /* can exist? */ false,
										 /* isMirrorRecovery */ false);
	} else {
		retval = link(tmppath, path);
	}

	if (retval < 0)
3135 3136 3137 3138
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
3139
				(errcode_for_file_access(),
3140
				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
3141
						tmppath, path, *log, *seg)));
3142 3143
		return false;
	}
3144 3145 3146 3147 3148 3149 3150

	if (tmpsimpleFileName) {

		MirroredFlatFile_Drop(
						  XLOGDIR,
						  tmpsimpleFileName,
						  /* suppressError */ true,
3151
						  /* isMirrorRecovery */ false);
3152 3153 3154 3155
	} else {
		unlink(tmppath);
	}

3156
#else
3157 3158 3159 3160 3161 3162
	if (tmpsimpleFileName) {
		retval = MirroredFlatFile_Rename(
						  XLOGDIR,
						  /* old name */ tmpsimpleFileName,
						  /* new name */ simpleFileName,
						  /* can exist */ false,
3163
						  /* isMirrorRecovery */ false);
3164 3165 3166 3167 3168
	} else {
		retval = rename(tmppath, path);
	}

	if (retval < 0)
3169
	{
3170 3171 3172
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
3173
				(errcode_for_file_access(),
3174
				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
3175
						tmppath, path, *log, *seg)));
3176
		return false;
3177
	}
3178
#endif
V
Vadim B. Mikheev 已提交
3179

3180
	if (use_lock)
3181
		LWLockRelease(ControlFileLock);
3182

3183
	return true;
3184 3185
}

T
Tom Lane 已提交
3186
/*
3187
 * Open a pre-existing logfile segment for writing.
T
Tom Lane 已提交
3188
 */
3189 3190 3191 3192 3193
static void
XLogFileOpen(
	MirroredFlatFileOpen *mirroredOpen,
	uint32 log,
	uint32 seg)
3194
{
3195
	char		simpleFileName[MAXPGPATH];
3196

3197
	XLogFileName(simpleFileName, ThisTimeLineID, log, seg);
3198

3199 3200 3201 3202
	if (MirroredFlatFile_Open(
					mirroredOpen,
					XLOGDIR,
					simpleFileName,
3203
					O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3204 3205 3206
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
3207
					/* isMirrorRecovery */ false))
3208
	{
3209
		char		path[MAXPGPATH];
3210

3211
		XLogFileName(path, ThisTimeLineID, log, seg);
3212

3213 3214
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3215 3216
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
3217
	}
3218 3219
}

3220 3221 3222
/*
 * Close the current logfile segment for writing.
 */
3223
static void
3224 3225
XLogFileClose(void)
{
3226
	Assert(MirroredFlatFile_IsActive(&mirroredLogFileOpen));
3227 3228

	/*
3229
	 * WAL segment files will not be re-read in normal operation, so we advise
3230
	 * the OS to release any cached pages.	But do not do so if WAL archiving
3231
	 * is active, because archiver process could use the cache to read the WAL
3232 3233 3234 3235 3236 3237
	 * segment.
	 */
	/* GPDB_84_MERGE_FIXME: Disabled for now, because I'm not sure if this
	 * would make sense with file replication. Like with WAL replication, you
	 * don't want to DONTNEED the file, if it's just about to be read by 
	 * mirroring, to be transmitted to the mirror.
3238
	 */
3239
#ifdef NOT_USED
3240
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
A
Abhijit Subramanya 已提交
3241
	if (!XLogIsNeeded())
3242
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3243 3244
#endif
#endif
3245
	MirroredFlatFile_Close(&mirroredLogFileOpen);
3246 3247
}

3248
#if 0 /* GPDB doesn't make use of this function */
3249
/*
3250
 * Attempt to retrieve the specified file from off-line archival storage.
3251
 * If successful, fill "path" with its complete path (note that this will be
3252 3253
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
3254
 *
3255 3256 3257
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
3258 3259 3260 3261
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
3262
 */
3263 3264
static bool
RestoreArchivedFile(char *path, const char *xlogfname,
3265
					const char *recovername, off_t expectedSize)
3266
{
B
Bruce Momjian 已提交
3267 3268
	char		xlogpath[MAXPGPATH];
	char		xlogRestoreCmd[MAXPGPATH];
3269
	char		lastRestartPointFname[MAXPGPATH];
B
Bruce Momjian 已提交
3270 3271
	char	   *dp;
	char	   *endp;
3272
	const char *sp;
B
Bruce Momjian 已提交
3273
	int			rc;
3274
	bool		signaled;
3275
	struct stat stat_buf;
B
Bruce Momjian 已提交
3276 3277
	uint32		restartLog;
	uint32		restartSeg;
3278 3279

	/*
B
Bruce Momjian 已提交
3280 3281 3282 3283
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
3284
	 *
B
Bruce Momjian 已提交
3285
	 * We could try to optimize this slightly by checking the local copy
B
Bruce Momjian 已提交
3286 3287 3288 3289
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
3290
	 *
3291 3292 3293
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
3294
	 *
3295 3296 3297 3298
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the recoveryRestoreCommand may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fallback to the segments remaining in current XLOGDIR later. The
B
Bruce Momjian 已提交
3299 3300
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
3301
	 */
3302
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
3303 3304

	/*
3305
	 * Make sure there is no existing file named recovername.
3306 3307 3308 3309 3310 3311
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
3312
					 errmsg("could not stat file \"%s\": %m",
3313 3314 3315 3316 3317 3318 3319
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
3320
					 errmsg("could not remove file \"%s\": %m",
3321 3322 3323
							xlogpath)));
	}

3324 3325
	/*
	 * Calculate the archive file cutoff point for use during log shipping
3326 3327
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
3328 3329
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
3330 3331
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
3332 3333 3334
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
3335 3336 3337 3338
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
3339 3340 3341 3342 3343 3344 3345 3346 3347
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
		/* we shouldn't need anything earlier than last restart point */
3348
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
3349 3350 3351 3352
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366
	/*
	 * construct the command to be executed
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
3367
					/* %p: relative path of target file */
3368
					sp++;
B
Bruce Momjian 已提交
3369
					StrNCpy(dp, xlogpath, endp - dp);
3370
					make_native_path(dp);
3371 3372 3373 3374 3375
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
B
Bruce Momjian 已提交
3376
					StrNCpy(dp, xlogfname, endp - dp);
3377 3378
					dp += strlen(dp);
					break;
3379 3380 3381 3382 3383 3384
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
B
Bruce Momjian 已提交
3407
			(errmsg_internal("executing restore command \"%s\"",
3408 3409
							 xlogRestoreCmd)));

3410 3411
	/*
	 * Set in_restore_command to tell the signal handler that we should exit
3412
	 * right away on SIGTERM. We know that we're at a safe point to do that.
3413 3414 3415 3416 3417
	 * Check if we had already received the signal, so that we don't miss a
	 * shutdown request received just before this.
	 */
	in_restore_command = true;
	if (shutdown_requested)
3418
		proc_exit(1);
3419

3420
	/*
3421
	 * Copy xlog from archival storage to XLOGDIR
3422 3423
	 */
	rc = system(xlogRestoreCmd);
3424 3425 3426

	in_restore_command = false;

3427 3428
	if (rc == 0)
	{
3429 3430 3431 3432
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 *
3433 3434 3435 3436 3437
		 * XXX I made wrong-size a fatal error to ensure the DBA would notice
		 * it, but is that too strong?	We could try to plow ahead with a
		 * local copy of the file ... but the problem is that there probably
		 * isn't one, and we'd incorrectly conclude we've reached the end of
		 * WAL and we're done recovering ...
3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
				ereport(FATAL,
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
P
Peter Eisentraut 已提交
3462
						 errmsg("could not stat file \"%s\": %m",
3463
								xlogpath)));
3464 3465 3466 3467
		}
	}

	/*
3468
	 * Remember, we rollforward UNTIL the restore fails so failure here is
B
Bruce Momjian 已提交
3469
	 * just part of the process... that makes it difficult to determine
B
Bruce Momjian 已提交
3470 3471 3472
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
3473 3474
	 *
	 * However, if the failure was due to any sort of signal, it's best to
B
Bruce Momjian 已提交
3475 3476 3477
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
3478
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3479 3480 3481 3482 3483
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
3484 3485 3486
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
3487
	 * unexpectedly.
3488
	 *
B
Bruce Momjian 已提交
3489 3490 3491 3492
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
3493
	 */
3494
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3495
		proc_exit(1);
3496

3497 3498 3499
	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
B
Bruce Momjian 已提交
3500 3501
		(errmsg("could not restore file \"%s\" from archive: return code %d",
				xlogfname, rc)));
3502 3503

	/*
B
Bruce Momjian 已提交
3504 3505
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
3506
	 *
B
Bruce Momjian 已提交
3507 3508
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
3509
	 */
3510
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3511
	return false;
3512
}
3513
#endif
3514

3515
#ifdef NOT_USED
3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535
/*
 * Attempt to execute the recovery_end_command.
 */
static void
ExecuteRecoveryEndCommand(void)
{
	char		xlogRecoveryEndCmd[MAXPGPATH];
	char		lastRestartPointFname[MAXPGPATH];
	char	   *dp;
	char	   *endp;
	const char *sp;
	int			rc;
	bool		signaled;
	uint32		restartLog;
	uint32		restartSeg;

	Assert(recoveryEndCommand);

	/*
	 * Calculate the archive file cutoff point for use during log shipping
3536 3537
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
3538 3539
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
3540 3541
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
3542 3543 3544
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
3545 3546 3547 3548
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

	/*
	 * construct the command to be executed
	 */
	dp = xlogRecoveryEndCmd;
	endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryEndCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing recovery end command \"%s\"",
							 xlogRecoveryEndCmd)));

	/*
T
Tom Lane 已提交
3606
	 * execute the constructed command
3607 3608 3609 3610 3611 3612
	 */
	rc = system(xlogRecoveryEndCmd);
	if (rc != 0)
	{
		/*
		 * If the failure was due to any sort of signal, it's best to punt and
3613
		 * abort recovery. See also detailed comments on signals in
3614 3615 3616 3617 3618 3619
		 * RestoreArchivedFile().
		 */
		signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

		ereport(signaled ? FATAL : WARNING,
				(errmsg("recovery_end_command \"%s\": return code %d",
3620
						xlogRecoveryEndCmd, rc)));
3621 3622
	}
}
3623
#endif
3624

V
Vadim B. Mikheev 已提交
3625
/*
3626 3627 3628 3629 3630 3631 3632 3633
 * Preallocate log files beyond the specified log endpoint.
 *
 * XXX this is currently extremely conservative, since it forces only one
 * future log segment to exist, and even that only if we are 75% done with
 * the current one.  This is only appropriate for very low-WAL-volume systems.
 * High-volume systems will be OK once they've built up a sufficient set of
 * recycled log segments, but the startup transient is likely to include
 * a lot of segment creations by foreground processes, which is not so good.
T
Tom Lane 已提交
3634
 */
3635
static void
T
Tom Lane 已提交
3636 3637 3638 3639
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
3640 3641 3642

	MirroredFlatFileOpen	mirroredOpen;

3643
	bool		use_existent;
T
Tom Lane 已提交
3644 3645

	XLByteToPrevSeg(endptr, _logId, _logSeg);
B
Bruce Momjian 已提交
3646
	if ((endptr.xrecoff - 1) % XLogSegSize >=
B
Bruce Momjian 已提交
3647
		(uint32) (0.75 * XLogSegSize))
T
Tom Lane 已提交
3648 3649
	{
		NextLogSeg(_logId, _logSeg);
3650
		use_existent = true;
3651 3652 3653 3654
		XLogFileInit(
			&mirroredOpen,
			_logId, _logSeg, &use_existent, true);
		MirroredFlatFile_Close(&mirroredOpen);
3655
		if (!use_existent)
3656
			CheckpointStats.ckpt_segs_added++;
T
Tom Lane 已提交
3657 3658 3659
	}
}

3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700
/*
 * Fetch the log/seg of the latest removed or recycled WAL segment from
 * shared memory.
 *
 * Both outputs are 0 if no WAL segment has been removed since startup.
 */
void
XLogGetLastRemoved(uint32 *log, uint32 *seg)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *ctl = XLogCtl;
	uint32		removedLog;
	uint32		removedSeg;

	/* Take a consistent snapshot of the pair under the spinlock */
	SpinLockAcquire(&ctl->info_lck);
	removedLog = ctl->lastRemovedLog;
	removedSeg = ctl->lastRemovedSeg;
	SpinLockRelease(&ctl->info_lck);

	*log = removedLog;
	*seg = removedSeg;
}

/*
 * Update the last removed log/seg pointer in shared memory, to reflect
 * that the given XLOG file has been removed.
 */
static void
UpdateLastRemovedPtr(char *filename)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	uint32		tli,
				log,
				seg;

	XLogFromFileName(filename, &tli, &log, &seg);

	SpinLockAcquire(&xlogctl->info_lck);
	if (log > xlogctl->lastRemovedLog ||
		(log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
	{
		xlogctl->lastRemovedLog = log;
		xlogctl->lastRemovedSeg = seg;
	}
	SpinLockRelease(&xlogctl->info_lck);
}

T
Tom Lane 已提交
3701
/*
3702
 * Recycle or remove all log files older or equal to passed log/seg#
3703 3704 3705
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
3706 3707
 */
static void
3708
RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
3709
{
3710 3711
	uint32		endlogId;
	uint32		endlogSeg;
3712
	int			max_advance;
B
Bruce Momjian 已提交
3713 3714
	DIR		   *xldir;
	struct dirent *xlde;
3715
	char		lastoff[MAXFNAMELEN];
B
Bruce Momjian 已提交
3716
	char		path[MAXPGPATH];
3717
	char		*xlogDir = NULL;
V
Vadim B. Mikheev 已提交
3718

3719 3720
#ifdef WIN32
	char		newpath[MAXPGPATH];
3721
	char		newfilename[MAXPGPATH];
3722
#endif
3723
	struct stat statbuf;
3724

3725 3726 3727 3728
	/*
	 * Initialize info about where to try to recycle to.  We allow recycling
	 * segments up to XLOGfileslop segments beyond the current XLOG location.
	 */
3729
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3730
	max_advance = XLOGfileslop;
V
Vadim B. Mikheev 已提交
3731

3732 3733
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	xldir = AllocateDir(xlogDir);
V
Vadim B. Mikheev 已提交
3734
	if (xldir == NULL)
3735
		ereport(ERROR,
3736
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3737
				 errmsg("could not open transaction log directory \"%s\": %m",
3738
						xlogDir)));
V
Vadim B. Mikheev 已提交
3739

3740
	XLogFileName(lastoff, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
3741

3742
	while ((xlde = ReadDir(xldir, xlogDir)) != NULL)
V
Vadim B. Mikheev 已提交
3743
	{
3744
		/*
3745
		 * We ignore the timeline part of the XLOG segment identifiers in
B
Bruce Momjian 已提交
3746 3747 3748 3749 3750
		 * deciding whether a segment is still needed.	This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
3751
		 *
B
Bruce Momjian 已提交
3752 3753
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
3754
		 */
3755 3756 3757
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
V
Vadim B. Mikheev 已提交
3758
		{
3759
			if (XLogArchiveCheckDone(xlde->d_name))
3760
			{
3761 3762 3763 3764 3765 3766 3767
				if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, xlde->d_name) > MAXPGPATH)
				{
					ereport(ERROR, (errmsg("cannot generate filename %s/%s", xlogDir, xlde->d_name)));
				}

				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);
3768

3769
				/*
B
Bruce Momjian 已提交
3770
				 * Before deleting the file, see if it can be recycled as a
3771 3772 3773
				 * future log segment. Only recycle normal files, pg_standby
				 * for example can create symbolic links pointing to a
				 * separate archive directory.
3774
				 */
3775 3776
				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
					InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3777
										   true, &max_advance, true, xlde->d_name))
3778
				{
3779
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3780 3781
							(errmsg("recycled transaction log file \"%s\"",
									xlde->d_name)));
3782
					CheckpointStats.ckpt_segs_recycled++;
3783 3784 3785 3786 3787 3788
					/* Needn't recheck that slot on future iterations */
					if (max_advance > 0)
					{
						NextLogSeg(endlogId, endlogSeg);
						max_advance--;
					}
3789 3790 3791 3792
				}
				else
				{
					/* No need for any more future segments... */
3793
					int rc = 0;
3794

3795
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3796 3797
							(errmsg("removing transaction log file \"%s\"",
									xlde->d_name)));
3798

3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814
#ifdef WIN32
					/*
					 * On Windows, if another process (e.g another backend)
					 * holds the file open in FILE_SHARE_DELETE mode, unlink
					 * will succeed, but the file will still show up in
					 * directory listing until the last handle is closed.
					 * To avoid confusing the lingering deleted file for a
					 * live WAL file that needs to be archived, rename it
					 * before deleting it.
					 *
					 * If another process holds the file open without
					 * FILE_SHARE_DELETE flag, rename will fail. We'll try
					 * again at the next checkpoint.
					 */
					snprintf(newpath, MAXPGPATH, "%s.deleted", path);
					if (rename(path, newpath) != 0)
3815 3816
					{
						ereport(LOG,
3817
								(errcode_for_file_access(),
3818
								 errmsg("could not rename old transaction log file \"%s\": %m",
3819
										path)));
3820 3821
						continue;
					}
3822
					snprintf(newfilename, MAXPGPATH, "%s.deleted", xlde->d_name);
3823
					rc = MirroredFlatFile_Drop(
3824 3825 3826
										  XLOGDIR,
										  newfilename,
										  /* suppressError */ true,
3827
										  /* isMirrorRecovery */ false);
3828
#else
3829
					rc = MirroredFlatFile_Drop(
3830 3831 3832
										  XLOGDIR,
										  xlde->d_name,
										  /* suppressError */ true,
3833
										  /* isMirrorRecovery */ false);
3834
#endif
3835

3836
					if (rc != 0)
3837 3838
					{
						ereport(LOG,
3839 3840 3841
								(errcode_for_file_access(),
								 errmsg("could not remove old transaction log file \"%s\": %m",
										path)));
3842 3843
						continue;
					}
3844

3845
					CheckpointStats.ckpt_segs_removed++;
3846
				}
3847 3848

				XLogArchiveCleanup(xlde->d_name);
3849
			}
V
Vadim B. Mikheev 已提交
3850 3851
		}
	}
B
Bruce Momjian 已提交
3852

3853
	FreeDir(xldir);
3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891
	pfree(xlogDir);
}

/*
 * Write the name of every WAL segment file in the transaction log directory
 * to the system log, followed by the number of files found.
 *
 * A directory entry counts as a segment file when its name is exactly 24
 * upper-case hexadecimal characters.
 */
void
XLogPrintLogNames(void)
{
	char	   *xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	DIR		   *dir = AllocateDir(xlogDir);
	struct dirent *entry;
	int			nfound = 0;

	if (dir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open transaction log directory \"%s\": %m",
						xlogDir)));

	while ((entry = ReadDir(dir, xlogDir)) != NULL)
	{
		const char *fname = entry->d_name;

		/* skip anything that does not look like a segment file name */
		if (strlen(fname) != 24 ||
			strspn(fname, "0123456789ABCDEF") != 24)
			continue;

		elog(LOG,"found log file \"%s\"",
			 fname);
		nfound++;
	}

	FreeDir(dir);
	pfree(xlogDir);

	elog(LOG,"%d files found", nfound);
}

3894
/*
3895 3896 3897
 * Remove previous backup history files.  This also retries creation of
 * .ready files for any backup history files for which XLogArchiveNotify
 * failed earlier.
3898 3899
 */
static void
3900
CleanupBackupHistory(void)
3901 3902 3903 3904
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		path[MAXPGPATH];
3905
	char	*xlogDir = NULL;
3906

3907 3908
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	xldir = AllocateDir(xlogDir);
3909 3910 3911
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3912
				 errmsg("could not open transaction log directory \"%s\": %m",
3913
						xlogDir)));
3914

3915
	while ((xlde = ReadDir(xldir, xlogDir)) != NULL)
3916 3917 3918 3919 3920 3921
	{
		if (strlen(xlde->d_name) > 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
				   ".backup") == 0)
		{
3922
			if (XLogArchiveCheckDone(xlde->d_name))
3923 3924
			{
				ereport(DEBUG2,
B
Bruce Momjian 已提交
3925 3926
				(errmsg("removing transaction log backup history file \"%s\"",
						xlde->d_name)));
3927 3928 3929 3930
				if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, xlde->d_name) > MAXPGPATH)
				{
					elog(LOG, "CleanupBackupHistory: Cannot generate filename %s/%s", xlogDir, xlde->d_name);
				}
3931 3932 3933 3934 3935 3936
				unlink(path);
				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

3937
	pfree(xlogDir);
3938 3939 3940
	FreeDir(xldir);
}

T
Tom Lane 已提交
3941 3942 3943 3944
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
3945 3946 3947 3948 3949 3950 3951 3952 3953
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
3954 3955
 *
 * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3956
 * Otherwise, a normal exclusive lock is used.	At the moment, that's just
3957 3958 3959
 * pro forma, because there can't be any regular backends in the system
 * during recovery.  The 'cleanup' argument applies to all backup blocks
 * in the WAL record, that suffices for now.
T
Tom Lane 已提交
3960
 */
3961 3962
void
RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3963 3964 3965 3966 3967
{
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
3968
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
3969
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3970
	{
T
Tom Lane 已提交
3971
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3972 3973
			continue;

3974
		memcpy(&bkpb, blk, sizeof(BkpBlock));
3975 3976
		blk += sizeof(BkpBlock);

3977 3978 3979
		/* get_cleanup_lock is ignored in GPDB */
		RestoreBackupBlockContents(lsn, bkpb, blk, false, false);

3980 3981 3982
		blk += BLCKSZ - bkpb.hole_length;
	}
}
3983

3984 3985 3986 3987 3988
/*
 * Workhorse for RestoreBackupBlock usable without an xlog record
 *
 * Restores a full-page image from BkpBlock and a data pointer.
 */
3989
static void
3990 3991 3992 3993 3994
RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
						   bool get_cleanup_lock, bool keep_buffer)
{
	Buffer		buffer;
	Page		page;
3995

3996 3997 3998
	if (! (bkpb.block_info & BLOCK_APPLY))
		return;

3999
	MIRROREDLOCK_BUFMGR_DECLARE;
4000

4001 4002
	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;
4003

4004
	buffer = XLogReadBuffer(bkpb.node, bkpb.block, true);
4005 4006 4007 4008 4009 4010 4011
	Assert(BufferIsValid(buffer));
#if 0 /* upstream merge */
	if (get_cleanup_lock)
		LockBufferForCleanup(buffer);
	else
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
#endif
4012

4013
	page = (Page) BufferGetPage(buffer);
4014

4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026
	if (bkpb.hole_length == 0)
	{
		memcpy((char *) page, blk, BLCKSZ);
	}
	else
	{
		memcpy((char *) page, blk, bkpb.hole_offset);
		/* must zero-fill the hole */
		MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
		memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
			   blk + bkpb.hole_offset,
			   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
4027
	}
4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042

	/*
	 * The checksum value on this page is currently invalid. We don't
	 * need to reset it here since it will be set before being written.
	 */

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);

	if (!keep_buffer)
		UnlockReleaseBuffer(buffer);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070
	return;
}

/*
 * Report whether backup block number 'block_id' of an XLOG record carries
 * the BLOCK_APPLY flag.
 *
 * Returns false when the record has no backup block in that slot.  The
 * whole record is assumed to be in memory at *record.
 */
bool
IsBkpBlockApplied(XLogRecord *record, uint8 block_id)
{
	BkpBlock	hdr;
	char	   *cursor;
	int			slot;

	Assert(block_id < XLR_MAX_BKP_BLOCKS);

	/* backup blocks are stored right after the rmgr data */
	cursor = (char *) XLogRecGetData(record) + record->xl_len;

	/* Step over every backup block stored ahead of the requested slot */
	for (slot = 0; slot < block_id; slot++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(slot)))
			continue;

		memcpy(&hdr, cursor, sizeof(BkpBlock));
		cursor += sizeof(BkpBlock) + (BLCKSZ - hdr.hole_length);
	}

	/* Requested slot not present in this record */
	if (!(record->xl_info & XLR_SET_BKP_BLOCK(block_id)))
		return false;

	memcpy(&hdr, cursor, sizeof(BkpBlock));
	return (hdr.block_info & BLOCK_APPLY) != 0;
}

T
Tom Lane 已提交
4073 4074 4075 4076 4077 4078 4079
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
4080 4081 4082
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
4083
	pg_crc32	crc;
4084 4085
	int			i;
	uint32		len = record->xl_len;
4086
	BkpBlock	bkpb;
4087 4088
	char	   *blk;

4089 4090 4091 4092
	/*
	 * Calculate the crc using the new fast crc32c algorithm
	 */

4093
	/* First the rmgr data */
4094 4095
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, XLogRecGetData(record), len);
4096

4097
	/* Add in the backup blocks, if any */
B
Bruce Momjian 已提交
4098
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
4099
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
4100
	{
B
Bruce Momjian 已提交
4101
		uint32		blen;
4102

T
Tom Lane 已提交
4103
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
4104 4105
			continue;

4106 4107
		memcpy(&bkpb, blk, sizeof(BkpBlock));
		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
4108
		{
4109
			ereport(emode,
4110 4111 4112
					(errmsg("incorrect hole size in record at %X/%X",
							recptr.xlogid, recptr.xrecoff)));
			return false;
4113
		}
4114
		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
4115
		COMP_CRC32C(crc, blk, blen);
4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128
		blk += blen;
	}

	/* Check that xl_tot_len agrees with our calculation */
	if (blk != (char *) record + record->xl_tot_len)
	{
		ereport(emode,
				(errmsg("incorrect total length in record at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	/* Finally include the record header */
4129
	COMP_CRC32C(crc, (char *) record + sizeof(pg_crc32),
4130
			   SizeOfXLogRecord - sizeof(pg_crc32));
4131
	FIN_CRC32C(crc);
4132

4133
	if (!EQ_CRC32C(record->xl_crc, crc))
4134 4135 4136
	{
		ereport(emode,
		(errmsg("incorrect resource manager data checksum in record at %X/%X",
B
Bruce Momjian 已提交
4137
				recptr.xlogid, recptr.xrecoff)));
4138
		return false;
4139 4140
	}

4141
	return true;
4142 4143
}

T
Tom Lane 已提交
4144
/*
4145
 * Verify whether pg_xlog exists
T
Tom Lane 已提交
4146
 *
4147 4148 4149
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a
 * copy but omitted pg_xlog from the copy.
T
Tom Lane 已提交
4150
 */
4151 4152
static void
ValidateXLOGDirectoryStructure(void)
4153
{
4154 4155
	char		path[MAXPGPATH];
	char	   *fullpath;
4156
	struct stat stat_buf;
T
Tom Lane 已提交
4157

4158
	fullpath = makeRelativeToTxnFilespace(XLOGDIR);
4159

4160
	/* Check for pg_xlog; if it doesn't exist, error out */
4161
	if (stat(fullpath, &stat_buf) != 0 ||
4162 4163 4164 4165
			!S_ISDIR(stat_buf.st_mode))
			ereport(FATAL,
					(errmsg("required WAL directory \"%s\" does not exist",
							XLOGDIR)));
4166 4167 4168 4169 4170

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	fullpath = makeRelativeToTxnFilespace(path);
	if (stat(fullpath, &stat_buf) == 0)
4171
	{
4172 4173 4174 4175 4176
		/* Check for weird cases where it exists but isn't a directory */
		if (!S_ISDIR(stat_buf.st_mode))
			ereport(FATAL, 
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
4177 4178 4179
	}
	else
	{
4180 4181 4182 4183 4184 4185
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
		if (mkdir(fullpath, 0700) < 0)
			ereport(FATAL, 
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
4186
	}
4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204
}

/*
 * Open a logfile segment for reading (during recovery).
 * It's assumed to be already available in pg_xlog.
 *
 * 'source' must be XLOG_FROM_PG_XLOG or XLOG_FROM_STREAM; anything else
 * raises ERROR.  On success the file descriptor is returned and the
 * current-file and receipt tracking variables are updated.  If the file does
 * not exist and 'notfoundOk' is true, -1 is returned; every other open
 * failure is a PANIC.  'emode' is not used by this function.
 */
static int
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
			 int source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, log, seg);

	/* Both supported sources read the segment from its pg_xlog location */
	if (source != XLOG_FROM_PG_XLOG && source != XLOG_FROM_STREAM)
		elog(ERROR, "invalid XLogFileRead source %d", source);

	XLogFilePath(path, tli, log, seg);
	restoredFromArchive = false;

	elogif(debug_xlog_record_read, LOG,
		   "xlog file read -- File read request with log %u, seg %u,"
		   "tli %u, source = %s, notfoundok = %s",
		   log, seg, (uint32) tli,
		   source == XLOG_FROM_PG_XLOG ? "xlog" : "stream",
		   notfoundOk ? "true" : "false");

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd < 0)
	{
		/* Only a missing file that the caller tolerates is survivable */
		if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
			ereport(PANIC,
					(errcode_for_file_access(),
			   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
					  path, log, seg)));

		elogif(debug_xlog_record_read, LOG,
			   "xlog file read -- Couldn't read file %s (log %u, seg %u)",
			   path, log, seg);
		return -1;
	}

	/* Success! */
	curFileTLI = tli;

	/*
	 * Report recovery progress in PS display, if we are in
	 * startup process.  There are more cases like Filerep recovery
	 * and Prepare phase where we don't want to report it.
	 */
	if (am_startup)
	{
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg, false);
	}

	/* Track source of data in assorted state variables */
	readSource = source;
	XLogReceiptSource = source;
	/* In FROM_STREAM case, caller tracks receipt time, not me */
	if (source != XLOG_FROM_STREAM)
		XLogReceiptTime = GetCurrentTimestamp();

	elogif(debug_xlog_record_read, LOG,
		   "xlog file read -- Read file %s (log %u, seg %u)",
		   path, log, seg);

	return fd;
}


/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLIs.
 *
 * Returns the file descriptor of the first segment file that could be
 * opened.  If none is found, the failure is reported at level 'emode' and
 * -1 is returned.  Only the XLOG_FROM_PG_XLOG bit of 'sources' is examined.
 */
static int
XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
{
	char		path[MAXPGPATH];
	ListCell   *lc;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLIs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 */
	foreach(lc, expectedTLIs)
	{
		TimeLineID	candidate = (TimeLineID) lfirst_int(lc);
		int			fd;

		if (candidate < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		if (!(sources & XLOG_FROM_PG_XLOG))
			continue;

		elogif(debug_xlog_record_read, LOG,
			   "xlog file read (tli) -- requesting a file read (log %u, seg %u)"
			   "with currenttli %d ", log, seg, curFileTLI);

		/* a missing file is fine here; just try the next timeline */
		fd = XLogFileRead(log, seg, emode, candidate, XLOG_FROM_PG_XLOG, true);
		if (fd != -1)
			return fd;
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, log, seg);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
	return -1;
}


/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns true if the page is read successfully.
 *
 * This is responsible for waiting for the requested WAL record to arrive in
 * standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in standby mode when a trigger file is found.
 * If set to WARNING or below, XLogPageRead() returns false in those
 * situations; at higher log levels the ereport() won't return.
 *
 * In standby mode, this only returns false if promotion has been triggered.
 * Otherwise it keeps sleeping and retrying indefinitely.
 */
static bool
XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess)
{
	static XLogRecPtr receivedUpto = {0, 0};
	bool		switched_segment = false;
	uint32		targetPageOff;
	uint32		targetRecOff;
	uint32		targetId;
	uint32		targetSeg;
	static pg_time_t last_fail_time = 0;

	XLByteToSeg(*RecPtr, targetId, targetSeg);
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;

	/* Fast exit if we have read the record in the current buffer already */
	if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
		targetPageOff == readOff && targetRecOff < readLen)
4354
	{
4355 4356 4357 4358 4359 4360
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- Requested record %X/%X (targetlogid %u,"
			   "targetset %u, targetpageoff %u, targetrecoff %u) already"
			   "exists in current read buffer",
			   RecPtr->xlogid, RecPtr->xrecoff,
			   targetId, targetSeg, targetPageOff, targetRecOff);
B
Bruce Momjian 已提交
4361

4362
		return true;
4363 4364
	}

4365 4366 4367 4368
	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
T
Tom Lane 已提交
4369
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
4370
	{
4371 4372 4373 4374 4375
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- Requested record %X/%X does not exist in"
			   "current read xlog file (readlog %u, readseg %u)",
			   RecPtr->xlogid, RecPtr->xrecoff, readId, readSeg);

4376 4377
		close(readFile);
		readFile = -1;
4378
		readSource = 0;
4379
	}
4380

T
Tom Lane 已提交
4381
	XLByteToSeg(*RecPtr, readId, readSeg);
4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392

	elogif(debug_xlog_record_read, LOG,
		   "xlog page read -- Requested record %X/%X has targetlogid %u, "
		   "targetseg %u, targetpageoff %u, targetrecoff %u",
		   RecPtr->xlogid, RecPtr->xrecoff,
		   targetId, targetSeg, targetPageOff, targetRecOff);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
4393
	{
4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404
		if (StandbyMode)
		{
			/*
			 * In standby mode, wait for the requested record to become
			 * available, via WAL receiver having streamed the record.
			 */
			for (;;)
			{
				if (WalRcvInProgress())
				{
					bool		havedata;
4405

4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628
					/*
					 * If we find an invalid record in the WAL streamed from
					 * master, something is seriously wrong. There's little
					 * chance that the problem will just go away, but PANIC is
					 * not good for availability. Disconnect, and retry from
					 * pg_xlog again (That may spawn the Wal receiver again!).
					 * XXX
					 */
					if (failedSources & XLOG_FROM_STREAM)
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- Xlog from stream is a failed"
							   "source, hence requesting walreceiver shutdown.");

						ShutdownWalRcv();
						continue;
					}

					/*
					 * WAL receiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle.  When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * alloted to conflicting queries will decrease.
					 */
					if (XLByteLT(*RecPtr, receivedUpto))
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
						if (XLByteLT(*RecPtr, receivedUpto))
						{
							havedata = true;
							if (!XLByteLT(*RecPtr, latestChunkStart))
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}

					if (havedata)
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- There is enough xlog data to be "
							   "read (receivedupto %X/%X, requestedrec %X/%X)",
							   receivedUpto.xlogid, receivedUpto.xrecoff,
							   RecPtr->xlogid, RecPtr->xrecoff);

						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already.  Use XLOG_FROM_STREAM so that
						 * source info is set correctly and XLogReceiptTime
						 * isn't changed.
						 */
						if (readFile < 0)
						{
							readFile =
								XLogFileRead(readId, readSeg, PANIC,
											 recoveryTargetTLI,
											 XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
							switched_segment = true;
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
						}
						break;
					}

					/*
					 * Data not here yet, so check for trigger then sleep for
					 * five seconds like in the WAL file polling case below.
					 */
					if (CheckForStandbyTrigger())
					{
						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- Standby trigger was activated");

						goto retry;
					}

					elogif(debug_xlog_record_read, LOG,
						   "xlog page read -- No xlog data to read as of now. "
						   "Will Wait on latch till some event occurs");

					/*
					 * Wait for more WAL to arrive, or timeout to be reached
					 */
					WaitLatch(&XLogCtl->recoveryWakeupLatch,
							  WL_LATCH_SET | WL_TIMEOUT,
							  5000L);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
				}
				else
				{
					int			sources;
					pg_time_t	now;

					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}

					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;

					/* Read an existing file from pg_xlog. */
					sources = XLOG_FROM_PG_XLOG;
					if (!(sources & ~failedSources))
					{
						/*
						 * Check if we have been asked to be promoted. If yes,
						 * no use of requesting a new WAL receiver
						 */
						if (CheckForStandbyTrigger())
							goto triggered;

						/*
						 * We've exhausted all options for retrieving the
						 * file. Retry.
						 */
						failedSources = 0;

						elogif(debug_xlog_record_read, LOG,
							   "xlog page read -- All read sources have failed. So, retry.");

						/*
						 * If it hasn't been long since last attempt, sleep to
						 * avoid busy-waiting.
						 */
						now = (pg_time_t) time(NULL);
						if ((now - last_fail_time) < 5)
						{
							pg_usleep(1000000L * (5 - (now - last_fail_time)));
							now = (pg_time_t) time(NULL);
						}
						last_fail_time = now;

						/*
						 * If primary_conninfo is set, launch walreceiver to
						 * try to stream the missing WAL.
						 *
						 * If fetching_ckpt is TRUE, RecPtr points to the
						 * initial checkpoint location. In that case, we use
						 * RedoStartLSN as the streaming start position
						 * instead of RecPtr, so that when we later jump
						 * backwards to start redo at RedoStartLSN, we will
						 * have the logs streamed already.
						 */
						if (PrimaryConnInfo)
						{
							RequestXLogStreaming(
									  fetching_ckpt ? RedoStartLSN : *RecPtr,
												 PrimaryConnInfo);
							continue;
						}
					}
					/* Don't try to read from a source that just failed */
					sources &= ~failedSources;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
												  sources);
					switched_segment = true;
					if (readFile >= 0)
						break;

					/*
					 * Nope, not found in pg_xlog.
					 */
					failedSources |= sources;

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from pg_xlog before failover.
					 */
					if (CheckForStandbyTrigger())
						goto triggered;
				}

				/*
				 * This possibly-long loop needs to handle interrupts of
				 * startup process.
				 */
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In crash recovery. */
			if (readFile < 0)
			{
				int			sources;

				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				sources = XLOG_FROM_PG_XLOG;

				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
											sources);
				switched_segment = true;
				if (readFile < 0)
					return false;
			}
		}
	}
4629

4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656
	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			readLen = XLOG_BLCKSZ;
		}
		else
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672
		/*
		 * Whenever switching to a new WAL segment, we read the first page of
		 * the file and validate its header, even if that's not where the
		 * target record is.  This is so that we can check the additional
		 * identification info that is present in the first page's "long"
		 * header.
		 */
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
4673
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
T
Tom Lane 已提交
4674
		{
4675 4676
			ereport(emode,
					(errcode_for_file_access(),
4677
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
4678
							readId, readSeg, readOff)));
T
Tom Lane 已提交
4679 4680
			goto next_record_is_invalid;
		}
4681
	}
4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode,
				(errcode_for_file_access(),
		 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode,
				(errcode_for_file_access(),
		 errmsg("could not read from log file %u, segment %u, offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, false))
	{
		elogif(debug_xlog_record_read, LOG,
			   "xlog page read -- xlog page header invalid");
		goto next_record_is_invalid;
	}

	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

next_record_is_invalid:

	elogif(debug_xlog_record_read, LOG,
		   "xlog page read -- next record is invalid.");

	failedSources |= readSource;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return false;

triggered:
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	return false;
}

/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * 'fetching_ckpt' is passed through to XLogPageRead() for the page holding
 * the start of the record.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG.)  In standby mode a NULL result from a
 * failed page read means that promotion has been triggered.
 *
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.  On success ReadRecPtr is
 * set to the start of the record and EndRecPtr to the position just past it.
 */
XLogRecord *
XLogReadRecord(XLogRecPtr *RecPtr, bool fetching_ckpt, int emode)
{
	XLogRecord *record;
	char	   *buffer;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
	bool		randAccess = false;
	uint32		len,
				total_len;
	uint32		targetRecOff;
	uint32		pageHeaderSize;

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it this
		 * way, rather than just making a static array, for two reasons: (1)
		 * no need to waste the storage in most instantiations of the backend;
		 * (2) a static char array isn't guaranteed to have any particular
		 * alignment, whereas malloc() will provide MAXALIGN'd storage.
		 */
		readBuf = (char *) malloc(XLOG_BLCKSZ);
		if(!readBuf)
			ereport(PANIC, (errmsg("Cannot allocate memory for read log record. Out of Memory")));
	}

	if (RecPtr == NULL)
	{
		RecPtr = &tmpRecPtr;

		/*
		 * RecPtr is pointing to end+1 of the previous WAL record. We must
		 * advance it if necessary to where the next record starts.  First,
		 * align to next page if no more records can fit on the current page.
		 */
		if (nextRecord == NULL)
		{
			/* align old recptr to next page */
			if (RecPtr->xrecoff % XLOG_BLCKSZ != 0)
				RecPtr->xrecoff += (XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ);
			if (RecPtr->xrecoff >= XLogFileSize)
			{
				(RecPtr->xlogid)++;
				RecPtr->xrecoff = 0;
			}
		}
		/* We will account for page header size below */
	}
	else
	{
		/*
		 * In this case, the passed-in record pointer should already be
		 * pointing to a valid record starting position.
		 */
		if (!XRecOffIsValid(RecPtr->xrecoff))
			ereport(PANIC,
					(errmsg("invalid record offset at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));

		/*
		 * Since we are going to a random position in WAL, forget any prior
		 * state about what timeline we were in, and allow it to be any
		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
		 * to go backwards (but we can't reset that variable right here, since
		 * we might not change files at all).
		 */
		lastPageTLI = 0;		/* see comment in ValidXLOGHeader */
		lastSegmentTLI = 0;
		randAccess = true;		/* allow curFileTLI to go backwards too */
	}

	/* This is the first try to read this page. */
	failedSources = 0;
retry:
	/* Read the page containing the record */
	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
	{
		/*
		 * In standby mode, XLogPageRead returning false means that promotion
		 * has been triggered.
		 */
		if (StandbyMode)
			return NULL;
		else
			goto next_record_is_invalid;
	}

	/*
	 * XLogPageRead() succeeded, so readBuf now holds the page containing the
	 * start of the record; everything below inspects that page.
	 */
	pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
	if (targetRecOff == 0)
	{
		/*
		 * At page start, so skip over page header.  The Assert checks that
		 * we're not scribbling on caller's record pointer; it's OK because we
		 * can only get here in the continuing-from-prev-record case, since
		 * XRecOffIsValid rejected the zero-page-offset case otherwise.
		 */
		Assert(RecPtr == &tmpRecPtr);
		RecPtr->xrecoff += pageHeaderSize;
		targetRecOff = pageHeaderSize;
	}
	else if (targetRecOff < pageHeaderSize)
	{
		ereport(emode,
				(errmsg("invalid record offset at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
		targetRecOff == pageHeaderSize)
	{
		ereport(emode,
				(errmsg("contrecord is requested by %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);

	/*
	 * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
	 * required.
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		if (record->xl_len != 0)
		{
			ereport(emode,
					(errmsg("invalid xlog switch record at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else if (record->xl_len == 0)
	{
		ereport(emode,
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
	{
		ereport(emode,
				(errmsg("invalid record length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
		ereport(emode,
				(errmsg("invalid resource manager ID %u at %X/%X",
						record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
	if (randAccess)
	{
		/*
		 * We can't exactly verify the prev-link, but surely it should be less
		 * than the record's own address.
		 */
		if (!XLByteLT(record->xl_prev, *RecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else
	{
		/*
		 * Record's prev-link should exactly match our previous location. This
		 * check guards against torn WAL pages where a stale but valid-looking
		 * WAL record starts on a sector boundary.
		 */
		if (!XLByteEQ(record->xl_prev, ReadRecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}

	/*
	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
	 * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
	 * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
	 * enough for all "normal" records, but very large commit or abort records
	 * might need more space.)
	 */
	total_len = record->xl_tot_len;
	if (total_len > readRecordBufSize)
	{
		uint32		newSize = total_len;

		newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
		newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
		if (readRecordBuf)
			free(readRecordBuf);
		readRecordBuf = (char *) malloc(newSize);
		if (!readRecordBuf)
		{
			readRecordBufSize = 0;
			ereport(emode,
					(errmsg("cannot allocate %u bytes for record at %X/%X",
							newSize, RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
		readRecordBufSize = newSize;
	}

	buffer = readRecordBuf;
	nextRecord = NULL;
	/* len = number of bytes from the record start to the end of this page */
	len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
	if (total_len > len)
	{
		/* Need to reassemble record */
		XLogContRecord *contrecord;
		XLogRecPtr	pagelsn;
		uint32		gotlen = len;

		/* Initialize pagelsn to the beginning of the page this record is on */
		pagelsn = *RecPtr;
		pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;

		/*
		 * Copy the first fragment now: the XLogPageRead() calls below
		 * overwrite readBuf, so from here on 'record' must point at the copy.
		 */
		memcpy(buffer, record, len);
		record = (XLogRecord *) buffer;
		buffer += len;
		for (;;)
		{
			/* Calculate pointer to beginning of next page */
			pagelsn.xrecoff += XLOG_BLCKSZ;
			if (pagelsn.xrecoff >= XLogFileSize)
			{
				(pagelsn.xlogid)++;
				pagelsn.xrecoff = 0;
			}
			/* Wait for the next page to become available */
			if (!XLogPageRead(&pagelsn, emode, false, false))
			{
				/*
				 * In standby-mode, XLogPageRead returning false means that
				 * promotion has been triggered.
				 */
				if (StandbyMode)
					return NULL;
				else
					goto next_record_is_invalid;
			}

			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
			{
				ereport(emode,
						(errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
								readId, readSeg, readOff)));
				goto next_record_is_invalid;
			}
			pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
			contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
			if (contrecord->xl_rem_len == 0 ||
				total_len != (contrecord->xl_rem_len + gotlen))
			{
				ereport(emode,
						(errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
								contrecord->xl_rem_len,
								readId, readSeg, readOff)));
				goto next_record_is_invalid;
			}
			/* len = payload capacity of a continuation page */
			len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
			{
				/* the record fills this page too; take it all and go on */
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
				gotlen += len;
				buffer += len;
				continue;
			}
			/* last fragment */
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
		if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
		{
			nextRecord = (XLogRecord *) ((char *) contrecord +
					MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
			pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		/* needn't worry about XLOG SWITCH, it can't cross page boundaries */

		/* Same success trace as the single-page path below */
		elogif(debug_xlog_record_read, LOG,
			   "xlog read record -- Read record %X/%X successfully with endrecptr %X/%X",
			   ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
			   EndRecPtr.xlogid, EndRecPtr.xrecoff);

		return record;
	}

	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);

	/*
	 * Special processing if it's an XLOG SWITCH record
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		/* Pretend it extends to end of segment */
		EndRecPtr.xrecoff += XLogSegSize - 1;
		EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
		nextRecord = NULL;		/* definitely not on same page */

		/*
		 * Pretend that readBuf contains the last page of the segment. This is
		 * just to avoid Assert failure in StartupXLOG if XLOG ends with this
		 * segment.
		 */
		readOff = XLogSegSize - XLOG_BLCKSZ;
	}

	elogif(debug_xlog_record_read, LOG,
		   "xlog read record -- Read record %X/%X successfully with endrecptr %X/%X",
		   ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
		   EndRecPtr.xlogid, EndRecPtr.xrecoff);

	return (XLogRecord *) buffer;

next_record_is_invalid:

	elogif(debug_xlog_record_read, LOG,
		   "xlog record read -- next record is invalid.");

	/* Remember which source gave us bad data so the retry avoids it */
	failedSources |= readSource;

	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	nextRecord = NULL;

	/* In standby-mode, keep trying */
	if (StandbyMode && !CheckForStandbyTrigger())
		goto retry;
	else
		return NULL;
}

/*
 * Close, re-set and clean all the necessary resources used during reading
 * XLog records.
 */
void
XLogCloseReadRecord(void)
{
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	else
		Assert(readFile == -1);

	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	if (readRecordBuf)
	{
		free(readRecordBuf);
		readRecordBuf = NULL;
	}

	readId = 0;
	readSeg = 0;
	readOff = 0;
	readLen = 0;
	readRecordBufSize = 0;
T
Tom Lane 已提交
5149
	nextRecord = NULL;
5150 5151 5152 5153 5154

	memset(&ReadRecPtr, 0, sizeof(XLogRecPtr));
	memset(&EndRecPtr, 0, sizeof(XLogRecPtr));

	elog((Debug_print_qd_mirroring ? LOG : DEBUG1),"close read record");
5155 5156
}

5157 5158 5159 5160
/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
B
Bruce Momjian 已提交
5161
 * ReadRecord.	It's not intended for use from anywhere else.
5162 5163
 */
static bool
5164
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly)
5165
{
5166 5167
	XLogRecPtr	recaddr;

5168 5169
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
5170 5171 5172
		ereport(emode,
				(errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
						hdr->xlp_magic, readId, readSeg, readOff)));
5173 5174 5175 5176
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
5177 5178 5179
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
5180 5181
		return false;
	}
5182
	if (hdr->xlp_info & XLP_LONG_HEADER)
5183
	{
5184
		XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
B
Bruce Momjian 已提交
5185

5186
		if (longhdr->xlp_sysid != ControlFile->system_identifier)
5187
		{
5188 5189
			char		fhdrident_str[32];
			char		sysident_str[32];
5190

5191
			/*
B
Bruce Momjian 已提交
5192 5193
			 * Format sysids separately to keep platform-dependent format code
			 * out of the translatable message string.
5194 5195 5196 5197 5198 5199 5200
			 */
			snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
					 longhdr->xlp_sysid);
			snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
					 ControlFile->system_identifier);
			ereport(emode,
					(errmsg("WAL file is from different system"),
B
Bruce Momjian 已提交
5201 5202
					 errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
							   fhdrident_str, sysident_str)));
5203 5204 5205 5206 5207 5208
			return false;
		}
		if (longhdr->xlp_seg_size != XLogSegSize)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
B
Bruce Momjian 已提交
5209
					 errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
5210 5211
			return false;
		}
5212 5213 5214 5215 5216 5217 5218
		if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
					 errdetail("Incorrect XLOG_BLCKSZ in page header.")));
			return false;
		}
5219
	}
5220 5221 5222 5223 5224 5225 5226 5227 5228
	else if (readOff == 0)
	{
		/* hmm, first page of file doesn't have a long header? */
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}

5229 5230 5231 5232 5233 5234
	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
		ereport(emode,
				(errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
B
Bruce Momjian 已提交
5235
						hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Check page TLI is one of the expected values.
	 */
	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
	{
		ereport(emode,
				(errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
						hdr->xlp_tli,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Since child timelines are always assigned a TLI greater than their
	 * immediate parent's TLI, we should never see TLI go backwards across
	 * successive pages of a consistent WAL sequence.
	 *
B
Bruce Momjian 已提交
5257
	 * Of course this check should only be applied when advancing sequentially
5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268
	 * across pages; therefore ReadRecord resets lastPageTLI and
	 * lastSegmentTLI to zero when going to a random page.
	 *
	 * Sometimes we re-open a segment that's already been partially replayed.
	 * In that case we cannot perform the normal TLI check: if there is a
	 * timeline switch within the segment, the first page has a smaller TLI
	 * than later pages following the timeline switch, and we might've read
	 * them already. As a weaker test, we still check that it's not smaller
	 * than the TLI we last saw at the beginning of a segment. Pass
	 * segmentonly = true when re-validating the first page like that, and the
	 * page you're actually interested in comes later.
5269
	 */
5270
	if (hdr->xlp_tli < (segmentonly ? lastSegmentTLI : lastPageTLI))
5271 5272 5273 5274 5275 5276 5277 5278
	{
		ereport(emode,
				(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
						hdr->xlp_tli, lastPageTLI,
						readId, readSeg, readOff)));
		return false;
	}
	lastPageTLI = hdr->xlp_tli;
5279 5280 5281
	if (readOff == 0)
		lastSegmentTLI = hdr->xlp_tli;

5282 5283 5284 5285 5286 5287 5288
	return true;
}

/*
 * Try to read a timeline's history file.
 *
 * If successful, return the list of component TLIs (the given TLI followed by
B
Bruce Momjian 已提交
5289
 * its ancestor TLIs).	If we can't find the history file, assume that the
5290 5291 5292
 * timeline has no parents, and return a list of just the specified timeline
 * ID.
 */
5293 5294
List *
XLogReadTimeLineHistory(TimeLineID targetTLI)
5295 5296 5297 5298
{
	List	   *result;
	char		path[MAXPGPATH];
	char		fline[MAXPGPATH];
B
Bruce Momjian 已提交
5299
	FILE	   *fd;
5300

5301 5302 5303 5304 5305
	/* Timeline 1 does not have a history file, so no need to check */
	if (targetTLI == 1)
		return list_make1_int((int) targetTLI);

	TLHistoryFilePath(path, targetTLI);
5306

B
Bruce Momjian 已提交
5307
	fd = AllocateFile(path, "r");
5308 5309 5310 5311 5312
	if (fd == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
5313
					 errmsg("could not open file \"%s\": %m", path)));
5314 5315 5316 5317 5318 5319
		/* Not there, so assume no parents */
		return list_make1_int((int) targetTLI);
	}

	result = NIL;

B
Bruce Momjian 已提交
5320 5321 5322
	/*
	 * Parse the file...
	 */
5323
	while (fgets(fline, sizeof(fline), fd) != NULL)
5324 5325
	{
		/* skip leading whitespace and check for # comment */
B
Bruce Momjian 已提交
5326 5327 5328
		char	   *ptr;
		char	   *endptr;
		TimeLineID	tli;
5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348

		for (ptr = fline; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
				break;
		}
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* expect a numeric timeline ID as first field of line */
		tli = (TimeLineID) strtoul(ptr, &endptr, 0);
		if (endptr == ptr)
			ereport(FATAL,
					(errmsg("syntax error in history file: %s", fline),
					 errhint("Expected a numeric timeline ID.")));

		if (result &&
			tli <= (TimeLineID) linitial_int(result))
			ereport(FATAL,
					(errmsg("invalid data in history file: %s", fline),
B
Bruce Momjian 已提交
5349
				   errhint("Timeline IDs must be in increasing sequence.")));
5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362

		/* Build list with newest item first */
		result = lcons_int((int) tli, result);

		/* we ignore the remainder of each line */
	}

	FreeFile(fd);

	if (result &&
		targetTLI <= (TimeLineID) linitial_int(result))
		ereport(FATAL,
				(errmsg("invalid data in history file \"%s\"", path),
B
Bruce Momjian 已提交
5363
			errhint("Timeline IDs must be less than child timeline's ID.")));
5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380

	result = lcons_int((int) targetTLI, result);

	ereport(DEBUG3,
			(errmsg_internal("history of timeline %u is %s",
							 targetTLI, nodeToString(result))));

	return result;
}

/*
 * Probe whether a timeline history file exists for the given timeline ID.
 *
 * Returns true if the file can be opened for reading and false if it does
 * not exist (ENOENT).  Any other open failure is reported as FATAL.
 */
static bool
existsTimeLineHistory(TimeLineID probeTLI)
{
	char		histpath[MAXPGPATH];
	FILE	   *probe;

	TLHistoryFilePath(histpath, probeTLI);

	probe = AllocateFile(histpath, "r");
	if (probe == NULL)
	{
		/* Only "no such file" counts as a clean negative answer. */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", histpath)));
		return false;
	}

	FreeFile(probe);
	return true;
}

/*
 * Find the newest existing timeline, assuming that startTLI exists.
 *
 * Note: while this is somewhat heuristic, it does positively guarantee
 * that (result + 1) is not a known timeline, and therefore it should
 * be safe to assign that ID to a new timeline.
 *
 * The algorithm is just to probe for the existence of timeline history
 * files, stopping at the first ID that has none.  XXX is it useful to allow
 * gaps in the sequence?
 */
static TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
	TimeLineID	newest = startTLI;
	TimeLineID	candidate = startTLI + 1;

	while (existsTimeLineHistory(candidate))
	{
		/* candidate exists, so remember it and try its successor */
		newest = candidate;
		candidate++;
	}

	return newest;
}

/*
 * Close the descriptors writeTimeLineHistory still has open before it
 * reports an error.  Negative descriptors are skipped.  errno is restored
 * afterwards so that a following %m still describes the original failure.
 */
static void
closeTimeLineHistoryFiles(int fd, int srcfd)
{
	int			save_errno = errno;

	if (fd >= 0)
		close(fd);
	if (srcfd >= 0)
		close(srcfd);

	errno = save_errno;
}

/*
 * Create a new timeline history file.
 *
 *	newTLI: ID of the new timeline
 *	parentTLI: ID of its immediate parent
 *	endTLI et al: ID of the last used WAL file, for annotation purposes
 *
 * The file is built under a temporary name: the parent's history file (if
 * there is one) is copied, one line describing this timeline split is
 * appended, the result is fsync'd, and only then is it linked/renamed to its
 * final name and handed to the archiver.
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 *
 * Failures are reported at ERROR; the file descriptors opened here are
 * closed before the error is raised.
 */
static void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		xlogfname[MAXFNAMELEN];
	char		buffer[BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;
	char	   *xlogDir = NULL;

	Assert(newTLI > parentTLI); /* else bad selection of newTLI */

	/*
	 * Write into a temp file name.
	 *
	 * snprintf returns the length the full string would have had, so a
	 * result equal to the buffer size already means the name was truncated.
	 */
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d", xlogDir, (int) getpid()) >= MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate filename %s/xlogtemp.%d", xlogDir, (int) getpid())));
	}
	pfree(xlogDir);
	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	TLHistoryFilePath(path, parentTLI);

	srcfd = BasicOpenFile(path, O_RDONLY, 0);
	if (srcfd < 0)
	{
		if (errno != ENOENT)
		{
			closeTimeLineHistoryFiles(fd, -1);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
		/* Not there, so assume parent has no parents */
	}
	else
	{
		/* Copy the parent's history verbatim into the temp file. */
		for (;;)
		{
			errno = 0;
			nbytes = (int) read(srcfd, buffer, sizeof(buffer));
			if (nbytes < 0 || errno != 0)
			{
				closeTimeLineHistoryFiles(fd, srcfd);
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			}
			if (nbytes == 0)
				break;
			errno = 0;
			if ((int) write(fd, buffer, nbytes) != nbytes)
			{
				int			save_errno = errno;

				/*
				 * If we fail to make the file, delete it to release disk
				 * space
				 */
				unlink(tmppath);
				closeTimeLineHistoryFiles(fd, srcfd);

				/*
				 * if write didn't set errno, assume problem is no disk space
				 */
				errno = save_errno ? save_errno : ENOSPC;

				ereport(ERROR,
						(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
			}
		}
		close(srcfd);
	}

	/*
	 * Append one line with the details of this timeline split.
	 *
	 * If we did have a parent file, insert an extra newline just in case the
	 * parent file failed to end with one.  (srcfd keeps its non-negative
	 * value after being closed, which is what the test below relies on.)
	 */
	XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);

	snprintf(buffer, sizeof(buffer),
			 "%s%u\t%s\t%s transaction %u at %s\n",
			 (srcfd < 0) ? "" : "\n",
			 parentTLI,
			 xlogfname,
			 recoveryStopAfter ? "after" : "before",
			 recoveryStopXid,
			 timestamptz_to_str(recoveryStopTime));

	nbytes = strlen(buffer);
	errno = 0;
	if ((int) write(fd, buffer, nbytes) != nbytes)
	{
		int			save_errno = errno;

		/*
		 * If we fail to make the file, delete it to release disk space
		 */
		unlink(tmppath);
		closeTimeLineHistoryFiles(fd, -1);

		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	if (pg_fsync(fd) != 0)
	{
		closeTimeLineHistoryFiles(fd, -1);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));


	/*
	 * Now move the completed history file into place with its final name.
	 */
	TLHistoryFilePath(path, newTLI);

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK
	if (link(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						tmppath, path)));
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
#endif

	/* The history file can be archived immediately. */
	TLHistoryFileName(histfname, newTLI);
	XLogArchiveNotify(histfname);
}

5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662
/*
 * Record the checkpoint locations currently held in ControlFile as the
 * watcher's baseline and mark the watcher as initialized.  The baseline is
 * logged when Debug_print_control_checkpoints is set.
 */
static void
ControlFileWatcherSaveInitial(void)
{
	/* Snapshot the three locations the watcher tracks. */
	ControlFileWatcher->current_checkPointLoc = ControlFile->checkPoint;
	ControlFileWatcher->current_prevCheckPointLoc = ControlFile->prevCheckPoint;
	ControlFileWatcher->current_checkPointCopy_redo = ControlFile->checkPointCopy.redo;

	if (Debug_print_control_checkpoints)
	{
		elog(LOG,
			 "pg_control checkpoint: initial values (checkpoint loc %s, previous loc %s, copy's redo loc %s)",
			 XLogLocationToString_Long(&ControlFile->checkPoint),
			 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
			 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo));
	}

	ControlFileWatcher->watcherInitialized = true;
}

/*
 * Compare the checkpoint locations in ControlFile with the values the
 * watcher last recorded.  If any of them differ, record the new values and,
 * when the XLOG write/flush positions are available, PANIC if the flushed
 * position is not beyond the checkpoint location.  The change is logged when
 * Debug_print_control_checkpoints is set.
 */
static void
ControlFileWatcherCheckForChange(void)
{
	XLogRecPtr	writeLoc;
	XLogRecPtr	flushedLoc;
	bool		haveLocs;

	/* Nothing to do unless at least one tracked location moved. */
	if (XLByteEQ(ControlFileWatcher->current_checkPointLoc, ControlFile->checkPoint) &&
		XLByteEQ(ControlFileWatcher->current_prevCheckPointLoc, ControlFile->prevCheckPoint) &&
		XLByteEQ(ControlFileWatcher->current_checkPointCopy_redo, ControlFile->checkPointCopy.redo))
		return;

	ControlFileWatcher->current_checkPointLoc = ControlFile->checkPoint;
	ControlFileWatcher->current_prevCheckPointLoc = ControlFile->prevCheckPoint;
	ControlFileWatcher->current_checkPointCopy_redo = ControlFile->checkPointCopy.redo;

	haveLocs = XLogGetWriteAndFlushedLoc(&writeLoc, &flushedLoc);

	/* A checkpoint location at or past the flush point is fatal. */
	if (haveLocs && XLByteLE(flushedLoc, ControlFile->checkPoint))
		elog(PANIC,"Checkpoint location %s for pg_control file is not flushed (write loc %s, flushed loc is %s)",
			 XLogLocationToString_Long(&ControlFile->checkPoint),
			 XLogLocationToString2_Long(&writeLoc),
			 XLogLocationToString3_Long(&flushedLoc));

	if (!Debug_print_control_checkpoints)
		return;

	if (haveLocs)
		elog(LOG,"pg_control checkpoint: change (checkpoint loc %s, previous loc %s, copy's redo loc %s, write loc %s, flushed loc %s)",
			 XLogLocationToString_Long(&ControlFile->checkPoint),
			 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
			 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo),
			 XLogLocationToString4_Long(&writeLoc),
			 XLogLocationToString5_Long(&flushedLoc));
	else
		elog(LOG,"pg_control checkpoint: change (checkpoint loc %s, previous loc %s, copy's redo loc %s)",
			 XLogLocationToString_Long(&ControlFile->checkPoint),
			 XLogLocationToString2_Long(&ControlFile->prevCheckPoint),
			 XLogLocationToString3_Long(&ControlFile->checkPointCopy.redo));
}

5663 5664
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
5680 5681
	MirroredFlatFileOpen	mirroredOpen;

B
Bruce Momjian 已提交
5682
	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */
5683 5684

	/*
T
Tom Lane 已提交
5685
	 * Initialize version and compatibility-check fields
5686
	 */
T
Tom Lane 已提交
5687 5688
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
5689 5690 5691 5692

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

5693 5694
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
5695
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
5696
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
5697 5698

	ControlFile->nameDataLen = NAMEDATALEN;
5699
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
5700

5701 5702
	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

5703
#ifdef HAVE_INT64_TIMESTAMP
5704
	ControlFile->enableIntTimes = true;
5705
#else
5706
	ControlFile->enableIntTimes = false;
5707
#endif
5708 5709
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;
5710

T
Tom Lane 已提交
5711
	/* Contents are protected with a CRC */
5712 5713
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
5714 5715
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
5716
	FIN_CRC32C(ControlFile->crc);
T
Tom Lane 已提交
5717

5718
	/*
5719 5720 5721 5722 5723
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
5724
	 */
5725 5726
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
5727

5728
	memset(buffer, 0, PG_CONTROL_SIZE);
5729 5730
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

5731 5732 5733 5734 5735 5736 5737 5738
	MirroredFlatFile_Open(
					&mirroredOpen,
					XLOG_CONTROL_FILE_SUBDIR,
					XLOG_CONTROL_FILE_SIMPLE,
					O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
5739
					/* isMirrorRecovery */ false);
5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754

	MirroredFlatFile_Write(
					&mirroredOpen,
					0,
					buffer,
					PG_CONTROL_SIZE,
					/* suppressError */ false);

	MirroredFlatFile_Flush(
					&mirroredOpen,
					/* suppressError */ false);

	MirroredFlatFile_Close(&mirroredOpen);

	ControlFileWatcherSaveInitial();
5755 5756 5757 5758 5759
}

static void
ReadControlFile(void)
{
5760
	pg_crc32	crc;
5761 5762 5763 5764 5765
	int			fd;

	/*
	 * Read data...
	 */
5766 5767 5768
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
5769
	if (fd < 0)
5770 5771 5772
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
5773
						XLOG_CONTROL_FILE)));
5774 5775

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
5776 5777
		ereport(PANIC,
				(errcode_for_file_access(),
5778
				 errmsg("could not read from control file: %m")));
5779 5780 5781

	close(fd);

T
Tom Lane 已提交
5782
	/*
B
Bruce Momjian 已提交
5783 5784 5785 5786
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
T
Tom Lane 已提交
5787
	 */
5788 5789 5790 5791 5792

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
5793 5794
		 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
			ControlFile->pg_control_version, ControlFile->pg_control_version,
5795 5796 5797
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

T
Tom Lane 已提交
5798
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
5799 5800 5801
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
B
Bruce Momjian 已提交
5802 5803
				  " but the server was compiled with PG_CONTROL_VERSION %d.",
						ControlFile->pg_control_version, PG_CONTROL_VERSION),
5804
				 errhint("It looks like you need to initdb.")));
5805

T
Tom Lane 已提交
5806
	/* Now check the CRC. */
5807 5808
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
5809 5810
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
5811
	FIN_CRC32C(crc);
5812

5813
	if (!EQ_CRC32C(crc, ControlFile->crc))
5814 5815
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));
5816

5817
	/*
5818
	 * Do compatibility checking immediately.  If the database isn't
5819 5820
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
5821
	 */
5822
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
5823 5824 5825
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
B
Bruce Momjian 已提交
5826 5827
				  " but the server was compiled with CATALOG_VERSION_NO %d.",
						ControlFile->catalog_version_no, CATALOG_VERSION_NO),
5828
				 errhint("It looks like you need to initdb.")));
5829 5830 5831
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
5832 5833 5834 5835
		   errdetail("The database cluster was initialized with MAXALIGN %d,"
					 " but the server was compiled with MAXALIGN %d.",
					 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
5836 5837 5838
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
P
Peter Eisentraut 已提交
5839
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
5840
				 errhint("It looks like you need to initdb.")));
5841
	if (ControlFile->blcksz != BLCKSZ)
5842 5843
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
5844 5845 5846 5847
			 errdetail("The database cluster was initialized with BLCKSZ %d,"
					   " but the server was compiled with BLCKSZ %d.",
					   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
5848
	if (ControlFile->relseg_size != RELSEG_SIZE)
5849 5850
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
5851 5852 5853 5854
		errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
				  " but the server was compiled with RELSEG_SIZE %d.",
				  ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
5855 5856 5857
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
5858 5859 5860
		errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
				  " but the server was compiled with XLOG_BLCKSZ %d.",
				  ControlFile->xlog_blcksz, XLOG_BLCKSZ),
5861
				 errhint("It looks like you need to recompile or initdb.")));
5862 5863 5864 5865
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
B
Bruce Momjian 已提交
5866
					   " but the server was compiled with XLOG_SEG_SIZE %d.",
5867
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
B
Bruce Momjian 已提交
5868
				 errhint("It looks like you need to recompile or initdb.")));
5869
	if (ControlFile->nameDataLen != NAMEDATALEN)
5870 5871
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
5872 5873 5874 5875
		errdetail("The database cluster was initialized with NAMEDATALEN %d,"
				  " but the server was compiled with NAMEDATALEN %d.",
				  ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
5876
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
5877 5878
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
5879
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
B
Bruce Momjian 已提交
5880
					  " but the server was compiled with INDEX_MAX_KEYS %d.",
5881
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
B
Bruce Momjian 已提交
5882
				 errhint("It looks like you need to recompile or initdb.")));
5883 5884 5885 5886
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
B
Bruce Momjian 已提交
5887 5888
				" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
			  ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
5889
				 errhint("It looks like you need to recompile or initdb.")));
5890 5891

#ifdef HAVE_INT64_TIMESTAMP
5892
	if (ControlFile->enableIntTimes != true)
5893 5894 5895
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
5896 5897
				  " but the server was compiled with HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
5898
#else
5899
	if (ControlFile->enableIntTimes != false)
5900 5901 5902
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
5903 5904
			   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
5905 5906
#endif

5907 5908 5909 5910 5911
#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
5912
					  " but the server was compiled with USE_FLOAT4_BYVAL."),
5913 5914 5915 5916 5917
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
5918 5919
		errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
				  " but the server was compiled without USE_FLOAT4_BYVAL."),
5920 5921 5922 5923 5924 5925 5926 5927
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
5928
					  " but the server was compiled with USE_FLOAT8_BYVAL."),
5929 5930 5931 5932 5933
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
5934 5935
		errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
				  " but the server was compiled without USE_FLOAT8_BYVAL."),
5936 5937 5938
				 errhint("It looks like you need to recompile or initdb.")));
#endif

5939 5940 5941 5942
	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);

5943 5944 5945 5946 5947 5948 5949 5950
	if (!ControlFileWatcher->watcherInitialized)
	{
		ControlFileWatcherSaveInitial();
	}
	else
	{
		ControlFileWatcherCheckForChange();
	}
5951 5952
}

5953 5954
/*
 * Fetch the shared "written" and "flushed" XLOG positions.
 *
 * Both values are read from XLogCtl->LogwrtResult while holding info_lck
 * and are stored through the caller-supplied pointers.
 *
 * Returns true when the reported write location is anything other than 0/0.
 */
static bool
XLogGetWriteAndFlushedLoc(XLogRecPtr *writeLoc, XLogRecPtr *flushedLoc)
{
	/* volatile access keeps the compiler from moving loads out of the lock */
	volatile XLogCtlData *shared = XLogCtl;
	XLogRecPtr	written;
	XLogRecPtr	flushed;

	SpinLockAcquire(&shared->info_lck);
	written = shared->LogwrtResult.Write;
	flushed = shared->LogwrtResult.Flush;
	SpinLockRelease(&shared->info_lck);

	*writeLoc = written;
	*flushedLoc = flushed;

	return !(writeLoc->xlogid == 0 && writeLoc->xrecoff == 0);
}
5966

5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027
/*
 * Special-purpose FileRep routine: flush XLOG records from the in-memory
 * XLOG cache to disk, up through the last change tracking end location.
 *
 * On return *lastChangeTrackingEndLoc holds the location that was read from
 * XLogCtl and flushed through.
 */
void
XLogFileRepFlushCache(
	XLogRecPtr	*lastChangeTrackingEndLoc)
{
	XLogRecPtr	flushTarget;

	/*
	 * The ChangeTrackingTransitionLock is held EXCLUSIVE here, so the value
	 * in shared memory is the previous location -- the one we want.
	 *
	 * Because that lock is taken after ALL WRITES and FSYNCS in
	 * XLogInsert_Internal, this flush is safe (i.e. it will not hang) and
	 * pushes out every XLOG record that the next call to
	 * ChangeTracking_CreateInitialFromPreviousCheckpoint needs to see.
	 */
	flushTarget = XLogCtl->lastChangeTrackingEndLoc;
	*lastChangeTrackingEndLoc = flushTarget;

	if (Debug_persistent_print)
	{
		elog(Persistent_DebugPrintLevel(),
			 "XLogFileRepFlushCache: Going flush through location %s...",
			 XLogLocationToString(lastChangeTrackingEndLoc));
	}

	XLogFlush(flushTarget);
}

void
XLogInChangeTrackingTransition(void)
{
	XLogRecPtr	lastChangeTrackingEndLoc;
	XLogRecPtr      recPtrInit = {0, 0};

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "XLogInChangeTrackingTransition: Acquiring ChangeTrackingTransitionLock...");

	LWLockAcquire(ChangeTrackingTransitionLock, LW_EXCLUSIVE);

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "XLogInChangeTrackingTransition: SegmentStateInChangeTrackingTransition...");

	FileRep_SetSegmentState(SegmentStateInChangeTrackingTransition, FaultTypeNotInitialized);

	XLogFileRepFlushCache(&lastChangeTrackingEndLoc);

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "XLogInChangeTrackingTransition: Calling ChangeTracking_CreateInitialFromPreviousCheckpoint with lastChangeTrackingEndLoc %s",
			 XLogLocationToString(&lastChangeTrackingEndLoc));

	/*
	 * During gpstart the following order is followed for recovery:
	 *                      a) xlog records from last checkpoint are replayed into change tracking log file
	 *                      b) xlog is replayed by 3 pass mechanism
	 * In that case XLogCtl->lastChangeTrackingEndLoc will be still set to {0,0} and
	 * in order to insert all xlog records from last checkpoint into change tracking log file
	 * ChangeTracking_CreateInitialFromPreviousCheckpoint(NULL); has to be called.
	 */
	if (XLByteEQ(lastChangeTrackingEndLoc, recPtrInit))
6028
	{
6029 6030 6031 6032 6033
		ChangeTracking_CreateInitialFromPreviousCheckpoint(NULL);
	}
	else
	{
		ChangeTracking_CreateInitialFromPreviousCheckpoint(&lastChangeTrackingEndLoc);
6034
	}
6035

6036
	LWLockRelease(ChangeTrackingTransitionLock);
6037

6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048
	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "XLogInChangeTrackingTransition: Released ChangeTrackingTransitionLock");

}

void
UpdateControlFile(void)
{
	MirroredFlatFileOpen	mirroredOpen;

6049 6050
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
6051 6052
				   (char *) ControlFile,
				   offsetof(ControlFileData, crc));
6053
	FIN_CRC32C(ControlFile->crc);
6054 6055 6056 6057 6058 6059 6060 6061 6062

	MirroredFlatFile_Open(
					&mirroredOpen,
					XLOG_CONTROL_FILE_SUBDIR,
					XLOG_CONTROL_FILE_SIMPLE,
					O_RDWR | PG_BINARY,
					S_IRUSR | S_IWUSR,
					/* suppressError */ false,
					/* atomic operation */ false,
6063
					/* isMirrorRecovery */ false);
6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090

	MirroredFlatFile_Write(
					&mirroredOpen,
					0,
					ControlFile,
					PG_CONTROL_SIZE,
					/* suppressError */ false);

	MirroredFlatFile_Flush(
					&mirroredOpen,
					/* suppressError */ false);

	MirroredFlatFile_Close(&mirroredOpen);

	Assert (ControlFileWatcher->watcherInitialized);

	ControlFileWatcherCheckForChange();
}

/*
 * Report the unique system identifier recorded in the control file.
 *
 * ControlFile must already point at the shared control data.
 */
uint64
GetSystemIdentifier(void)
{
	uint64		sysid;

	Assert(ControlFile != NULL);
	sysid = ControlFile->system_identifier;

	return sysid;
}

6093
/*
T
Tom Lane 已提交
6094
 * Initialization of shared memory for XLOG
6095
 */
6096
Size
6097
XLOGShmemSize(void)
6098
{
6099
	Size		size;
6100

6101 6102 6103 6104 6105 6106 6107
	/* XLogCtl */
	size = sizeof(XLogCtlData);
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, ALIGNOF_XLOG_BUFFER);
	/* and the buffers themselves */
6108
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
6109 6110

	/*
B
Bruce Momjian 已提交
6111 6112 6113
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
6114 6115
	 */

6116 6117 6118 6119
	/*
	 * Similary, we also don't PgControlWatch for the above reasons, too.
	 */

6120
	return size;
6121 6122 6123 6124 6125
}

void
XLOGShmemInit(void)
{
6126
	bool		foundCFile,
6127 6128
				foundXLog,
				foundCFileWatcher;
6129
	char	   *allocptr;
6130

6131
	ControlFile = (ControlFileData *)
6132
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
6133 6134
	ControlFileWatcher = (ControlFileWatch *)
		ShmemInitStruct("Control File Watcher", sizeof(ControlFileWatch), &foundCFileWatcher);
6135 6136
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
6137

6138
	if (foundCFile || foundXLog || foundCFileWatcher)
6139 6140
	{
		/* both should be present or neither */
6141
		Assert(foundCFile && foundXLog && foundCFileWatcher);
6142 6143
		return;
	}
6144

T
Tom Lane 已提交
6145
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
6146

6147 6148
	XLogCtl->pass4_PTCatVerificationPassed = true;

T
Tom Lane 已提交
6149
	/*
B
Bruce Momjian 已提交
6150 6151 6152
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
T
Tom Lane 已提交
6153
	 */
6154 6155
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
T
Tom Lane 已提交
6156
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
6157
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
B
Bruce Momjian 已提交
6158

T
Tom Lane 已提交
6159
	/*
6160
	 * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
T
Tom Lane 已提交
6161
	 */
6162 6163
	allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
	XLogCtl->pages = allocptr;
6164
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
T
Tom Lane 已提交
6165 6166

	/*
B
Bruce Momjian 已提交
6167 6168
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
T
Tom Lane 已提交
6169 6170
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
6171
	XLogCtl->SharedRecoveryInProgress = true;
T
Tom Lane 已提交
6172
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
6173
	SpinLockInit(&XLogCtl->info_lck);
6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

	XLogCtl->haveLastCheckpointLoc = false;
	memset(&XLogCtl->lastCheckpointLoc, 0, sizeof(XLogRecPtr));
	memset(&XLogCtl->lastCheckpointEndLoc, 0, sizeof(XLogRecPtr));

	/*
	 * Initialize the shared memory by the parameter given to postmaster.
	 * GpStandbyDbid could be inconsistent with the catalog if the postmaster
	 * is given wrong id, but there is no chance to check it in this early
	 * stage of startup, and this is how we have been doing historically.
	 */
	XLogCtl->standbyDbid = GpStandbyDbid;

	SpinLockInit(&XLogCtl->resynchronize_lck);
}
T
Tom Lane 已提交
6190

6191 6192 6193 6194 6195 6196 6197 6198 6199
/**
 * Call this once it is known to be safe to read the control file, and BEFORE
 * launching any child processes that need access to collation and ctype data.
 *
 * It is not safe to read the control file on a mirror because it may not be
 * synchronized.
 */
void
XLogStartupInit(void)
{
	/* In bootstrap mode there is nothing to read at this point. */
	if (IsBootstrapProcessingMode())
		return;

	/*
	 * Outside bootstrap mode pg_control should already exist.  Read and
	 * validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
	 */
	ReadControlFile();
}

6209 6210 6211 6212 6213 6214 6215 6216 6217 6218
/*
 * Are checksums enabled for data pages?
 */
bool
DataChecksumsEnabled(void)
{
	Assert(ControlFile != NULL);
	return (ControlFile->data_checksum_version > 0);
}

6219
/*
T
Tom Lane 已提交
6220 6221
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
6222 6223
 */
void
T
Tom Lane 已提交
6224
BootStrapXLOG(void)
6225
{
6226
	CheckPoint	checkPoint;
T
Tom Lane 已提交
6227 6228
	char	   *buffer;
	XLogPageHeader page;
6229
	XLogLongPageHeader longpage;
6230
	XLogRecord *record;
B
Bruce Momjian 已提交
6231
	bool		use_existent;
6232 6233
	uint64		sysidentifier;
	struct timeval tv;
6234
	pg_crc32	crc;
6235

6236
	/*
B
Bruce Momjian 已提交
6237 6238 6239 6240 6241 6242 6243 6244 6245 6246
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half is the XOR
	 * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
	 * unnecessarily if "uint64" is really only 32 bits wide.  A person
	 * knowing this encoding can determine the initialization time of the
	 * installation, which could perhaps be useful sometimes.
6247 6248 6249 6250 6251
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);

6252 6253 6254
	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

6255
	/* page buffer must be aligned suitably for O_DIRECT */
6256
	buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
6257
	page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
6258
	memset(page, 0, XLOG_BLCKSZ);
T
Tom Lane 已提交
6259

6260 6261 6262 6263 6264 6265 6266
	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
6267
	checkPoint.redo.xlogid = 0;
6268
	checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
6269
	checkPoint.ThisTimeLineID = ThisTimeLineID;
6270
	checkPoint.nextXidEpoch = 0;
6271
	checkPoint.nextXid = FirstNormalTransactionId;
6272
	checkPoint.nextOid = FirstBootstrapObjectId;
6273
	checkPoint.nextRelfilenode = FirstNormalObjectId;
6274
	checkPoint.nextMulti = FirstMultiXactId;
6275
	checkPoint.nextMultiOffset = 0;
6276
	checkPoint.time = (pg_time_t) time(NULL);
6277

6278 6279 6280
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
6281 6282
	ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
	ShmemVariableCache->relfilenodeCount = 0;
6283
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6284

6285
	/* Set up the XLOG page header */
6286
	page->xlp_magic = XLOG_PAGE_MAGIC;
6287 6288
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
6289
	page->xlp_pageaddr.xlogid = 0;
6290
	page->xlp_pageaddr.xrecoff = XLogSegSize;
6291 6292 6293
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = XLogSegSize;
6294
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
6295 6296

	/* Insert the initial checkpoint record */
6297
	record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
6298
	record->xl_prev.xlogid = 0;
6299
	record->xl_prev.xrecoff = 0;
6300
	record->xl_xid = InvalidTransactionId;
6301
	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
6302
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
6303
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
6304
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
6305
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
6306

6307 6308 6309
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, &checkPoint, sizeof(checkPoint));
	COMP_CRC32C(crc, (char *) record + sizeof(pg_crc32),
6310
			   SizeOfXLogRecord - sizeof(pg_crc32));
6311
	FIN_CRC32C(crc);
6312

6313 6314
	record->xl_crc = crc;

6315
	/* Create first XLOG segment file */
6316
	use_existent = false;
6317 6318 6319
	XLogFileInit(
		&mirroredLogFileOpen,
		0, 1, &use_existent, false);
6320

6321
	/* Write the first page with the initial record */
6322
	errno = 0;
6323 6324 6325 6326 6327
	if (MirroredFlatFile_Append(
			&mirroredLogFileOpen,
			page,
			XLOG_BLCKSZ,
			/* suppressError */ true))
6328
	{
6329 6330
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
6331
			  errmsg("could not write bootstrap transaction log file: %m")));
6332
	}
6333

6334 6335 6336
	if (MirroredFlatFile_Flush(
			&mirroredLogFileOpen,
			/* suppressError */ true))
6337 6338
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
6339
			  errmsg("could not fsync bootstrap transaction log file: %m")));
6340

6341 6342
	MirroredFlatFile_Close(
			&mirroredLogFileOpen);
6343

6344 6345
	/* Now create pg_control */

6346
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
6347
	/* Initialize pg_control status fields */
6348
	ControlFile->system_identifier = sysidentifier;
T
Tom Lane 已提交
6349 6350
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
6351
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
6352
	ControlFile->checkPointCopy = checkPoint;
6353 6354
	ControlFile->data_checksum_version = bootstrap_data_checksum_version;

6355
	/* some additional ControlFile fields are set in WriteControlFile() */
6356

6357
	WriteControlFile();
6358 6359 6360

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
6361
	BootStrapSUBTRANS();
6362
	BootStrapMultiXact();
6363
	DistributedLog_BootStrap();
6364

6365
	pfree(buffer);
6366 6367
}

6368
static char *
6369
str_time(pg_time_t tnow)
6370
{
6371
	static char buf[128];
6372

6373
	pg_strftime(buf, sizeof(buf),
6374 6375
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));
6376

6377
	return buf;
6378 6379
}

6380 6381
/*
 * See if there is a recovery command file (recovery.conf), and if so
6382
 * read in parameters for recovery in standby mode.
6383 6384 6385 6386 6387
 *
 * XXX longer term intention is to expand this to
 * cater for additional parameters and controls
 * possibly use a flex lexer similar to the GUC one
 */
6388 6389
void
XLogReadRecoveryCommandFile(int emode)
6390
{
B
Bruce Momjian 已提交
6391 6392 6393 6394 6395 6396
	FILE	   *fd;
	char		cmdline[MAXPGPATH];
	TimeLineID	rtli = 0;
	bool		rtliGiven = false;
	bool		syntaxError = false;

6397
	fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
6398 6399 6400
	if (fd == NULL)
	{
		if (errno == ENOENT)
6401
			return;				/* not there, so no recovery in standby mode */
6402
		ereport(FATAL,
B
Bruce Momjian 已提交
6403
				(errcode_for_file_access(),
6404
				 errmsg("could not open recovery command file \"%s\": %m",
6405
						RECOVERY_COMMAND_FILE)));
6406 6407
	}

6408 6409 6410
	ereport(emode,
			(errmsg("Found recovery.conf file, checking appropriate parameters "
					" for recovery in standby mode")));
6411

B
Bruce Momjian 已提交
6412 6413 6414
	/*
	 * Parse the file...
	 */
6415
	while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
6416 6417
	{
		/* skip leading whitespace and check for # comment */
B
Bruce Momjian 已提交
6418 6419 6420
		char	   *ptr;
		char	   *tok1;
		char	   *tok2;
6421 6422 6423 6424 6425 6426 6427 6428 6429 6430

		for (ptr = cmdline; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
				break;
		}
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* identify the quoted parameter value */
B
Bruce Momjian 已提交
6431
		tok1 = strtok(ptr, "'");
6432 6433 6434 6435 6436
		if (!tok1)
		{
			syntaxError = true;
			break;
		}
B
Bruce Momjian 已提交
6437
		tok2 = strtok(NULL, "'");
6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450
		if (!tok2)
		{
			syntaxError = true;
			break;
		}
		/* reparse to get just the parameter name */
		tok1 = strtok(ptr, " \t=");
		if (!tok1)
		{
			syntaxError = true;
			break;
		}

6451
		if (strcmp(tok1, "primary_conninfo") == 0)
B
Bruce Momjian 已提交
6452
		{
6453 6454 6455 6456
			PrimaryConnInfo = pstrdup(tok2);
			ereport(emode,
					(errmsg("primary_conninfo = \"%s\"",
							PrimaryConnInfo)));
6457
		}
6458 6459 6460 6461 6462 6463 6464
		else if (strcmp(tok1, "recovery_end_command") == 0)
		{
			recoveryEndCommand = pstrdup(tok2);
			ereport(LOG,
					(errmsg("recovery_end_command = '%s'",
							recoveryEndCommand)));
		}
B
Bruce Momjian 已提交
6465 6466
		else if (strcmp(tok1, "recovery_target_timeline") == 0)
		{
6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485
			rtliGiven = true;
			if (strcmp(tok2, "latest") == 0)
				rtli = 0;
			else
			{
				errno = 0;
				rtli = (TimeLineID) strtoul(tok2, NULL, 0);
				if (errno == EINVAL || errno == ERANGE)
					ereport(FATAL,
							(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
									tok2)));
			}
			if (rtli)
				ereport(LOG,
						(errmsg("recovery_target_timeline = %u", rtli)));
			else
				ereport(LOG,
						(errmsg("recovery_target_timeline = latest")));
		}
B
Bruce Momjian 已提交
6486 6487
		else if (strcmp(tok1, "recovery_target_xid") == 0)
		{
6488 6489 6490 6491
			errno = 0;
			recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				ereport(FATAL,
B
Bruce Momjian 已提交
6492 6493
				 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
						 tok2)));
6494 6495 6496 6497 6498 6499
			ereport(LOG,
					(errmsg("recovery_target_xid = %u",
							recoveryTargetXid)));
			recoveryTarget = true;
			recoveryTargetExact = true;
		}
B
Bruce Momjian 已提交
6500 6501
		else if (strcmp(tok1, "recovery_target_time") == 0)
		{
6502 6503 6504 6505 6506 6507 6508 6509
			/*
			 * if recovery_target_xid specified, then this overrides
			 * recovery_target_time
			 */
			if (recoveryTargetExact)
				continue;
			recoveryTarget = true;
			recoveryTargetExact = false;
B
Bruce Momjian 已提交
6510

6511
			/*
6512
			 * Convert the time string given by the user to TimestampTz form.
6513
			 */
6514 6515
			recoveryTargetTime =
				DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
B
Bruce Momjian 已提交
6516
														CStringGetDatum(tok2),
6517 6518
												ObjectIdGetDatum(InvalidOid),
														Int32GetDatum(-1)));
6519
			ereport(LOG,
6520
					(errmsg("recovery_target_time = '%s'",
6521
							timestamptz_to_str(recoveryTargetTime))));
6522
		}
B
Bruce Momjian 已提交
6523
		else if (strcmp(tok1, "recovery_target_inclusive") == 0)
6524 6525 6526 6527
		{
			/*
			 * does nothing if a recovery_target is not also set
			 */
6528
			if (!parse_bool(tok2, &recoveryTargetInclusive))
6529 6530 6531
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
6532
			ereport(LOG,
6533 6534 6535 6536 6537
					(errmsg("standby_mode = %s", tok2)));
		}
		else if (strcmp(tok1, "standby_mode") == 0)
		{
			if (!parse_bool(tok2, &StandbyModeRequested))
6538 6539
				  ereport(ERROR,
							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6540
					  errmsg("parameter \"standby_mode\" requires a Boolean value")));
6541
		}
6542 6543 6544 6545 6546 6547 6548 6549
		else
			ereport(FATAL,
					(errmsg("unrecognized recovery parameter \"%s\"",
							tok1)));
	}

	FreeFile(fd);

B
Bruce Momjian 已提交
6550 6551
	if (syntaxError)
		ereport(FATAL,
6552 6553
				(errmsg("syntax error in recovery command file: %s",
						cmdline),
B
Bruce Momjian 已提交
6554
			  errhint("Lines should have the format parameter = 'value'.")));
6555 6556

	/*
6557
	 * Check for compulsory parameters
6558
	 */
6559
	if (StandbyModeRequested)
6560
	{
6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572
		if (PrimaryConnInfo == NULL)
			ereport(FATAL,
					(errmsg("recovery command file \"%s\" primary_conninfo not specified",
							RECOVERY_COMMAND_FILE),
					 errhint("The database server in standby mode needs primary_connection to connect to primary.")));
	}
	else
	{
		/* Currently, standby mode request is a must if recovery.conf file exists */
		ereport(FATAL,
				(errmsg("recovery command file \"%s\" request for standby mode not specified",
						RECOVERY_COMMAND_FILE)));
6573
	}
6574 6575 6576 6577 6578 6579
}

/*
 * Exit archive-recovery state
 */
static void
6580
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
6581
{
B
Bruce Momjian 已提交
6582 6583
	char		recoveryPath[MAXPGPATH];
	char		xlogpath[MAXPGPATH];
6584
	char		*xlogDir = NULL;
6585
	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
6586 6587

	/*
6588
	 * We are no longer in archive recovery state.
6589 6590 6591
	 */
	InArchiveRecovery = false;

6592 6593 6594 6595
	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
6596 6597

	/*
B
Bruce Momjian 已提交
6598 6599 6600
	 * We should have the ending log segment currently open.  Verify, and then
	 * close it (to avoid problems on Windows with trying to rename or delete
	 * an open file).
6601 6602 6603 6604 6605 6606 6607 6608 6609
	 */
	Assert(readFile >= 0);
	Assert(readId == endLogId);
	Assert(readSeg == endLogSeg);

	close(readFile);
	readFile = -1;

	/*
B
Bruce Momjian 已提交
6610 6611 6612 6613 6614 6615 6616
	 * If the segment was fetched from archival storage, we want to replace
	 * the existing xlog segment (if any) with the archival version.  This is
	 * because whatever is in XLOGDIR is very possibly older than what we have
	 * from the archives, since it could have come from restoring a PGDATA
	 * backup.	In any case, the archival version certainly is more
	 * descriptive of what our current database state is, because that is what
	 * we replayed from.
6617
	 *
6618 6619
	 * Note that if we are establishing a new timeline, ThisTimeLineID is
	 * already set to the new value, and so we will create a new file instead
6620 6621
	 * of overwriting any existing file.  (This is, in fact, always the case
	 * at present.)
6622
	 */
6623 6624 6625 6626 6627
	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYXLOG", xlogDir) > MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate path %s/RECOVERYXLOG", xlogDir)));	
	}
6628
	XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
6629 6630 6631 6632 6633 6634 6635 6636 6637 6638

	if (restoredFromArchive)
	{
		ereport(DEBUG3,
				(errmsg_internal("moving last restored xlog to \"%s\"",
								 xlogpath)));
		unlink(xlogpath);		/* might or might not exist */
		if (rename(recoveryPath, xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
6639
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
6640 6641 6642 6643 6644 6645 6646 6647 6648 6649
							recoveryPath, xlogpath)));
		/* XXX might we need to fix permissions on the file? */
	}
	else
	{
		/*
		 * If the latest segment is not archival, but there's still a
		 * RECOVERYXLOG laying about, get rid of it.
		 */
		unlink(recoveryPath);	/* ignore any error */
B
Bruce Momjian 已提交
6650

6651
		/*
B
Bruce Momjian 已提交
6652 6653 6654
		 * If we are establishing a new timeline, we have to copy data from
		 * the last WAL segment of the old timeline to create a starting WAL
		 * segment for the new timeline.
6655 6656 6657 6658
		 *
		 * Notify the archiver that the last WAL segment of the old timeline
		 * is ready to copy to archival storage. Otherwise, it is not archived
		 * for a while.
6659 6660
		 */
		if (endTLI != ThisTimeLineID)
6661
		{
6662 6663
			XLogFileCopy(endLogId, endLogSeg,
						 endTLI, endLogId, endLogSeg);
6664 6665 6666 6667 6668 6669 6670

			if (XLogArchivingActive())
			{
				XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
				XLogArchiveNotify(xlogpath);
			}
		}
6671 6672 6673
	}

	/*
B
Bruce Momjian 已提交
6674 6675
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
6676
	 */
6677 6678
	XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
	XLogArchiveCleanup(xlogpath);
6679

6680
	/* Get rid of any remaining recovered timeline-history file, too */
6681 6682 6683 6684
	if (snprintf(recoveryPath, MAXPGPATH, "%s/RECOVERYHISTORY", xlogDir) > MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate path %s/RECOVERYHISTORY", xlogDir)));
	}
B
Bruce Momjian 已提交
6685
	unlink(recoveryPath);		/* ignore any error */
6686 6687

	/*
B
Bruce Momjian 已提交
6688 6689
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
6690
	 */
6691 6692
	unlink(RECOVERY_COMMAND_DONE);
	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
6693 6694
		ereport(FATAL,
				(errcode_for_file_access(),
6695
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
6696
						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
6697

6698
	pfree(xlogDir);
6699 6700 6701 6702 6703 6704 6705 6706
}

/*
 * For point-in-time recovery, decide whether XLOG replay should stop at or
 * after the given record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 * *includeThis is set TRUE if this record should still be applied before
 * stopping.
 *
 * The timestamp of the latest applied COMMIT/ABORT record is tracked in
 * recoveryLastXTime, for logging purposes.  When stopping, details are also
 * saved in recoveryStopXid et al for use in annotating the new timeline's
 * history file.
 */
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
	uint8		xactOp;
	TimestampTz xactTime;
	bool		isCommit;
	bool		stop;

	/* Only COMMIT and ABORT records are candidate stopping points */
	if (record->xl_rmid != RM_XACT_ID)
		return false;

	xactOp = record->xl_info & ~XLR_INFO_MASK;
	switch (xactOp)
	{
		case XLOG_XACT_COMMIT:
			xactTime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
			isCommit = true;
			break;

		case XLOG_XACT_ABORT:
			xactTime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
			isCommit = false;
			break;

		default:
			return false;
	}

	/* Without a PITR target we only keep the timestamp up to date */
	if (!recoveryTarget)
	{
		recoveryLastXTime = xactTime;
		return false;
	}

	if (recoveryTargetExact)
	{
		/*
		 * Only one transaction end record can carry this exact xid.
		 *
		 * The xid test MUST be for equality only: transactions are numbered
		 * in the order they start, not the order they complete, so a higher
		 * numbered xid will complete before you about 50% of the time...
		 */
		stop = (record->xl_xid == recoveryTargetXid);
		if (stop)
			*includeThis = recoveryTargetInclusive;
	}
	else
	{
		/*
		 * Many transactions can share one commit time, so when inclusive we
		 * stop after the last of them, and when exclusive we stop at the
		 * first.
		 */
		stop = recoveryTargetInclusive
			? (xactTime > recoveryTargetTime)
			: (xactTime >= recoveryTargetTime);
		if (stop)
			*includeThis = false;
	}

	if (!stop)
	{
		recoveryLastXTime = xactTime;
		return false;
	}

	/* Remember where and why we stopped */
	recoveryStopXid = record->xl_xid;
	recoveryStopTime = xactTime;
	recoveryStopAfter = *includeThis;

	if (isCommit)
	{
		if (recoveryStopAfter)
			ereport(LOG,
					(errmsg("recovery stopping after commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		else
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
	}
	else
	{
		if (recoveryStopAfter)
			ereport(LOG,
					(errmsg("recovery stopping after abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		else
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
	}

	/* The record counts as applied only if we stop after it */
	if (recoveryStopAfter)
		recoveryLastXTime = xactTime;

	return true;
}

6820
/*
6821 6822 6823 6824
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
6825
 */
6826 6827
static void
SetCurrentChunkStartTime(TimestampTz xtime)
6828
{
6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->currentChunkStartTime = xtime;
	SpinLockRelease(&xlogctl->info_lck);
}

/*
 * Scan the XLOG segment containing *loc from its first record forward and
 * log where the readable records end.
 *
 * If the first record of the segment cannot be read, a LOG message is
 * emitted and the function returns without scanning.
 */
static void
printEndOfXLogFile(XLogRecPtr	*loc)
{
	uint32		seg = loc->xrecoff / XLogSegSize;
	XLogRecPtr	roundedDownLoc;
	XLogRecord *record;
	XLogRecPtr	LastRec;

	/*
	 * Go back to the beginning of the log file and read forward to find
	 * the end of the transaction log.
	 */
	roundedDownLoc.xlogid = loc->xlogid;
	roundedDownLoc.xrecoff = (seg * XLogSegSize) + SizeOfXLogLongPHD;

	XLogCloseReadRecord();

	record = XLogReadRecord(&roundedDownLoc, false, LOG);
	if (record == NULL)
	{
		/* both values are uint32, so print them as unsigned */
		elog(LOG,"Couldn't read transaction log file (logid %u, seg %u)",
			 loc->xlogid, seg);
		return;
	}

	/* Walk forward, remembering the position of the last record that read OK */
	do
	{
		LastRec = ReadRecPtr;

		record = XLogReadRecord(NULL, false, DEBUG5);
	} while (record != NULL);

	/*
	 * Re-read the last good record before reporting.
	 * NOTE(review): presumably this is what leaves EndRecPtr pointing just
	 * past that record -- confirm against XLogReadRecord.
	 */
	record = XLogReadRecord(&LastRec, false, ERROR);

	elog(LOG,"found end of transaction log file %s",
		 XLogLocationToString_Long(&EndRecPtr));

	XLogCloseReadRecord();
}

static void
6880
StartupXLOG_InProduction(bool bgwriterLaunched)
6881 6882 6883 6884 6885 6886 6887 6888
{
	TransactionId oldestActiveXID;

	/* Pre-scan prepared transactions to find out the range of XIDs present */
	oldestActiveXID = PrescanPreparedTransactions();

	elog(LOG, "Oldest active transaction from prepared transactions %u", oldestActiveXID);

6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900
	/*
	 * Initialize TransactionXmin to current oldestActiveXID, generally
	 * initialized during GetSnapshotData(). This is to avoid situations where
	 * scanning pg_authid or other tables mostly in BuildFlatFiles() below via
	 * SnapshotNow may try to chase down pg_subtrans for older "sub-committed"
	 * transaction, file corresponding to which may not and is not supposed to
	 * exist. Setting this here will avoid calling SubTransGetParent() in
	 * TransactionIdDidCommit() for older XIDs. Also, set RecentGlobalXmin
	 * since Heap access method functions needs it to have good value as well.
	 */
	TransactionXmin = RecentGlobalXmin = oldestActiveXID;

6901 6902 6903 6904 6905 6906 6907 6908
	/* Start up the commit log and related stuff, too */
	StartupCLOG();
	StartupSUBTRANS(oldestActiveXID);
	StartupMultiXact();
	DistributedLog_Startup(
						oldestActiveXID,
						ShmemVariableCache->nextXid);

6909 6910 6911 6912 6913 6914 6915
	/* also initialize latestCompletedXid, to nextXid - 1 */
	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
	elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
		ShmemVariableCache->latestCompletedXid,
		ShmemVariableCache->nextXid);

6916 6917
	/* Reload shared-memory state for prepared transactions */
	RecoverPreparedTransactions();
6918

6919
	/*
6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935
	 * Perform a checkpoint to update all our recovery activity to disk.
	 *
	 * Note that we write a shutdown checkpoint rather than an on-line
	 * one. This is not particularly critical, but since we may be
	 * assigning a new TLI, using a shutdown checkpoint allows us to have
	 * the rule that TLI only changes in shutdown checkpoints, which
	 * allows some extra error checking in xlog_redo.
	 *
	 * Note that - Creation of shutdown checkpoint changes the state in pg_control.
	 * If that happens when we are standby who was recently promoted, the
	 * state in pg_control indicating promotion phases (e.g. DB_IN_STANDBY_PROMOTION,
	 * DB_INSTANDBY_NEW_TLI_SET) before the checkpoint creation will get
	 * overwritten posing a problem for further flow. Hence, CreateCheckpoint()
	 * has an ungly hack to avoid this situation and thus we avoid change of
	 * pg_control state just in this special situation. CreateCheckpoint() also
	 * has a comment referring this.
6936
	 */
6937 6938 6939 6940 6941 6942
	if (bgwriterLaunched)
		RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
						  CHECKPOINT_IMMEDIATE |
						  CHECKPOINT_WAIT);
	else
		CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6943

6944
#ifdef NOT_USED
6945
	/*
6946
	 * And finally, execute the recovery_end_command, if any.
6947
	 */
6948 6949 6950 6951 6952
	if (recoveryEndCommand)
		ExecuteRecoveryCommand(recoveryEndCommand,
							   "recovery_end_command",
							   true);
#endif
6953

6954
	/*
6955 6956 6957
	 * If this system was a standby which was promoted (or whose catalog is not
	 * yet updated after promote), we delay going into actual production till Pass4.
	 * Pass4 updates the catalog to comply with the standby promotion changes.
6958
	 */
6959
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6960 6961 6962 6963
	if (ControlFile->state == DB_IN_STANDBY_PROMOTED
		|| ControlFile->state == DB_IN_STANDBY_NEW_TLI_SET)
	{
		ControlFile->state = DB_IN_STANDBY_NEW_TLI_SET;
6964
		ControlFile->time = (pg_time_t) time(NULL);
6965 6966 6967 6968 6969 6970
		UpdateControlFile();
		ereport(LOG, (errmsg("database system is almost ready")));
	}
	else
	{
		ControlFile->state = DB_IN_PRODUCTION;
6971
		ControlFile->time = (pg_time_t) time(NULL);
6972 6973 6974
		UpdateControlFile();
		ereport(LOG, (errmsg("database system is ready")));
	}
6975
	LWLockRelease(ControlFileLock);
6976

6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988
	{
		char version[512];

		strcpy(version, PG_VERSION_STR " compiled on " __DATE__ " " __TIME__);

#ifdef USE_ASSERT_CHECKING
		strcat(version, " (with assert checking)");
#endif
		ereport(LOG,(errmsg("%s", version)));

	}

6989
	/*
6990 6991 6992 6993
	 * All done.  Allow backends to write WAL.	(Although the bool flag is
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
6994
	 */
6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}
}

/*
 * Error context callback used while replaying records during PASS 1 redo.
 *
 * Adds a context line made of the current read position (ReadRecPtr), the
 * end position (EndRecPtr), the generic record header dump from
 * xlog_outrec(), and the owning resource manager's rm_desc() description.
 *
 * arg: the XLogRecord being replayed, passed as void *.
 */
static void
StartupXLOG_RedoPass1Context(void *arg)
{
	XLogRecord		*record = (XLogRecord*) arg;

	StringInfoData buf;

	initStringInfo(&buf);
	appendStringInfo(&buf, "REDO PASS 1 @ %s; LSN %s: ",
					 XLogLocationToString(&ReadRecPtr),
					 XLogLocationToString2(&EndRecPtr));
	xlog_outrec(&buf, record);
	appendStringInfo(&buf, " - ");
	RmgrTable[record->xl_rmid].rm_desc(&buf,
									   ReadRecPtr,
									   record);

	errcontext("%s", buf.data);

	/* the string buffer is no longer needed once handed to errcontext() */
	pfree(buf.data);
}

/*
 * Error context callback used while replaying records during PASS 3 redo.
 *
 * Adds a context line made of the current read position (ReadRecPtr), the
 * end position (EndRecPtr), the generic record header dump from
 * xlog_outrec(), and the owning resource manager's rm_desc() description.
 *
 * arg: the XLogRecord being replayed, passed as void *.
 */
static void
StartupXLOG_RedoPass3Context(void *arg)
{
	XLogRecord		*record = (XLogRecord*) arg;

	StringInfoData buf;

	initStringInfo(&buf);
	appendStringInfo(&buf, "REDO PASS 3 @ %s; LSN %s: ",
					 XLogLocationToString(&ReadRecPtr),
					 XLogLocationToString2(&EndRecPtr));
	xlog_outrec(&buf, record);
	appendStringInfo(&buf, " - ");
	RmgrTable[record->xl_rmid].rm_desc(&buf,
									   ReadRecPtr,
									   record);

	errcontext("%s", buf.data);

	/* the string buffer is no longer needed once handed to errcontext() */
	pfree(buf.data);
}


/*
 * Apply one WAL record by dispatching to its resource manager's rm_redo.
 *
 * Around the rm_redo call this function:
 *	- installs rm_redo_error_callback on the error context stack;
 *	- advances ShmemVariableCache->nextXid past the record's xid if needed;
 *	- publishes EndRecPtr as the shared replayEndRecPtr under info_lck;
 *	- runs checkXLogConsistency() afterwards when the record carries
 *	  XLR_CHECK_CONSISTENCY;
 *	- pops the error context callback again.
 *
 * beginLoc: location handed to rm_redo as its first argument and recorded
 *			 for the error callback.
 * lsn:		 location handed to rm_redo as its second argument and to the
 *			 consistency check.
 * record:	 the WAL record to replay.
 */
static void
ApplyStartupRedo(
	XLogRecPtr		*beginLoc,

	XLogRecPtr		*lsn,

	XLogRecord		*record)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE;
	RedoErrorCallBack redoErrorCallBack;

	/* not named "errcontext", to avoid shadowing the errcontext() routine */
	ErrorContextCallback errcallback;

	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER;

	/* Setup error traceback support for ereport() */
	redoErrorCallBack.location = *beginLoc;
	redoErrorCallBack.record = record;

	errcallback.callback = rm_redo_error_callback;
	errcallback.arg = (void *) &redoErrorCallBack;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* nextXid must be beyond record's xid */
	if (TransactionIdFollowsOrEquals(record->xl_xid,
									 ShmemVariableCache->nextXid))
	{
		ShmemVariableCache->nextXid = record->xl_xid;
		TransactionIdAdvance(ShmemVariableCache->nextXid);
	}

	/*
	 * Update shared replayEndRecPtr before replaying this record,
	 * so that XLogFlush will update minRecoveryPoint correctly.
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->replayEndRecPtr = EndRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	RmgrTable[record->xl_rmid].rm_redo(*beginLoc, *lsn, record);

	/*
	 * After redo, check whether the backup pages associated with
	 * the WAL record are consistent with the existing pages. This
	 * check is done only if consistency check is enabled for this
	 * record.
	 */
	if ((record->xl_extended_info & XLR_CHECK_CONSISTENCY) != 0)
		checkXLogConsistency(record, *lsn);

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT;

}

/*
 * Process passed checkpoint record either during normal recovery or
 * in standby mode.
 *
 * If in standby mode, master mirroring information stored by the checkpoint
 * record is processed as well.  In either mode, DTX checkpoint information
 * is replayed when the record carries it.
 *
 * rec: the checkpoint record to unpack.
 * loc: location of that record; passed on to mmxlog_read_checkpoint_data().
 */
static void
XLogProcessCheckpointRecord(XLogRecord *rec, XLogRecPtr loc)
{
	CheckpointExtendedRecord ckptExtended;

	UnpackCheckPointRecord(rec, &ckptExtended);

	/*
	 * In standby mode, empty all master mirroring related hash tables. Get
	 * filespace, tablespace and database info from checkpoint record (master
	 * mirroring part) and maintain them in hash tables.
	 *
	 * We'll perform this only during standby mode because during normal
	 * non-standby recovery Persistent Tables do that job.
	 */
	if (IsStandbyMode())
	{
		mmxlog_empty_hashtables();
		if (ckptExtended.masterMirroringCheckpointLen > 0)
			mmxlog_read_checkpoint_data(ckptExtended.masterMirroringCheckpoint, &loc);
	}

	if (ckptExtended.dtxCheckpoint)
	{
		/* Handle the DTX information. */
		UtilityModeFindOrCreateDtmRedoFile();
		redoDtxCheckPoint(ckptExtended.dtxCheckpoint);
		UtilityModeCloseDtmRedoFile();
	}
}


/*
 * This must be called ONCE during postmaster or standalone-backend startup
 *
 *	How Recovery works ?
 *---------------------------------------------------------------
 *| Clean Shutdown case    	| 	Not Clean Shutdown  case|
 *|(InRecovery = false)		|	(InRecovery = true)	|
 *---------------------------------------------------------------
 *|				|		   |		|
 *|				|record after	   |record after|
 *|				|checkpoint =	   |checkpoint =|
 *|				|NULL		   |NOT NULL	|
 *|				|(bypass Redo  	   |(dont bypass|
 *|				|  	   	   |Redo	|
 *---------------------------------------------------------------
 *|				|		   |		|
 *|	No Redo			|No Redo	   |Redo done	|
 *|	No Recovery Passes	|Recovery Pass done|Recovery 	|
 *|				|		   |Pass done	|
 *---------------------------------------------------------------
 */
void
StartupXLOG(void)
{
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
	bool		wasShutdown;
	bool		reachedStopPoint = false;
	bool		haveBackupLabel = false;
	XLogRecPtr	RecPtr,
				LastRec,
				checkPointLoc,
				EndOfLog;
	uint32		endLogId;
	uint32		endLogSeg;
	XLogRecord *record;
	uint32		freespace;
	bool		multipleRecoveryPassesNeeded = false;
	bool		backupEndRequired = false;
7194
	bool		bgwriterLaunched = false;
7195 7196 7197 7198 7199 7200 7201 7202 7203

	/*
	 * Read control file and check XLOG status looks valid.
	 *
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
	 */
	ReadControlFile();

7204
	if (ControlFile->state < DB_SHUTDOWNED ||
7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("control file contains invalid data")));

	if (ControlFile->state == DB_SHUTDOWNED)
		ereport(LOG,
				(errmsg("database system was shut down at %s",
						str_time(ControlFile->time))));
	else if (ControlFile->state == DB_SHUTDOWNING)
		ereport(LOG,
7216
				(errmsg("database system shutdown was interrupted; last known up at %s",
7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247
						str_time(ControlFile->time))));
	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
		ereport(LOG,
		   (errmsg("database system was interrupted while in recovery at %s",
				   str_time(ControlFile->time)),
			errhint("This probably means that some data is corrupted and"
					" you will have to use the last backup for recovery."),
			errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_MODE)
		ereport(LOG,
				(errmsg("database system was interrupted while in standby mode at  %s",
						str_time(ControlFile->checkPointCopy.time)),
						errhint("This probably means something unexpected happened either"
								" during replay at standby or receipt of XLog from primary."),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_PROMOTED)
		ereport(LOG,
				(errmsg("database system was interrupted after standby was promoted at %s",
						str_time(ControlFile->checkPointCopy.time)),
				 errhint("If this has occurred more than once something unexpected is happening"
				" after standby has been promoted"),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_STANDBY_NEW_TLI_SET)
		ereport(LOG,
				(errmsg("database system was interrupted post new TLI was setup on standby promotion at %s",
						str_time(ControlFile->checkPointCopy.time)),
						 errhint("If this has occurred more than once something unexpected is happening"
						" after standby has been promoted and new TLI has been set"),
				 errSendAlert(true)));
	else if (ControlFile->state == DB_IN_PRODUCTION)
		ereport(LOG,
7248
				(errmsg("database system was interrupted; last known up at %s",
7249 7250 7251 7252 7253 7254 7255 7256
						str_time(ControlFile->time))));

	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
	if (ControlFile->state != DB_SHUTDOWNED)
		pg_usleep(60000000L);
#endif

7257
	/*
7258 7259 7260
	 * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
	 * someone has performed a copy for PITR, these directories may have been
	 * excluded and need to be re-created.
7261
	 */
7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284
	ValidateXLOGDirectoryStructure();

	/*
	 * Clear out any old relcache cache files.	This is *necessary* if we do
	 * any WAL replay, since that would probably result in the cache files
	 * being out of sync with database reality.  In theory we could leave them
	 * in place if the database had been cleanly shut down, but it seems
	 * safest to just remove them always and let them be rebuilt during the
	 * first backend startup.
	 */
	RelationCacheInitFileRemove();

	/*
	 * Initialize on the assumption we want to recover to the same timeline
	 * that's active according to pg_control.
	 */
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

	/*
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
	 */
	XLogReadRecoveryCommandFile(LOG);
7285

7286
	if (StandbyModeRequested)
T
Tom Lane 已提交
7287
	{
7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331
		Assert(ControlFile->state != DB_IN_CRASH_RECOVERY
				&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET);

		/*
		 * If the standby was promoted (last time) and recovery.conf
		 * is still found this time with standby mode request,
		 * it means the standby crashed post promotion but before recovery.conf
		 * cleanup. Hence, it is not considered a standby request this time.
		 */
		if (ControlFile->state == DB_IN_STANDBY_PROMOTED)
			StandbyModeRequested = false;
	}

	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(recoveryTargetTLI);

	/*
	 * If pg_control's timeline is not in expectedTLIs, then we cannot
	 * proceed: the backup is not part of the history of the requested
	 * timeline.
	 */
	if (!list_member_int(expectedTLIs,
						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
		ereport(FATAL,
				(errmsg("requested timeline %u is not a child of database system timeline %u",
						recoveryTargetTLI,
						ControlFile->checkPointCopy.ThisTimeLineID)));
	/*
	 * Save the selected recovery target timeline ID in shared memory so that
	 * other processes can see them
	 */
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;

	if (StandbyModeRequested)
		ereport(LOG,
				(errmsg("entering standby mode")));

	/*
	 * Take ownership of the wakeup latch if we're going to sleep during
	 * recovery.
	 */
	if (StandbyModeRequested)
		OwnLatch(&XLogCtl->recoveryWakeupLatch);

7332 7333 7334 7335 7336 7337 7338
	/*
	 * Allocate pages dedicated to WAL consistency checks, those had better
	 * be aligned.
	 */
	replay_image_masked = (char *) palloc(BLCKSZ);
	master_image_masked = (char *) palloc(BLCKSZ);

7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354
	if (read_backup_label(&checkPointLoc, &backupEndRequired))
	{
		/*
		 * Currently, it is assumed that a backup file exists iff a base backup
		 * has been performed and then the recovery.conf file is generated, thus
		 * standby mode has to be requested
		 */
		if (!StandbyModeRequested)
			ereport(FATAL,
					(errmsg("Found backup.label file without any standby mode request")));

		/* Activate recovery in standby mode */
		StandbyMode = true;

		Assert(backupEndRequired);

7355
		/*
B
Bruce Momjian 已提交
7356 7357
		 * When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control.
7358
		 */
7359
		record = ReadCheckpointRecord(checkPointLoc, 0);
7360 7361
		if (record != NULL)
		{
7362 7363
			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
			wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
7364
			ereport(DEBUG1,
7365
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
7366
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
7367
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
7368 7369

			/*
7370 7371 7372 7373
			 * Make sure that REDO location exists. This may not be
			 * the case if there was a crash during an online backup,
			 * which left a backup_label around that references a WAL
			 * segment that's already been archived.
7374 7375 7376 7377 7378 7379 7380 7381
			 */
			if (XLByteLT(checkPoint.redo, checkPointLoc))
			{
				if (!XLogReadRecord(&(checkPoint.redo), false, LOG))
					ereport(FATAL,
							(errmsg("could not find redo location referenced by checkpoint record"),
							 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
			}
7382 7383 7384
		}
		else
		{
7385
			ereport(FATAL,
B
Bruce Momjian 已提交
7386 7387
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
7388
			wasShutdown = false; /* keep compiler quiet */
7389
		}
7390 7391
		/* set flag to delete it later */
		haveBackupLabel = true;
T
Tom Lane 已提交
7392 7393 7394
	}
	else
	{
7395 7396 7397 7398 7399 7400
		if (StandbyModeRequested)
		{
			/* Activate recovery in standby mode */
			StandbyMode = true;
		}

7401
		/*
B
Bruce Momjian 已提交
7402 7403
		 * Get the last valid checkpoint record.  If the latest one according
		 * to pg_control is broken, try the next-to-last one.
7404 7405
		 */
		checkPointLoc = ControlFile->checkPoint;
7406 7407
		RedoStartLSN = ControlFile->checkPointCopy.redo;

7408
		record = ReadCheckpointRecord(checkPointLoc, 1);
T
Tom Lane 已提交
7409 7410
		if (record != NULL)
		{
7411
			ereport(DEBUG1,
7412
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
7413
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
T
Tom Lane 已提交
7414
		}
7415 7416 7417 7418 7419 7420 7421 7422 7423
		else if (StandbyMode)
		{
			/*
			 * The last valid checkpoint record required for a streaming
			 * recovery exists in neither standby nor the primary.
			 */
			ereport(PANIC,
					(errmsg("could not locate a valid checkpoint record")));
		}
T
Tom Lane 已提交
7424
		else
7425
		{
7426 7427
			printEndOfXLogFile(&checkPointLoc);

7428
			checkPointLoc = ControlFile->prevCheckPoint;
7429
			record = ReadCheckpointRecord(checkPointLoc, 2);
7430 7431 7432
			if (record != NULL)
			{
				ereport(LOG,
B
Bruce Momjian 已提交
7433 7434 7435
						(errmsg("using previous checkpoint record at %X/%X",
							  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
				InRecovery = true;		/* force recovery even if SHUTDOWNED */
7436 7437
			}
			else
7438 7439
			{
				printEndOfXLogFile(&checkPointLoc);
7440
				ereport(PANIC,
B
Bruce Momjian 已提交
7441
					 (errmsg("could not locate a valid checkpoint record")));
7442
			}
7443
		}
7444 7445
		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
T
Tom Lane 已提交
7446
	}
7447

T
Tom Lane 已提交
7448
	LastRec = RecPtr = checkPointLoc;
7449 7450 7451 7452 7453 7454 7455 7456 7457
	XLogCtl->pass1LastCheckpointLoc = checkPointLoc;

	/*
	 * Currently, standby mode (WAL based replication support) is not provided
	 * to segments.
	 * Hence it's okay to do the following only once on the segments as there
	 * will be only one checkpoint to be analyzed.
	 */
	if (GpIdentity.segindex != MASTER_CONTENT_ID)
7458 7459 7460 7461 7462 7463
	{
		CheckpointExtendedRecord ckptExtended;
		UnpackCheckPointRecord(record, &ckptExtended);
		if (ckptExtended.ptas)
			SetupCheckpointPreparedTransactionList(ckptExtended.ptas);
	}
7464 7465 7466 7467 7468 7469

	/*
	 * Find Xacts that are distributed committed from the checkpoint record and
	 * store them such that they can utilized later during DTM recovery.
	 */
	XLogProcessCheckpointRecord(record, checkPointLoc);
7470

7471
	ereport(DEBUG1,
B
Bruce Momjian 已提交
7472 7473 7474
			(errmsg("redo record is at %X/%X; shutdown %s",
					checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
					wasShutdown ? "TRUE" : "FALSE")));
7475
	ereport(DEBUG1,
7476
			(errmsg("next transaction ID: %u/%u; next OID: %u; next relfilenode: %u",
7477
					checkPoint.nextXidEpoch, checkPoint.nextXid,
7478
					checkPoint.nextOid, checkPoint.nextRelfilenode)));
7479
	ereport(DEBUG1,
7480 7481
			(errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
					checkPoint.nextMulti, checkPoint.nextMultiOffset)));
7482

7483
	if (!TransactionIdIsNormal(checkPoint.nextXid))
7484
		ereport(PANIC,
7485
				(errmsg("invalid next transaction ID")));
7486

7487
	/* initialize shared memory variables from the checkpoint record */
7488 7489
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
7490
	ShmemVariableCache->oidCount = 0;
7491 7492
	ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
	ShmemVariableCache->relfilenodeCount = 0;
7493
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
7494 7495
	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
	XLogCtl->ckptXid = checkPoint.nextXid;
7496

7497
	/*
B
Bruce Momjian 已提交
7498 7499 7500
	 * We must replay WAL entries using the same TimeLineID they were created
	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
	 * also xlog_redo()).
7501
	 */
7502
	ThisTimeLineID = checkPoint.ThisTimeLineID;
7503

7504
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
7505

7506
	if (XLByteLT(RecPtr, checkPoint.redo))
7507 7508
		ereport(PANIC,
				(errmsg("invalid redo in checkpoint record")));
7509

7510
	/*
B
Bruce Momjian 已提交
7511
	 * Check whether we need to force recovery from WAL.  If it appears to
B
Bruce Momjian 已提交
7512 7513
	 * have been a clean shutdown and we did not have a recovery.conf file,
	 * then assume no recovery needed.
7514
	 */
7515
	if (XLByteLT(checkPoint.redo, RecPtr))
7516
	{
T
Tom Lane 已提交
7517
		if (wasShutdown)
7518
			ereport(PANIC,
B
Bruce Momjian 已提交
7519
					(errmsg("invalid redo record in shutdown checkpoint")));
V
WAL  
Vadim B. Mikheev 已提交
7520
		InRecovery = true;
7521
	}
7522
	else if (StandbyModeRequested)
7523 7524
	{
		/* force recovery due to presence of recovery.conf */
7525 7526
		ereport(LOG,
				(errmsg("setting recovery standby mode active")));
7527 7528
		InRecovery = true;
	}
7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542
	else if (ControlFile->state != DB_SHUTDOWNED)
		InRecovery = true;

	if (InRecovery && !IsUnderPostmaster)
	{
		ereport(FATAL,
				(errmsg("Database must be shutdown cleanly when using single backend start")));
	}

	if (InRecovery && gp_before_persistence_work)
	{
		ereport(FATAL,
				(errmsg("Database must be shutdown cleanly when using gp_before_persistence_work = on")));
	}
7543

7544
	/* Recovery from xlog */
7545
	if (InRecovery)
7546
	{
B
Bruce Momjian 已提交
7547
		int			rmid;
7548

7549 7550 7551
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

7552
		/*
B
Bruce Momjian 已提交
7553 7554
		 * Update pg_control to show that we are recovering and to show the
		 * selected checkpoint as the place we are starting from. We also mark
7555
		 * pg_control with any minimum recovery stop point
7556
		 */
7557
		if (StandbyMode)
7558
		{
7559
			ereport(LOG,
7560 7561
					(errmsg("recovery in standby mode in progress")));
			ControlFile->state = DB_IN_STANDBY_MODE;
7562
		}
7563
		else
7564
		{
7565
			ereport(LOG,
7566 7567
					(errmsg("database system was not properly shut down; "
							"automatic recovery in progress")));
7568 7569 7570 7571

			if (ControlFile->state != DB_IN_STANDBY_PROMOTED
				&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
				ControlFile->state = DB_IN_CRASH_RECOVERY;
7572
		}
7573

7574 7575 7576
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = checkPointLoc;
		ControlFile->checkPointCopy = checkPoint;
7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592

		if (StandbyMode)
		{
			/* initialize minRecoveryPoint if not set yet */
			if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
				ControlFile->minRecoveryPoint = checkPoint.redo;
		}

		/* Set backupStartPoint if we're starting recovery from a base backup. */
		if (haveBackupLabel)
		{
			Assert(ControlFile->state == DB_IN_STANDBY_MODE);
			ControlFile->backupStartPoint = checkPoint.redo;
			ControlFile->backupEndRequired = backupEndRequired;
		}

7593
		ControlFile->time = (pg_time_t) time(NULL);
7594
		/* No need to hold ControlFileLock yet, we aren't up far enough */
7595 7596
		UpdateControlFile();

7597 7598
		/* initialize our local copy of minRecoveryPoint */
		minRecoveryPoint = ControlFile->minRecoveryPoint;
7599

7600 7601 7602
		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
7603 7604
		pgstat_reset_all();

7605
		/*
B
Bruce Momjian 已提交
7606 7607 7608 7609 7610 7611
		 * If there was a backup label file, it's done its job and the info
		 * has now been propagated into pg_control.  We must get rid of the
		 * label file so that if we crash during recovery, we'll pick up at
		 * the latest recovery restartpoint instead of going all the way back
		 * to the backup start point.  It seems prudent though to just rename
		 * the file out of the way rather than delete it completely.
7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622
		 */
		if (haveBackupLabel)
		{
			unlink(BACKUP_LABEL_OLD);
			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not rename file \"%s\" to \"%s\": %m",
								BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		}

7623 7624
		UtilityModeFindOrCreateDtmRedoFile();

7625
		/* Initialize resource managers */
7626 7627 7628 7629 7630 7631
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

7632
		/*
7633 7634 7635
		 * Initialize shared variables for tracking progress of WAL replay,
		 * as if we had just replayed the record before the REDO location
		 * (or the checkpoint record itself, if it's a shutdown checkpoint).
7636 7637
		 */
		SpinLockAcquire(&xlogctl->info_lck);
7638 7639 7640 7641 7642
		if (XLByteLT(checkPoint.redo, RecPtr))
			xlogctl->replayEndRecPtr = checkPoint.redo;
		else
			xlogctl->replayEndRecPtr = EndRecPtr;
		xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
7643 7644 7645 7646 7647 7648 7649 7650
		xlogctl->currentChunkStartTime = 0;
		SpinLockRelease(&xlogctl->info_lck);

		/* Also ensure XLogReceiptTime has a sane value */
		XLogReceiptTime = GetCurrentTimestamp();

		/*
		 * Find the first record that logically follows the checkpoint --- it
B
Bruce Momjian 已提交
7651
		 * might physically precede it, though.
7652
		 */
7653
		if (XLByteLT(checkPoint.redo, RecPtr))
7654 7655
		{
			/* back up to find the record */
7656
			record = XLogReadRecord(&(checkPoint.redo), false, PANIC);
7657
		}
B
Bruce Momjian 已提交
7658
		else
7659
		{
7660
			/* just have to read next record after CheckPoint */
7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676
			record = XLogReadRecord(NULL, false, LOG);
		}

		/*
		 * In case where its not a clean shutdown but it doesn't have a record
		 * following the checkpoint record, just proceed with the Pass 2, 3, 4
		 * to clear any inconsistent entries in Persistent Tables without
		 * doing the whole redo loop below.
		 */
		if (record == NULL)	
		{
			/*
			 * There are no WAL records following the checkpoint
			 */
			ereport(LOG,
					(errmsg("no record for redo after checkpoint, skip redo and proceed for recovery pass")));
7677
		}
7678

7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691
		XLogCtl->pass1StartLoc = ReadRecPtr;

		/*
		 * MPP-11179
		 * Recovery Passes will be done in both the cases:
		 * 1. When record after checkpoint = NULL (No redo)
		 * 2. When record after checkpoint != NULL (redo also)
		 */
		multipleRecoveryPassesNeeded = true;

		/*
		 * main redo apply loop, executed if we have record after checkpoint
		 */
T
Tom Lane 已提交
7692
		if (record != NULL)
7693
		{
7694 7695
			bool		recoveryContinue = true;
			bool		recoveryApply = true;
7696 7697
			bool		lastReadRecWasCheckpoint=false;
			CurrentResourceOwner = ResourceOwnerCreate(NULL, "xlog");
7698
			bool		reachedMinRecoveryPoint = false;
7699

7700
			InRedo = true;
7701

7702 7703
			if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
				ereport(LOG,
7704 7705
						(errmsg("redo starts at %X/%X",
								ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
7706 7707 7708 7709 7710
			else
				ereport(LOG,
						(errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
								ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
7711

7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730
			/*
			 * Let postmaster know we've started redo now, so that it can
			 * launch bgwriter to perform restartpoints.  We don't bother
			 * during crash recovery as restartpoints can only be performed
			 * during archive recovery.  And we'd like to keep crash recovery
			 * simple, to avoid introducing bugs that could you from
			 * recovering after crash.
			 *
			 * After this point, we can no longer assume that we're the only
			 * process in addition to postmaster!  Also, fsync requests are
			 * subsequently to be handled by the bgwriter, not locally.
			 */
			if (InArchiveRecovery && IsUnderPostmaster)
			{
				SetForwardFsyncRequests();
				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
				bgwriterLaunched = true;
			}

7731 7732
			do
			{
7733 7734 7735
				ErrorContextCallback errcontext;

				HandleStartupProcInterrupts();
V
WAL  
Vadim B. Mikheev 已提交
7736

7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769
				/*
				 * Check if we were requested to re-read config file.
				 */
				if (got_SIGHUP)
				{
					got_SIGHUP = false;
					ProcessConfigFile(PGC_SIGHUP);
				}

				/*
				 * Check if we were requested to exit without finishing
				 * recovery.
				 */
				if (shutdown_requested)
					proc_exit(1);

				/*
				 * Have we passed our safe starting point? If so, we can tell
				 * postmaster that the database is consistent now.
				 */
				if (!reachedMinRecoveryPoint &&
					XLByteLT(minRecoveryPoint, EndRecPtr))
				{
					reachedMinRecoveryPoint = true;
					if (InArchiveRecovery)
					{
						ereport(LOG,
							  (errmsg("consistent recovery state reached")));
						if (IsUnderPostmaster)
							SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
					}
				}

7770 7771 7772 7773 7774
				/*
				 * Have we reached our recovery target?
				 */
				if (recoveryStopsHere(record, &recoveryApply))
				{
B
Bruce Momjian 已提交
7775
					reachedStopPoint = true;	/* see below */
7776 7777 7778 7779 7780
					recoveryContinue = false;
					if (!recoveryApply)
						break;
				}

7781
				errcontext.callback = StartupXLOG_RedoPass1Context;
7782 7783 7784 7785
				errcontext.arg = (void *) record;
				errcontext.previous = error_context_stack;
				error_context_stack = &errcontext;

7786 7787 7788 7789 7790 7791 7792
				/*
				 * Replay every XLog record read in continuous recovery (standby) mode
				 * But while in normal crash recovery mode apply only Persistent
				 * Tables' related XLog records
				 */
				if (IsStandbyMode() ||
					PersistentRecovery_ShouldHandlePass1XLogRec(&ReadRecPtr, &EndRecPtr, record))
7793
				{
7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810
					/*
					 * See if this record is a checkpoint, if yes then uncover it to
					 * find distributed committed Xacts.
					 * No need to unpack checkpoint in crash recovery mode
					 */
					uint8 xlogRecInfo = record->xl_info & ~XLR_INFO_MASK;

					if (IsStandbyMode() &&
						record->xl_rmid == RM_XLOG_ID &&
						(xlogRecInfo == XLOG_CHECKPOINT_SHUTDOWN
						|| xlogRecInfo == XLOG_CHECKPOINT_ONLINE))
					{
						XLogProcessCheckpointRecord(record, ReadRecPtr);
						XLogCtl->pass1LastCheckpointLoc = ReadRecPtr;
						memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
						lastReadRecWasCheckpoint = true;
					}
7811

7812 7813 7814 7815 7816 7817 7818 7819
					/*
					 * Update shared replayEndRecPtr before replaying this record,
					 * so that XLogFlush will update minRecoveryPoint correctly.
					 */
					SpinLockAcquire(&xlogctl->info_lck);
					xlogctl->replayEndRecPtr = EndRecPtr;
					SpinLockRelease(&xlogctl->info_lck);

7820
					ApplyStartupRedo(&ReadRecPtr, &EndRecPtr, record);
7821

7822 7823 7824 7825 7826 7827 7828
					/*
					 * Update lastReplayedEndRecPtr after this record has been
					 * successfully replayed.
					 */
					SpinLockAcquire(&xlogctl->info_lck);
					xlogctl->lastReplayedEndRecPtr = EndRecPtr;
					SpinLockRelease(&xlogctl->info_lck);
7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853

					/*
					 * GPDB_84_MERGE_FIXME: Create restartpoints aggressively.
					 *
					 * In PostgreSQL, the bgwriter creates restartpoints during archive
					 * recovery at its own leisure. In GPDB, with WAL replication based
					 * mirroring, that was tripping the gp_replica_check checks, because
					 * it bypasses the shared buffer cache and reads directly from disk.
					 * For now, restore the old behavior, before the upstream change
					 * to start bgwriter during archive recovery, and create a
					 * restartpoint immediately after replaying a checkpoint record.
					 */
					{
						uint8 xlogRecInfo = record->xl_info & ~XLR_INFO_MASK;

						if (record->xl_rmid == RM_XLOG_ID &&
							(xlogRecInfo == XLOG_CHECKPOINT_SHUTDOWN ||
							 xlogRecInfo == XLOG_CHECKPOINT_ONLINE))
						{
							if (bgwriterLaunched)
								RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
							else
								CreateRestartPoint(CHECKPOINT_IMMEDIATE);
						}
					}
7854
				}
7855

7856 7857 7858
				/* Pop the error context stack */
				error_context_stack = errcontext.previous;

7859 7860
				LastRec = ReadRecPtr;

7861
				record = XLogReadRecord(NULL, false, LOG);
B
Bruce Momjian 已提交
7862

7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881
				/*
				 *  If the last (actually it is last-to-last in case there is any
				 *  record after the latest checkpoint record during the reading
				 *  the Xlog records) record is a checkpoint, then startlocation
				 *  for Pass 1 should be decided
				 *
				 *  This step looks redundant in case of normal recovery (no
				 *  standby mode) but its not that costly.
				 */
				if (lastReadRecWasCheckpoint)
				{
					if (XLByteLT(checkPoint.redo, RecPtr))
						XLogCtl->pass1StartLoc = checkPoint.redo;
					else
						XLogCtl->pass1StartLoc = ReadRecPtr;
					lastReadRecWasCheckpoint = false;
				}

			} while (record != NULL && recoveryContinue);
7882

7883
			CurrentResourceOwner = NULL;
7884

7885 7886 7887
			ereport(LOG,
					(errmsg("redo done at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
7888

7889 7890
			if (recoveryLastXTime)
				ereport(LOG,
B
Bruce Momjian 已提交
7891 7892
					 (errmsg("last completed transaction was at log time %s",
							 timestamptz_to_str(recoveryLastXTime))));
V
WAL  
Vadim B. Mikheev 已提交
7893
			InRedo = false;
7894
		}
7895 7896 7897
		/*
		 * end of main redo apply loop
		 */
V
WAL  
Vadim B. Mikheev 已提交
7898 7899
	}

T
Tom Lane 已提交
7900
	/*
7901 7902 7903
	 * Kill WAL receiver, if it's still running, before we continue to write
	 * the startup checkpoint record. It will trump over the checkpoint and
	 * subsequent records if it's still alive when we start writing WAL.
T
Tom Lane 已提交
7904
	 */
7905
	ShutdownWalRcv();
7906

7907
	/*
7908 7909 7910 7911 7912 7913 7914 7915
	 * We don't need the latch anymore. It's not strictly necessary to disown
	 * it, but let's do it for the sake of tidiness.
	 */
	if (StandbyModeRequested)
		DisownLatch(&XLogCtl->recoveryWakeupLatch);

	/*
	 * We are now done reading the xlog from stream.
7916
	 */
7917
	if (StandbyMode)
7918
	{
7919 7920 7921 7922 7923
		Assert(ControlFile->state == DB_IN_STANDBY_MODE);
		StandbyMode = false;

		/* Transition to promoted mode */
		ControlFile->state = DB_IN_STANDBY_PROMOTED;
7924
		ControlFile->time = (pg_time_t) time(NULL);
7925
		UpdateControlFile();
7926 7927
	}

7928
	/*
7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942
	 * Re-fetch the last valid or last applied record, so we can identify the
	 * exact endpoint of what we consider the valid portion of WAL.
	 */
	record = XLogReadRecord(&LastRec, false, PANIC);
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);

	elog(LOG,"end of transaction log location is %s",
		 XLogLocationToString(&EndOfLog));

	XLogCtl->pass1LastLoc = ReadRecPtr;

	/*
	 * Complain if we did not roll forward far enough to render the backup
7943 7944 7945 7946
	 * dump consistent.  Note: it is indeed okay to look at the local variable
	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
	 * advanced beyond the WAL we processed.
7947
	 */
7948 7949 7950
	if (InRecovery &&
		(XLByteLT(EndOfLog, ControlFile->minRecoveryPoint) ||
		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7951
	{
7952
		if (reachedStopPoint)	/* stopped because of stop request */
7953
			ereport(FATAL,
7954
					(errmsg("requested recovery stop point is before consistent recovery point")));
B
Bruce Momjian 已提交
7955
		else	/* ran off end of WAL */
7956
			ereport(FATAL,
7957
					(errmsg("WAL ends before consistent recovery point")));
7958 7959
	}

7960 7961 7962
	/*
	 * Consider whether we need to assign a new timeline ID.
	 *
B
Bruce Momjian 已提交
7963 7964
	 * If we are doing an archive recovery, we always assign a new ID.	This
	 * handles a couple of issues.	If we stopped short of the end of WAL
7965 7966
	 * during recovery, then we are clearly generating a new timeline and must
	 * assign it a unique new ID.  Even if we ran to the end, modifying the
B
Bruce Momjian 已提交
7967 7968
	 * current last segment is problematic because it may result in trying to
	 * overwrite an already-archived copy of that segment, and we encourage
7969 7970 7971 7972
	 * DBAs to make their archive_commands reject that.  We can dodge the
	 * problem by making the new active segment have a new timeline ID.
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
7973 7974
	 *
	 * GPDB: Greenplum doesn't support archive recovery.
7975
	 */
7976
	if (false /*InArchiveRecovery */)
7977
	{
7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000
		/*
		 * Ran off end of WAL before reaching end-of-backup WAL record, or
		 * minRecoveryPoint. That's usually a bad sign, indicating that you
		 * tried to recover from an online backup but never called
		 * pg_stop_backup(), or you didn't archive all the WAL up to that
		 * point. However, this also happens in crash recovery, if the system
		 * crashes while an online backup is in progress. We must not treat
		 * that as an error, or the database will refuse to start up.
		 */
		if (StandbyModeRequested || ControlFile->backupEndRequired)
		{
			if (ControlFile->backupEndRequired)
				ereport(FATAL,
						(errmsg("WAL ends before end of online backup"),
						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
				ereport(FATAL,
						(errmsg("WAL ends before end of online backup"),
						 errhint("Online backup should be complete, and all WAL up to that point must be available at recovery.")));
			else
				ereport(FATAL,
					  (errmsg("WAL ends before consistent recovery point")));
		}
8001 8002 8003 8004 8005
	}

	/* Save the selected TimeLineID in shared memory, too */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;

8006 8007 8008 8009 8010 8011 8012
	/*
	 * Prepare to write WAL starting at EndOfLog position, and init xlog
	 * buffer cache using the block containing the last record from the
	 * previous incarnation.
	 */
	openLogId = endLogId;
	openLogSeg = endLogSeg;
8013 8014 8015 8016
	XLogFileOpen(
			&mirroredLogFileOpen,
			openLogId,
			openLogSeg);
T
Tom Lane 已提交
8017
	openLogOff = 0;
V
WAL  
Vadim B. Mikheev 已提交
8018
	Insert = &XLogCtl->Insert;
8019
	Insert->PrevRecord = LastRec;
8020 8021
	XLogCtl->xlblocks[0].xlogid = openLogId;
	XLogCtl->xlblocks[0].xrecoff =
8022
		((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
B
Bruce Momjian 已提交
8023 8024

	/*
B
Bruce Momjian 已提交
8025 8026 8027
	 * Tricky point here: readBuf contains the *last* block that the LastRec
	 * record spans, not the one it starts in.	The last block is indeed the
	 * one we want to use.
T
Tom Lane 已提交
8028
	 */
8029 8030
	Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
	memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
8031
	Insert->currpos = (char *) Insert->currpage +
8032
		(EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
8033

T
Tom Lane 已提交
8034
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
8035

T
Tom Lane 已提交
8036 8037 8038
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
8039

T
Tom Lane 已提交
8040 8041
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
8042

8043 8044 8045 8046 8047 8048 8049 8050 8051 8052
	freespace = INSERT_FREESPACE(Insert);
	if (freespace > 0)
	{
		/* Make sure rest of page is zero */
		MemSet(Insert->currpos, 0, freespace);
		XLogCtl->Write.curridx = 0;
	}
	else
	{
		/*
B
Bruce Momjian 已提交
8053 8054
		 * Whenever Write.LogwrtResult points to exactly the end of a page,
		 * Write.curridx must point to the *next* page (see XLogWrite()).
8055
		 *
B
Bruce Momjian 已提交
8056
		 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
B
Bruce Momjian 已提交
8057
		 * this is sufficient.	The first actual attempt to insert a log
8058
		 * record will advance the insert state.
8059 8060 8061 8062
		 */
		XLogCtl->Write.curridx = NextBufIdx(0);
	}

V
WAL  
Vadim B. Mikheev 已提交
8063
	if (InRecovery)
8064
	{
8065 8066 8067
		/*
		 * Close down Recovery for Startup PASS 1.
		 */
B
Bruce Momjian 已提交
8068
		int			rmid;
8069

8070 8071 8072 8073 8074 8075 8076
		/*
		 * Resource managers might need to write WAL records, eg, to record
		 * index cleanup actions.  So temporarily enable XLogInsertAllowed in
		 * this process only.
		 */
		LocalSetXLogInsertAllowed();

8077 8078 8079 8080 8081 8082 8083 8084 8085
		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

8086 8087 8088
		/* Disallow XLogInsert again */
		LocalXLogInsertAllowed = -1;

8089 8090 8091 8092 8093 8094
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

8095 8096 8097 8098 8099
		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
		pgstat_reset_all();

T
Tom Lane 已提交
8100
		/*
8101 8102
		 * We are not finished with multiple passes, so we do not do a
		 * shutdown checkpoint here as we did in the past.
8103
		 *
8104
		 * We only flush out the Resource Managers.
T
Tom Lane 已提交
8105
		 */
8106
		Checkpoint_RecoveryPass(XLogCtl->pass1LastLoc);
8107

8108
		UtilityModeCloseDtmRedoFile();
8109
	}
8110

T
Tom Lane 已提交
8111 8112 8113
	/*
	 * Preallocate additional log files, if wanted.
	 */
8114
	PreallocXlogFiles(EndOfLog);
8115

8116
	/*
8117
	 * Okay, we're finished with Pass 1.
8118
	 */
V
WAL  
Vadim B. Mikheev 已提交
8119
	InRecovery = false;
8120

8121
	/* start the archive_timeout timer running */
8122
	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
8123

8124 8125 8126 8127
	/* initialize shared-memory copy of latest checkpoint XID/epoch */
	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;

8128 8129 8130 8131 8132 8133 8134
	if (!gp_before_persistence_work)
	{
		/*
		 * Create a resource owner to keep track of our resources (currently only
		 * buffer pins).
		 */
		CurrentResourceOwner = ResourceOwnerCreate(NULL, "StartupXLOG");
8135

A
Asim R P 已提交
8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146
		/*
		 * Setup syscache so that PT tuples may be updated using usual
		 * heap access methods.  This is safe because:
		 *
		 *   1. Catalog cache is backend local.
		 *
		 *   2. The backend handling this pass (pass 1) of recovery is
		 *   about to exit.  Rebuilding PT is the last operation
		 *   performed by this backend.
		 *
		 *   3. At the beginning of pass 2, we are initializing
D
Daniel Gustafsson 已提交
8147
		 *   catalog cache, see StartupProcessMain()
A
Asim R P 已提交
8148 8149 8150 8151
		 */
		if (IsUnderPostmaster)
			InitCatalogCache();

8152 8153 8154 8155 8156
		/*
		 * During startup after we have performed recovery is the only place we
		 * scan in the persistent meta-data into memory on already initdb database.
		 */
		PersistentFileSysObj_StartupInitScan();
8157
	}
T
Tom Lane 已提交
8158

8159
	if (!IsUnderPostmaster)
T
Tom Lane 已提交
8160
	{
8161
		Assert(!multipleRecoveryPassesNeeded);
T
Tom Lane 已提交
8162

8163
		StartupXLOG_InProduction(bgwriterLaunched);
T
Tom Lane 已提交
8164

8165 8166 8167 8168
		ereport(LOG,
				(errmsg("Finished single backend startup")));
	}
	else
T
Tom Lane 已提交
8169
	{
8170 8171 8172
		XLogCtl->multipleRecoveryPassesNeeded = multipleRecoveryPassesNeeded;

		if (!gp_startup_integrity_checks)
8173
		{
8174 8175
			ereport(LOG,
					(errmsg("Integrity checks will be skipped because gp_startup_integrity_checks = off")));
8176
		}
8177
		else
8178
		{
8179
			XLogCtl->integrityCheckNeeded = true;
8180
		}
8181 8182

		if (!XLogCtl->multipleRecoveryPassesNeeded)
8183
		{
8184
			StartupXLOG_InProduction(bgwriterLaunched);
8185 8186 8187 8188

			ereport(LOG,
					(errmsg("Finished normal startup for clean shutdown case")));

8189
		}
8190
		else
8191
		{
8192 8193
			ereport(LOG,
					(errmsg("Finished startup pass 1.  Proceeding to startup crash recovery passes 2 and 3.")));
8194
		}
T
Tom Lane 已提交
8195
	}
8196 8197

	XLogCloseReadRecord();
8198 8199
}

8200
bool XLogStartupMultipleRecoveryPassesNeeded(void)
V
WAL  
Vadim B. Mikheev 已提交
8201
{
8202 8203
	Assert(XLogCtl != NULL);
	return XLogCtl->multipleRecoveryPassesNeeded;
8204 8205
}

8206
bool XLogStartupIntegrityCheckNeeded(void)
8207
{
8208 8209 8210
	Assert(XLogCtl != NULL);
	return XLogCtl->integrityCheckNeeded;
}
8211

8212 8213 8214 8215 8216 8217 8218 8219 8220
/*
 * Build the path name of the "RedoRelationFile" that startup pass 2 writes
 * and pass 3 reads.
 *
 * path: output buffer; it must have room for at least MAXPGPATH bytes.
 *
 * Raises ERROR if the name does not fit into MAXPGPATH bytes (including the
 * terminating NUL) or if formatting fails.
 */
static void
GetRedoRelationFileName(char *path)
{
	char	   *xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	int			len;

	len = snprintf(path, MAXPGPATH, "%s/RedoRelationFile", xlogDir);

	/*
	 * snprintf() returns the length the full string would have had, not
	 * counting the terminating NUL, so the output was truncated whenever the
	 * result is >= the buffer size.  A negative result signals an error.
	 */
	if (len < 0 || len >= MAXPGPATH)
	{
		ereport(ERROR, (errmsg("cannot generate pathname %s/RedoRelationFile", xlogDir)));
	}
	pfree(xlogDir);
}

8223 8224
static int
CreateRedoRelationFile(void)
8225
{
8226
	char	path[MAXPGPATH];
8227

8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239
	int		result;

	GetRedoRelationFileName(path);

	result = open(path, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
	if (result < 0)
	{
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create redo relation file \"%s\"",
						path)));
	}
8240 8241 8242 8243

	return result;
}

8244 8245
static int
OpenRedoRelationFile(void)
8246
{
8247
	char	path[MAXPGPATH];
8248

8249
	int		result;
8250

8251
	GetRedoRelationFileName(path);
8252

8253 8254 8255 8256 8257 8258 8259 8260
	result = open(path, O_RDONLY, S_IRUSR | S_IWUSR);
	if (result < 0)
	{
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open redo relation file \"%s\"",
						path)));
	}
8261

8262
	return result;
8263 8264
}

8265 8266
static void
UnlinkRedoRelationFile(void)
8267
{
8268
	char	path[MAXPGPATH];
8269

8270
	GetRedoRelationFileName(path);
8271

8272 8273 8274 8275
	if (unlink(path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not unlink redo relation file \"%s\": %m", path)));
8276 8277
}

T
Tom Lane 已提交
8278
/*
8279
 * This must be called ONCE during postmaster or standalone-backend startup
T
Tom Lane 已提交
8280
 */
8281
void
8282
StartupXLOG_Pass2(void)
8283
{
8284
	XLogRecord *record;
V
Vadim B. Mikheev 已提交
8285

8286 8287 8288 8289 8290
	int redoRelationFile;

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
		     "Entering StartupXLOG_Pass2");
8291

8292
	/*
8293
	 * Read control file and verify XLOG status looks valid.
8294
	 */
8295
	ReadControlFile();
8296

8297
	if (ControlFile->state < DB_SHUTDOWNED ||
8298 8299 8300 8301
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("Startup Pass 2: control file contains invalid data")));
T
Tom Lane 已提交
8302

8303 8304
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
8305

8306 8307
	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(recoveryTargetTLI);
8308

8309 8310 8311 8312 8313
	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
		     "ControlFile with recoveryTargetTLI %u, transaction log to start location is %s",
			 recoveryTargetTLI,
			 XLogLocationToString(&XLogCtl->pass1StartLoc));
T
Tom Lane 已提交
8314

8315
	if (GpIdentity.segindex != MASTER_CONTENT_ID)
T
Tom Lane 已提交
8316
	{
8317
		CheckpointExtendedRecord ckptExtended;
8318 8319 8320
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				"Read the checkpoint record location saved from pass1, "
8321
				"and setup the prepared transaction hash list.");
8322
		record = XLogReadRecord(&XLogCtl->pass1LastCheckpointLoc, false, PANIC);
8323 8324 8325 8326

		UnpackCheckPointRecord(record, &ckptExtended);
		if (ckptExtended.ptas)
			SetupCheckpointPreparedTransactionList(ckptExtended.ptas);
T
Tom Lane 已提交
8327 8328
	}

8329
	record = XLogReadRecord(&XLogCtl->pass1StartLoc, false, PANIC);
B
Bruce Momjian 已提交
8330

T
Tom Lane 已提交
8331
	/*
8332
	 * Pass 2 XLOG scan
T
Tom Lane 已提交
8333
	 */
8334
	while (true)
8335
	{
8336
		PersistentRecovery_HandlePass2XLogRec(&ReadRecPtr, &EndRecPtr, record);
8337

8338 8339 8340 8341 8342 8343
		if (XLByteEQ(ReadRecPtr, XLogCtl->pass1LastLoc))
			break;

		Assert(XLByteLE(ReadRecPtr,XLogCtl->pass1LastLoc));

		record = XLogReadRecord(NULL, false, PANIC);
8344
	}
8345
	XLogCloseReadRecord();
B
Bruce Momjian 已提交
8346

8347
	PersistentRecovery_Scan();
8348

8349
	PersistentRecovery_CrashAbort();
8350

8351
	PersistentRecovery_Update();
8352

8353
	PersistentRecovery_Drop();
T
Tom Lane 已提交
8354

8355
	PersistentRecovery_UpdateAppendOnlyMirrorResyncEofs();
8356

8357 8358 8359
#ifdef USE_ASSERT_CHECKING
//	PersistentRecovery_VerifyTablesAgainstMemory();
#endif
8360

8361
	Checkpoint_RecoveryPass(XLogCtl->pass1LastLoc);
8362

T
Tom Lane 已提交
8363
	/*
8364
	 * Create a file that passes information to pass 3.
T
Tom Lane 已提交
8365
	 */
8366
	redoRelationFile = CreateRedoRelationFile();
8367

8368
	PersistentRecovery_SerializeRedoRelationFile(redoRelationFile);
8369

8370
	close(redoRelationFile);
8371

8372 8373
	ereport(LOG,
			(errmsg("Finished startup crash recovery pass 2")));
8374

8375 8376 8377 8378
	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
		     "Exiting StartupXLOG_Pass2");
}
T
Tom Lane 已提交
8379

8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390
/*
 * This must be called ONCE during postmaster or standalone-backend startup
 */
void
StartupXLOG_Pass3(void)
{
	int redoRelationFile;
	XLogRecord *record;
	int 		rmid;
	uint32		endLogId;
	uint32		endLogSeg;
8391

8392 8393 8394
	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
		     "Entering StartupXLOG_Pass3");
8395

8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408
	redoRelationFile = OpenRedoRelationFile();

	PersistentRecovery_DeserializeRedoRelationFile(redoRelationFile);

	close(redoRelationFile);

	UnlinkRedoRelationFile();

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "StartupXLOG_Pass3: Begin re-scanning XLOG");

	InRecovery = true;
8409

T
Tom Lane 已提交
8410
	/*
8411
	 * Read control file and verify XLOG status looks valid.
T
Tom Lane 已提交
8412
	 */
8413
	ReadControlFile();
8414

8415
	if (ControlFile->state < DB_SHUTDOWNED ||
8416 8417 8418 8419
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("Startup Pass 2: control file contains invalid data")));
8420

8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;

	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(recoveryTargetTLI);

	if (Debug_persistent_recovery_print)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
			 "ControlFile with recoveryTargetTLI %u, transaction log to start location is %s",
			 recoveryTargetTLI,
			 XLogLocationToString(&XLogCtl->pass1StartLoc));
		elog(PersistentRecovery_DebugPrintLevel(),
		     "StartupXLOG_RedoPass3Context: Control File checkpoint location is %s",
		     XLogLocationToString(&ControlFile->checkPoint));
8436 8437
	}

8438 8439 8440
	UtilityModeFindOrCreateDtmRedoFile();

	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
V
Vadim B. Mikheev 已提交
8441
	{
8442 8443
		if (RmgrTable[rmid].rm_startup != NULL)
			RmgrTable[rmid].rm_startup();
V
Vadim B. Mikheev 已提交
8444 8445
	}

8446 8447
	if (GpIdentity.segindex != MASTER_CONTENT_ID)
	{
8448
		CheckpointExtendedRecord ckptExtended;
8449
		record = XLogReadRecord(&XLogCtl->pass1LastCheckpointLoc, false, PANIC);
8450 8451 8452 8453

		UnpackCheckPointRecord(record, &ckptExtended);
		if (ckptExtended.ptas)
			SetupCheckpointPreparedTransactionList(ckptExtended.ptas);
8454 8455
	}

8456 8457 8458 8459 8460 8461 8462
	/*
	 * Allocate pages dedicated to WAL consistency checks, those had better
	 * be aligned.
	 */
	replay_image_masked = (char *) palloc(BLCKSZ);
	master_image_masked = (char *) palloc(BLCKSZ);

8463
	record = XLogReadRecord(&XLogCtl->pass1StartLoc, false, PANIC);
T
Tom Lane 已提交
8464

8465
	/*
8466
	 * Pass 3 XLOG scan
8467
	 */
8468 8469 8470
	while (true)
	{
		ErrorContextCallback errcontext;
8471

8472 8473 8474 8475 8476
		/* Setup error traceback support for ereport() */
		errcontext.callback = StartupXLOG_RedoPass3Context;
		errcontext.arg = (void *) record;
		errcontext.previous = error_context_stack;
		error_context_stack = &errcontext;
8477

8478 8479
		if (PersistentRecovery_ShouldHandlePass3XLogRec(&ReadRecPtr, &EndRecPtr, record))
			ApplyStartupRedo(&ReadRecPtr, &EndRecPtr, record);
V
WAL  
Vadim B. Mikheev 已提交
8480

8481 8482
		/* Pop the error context stack */
		error_context_stack = errcontext.previous;
8483

8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496
		/*
		 * For Pass 3, we read through the new log generated by Pass 2 in case
		 * there are Master Mirror XLOG records we need to take action on.
		 *
		 * It is obscure: Pass 3 REDO of Create fs-obj may need to be compensated for
		 * by Deletes generated in Pass 2...
		 */
		record = XLogReadRecord(NULL, false, LOG);
		if (record == NULL)
			break;
	}

	XLogCloseReadRecord();
8497 8498

	/*
8499 8500 8501 8502 8503 8504 8505 8506
	 * Consider whether we need to assign a new timeline ID.
	 *
	 * If we were in standby mode, we always assign a new ID.
	 * This currently helps for avoiding standby fail-back situation (If the original
	 * primary is down and original standby is acting as the new primary, in such
	 * a case original primary can't act as the new standby to avoid XLog mismatch)
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
8507
	 */
8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530
	if (ControlFile->state == DB_IN_STANDBY_PROMOTED)
	{
		/* Read the last XLog record */
		record = XLogReadRecord(&XLogCtl->pass1LastLoc, false, PANIC);
		XLByteToPrevSeg(EndRecPtr, endLogId, endLogSeg);

		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
							curFileTLI, endLogId, endLogSeg);
		ereport(LOG,
				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));

		/* Save the selected TimeLineID in shared memory, too */
		XLogCtl->ThisTimeLineID = ThisTimeLineID;

		/*
		 * We are now done reading the old WAL. Make a writable copy of the last
		 * WAL segment.
		 */
		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);

		XLogCloseReadRecord();
	}
8531 8532

	/*
8533
	 * Allow resource managers to do any required cleanup.
8534 8535 8536
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
8537 8538
		if (RmgrTable[rmid].rm_cleanup != NULL)
			RmgrTable[rmid].rm_cleanup();
8539 8540 8541
	}

	/*
8542 8543
	 * Check to see if the XLOG sequence contained any unresolved
	 * references to uninitialized pages.
8544
	 */
8545 8546 8547 8548
	XLogCheckInvalidPages();

	/* Reset pgstat data, because it may be invalid after recovery */
	pgstat_reset_all();
8549

8550
	UtilityModeCloseDtmRedoFile();
8551

8552 8553 8554
	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "StartupXLOG_Pass3: End re-scanning XLOG");
8555

8556
	InRecovery = false;
B
Bruce Momjian 已提交
8557

8558 8559
	/* postmaster ensures that bgwriter is already running in pass 3. */
	StartupXLOG_InProduction(true);
8560 8561 8562 8563 8564 8565 8566

	ereport(LOG,
			(errmsg("Finished startup crash recovery pass 3")));

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Exiting StartupXLOG_Pass3");
8567 8568
}

8569
/*
8570 8571 8572 8573
 * Startup Pass 4 can perform basic integrity checks as well as
 * PersistentTable-Catalog verification (if appropriate GUC is turned on).
 * If the GUC is NOT set --
 * 1. Only basic integrity checks will be performed.
8574
 *
8575 8576 8577 8578 8579 8580 8581 8582 8583 8584
 * If the GUC is set --
 * 1. Both Non-DB specific and DB-specific verification checks
 * are executed to see if the system is consistent.
 * 2. First run of Pass 4 performs basic integrity checks and
 * non-DB specific checks. At the same time, next DB on which DB-specific
 * verifications are to be performed is selected.
 * 3. This DB selected in #2 will be used in the next cycle of Pass 4
 * as a new spawned process for verification purposes. At the same time,
 * a new database is selected for the subsequent cycle and thus this
 * continues until there are no more DBs left to be verified in the system.
8585
 */
8586 8587
void
StartupXLOG_Pass4(void)
8588
{
8589
	bool doPTCatVerification = false;
8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618
	char *fullpath;

	/*
	 * In order to access the catalog, we need a database, and a
	 * tablespace; our access to the heap is going to be slightly
	 * limited, so we'll just use some defaults.
	 */
	if (!XLogStartup_DoNextPTCatVerificationIteration())
	{
		MyDatabaseId = TemplateDbOid;
		MyDatabaseTableSpace = DEFAULTTABLESPACE_OID;
	}
	else
	{
		MyDatabaseId = XLogCtl->currentDatabaseToVerify;
		MyDatabaseTableSpace = XLogCtl->tablespaceOfCurrentDatabaseToVerify;
	}

	/*
	 * Now we can mark our PGPROC entry with the database ID
	 * (We assume this is an atomic store so no lock is needed)
	 */
	MyProc->databaseId = MyDatabaseId;

	fullpath = GetDatabasePath(MyDatabaseId, MyDatabaseTableSpace);

	SetDatabasePath(fullpath);

	RelationCacheInitializePhase3();
8619

8620 8621 8622 8623 8624 8625 8626
	/*
	 * Start with the basic Pass 4 integrity checks. If requested (GUC & No In-Doubt
	 * prepared Xacts) then pursue non-database specific integrity checks
	 */
	if(!XLogStartup_DoNextPTCatVerificationIteration())
	{
		PersistentFileSysObj_StartupIntegrityCheck();
8627

8628 8629 8630 8631 8632
		/*
		 * Do the check for inconsistencies in global sequence number after the catalog cache is set up
		 * MPP-17207. Inconsistent global sequence number can be fixed with setting the guc
		 * gp_persistent_repair_global_sequence
		 */
8633

8634 8635
		PersistentFileSysObj_DoGlobalSequenceScan();
		Insist(isFilespaceInfoConsistent());
8636

8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659
		/*
		 * Now we can update the catalog to tell the system is fully-promoted,
		 * if was standby.  This should be done after all WAL-replay finished
		 * otherwise we'll be in inconsistent state where catalog says I'm in
		 * primary state while the recovery is trying to stream.
		 */
		if (ControlFile->state == DB_IN_STANDBY_NEW_TLI_SET)
		{
			GpRoleValue old_role = Gp_role;
	
			/* I am privileged */
			InitializeSessionUserIdStandalone();
			/* Start transaction locally */
			Gp_role = GP_ROLE_UTILITY;
			StartTransactionCommand();
			GetTransactionSnapshot();
			DirectFunctionCall1(gp_activate_standby, (Datum) 0);
			/* close the transaction we started above */
			CommitTransactionCommand();
			Gp_role = old_role;

			ereport(LOG, (errmsg("Updated catalog to support standby promotion")));

8660
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8661
			ControlFile->state = DB_IN_PRODUCTION;
8662
			ControlFile->time = (pg_time_t) time(NULL);
8663
			UpdateControlFile();
8664
			LWLockRelease(ControlFileLock);
8665 8666
			ereport(LOG, (errmsg("database system is ready")));
		}
8667

8668 8669
		ereport(LOG,
			(errmsg("Finished BASIC startup integrity checking")));
8670

8671 8672 8673 8674 8675
		/*
		 * Check if the system has any in-doubt prepared transactions
		 * If No(Yes) - Do(nt) perform extra verification checks
		 */
		if (debug_persistent_ptcat_verification)
T
Tom Lane 已提交
8676
		{
8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695
			/*
			 * As the startup passes are Auxiliary processes and not pure Backeneds
			 * they don't have a user set. Hence, a concrete user id is needed.
			 * Pass 4 may perform some PersistentTable-Catalog verification, which
			 * uses SPI and hence will need a user id set
			 * Set the user id to bootstrap user id to obtain super user rights.
			 */
			if (GpIdentity.segindex != MASTER_CONTENT_ID)
				Gp_role = GP_ROLE_UTILITY;

			SetSessionUserId(BOOTSTRAP_SUPERUSERID, true);
			StartTransactionCommand();
			doPTCatVerification = !StartupXLOG_Pass4_CheckIfAnyInDoubtPreparedTransactions();

			/* Perform non-database specific verification checks */
			if (doPTCatVerification)
				StartupXLOG_Pass4_NonDBSpecificPTCatVerification();

			CommitTransactionCommand();
T
Tom Lane 已提交
8696 8697 8698
		}
	}

8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710
	/*
	 * If a database (and thus its tablespace) has already been selected, perform
	 * database specific verifications.
	 *
	 * And then get the first or the next database (and its tablespace) for the first or
	 * next cycle of Pass4 database specific extra verification checks
	 */
	if (doPTCatVerification || XLogStartup_DoNextPTCatVerificationIteration())
	{
		/* Redundant usage to maintain code readability */
		if (GpIdentity.segindex != MASTER_CONTENT_ID)
			Gp_role = GP_ROLE_UTILITY;
8711

8712 8713
		SetSessionUserId(BOOTSTRAP_SUPERUSERID, true);
		StartTransactionCommand();
8714

8715 8716
		if(XLogStartup_DoNextPTCatVerificationIteration())
			StartupXLOG_Pass4_DBSpecificPTCatVerification();
T
Tom Lane 已提交
8717

8718
		if (!StartupXLOG_Pass4_GetDBForPTCatVerification())
T
Tom Lane 已提交
8719
		{
8720 8721
			if (!XLogCtl->pass4_PTCatVerificationPassed)
				elog(FATAL,"Startup Pass 4 PersistentTable-Catalog verification failed!!!");
T
Tom Lane 已提交
8722
		}
8723 8724 8725 8726 8727 8728
		else
		{
			if (Gp_role == GP_ROLE_DISPATCH && (GpIdentity.segindex == MASTER_CONTENT_ID))
			{
				bool exists = false;
				postDTMRecv_dbTblSpc_Hash_Entry currentDbTblSpc;
8729

8730 8731 8732 8733
				if (!Persistent_PostDTMRecv_IsHashFull())
				{
					currentDbTblSpc.database = XLogCtl->currentDatabaseToVerify;
					currentDbTblSpc.tablespace = XLogCtl->tablespaceOfCurrentDatabaseToVerify;
8734

8735 8736 8737 8738 8739 8740 8741 8742
					if (Persistent_PostDTMRecv_InsertHashEntry(currentDbTblSpc.database, &currentDbTblSpc, &exists))
					{
						if (exists)
							elog(FATAL,"Database already present in the Hash Table");
					}
				}
			}
		}
8743

8744
		CommitTransactionCommand();
8745
	}
V
WAL  
Vadim B. Mikheev 已提交
8746
}
B
Bruce Momjian 已提交
8747

8748
static bool StartupXLOG_Pass4_CheckIfAnyInDoubtPreparedTransactions(void)
V
WAL  
Vadim B. Mikheev 已提交
8749
{
8750 8751
	bool retVal = false;
	Persistent_Pre_ExecuteQuery();
B
Bruce Momjian 已提交
8752

8753
	PG_TRY();
8754
	{
8755
		int ret = Persistent_ExecuteQuery("select * from pg_prepared_xacts", true);
8756

8757 8758 8759 8760 8761 8762
		if (ret > 0)
			retVal = true;
		else if(ret == 0)
			retVal = false;
		else
			Insist(0);
8763
	}
8764
	PG_CATCH();
8765
	{
8766 8767
		Persistent_ExecuteQuery_Cleanup();
		elog(FATAL, "In-Doubt transaction Check: Failure");
8768
	}
8769
	PG_END_TRY();
V
WAL  
Vadim B. Mikheev 已提交
8770

8771 8772 8773
	Persistent_Post_ExecuteQuery();
	return retVal;
}
8774

8775 8776 8777 8778 8779 8780
/*
 * Indicates if more verification cycles are needed. XLogCtl current database
 * and tablespace act as the flags and also carry database Oid and tablespace Oid
 * for the next cycle of verification.
 */
bool XLogStartup_DoNextPTCatVerificationIteration(void)
V
WAL  
Vadim B. Mikheev 已提交
8781
{
8782 8783 8784
	if (XLogCtl->currentDatabaseToVerify != InvalidOid &&
			XLogCtl->tablespaceOfCurrentDatabaseToVerify != InvalidOid)
		return true;
8785

8786 8787 8788 8789
	Insist(XLogCtl->currentDatabaseToVerify == InvalidOid &&
				XLogCtl->tablespaceOfCurrentDatabaseToVerify == InvalidOid);
		return false;
}
8790

8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813
bool
StartupXLOG_Pass4_GetDBForPTCatVerification(void)
{
	char		*filename;
	FILE		*db_file;
	char		dbName[NAMEDATALEN];
	Oid			dbId= InvalidOid;
	Oid			tblSpaceId = InvalidOid;
	Oid			selectDbId = InvalidOid;
	Oid			selectTblSpaceId = InvalidOid;
	TransactionId dbFrozenxid;
	bool		gotDatabase = false;
	bool		chooseNextDatabase = false;

	filename = database_getflatfilename();
	db_file = AllocateFile(filename, "r");
	if (db_file == NULL)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", filename)));

	while (read_pg_database_line(db_file, dbName, &dbId,
								 &tblSpaceId, &dbFrozenxid))
8814
	{
8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835
		Assert(dbId != InvalidOid && tblSpaceId != InvalidOid);
		if(XLogCtl->currentDatabaseToVerify == InvalidOid)
		{
			Assert(XLogCtl->tablespaceOfCurrentDatabaseToVerify == InvalidOid);
			selectDbId = dbId;
			selectTblSpaceId = tblSpaceId;
			gotDatabase = true;
			break;
		}
		else if (XLogCtl->currentDatabaseToVerify == dbId)
		{
			Assert(XLogCtl->tablespaceOfCurrentDatabaseToVerify == tblSpaceId);
			chooseNextDatabase = true;
		}
		else if (chooseNextDatabase)
		{
			selectDbId = dbId;
			selectTblSpaceId = tblSpaceId;
			gotDatabase = true;
			break;
		}
8836 8837
	}

8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855
	FreeFile(db_file);
	pfree(filename);

	XLogCtl->currentDatabaseToVerify = selectDbId;
	XLogCtl->tablespaceOfCurrentDatabaseToVerify = selectTblSpaceId;
	return gotDatabase;
}

/*
 * Perform the verification that is specific to one database.
 * - Currently performed as part of Startup Pass 4
 *
 * Logs which database is being used and clears
 * XLogCtl->pass4_PTCatVerificationPassed when
 * Persistent_DBSpecificPTCatVerification() reports failure.  The flag is
 * never set back to true here.
 */
void
StartupXLOG_Pass4_DBSpecificPTCatVerification(void)
{
	/* MyDatabaseId is an Oid (unsigned), so it is printed with %u */
	elog(LOG,"DB specific PersistentTable-Catalog Verification using DB %u", MyDatabaseId);
	if (!Persistent_DBSpecificPTCatVerification())
		XLogCtl->pass4_PTCatVerificationPassed = false;
}
8857

8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868
/*
 * Perform the verification that is NOT tied to a particular database.
 * - Currently performed as part of Startup Pass 4
 *
 * Logs which database is being used and clears
 * XLogCtl->pass4_PTCatVerificationPassed when
 * Persistent_NonDBSpecificPTCatVerification() reports failure.  The flag is
 * never set back to true here.
 */
void
StartupXLOG_Pass4_NonDBSpecificPTCatVerification(void)
{
	/* MyDatabaseId is an Oid (unsigned), so it is printed with %u */
	elog(LOG,"Non-DB specific PersistentTable-Catalog Verification using DB %u", MyDatabaseId);
	if (!Persistent_NonDBSpecificPTCatVerification())
		XLogCtl->pass4_PTCatVerificationPassed = false;
}
8869 8870

/*
8871 8872 8873 8874 8875 8876 8877 8878 8879 8880
 * Determine the recovery redo start location from the pg_control file.
 *
 *    1) Only uses information from the pg_control file.
 *    2) This simplified routine does not examine the offline recovery file or
 *       the online backup labels, etc.
 *    3) This routine is a heavily reduced version of StartXLOG.
 *    4) IMPORTANT NOTE: This routine sets global variables that establish
 *       the timeline context necessary to do ReadRecord.  The ThisTimeLineID
 *       and expectedTLIs globals are set.
 *
8881
 */
8882 8883
void
XLogGetRecoveryStart(char *callerStr, char *reasonStr, XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint)
8884
{
8885 8886 8887 8888 8889
	CheckPoint	checkPoint;
	XLogRecPtr	checkPointLoc;
	XLogRecord *record;
	bool previous;
	XLogRecPtr checkPointLSN;
8890

8891 8892 8893 8894 8895 8896 8897 8898 8899 8900
	Assert(redoCheckPointLoc != NULL);
	Assert(redoCheckPoint != NULL);

	ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
			(errmsg("%s: determine restart location %s",
			 callerStr, reasonStr)));

	XLogCloseReadRecord();

	if (Debug_print_qd_mirroring)
8901
	{
8902
		XLogPrintLogNames();
8903
	}
8904 8905 8906 8907 8908 8909 8910

	/*
	 * Read control file and verify XLOG status looks valid.
	 *
	 */
	ReadControlFile();

8911
	if (ControlFile->state < DB_SHUTDOWNED ||
8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934
		ControlFile->state > DB_IN_PRODUCTION ||
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		ereport(FATAL,
				(errmsg("%s: control file contains invalid data", callerStr)));

	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
	checkPointLoc = ControlFile->checkPoint;
	ThisTimeLineID = ControlFile->checkPointCopy.ThisTimeLineID;

	/*
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
	 */
	XLogReadRecoveryCommandFile(DEBUG5);

	/* Now we can determine the list of expected TLIs */
	expectedTLIs = XLogReadTimeLineHistory(ThisTimeLineID);

	record = ReadCheckpointRecord(checkPointLoc, 1);
	if (record != NULL)
8935
	{
8936 8937 8938 8939 8940 8941
		previous = false;
		ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
				(errmsg("%s: checkpoint record is at %s (LSN %s)",
						callerStr,
						XLogLocationToString(&checkPointLoc),
						XLogLocationToString2(&EndRecPtr))));
8942
	}
8943
	else
8944
	{
8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962
		previous = true;
		checkPointLoc = ControlFile->prevCheckPoint;
		record = ReadCheckpointRecord(checkPointLoc, 2);
		if (record != NULL)
		{
			ereport((Debug_print_qd_mirroring ? LOG : DEBUG1),
					(errmsg("%s: using previous checkpoint record at %s (LSN %s)",
						    callerStr,
							XLogLocationToString(&checkPointLoc),
						    XLogLocationToString2(&EndRecPtr))));
		}
		else
		{
			FileRep_SetSegmentState(SegmentStateFault, FaultTypeDB);

			ereport(ERROR,
				 (errmsg("%s: could not locate a valid checkpoint record", callerStr)));
		}
8963
	}
8964 8965 8966 8967 8968

	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	checkPointLSN = EndRecPtr;

	if (XLByteEQ(checkPointLoc,checkPoint.redo))
8969
	{
8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980
		{
			char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

			snprintf(tmpBuf, sizeof(tmpBuf),
					 "control file has restart '%s' and redo start checkpoint at location(lsn) '%s(%s)' ",
					 (previous ? "previous " : ""),
					 XLogLocationToString3(&checkPointLoc),
					 XLogLocationToString4(&checkPointLSN));

			FileRep_InsertConfigLogEntry(tmpBuf);
		}
8981
	}
8982
 	else if (XLByteLT(checkPointLoc, checkPoint.redo))
8983
	{
8984 8985 8986 8987
		FileRep_SetSegmentState(SegmentStateFault, FaultTypeDB);

		ereport(ERROR,
				(errmsg("%s: invalid redo in checkpoint record", callerStr)));
8988 8989
	}
	else
8990 8991
	{
		XLogRecord *record;
8992

8993 8994 8995 8996 8997 8998 8999 9000 9001
		record = XLogReadRecord(&checkPoint.redo, false, LOG);
		if (record == NULL)
		{
			FileRep_SetSegmentState(SegmentStateFault, FaultTypeDB);

			ereport(ERROR,
			 (errmsg("%s: first redo record before checkpoint not found at %s",
					 callerStr, XLogLocationToString(&checkPoint.redo))));
		}
9002

9003
		{
9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014
			char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

			snprintf(tmpBuf, sizeof(tmpBuf),
					 "control file has restart '%s' checkpoint at location(lsn) '%s(%s)', redo starts at location(lsn) '%s(%s)' ",
					 (previous ? "previous " : ""),
					 XLogLocationToString3(&checkPointLoc),
					 XLogLocationToString4(&checkPointLSN),
					 XLogLocationToString(&checkPoint.redo),
					 XLogLocationToString2(&EndRecPtr));

			FileRep_InsertConfigLogEntry(tmpBuf);
9015 9016
		}
	}
9017

9018
	XLogCloseReadRecord();
9019

9020 9021 9022 9023
	*redoCheckPointLoc = checkPointLoc;
	*redoCheckPoint = checkPoint;

}
9024 9025

/*
9026 9027 9028 9029 9030 9031 9032
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
9033
 */
9034 9035
bool
RecoveryInProgress(void)
9036
{
9037 9038 9039 9040 9041 9042 9043 9044
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
9045
	{
9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
		SpinLockRelease(&xlogctl->info_lck);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup.	(If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
			InitXLOGAccess();

		return LocalRecoveryInProgress;
	}
9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124

	/*
	 * All done.  Allow backends to write WAL.  (Although the bool flag is
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
	 */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}
}

/*
 * Is this process allowed to insert new WAL records?
 *
 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
 * But we also have provisions for forcing the result "true" or "false"
 * within specific processes regardless of the global state, via the
 * LocalXLogInsertAllowed override (negative means "no override").
 */
bool
XLogInsertAllowed(void)
{
	if (LocalXLogInsertAllowed < 0)
	{
		/* No override in effect: the answer depends on recovery state */
		if (RecoveryInProgress())
			return false;

		/*
		 * Recovery is over.  Latch "unconditionally true" so later calls
		 * take the fast path below without rechecking.
		 */
		LocalXLogInsertAllowed = 1;
		return true;
	}

	/*
	 * The value is "unconditionally true" or "unconditionally false";
	 * this is the normal fast path once recovery is known done.
	 */
	return (bool) LocalXLogInsertAllowed;
}

/*
 * Force XLogInsertAllowed() to return true in the current process only.
 *
 * The local override is expected to still be unset (-1) when this is called.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	Assert(LocalXLogInsertAllowed == -1);

	LocalXLogInsertAllowed = 1;

	/* Do the same initialization RecoveryInProgress() does on state switch */
	InitXLOGAccess();
}

/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
 */
9133
static XLogRecord *
9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
{
	XLogRecord *record;
	bool sizeOk;
	uint32 delta_xl_tot_len;		/* delta of total len of entire record */
	uint32 delta_xl_len;			/* delta of total len of rmgr data */

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
				(errmsg("invalid primary checkpoint link in control file")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
				   (errmsg("invalid checkpoint link in backup_label file")));
				break;
		}
		return NULL;
	}

	/*
	 * Set fetching_ckpt to true here, so that XLogReadRecord()
	 * uses RedoStartLSN as the start replication location used
	 * by WAL receiver (when StandbyMode is on). See comments
	 * for fetching_ckpt in XLogReadPage()
	 */
	record = XLogReadRecord(&RecPtr, true /* fetching_checkpoint */, LOG);

	if (record == NULL)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
				(errmsg("invalid resource manager ID in checkpoint record at location %s",
				        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
				   (errmsg("invalid xl_info in primary checkpoint record at location %s",
				           XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
				 (errmsg("invalid xl_info in secondary checkpoint record at location %s",
				         XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}

	sizeOk = false;
	if (record->xl_len == sizeof(CheckPoint) &&
		record->xl_tot_len == SizeOfXLogRecord + sizeof(CheckPoint))
	{
		sizeOk = true;
	}
	else if (record->xl_len > sizeof(CheckPoint) &&
		record->xl_tot_len > SizeOfXLogRecord + sizeof(CheckPoint))
	{
		delta_xl_len = record->xl_len - sizeof(CheckPoint);
		delta_xl_tot_len = record->xl_tot_len - (SizeOfXLogRecord + sizeof(CheckPoint));

		if (delta_xl_len == delta_xl_tot_len)
		{
			sizeOk = true;
		}
	}

	if (!sizeOk)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
					(errmsg("invalid length of primary checkpoint at location %s",
					        XLogLocationToString_Long(&RecPtr))));
				break;
			case 2:
				ereport(LOG,
				  (errmsg("invalid length of secondary checkpoint record at location %s",
				          XLogLocationToString_Long(&RecPtr))));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record at location %s",
						        XLogLocationToString_Long(&RecPtr))));
				break;
		}
		return NULL;
	}
	return record;
}

static void
UnpackCheckPointRecord(
	XLogRecord			*record,
9283
	CheckpointExtendedRecord *ckptExtended)
9284
{
9285 9286
	char *current_record_ptr;
	int remainderLen;
9287

9288
	if (record->xl_len == sizeof(CheckPoint))
9289
	{
9290 9291 9292 9293 9294 9295 9296
		/* Special (for bootstrap, xlog switch, maybe others) */
		ckptExtended->dtxCheckpoint = NULL;
		ckptExtended->dtxCheckpointLen = 0;
		ckptExtended->masterMirroringCheckpointLen = 0;
		ckptExtended->ptas = NULL;
		return;
	}
9297

9298 9299
	/* Normal checkpoint Record */
	Assert(record->xl_len > sizeof(CheckPoint));
9300

9301 9302
	current_record_ptr = ((char*)XLogRecGetData(record)) + sizeof(CheckPoint);
	remainderLen = record->xl_len - sizeof(CheckPoint);
9303

9304 9305 9306 9307
	/* Start of distributed transaction information */
	ckptExtended->dtxCheckpoint = (TMGXACT_CHECKPOINT *)current_record_ptr;
	ckptExtended->dtxCheckpointLen =
		TMGXACT_CHECKPOINT_BYTES((ckptExtended->dtxCheckpoint)->committedCount);
9308

9309 9310 9311 9312 9313 9314 9315 9316
	/*
	 * The master mirror checkpoint (mmxlog) and prepared transaction aggregate state (ptas) will be skipped
	 * when gp_before_filespace_setup is ON.
	 */
	if (remainderLen > ckptExtended->dtxCheckpointLen)
	{
		current_record_ptr = current_record_ptr + ckptExtended->dtxCheckpointLen;
		remainderLen -= ckptExtended->dtxCheckpointLen;
9317 9318


9319 9320 9321 9322
		/* Lets fetch the master mirroring information */
		ckptExtended->masterMirroringCheckpointLen =
				mmxlog_get_checkpoint_record_fields(current_record_ptr,
				                                    &(ckptExtended->masterMirroringCheckpoint));
9323

9324
		Assert(remainderLen > ckptExtended->masterMirroringCheckpointLen);
9325

9326 9327
		current_record_ptr = current_record_ptr + ckptExtended->masterMirroringCheckpointLen;
		remainderLen -= ckptExtended->masterMirroringCheckpointLen;
9328

9329 9330 9331 9332 9333 9334
		/* Finally, point to prepared transaction information */
		ckptExtended->ptas = (prepared_transaction_agg_state *) current_record_ptr;
		Assert(remainderLen == PREPARED_TRANSACTION_CHECKPOINT_BYTES(ckptExtended->ptas->count));
	}
	else
	{
9335
		Assert(remainderLen == ckptExtended->dtxCheckpointLen);
9336 9337 9338
		ckptExtended->masterMirroringCheckpointLen = 0;
		ckptExtended->ptas = NULL;
	}
9339

9340 9341 9342
	if (Debug_persistent_recovery_print)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
9343 9344 9345
			 "UnpackCheckPointRecord: Checkpoint record data length = %u, "
			 "DTX committed count %d, DTX data length %u, "
			 "Master Mirroring length %u, filespaces %d, tablespaces %d, databases %d, "
9346 9347 9348 9349
			 "Prepared Transaction count = %d",
			 record->xl_len,
			 ckptExtended->dtxCheckpoint->committedCount, ckptExtended->dtxCheckpointLen,
			 ckptExtended->masterMirroringCheckpointLen,
9350 9351 9352 9353
			 ckptExtended->masterMirroringCheckpointLen ? ckptExtended->masterMirroringCheckpoint.fspc->count : 0,
			 ckptExtended->masterMirroringCheckpointLen ? ckptExtended->masterMirroringCheckpoint.tspc->count : 0,
			 ckptExtended->masterMirroringCheckpointLen ? ckptExtended->masterMirroringCheckpoint.dbdir->count : 0,
			 ckptExtended->ptas ? ckptExtended->ptas->count : 0);
9354
	}
9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371
}

/*
 * Initialize this process's local copies of ThisTimeLineID and RedoRecPtr.
 *
 * This must be called during startup of a backend process, except that
 * it need not be called in a standalone backend (which does StartupXLOG
 * instead).
 *
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 */
void
InitXLOGAccess(void)
{
	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;

	/*
	 * GPDB_84_MERGE_FIXME: Disabled, because FTS process was tripping it.
	 * This assertion was added by the merge, so I suspect it's been wrong
	 * all along, but we haven't noticed.
	 */
#if 0
	Assert(ThisTimeLineID != 0);
#endif

	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
GetRedoRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	return RedoRecPtr;
}

/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to acquire WALInsertLock which can be quite
 * heavily contended, and an approximation is enough for the current
 * usage of this function.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtRqst.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtResult.Flush;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

/*
 * Get the time of the last xlog segment switch
 */
9446
pg_time_t
9447 9448
GetLastSegSwitchTime(void)
{
9449
	pg_time_t	result;
9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507

	/* Need WALWriteLock, but shared lock is sufficient */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->Write.lastSegSwitchTime;
	LWLockRelease(WALWriteLock);

	return result;
}

/*
 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
 *
 * This is exported for use by code that would like to have 64-bit XIDs.
 * We don't really support such things, but all XIDs within the system
 * can be presumed "close to" the result, and thus the epoch associated
 * with them can be determined.
 *
 * Both results are returned through the output parameters.
 */
void
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *shared = XLogCtl;
	uint32		checkpointEpoch;
	TransactionId checkpointXid;
	TransactionId currentNextXid;

	/* Must read checkpoint info first, else have race condition */
	SpinLockAcquire(&shared->info_lck);
	checkpointEpoch = shared->ckptXidEpoch;
	checkpointXid = shared->ckptXid;
	SpinLockRelease(&shared->info_lck);

	/* Now fetch current nextXid */
	currentNextXid = ReadNewTransactionId();

	*xid = currentNextXid;

	/*
	 * nextXid is certainly logically later than ckptXid.  So if it's
	 * numerically less, it must have wrapped into the next epoch.
	 */
	*epoch = (currentNextXid < checkpointXid) ? checkpointEpoch + 1
											  : checkpointEpoch;
}

/*
 * This must be called ONCE during postmaster or standalone-backend shutdown
 *
 * Writes a shutdown restartpoint (while still in recovery) or a shutdown
 * checkpoint, then shuts down CLOG, SUBTRANS, MultiXact and the distributed
 * log.  Both arguments are unused.
 */
void
ShutdownXLOG(int code __attribute__((unused)) , Datum arg __attribute__((unused)) )
{
	ereport(LOG,
			(errmsg("shutting down")));

	if (!RecoveryInProgress())
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}
	else
	{
		/* Still in recovery: a restartpoint takes the place of a checkpoint */
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}

	ShutdownCLOG();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
	DistributedLog_Shutdown();

	ereport(LOG,
			(errmsg("database system is shut down"),
					errSendAlert(true)));
}

/*
 * Calculate the last segment that we need to retain because of
 * keep_wal_segments, by subtracting keep_wal_segments from the passed
 * xlog location
 *
 * recptr is the xlog location to count back from.  *_logId / *_logSeg are
 * in/out: they are only ever moved to an older segment, never to a newer
 * one.  Nothing is changed when keep_wal_segments is zero or negative.
 */
static void
CheckKeepWalSegments(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg)
{
	uint32	log;
	uint32	seg;
	uint32	keep_log;
	uint32	keep_seg;

	if (keep_wal_segments <= 0)
		return;

	XLByteToSeg(recptr, log, seg);

	/* Split the segment count into whole log files plus a remainder */
	keep_seg = keep_wal_segments % XLogSegsPerFile;
	keep_log = keep_wal_segments / XLogSegsPerFile;

	/* All values printed here are uint32, hence %u rather than %d */
	ereport(DEBUG1,
			(errmsg("%s: Input %u %u (Keep %u %u) (current %u %u)",
					PG_FUNCNAME_MACRO, *_logId, *_logSeg, keep_log,
					keep_seg, log, seg)));
	if (seg < keep_seg)
	{
		/* Borrow one log file so the segment subtraction cannot wrap */
		keep_log += 1;
		seg = seg - keep_seg + XLogSegsPerFile;
	}
	else
	{
		seg = seg - keep_seg;
	}

	/* Avoid underflow, don't go below (0,1) */
	if (log < keep_log || (log == keep_log && seg == 0))
	{
		log = 0;
		seg = 1;
	}
	else
	{
		log = log - keep_log;
	}

	/* check not to delete WAL segments newer than the calculated segment */
	if (log < *_logId || (log == *_logId && seg < *_logSeg))
	{
		*_logId = log;
		*_logSeg = seg;
	}

	ereport(DEBUG1,
			(errmsg("%s: Output %u %u",
					PG_FUNCNAME_MACRO, *_logId, *_logSeg)));
}

9590 9591 9592 9593
/*
 * Log start of a checkpoint.
 */
static void
9594
LogCheckpointStart(int flags, bool restartpoint)
9595
{
9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607
	const char *msg;

	/*
	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
	 * the main message, but what about all the flags?
	 */
	if (restartpoint)
		msg = "restartpoint starting:%s%s%s%s%s%s%s";
	else
		msg = "checkpoint starting:%s%s%s%s%s%s%s";

	elog(LOG, msg,
9608
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
9609
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}

/*
 * Log end of a checkpoint.
 */
static void
9621
LogCheckpointEnd(bool restartpoint)
9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643
{
	long		write_secs,
				sync_secs,
				total_secs;
	int			write_usecs,
				sync_usecs,
				total_usecs;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	TimestampDifference(CheckpointStats.ckpt_start_t,
						CheckpointStats.ckpt_end_t,
						&total_secs, &total_usecs);

	TimestampDifference(CheckpointStats.ckpt_write_t,
						CheckpointStats.ckpt_sync_t,
						&write_secs, &write_usecs);

	TimestampDifference(CheckpointStats.ckpt_sync_t,
						CheckpointStats.ckpt_sync_end_t,
						&sync_secs, &sync_usecs);

9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663
	if (restartpoint)
		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
	else
		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
			 "%d transaction log file(s) added, %d removed, %d recycled; "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
9664 9665
}

9666 9667 9668
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *		ignoring checkpoint_completion_target parameter.
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *		CHECKPOINT_END_OF_RECOVERY).
 *
 * Note: flags contains other bits, of interest here only for logging purposes.
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
9681 9682
 */
void
9683
CreateCheckPoint(int flags)
9684 9685
{
	MIRRORED_LOCK_DECLARE;
9686
	READ_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;
9687

9688
	bool		shutdown;
9689 9690 9691 9692 9693 9694 9695 9696 9697
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecData rdata[6];
	char* 		dtxCheckPointInfo;
	int			dtxCheckPointInfoSize;
	uint32		freespace;
	uint32		_logId;
	uint32		_logSeg;
9698
	VirtualTransactionId *vxids;
9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711
	int     	nvxids;
	bool		resync_to_sync_transition;

	resync_to_sync_transition = (flags & CHECKPOINT_RESYNC_TO_INSYNC_TRANSITION) != 0;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;
9712 9713 9714 9715 9716 9717 9718 9719

	if (shutdown && ControlFile->state == DB_STARTUP)
	{
		return;
	}

#ifdef FAULT_INJECTOR
	/* During resync checkpoint has to complete otherwise segment cannot transition into Sync state */
9720
	if (! resync_to_sync_transition)
9721 9722 9723 9724 9725 9726 9727 9728 9729 9730
	{
		if (FaultInjector_InjectFaultIfSet(
										   Checkpoint,
										   DDLNotSpecified,
										   "" /* databaseName */,
										   "" /* tableName */) == FaultInjectorTypeSkip)
			return;  // skip checkpoint
	}
#endif

9731 9732 9733 9734
	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

9735 9736 9737 9738 9739 9740 9741 9742
	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

9743 9744 9745 9746 9747 9748 9749 9750 9751 9752
	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

9753
	if (resync_to_sync_transition)
9754 9755 9756 9757 9758 9759
	{
		LWLockAcquire(MirroredLock, LW_EXCLUSIVE);

		/* database transitions to suspended state, IO activity on the segment is suspended */
		primaryMirrorSetIOSuspended(TRUE);

9760
		SIMPLE_FAULT_INJECTOR(FileRepTransitionToInSyncBeforeCheckpoint);
9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785
	}
	else
	{
		/*
		 * Normal case.
		 */
		MIRRORED_LOCK;
	}

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	if (shutdown)
	{
		/*
		 * This is an ugly fix to dis-allow changing the pg_control
		 * state for standby promotion continuity.
		 *
		 * Refer to Startup_InProduction() for more details
		 */
		if (ControlFile->state != DB_IN_STANDBY_PROMOTED
			&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
		{
9786
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9787
			ControlFile->state = DB_SHUTDOWNING;
9788
			ControlFile->time = (pg_time_t) time(NULL);
9789
			UpdateControlFile();
9790
			LWLockRelease(ControlFileLock);
9791 9792 9793
		}
	}

9794 9795 9796 9797 9798 9799 9800 9801
	/*
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
9802
	MemSet(&checkPoint, 0, sizeof(checkPoint));
9803
	checkPoint.time = (pg_time_t) time(NULL);
9804 9805 9806 9807 9808

	/*
	 * The WRITE_PERSISTENT_STATE_ORDERED_LOCK gets these locks:
	 *    MirroredLock SHARED, and
	 *    PersistentObjLock EXCLUSIVE.
9809
	 * as well as set MyProc->inCommit = true.
9810 9811 9812 9813 9814 9815 9816 9817 9818
	 *
	 * The READ_PERSISTENT_STATE_ORDERED_LOCK gets this lock:
	 *    PersistentObjLock SHARED.
	 *
	 * They do this to prevent Persistent object changes during checkpoint and
	 * prevent persistent object reads while writing.  And acquire the MirroredLock
	 * at a level that blocks DDL during FileRep statechanges...
	 */

9819 9820 9821 9822
	/*
	 * We must hold WALInsertLock while examining insert state to determine
	 * the checkpoint REDO pointer.
	 */
9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	/*
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
	 *
	 * We have to make two tests to determine that nothing has happened since
	 * the start of the last checkpoint: current insertion point must match
	 * the end of the last checkpoint record, and its redo pointer must point
	 * to itself.
	 */
9840 9841
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
#ifdef originalCheckpointChecking
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
#else
		/*
		 * GP: Modified since the checkpoint record is not fixed length
		 * so we keep track of the last checkpoint locations (beginning and
		 * end) and use thoe values for comparison.
		 */
		if (XLogCtl->haveLastCheckpointLoc &&
			XLByteEQ(XLogCtl->lastCheckpointLoc,ControlFile->checkPoint) &&
			XLByteEQ(curInsert,XLogCtl->lastCheckpointEndLoc) &&
			XLByteEQ(ControlFile->checkPoint,ControlFile->checkPointCopy.redo))
#endif
		{
			LWLockRelease(WALInsertLock);
9867
			if (resync_to_sync_transition)
9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884
			{
				LWLockRelease(MirroredLock);
			}
			else
			{
				/*
				 * Normal case.
				 */
				MIRRORED_UNLOCK;
			}
			LWLockRelease(CheckpointLock);

			END_CRIT_SECTION();
			return;
		}
	}

9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895
	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;

9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933
	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(Insert);
	if (freespace < SizeOfXLogRecord)
	{
		(void) AdvanceXLInsertBuffer(false);
		/* OK to ignore update return flag, since we will do flush anyway */
		freespace = INSERT_FREESPACE(Insert);
	}
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding the insert lock AND the info_lck.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/*
9934 9935
	 * Now we can release WAL insert lock, allowing other xacts to proceed
	 * while we are flushing disk buffers.
9936 9937 9938
	 */
	LWLockRelease(WALInsertLock);

9939 9940 9941 9942 9943
	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
9944
		LogCheckpointStart(flags, false);
9945

9946 9947
	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971
	/*
	 * Before flushing data, we must wait for any transactions that are
	 * currently in their commit critical sections.  If an xact inserted its
	 * commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released WALInsertLock, this test is a bit fuzzy:
	 * it is possible that we will wait for xacts we didn't really need to
	 * wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold locks longer than necessary.
	 * (In fact, the whole reason we have this issue is that xact.c does
	 * commit record XLOG insertion and clog update as two separate steps
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
	 *
	 * A transaction that has not yet set inCommit when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
9972 9973
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
9974 9975 9976
	{
		do
		{
9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987
			/*
			 * GPDB needs to AbsorbFsyncRequests() here to avoid deadlock when
			 * fsync request queue is full while backend is in commit and
			 * performing ForgetRelationFsyncRequests() or
			 * ForgetDatabaseFsyncRequests(). Since for GPDB the mdlink
			 * happens through persistent tables cleanup, during which
			 * inCommit flag is set to avoid checkpoint from happening.
			 * PostgreSQL doesn't need this as ForgetRelationFsyncRequests()
			 * or ForgetDatabaseFsyncRequests() are not under inCommit=true.
			 */
			AbsorbFsyncRequests();
9988
			pg_usleep(10000L);	/* wait for 10 msec */
9989
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9990
	}
9991
	pfree(vxids);
9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010

	/*
	 * Get the other info we need for the checkpoint record.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	LWLockRelease(XidGenLock);

	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

10011 10012 10013 10014 10015 10016
	LWLockAcquire(RelfilenodeGenLock, LW_SHARED);
	checkPoint.nextRelfilenode = ShmemVariableCache->nextRelfilenode;
	if (!shutdown)
		checkPoint.nextRelfilenode += ShmemVariableCache->relfilenodeCount;
	LWLockRelease(RelfilenodeGenLock);

10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030
	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic. Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

10031
	CheckPointGuts(checkPoint.redo, flags);
10032 10033 10034 10035 10036

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062
	 *
	 * Here is the locking order and scope:
	 *
	 * getDtxCheckPointInfoAndLock (i.e. shmControlLock)
	 * 	READ_PERSISTENT_STATE_ORDERED_LOCK (i.e. PersistentObjLock)
	 * 		mmxlog_append_checkpoint_data
	 * 		XLogInsert
	 * 	READ_PERSISTENT_STATE_ORDERED_UNLOCK
	 * freeDtxCheckPointInfoAndUnlock
	 * XLogFlush
	 *
	 * We get the PersistentObjLock to prevent Persistent Object writers as
	 * we collect the Master Mirroring information from mmxlog_append_checkpoint_data()
	 * until finally after the checkpoint record is inserted into the XLOG to prevent the
	 * persistent information from changing.
	 *
	 * For example, if we don't hold the PersistentObjLock across mmxlog_append_checkpoint_data()
	 * and XLogInsert(), another xlog activity like drop tablespace could happen in between, which
	 * might caused wrong behavior when master standby replay checkpoint record.
	 *
	 * Master standby replay (mmxlog_read_checkpoint_data) the mmxlog information stored in the checkpoint
	 * record to recreate those persistent objects like filespace, tablespace, database dir, etc. If those
	 * objects dropped after checkpoint collected persistent objects information, but before checkpoint
	 * record write to XLOG, then the standby replay would first drop the object based on mmxlog record,
	 * then recreated based on the checkpoint record. That will ends-up left behind the directories already
	 * dropped on the master, break the consistency between the master and the standby.
10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081
	 */

	getDtxCheckPointInfoAndLock(&dtxCheckPointInfo, &dtxCheckPointInfoSize);

	rdata[0].data = (char *) (&checkPoint);
	rdata[0].len = sizeof(checkPoint);
	rdata[0].buffer = InvalidBuffer;
	rdata[0].next = &(rdata[1]);

	rdata[1].data = (char *) dtxCheckPointInfo;
	rdata[1].len = dtxCheckPointInfoSize;
	rdata[1].buffer = InvalidBuffer;
	rdata[1].next = NULL;

	/*
	 * Have the master mirror sync code add filespace and tablespace
	 * meta data to keep the standby consistent. Safe to call on segments
	 * as this is a NOOP if we're not the master.
	 */
10082
	READ_PERSISTENT_STATE_ORDERED_LOCK;
10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097
	mmxlog_append_checkpoint_data(rdata);

	prepared_transaction_agg_state *p = NULL;

	getTwoPhasePreparedTransactionData(&p, "CreateCheckPoint");
	elog(PersistentRecovery_DebugPrintLevel(), "CreateCheckPoint: prepared transactions = %d", p->count);
	rdata[5].data = (char*)p;
	rdata[5].buffer = InvalidBuffer;
	rdata[5].len = PREPARED_TRANSACTION_CHECKPOINT_BYTES(p->count);
	rdata[4].next = &(rdata[5]);
	rdata[5].next = NULL;

	if (Debug_persistent_recovery_print)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112
			"CreateCheckPoint: Regular checkpoint length = %u"
			", DTX checkpoint length %u (rdata[1].next is NULL %s)"
			", Master mirroring filespace length = %u (rdata[2].next is NULL %s)"
			", Master mirroring tablespace length = %u (rdata[3].next is NULL %s)"
			", Master mirroring database directory length = %u"
			", Prepared Transaction length = %u",
			rdata[0].len,
			rdata[1].len,
			(rdata[1].next == NULL ? "true" : "false"),
			rdata[2].len,
			(rdata[2].next == NULL ? "true" : "false"),
			rdata[3].len,
			(rdata[3].next == NULL ? "true" : "false"),
			rdata[4].len,
			rdata[5].len);
10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129
	}


	/*
	 * Need to save the oldest prepared transaction XLogRecPtr for use later.
	 * It is not sufficient to just save the pointer because we may remove the
	 * space after it is written in XLogInsert.
	 */
	XLogRecPtr *ptrd_oldest_ptr = NULL;
	XLogRecPtr ptrd_oldest;

	memset(&ptrd_oldest, 0, sizeof(ptrd_oldest));

	ptrd_oldest_ptr = getTwoPhaseOldestPreparedTransactionXLogRecPtr(&rdata[5]);

	if (Debug_persistent_recovery_print)
	{
10130 10131
		elog(PersistentRecovery_DebugPrintLevel(), "CreateCheckPoint: Oldest Prepared Record = %s",
				ptrd_oldest_ptr ? XLogLocationToString(ptrd_oldest_ptr) : "NULL");
10132 10133 10134 10135 10136 10137 10138 10139 10140 10141
	}


	if (ptrd_oldest_ptr != NULL)
		memcpy(&ptrd_oldest, ptrd_oldest_ptr, sizeof(ptrd_oldest));

	recptr = XLogInsert(RM_XLOG_ID,
			            shutdown ? XLOG_CHECKPOINT_SHUTDOWN : XLOG_CHECKPOINT_ONLINE,
			            rdata);

10142 10143
	READ_PERSISTENT_STATE_ORDERED_UNLOCK;

10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156
	if (Debug_persistent_recovery_print)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
			 "CreateCheckPoint: Checkpoint location = %s, total length %u, data length %d",
			 XLogLocationToString(&recptr),
			 XLogLastInsertTotalLen(),
			 XLogLastInsertDataLen());
	}

	freeDtxCheckPointInfoAndUnlock(dtxCheckPointInfo, dtxCheckPointInfoSize, &recptr);

	XLogFlush(recptr);

10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171
	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will
	 * be overwritten at next startup.  No-one should even try, this just
	 * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
	 * we want to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0;		/* never again write WAL */
	}

10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));

	/*
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info or the oldest prepared transaction xlog record's info.
	 */
	if (ptrd_oldest_ptr != NULL && XLByteLE(ptrd_oldest, ControlFile->checkPointCopy.redo))
		XLByteToSeg(ptrd_oldest, _logId, _logSeg);
	else
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

10189 10190 10191 10192 10193 10194 10195
	if (Debug_persistent_recovery_print)
	{
		elog(PersistentRecovery_DebugPrintLevel(),
			 "CreateCheckPoint: previous checkpoint's earliest info (copy redo location %s, previous checkpoint location %s)",
			 XLogLocationToString(&ControlFile->checkPointCopy.redo),
			 XLogLocationToString2(&ControlFile->prevCheckPoint));
	}
10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
	{
		/*
		 * Ugly fix to dis-allow changing pg_control state
		 * for standby promotion continuity
		 */
		if (ControlFile->state != DB_IN_STANDBY_PROMOTED
			&& ControlFile->state != DB_IN_STANDBY_NEW_TLI_SET)
			ControlFile->state = DB_SHUTDOWNED;
	}

	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	/* crash recovery should always recover to the end of WAL */
	MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
10217
	ControlFile->time = (pg_time_t) time(NULL);
10218 10219 10220 10221 10222 10223

	/*
	 * Save the last checkpoint position.
	 */
	XLogCtl->haveLastCheckpointLoc = true;
	XLogCtl->lastCheckpointLoc = ProcLastRecPtr;
10224
	XLogCtl->lastCheckpointEndLoc = XactLastRecEnd;
10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
		xlogctl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/*
	 * We are now done with critical updates; no need for system panic if we
10242
	 * have trouble while fooling with old log segments.
10243 10244 10245 10246
	 */
	END_CRIT_SECTION();

	/*
10247 10248 10249 10250 10251 10252
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

	/*
	 * Delete old log files (those no longer needed even for previous
10253 10254 10255 10256
	 * checkpoint).
	 */
	if (gp_keep_all_xlog == false && (_logId || _logSeg))
	{
10257
		GetXLogCleanUpTo(recptr, &_logId, &_logSeg);
10258 10259

		PrevLogSeg(_logId, _logSeg);
10260
		RemoveOldXlogFiles(_logId, _logSeg, recptr);
10261 10262 10263 10264 10265 10266 10267
	}

	/*
	 * Make more log segments if needed.  (Do this after deleting offline log
	 * segments, to avoid having peak disk space usage higher than necessary.)
	 */
	if (!shutdown)
10268
		PreallocXlogFiles(recptr);
10269 10270 10271 10272 10273 10274 10275 10276

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
	 */
10277
	if (!RecoveryInProgress())
10278
		TruncateSUBTRANS(GetOldestXmin(true, false));
10279 10280

	if (Debug_persistent_recovery_print)
10281
	{
10282
		elog(PersistentRecovery_DebugPrintLevel(),
10283
			 "CreateCheckPoint: checkpoint location %s, redo location %s",
10284 10285
			 XLogLocationToString(&ControlFile->checkPoint),
			 XLogLocationToString2(&checkPoint.redo));
10286 10287 10288 10289
	}

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
10290
		LogCheckpointEnd(false);
10291

10292
	if (resync_to_sync_transition)
10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312
	{
		RequestXLogSwitch();

		FileRepResyncManager_ResyncFlatFiles();

		UpdateControlFile();

		LWLockRelease(MirroredLock);

		/* database is resumed */
		primaryMirrorSetIOSuspended(FALSE);
	}
	else
	{
		/*
		 * Normal case.
		 */
		MIRRORED_UNLOCK;
	}

10313
	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
10314 10315
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
10316 10317 10318
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);

10319 10320 10321 10322 10323 10324 10325 10326 10327 10328
	LWLockRelease(CheckpointLock);
}

/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
10329
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
10330 10331 10332 10333 10334 10335
{
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointChangeTracking();
	DistributedLog_CheckPoint();
10336
	CheckPointBuffers(flags);	/* performs all required fsyncs */
10337 10338 10339 10340
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}

10341 10342
/*
 * Run the common checkpoint flush (CheckPointGuts) for the given redo
 * location, using shutdown + immediate checkpoint flags.
 */
static void
Checkpoint_RecoveryPass(XLogRecPtr checkPointRedo)
{
	const int	recoveryPassFlags = CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE;

	CheckPointGuts(checkPointRedo, recoveryPassFlags);
}

/*
10348
 * Save a checkpoint for recovery restart if appropriate
10349
 *
10350 10351 10352 10353 10354 10355
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the bgwriter, while this one will be executed by the startup
 * process.)
10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	int			rmid;

	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Is it safe to checkpoint?  We must ask each of the resource managers
	 * whether they have any partial state information that might prevent a
	 * correct restart from this point.  If so, we skip this opportunity, but
	 * return at the next checkpoint record for another try.
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
			if (!(RmgrTable[rmid].rm_safe_restartpoint()))
10375 10376 10377 10378 10379
			{
				elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
					 rmid,
					 checkPoint->redo.xlogid,
					 checkPoint->redo.xrecoff);
10380
				return;
10381
			}
10382 10383 10384 10385 10386 10387 10388 10389
	}

	/* Update the shared RedoRecPtr */
	 SpinLockAcquire(&xlogctl->info_lck);
	 xlogctl->Insert.RedoRecPtr = checkPoint->redo;
	 SpinLockRelease(&xlogctl->info_lck);

	/*
10390 10391
	 * Copy the checkpoint record to shared memory, so that bgwriter can use
	 * it the next time it wants to perform a restartpoint.
10392
	 */
10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431
	SpinLockAcquire(&xlogctl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
}

/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since last
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
	uint32		_logId = 0;
	uint32		_logSeg = 0;

	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&xlogctl->info_lck);
	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442

	if (IsStandbyMode())
	{
		/*
		 * Select point at which we can truncate the log, which we base on the
		 * prior checkpoint's earliest info.
		*/
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
	}

	/*
10443 10444
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
10445
	 */
10446 10447 10448 10449 10450 10451 10452
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
			  (errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}
10453 10454

	/*
10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when we get hot standby capability, it would be rather
	 * weird if the database opened up for read-only connections at a
	 * point-in-time before the last shutdown. Such time travel is still
	 * possible in case of immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
10467
	 */
10468 10469 10470 10471
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
	{
		XLogRecPtr	InvalidXLogRecPtr = {0, 0};
10472

10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		LWLockRelease(CheckpointLock);
		return false;
	}

	if (log_checkpoints)
	{
		/*
		 * Prepare to accumulate statistics.
		 */
		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

		LogCheckpointStart(flags, true);
	}
10492

10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if
	 * somehow we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Currently, there is no need to truncate pg_subtrans during recovery. If
	 * we did do that, we will need to have called StartupSUBTRANS() already
	 * and then TruncateSUBTRANS() would go here.
	 */

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(true);

	ereport((log_checkpoints ? LOG : DEBUG2),
10524
			(errmsg("recovery restart point at %X/%X",
10525 10526 10527
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));

	/* XXX this is currently BROKEN because we are in the wrong process */
10528
	if (recoveryLastXTime)
10529
		ereport((log_checkpoints ? LOG : DEBUG2),
10530 10531
				(errmsg("last completed transaction was at log time %s",
						timestamptz_to_str(recoveryLastXTime))));
10532

10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550
	elog((Debug_print_qd_mirroring ? LOG : DEBUG1), "RecoveryRestartPoint: checkpoint copy redo location %s, previous checkpoint location %s",
		 XLogLocationToString(&ControlFile->checkPointCopy.redo),
		 XLogLocationToString2(&ControlFile->prevCheckPoint));

	if (IsStandbyMode())
	{
		/*
		 * Delete offline log files (those no longer needed even for previous
		 * checkpoint).
		 */
		if (gp_keep_all_xlog == false && (_logId || _logSeg))
		{
			XLogRecPtr endptr;

			/* Get the current (or recent) end of xlog */
			endptr = GetStandbyFlushRecPtr(NULL);

			PrevLogSeg(_logId, _logSeg);
10551
			RemoveOldXlogFiles(_logId, _logSeg, endptr);
10552 10553
		}
	}
10554 10555 10556

	LWLockRelease(CheckpointLock);
	return true;
10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581
}

/*
 * Write a NEXTOID log record
 *
 * The record's payload is the given OID value.  The record is inserted but
 * not flushed here.
 */
void
XLogPutNextOid(Oid nextOid)
{
	XLogRecData oidRec;

	oidRec.buffer = InvalidBuffer;
	oidRec.next = NULL;
	oidRec.data = (char *) (&nextOid);
	oidRec.len = sizeof(Oid);

	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &oidRec);

	/*
	 * No immediate flush of the NEXTOID record is needed: any of the
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update, and that has its own XLOG record which must follow the NEXTOID
	 * record.  The standard buffer LSN interlock applied to those records
	 * therefore ensures no such OID reaches disk before the NEXTOID record
	 * does.
	 *
	 * That argument only covers state "within" the database.  When a
	 * generated OID is used as a file or directory name, the basic WAL rule
	 * is in a sense violated, because the filesystem change may reach disk
	 * before the NEXTOID WAL record does.  If a crash occurs right
	 * afterward, we might re-generate the same OID after restart and find
	 * that it conflicts with the leftover file or directory.  Since for
	 * safety's sake we always loop until finding a nonconflicting filename,
	 * this poses no real problem in practice.  See pgsql-hackers discussion
	 * 27-Sep-2006.
	 */
}

10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607
/*
 * Write a NEXTRELFILENODE log record; the relfilenode analogue of
 * XLogPutNextOid.  The record's payload is the given value.
 */
void
XLogPutNextRelfilenode(Oid nextRelfilenode)
{
	XLogRecData relfilenodeRec;

	relfilenodeRec.buffer = InvalidBuffer;
	relfilenodeRec.next = NULL;
	relfilenodeRec.data = (char *) (&nextRelfilenode);
	relfilenodeRec.len = sizeof(Oid);

	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTRELFILENODE, &relfilenodeRec);
}

10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634
/*
 * Write an XLOG SWITCH record.
 *
 * This simply issues an XLogInsert request for the record; all the magic
 * happens inside XLogInsert.
 *
 * Returns either the end+1 address of the switch record, or the end+1
 * address of the prior segment if no switch record had to be written
 * because we are already at segment start.
 */
XLogRecPtr
RequestXLogSwitch(void)
{
	XLogRecData switchRec;

	/* XLOG SWITCH, alone among xlog record types, has no data */
	switchRec.data = NULL;
	switchRec.len = 0;
	switchRec.buffer = InvalidBuffer;
	switchRec.next = NULL;

	return XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &switchRec);
}

10635 10636 10637 10638 10639 10640
/*
 * Write a backup block if needed when we are setting a hint. Note that
 * this may be called for a variety of page types, not just heaps.
 *
 * Callable while holding just share lock on the buffer content.
 *
10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657
 * We can't use the plain backup block mechanism since that relies on the
 * Buffer being exclusively locked. Since some modifications (setting LSN, hint
 * bits) are allowed in a sharelocked buffer that can lead to wal checksum
 * failures. So instead we copy the page and insert the copied data as normal
 * record data.
 *
 * We only need to do something if page has not yet been full page written in
 * this checkpoint round. The LSN of the inserted wal record is returned if we
 * had to write, InvalidXLogRecPtr otherwise.
 *
 * It is possible that multiple concurrent backends could attempt to write WAL
 * records. In that case, multiple copies of the same block would be recorded
 * in separate WAL records by different backends, though that is still OK from
 * a correctness perspective.
 *
 * Note that this only works for buffers that fit the standard page model,
 * i.e. those for which buffer_std == true
10658 10659
 */
XLogRecPtr
10660
XLogSaveBufferForHint(Buffer buffer, Relation relation)
10661
{
10662 10663 10664
	XLogRecPtr recptr = InvalidXLogRecPtr;
	XLogRecPtr lsn;
	XLogRecData rdata[2];
10665
	BkpBlockWithPT bkpbwithpt;
10666

10667
	/*
10668
	 * Ensure no checkpoint can change our view of RedoRecPtr.
10669
	 */
10670
	Assert(MyProc->inCommit);
10671 10672

	/*
10673
	 * Update RedoRecPtr so XLogCheckBuffer can make the right decision
10674
	 */
10675
	GetRedoRecPtr();
10676

10677 10678 10679 10680 10681 10682 10683
	/*
	 * Setup phony rdata element for use within XLogCheckBuffer only.
	 * We reuse and reset rdata for any actual WAL record insert.
	 */
	rdata[0].buffer = buffer;
	rdata[0].buffer_std = true;

10684 10685 10686
	/*
	 * Check buffer while not holding an exclusive lock.
	 */
10687
	if (XLogCheckBuffer(rdata, false, false, &lsn, &bkpbwithpt.bkpb))
10688 10689 10690
	{
		char copied_buffer[BLCKSZ];
		char *origdata = (char *) BufferGetBlock(buffer);
10691

10692 10693 10694 10695 10696 10697 10698
		if (!RelationAllowedToGenerateXLogRecord(relation))
		{
			return recptr;
		}

		RelationGetPTInfo(relation, &bkpbwithpt.persistentTid, &bkpbwithpt.persistentSerialNum);
		
10699 10700 10701 10702 10703
		/*
		 * Copy buffer so we don't have to worry about concurrent hint bit or
		 * lsn updates. We assume pd_lower/upper cannot be changed without an
		 * exclusive lock, so the contents bkp are not racy.
		 */
10704 10705 10706 10707
		memcpy(copied_buffer, origdata, bkpbwithpt.bkpb.hole_offset);
		memcpy(copied_buffer + bkpbwithpt.bkpb.hole_offset,
			   origdata + bkpbwithpt.bkpb.hole_offset + bkpbwithpt.bkpb.hole_length,
			   BLCKSZ - bkpbwithpt.bkpb.hole_offset - bkpbwithpt.bkpb.hole_length);
10708 10709 10710 10711

		/*
		 * Header for backup block.
		 */
10712 10713
		rdata[0].data = (char *) &bkpbwithpt;
		rdata[0].len = sizeof(BkpBlockWithPT);
10714 10715 10716 10717 10718 10719 10720
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		/*
		 * Save copy of the buffer.
		 */
		rdata[1].data = copied_buffer;
10721
		rdata[1].len = BLCKSZ - bkpbwithpt.bkpb.hole_length;
10722 10723 10724 10725 10726 10727 10728
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
	}

	return recptr;
10729 10730
}

10731 10732 10733 10734
/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 */

/*
 * xlog_redo
 *		Replay one record of the XLOG resource manager.
 *
 * The record type is taken from the info bits of record->xl_info and
 * dispatched through a single if / else-if chain:
 *
 *	XLOG_NEXTOID, XLOG_NEXTRELFILENODE: reset the matching shared counter.
 *	XLOG_CHECKPOINT_SHUTDOWN, XLOG_CHECKPOINT_ONLINE: restore counters,
 *		check the timeline, and offer the checkpoint as a restartpoint.
 *	XLOG_NOOP, XLOG_SWITCH: nothing to do.
 *	XLOG_HINT: restore the inline backup block.
 *	XLOG_BACKUP_END: clear backupStartPoint if it matches the record.
 *
 * Any other info value is silently ignored.
 *
 * lsn is used by the XLOG_HINT and XLOG_BACKUP_END branches; beginLoc is
 * not used.
 */
void
xlog_redo(XLogRecPtr beginLoc __attribute__((unused)), XLogRecPtr lsn __attribute__((unused)), XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/* Backup blocks are not used in xlog records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		/*
		 * We used to try to take the maximum of ShmemVariableCache->nextOid
		 * and the recorded nextOid, but that fails if the OID counter wraps
		 * around.  Since no OID allocation should be happening during replay
		 * anyway, better to just believe the record exactly.
		 */
		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		ShmemVariableCache->nextOid = nextOid;
		ShmemVariableCache->oidCount = 0;
	}
	else if (info == XLOG_NEXTRELFILENODE)
	{
		Oid			nextRelfilenode;

		/* Same treatment as XLOG_NEXTOID: believe the record exactly. */
		memcpy(&nextRelfilenode, XLogRecGetData(record), sizeof(Oid));
		ShmemVariableCache->nextRelfilenode = nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);

		/*
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was canceled and the end-of-backup record will
		 * never arrive.
		 */
		if (StandbyMode &&
			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
			ereport(PANIC,
			(errmsg("online backup was canceled, recovery cannot continue")));

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

		/* Update shared-memory copy of checkpoint XID/epoch */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
			xlogctl->ckptXid = checkPoint.nextXid;
			SpinLockRelease(&xlogctl->info_lck);
		}

		/*
		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
		{
			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
				!list_member_int(expectedTLIs,
								 (int) checkPoint.ThisTimeLineID))
				ereport(PANIC,
						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
								checkPoint.ThisTimeLineID, ThisTimeLineID)));
			/* Following WAL records should be run with new TLI */
			ThisTimeLineID = checkPoint.ThisTimeLineID;
		}

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		/* ... but still treat OID counter as exact */
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		ShmemVariableCache->nextRelfilenode = checkPoint.nextRelfilenode;
		ShmemVariableCache->relfilenodeCount = 0;
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

		/* Update shared-memory copy of checkpoint XID/epoch */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
			xlogctl->ckptXid = checkPoint.nextXid;
			SpinLockRelease(&xlogctl->info_lck);
		}

		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_HINT)
	{
		char	   *data;
		BkpBlockWithPT bkpbwithpt;

		/*
		 * Hint bit records contain a backup block stored "inline" in the normal
		 * data since the locking when writing hint records isn't sufficient to
		 * use the normal backup block mechanism, which assumes exclusive lock
		 * on the buffer supplied.
		 *
		 * Since the only change in these backup block are hint bits, there are
		 * no recovery conflicts generated.
		 *
		 * This also means there is no corresponding API call for this,
		 * so an smgr implementation has no need to implement anything.
		 * Which means nothing is needed in md.c etc
		 */
		data = XLogRecGetData(record);
		memcpy(&bkpbwithpt, data, sizeof(BkpBlockWithPT));
		data += sizeof(BkpBlockWithPT);

		RestoreBackupBlockContents(lsn, bkpbwithpt.bkpb, data, false, false);
	}
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;

		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
				ControlFile->minRecoveryPoint = lsn;
			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
			ControlFile->backupEndRequired = false;
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
}

/*
 * xlog_desc
 *		Append a human-readable description of an XLOG-rmgr record to buf.
 *
 * The record type is taken from the info bits of record->xl_info.  Every
 * record type that xlog_redo handles has a branch here; anything else is
 * described as "UNKNOWN".  beginLoc is not used.
 */
void
xlog_desc(StringInfo buf, XLogRecPtr beginLoc, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
	char		*rec = XLogRecGetData(record);

	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint *checkpoint = (CheckPoint *) rec;

		CheckpointExtendedRecord ckptExtended;

		appendStringInfo(buf, "checkpoint: redo %X/%X; "
						 "tli %u; xid %u/%u; oid %u; relfilenode %u; multi %u; offset %u; %s",
						 checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
						 checkpoint->ThisTimeLineID,
						 checkpoint->nextXidEpoch, checkpoint->nextXid,
						 checkpoint->nextOid,
						 checkpoint->nextRelfilenode,
						 checkpoint->nextMulti,
						 checkpoint->nextMultiOffset,
				 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");

		/* Describe the extended sections that follow the basic checkpoint. */
		UnpackCheckPointRecord(record, &ckptExtended);

		if (ckptExtended.dtxCheckpointLen > 0)
		{
			appendStringInfo(buf,
				 ", checkpoint record data length = %u, DTX committed count %d, DTX data length %u, Master Mirroring information length %u",
							 record->xl_len,
							 ckptExtended.dtxCheckpoint->committedCount,
							 ckptExtended.dtxCheckpointLen,
							 ckptExtended.masterMirroringCheckpointLen);
			if (ckptExtended.ptas != NULL)
				appendStringInfo(buf,
								 ", prepared transaction agg state count = %d",
								 ckptExtended.ptas->count);

			if (ckptExtended.masterMirroringCheckpointLen > 0)
			{
				appendStringInfo(buf,
								 ", master mirroring information: %d filespaces, %d tablespaces, %d databases",
								 ckptExtended.masterMirroringCheckpoint.fspc->count,
								 ckptExtended.masterMirroringCheckpoint.tspc->count,
								 ckptExtended.masterMirroringCheckpoint.dbdir->count);
			}
		}
	}
	else if (info == XLOG_NOOP)
	{
		appendStringInfo(buf, "xlog no-op");
	}
	else if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		memcpy(&nextOid, rec, sizeof(Oid));
		appendStringInfo(buf, "nextOid: %u", nextOid);
	}
	else if (info == XLOG_HINT)
	{
		BkpBlockWithPT *bkpwithpt = (BkpBlockWithPT *) rec;

		appendStringInfo(buf, "page hint: %u/%u/%u block %u",
						 bkpwithpt->bkpb.node.spcNode,
						 bkpwithpt->bkpb.node.dbNode,
						 bkpwithpt->bkpb.node.relNode,
						 bkpwithpt->bkpb.block);
	}
	else if (info == XLOG_NEXTRELFILENODE)
	{
		Oid			nextRelfilenode;

		memcpy(&nextRelfilenode, rec, sizeof(Oid));
		appendStringInfo(buf, "nextRelfilenode: %u", nextRelfilenode);
	}
	else if (info == XLOG_SWITCH)
	{
		appendStringInfo(buf, "xlog switch");
	}
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;

		/*
		 * xlog_redo replays this record type, so describe it here too
		 * rather than reporting it as UNKNOWN.  The payload is the backup
		 * start location, read the same way xlog_redo reads it.
		 */
		memcpy(&startpoint, rec, sizeof(XLogRecPtr));
		appendStringInfo(buf, "backup end: %X/%X",
						 startpoint.xlogid, startpoint.xrecoff);
	}
	else
		appendStringInfo(buf, "UNKNOWN");
}

11009 11010
/*
 * Append a short summary of a record header to buf: the previous-record
 * pointer, the xid, a "bkpbN" marker for every backup block flagged in
 * xl_info, and finally the name of the owning resource manager.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk = 0;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	/* Backup blocks are reported 1-based. */
	while (blk < XLR_MAX_BKP_BLOCKS)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(blk))
			appendStringInfo(buf, "; bkpb%d", blk + 1);
		blk++;
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}


/*
11029 11030
 * Return the (possible) sync flag used for opening a file, depending on the
 * value of the GUC wal_sync_method.
11031
 */
11032 11033
static int
get_sync_bit(int method)
11034
{
11035 11036 11037
	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;
11038

11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050
	/*
	 * walreceiver process receives xlog data from walsender process.
	 * It needs to write the xlog data as soon as it receives and the amount it receives.
	 * As the amount of data received by it to write cannot be guaranteed to be
	 * OS/FS block size aligned, should never use O_DIRECT for the same.
	 * Also, as code is not expecting O_DIRECT to be used for xlog writes on walreceiver,
	 * the buffer pointer to perform xlog writes is not made usre to be OS/FS blocks size aligned.
	 */
	if (MyAuxProcType == WalReceiverProcess)
		return 0;

	switch (method)
11051
	{
11052 11053 11054 11055 11056 11057
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
11058 11059 11060 11061
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
			return 0;
11062
#ifdef OPEN_SYNC_FLAG
11063 11064
		case SYNC_METHOD_OPEN:
			return OPEN_SYNC_FLAG;
11065 11066
#endif
#ifdef OPEN_DATASYNC_FLAG
11067 11068
		case SYNC_METHOD_OPEN_DSYNC:
			return OPEN_DATASYNC_FLAG;
11069
#endif
11070 11071 11072
		default:
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
11073
			return 0;			/* silence warning */
11074 11075
	}
}
11076

11077 11078 11079 11080 11081 11082
/*
 * GUC support
 */

/*
 * Assign hook for the WAL sync method setting.
 *
 * When doit is false, or the method is not actually changing, nothing is
 * done.  Always returns true.
 */
bool
assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source pg_attribute_unused() )
{
	if (!doit || sync_method == new_sync_method)
		return true;

	/* Nothing to flush or reopen if no log segment is currently open. */
	if (!MirroredFlatFile_IsActive(&mirroredLogFileOpen))
		return true;

	/*
	 * To ensure that no blocks escape unsynced, force an fsync on the
	 * currently open log segment.
	 */
	if (MirroredFlatFile_Flush(&mirroredLogFileOpen,
							   /* suppressError */ true))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync log file %u, segment %u: %m",
						openLogId, openLogSeg)));

	/*
	 * If the open flag is changing, close the log file so it will be
	 * reopened (with new flag bit) at next use.
	 */
	if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
		XLogFileClose();

	return true;
}

/*
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
 *
 * The kind of sync is selected by the current sync_method.  A failing sync
 * call, or a sync_method with no case in the switch below, is reported at
 * PANIC level.
 */
void
issue_xlog_fsync(int fd, uint32 log, uint32 seg)
{
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			if (pg_fsync_no_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync log file %u, segment %u: %m",
								log, seg)));
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			if (pg_fsync_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync write-through log file %u, segment %u: %m",
								log, seg)));
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
					errmsg("could not fdatasync log file %u, segment %u: %m",
						   log, seg)));
			break;
#endif
		case SYNC_METHOD_OPEN:
		case SYNC_METHOD_OPEN_DSYNC:

			/*
			 * write synced it already.
			 *
			 * SYNC_METHOD_OPEN_DSYNC must be listed here alongside
			 * SYNC_METHOD_OPEN: get_sync_bit treats it as a valid method,
			 * and without this label it would fall into the default branch
			 * and PANIC.
			 */
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}
}

/*
 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
 * function. It creates the necessary starting checkpoint and constructs the
 * backup label file.
 *
 * There are two kind of backups: exclusive and non-exclusive. An exclusive
 * backup is started with pg_start_backup(), and there can be only one active
 * at a time. The backup label file of an exclusive backup is written to
 * $PGDATA/backup_label, and it is removed by pg_stop_backup().
 *
 * A non-exclusive backup is used for the streaming base backups (see
 * src/backend/replication/basebackup.c). The difference to exclusive backups
 * is that the backup label file is not written to disk. Instead, its would-be
 * contents are returned in *labelfile, and the caller is responsible for
 * including it in the backup archive as 'backup_label'. There can be many
 * non-exclusive backups active at the same time, and they don't conflict
 * with an exclusive backup either.
 *
 * Every successfully started non-exclusive backup must be stopped by calling
 * do_pg_stop_backup() or do_pg_abort_backup().
 */
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
{
	/* A NULL labelfile pointer selects the exclusive backup mode */
	bool		exclusive = (labelfile == NULL);

	/* base backup in recovery mode not currently supported */
	bool		backup_started_in_recovery = false;
	XLogRecPtr	checkpointloc;
	XLogRecPtr	startpoint;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	uint32		_logId;
	uint32		_logSeg;
	struct stat stat_buf;
	FILE	   *fp;
	StringInfoData labelfbuf;

	/*
	 * NOTE(review): the message mentions the replication role, but only
	 * superuser() is tested here -- confirm whether that is intended.
	 */
	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
		   errmsg("must be superuser or replication role to run a backup")));

	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

	if (!XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

	/*
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.  This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
	 *
	 * Note that forcePageWrites has no effect during an online backup from
	 * the standby.
	 *
	 * We must hold WALInsertLock to change the value of forcePageWrites, to
	 * ensure adequate interlocking against XLogInsert().
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
	{
		if (XLogCtl->Insert.exclusiveBackup)
		{
			LWLockRelease(WALInsertLock);
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackup = true;
	}
	else
		XLogCtl->Insert.nonExclusiveBackups++;
	XLogCtl->Insert.forcePageWrites = true;
	LWLockRelease(WALInsertLock);

	/*
	 * From here on the shared backup state is set, so everything that can
	 * fail must run inside the error-cleanup scope below.  In particular the
	 * XLOG switch is requested only inside that scope: an error raised by it
	 * must not leave the backup flags stuck in shared memory.
	 */

	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
	{
		bool		gotUniqueStartpoint = false;

		/*
		 * Force an XLOG file switch before the checkpoint, to ensure that the
		 * WAL segment the checkpoint is written to doesn't contain pages with
		 * old timeline IDs.  That would otherwise happen if you called
		 * pg_start_backup() right after restoring from a PITR archive: the
		 * first WAL segment containing the startup checkpoint has pages in
		 * the beginning with the old timeline ID.  That can cause trouble at
		 * recovery: we won't have a history file covering the old timeline if
		 * pg_xlog directory was not included in the base backup and the WAL
		 * archive was cleared too before starting the backup.
		 *
		 * This also ensures that we have emitted a WAL page header that has
		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
		 * compress out removable backup blocks, it won't remove any that
		 * occur after this point.
		 *
		 * During recovery, we skip forcing XLOG file switch, which means that
		 * the backup taken during recovery is not available for the special
		 * recovery case described above.
		 */
		if (!backup_started_in_recovery)
			RequestXLogSwitch();

		do
		{
			/*
			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs
			 * will have different checkpoint positions and hence different
			 * history file names, even if nothing happened in between.
			 *
			 * During recovery, establish a restartpoint if possible. We use
			 * the last restartpoint as the backup starting checkpoint. This
			 * means that two successive backup runs can have same checkpoint
			 * positions.
			 *
			 * Since the fact that we are executing do_pg_start_backup()
			 * during recovery means that checkpointer is running, we can use
			 * RequestCheckpoint() to establish a restartpoint.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
			 * passing fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));

			/*
			 * Now we need to fetch the checkpoint record location, and also
			 * its REDO pointer.  The oldest point in WAL that would be needed
			 * to restore starting from the checkpoint is precisely the REDO
			 * pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			LWLockRelease(ControlFileLock);

			/*
			 * If two base backups are started at the same time (in WAL sender
			 * processes), we need to make sure that they use different
			 * checkpoints as starting locations, because we use the starting
			 * WAL location as a unique identifier for the base backup in the
			 * end-of-backup WAL record and when we write the backup history
			 * file. Perhaps it would be better generate a separate unique ID
			 * for each backup instead of forcing another checkpoint, but
			 * taking a checkpoint right after another is not that expensive
			 * either because only few buffers have been dirtied yet.
			 *
			 * The lock is taken in exclusive mode because lastBackupStart is
			 * modified here; under a shared lock two concurrent backups could
			 * both pass the comparison with the same startpoint.
			 */
			LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
			if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			LWLockRelease(WALInsertLock);
		} while (!gotUniqueStartpoint);

		XLByteToSeg(startpoint, _logId, _logSeg);
		XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);

		/*
		 * Construct backup label file
		 */
		initStringInfo(&labelfbuf);

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
		appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
						 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
		appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
						 checkpointloc.xlogid, checkpointloc.xrecoff);
		appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
						 exclusive ? "pg_start_backup" : "streamed");
		appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
						 backup_started_in_recovery ? "standby" : "master");
		appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
		appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);

		elogif(debug_basebackup, LOG, "basebackup label file --\n%s", labelfbuf.data);

		/*
		 * Okay, write the file, or return its contents to caller.
		 */
		if (exclusive)
		{
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackup above,
			 * maybe it would be OK to just unlink any such label file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));

			/* Write, flush and fsync the label; any failure is one error */
			if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
				fflush(fp) != 0 ||
				pg_fsync(fileno(fp)) != 0 ||
				ferror(fp) ||
				FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			pfree(labelfbuf.data);
		}
		else
			*labelfile = labelfbuf.data;	/* caller takes the label text */
	}
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

	/*
	 * We're done.  As a convenience, return the starting WAL location.
	 */
	return startpoint;
}

/*
 * Error cleanup callback for pg_start_backup.
 *
 * 'arg' carries the boolean "exclusive" flag of the failed backup attempt.
 * Undoes the shared-memory bookkeeping for that attempt and switches
 * forcePageWrites off again once no backup of either kind remains.
 */
static void
pg_start_backup_callback(int code, Datum arg)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		was_exclusive = DatumGetBool(arg);

	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	/* Retract whichever marker the failed attempt had registered */
	if (!was_exclusive)
	{
		Assert(Insert->nonExclusiveBackups > 0);
		Insert->nonExclusiveBackups--;
	}
	else
	{
		Assert(Insert->exclusiveBackup);
		Insert->exclusiveBackup = false;
	}

	/* Full-page writes stay forced only while some backup is still active */
	if (Insert->nonExclusiveBackups == 0 && !Insert->exclusiveBackup)
		Insert->forcePageWrites = false;

	LWLockRelease(WALInsertLock);
}

/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.

 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
	XLogRecData rdata;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
11475 11476
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
11477 11478 11479 11480 11481 11482
	char		backupfrom[20];
	uint32		_logId;
	uint32		_logSeg;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
11483 11484
	int			seconds_before_warning;
	int			waits = 0;
11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495
	char	   *remaining;
	char	   *ptr;

	/* Currently backup during recovery not supported */
	backup_started_in_recovery = false;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
		 (errmsg("must be superuser or replication role to run a backup"))));

11496 11497 11498 11499 11500 11501
	if (!XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662
	/*
	 * OK to update backup counters and forcePageWrites
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (exclusive)
		XLogCtl->Insert.exclusiveBackup = false;
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);

	if (exclusive)
	{
		/*
		 * Read the existing label file into memory.
		 */
		struct stat statbuf;
		int			r;

		if (stat(BACKUP_LABEL_FILE, &statbuf))
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is not in progress")));
		}

		lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
		if (!lfp)
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		}
		labelfile = palloc(statbuf.st_size + 1);
		r = fread(labelfile, statbuf.st_size, 1, lfp);
		labelfile[statbuf.st_size] = '\0';

		/*
		 * Close and remove the backup label file
		 */
		if (r != 1 || ferror(lfp) || FreeFile(lfp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		if (unlink(BACKUP_LABEL_FILE) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							BACKUP_LABEL_FILE)));
	}

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * Write the backup-end xlog record
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);

	elog(LOG, "Basebackup stop point is at %X/%X.",
			   stoppoint.xlogid, stoppoint.xrecoff);

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

	/*
	 * Write the backup history file
	 */
	XLByteToSeg(startpoint, _logId, _logSeg);
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
						  startpoint.xrecoff % XLogSegSize);
	fp = AllocateFile(histfilepath, "w");
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						histfilepath)));
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
	/* transfer remaining lines from label to history file */
	fprintf(fp, "%s", remaining);
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
						histfilepath)));

	/*
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
	 */
	CleanupBackupHistory();

11663
	/*
11664 11665 11666 11667 11668 11669 11670
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required
	 * WAL needs to be transported via streaming replication (hopefully
	 * with wal_keep_segments set high enough), or some more exotic
	 * mechanism like polling and copying files from pg_xlog with script.
	 * We have no knowledge of those mechanisms, so it's up to the user to
	 * ensure that he gets all the required WAL.
11671
	 *
11672 11673 11674 11675
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic
	 * sorting property of the WAL files ensures any earlier WAL files are
	 * safely archived as well.
11676
	 *
11677 11678 11679
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, you can set statement_timeout.
11680
	 */
11681
	if (XLogArchivingActive())
T
Tom Lane 已提交
11682
	{
11683 11684
	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
	XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
T
Tom Lane 已提交
11685

11686 11687
	XLByteToSeg(startpoint, _logId, _logSeg);
	BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
11688 11689 11690 11691 11692
						  startpoint.xrecoff % XLogSegSize);

	seconds_before_warning = 60;
	waits = 0;

11693 11694
	while (XLogArchiveIsBusy(lastxlogfilename) ||
		   XLogArchiveIsBusy(histfilename))
11695 11696 11697 11698 11699 11700 11701
	{
		CHECK_FOR_INTERRUPTS();

		pg_usleep(1000000L);

		if (++waits >= seconds_before_warning)
		{
11702
			seconds_before_warning *= 2;		/* This wraps in >10 years... */
11703 11704 11705
			ereport(WARNING,
					(errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
							waits)));
11706 11707
		}
	}
11708 11709 11710 11711
	}
	else
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11712

11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749
	/*
	 * We're done.  As a convenience, return the ending WAL location.
	 */
	return stoppoint;
}

/*
 * do_pg_abort_backup: abort a running backup
 *
 * This does just the most basic steps of do_pg_stop_backup(), by taking the
 * system out of backup mode, thus making it a lot more safe to call from
 * an error handler.
 *
 * NB: This is only for aborting a non-exclusive backup that doesn't write
 * backup_label. A backup started with pg_stop_backup() needs to be finished
 * with pg_stop_backup().
 */
void
do_pg_abort_backup(void)
{
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
	XLogCtl->Insert.nonExclusiveBackups--;

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);
}


/*
 * pg_switch_xlog: switch to next xlog file
 */
Datum
11750
pg_switch_xlog(PG_FUNCTION_ARGS)
11751 11752 11753 11754 11755 11756 11757
{
	XLogRecPtr	switchpoint;
	char		location[MAXFNAMELEN];

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
11758
			 (errmsg("must be superuser to switch transaction log files"))));
11759 11760 11761 11762 11763 11764 11765 11766

	switchpoint = RequestXLogSwitch();

	/*
	 * As a convenience, return the WAL location of the switch record
	 */
	snprintf(location, sizeof(location), "%X/%X",
			 switchpoint.xlogid, switchpoint.xrecoff);
11767
	PG_RETURN_TEXT_P(cstring_to_text(location));
11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793
}

/*
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
 *
 * The result is the Write pointer of the shared LogwrtResult, formatted as
 * text in "%X/%X" form.
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS __attribute__((unused)) )
{
	char		location[MAXFNAMELEN];

	/* Make sure we have an up-to-date local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	snprintf(location, sizeof(location), "%X/%X",
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 *
 * The position is computed with INSERT_RECPTR() while holding WALInsertLock
 * in shared mode, and returned as text in "%X/%X" form.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS __attribute__((unused)) )
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecPtr	current_recptr;
	char		location[MAXFNAMELEN];

	/*
	 * Get the current end-of-WAL position ... shared lock is sufficient
	 */
	LWLockAcquire(WALInsertLock, LW_SHARED);
	INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
	LWLockRelease(WALInsertLock);

	snprintf(location, sizeof(location), "%X/%X",
			 current_recptr.xlogid, current_recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

/*
 * Compute an xlog file name and decimal byte offset given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 *
 * Note that a location exactly at a segment boundary is taken to be in
 * the previous segment.  This is usually the right thing, since the
 * expected usage is to determine which xlog file(s) are ready to archive.
 */
Datum
pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	uint32		xrecoff;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];
	Datum		values[2];
	bool		isnull[2];
	TupleDesc	resultTupleDesc;
	HeapTuple	resultHeapTuple;
	Datum		result;

	/*
	 * Read input and parse
	 */
11850
	locationstr = text_to_cstring(location);
11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not parse transaction log location \"%s\"",
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	/*
	 * Construct a tuple descriptor for the result row.  This must match this
	 * function's pg_proc entry!
	 */
	resultTupleDesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
					   INT4OID, -1, 0);

	resultTupleDesc = BlessTupleDesc(resultTupleDesc);

	/*
	 * xlogfilename
	 */
	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

11879
	values[0] = CStringGetTextDatum(xlogfilename);
11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915
	isnull[0] = false;

	/*
	 * offset
	 */
	xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;

	values[1] = UInt32GetDatum(xrecoff);
	isnull[1] = false;

	/*
	 * Tuple jam: Having first prepared your Datums, then squash together
	 */
	resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);

	result = HeapTupleGetDatum(resultHeapTuple);

	PG_RETURN_DATUM(result);
}

/*
 * Compute an xlog file name given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 */
Datum
pg_xlogfile_name(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];

11916
	locationstr = text_to_cstring(location);
11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not parse transaction log location \"%s\"",
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

11930
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107
}

/*
 * read_backup_label: look for a backup_label file and parse it
 *
 * A backup_label seen during recovery means we are recovering from a backup
 * dump, so recovery must roll forward from the checkpoint named in the label
 * file, NOT from what pg_control says.  pg_control might have been archived
 * one or more checkpoints later than the start of the dump; using it as the
 * start point could fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found, in which case the checkpoint
 * location is stored in *checkPointLoc and its REDO location in
 * RedoStartLSN.  Returns FALSE if the file does not exist.
 * *backupEndRequired is set to TRUE only when the label says the backup
 * was streamed.
 *
 * Any other open failure, a read error, or unexpected file content is
 * reported at FATAL.
 */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired)
{
	char		startxlogfilename[MAXFNAMELEN];
	char		backuptype[20];
	char		backupfrom[20];
	char		eol;
	TimeLineID	tli;
	FILE	   *label;
	int			nfields;

	*backupEndRequired = false;

	/*
	 * Try to open the label file; its absence is the normal case.
	 */
	label = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (label == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Parse the START WAL LOCATION, CHECKPOINT, BACKUP METHOD and BACKUP
	 * FROM lines.  The parsing is crude because no variability in the file
	 * format is expected.  The first two lines are mandatory and must end
	 * with a newline; the last two are optional.
	 */
	nfields = fscanf(label, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
					 &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
					 startxlogfilename, &eol);
	if (nfields != 5 || eol != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));

	nfields = fscanf(label, "CHECKPOINT LOCATION: %X/%X%c",
					 &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
					 &eol);
	if (nfields != 3 || eol != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));

	nfields = fscanf(label, "BACKUP METHOD: %19s\n", backuptype);
	if (nfields == 1)
	{
		/* "streamed" is the only backup method accepted here */
		if (strcmp(backuptype, "streamed") != 0)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
		else
			*backupEndRequired = true;
	}

	nfields = fscanf(label, "BACKUP FROM: %19s\n", backupfrom);
	if (nfields == 1)
	{
		/* only backups taken from the master are accepted */
		if (strcmp(backupfrom, "master") != 0)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	}

	if (ferror(label) || FreeFile(label))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}

/*
 * Report the latest redo apply position (XLogCtl->lastReplayedEndRecPtr).
 *
 * If targetTLI is not NULL, the current recovery target timeline is stored
 * there as well; callers that do not need it pass NULL.  Both values are
 * read while holding XLogCtl's info_lck spinlock.
 *
 * Exported to allow WAL receiver to read the pointer directly.
 */
XLogRecPtr
GetXLogReplayRecPtr(TimeLineID *targetTLI)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *shared = XLogCtl;
	XLogRecPtr	replayedUpto;

	SpinLockAcquire(&shared->info_lck);
	if (targetTLI != NULL)
		*targetTLI = shared->RecoveryTargetTLI;
	replayedUpto = shared->lastReplayedEndRecPtr;
	SpinLockRelease(&shared->info_lck);

	return replayedUpto;
}

/*
 * Get current standby flush position, ie, the last WAL position
 * known to be fsync'd to disk in standby.
 *
 * The result is whichever is further along: the position reported by
 * GetWalRcvWriteRecPtr() or the replay position from GetXLogReplayRecPtr().
 *
 * If 'targetTLI' is not NULL, it's set to the current recovery target
 * timeline.
 */
XLogRecPtr
GetStandbyFlushRecPtr(TimeLineID *targetTLI)
{
	XLogRecPtr	received = GetWalRcvWriteRecPtr(NULL);
	XLogRecPtr	replayed = GetXLogReplayRecPtr(targetTLI);

	return XLByteLT(received, replayed) ? replayed : received;
}

/*
 * GetRecoveryTargetTLI - get the current recovery target timeline ID
 */
TimeLineID
GetRecoveryTargetTLI(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	TimeLineID result;

	SpinLockAcquire(&xlogctl->info_lck);
	result = xlogctl->RecoveryTargetTLI;
	SpinLockRelease(&xlogctl->info_lck);

	return result;
}

/*
 * Error context callback for errors occurring during rm_redo().
 */
static void
rm_redo_error_callback(void *arg)
{
	RedoErrorCallBack *redoErrorCallBack = (RedoErrorCallBack*) arg;
	StringInfoData buf;

	initStringInfo(&buf);
	RmgrTable[redoErrorCallBack->record->xl_rmid].rm_desc(
												   &buf,
												   redoErrorCallBack->location,
												   redoErrorCallBack->record);

	/* don't bother emitting empty description */
	if (buf.len > 0)
		errcontext("xlog redo %s", buf.data);

	pfree(buf.data);
}

12108 12109 12110 12111 12112 12113 12114 12115 12116 12117
#if 0 /* GPDB doesn't have online backup */
/*
 * BackupInProgress: check if online backup mode is active
 *
 * This is done by checking for existence of the "backup_label" file.
 */
bool
BackupInProgress(void)
{
	struct stat stat_buf;
12118

12119
	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
V
WAL  
Vadim B. Mikheev 已提交
12120
}
B
Bruce Momjian 已提交
12121

12122 12123 12124 12125 12126 12127 12128
/*
 * CancelBackup: rename the "backup_label" file to cancel backup mode
 *
 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
 * Note that this will render an online backup in progress useless.
 * To correctly finish an online backup, pg_stop_backup must be called.
 */
V
WAL  
Vadim B. Mikheev 已提交
12129
void
12130
CancelBackup(void)
V
WAL  
Vadim B. Mikheev 已提交
12131
{
12132
	struct stat stat_buf;
V
WAL  
Vadim B. Mikheev 已提交
12133

12134 12135 12136
	/* if the file is not there, return */
	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
		return;
B
Bruce Momjian 已提交
12137

12138 12139 12140 12141
	/* remove leftover file from previously cancelled backup if it exists */
	unlink(BACKUP_LABEL_OLD);

	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
12142
	{
12143 12144 12145
		ereport(LOG,
				(errmsg("online backup mode cancelled"),
				 errdetail("\"%s\" was renamed to \"%s\".",
12146
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
12147
	}
12148
	else
12149
	{
12150 12151 12152 12153
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not cancelled"),
				 errdetail("Could not rename \"%s\" to \"%s\": %m.",
12154
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
12155
	}
12156 12157 12158
}
#endif

12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170
/*
 * Format *loc into 'buffer' and return 'buffer'.
 *
 * The short form is "xlogid/xrecoff" in hex.  With longFormat set, the
 * segment number and the offset within that segment (both derived from
 * xrecoff and XLogSegSize) are appended.
 *
 * The output is written with sprintf, i.e. without a bound, so the caller
 * must supply a buffer large enough for the requested form.
 */
static char *
XLogLocationToBuffer(char *buffer, XLogRecPtr *loc, bool longFormat)
{
	if (longFormat)
	{
		uint32		seg = loc->xrecoff / XLogSegSize;
		uint32		offset = loc->xrecoff % XLogSegSize;

		/* seg is unsigned, so it is printed with %u rather than %d */
		sprintf(buffer,
				"%X/%X (==> seg %u, offset 0x%X)",
				loc->xlogid, loc->xrecoff,
				seg, offset);
	}
	else
		sprintf(buffer,
				"%X/%X",
				loc->xlogid, loc->xrecoff);

	return buffer;
}

/*
 * XLogLocationToString family
 *
 * Each numbered variant formats *loc into its own static buffer and returns
 * a pointer to it, so differently-numbered variants can be used together in
 * the argument list of one elog() call.  The returned string is overwritten
 * by the next call that uses the same buffer; the plain and "_Long" variants
 * with the same number share a buffer.
 *
 * The plain variants use the long format only when Debug_print_qd_mirroring
 * is set; the "_Long" variants always use it.
 */
static char xlogLocationBuffer[50];
static char xlogLocationBuffer2[50];
static char xlogLocationBuffer3[50];
static char xlogLocationBuffer4[50];
static char xlogLocationBuffer5[50];

char *
XLogLocationToString(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString2(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer2, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString3(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer3, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString4(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer4, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString5(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer5, loc, Debug_print_qd_mirroring);
}

char *
XLogLocationToString_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer, loc, true);
}

char *
XLogLocationToString2_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer2, loc, true);
}

char *
XLogLocationToString3_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer3, loc, true);
}

char *
XLogLocationToString4_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer4, loc, true);
}

char *
XLogLocationToString5_Long(XLogRecPtr *loc)
{
	return XLogLocationToBuffer(xlogLocationBuffer5, loc, true);
}


void xlog_print_redo_read_buffer_not_found(
12248
	RelFileNode 	*rnode,
12249 12250 12251 12252 12253 12254 12255 12256
	BlockNumber 	blkno,
	XLogRecPtr 		lsn,
	const char 		*funcName)
{
	if (funcName != NULL)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "%s redo for %u/%u/%u did not find buffer for block %d (LSN %s)",
			 funcName,
12257 12258 12259
			 rnode->spcNode,
			 rnode->dbNode,
			 rnode->relNode,
12260 12261 12262 12263 12264
			 blkno,
			 XLogLocationToString(&lsn));
	else
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Redo for %u/%u/%u did not find buffer for block %d (LSN %s)",
12265 12266 12267
			 rnode->spcNode,
			 rnode->dbNode,
			 rnode->relNode,
12268 12269 12270 12271 12272
			 blkno,
			 XLogLocationToString(&lsn));
}

void xlog_print_redo_lsn_application(
12273
	RelFileNode	   *rnode,
12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288
	BlockNumber 	blkno,
	void			*pagePtr,
	XLogRecPtr 		lsn,
	const char 		*funcName)
{
	Page page = (Page)pagePtr;
	XLogRecPtr	pageCurrentLsn = PageGetLSN(page);
	bool willApplyChange;

	willApplyChange = XLByteLT(pageCurrentLsn, lsn);

	if (funcName != NULL)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "%s redo application for %u/%u/%u, block %d, willApplyChange = %s, current LSN %s, change LSN %s",
			 funcName,
12289 12290 12291
			 rnode->spcNode,
			 rnode->dbNode,
			 rnode->relNode,
12292 12293 12294 12295 12296 12297 12298
			 blkno,
			 (willApplyChange ? "true" : "false"),
			 XLogLocationToString(&pageCurrentLsn),
			 XLogLocationToString2(&lsn));
	else
		elog(PersistentRecovery_DebugPrintLevel(),
			 "Redo application for %u/%u/%u, block %d, willApplyChange = %s, current LSN %s, change LSN %s",
12299 12300 12301
			 rnode->spcNode,
			 rnode->dbNode,
			 rnode->relNode,
12302 12303 12304 12305 12306 12307 12308
			 blkno,
			 (willApplyChange ? "true" : "false"),
			 XLogLocationToString(&pageCurrentLsn),
			 XLogLocationToString2(&lsn));
}

/* ------------------------------------------------------
 *	Startup Process main entry point and signal handlers
 * ------------------------------------------------------
 */
12312 12313

/*
12314
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
12315
 *
12316 12317
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
12318
 */
12319
static void
12320
startupproc_quickdie(SIGNAL_ARGS)
12321
{
12322
	PG_SETMASK(&BlockSig);
12323

12324 12325 12326 12327 12328 12329 12330 12331 12332
	/*
	 * We DO NOT want to run proc_exit() callbacks -- we're here because
	 * shared memory may be corrupted, so we don't want to try to clean up our
	 * transaction.  Just nail the windows shut and get out of town.  Now that
	 * there's an atexit callback to prevent third-party code from breaking
	 * things by calling exit() directly, we have to reset the callbacks
	 * explicitly to make this work as intended.
	 */
	on_exit_reset();
12333

12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355
	/*
	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	exit(2);
}

/*
 * SIGUSR2: wake up recovery so that it can finish.
 *
 * errno is preserved across the handler so the interrupted code does not
 * see a value changed by WakeupRecovery().
 */
static void
StartupProcTriggerHandler(SIGNAL_ARGS)
{
	int			saved_errno;

	saved_errno = errno;
	WakeupRecovery();
	errno = saved_errno;
}

12356 12357 12358 12359 12360 12361 12362
/*
 * SIGUSR1: let latch facility handle the signal.
 *
 * errno is saved and restored around the call, as the other startup-process
 * signal handlers do, so the interrupted code never observes an errno value
 * changed from inside the handler.
 */
static void
StartupProcSigUsr1Handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	latch_sigusr1_handler();

	errno = save_errno;
}

12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385
/*
 * SIGHUP: set flag to re-read config file at next convenient time.
 *
 * Only got_SIGHUP is set here; recovery is then woken so that it notices
 * the flag.  errno is preserved across the handler.
 */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	int			saved_errno;

	saved_errno = errno;

	got_SIGHUP = true;
	WakeupRecovery();

	errno = saved_errno;
}

/*
 * SIGTERM: set flag to abort redo and exit.
 *
 * While in_restore_command is set the process exits right here with
 * proc_exit(1); otherwise shutdown_requested is set and recovery is woken
 * so that it can notice the request.  errno is preserved across the handler.
 */
static void
StartupProcShutdownHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	if (in_restore_command)
		proc_exit(1);
	else
		shutdown_requested = true;
	WakeupRecovery();

	errno = save_errno;
}
B
Bruce Momjian 已提交
12389

12390 12391 12392 12393
/* Handle SIGHUP and SIGTERM signals of startup process */
void
HandleStartupProcInterrupts(void)
{
12394
	/*
12395
	 * Check if we were requested to re-read config file.
12396
	 */
12397
	if (got_SIGHUP)
12398
	{
12399 12400
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
12401
	}
B
Bruce Momjian 已提交
12402

12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451
	/*
	 * Check if we were requested to exit without finishing recovery.
	 */
	if (shutdown_requested)
		proc_exit(1);

	/*
	 * Emergency bailout if postmaster has died.  This is to avoid the
	 * necessity for manual cleanup of all postmaster children.
	 */
	if (IsUnderPostmaster && !PostmasterIsAlive(true))
		exit(1);
}

/*
 * HandleCrash is registered as the signal handler for SIGILL/SIGBUS/SIGSEGV.
 *
 * It simply calls the standard handler, which will log the signal and
 * reraise the signal if needed.
 */
static void
HandleCrash(SIGNAL_ARGS)
{
	StandardHandlerForSigillSigsegvSigbus_OnMainThread("a startup process", PASS_SIGNAL_ARGS);
}

/* Main entry point for startup process */
void
StartupProcessMain(int passNum)
{
	am_startup = true;
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 */
	pqsignal(SIGHUP, StartupProcSigHupHandler);	 /* reload config file */
	pqsignal(SIGINT, SIG_IGN);					/* ignore query cancel */
	pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
12452
	pqsignal(SIGUSR1, StartupProcSigUsr1Handler);
12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482
	if (passNum == 1)
		pqsignal(SIGUSR2, StartupProcTriggerHandler);
	else
		pqsignal(SIGUSR2, SIG_IGN);

#ifdef SIGBUS
	pqsignal(SIGBUS, HandleCrash);
#endif
#ifdef SIGILL
    pqsignal(SIGILL, HandleCrash);
#endif
#ifdef SIGSEGV
	pqsignal(SIGSEGV, HandleCrash);
#endif

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	switch (passNum)
12483
	{
12484 12485
	case 1:
		StartupXLOG();
12486
		BuildFlatFiles(false);
12487 12488 12489 12490
		break;

	case 2:
	case 4:
12491
		/*
D
Daniel Gustafsson 已提交
12492
		 * NOTE: The following initialization logic was borrowed from ftsprobe.
12493
		 */
12494
		SetProcessingMode(InitProcessing);
12495

12496
		/*
12497 12498
		 * Create a resource owner to keep track of our resources (currently only
		 * buffer pins).
12499
		 */
12500 12501 12502 12503 12504 12505 12506 12507 12508
		if (passNum == 2)
		{
			CurrentResourceOwner = ResourceOwnerCreate(NULL, "Startup Pass 2");
		}
		else
		{
			Assert(passNum == 4);
			CurrentResourceOwner = ResourceOwnerCreate(NULL, "Startup Pass 4");
		}
B
Bruce Momjian 已提交
12509

12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549
		/*
		 * NOTE: AuxiliaryProcessMain has already called:
		 * NOTE:      BaseInit,
		 * NOTE:      InitAuxiliaryProcess instead of InitProcess, and
		 * NOTE:      InitBufferPoolBackend.
		 */

		InitXLOGAccess();

		SetProcessingMode(NormalProcessing);

		/*
		 * Add my PGPROC struct to the ProcArray.
		 *
		 * Once I have done this, I am visible to other backends!
		 */
		InitProcessPhase2();

		/*
		 * Initialize my entry in the shared-invalidation manager's array of
		 * per-backend data.
		 *
		 * Sets up MyBackendId, a unique backend identifier.
		 */
		MyBackendId = InvalidBackendId;

		/*
		 * Though this is a startup process and currently no one sends invalidation
		 * messages concurrently, we set sendOnly = false, since we have relcaches.
		 */
		SharedInvalBackendInit(false);

		if (MyBackendId > MaxBackends || MyBackendId <= 0)
			elog(FATAL, "bad backend id: %d", MyBackendId);

		/*
		 * bufmgr needs another initialization call too
		 */
		InitBufferPoolBackend();

12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567
		/* heap access requires the rel-cache.
		 *
		 * Pass 2 uses heap API to insert/update/delete from persistent
		 * tables.  In order to use the heap API, RelationDescriptor is
		 * required.  In pass 2, persistent tables are accessed using
		 * DirectOpen API to obtain the RelationDescriptor.  Hence, we
		 * don't need to load full relcache as in
		 * RelationCacheInitializePhase3().
		 *
		 * However, there is cache invalidation logic within heap API
		 * needs basic data structures for catalog cache to be
		 * initialized.  Hence, we need to do RelationCacheInitialize(),
		 * InitCatalogCache(), and RelationCacheInitializePhase2()
		 * before StartupXLOG_Pass2().
		 *
		 * Pass 4 needs RelationCacheInitializePhase3() to do catalog
		 * validation, after xlog replay is complete.
		 */
12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589
		RelationCacheInitialize();
		InitCatalogCache();

		/*
		 * It's now possible to do real access to the system catalogs.
		 *
		 * Load relcache entries for the system catalogs.  This must create at
		 * least the minimum set of "nailed-in" cache entries.
		 */
		RelationCacheInitializePhase2();

		if (passNum == 2)
		{
			StartupXLOG_Pass2();
		}
		else
		{
			Assert(passNum == 4);
			StartupXLOG_Pass4();
		}

		break;
B
Bruce Momjian 已提交
12590

12591
	case 3:
12592
		/*
12593
		 * Pass 3 does REDO work for all non-meta-data (i.e. not the gp_persistent_* tables).
12594
		 */
12595
		SetProcessingMode(InitProcessing);
12596 12597

		/*
12598 12599
		 * Create a resource owner to keep track of our resources (currently only
		 * buffer pins).
12600
		 */
12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734
		CurrentResourceOwner = ResourceOwnerCreate(NULL, "Startup Pass 3");

		/*
		 * NOTE: AuxiliaryProcessMain has already called:
		 * NOTE:      BaseInit,
		 * NOTE:      InitAuxiliaryProcess instead of InitProcess, and
		 * NOTE:      InitBufferPoolBackend.
		 */

		InitXLOGAccess();

		SetProcessingMode(NormalProcessing);

		StartupXLOG_Pass3();

		PgVersionRecoverMirror();
		break;

	default:
		elog(PANIC, "Unexpected pass number %d", passNum);
	}

	/*
	 * Exit normally. Exit code 0 tells postmaster that we completed
	 * recovery successfully.
	 */
	proc_exit(0);
}

/*
 * Find the logical end of the xlog.
 *
 * Starting at the redo location of the checkpoint reported by
 * XLogGetRecoveryStart(), records are read one after another until
 * XLogReadRecord() returns NULL.  The last record that could be read is
 * then read again and the resulting EndRecPtr is stored in *eofRecPtr.
 *
 * Returns STATUS_OK on success.  If the record at the start location cannot
 * be read, the segment state is set to Fault, a WARNING is logged, and
 * STATUS_ERROR is returned without scanning any further; *eofRecPtr is left
 * untouched in that case.
 */
static
int XLogGetEof(XLogRecPtr *eofRecPtr)
{
	XLogRecPtr	redoCheckpointLoc;
	CheckPoint	redoCheckpoint;

	XLogRecPtr	startLoc;

	XLogRecord	*record;
	XLogRecPtr	LastRec;

	XLogGetRecoveryStart("filerep",
						 "get checkpoint location",
						 &redoCheckpointLoc,
						 &redoCheckpoint);

	startLoc = redoCheckpoint.redo;

	XLogCloseReadRecord();

	record = XLogReadRecord(&startLoc, false, DEBUG1);
	if (record == NULL)
	{
		FileRep_SetSegmentState(SegmentStateFault, FaultTypeDB);

		elog(WARNING," couldn't read start location %s",
			 XLogLocationToString(&startLoc));

		/*
		 * Without a valid starting record there is nothing to scan from;
		 * release the reader and report the failure right away.
		 */
		XLogCloseReadRecord();
		return STATUS_ERROR;
	}

	/* walk forward, remembering the start of the last readable record */
	do
	{
		LastRec = ReadRecPtr;

		record = XLogReadRecord(NULL, false, DEBUG1);
	} while (record != NULL);

	/* re-read the last good record so that EndRecPtr points just past it */
	record = XLogReadRecord(&LastRec, false, ERROR);
	*eofRecPtr = EndRecPtr;

	XLogCloseReadRecord();

	return STATUS_OK;
}

/*
 * Zero-fill the xlog between startLocation and endLocation, one segment
 * file at a time.
 *
 * For the first segment the fill starts at startLocation's offset within
 * the segment, for the last one it stops at endLocation's offset; segments
 * in between are filled completely.  Each step is recorded through
 * FileRep_InsertConfigLogEntry().
 *
 * Returns STATUS_OK if every segment was filled.  If XLogFillZero() fails,
 * the segment state is set to Fault, the loop stops, and that failure
 * status is returned.
 */
static
int XLogReconcileEofInternal(
					XLogRecPtr	startLocation,
					XLogRecPtr	endLocation)
{

	uint32		startLogId;
	uint32		startSeg;

	uint32		endLogId;
	uint32		endSeg;

	uint32		logId;
	uint32		seg;

	uint32		startOffset;
	uint32		endOffset;

	int			status = STATUS_OK;

	Assert(XLByteLT(startLocation, endLocation));

	XLByteToSeg(startLocation, startLogId, startSeg);
	XLByteToSeg(endLocation, endLogId, endSeg);

	logId = startLogId;
	seg = startSeg;

	while (1) {

		if (logId == startLogId && seg == startSeg)
			startOffset = startLocation.xrecoff % XLogSegSize;
		else
			startOffset = 0;

		if (logId == endLogId && seg == endSeg)
			endOffset = endLocation.xrecoff % XLogSegSize;
		else
			endOffset = XLogSegSize;

		{
			char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

			/* all of these values are unsigned, hence %u throughout */
			snprintf(tmpBuf, sizeof(tmpBuf),
					 "xlog reconcile log id '%u' seg '%u' start offset '%u' end offset '%u' xlog size '%u' ",
					 logId, seg, startOffset, endOffset, XLogSegSize);

			FileRep_InsertConfigLogEntry(tmpBuf);
		}

		status = XLogFillZero(logId, seg, startOffset, endOffset);
		if (status != STATUS_OK)
		{
			FileRep_SetSegmentState(SegmentStateFault, FaultTypeIO);

			break;
		}

		if (logId == endLogId && seg == endSeg)
			break;

		NextLogSeg(logId, seg);
	}

	/* propagate a fill failure instead of always claiming success */
	return status;
}

12750 12751 12752 12753 12754 12755
/*
 * Truncate one xlog segment file at startOffset and then write zeros from
 * startOffset up to (not including) endOffset, followed by an fsync.
 *
 * logId/seg name the segment of ThisTimeLineID; the file is looked up under
 * the directory returned by makeRelativeToTxnFilespace(XLOGDIR) and must
 * already exist (it is opened without O_CREAT).
 *
 * Every failure is reported as a WARNING and turns the result into
 * STATUS_ERROR; otherwise STATUS_OK is returned.  Once the file has been
 * opened it is always closed before returning.
 */
static
int XLogFillZero(
				 uint32	logId,
				 uint32	seg,
				 uint32	startOffset,
				 uint32	endOffset)
{
	char		path[MAXPGPATH];
	char		fname[MAXPGPATH];
	char		zbuffer[XLOG_BLCKSZ];

	int			fd = -1;
	uint32		offset = startOffset;
	Size		writeLen = 0;

	int			status = STATUS_OK;
	char		*xlogDir = NULL;

	Assert(startOffset < endOffset);

	errno = 0;

	XLogFileName(fname, ThisTimeLineID, logId, seg);

	xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	if (snprintf(path, MAXPGPATH, "%s/%s", xlogDir, fname) >= MAXPGPATH) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not allocate path, path too long \"%s/%s\"",
						xlogDir, fname)));
		/* xlogDir is still needed by the message above; release it now */
		pfree(xlogDir);
		return STATUS_ERROR;
	}
	pfree(xlogDir);

	fd = open(path, O_RDWR, 0);
	if (fd < 0) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not open xlog file \"%s\" : %m",
						path)));
		return STATUS_ERROR;
	}

	/* drop everything from startOffset onwards before refilling it */
	if (ftruncate(fd, startOffset) < 0) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not truncate xlog file \"%s\" to position \"%u\" : %m",
						path, startOffset)));
		status = STATUS_ERROR;
		goto cleanup;
	}

	if (lseek(fd, (off_t) startOffset, SEEK_SET) < 0) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not seek xlog file \"%s\" to position \"%u\" : %m",
						path, startOffset)));
		status = STATUS_ERROR;
		goto cleanup;
	}

	/*
	 * Zero-fill the file.	We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));

	while (1) {
		errno = 0;

		/* write up to the next XLOG_BLCKSZ boundary, but never past endOffset */
		writeLen = (Size) Min(XLOG_BLCKSZ - (offset % XLOG_BLCKSZ), endOffset - offset);

		if ((int) write(fd, zbuffer, writeLen) != (int) writeLen) {
			int			save_errno = errno;

			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", path)));
			status = STATUS_ERROR;
			goto cleanup;
		}
		offset += writeLen;
		if (offset >= endOffset) {
			break;
		}
	}

	if (pg_fsync(fd) != 0) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", path)));
		status = STATUS_ERROR;
	}

cleanup:
	if (close(fd)) {
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", path)));
		status = STATUS_ERROR;
	}

	return status;
}
12865

12866

12867
/*
12868 12869 12870 12871 12872
 *
 *		a) get logical XLog EOF on primary
 *		b) send logical XLog EOF to mirror
 *		c) if mirror ahead then reconcile XLog EOF on mirror
 *		d) if primary ahead then reconcile XLog EOF on primary
12873
 */
12874 12875
int
XLogReconcileEofPrimary(void)
12876
{
12877 12878
	XLogRecPtr	primaryEof = {0, 0};
	XLogRecPtr	mirrorEof;
12879

12880 12881
	uint32		logId;
	uint32		seg;
12882

12883
	char		simpleFileName[MAXPGPATH];
12884

12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928
	int			status = STATUS_OK;

	status = XLogGetEof(&primaryEof);

	if (status != STATUS_OK) {
		return status;
	}

	XLByteToSeg(primaryEof, logId, seg);

	XLogFileName(simpleFileName, ThisTimeLineID, logId, seg);

	status = MirroredFlatFile_ReconcileXLogEof(
											   XLOGDIR,
											   simpleFileName,
											   primaryEof,
											   &mirrorEof);

	if (status != STATUS_OK) {
		return status;
	}

	if (XLByteEQ(primaryEof, mirrorEof))
	{
		FileRep_InsertConfigLogEntry("primary and mirror xlog eof match");
		return STATUS_OK;
	}

	if (XLByteLT(primaryEof, mirrorEof))
	{
		FileRep_InsertConfigLogEntry("primary is behind, xlog was truncated on mirror");

		status = MirrorFlatFile(
								XLOGDIR,
								simpleFileName);
		return STATUS_OK;
	}

	FileRep_InsertConfigLogEntry("mirror is behind, xlog will be copied to mirror");

	status = MirrorFlatFile(
							XLOGDIR,
							simpleFileName);
	return status;
12929 12930 12931
}

/*
12932
 *
12933
 */
12934 12935 12936 12937
int
XLogReconcileEofMirror(
		XLogRecPtr	primaryEof,
		XLogRecPtr	*mirrorEof)
12938
{
12939 12940
	XLogRecPtr	mirrorEofLocal = {0, 0};
	int			status = STATUS_OK;
12941

12942
	status = XLogGetEof(&mirrorEofLocal);
12943

12944 12945 12946 12947
	*mirrorEof = mirrorEofLocal;

	if (status != STATUS_OK) {
		return status;
12948 12949 12950
	}


12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971
	if (XLByteEQ(primaryEof, mirrorEofLocal)) {
		FileRep_InsertConfigLogEntry("primary and mirror xlog eof match");
		return STATUS_OK;
	}

	if (! XLByteLT(primaryEof, mirrorEofLocal)) {
		FileRep_InsertConfigLogEntry("mirror is behind, xlog will be truncated on primary");
		return STATUS_OK;
	}

	FileRep_InsertConfigLogEntry("primary is behind, xlog was truncated on mirror");

	status = XLogReconcileEofInternal(
						  primaryEof,
						  mirrorEofLocal);

	if (status != STATUS_OK) {
		return status;
	}

	return status;
12972 12973 12974
}

/*
12975 12976 12977
 * The routine recovers pg_control flat file on mirror side.
 *		a) It copies pg_control file from primary to mirror
 *      b) pg_control file is overwritten on mirror
12978
 *
12979
 * Status is not returned, If an error occurs segmentState will be set to Fault.
12980
 */
12981 12982
int
XLogRecoverMirrorControlFile(void)
12983
{
12984 12985
	MirroredFlatFileOpen	mirroredOpen;
	int						retval = 0;
12986

12987
	while (1) {
12988

12989
		ReadControlFile();
12990

12991 12992 12993 12994 12995 12996 12997 12998
		retval = MirroredFlatFile_Open(
							  &mirroredOpen,
							  XLOG_CONTROL_FILE_SUBDIR,
							  XLOG_CONTROL_FILE_SIMPLE,
							  O_CREAT | O_RDWR | PG_BINARY,
							  S_IRUSR | S_IWUSR,
							  /* suppressError */ false,
							  /* atomic operation */ false,
12999
							  /* isMirrorRecovery */ TRUE);
13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022
		if (retval != 0)
			break;

		retval = MirroredFlatFile_Write(
							   &mirroredOpen,
							   0,
							   ControlFile,
							   PG_CONTROL_SIZE,
							   /* suppressError */ false);
		if (retval != 0)
			break;

		retval = MirroredFlatFile_Flush(
							   &mirroredOpen,
							   /* suppressError */ false);
		if (retval != 0)
			break;

		MirroredFlatFile_Close(&mirroredOpen);
		break;
	} // while(1)

	return retval;
13023 13024
}

13025 13026 13027 13028
/*
 * The ChangeTracking module will call this xlog routine in order for
 * it to gather all the xlog records since the last checkpoint and
 * add any relevant information to the change log if necessary.
 *
 * It returns the number of records that were found (not all of them
 * were interesting to the changetracker though).
 *
 * See ChangeTracking_CreateInitialFromPreviousCheckpoint()
 * for more information.
 */
13036 13037
int XLogAddRecordsToChangeTracking(
	XLogRecPtr	*lastChangeTrackingEndLoc)
13038
{
13039 13040 13041 13042 13043 13044 13045
	XLogRecord *record;
	XLogRecPtr	redoCheckpointLoc;
	CheckPoint	redoCheckpoint;
	XLogRecPtr	startLoc;
	XLogRecPtr	lastEndLoc;
	XLogRecPtr	lastChangeTrackingLogEndLoc = {0, 0};
	int count = 0;
13046

13047
	/*
13048 13049 13050 13051 13052 13053
	 * Find latest checkpoint record and the redo record from xlog. This record
	 * will be used to find the starting point to scan xlog records to be pushed
	 * to changetracking log. This is needed either to generate/produce new change
	 * tracking log or to make the changetracking log catchup with xlog in case
	 * it has fallen behind.
	 * TODO: does this function really work for us? if so, change its name for something more global
13054
	 */
13055 13056 13057 13058
	XLogGetRecoveryStart("CHANGETRACKING",
						 "get checkpoint location",
						 &redoCheckpointLoc,
						 &redoCheckpoint);
13059

13060
	startLoc = redoCheckpoint.redo;
13061

13062 13063 13064
	XLogCloseReadRecord();
	elog(LOG, "last checkpoint location for generating initial changetracking log %s",
			XLogLocationToString(&startLoc));
13065

13066
	/*
13067 13068 13069 13070
	 * Find the last entry and thus the LSN recorded by it from the CT_FULL
	 * log. Later, it will be used to maintain the xlog and changetracking log
	 * to the same end point.
	 * We perform this when the lastChangetrackingEndLoc is not known.
13071
	 */
13072 13073
	if (lastChangeTrackingEndLoc == NULL)
	{
13074 13075 13076 13077 13078
		if (!ChangeTracking_GetLastChangeTrackingLogEndLoc(&lastChangeTrackingLogEndLoc))
		{
			return 0;
		}
		
13079 13080 13081
		elog(LOG, "last changetracked location in changetracking full log %s",
				XLogLocationToString(&lastChangeTrackingLogEndLoc));
	}
13082

13083 13084 13085 13086 13087 13088
	record = XLogReadRecord(&startLoc, false, LOG);
	if (record == NULL)
	{
		elog(ERROR," couldn't read start location %s",
			 XLogLocationToString(&startLoc));
	}
13089

13090 13091 13092 13093
	if (lastChangeTrackingEndLoc != NULL &&
		XLByteLT(*lastChangeTrackingEndLoc, EndRecPtr))
	{
		XLogCloseReadRecord();
13094

13095 13096 13097 13098 13099 13100 13101
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "XLogAddRecordsToChangeTracking: Returning 0 records through end location %s",
				 XLogLocationToString(lastChangeTrackingEndLoc));

		return 0;
	}
13102 13103

	/*
13104 13105
	 * Make a pass through all xlog records from last checkpoint and
	 * gather information from the interesting ones into the change log.
13106
	 */
13107 13108 13109 13110 13111 13112
	while (true)
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "XLogAddRecordsToChangeTracking: Going to add change tracking record for XLOG (end) location %s",
				 XLogLocationToString(&EndRecPtr));
13113

13114 13115 13116 13117 13118 13119 13120 13121
		ChangeTracking_AddRecordFromXlog(record->xl_rmid,
									     record->xl_info,
										 (XLogRecData *)XLogRecGetData(record),
										 &EndRecPtr);
		count++;

		lastEndLoc = EndRecPtr;

13122
		SIMPLE_FAULT_INJECTOR(FileRepTransitionToChangeTracking);
13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171

		if (lastChangeTrackingEndLoc != NULL)
		{
			if (XLByteEQ(EndRecPtr, *lastChangeTrackingEndLoc))
			{
				if (Debug_persistent_print)
					elog(Persistent_DebugPrintLevel(),
						 "XLogAddRecordsToChangeTracking: Returning %d records from start location %s through end location %s",
						 count,
						 XLogLocationToString(&startLoc),
						 XLogLocationToString2(lastChangeTrackingEndLoc));
				break;
			}

			record = XLogReadRecord(NULL, false, ERROR);
			Assert (record != NULL);

			if (!XLByteLE(EndRecPtr, *lastChangeTrackingEndLoc))
			{
				if (Debug_persistent_print)
					elog(Persistent_DebugPrintLevel(),
						 "XLogAddRecordsToChangeTracking: Read beyond expected last change tracking XLOG record.  "
						 "Returning %d records. "
						 "Last change tracking XLOG record (end) position is %s; scanned XLOG record (end) position is %s (start location is %s)",
						 count,
						 XLogLocationToString(lastChangeTrackingEndLoc),
						 XLogLocationToString2(&EndRecPtr),
						 XLogLocationToString3(&startLoc));
				break;
			}
		}
		else
		{
			/*
			 * Read to end of log.
			 */
			record = XLogReadRecord(NULL, false, LOG);
			if (record == NULL)
			{
				if (Debug_persistent_print)
					elog(Persistent_DebugPrintLevel(),
						 "XLogAddRecordsToChangeTracking: Returning %d records through end of log location %s",
						 count,
						 XLogLocationToString(&lastEndLoc));

				break;
			}
		}
	}
13172 13173

	/*
13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189
	 * We now need to make sure that (in the case of crash recovery) there are no
	 * records in the change tracking logs that have lsn higher than the highest lsn in xlog.
	 *
	 *	a) Find the highest lsn in xlog
	 *	b) Find the highest lsn in change tracking log files before interesting
	 *	   xlog entries from last checkpoint onwards are appended to it
	 *	   (see above)
	 *	c) if the highest lsn in change tracking > the highest lsn in xlog then
	 *		i) store in compacting shared memory the highest lsn in xlog
	 *		ii) Flush all data into CT_LOG_FULL
	 *		iii) Rename CT_LOG_FULL to CT_LOG_TRANSIENT
	 *	d) after database is started the compacting (CT_LOG_TRANSIENT) will discard all records from
	 *	   change tracking log file that are higher than the highest lsn in xlog
	 */
	if (lastChangeTrackingEndLoc == NULL)
	{
A
Asim R P 已提交
13190 13191 13192 13193 13194
		/*
		 * Xlog must have been read till the end to get last lsn on
		 * disk (EndRecPtr).
		 */
		Assert (record == NULL);
13195 13196 13197 13198 13199 13200 13201 13202 13203

		if (! (lastChangeTrackingLogEndLoc.xlogid == 0 && lastChangeTrackingLogEndLoc.xrecoff == 0) &&
			XLByteLT(EndRecPtr, lastChangeTrackingLogEndLoc))
		{
			elog(LOG,
				 "changetracking: "
				 "found last changetracking log LSN (%s) higher than last xlog LSN, "
				 "invalid records will be discarded",
				 XLogLocationToString(&lastChangeTrackingLogEndLoc));
13204

13205
			elog(LOG, "xlog LSN (%s)", XLogLocationToString(&EndRecPtr));
13206

13207 13208 13209 13210 13211 13212 13213 13214
			ChangeTracking_FsyncDataIntoLog(CTF_LOG_FULL);
			ChangeTrackingSetXLogEndLocation(EndRecPtr);
			ChangeTracking_CreateTransientLog();
		}
	}

	XLogCloseReadRecord();
	return count;
13215 13216
}

13217 13218 13219 13220 13221 13222 13223
/*
 * Walk the xlog directory (XLOGDIR made relative to the transaction
 * filespace) and call MirrorFlatFile() for every entry whose name is
 * exactly 24 upper-case hexadecimal characters.
 *
 * Returns 0 if every such call returned 0; otherwise the scan stops at the
 * first non-zero status and that status is returned.
 */
int
XLogRecoverMirror(void)
{
	char		   *xlogDir = makeRelativeToTxnFilespace(XLOGDIR);
	DIR			   *dir;
	struct dirent  *entry;
	int				retval = 0;

	dir = AllocateDir(xlogDir);

	/* retval is tested first so no further entry is read after a failure */
	while (retval == 0 && (entry = ReadDir(dir, xlogDir)) != NULL)
	{
		/* skip anything that is not a 24-character hex name */
		if (strlen(entry->d_name) != 24 ||
			strspn(entry->d_name, "0123456789ABCDEF") != 24)
			continue;

		retval = MirrorFlatFile(XLOGDIR, entry->d_name);
	}

	FreeDir(dir);
	pfree(xlogDir);

	return retval;
}

13243
/*
13244 13245 13246
 * Check to see whether the user-specified trigger file exists and whether a
 * promote request has arrived.  If either condition holds, request postmaster
 * to shut down walreceiver, wait for it to exit, and return true.
13247 13248
 */
static bool
13249
CheckForStandbyTrigger(void)
13250
{
13251
	static bool triggered = false;
13252

13253 13254
	if (triggered)
		return true;
13255

13256
	if (CheckPromoteSignal(true))
13257
	{
13258 13259 13260 13261 13262
		ereport(LOG,
				(errmsg("received promote request")));
		ShutdownWalRcv();
		triggered = true;
		return true;
13263
	}
B
Bruce Momjian 已提交
13264

13265 13266
	return false;
}
13267

13268 13269 13270 13271 13272 13273 13274 13275
/*
 * Check to see if a promote request has arrived. Should be
 * called by postmaster after receiving SIGUSR1.
 */
bool
CheckPromoteSignal(bool do_unlink)
{
	struct stat stat_buf;
13276

13277
	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
13278 13279
	{
		/*
13280 13281
		 * Since we are in a signal handler, it's not safe to elog. We
		 * silently ignore any error from unlink.
13282
		 */
13283 13284 13285
		if (do_unlink)
			unlink(PROMOTE_SIGNAL_FILE);
		return true;
13286
	}
13287 13288
	return false;
}
13289

13290
/*
13291 13292
 * Put the current standby master dbid in the shared memory, which will
 * be looked up from mmxlog.
13293
 */
13294 13295
void
SetStandbyDbid(int16 dbid)
13296
{
13297 13298
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
13299

13300 13301 13302
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->standbyDbid = dbid;
	SpinLockRelease(&xlogctl->info_lck);
13303

13304 13305 13306 13307 13308
	/*
	 * Let postmaster know we've changed standby dbid.
	 */
	SendPostmasterSignal(PMSIGNAL_SEGCONFIG_CHANGE);
}
13309

13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337
/*
 * Returns current standby dbid.
 */
int16
GetStandbyDbid(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	int16	dbid;

	SpinLockAcquire(&xlogctl->info_lck);
	dbid = xlogctl->standbyDbid;
	SpinLockRelease(&xlogctl->info_lck);

	return dbid;
}

/*
 * Accessor for the StandbyMode flag: true while standby-mode continuous
 * recovery is running.
 *
 * Note that this returns false once recovery has finished, even if we are
 * still the standby master with a primary master running.  It is only
 * meaningful in the startup process, because StandbyMode is not kept in
 * shared memory.
 */
bool
IsStandbyMode(void)
{
	return StandbyMode;
}
13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368

/*
 * Work out up to which xlog location segment files must be kept, and pass
 * that location with _logId/_logSeg on to CheckKeepWalSegments().
 *
 * The location used is the smaller of recptr and the value returned by
 * WalSndCtlGetXLogCleanUpTo(); when the latter is invalid, recptr is used.
 *
 * Unless USE_SEGWALREP is defined, this is a no-op on anything but the
 * master (segindex == MASTER_CONTENT_ID): nothing is called and
 * _logId/_logSeg are left untouched.
 */
static void
GetXLogCleanUpTo(XLogRecPtr recptr, uint32 *_logId, uint32 *_logSeg)
{
	XLogRecPtr	keepFrom;

#ifndef USE_SEGWALREP
	/* Only for MASTER check this GUC and act */
	if (GpIdentity.segindex != MASTER_CONTENT_ID)
		return;
#endif

	/*
	 * See if we have a live WAL sender and see if it has a
	 * start xlog location (with active basebackup) or standby fsync location
	 * (with active standby). That location is compared with recptr (the
	 * prev. checkpoint location) and the min of the two decides till what
	 * point the xlog seg files need to be saved.
	 */
	keepFrom = WalSndCtlGetXLogCleanUpTo();
	if (XLogRecPtrIsInvalid(keepFrom) || XLByteLT(recptr, keepFrom))
		keepFrom = recptr;

	CheckKeepWalSegments(keepFrom, _logId, _logSeg);
}
13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429

/*
 * Checks whether the current buffer page and backup page stored in the
 * WAL record are consistent or not. Before comparing the two pages, a
 * masking can be applied to the pages to ignore certain areas like hint bits,
 * unused space between pd_lower and pd_upper among other things. This
 * function should be called once WAL replay has been completed for a
 * given record.
 *
 * record:		the WAL record that has just been replayed.
 * EndRecPtr:	end location of that record; pages whose LSN is already
 *				beyond it are not compared.
 *
 * Records without backup blocks are skipped.  For each backup block present
 * in the record (and not flagged BLOCK_APPLY) the current buffer contents
 * are copied into replay_image_masked, the image stored in the record is
 * rebuilt into master_image_masked, both are masked by the resource
 * manager's rm_mask callback if it has one, and the two are compared with
 * memcmp().  A mismatch is reported with elog(FATAL).
 */
static void
checkXLogConsistency(XLogRecord *record, XLogRecPtr EndRecPtr)
{
	MIRROREDLOCK_BUFMGR_DECLARE;
	RmgrId		rmid = record->xl_rmid;
	char       *blk;

	/* Records with no backup blocks have no need for consistency checks. */
	if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
		return;

	Assert((record->xl_extended_info & XLR_CHECK_CONSISTENCY) != 0);

	/* backup blocks follow the record's main data */
	blk = (char *) XLogRecGetData(record) + record->xl_len;
	for (int i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		BkpBlock    bkpb;
		Buffer		buf;
		Page		page;
		char       *src_buffer;

		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
		{
			/*
			 * WAL record doesn't contain a block do nothing.
			 */
			continue;
		}

		/*
		 * Copy the block header out of the record and advance blk past the
		 * header and the stored page image (which omits the hole), so that
		 * it points at the next backup block even if this one is skipped.
		 */
		memcpy(&bkpb, blk, sizeof(BkpBlock));
		blk += sizeof(BkpBlock);
		src_buffer = blk;
		/* move on to point to next block */
		blk += BLCKSZ - bkpb.hole_length;

		if (bkpb.block_info & BLOCK_APPLY)
		{
			/*
			 * WAL record has already applied the page, so bypass the
			 * consistency check as that would result in comparing the full
			 * page stored in the record with itself.
			 */
			continue;
		}

		/* -------- MirroredLock ---------- */
		MIRROREDLOCK_BUFMGR_LOCK;

		/*
		 * Read the contents from the current buffer and store it in a
		 * temporary page.
		 */
		buf = XLogReadBuffer(bkpb.node, bkpb.block, false);
		if (!BufferIsValid(buf))
		{
			/*
			 * Nothing to compare against.  Release the MirroredLock before
			 * moving on so every LOCK above is paired with an UNLOCK.
			 */
			MIRROREDLOCK_BUFMGR_UNLOCK;
			continue;
		}

		page = BufferGetPage(buf);

		/*
		 * Take a copy of the local page where WAL has been applied to have a
		 * comparison base before masking it...
		 */
		memcpy(replay_image_masked, page, BLCKSZ);

		/* No need for this page anymore now that a copy is in. */
		UnlockReleaseBuffer(buf);

		MIRROREDLOCK_BUFMGR_UNLOCK;
		/* -------- MirroredLock ---------- */

		/*
		 * If the block LSN is already ahead of this WAL record, we can't
		 * expect contents to match.  This can happen if recovery is
		 * restarted.
		 */
		if (XLByteLT(EndRecPtr, PageGetLSN(replay_image_masked)))
			continue;

		/*
		 * Read the contents from the backup copy, stored in WAL record and
		 * store it in a temporary page. There is no need to allocate a new
		 * page here, a local buffer is fine to hold its contents and a mask
		 * can be directly applied on it.
		 */
		if (bkpb.hole_length == 0)
		{
			memcpy((char *) master_image_masked, src_buffer, BLCKSZ);
		}
		else
		{
			/* zero-fill the hole, anyways gets masked out */
			MemSet((char *) master_image_masked, 0, BLCKSZ);
			/* bytes before the hole */
			memcpy((char *) master_image_masked, src_buffer, bkpb.hole_offset);
			/* bytes after the hole, stored contiguously in the record */
			memcpy((char *) master_image_masked + (bkpb.hole_offset + bkpb.hole_length),
				   src_buffer + bkpb.hole_offset,
				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
		}

		/*
		 * If masking function is defined, mask both the master and replay
		 * images
		 */
		if (RmgrTable[rmid].rm_mask != NULL)
		{
			RmgrTable[rmid].rm_mask(replay_image_masked, bkpb.block);
			RmgrTable[rmid].rm_mask(master_image_masked, bkpb.block);
		}

		/* Time to compare the master and replay images. */
		if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
		{
			elog(FATAL,
				 "inconsistent page found, rel %u/%u/%u, blkno %u",
				 bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode,
				 bkpb.block);
		}
		else
		{
			elog(DEBUG1,
				 "Consistent page for rel %u/%u/%u, blkno %u",
				 bkpb.node.spcNode, bkpb.node.dbNode, bkpb.node.relNode,
				 bkpb.block);
		}
	}
}
13503 13504 13505 13506 13507 13508 13509 13510 13511 13512

/*
 * Set the recovery wakeup latch in XLogCtl.  This is how the startup
 * process is woken up to replay newly arrived WAL, or to notice that
 * failover has been requested.
 */
void
WakeupRecovery(void)
{
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}