xlog.c 98.4 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 *
 * xlog.c
4
 *		PostgreSQL transaction log manager
5 6
 *
 *
B
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.114 2003/04/25 19:45:08 tgl Exp $
11 12 13
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <fcntl.h>
T
Tom Lane 已提交
18
#include <signal.h>
19 20 21
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
22
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
23
#include <dirent.h>
24

25
#include "access/clog.h"
26
#include "access/transam.h"
27
#include "access/xact.h"
28 29
#include "access/xlog.h"
#include "access/xlogutils.h"
30
#include "catalog/catversion.h"
T
Tom Lane 已提交
31
#include "catalog/pg_control.h"
32 33
#include "storage/bufpage.h"
#include "storage/lwlock.h"
34
#include "storage/pmsignal.h"
35
#include "storage/proc.h"
36
#include "storage/sinval.h"
37
#include "storage/spin.h"
38
#include "utils/builtins.h"
39
#include "utils/guc.h"
40
#include "utils/relcache.h"
V
WAL  
Vadim B. Mikheev 已提交
41 42
#include "miscadmin.h"

43

44 45 46
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.  We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * Preference order for the default, as implemented below:
 *	1. open with O_DSYNC, when O_DSYNC exists and differs from the
 *	   O_SYNC/O_FSYNC flag ("open_datasync")
 *	2. fdatasync(), when HAVE_FDATASYNC is defined
 *	3. fsync()
 */
#define SYNC_METHOD_FSYNC		0
#define SYNC_METHOD_FDATASYNC	1
#define SYNC_METHOD_OPEN		2		/* used for both O_SYNC and
										 * O_DSYNC */

/* Pick whichever "synchronous open" flag the platform spells out */
#if defined(O_SYNC)
#define OPEN_SYNC_FLAG	   O_SYNC
#elif defined(O_FSYNC)
#define OPEN_SYNC_FLAG	  O_FSYNC
#endif

/* O_DSYNC only counts as a separate method if it is a distinct flag */
#if defined(OPEN_SYNC_FLAG)
#if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
#define OPEN_DATASYNC_FLAG	  O_DSYNC
#endif
#endif

#if defined(OPEN_DATASYNC_FLAG)
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_OPEN
#define DEFAULT_SYNC_FLAGBIT	   OPEN_DATASYNC_FLAG
#elif defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD_STR   "fdatasync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FDATASYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#else
#define DEFAULT_SYNC_METHOD_STR   "fsync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FSYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#endif


T
Tom Lane 已提交
86 87
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
88
int			XLOGbuffers = 8;
T
Tom Lane 已提交
89
int			XLOG_DEBUG = 0;
90 91
char	   *XLOG_sync_method = NULL;
const char	XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
B
Bruce Momjian 已提交
92 93
char		XLOG_archive_dir[MAXPGPATH];		/* null string means
												 * delete 'em */
T
Tom Lane 已提交
94

95
/*
 * XLOGfileslop is used in the code as the allowed "fuzz" in the number of
 * preallocated XLOG segments --- we try to have at least XLOGfiles advance
 * segments but no more than XLOGfileslop segments.  This could
 * be made a separate GUC variable, but at present I think it's sufficient
 * to hardwire it as 2*CheckPointSegments+1.  Under normal conditions, a
 * checkpoint will free no more than 2*CheckPointSegments log segments, and
 * we want to recycle all of them; the +1 allows boundary cases to happen
 * without wasting a delete/create-segment cycle.
 *
 * Note this is a macro, so it tracks the current value of
 * CheckPointSegments each time it is evaluated.
 */

#define XLOGfileslop	(2*CheckPointSegments + 1)


109 110 111 112 113 114
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int	sync_method = DEFAULT_SYNC_METHOD;
static int	open_sync_bit = DEFAULT_SYNC_FLAGBIT;

/* open() flag bit to use for XLOG files; zero whenever enableFsync is off */
#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)

#define MinXLOGbuffers	4

T
Tom Lane 已提交
117 118 119 120 121

/*
 * ThisStartUpID will be same in all backends --- it identifies current
 * instance of the database system.
 */
V
WAL  
Vadim B. Mikheev 已提交
122 123
StartUpID	ThisStartUpID = 0;

T
Tom Lane 已提交
124 125
/* Are we doing recovery by reading XLOG? */
bool		InRecovery = false;
126

T
Tom Lane 已提交
127 128
/*
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
129 130
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then the current
 * xact hasn't yet inserted any transaction-controlled XLOG records.
T
Tom Lane 已提交
131 132
 *
 * Note that XLOG records inserted outside transaction control are not
133
 * reflected into MyLastRecPtr.  They do, however, cause MyXactMadeXLogEntry
B
Bruce Momjian 已提交
134
 * to be set true.	The latter can be used to test whether the current xact
135 136
 * made any loggable changes (including out-of-xact changes, such as
 * sequence updates).
137 138 139
 *
 * When we insert/update/delete a tuple in a temporary relation, we do not
 * make any XLOG record, since we don't care about recovering the state of
B
Bruce Momjian 已提交
140
 * the temp rel after a crash.	However, we will still need to remember
141 142 143
 * whether our transaction committed or aborted in that case.  So, we must
 * set MyXactMadeTempRelUpdate true to indicate that the XID will be of
 * interest later.
T
Tom Lane 已提交
144 145
 */
XLogRecPtr	MyLastRecPtr = {0, 0};
V
Vadim B. Mikheev 已提交
146

147 148
bool		MyXactMadeXLogEntry = false;

149 150
bool		MyXactMadeTempRelUpdate = false;

T
Tom Lane 已提交
151 152 153
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts, transaction-controlled
 * or not.  ProcLastRecEnd is similar but points to end+1 of last record.
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};

XLogRecPtr	ProcLastRecEnd = {0, 0};

T
Tom Lane 已提交
160 161 162
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).  We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.
 */
static XLogRecPtr RedoRecPtr;
170

T
Tom Lane 已提交
171 172 173 174 175 176 177 178 179
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
180
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
181 182 183
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
184 185 186 187
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
188 189 190
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
191 192
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
193 194 195
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
196
 * but is updated when convenient.	Again, it exists for the convenience of
197
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
198 199 200 201 202 203 204 205 206 207
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint (ensures only one
 * checkpointer at a time; even though the postmaster won't launch
 * parallel checkpoint processes, we need this because manual checkpoints
 * could be launched simultaneously).
 *
T
Tom Lane 已提交
226 227 228
 *----------
 */
typedef struct XLogwrtRqst
229
{
T
Tom Lane 已提交
230 231
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
232
} XLogwrtRqst;
233

T
Tom Lane 已提交
234
typedef struct XLogwrtResult
235
{
T
Tom Lane 已提交
236 237
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
238
} XLogwrtResult;
239

T
Tom Lane 已提交
240 241 242
/*
 * Shared state data for XLogInsert.
 */
243 244
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
245 246 247 248 249 250
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
251 252
} XLogCtlInsert;

T
Tom Lane 已提交
253 254 255
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
256 257
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
258 259
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	uint16		curridx;		/* cache index of next block to write */
260 261
} XLogCtlWrite;

T
Tom Lane 已提交
262 263 264
/*
 * Total shared-memory state for XLOG.
 */
265 266
typedef struct XLogCtlData
{
267
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
268
	XLogCtlInsert Insert;
T
Tom Lane 已提交
269
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
270 271
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
272
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
273 274
	XLogCtlWrite Write;

T
Tom Lane 已提交
275 276
	/*
	 * These values do not change after startup, although the pointed-to
277 278 279
	 * pages and xlblocks values certainly do.	Permission to read/write
	 * the pages and xlblocks values depends on WALInsertLock and
	 * WALWriteLock.
T
Tom Lane 已提交
280
	 */
B
Bruce Momjian 已提交
281 282 283 284 285
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32		XLogCacheByte;	/* # bytes in xlog buffers */
	uint32		XLogCacheBlck;	/* highest allocated xlog buffer index */
	StartUpID	ThisStartUpID;
T
Tom Lane 已提交
286

287
	/* This value is not protected by *any* lock... */
288 289
	/* see SetSavedRedoRecPtr/GetSavedRedoRecPtr */
	XLogRecPtr	SavedRedoRecPtr;
T
Tom Lane 已提交
290

B
Bruce Momjian 已提交
291
	slock_t		info_lck;		/* locks shared LogwrtRqst/LogwrtResult */
292 293
} XLogCtlData;

294
static XLogCtlData *XLogCtl = NULL;
295

296
/*
T
Tom Lane 已提交
297
 * We maintain an image of pg_control in shared memory.
298
 */
299
static ControlFileData *ControlFile = NULL;
300

T
Tom Lane 已提交
301 302 303 304 305
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
306

T
Tom Lane 已提交
307 308 309 310 311 312 313 314 315
/*
 * Free space remaining in the current xlog page buffer: BLCKSZ minus the
 * distance of the insertion point from the start of the current page.
 */
#define INSERT_FREESPACE(Insert)  \
	(BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * xlblocks[curridx] is taken as the page's ending position, so the current
 * position is that value less the free space still left on the page.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)


/*
 * Step an xlogid/segment pair forward by one segment, rolling over into
 * the next log id after the last segment of a log file.
 */
#define NextLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) < XLogSegsPerFile-1) \
			(logSeg)++; \
		else \
		{ \
			(logId)++; \
			(logSeg) = 0; \
		} \
	} while (0)

/*
 * Step an xlogid/segment pair back by one segment, borrowing from the
 * log id when already at segment 0.  Caller must not pass the pair 0,0.
 */
#define PrevLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) == 0) \
		{ \
			(logId)--; \
			(logSeg) = XLogSegsPerFile-1; \
		} \
		else \
			(logSeg)--; \
	} while (0)
V
WAL  
Vadim B. Mikheev 已提交
343

T
Tom Lane 已提交
344 345 346 347
/*
 * Compute ID and segment from an XLogRecPtr.
 *
 * For XLByteToSeg, do the computation at face value.  For XLByteToPrevSeg,
B
Bruce Momjian 已提交
348
 * a boundary byte is taken to be in the previous segment.	This is suitable
T
Tom Lane 已提交
349 350 351 352 353 354 355 356 357 358 359
 * for deciding which segment to write given a pointer to a record end,
 * for example.
 */
/* Segment containing the byte at xlrp */
#define XLByteToSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = (xlrp).xrecoff / XLogSegSize \
	)

/* As above, but a pointer exactly on a boundary maps to the earlier segment */
#define XLByteToPrevSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = ((xlrp).xrecoff - 1) / XLogSegSize \
	)
360

361
/*
T
Tom Lane 已提交
362 363 364 365
 * Is an XLogRecPtr within a particular XLOG segment?
 *
 * For XLByteInSeg, do the computation at face value.  For XLByteInPrevSeg,
 * a boundary byte is taken to be in the previous segment.
366
 */
T
Tom Lane 已提交
367 368 369 370 371 372 373
/* True if the byte at xlrp lies in segment (logId, logSeg) */
#define XLByteInSeg(xlrp, logId, logSeg)	\
	( \
	  (xlrp).xlogid == (logId) && \
	  (xlrp).xrecoff / XLogSegSize == (logSeg) \
	)

/* Same test, with a boundary pointer counted as part of the earlier segment */
#define XLByteInPrevSeg(xlrp, logId, logSeg)	\
	( \
	  (xlrp).xlogid == (logId) && \
	  ((xlrp).xrecoff - 1) / XLogSegSize == (logSeg) \
	)
374 375


376
/*
 * Build the path of a log segment file into "path" (a buffer of at least
 * MAXPGPATH bytes): XLogDir, a slash, then log and seg each printed as
 * eight upper-case hex digits.
 */
#define XLogFileName(path, log, seg)	\
			snprintf(path, MAXPGPATH, "%s/%08X%08X",	\
					 XLogDir, log, seg)
379

T
Tom Lane 已提交
380 381 382 383 384
/* Previous buffer index, wrapping from 0 to the highest index */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/* Next buffer index, wrapping from the highest index to 0 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
385

386
/*
 * An xrecoff is acceptable as a record start if, within its page, it lies
 * at or beyond the page header and leaves room for at least a record header
 * before the end of the page.
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)
389

T
Tom Lane 已提交
390 391 392 393 394 395
/*
 * _INTL_MAXLOGRECSZ: upper bound on the space one record can occupy ---
 * the record header, the largest permitted rmgr payload, and a BkpBlock
 * header plus full page image for each of the allowed backup blocks.
 */
#define _INTL_MAXLOGRECSZ	\
	(SizeOfXLogRecord + MAXLOGRECSZ + \
	 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
396

397

T
Tom Lane 已提交
398
/* File path names */
399
static char XLogDir[MAXPGPATH];
B
Bruce Momjian 已提交
400
static char ControlFilePath[MAXPGPATH];
T
Tom Lane 已提交
401 402 403 404 405 406

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
407

T
Tom Lane 已提交
408 409 410 411 412 413 414 415 416 417
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
418

T
Tom Lane 已提交
419 420 421 422 423 424
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.
 */
425 426 427 428
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
B
Bruce Momjian 已提交
429

T
Tom Lane 已提交
430 431
/* Buffer for currently read page (BLCKSZ bytes) */
static char *readBuf = NULL;
B
Bruce Momjian 已提交
432

T
Tom Lane 已提交
433 434 435
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
436
static XLogRecord *nextRecord = NULL;
437
static StartUpID lastReadSUI;
438

V
WAL  
Vadim B. Mikheev 已提交
439 440
static bool InRedo = false;

T
Tom Lane 已提交
441 442 443

static bool AdvanceXLInsertBuffer(void);
static void XLogWrite(XLogwrtRqst WriteRqst);
B
Bruce Momjian 已提交
444 445
static int XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
446
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
447 448
					   bool find_free, int max_advance,
					   bool use_lock);
T
Tom Lane 已提交
449 450
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static void PreallocXlogFiles(XLogRecPtr endptr);
451
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr);
T
Tom Lane 已提交
452
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer);
453
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI);
T
Tom Lane 已提交
454
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr,
455
					 int whichChkpt,
B
Bruce Momjian 已提交
456
					 char *buffer);
T
Tom Lane 已提交
457 458 459 460
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
461
static void issue_xlog_fsync(void);
T
Tom Lane 已提交
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata list (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
479
XLogRecPtr
480
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
481
{
B
Bruce Momjian 已提交
482 483
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
484
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
485 486 487 488 489 490 491 492 493 494 495 496 497 498
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	uint16		curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
	crc64		rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
499
	XLogwrtRqst LogwrtRqst;
B
Bruce Momjian 已提交
500 501
	bool		updrqst;
	bool		no_tran = (rmid == RM_XLOG_ID) ? true : false;
V
Vadim B. Mikheev 已提交
502 503 504 505

	if (info & XLR_INFO_MASK)
	{
		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
506
			elog(PANIC, "XLogInsert: invalid info mask %02X",
T
Tom Lane 已提交
507
				 (info & XLR_INFO_MASK));
V
Vadim B. Mikheev 已提交
508 509 510 511
		no_tran = true;
		info &= ~XLR_INFO_MASK;
	}

T
Tom Lane 已提交
512
	/*
B
Bruce Momjian 已提交
513 514
	 * In bootstrap mode, we don't actually log anything but XLOG
	 * resources; return a phony record pointer.
T
Tom Lane 已提交
515
	 */
V
Vadim B. Mikheev 已提交
516
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
517 518
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
519
		RecPtr.xrecoff = SizeOfXLogPHD; /* start of 1st checkpoint record */
V
WAL  
Vadim B. Mikheev 已提交
520 521 522
		return (RecPtr);
	}

T
Tom Lane 已提交
523 524 525 526 527 528
	/*
	 * Here we scan the rdata list, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC yet since we don't know the final
	 * length or info bits quite yet.
	 *
B
Bruce Momjian 已提交
529 530
	 * We may have to loop back to here if a race condition is detected
	 * below. We could prevent the race by doing all this work while
531
	 * holding the insert lock, but it seems better to avoid doing CRC
B
Bruce Momjian 已提交
532 533 534 535 536 537 538 539
	 * calculations while holding the lock.  This means we have to be
	 * careful about modifying the rdata list until we know we aren't
	 * going to loop back again.  The only change we allow ourselves to
	 * make earlier is to set rdt->data = NULL in list items we have
	 * decided we will have to back up the whole buffer for.  This is OK
	 * because we will certainly decide the same thing again for those
	 * items if we do it over; doing it here saves an extra pass over the
	 * list later.
T
Tom Lane 已提交
540
	 */
541
begin:;
T
Tom Lane 已提交
542 543 544 545 546 547
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

548
	INIT_CRC64(rdata_crc);
T
Tom Lane 已提交
549
	len = 0;
B
Bruce Momjian 已提交
550
	for (rdt = rdata;;)
551 552 553
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
554
			/* Simple data, just include it */
555 556 557
			len += rdt->len;
			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
		}
T
Tom Lane 已提交
558
		else
559
		{
T
Tom Lane 已提交
560 561
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
562
			{
T
Tom Lane 已提交
563
				if (rdt->buffer == dtbuf[i])
564
				{
T
Tom Lane 已提交
565 566 567 568 569 570 571 572 573
					/* Buffer already referenced by earlier list item */
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
574
				}
T
Tom Lane 已提交
575
				if (dtbuf[i] == InvalidBuffer)
576
				{
T
Tom Lane 已提交
577 578
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
B
Bruce Momjian 已提交
579

T
Tom Lane 已提交
580 581 582
					/*
					 * XXX We assume page LSN is first data on page
					 */
B
Bruce Momjian 已提交
583
					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
T
Tom Lane 已提交
584 585
					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
					{
B
Bruce Momjian 已提交
586
						crc64		dtcrc;
T
Tom Lane 已提交
587 588 589 590 591 592 593 594 595 596

						dtbuf_bkp[i] = true;
						rdt->data = NULL;
						INIT_CRC64(dtcrc);
						COMP_CRC64(dtcrc,
								   BufferGetBlock(dtbuf[i]),
								   BLCKSZ);
						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
						COMP_CRC64(dtcrc,
B
Bruce Momjian 已提交
597
								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
T
Tom Lane 已提交
598 599 600 601 602 603 604 605 606 607
								   sizeof(BkpBlock) - sizeof(crc64));
						FIN_CRC64(dtcrc);
						dtbuf_xlg[i].crc = dtcrc;
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
608 609
				}
			}
T
Tom Lane 已提交
610
			if (i >= XLR_MAX_BKP_BLOCKS)
611
				elog(PANIC, "XLogInsert: can backup %d blocks at most",
T
Tom Lane 已提交
612
					 XLR_MAX_BKP_BLOCKS);
613
		}
T
Tom Lane 已提交
614
		/* Break out of loop when rdt points to last list item */
615 616 617 618 619
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

T
Tom Lane 已提交
620 621 622
	/*
	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
	 * all of the rmgr data might have been suppressed in favor of backup
B
Bruce Momjian 已提交
623
	 * blocks.	Currently, all callers of XLogInsert provide at least some
T
Tom Lane 已提交
624 625 626 627
	 * not-in-a-buffer data and so len == 0 should never happen, but that
	 * may not be true forever.  If you need to remove the len == 0 check,
	 * also remove the check for xl_len == 0 in ReadRecord, below.
	 */
628
	if (len == 0 || len > MAXLOGRECSZ)
629
		elog(PANIC, "XLogInsert: invalid record length %u", len);
630

631
	START_CRIT_SECTION();
632

633
	/* update LogwrtResult before doing cache fill check */
634 635 636 637 638 639 640 641 642
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		LogwrtRqst = xlogctl->LogwrtRqst;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
643

644
	/*
645 646
	 * If cache is half filled then try to acquire write lock and do
	 * XLogWrite. Ignore any fractional blocks in performing this check.
647 648 649 650 651
	 */
	LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
	if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
		(LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
		 XLogCtl->XLogCacheByte / 2))
T
Tom Lane 已提交
652
	{
653
		if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
654
		{
655 656 657 658
			LogwrtResult = XLogCtl->Write.LogwrtResult;
			if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
				XLogWrite(LogwrtRqst);
			LWLockRelease(WALWriteLock);
659 660 661
		}
	}

662 663 664
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
665 666
	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to
B
Bruce Momjian 已提交
667 668 669
	 * go back and recompute everything.  This can only happen just after
	 * a checkpoint, so it's better to be slow in this case and fast
	 * otherwise.
T
Tom Lane 已提交
670 671
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
672
	{
T
Tom Lane 已提交
673 674 675 676
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
677
		{
T
Tom Lane 已提交
678 679 680 681 682 683
			if (dtbuf[i] == InvalidBuffer)
				continue;
			if (dtbuf_bkp[i] == false &&
				XLByteLE(dtbuf_lsn[i], RedoRecPtr))
			{
				/*
B
Bruce Momjian 已提交
684 685
				 * Oops, this buffer now needs to be backed up, but we
				 * didn't think so above.  Start over.
T
Tom Lane 已提交
686
				 */
687
				LWLockRelease(WALInsertLock);
T
Tom Lane 已提交
688 689 690
				END_CRIT_SECTION();
				goto begin;
			}
691 692 693
		}
	}

T
Tom Lane 已提交
694 695 696 697 698 699 700
	/*
	 * Make additional rdata list entries for the backup blocks, so that
	 * we don't need to special-case them in the write loop.  Note that we
	 * have now irrevocably changed the input rdata list.  At the exit of
	 * this loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
B
Bruce Momjian 已提交
701 702 703
	 * up.	The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
	 * distinct buffer value (ignoring InvalidBuffer) appearing in the
	 * rdata list.
T
Tom Lane 已提交
704 705 706
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
707 708 709 710
	{
		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
			continue;

T
Tom Lane 已提交
711
		info |= XLR_SET_BKP_BLOCK(i);
712 713 714

		rdt->next = &(dtbuf_rdt[2 * i]);

B
Bruce Momjian 已提交
715
		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
716
		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
T
Tom Lane 已提交
717
		write_len += sizeof(BkpBlock);
718 719 720

		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);

B
Bruce Momjian 已提交
721
		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
722
		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
T
Tom Lane 已提交
723
		write_len += BLCKSZ;
724 725 726
		dtbuf_rdt[2 * i + 1].next = NULL;
	}

T
Tom Lane 已提交
727
	/* Insert record header */
728

T
Tom Lane 已提交
729 730
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
731 732
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
733
		updrqst = AdvanceXLInsertBuffer();
734 735 736
		freespace = BLCKSZ - SizeOfXLogPHD;
	}

T
Tom Lane 已提交
737
	curridx = Insert->curridx;
738
	record = (XLogRecord *) Insert->currpos;
T
Tom Lane 已提交
739

740
	record->xl_prev = Insert->PrevRecord;
V
Vadim B. Mikheev 已提交
741
	if (no_tran)
742 743 744 745
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
V
Vadim B. Mikheev 已提交
746 747 748
	else
		record->xl_xact_prev = MyLastRecPtr;

749
	record->xl_xid = GetCurrentTransactionId();
T
Tom Lane 已提交
750
	record->xl_len = len;		/* doesn't include backup blocks */
751
	record->xl_info = info;
752
	record->xl_rmid = rmid;
753

T
Tom Lane 已提交
754
	/* Now we can finish computing the main CRC */
B
Bruce Momjian 已提交
755
	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
756
			   SizeOfXLogRecord - sizeof(crc64));
757 758 759
	FIN_CRC64(rdata_crc);
	record->xl_crc = rdata_crc;

T
Tom Lane 已提交
760 761 762
	/* Compute record's XLOG location */
	INSERT_RECPTR(RecPtr, Insert, curridx);

J
Jan Wieck 已提交
763
	/* If first XLOG record of transaction, save it in PGPROC array */
V
Vadim B. Mikheev 已提交
764
	if (MyLastRecPtr.xrecoff == 0 && !no_tran)
765
	{
766 767 768
		/*
		 * We do not acquire SInvalLock here because of possible deadlock.
		 * Anyone who wants to inspect other procs' logRec must acquire
B
Bruce Momjian 已提交
769
		 * WALInsertLock, instead.	A better solution would be a per-PROC
770 771
		 * spinlock, but no time for that before 7.2 --- tgl 12/19/01.
		 */
772 773
		MyProc->logRec = RecPtr;
	}
V
WAL  
Vadim B. Mikheev 已提交
774 775 776

	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
777
		char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
778

779
		sprintf(buf, "INSERT @ %X/%X: ", RecPtr.xlogid, RecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
780
		xlog_outrec(buf, record);
781
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
782 783
		{
			strcat(buf, " - ");
784
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
785
		}
786
		elog(LOG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
787 788
	}

T
Tom Lane 已提交
789 790 791 792 793
	/* Record begin of record in appropriate places */
	if (!no_tran)
		MyLastRecPtr = RecPtr;
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;
794
	MyXactMadeXLogEntry = true;
T
Tom Lane 已提交
795

796
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
797
	freespace -= SizeOfXLogRecord;
798

T
Tom Lane 已提交
799 800 801 802
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
803
	{
804 805 806 807
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
808
		{
809 810 811 812 813
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
814
				write_len -= freespace;
815 816 817 818 819
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
820
				write_len -= rdata->len;
821 822 823 824
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
825 826
		}

827
		/* Use next buffer */
T
Tom Lane 已提交
828 829 830 831 832 833 834 835
		updrqst = AdvanceXLInsertBuffer();
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
836
	}
837

T
Tom Lane 已提交
838 839
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
840
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
841
	freespace = INSERT_FREESPACE(Insert);
842

V
Vadim B. Mikheev 已提交
843
	/*
B
Bruce Momjian 已提交
844 845
	 * The recptr I return is the beginning of the *next* record. This
	 * will be stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
846
	 */
T
Tom Lane 已提交
847
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
848

T
Tom Lane 已提交
849
	/* Need to update shared LogwrtRqst if some block was filled up */
850
	if (freespace < SizeOfXLogRecord)
B
Bruce Momjian 已提交
851 852
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
853 854
	else
		curridx = PrevBufIdx(curridx);
T
Tom Lane 已提交
855
	WriteRqst = XLogCtl->xlblocks[curridx];
856

857
	LWLockRelease(WALInsertLock);
858 859 860

	if (updrqst)
	{
861 862 863 864
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
T
Tom Lane 已提交
865
		/* advance global request to include new block(s) */
866 867
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
868
		/* update local result copy while I have the chance */
869 870
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
871 872
	}

873 874
	ProcLastRecEnd = RecPtr;

875
	END_CRIT_SECTION();
876

877
	return (RecPtr);
878
}
879

T
Tom Lane 已提交
880 881 882 883 884
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
885
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
886 887 888
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
889
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
890 891 892
 */
static bool
AdvanceXLInsertBuffer(void)
893
{
T
Tom Lane 已提交
894 895 896 897 898 899
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
900 901
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
902

T
Tom Lane 已提交
903 904 905
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
906

T
Tom Lane 已提交
907
	/*
B
Bruce Momjian 已提交
908 909 910
	 * Get ending-offset of the buffer page we need to replace (this may
	 * be zero if the buffer hasn't been used yet).  Fall through if it's
	 * already written out.
T
Tom Lane 已提交
911 912 913 914 915 916
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
917

T
Tom Lane 已提交
918
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
919

920
		/* Before waiting, get info_lck and update LogwrtResult */
921 922 923 924 925 926 927 928 929 930
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
			SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
		}
931 932 933 934 935 936 937 938 939

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
940
		{
941 942 943 944
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
945
			{
946 947 948
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
949
			}
950
			else
T
Tom Lane 已提交
951 952
			{
				/*
B
Bruce Momjian 已提交
953 954
				 * Have to write buffers while holding insert lock. This
				 * is not good, so only write as much as we absolutely
T
Tom Lane 已提交
955 956 957 958 959 960
				 * must.
				 */
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst);
961
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
962
				Insert->LogwrtResult = LogwrtResult;
963 964 965 966
			}
		}
	}

T
Tom Lane 已提交
967 968 969 970
	/*
	 * Now the next buffer slot is free and we can set it up to be the
	 * next output page.
	 */
971 972
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
973
	{
T
Tom Lane 已提交
974
		/* crossing a logid boundary */
975 976
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = BLCKSZ;
977
	}
T
Tom Lane 已提交
978
	else
979 980 981
		NewPageEndPtr.xrecoff += BLCKSZ;
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
T
Tom Lane 已提交
982
	Insert->curridx = nextidx;
983 984
	Insert->currpage = NewPage;
	Insert->currpos = ((char *) NewPage) + SizeOfXLogPHD;
B
Bruce Momjian 已提交
985

T
Tom Lane 已提交
986
	/*
B
Bruce Momjian 已提交
987 988
	 * Be sure to re-zero the buffer so that bytes beyond what we've
	 * written will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
989
	 */
990 991 992 993
	MemSet((char *) NewPage, 0, BLCKSZ);

	/* And fill the new page's header */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
994
	/* NewPage->xlp_info = 0; */	/* done by memset */
995 996 997
	NewPage->xlp_sui = ThisStartUpID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
T
Tom Lane 已提交
998 999

	return update_needed;
1000 1001
}

T
Tom Lane 已提交
1002 1003 1004
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
1005
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
1006
 */
1007
static void
T
Tom Lane 已提交
1008
XLogWrite(XLogwrtRqst WriteRqst)
1009
{
1010 1011
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
T
Tom Lane 已提交
1012
	bool		ispartialpage;
1013
	bool		use_existent;
1014

B
Bruce Momjian 已提交
1015 1016 1017 1018
	/*
	 * Update local LogwrtResult (caller probably did this already,
	 * but...)
	 */
T
Tom Lane 已提交
1019 1020 1021
	LogwrtResult = Write->LogwrtResult;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1022
	{
1023 1024 1025 1026 1027 1028 1029
		/*
		 * Make sure we're not ahead of the insert process.  This could
		 * happen if we're passed a bogus WriteRqst.Write that is past the
		 * end of the last page that's been initialized by
		 * AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
1030
			elog(PANIC, "XLogWrite: write request %X/%X is past end of log %X/%X",
1031 1032 1033
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
				 XLogCtl->xlblocks[Write->curridx].xlogid,
				 XLogCtl->xlblocks[Write->curridx].xrecoff);
1034

T
Tom Lane 已提交
1035 1036 1037 1038 1039
		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1040
		{
T
Tom Lane 已提交
1041 1042 1043 1044
			/*
			 * Switch to new logfile segment.
			 */
			if (openLogFile >= 0)
1045
			{
T
Tom Lane 已提交
1046
				if (close(openLogFile) != 0)
1047
					elog(PANIC, "close of log file %u, segment %u failed: %m",
T
Tom Lane 已提交
1048 1049
						 openLogId, openLogSeg);
				openLogFile = -1;
1050
			}
T
Tom Lane 已提交
1051 1052
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1053 1054 1055 1056
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1057
			openLogOff = 0;
1058

T
Tom Lane 已提交
1059
			/* update pg_control, unless someone else already did */
1060
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1061 1062 1063
			if (ControlFile->logId < openLogId ||
				(ControlFile->logId == openLogId &&
				 ControlFile->logSeg < openLogSeg + 1))
T
Tom Lane 已提交
1064 1065 1066 1067 1068
			{
				ControlFile->logId = openLogId;
				ControlFile->logSeg = openLogSeg + 1;
				ControlFile->time = time(NULL);
				UpdateControlFile();
B
Bruce Momjian 已提交
1069

1070
				/*
B
Bruce Momjian 已提交
1071 1072 1073 1074
				 * Signal postmaster to start a checkpoint if it's been
				 * too long since the last one.  (We look at local copy of
				 * RedoRecPtr which might be a little out of date, but
				 * should be close enough for this purpose.)
1075 1076 1077 1078 1079 1080 1081
				 */
				if (IsUnderPostmaster &&
					(openLogId != RedoRecPtr.xlogid ||
					 openLogSeg >= (RedoRecPtr.xrecoff / XLogSegSize) +
					 (uint32) CheckPointSegments))
				{
					if (XLOG_DEBUG)
1082
						elog(LOG, "XLogWrite: time for a checkpoint, signaling postmaster");
1083
					SendPostmasterSignal(PMSIGNAL_DO_CHECKPOINT);
1084
				}
T
Tom Lane 已提交
1085
			}
1086
			LWLockRelease(ControlFileLock);
1087 1088
		}

T
Tom Lane 已提交
1089
		if (openLogFile < 0)
1090
		{
T
Tom Lane 已提交
1091 1092 1093
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
			openLogOff = 0;
1094 1095
		}

T
Tom Lane 已提交
1096 1097
		/* Need to seek in the file? */
		if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1098
		{
T
Tom Lane 已提交
1099 1100
			openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
			if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1101
				elog(PANIC, "lseek of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1102
					 openLogId, openLogSeg, openLogOff);
1103 1104
		}

T
Tom Lane 已提交
1105 1106
		/* OK to write the page */
		from = XLogCtl->pages + Write->curridx * BLCKSZ;
1107
		errno = 0;
T
Tom Lane 已提交
1108
		if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1109 1110 1111 1112
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
1113
			elog(PANIC, "write of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1114
				 openLogId, openLogSeg, openLogOff);
1115
		}
T
Tom Lane 已提交
1116
		openLogOff += BLCKSZ;
1117

T
Tom Lane 已提交
1118 1119 1120
		/*
		 * If we just wrote the whole last page of a logfile segment,
		 * fsync the segment immediately.  This avoids having to go back
B
Bruce Momjian 已提交
1121 1122 1123
		 * and re-open prior segments when an fsync request comes along
		 * later. Doing it here ensures that one and only one backend will
		 * perform this fsync.
T
Tom Lane 已提交
1124 1125 1126
		 */
		if (openLogOff >= XLogSegSize && !ispartialpage)
		{
1127
			issue_xlog_fsync();
B
Bruce Momjian 已提交
1128
			LogwrtResult.Flush = LogwrtResult.Write;	/* end of current page */
T
Tom Lane 已提交
1129
		}
1130

T
Tom Lane 已提交
1131 1132 1133 1134 1135 1136 1137
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		Write->curridx = NextBufIdx(Write->curridx);
1138 1139
	}

T
Tom Lane 已提交
1140 1141 1142 1143 1144
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1145
	{
T
Tom Lane 已提交
1146
		/*
B
Bruce Momjian 已提交
1147 1148 1149
		 * Could get here without iterating above loop, in which case we
		 * might have no open file or the wrong one.  However, we do not
		 * need to fsync more than one file.
T
Tom Lane 已提交
1150
		 */
1151
		if (sync_method != SYNC_METHOD_OPEN)
T
Tom Lane 已提交
1152
		{
1153
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1154
			 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1155 1156
			{
				if (close(openLogFile) != 0)
1157
					elog(PANIC, "close of log file %u, segment %u failed: %m",
1158 1159 1160 1161 1162 1163 1164 1165 1166 1167
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
				openLogOff = 0;
			}
			issue_xlog_fsync();
T
Tom Lane 已提交
1168 1169
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1170 1171
	}

T
Tom Lane 已提交
1172 1173 1174
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1175 1176
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves
T
Tom Lane 已提交
1177 1178
	 * some code in a couple of places.
	 */
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
1191

T
Tom Lane 已提交
1192 1193 1194 1195 1196 1197
	Write->LogwrtResult = LogwrtResult;
}

/*
 * Make sure all XLOG data up to the given position has been flushed to
 * disk.
 *
 * NOTE: unlike XLogWrite, this is entered without WALWriteLock held, and
 * it tries to get by without acquiring that lock when it can.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	target;
	XLogwrtRqst rqst;

	/* No-op during REDO, and a quick exit if already known flushed */
	if (InRedo || XLByteLE(record, LogwrtResult.Flush))
		return;

	if (XLOG_DEBUG)
	{
		elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X",
			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
	}

	START_CRIT_SECTION();

	/*
	 * fsync tends to be very expensive, so each one should cover as much
	 * data as possible: any further data found in the xlog buffers is
	 * written and fsync'd along with the requested data.  Pushing
	 * LogwrtResult.Flush as far forward as we can improves the odds that
	 * the next caller needs no fsync at all.
	 */

	/* start from the requested position; may be raised below */
	target = record;

	/* pick up the shared write request and the current result */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		if (XLByteLT(target, xlogctl->LogwrtRqst.Write))
			target = xlogctl->LogwrtRqst.Write;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}

	/* did someone else flush far enough in the meantime? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/* no: wait for the write lock, then check once more */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			/*
			 * If the insert lock is free, flush everything inserted so
			 * far; otherwise settle for the positions we already know.
			 */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *Insert = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(Insert);

				/* end of current page, less unused space unless it's full */
				target = XLogCtl->xlblocks[Insert->curridx];
				if (!(freespace < SizeOfXLogRecord))
					target.xrecoff -= freespace;
				LWLockRelease(WALInsertLock);
				rqst.Write = target;
				rqst.Flush = target;
			}
			else
			{
				rqst.Write = target;
				rqst.Flush = record;
			}
			XLogWrite(rqst);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * Failing to reach the requested point at this stage indicates a
	 * problem, most probably a flush request lying beyond the end of
	 * XLOG.  This has been observed when a disk page carries a corrupted
	 * LSN.
	 *
	 * This used to be a PANIC, but that made the system less robust, not
	 * more: corruption on a single data page should not bring everything
	 * down.  Worse, meeting the bad page again during recovery would make
	 * the database impossible to restart.  (That has really happened in
	 * the field with 7.1 releases.  We cannot arrive here while InRedo is
	 * true, but a bad page read in and dirtied during recovery will be
	 * flushed by CreateCheckpoint when recovery ends.)
	 *
	 * So we raise ERROR in normal operation and only WARNING during
	 * recovery, allowing startup to complete despite a corrupt LSN.  For
	 * calls from xact.c the ERROR is promoted to PANIC, because xact.c
	 * calls us inside a critical section.  Calls from bufmgr.c are not
	 * inside critical sections, so a bad LSN on a data page does not
	 * force a restart.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(InRecovery ? WARNING : ERROR,
			 "XLogFlush: request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

T
Tom Lane 已提交
1318 1319 1320
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
1321 1322 1323
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
1324
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
1325 1326
 * file was used.
 *
1327
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
1328
 * place.  This should be TRUE except during bootstrap log creation.  The
1329
 * caller must *not* hold the lock at call.
1330
 *
T
Tom Lane 已提交
1331 1332
 * Returns FD of opened file.
 */
1333
static int
1334 1335
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
1336
{
1337
	char		path[MAXPGPATH];
1338
	char		tmppath[MAXPGPATH];
1339
	char		zbuffer[BLCKSZ];
1340
	int			fd;
1341
	int			nbytes;
1342 1343

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
1344 1345

	/*
B
Bruce Momjian 已提交
1346 1347
	 * Try to use existent file (checkpoint maker may have created it
	 * already)
V
Vadim B. Mikheev 已提交
1348
	 */
1349
	if (*use_existent)
V
Vadim B. Mikheev 已提交
1350
	{
1351 1352
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
1353 1354 1355
		if (fd < 0)
		{
			if (errno != ENOENT)
1356
				elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1357
					 path, log, seg);
V
Vadim B. Mikheev 已提交
1358 1359
		}
		else
B
Bruce Momjian 已提交
1360
			return (fd);
V
Vadim B. Mikheev 已提交
1361 1362
	}

1363
	/*
B
Bruce Momjian 已提交
1364 1365 1366
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible
	 * that another process is doing the same thing.  If so, we will end
	 * up pre-creating an extra log segment.  That seems OK, and better
1367
	 * than holding the lock throughout this lengthy process.
1368
	 */
1369 1370
	snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
			 XLogDir, (int) getpid());
1371 1372

	unlink(tmppath);
1373

1374
	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
1375
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
1376
					   S_IRUSR | S_IWUSR);
1377
	if (fd < 0)
1378
		elog(PANIC, "creation of file %s failed: %m", tmppath);
1379

1380
	/*
B
Bruce Momjian 已提交
1381
	 * Zero-fill the file.	We have to do this the hard way to ensure that
1382 1383
	 * all the file space has really been allocated --- on platforms that
	 * allow "holes" in files, just seeking to the end doesn't allocate
B
Bruce Momjian 已提交
1384
	 * intermediate space.	This way, we know that we have all the space
1385
	 * and (after the fsync below) that all the indirect blocks are down
1386 1387
	 * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
	 * sync future writes to the log file.
1388 1389 1390 1391
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
	{
1392
		errno = 0;
1393
		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
T
Tom Lane 已提交
1394
		{
B
Bruce Momjian 已提交
1395
			int			save_errno = errno;
T
Tom Lane 已提交
1396

B
Bruce Momjian 已提交
1397 1398 1399 1400
			/*
			 * If we fail to make the file, delete it to release disk
			 * space
			 */
1401
			unlink(tmppath);
1402 1403
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
1404

1405
			elog(PANIC, "ZeroFill failed to write %s: %m", tmppath);
T
Tom Lane 已提交
1406
		}
1407
	}
1408

1409
	if (pg_fsync(fd) != 0)
1410
		elog(PANIC, "fsync of file %s failed: %m", tmppath);
1411

V
Vadim B. Mikheev 已提交
1412
	close(fd);
T
Tom Lane 已提交
1413

1414
	/*
1415 1416
	 * Now move the segment into place with its final name.
	 *
1417 1418 1419 1420 1421
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone
	 * else has created the file while we were filling ours: if so, use
	 * ours to pre-create a future log segment.
	 */
1422
	if (!InstallXLogFileSegment(log, seg, tmppath,
B
Bruce Momjian 已提交
1423
								*use_existent, XLOGfileslop,
1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
1437
		elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460
			 path, log, seg);

	return (fd);
}

/*
 * Put an XLOG segment file in place as a current or future log segment.
 *
 * Used both for a freshly created segment (which carries a temporary
 * name while being filled) and for recycling an old segment.
 *
 * log, seg: segment to install as, or the first candidate slot.
 *
 * tmppath: current name of the file; it gets renamed into place.
 *
 * find_free: if TRUE, use the first log/seg number at or after the given
 * one for which no file exists.  If FALSE, install at exactly the given
 * number, removing whatever segment file is already there.
 *
 * max_advance: how many log/seg slots beyond the starting point may be
 * tried before giving up.  (Not used when find_free is FALSE.)
 *
 * use_lock: if TRUE, hold ControlFileLock while moving the file into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns TRUE if the file was installed, FALSE if no free slot was found
 * within max_advance.  (Any other kind of failure causes elog().)
 */
static bool
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
					   bool find_free, int max_advance,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	struct stat st;

	XLogFileName(path, log, seg);

	/*
	 * Only one process at a time should be doing this.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (find_free)
	{
		/* Walk forward until a segment number with no file is found */
		while (stat(path, &st) == 0)
		{
			max_advance--;
			if (max_advance < 0)
			{
				/* Ran out of slots we are allowed to try */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			NextLogSeg(log, seg);
			XLogFileName(path, log, seg);
		}
	}
	else
	{
		/* Forced installation: discard any segment file already there */
		unlink(path);
	}

	/*
	 * link() is preferred over rename() so that an existing logfile can
	 * never be overwritten.  None should exist at this point, though, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK
	if (link(tmppath, path) < 0)
		elog(PANIC, "link from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		elog(PANIC, "rename from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
#endif

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}

T
Tom Lane 已提交
1528 1529 1530
/*
 * Open a pre-existing logfile segment.
 */
1531 1532 1533
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
1534 1535
	char		path[MAXPGPATH];
	int			fd;
1536 1537 1538

	XLogFileName(path, log, seg);

1539 1540
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
1541 1542 1543 1544
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
1545 1546
			elog(LOG, "open of %s (log file %u, segment %u) failed: %m",
				 path, log, seg);
1547 1548
			return (fd);
		}
1549
		elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1550
			 path, log, seg);
1551 1552
	}

1553
	return (fd);
1554 1555
}

V
Vadim B. Mikheev 已提交
1556
/*
 * Preallocate the log segment following the one that contains the given
 * log endpoint.
 *
 * Nothing is done unless endptr lies at or beyond 75% of the way through
 * its segment.  In that case the next segment is created (or an existing
 * file is accepted) through XLogFileInit and closed again right away.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		nextLog;
	uint32		nextSeg;
	bool		use_existent = true;
	int			fd;

	XLByteToPrevSeg(endptr, nextLog, nextSeg);

	/* Still in the first three quarters of the segment: nothing to do */
	if (!((endptr.xrecoff - 1) % XLogSegSize >=
		  (uint32) (0.75 * XLogSegSize)))
		return;

	NextLogSeg(nextLog, nextSeg);
	fd = XLogFileInit(nextLog, nextSeg, &use_existent, true);
	close(fd);
}

/*
 * Remove or move offline all log files older or equal to passed log/seg#
1581 1582 1583
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
1584 1585
 */
static void
1586
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
1587
{
1588 1589
	uint32		endlogId;
	uint32		endlogSeg;
B
Bruce Momjian 已提交
1590 1591 1592 1593
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[32];
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
1594

1595
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
V
Vadim B. Mikheev 已提交
1596 1597 1598

	xldir = opendir(XLogDir);
	if (xldir == NULL)
1599
		elog(PANIC, "could not open transaction log directory (%s): %m",
1600
			 XLogDir);
V
Vadim B. Mikheev 已提交
1601

T
Tom Lane 已提交
1602
	sprintf(lastoff, "%08X%08X", log, seg);
V
Vadim B. Mikheev 已提交
1603 1604 1605 1606

	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
T
Tom Lane 已提交
1607 1608 1609
		if (strlen(xlde->d_name) == 16 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 16 &&
			strcmp(xlde->d_name, lastoff) <= 0)
V
Vadim B. Mikheev 已提交
1610
		{
1611
			snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlde->d_name);
1612
			if (XLOG_archive_dir[0])
1613 1614 1615
			{
				elog(LOG, "archiving transaction log file %s",
					 xlde->d_name);
B
Bruce Momjian 已提交
1616
				elog(WARNING, "archiving log files is not implemented!");
1617
			}
1618
			else
1619 1620 1621
			{
				/*
				 * Before deleting the file, see if it can be recycled as
1622
				 * a future log segment.  We allow recycling segments up
B
Bruce Momjian 已提交
1623 1624
				 * to XLOGfileslop segments beyond the current XLOG
				 * location.
1625 1626
				 */
				if (InstallXLogFileSegment(endlogId, endlogSeg, path,
B
Bruce Momjian 已提交
1627
										   true, XLOGfileslop,
1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640
										   true))
				{
					elog(LOG, "recycled transaction log file %s",
						 xlde->d_name);
				}
				else
				{
					/* No need for any more future segments... */
					elog(LOG, "removing transaction log file %s",
						 xlde->d_name);
					unlink(path);
				}
			}
V
Vadim B. Mikheev 已提交
1641 1642 1643 1644
		}
		errno = 0;
	}
	if (errno)
1645
		elog(PANIC, "could not read transaction log directory (%s): %m",
1646
			 XLogDir);
V
Vadim B. Mikheev 已提交
1647 1648 1649
	closedir(xldir);
}

T
Tom Lane 已提交
1650 1651 1652 1653 1654
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 */
1655 1656 1657 1658 1659 1660 1661 1662 1663 1664
static void
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
1665
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
1666
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1667
	{
T
Tom Lane 已提交
1668
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1669 1670
			continue;

B
Bruce Momjian 已提交
1671
		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
		blk += sizeof(BkpBlock);

		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);

		if (reln)
		{
			buffer = XLogReadBuffer(true, reln, bkpb.block);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
B
Bruce Momjian 已提交
1682
				memcpy((char *) page, blk, BLCKSZ);
1683 1684 1685 1686 1687 1688 1689 1690 1691 1692
				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}

		blk += BLCKSZ;
	}
}

T
Tom Lane 已提交
1693 1694 1695 1696 1697 1698 1699
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
1700 1701 1702 1703 1704 1705 1706 1707 1708
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	crc64		crc;
	crc64		cbuf;
	int			i;
	uint32		len = record->xl_len;
	char	   *blk;

T
Tom Lane 已提交
1709
	/* Check CRC of rmgr data and record header */
1710
	INIT_CRC64(crc);
T
Tom Lane 已提交
1711
	COMP_CRC64(crc, XLogRecGetData(record), len);
B
Bruce Momjian 已提交
1712
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
1713
			   SizeOfXLogRecord - sizeof(crc64));
1714 1715
	FIN_CRC64(crc);

T
Tom Lane 已提交
1716
	if (!EQ_CRC64(record->xl_crc, crc))
1717
	{
1718
		elog(emode, "ReadRecord: bad resource manager data checksum in record at %X/%X",
T
Tom Lane 已提交
1719
			 recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1720
		return (false);
1721 1722
	}

T
Tom Lane 已提交
1723
	/* Check CRCs of backup blocks, if any */
B
Bruce Momjian 已提交
1724
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
1725
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1726
	{
T
Tom Lane 已提交
1727
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1728 1729 1730
			continue;

		INIT_CRC64(crc);
T
Tom Lane 已提交
1731 1732 1733
		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
		COMP_CRC64(crc, blk + sizeof(crc64),
				   sizeof(BkpBlock) - sizeof(crc64));
1734
		FIN_CRC64(crc);
B
Bruce Momjian 已提交
1735 1736
		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
														 * alignment */
1737

T
Tom Lane 已提交
1738
		if (!EQ_CRC64(cbuf, crc))
1739
		{
1740
			elog(emode, "ReadRecord: bad checksum of backup block %d in record at %X/%X",
T
Tom Lane 已提交
1741
				 i + 1, recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1742
			return (false);
1743
		}
T
Tom Lane 已提交
1744
		blk += sizeof(BkpBlock) + BLCKSZ;
1745 1746
	}

B
Bruce Momjian 已提交
1747
	return (true);
1748 1749
}

T
Tom Lane 已提交
1750 1751 1752 1753 1754 1755
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
1756 1757
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG.)
T
Tom Lane 已提交
1758 1759 1760 1761 1762
 *
 * buffer is a workspace at least _INTL_MAXLOGRECSZ bytes long.  It is needed
 * to reassemble a record that crosses block boundaries.  Note that on
 * successful return, the returned record pointer always points at buffer.
 */
1763
static XLogRecord *
T
Tom Lane 已提交
1764
ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer)
1765
{
1766 1767
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
T
Tom Lane 已提交
1768 1769 1770 1771
	uint32		len,
				total_len;
	uint32		targetPageOff;
	unsigned	i;
1772
	bool		nextmode = false;
T
Tom Lane 已提交
1773 1774 1775 1776 1777 1778

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it
		 * this way, rather than just making a static array, for two
B
Bruce Momjian 已提交
1779 1780 1781 1782
		 * reasons: (1) no need to waste the storage in most
		 * instantiations of the backend; (2) a static char array isn't
		 * guaranteed to have any particular alignment, whereas malloc()
		 * will provide MAXALIGN'd storage.
T
Tom Lane 已提交
1783 1784 1785 1786
		 */
		readBuf = (char *) malloc(BLCKSZ);
		Assert(readBuf != NULL);
	}
1787

T
Tom Lane 已提交
1788
	if (RecPtr == NULL)
1789
	{
1790
		RecPtr = &tmpRecPtr;
1791
		nextmode = true;
T
Tom Lane 已提交
1792
		/* fast case if next record is on same page */
1793 1794 1795 1796 1797
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
T
Tom Lane 已提交
1798
		/* align old recptr to next page */
1799 1800 1801 1802 1803 1804 1805 1806
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
1807
	}
1808
	else if (!XRecOffIsValid(RecPtr->xrecoff))
1809
		elog(PANIC, "ReadRecord: invalid record offset at %X/%X",
1810
			 RecPtr->xlogid, RecPtr->xrecoff);
1811

T
Tom Lane 已提交
1812
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
1813
	{
1814 1815
		close(readFile);
		readFile = -1;
1816
	}
T
Tom Lane 已提交
1817
	XLByteToSeg(*RecPtr, readId, readSeg);
1818
	if (readFile < 0)
1819
	{
T
Tom Lane 已提交
1820
		readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1821 1822
		if (readFile < 0)
			goto next_record_is_invalid;
1823
		readOff = (uint32) (-1);	/* force read to occur below */
1824 1825
	}

T
Tom Lane 已提交
1826 1827
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
	if (readOff != targetPageOff)
1828
	{
T
Tom Lane 已提交
1829 1830 1831
		readOff = targetPageOff;
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
		{
1832
			elog(emode, "ReadRecord: lseek of log file %u, segment %u, offset %u failed: %m",
1833
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1834 1835
			goto next_record_is_invalid;
		}
1836
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1837
		{
1838
			elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1839
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1840 1841
			goto next_record_is_invalid;
		}
1842
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, nextmode))
1843 1844
			goto next_record_is_invalid;
	}
T
Tom Lane 已提交
1845
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
1846 1847
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
1848
		elog(emode, "ReadRecord: contrecord is requested by %X/%X",
1849
			 RecPtr->xlogid, RecPtr->xrecoff);
1850 1851
		goto next_record_is_invalid;
	}
1852
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1853 1854

got_record:;
B
Bruce Momjian 已提交
1855

T
Tom Lane 已提交
1856
	/*
B
Bruce Momjian 已提交
1857 1858
	 * Currently, xl_len == 0 must be bad data, but that might not be true
	 * forever.  See note in XLogInsert.
T
Tom Lane 已提交
1859
	 */
1860 1861
	if (record->xl_len == 0)
	{
1862
		elog(emode, "ReadRecord: record with zero length at %X/%X",
T
Tom Lane 已提交
1863
			 RecPtr->xlogid, RecPtr->xrecoff);
1864 1865
		goto next_record_is_invalid;
	}
B
Bruce Momjian 已提交
1866

T
Tom Lane 已提交
1867
	/*
B
Bruce Momjian 已提交
1868 1869
	 * Compute total length of record including any appended backup
	 * blocks.
T
Tom Lane 已提交
1870 1871 1872 1873 1874 1875 1876 1877
	 */
	total_len = SizeOfXLogRecord + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;
		total_len += sizeof(BkpBlock) + BLCKSZ;
	}
B
Bruce Momjian 已提交
1878

T
Tom Lane 已提交
1879 1880 1881 1882 1883 1884
	/*
	 * Make sure it will fit in buffer (currently, it is mechanically
	 * impossible for this test to fail, but it seems like a good idea
	 * anyway).
	 */
	if (total_len > _INTL_MAXLOGRECSZ)
1885
	{
1886
		elog(emode, "ReadRecord: record length %u at %X/%X too long",
T
Tom Lane 已提交
1887
			 total_len, RecPtr->xlogid, RecPtr->xrecoff);
1888 1889 1890 1891
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
1892
		elog(emode, "ReadRecord: invalid resource manager id %u at %X/%X",
1893
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1894 1895 1896
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
T
Tom Lane 已提交
1897 1898
	len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
	if (total_len > len)
1899
	{
T
Tom Lane 已提交
1900 1901
		/* Need to reassemble record */
		XLogContRecord *contrecord;
B
Bruce Momjian 已提交
1902
		uint32		gotlen = len;
1903

T
Tom Lane 已提交
1904
		memcpy(buffer, record, len);
1905
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
1906
		buffer += len;
1907
		for (;;)
1908
		{
T
Tom Lane 已提交
1909 1910
			readOff += BLCKSZ;
			if (readOff >= XLogSegSize)
1911 1912
			{
				close(readFile);
T
Tom Lane 已提交
1913 1914 1915
				readFile = -1;
				NextLogSeg(readId, readSeg);
				readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1916 1917
				if (readFile < 0)
					goto next_record_is_invalid;
T
Tom Lane 已提交
1918
				readOff = 0;
1919 1920
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1921
			{
1922
				elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1923
					 readId, readSeg, readOff);
T
Tom Lane 已提交
1924 1925
				goto next_record_is_invalid;
			}
1926
			if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1927
				goto next_record_is_invalid;
T
Tom Lane 已提交
1928
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
1929
			{
1930
				elog(emode, "ReadRecord: there is no ContRecord flag in log file %u, segment %u, offset %u",
1931
					 readId, readSeg, readOff);
1932 1933
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1934
			contrecord = (XLogContRecord *) ((char *) readBuf + SizeOfXLogPHD);
B
Bruce Momjian 已提交
1935
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
1936
				total_len != (contrecord->xl_rem_len + gotlen))
1937
			{
1938
				elog(emode, "ReadRecord: invalid ContRecord length %u in log file %u, segment %u, offset %u",
T
Tom Lane 已提交
1939
					 contrecord->xl_rem_len, readId, readSeg, readOff);
1940 1941
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1942 1943
			len = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
1944
			{
B
Bruce Momjian 已提交
1945
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		if (BLCKSZ - SizeOfXLogRecord >= SizeOfXLogPHD +
			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
		{
B
Bruce Momjian 已提交
1959
			nextRecord = (XLogRecord *) ((char *) contrecord +
T
Tom Lane 已提交
1960 1961 1962 1963
				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
B
Bruce Momjian 已提交
1964
			SizeOfXLogPHD + SizeOfXLogContRecord +
T
Tom Lane 已提交
1965 1966 1967
			MAXALIGN(contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		return record;
1968 1969
	}

T
Tom Lane 已提交
1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
	return (XLogRecord *) buffer;
1981

T
Tom Lane 已提交
1982 1983 1984 1985 1986
next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	return NULL;
1987 1988
}

1989 1990 1991 1992
/*
 * Sanity-check the page header of an xlog page that was just read.
 *
 * This is a helper for ReadRecord only; it relies on readId, readSeg and
 * readOff describing the page in question.  Returns true when the header
 * looks plausible (and then remembers its SUI in lastReadSUI); otherwise
 * reports the problem at level emode and returns false.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI)
{
	XLogRecPtr	expected;

	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		elog(emode, "ReadRecord: invalid magic number %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_magic, readId, readSeg, readOff);
		return false;
	}

	if (hdr->xlp_info & ~XLP_ALL_FLAGS)
	{
		elog(emode, "ReadRecord: invalid info bits %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_info, readId, readSeg, readOff);
		return false;
	}

	/* The page must claim the address we actually read it from. */
	expected.xlogid = readId;
	expected.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, expected))
	{
		elog(emode, "ReadRecord: unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
			 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
			 readId, readSeg, readOff);
		return false;
	}

	/*
	 * A SUI below the previous page's SUI, or more than a few counts above
	 * it, is not believable.  In theory as many as 512 shutdown checkpoint
	 * records could appear on a 32K-sized xlog page, so that is the largest
	 * legitimate jump.
	 *
	 * The comparison only makes sense when pages are read in sequence, so
	 * the caller says via checkSUI whether to apply it.
	 */
	if (checkSUI &&
		(hdr->xlp_sui < lastReadSUI || hdr->xlp_sui > lastReadSUI + 512))
	{
		/* translator: SUI = startup id */
		elog(emode, "ReadRecord: out-of-sequence SUI %u (after %u) in log file %u, segment %u, offset %u",
			 hdr->xlp_sui, lastReadSUI, readId, readSeg, readOff);
		return false;
	}

	lastReadSUI = hdr->xlp_sui;
	return true;
}

2047 2048 2049 2050
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
B
Bruce Momjian 已提交
2051
 * contents of pg_control.	WriteControlFile() initializes pg_control
2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */

void
XLOGPathInit(void)
{
	/* Init XLOG file paths */
2066
	snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
2067
	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
2068 2069 2070 2071 2072 2073
}

static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
2074
	char		buffer[BLCKSZ]; /* need not be aligned */
2075 2076 2077
	char	   *localeptr;

	/*
T
Tom Lane 已提交
2078
	 * Initialize version and compatibility-check fields
2079
	 */
T
Tom Lane 已提交
2080 2081
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
2082 2083
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->funcMaxArgs = FUNC_MAX_ARGS;

#ifdef HAVE_INT64_TIMESTAMP
	ControlFile->enableIntTimes = TRUE;
#else
	ControlFile->enableIntTimes = FALSE;
#endif

	ControlFile->localeBuflen = LOCALE_NAME_BUFLEN;
2095 2096
	localeptr = setlocale(LC_COLLATE, NULL);
	if (!localeptr)
2097
		elog(PANIC, "invalid LC_COLLATE setting");
2098 2099 2100
	StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
	localeptr = setlocale(LC_CTYPE, NULL);
	if (!localeptr)
2101
		elog(PANIC, "invalid LC_CTYPE setting");
2102
	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
B
Bruce Momjian 已提交
2103

T
Tom Lane 已提交
2104 2105
	/* Contents are protected with a CRC */
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2106 2107
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2108 2109 2110
			   sizeof(ControlFileData) - sizeof(crc64));
	FIN_CRC64(ControlFile->crc);

2111
	/*
B
Bruce Momjian 已提交
2112 2113 2114 2115 2116
	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
	 * over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail
	 * when we check the contents of the file, but hopefully with a more
	 * specific error than "couldn't read pg_control".
2117 2118
	 */
	if (sizeof(ControlFileData) > BLCKSZ)
2119
		elog(PANIC, "sizeof(ControlFileData) is larger than BLCKSZ; fix either one");
2120

2121 2122 2123
	memset(buffer, 0, BLCKSZ);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

2124 2125
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
2126
	if (fd < 0)
2127
		elog(PANIC, "WriteControlFile: could not create control file (%s): %m",
2128 2129
			 ControlFilePath);

2130
	errno = 0;
2131
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
2132 2133 2134 2135
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2136
		elog(PANIC, "WriteControlFile: write to control file failed: %m");
2137
	}
2138

2139
	if (pg_fsync(fd) != 0)
2140
		elog(PANIC, "WriteControlFile: fsync of control file failed: %m");
2141 2142 2143 2144 2145 2146 2147

	close(fd);
}

static void
ReadControlFile(void)
{
2148
	crc64		crc;
2149 2150 2151 2152 2153 2154 2155
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
2156
		elog(PANIC, "could not open control file (%s): %m", ControlFilePath);
2157 2158

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2159
		elog(PANIC, "read from control file failed: %m");
2160 2161 2162

	close(fd);

T
Tom Lane 已提交
2163 2164 2165 2166 2167 2168 2169
	/*
	 * Check for expected pg_control format version.  If this is wrong,
	 * the CRC check will likely fail because we'll be checking the wrong
	 * number of bytes.  Complaining about wrong version will probably be
	 * more enlightening than complaining about wrong CRC.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
2170
		elog(PANIC,
2171 2172 2173
			 "The database cluster was initialized with PG_CONTROL_VERSION %d,\n"
			 "\tbut the server was compiled with PG_CONTROL_VERSION %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2174 2175 2176
			 ControlFile->pg_control_version, PG_CONTROL_VERSION);

	/* Now check the CRC. */
2177
	INIT_CRC64(crc);
B
Bruce Momjian 已提交
2178 2179
	COMP_CRC64(crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2180
			   sizeof(ControlFileData) - sizeof(crc64));
2181 2182
	FIN_CRC64(crc);

T
Tom Lane 已提交
2183
	if (!EQ_CRC64(crc, ControlFile->crc))
2184
		elog(PANIC, "invalid checksum in control file");
2185

2186
	/*
B
Bruce Momjian 已提交
2187 2188
	 * Do compatibility checking immediately.  We do this here for 2
	 * reasons:
2189
	 *
B
Bruce Momjian 已提交
2190 2191
	 * (1) if the database isn't compatible with the backend executable, we
	 * want to abort before we can possibly do any damage;
2192 2193 2194
	 *
	 * (2) this code is executed in the postmaster, so the setlocale() will
	 * propagate to forked backends, which aren't going to read this file
B
Bruce Momjian 已提交
2195
	 * for themselves.	(These locale settings are considered critical
2196 2197
	 * compatibility items because they can affect sort order of indexes.)
	 */
T
Tom Lane 已提交
2198
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
2199
		elog(PANIC,
2200
			 "The database cluster was initialized with CATALOG_VERSION_NO %d,\n"
B
Bruce Momjian 已提交
2201
		   "\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n"
2202
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2203
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
2204
	if (ControlFile->blcksz != BLCKSZ)
2205
		elog(PANIC,
2206 2207 2208
			 "The database cluster was initialized with BLCKSZ %d,\n"
			 "\tbut the backend was compiled with BLCKSZ %d.\n"
			 "\tIt looks like you need to initdb.",
2209 2210
			 ControlFile->blcksz, BLCKSZ);
	if (ControlFile->relseg_size != RELSEG_SIZE)
2211
		elog(PANIC,
2212 2213
			 "The database cluster was initialized with RELSEG_SIZE %d,\n"
			 "\tbut the backend was compiled with RELSEG_SIZE %d.\n"
2214
			 "\tIt looks like you need to recompile or initdb.",
2215
			 ControlFile->relseg_size, RELSEG_SIZE);
2216 2217 2218 2219 2220 2221 2222 2223 2224 2225

	if (ControlFile->nameDataLen != NAMEDATALEN)
		elog(PANIC,
			 "The database cluster was initialized with NAMEDATALEN %d,\n"
			 "\tbut the backend was compiled with NAMEDATALEN %d.\n"
			 "\tIt looks like you need to recompile or initdb.",
			 ControlFile->nameDataLen, NAMEDATALEN);

	if (ControlFile->funcMaxArgs != FUNC_MAX_ARGS)
		elog(PANIC,
B
Bruce Momjian 已提交
2226
		  "The database cluster was initialized with FUNC_MAX_ARGS %d,\n"
2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239
			 "\tbut the backend was compiled with FUNC_MAX_ARGS %d.\n"
			 "\tIt looks like you need to recompile or initdb.",
			 ControlFile->funcMaxArgs, FUNC_MAX_ARGS);

#ifdef HAVE_INT64_TIMESTAMP
	if (ControlFile->enableIntTimes != TRUE)
		elog(PANIC,
			 "The database cluster was initialized without HAVE_INT64_TIMESTAMP\n"
			 "\tbut the backend was compiled with HAVE_INT64_TIMESTAMP.\n"
			 "\tIt looks like you need to recompile or initdb.");
#else
	if (ControlFile->enableIntTimes != FALSE)
		elog(PANIC,
B
Bruce Momjian 已提交
2240 2241
		"The database cluster was initialized with HAVE_INT64_TIMESTAMP\n"
		 "\tbut the backend was compiled without HAVE_INT64_TIMESTAMP.\n"
2242 2243 2244 2245 2246 2247
			 "\tIt looks like you need to recompile or initdb.");
#endif

	if (ControlFile->localeBuflen != LOCALE_NAME_BUFLEN)
		elog(PANIC,
			 "The database cluster was initialized with LOCALE_NAME_BUFLEN %d,\n"
B
Bruce Momjian 已提交
2248
		   "\tbut the backend was compiled with LOCALE_NAME_BUFLEN %d.\n"
2249 2250 2251
			 "\tIt looks like you need to initdb.",
			 ControlFile->localeBuflen, LOCALE_NAME_BUFLEN);

2252
	if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
2253
		elog(PANIC,
B
Bruce Momjian 已提交
2254
		   "The database cluster was initialized with LC_COLLATE '%s',\n"
2255 2256
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2257 2258
			 ControlFile->lc_collate);
	if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
2259
		elog(PANIC,
2260 2261 2262
			 "The database cluster was initialized with LC_CTYPE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2263
			 ControlFile->lc_ctype);
2264 2265 2266 2267 2268 2269

	/* Make the fixed locale settings visible as GUC variables, too */
	SetConfigOption("lc_collate", ControlFile->lc_collate,
					PGC_INTERNAL, PGC_S_OVERRIDE);
	SetConfigOption("lc_ctype", ControlFile->lc_ctype,
					PGC_INTERNAL, PGC_S_OVERRIDE);
2270 2271
}

2272
void
2273
UpdateControlFile(void)
2274
{
2275
	int			fd;
2276

2277
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2278 2279
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2280
			   sizeof(ControlFileData) - sizeof(crc64));
2281 2282
	FIN_CRC64(ControlFile->crc);

2283
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
2284
	if (fd < 0)
2285
		elog(PANIC, "could not open control file (%s): %m", ControlFilePath);
2286

2287
	errno = 0;
2288
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2289 2290 2291 2292
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2293
		elog(PANIC, "write to control file failed: %m");
2294
	}
2295

2296
	if (pg_fsync(fd) != 0)
2297
		elog(PANIC, "fsync of control file failed: %m");
2298 2299 2300 2301

	close(fd);
}

2302
/*
T
Tom Lane 已提交
2303
 * Initialization of shared memory for XLOG
2304 2305
 */

2306
int
2307
XLOGShmemSize(void)
2308 2309 2310 2311
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

T
Tom Lane 已提交
2312 2313 2314
	return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
		+ BLCKSZ * XLOGbuffers +
		MAXALIGN(sizeof(ControlFileData));
2315 2316 2317 2318 2319
}

void
XLOGShmemInit(void)
{
2320
	bool		found;
2321

2322
	/* this must agree with space requested by XLOGShmemSize() */
2323 2324 2325
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

2326
	XLogCtl = (XLogCtlData *)
T
Tom Lane 已提交
2327 2328 2329 2330 2331
		ShmemInitStruct("XLOG Ctl",
						MAXALIGN(sizeof(XLogCtlData) +
								 sizeof(XLogRecPtr) * XLOGbuffers)
						+ BLCKSZ * XLOGbuffers,
						&found);
2332
	Assert(!found);
2333 2334 2335 2336
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &found);
	Assert(!found);

T
Tom Lane 已提交
2337
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
2338

T
Tom Lane 已提交
2339 2340 2341 2342 2343 2344 2345 2346
	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
	 * a multiple of the alignment for same, so no extra alignment padding
	 * is needed here.
	 */
	XLogCtl->xlblocks = (XLogRecPtr *)
		(((char *) XLogCtl) + sizeof(XLogCtlData));
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
B
Bruce Momjian 已提交
2347

T
Tom Lane 已提交
2348
	/*
B
Bruce Momjian 已提交
2349 2350
	 * Here, on the other hand, we must MAXALIGN to ensure the page
	 * buffers have worst-case alignment.
T
Tom Lane 已提交
2351 2352 2353 2354 2355 2356 2357
	 */
	XLogCtl->pages =
		((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
									  sizeof(XLogRecPtr) * XLOGbuffers);
	memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);

	/*
B
Bruce Momjian 已提交
2358 2359
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will
	 * fill in additional info.)
T
Tom Lane 已提交
2360 2361 2362 2363
	 */
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
2364
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
2365

2366 2367 2368 2369 2370 2371 2372
	/*
	 * If we are not in bootstrap mode, pg_control should already exist.
	 * Read and validate it immediately (see comments in ReadControlFile()
	 * for the reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
2373 2374 2375
}

/*
T
Tom Lane 已提交
2376 2377
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
2378 2379
 */
void
T
Tom Lane 已提交
2380
BootStrapXLOG(void)
2381
{
2382
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2383 2384
	char	   *buffer;
	XLogPageHeader page;
2385
	XLogRecord *record;
B
Bruce Momjian 已提交
2386
	bool		use_existent;
2387
	crc64		crc;
2388

T
Tom Lane 已提交
2389 2390 2391 2392
	/* Use malloc() to ensure buffer is MAXALIGNED */
	buffer = (char *) malloc(BLCKSZ);
	page = (XLogPageHeader) buffer;

2393 2394 2395
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
T
Tom Lane 已提交
2396
	checkPoint.ThisStartUpID = 0;
2397
	checkPoint.nextXid = FirstNormalTransactionId;
2398
	checkPoint.nextOid = BootstrapObjectIdData;
T
Tom Lane 已提交
2399
	checkPoint.time = time(NULL);
2400

2401 2402 2403 2404
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;

2405 2406 2407
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
2408
	page->xlp_sui = checkPoint.ThisStartUpID;
2409 2410
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
2411 2412 2413
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
2414 2415 2416
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
2417
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
2418
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
2419
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
2420

2421
	INIT_CRC64(crc);
T
Tom Lane 已提交
2422
	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
B
Bruce Momjian 已提交
2423
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
2424
			   SizeOfXLogRecord - sizeof(crc64));
2425 2426 2427
	FIN_CRC64(crc);
	record->xl_crc = crc;

2428 2429
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
2430

2431
	errno = 0;
T
Tom Lane 已提交
2432
	if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
2433 2434 2435 2436
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2437
		elog(PANIC, "BootStrapXLOG failed to write log file: %m");
2438
	}
2439

T
Tom Lane 已提交
2440
	if (pg_fsync(openLogFile) != 0)
2441
		elog(PANIC, "BootStrapXLOG failed to fsync log file: %m");
2442

T
Tom Lane 已提交
2443 2444
	close(openLogFile);
	openLogFile = -1;
2445

2446
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
2447 2448 2449
	/* Initialize pg_control status fields */
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
2450 2451 2452
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
2453
	ControlFile->checkPointCopy = checkPoint;
2454
	/* some additional ControlFile fields are set in WriteControlFile() */
2455

2456
	WriteControlFile();
2457 2458 2459

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
2460 2461
}

2462
/*
 * Format a time_t as local time, "YYYY-MM-DD HH:MM:SS TZ".
 *
 * The result lives in a static buffer that is overwritten by the next call,
 * so callers must use it before calling again.
 *
 * If the time cannot be converted or formatted, a placeholder containing
 * the raw numeric value is returned instead of passing a NULL struct tm to
 * strftime() or returning a buffer with indeterminate contents.
 */
static char *
str_time(time_t tnow)
{
	/* generous size: time zone names can be much longer than three letters */
	static char buf[128];
	struct tm  *tm = localtime(&tnow);

	if (tm == NULL ||
		strftime(buf, sizeof(buf),
				 "%Y-%m-%d %H:%M:%S %Z",
				 tm) == 0)
		snprintf(buf, sizeof(buf), "(unprintable time %ld)", (long) tnow);

	return buf;
}

/*
T
Tom Lane 已提交
2475
 * This must be called ONCE during postmaster or standalone-backend startup
2476 2477
 */
void
T
Tom Lane 已提交
2478
StartupXLOG(void)
2479
{
2480 2481
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2482
	bool		wasShutdown;
2483
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
2484 2485 2486
				LastRec,
				checkPointLoc,
				EndOfLog;
2487
	XLogRecord *record;
T
Tom Lane 已提交
2488
	char	   *buffer;
2489

T
Tom Lane 已提交
2490 2491
	/* Use malloc() to ensure record buffer is MAXALIGNED */
	buffer = (char *) malloc(_INTL_MAXLOGRECSZ);
2492

T
Tom Lane 已提交
2493
	CritSectionCount++;
2494 2495

	/*
2496 2497
	 * Read control file and check XLOG status looks valid.
	 *
B
Bruce Momjian 已提交
2498 2499
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
2500
	 */
2501
	ReadControlFile();
2502

2503 2504 2505
	if (ControlFile->logSeg == 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
2506
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
2507
		elog(PANIC, "control file context is broken");
2508 2509

	if (ControlFile->state == DB_SHUTDOWNED)
2510
		elog(LOG, "database system was shut down at %s",
2511
			 str_time(ControlFile->time));
2512
	else if (ControlFile->state == DB_SHUTDOWNING)
2513
		elog(LOG, "database system shutdown was interrupted at %s",
2514
			 str_time(ControlFile->time));
2515
	else if (ControlFile->state == DB_IN_RECOVERY)
2516
		elog(LOG, "database system was interrupted being in recovery at %s\n"
T
Tom Lane 已提交
2517
			 "\tThis probably means that some data blocks are corrupted\n"
2518
			 "\tand you will have to use the last backup for recovery.",
2519
			 str_time(ControlFile->time));
2520
	else if (ControlFile->state == DB_IN_PRODUCTION)
2521
		elog(LOG, "database system was interrupted at %s",
2522
			 str_time(ControlFile->time));
2523

2524 2525 2526 2527 2528 2529
	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
	if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED)
		sleep(60);
#endif

T
Tom Lane 已提交
2530 2531 2532 2533
	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
2534
	record = ReadCheckpointRecord(ControlFile->checkPoint, 1, buffer);
T
Tom Lane 已提交
2535 2536 2537
	if (record != NULL)
	{
		checkPointLoc = ControlFile->checkPoint;
2538
		elog(LOG, "checkpoint record is at %X/%X",
T
Tom Lane 已提交
2539 2540 2541 2542
			 checkPointLoc.xlogid, checkPointLoc.xrecoff);
	}
	else
	{
2543
		record = ReadCheckpointRecord(ControlFile->prevCheckPoint, 2, buffer);
T
Tom Lane 已提交
2544 2545 2546
		if (record != NULL)
		{
			checkPointLoc = ControlFile->prevCheckPoint;
2547
			elog(LOG, "using previous checkpoint record at %X/%X",
T
Tom Lane 已提交
2548 2549 2550 2551
				 checkPointLoc.xlogid, checkPointLoc.xrecoff);
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
2552
			elog(PANIC, "unable to locate a valid checkpoint record");
T
Tom Lane 已提交
2553 2554 2555 2556
	}
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
2557

2558
	elog(LOG, "redo record is at %X/%X; undo record is at %X/%X; shutdown %s",
2559
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
2560
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
T
Tom Lane 已提交
2561
		 wasShutdown ? "TRUE" : "FALSE");
2562
	elog(LOG, "next transaction id: %u; next oid: %u",
2563
		 checkPoint.nextXid, checkPoint.nextOid);
2564
	if (!TransactionIdIsNormal(checkPoint.nextXid))
2565
		elog(PANIC, "invalid next transaction id");
2566 2567 2568

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
2569
	ShmemVariableCache->oidCount = 0;
2570

V
WAL  
Vadim B. Mikheev 已提交
2571
	ThisStartUpID = checkPoint.ThisStartUpID;
B
Bruce Momjian 已提交
2572
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
2573
		XLogCtl->SavedRedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
2574

2575
	if (XLByteLT(RecPtr, checkPoint.redo))
2576
		elog(PANIC, "invalid redo in checkpoint record");
2577 2578 2579
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;

B
Bruce Momjian 已提交
2580
	if (XLByteLT(checkPoint.undo, RecPtr) ||
V
Vadim B. Mikheev 已提交
2581
		XLByteLT(checkPoint.redo, RecPtr))
2582
	{
T
Tom Lane 已提交
2583
		if (wasShutdown)
2584
			elog(PANIC, "invalid redo/undo record in shutdown checkpoint");
V
WAL  
Vadim B. Mikheev 已提交
2585
		InRecovery = true;
2586 2587
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
2588
		InRecovery = true;
2589

V
WAL  
Vadim B. Mikheev 已提交
2590 2591
	/* REDO */
	if (InRecovery)
2592
	{
2593 2594
		int		rmid;

2595
		elog(LOG, "database system was not properly shut down; "
2596
			 "automatic recovery in progress");
2597 2598 2599 2600
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

2601
		/* Start up the recovery environment */
V
WAL  
Vadim B. Mikheev 已提交
2602
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
2603

2604 2605 2606 2607 2608 2609
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

2610 2611
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
2612
			record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
B
Bruce Momjian 已提交
2613
		else
2614 2615
		{
			/* read past CheckPoint record */
T
Tom Lane 已提交
2616
			record = ReadRecord(NULL, LOG, buffer);
2617
		}
2618

T
Tom Lane 已提交
2619
		if (record != NULL)
2620
		{
V
WAL  
Vadim B. Mikheev 已提交
2621
			InRedo = true;
2622
			elog(LOG, "redo starts at %X/%X",
2623
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2624 2625
			do
			{
2626 2627
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
2628
											ShmemVariableCache->nextXid))
2629 2630 2631 2632
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}
V
WAL  
Vadim B. Mikheev 已提交
2633 2634
				if (XLOG_DEBUG)
				{
B
Bruce Momjian 已提交
2635
					char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
2636

2637
					sprintf(buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
2638 2639
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
							EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
2640 2641
					xlog_outrec(buf, record);
					strcat(buf, " - ");
B
Bruce Momjian 已提交
2642 2643
					RmgrTable[record->xl_rmid].rm_desc(buf,
								record->xl_info, XLogRecGetData(record));
2644
					elog(LOG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
2645 2646
				}

T
Tom Lane 已提交
2647
				if (record->xl_info & XLR_BKP_BLOCK_MASK)
2648 2649
					RestoreBkpBlocks(record, EndRecPtr);

2650
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
T
Tom Lane 已提交
2651 2652
				record = ReadRecord(NULL, LOG, buffer);
			} while (record != NULL);
2653
			elog(LOG, "redo done at %X/%X",
2654
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2655
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2656
			InRedo = false;
2657 2658
		}
		else
2659
			elog(LOG, "redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
2660 2661
	}

T
Tom Lane 已提交
2662 2663 2664 2665
	/*
	 * Init xlog buffer cache using the block containing the last valid
	 * record from the previous incarnation.
	 */
2666
	record = ReadRecord(&LastRec, PANIC, buffer);
T
Tom Lane 已提交
2667 2668 2669 2670 2671 2672
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, openLogId, openLogSeg);
	openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
	openLogOff = 0;
	ControlFile->logId = openLogId;
	ControlFile->logSeg = openLogSeg + 1;
V
WAL  
Vadim B. Mikheev 已提交
2673
	Insert = &XLogCtl->Insert;
2674
	Insert->PrevRecord = LastRec;
B
Bruce Momjian 已提交
2675 2676

	/*
2677 2678
	 * If the next record will go to the new page then initialize for that
	 * one.
T
Tom Lane 已提交
2679
	 */
2680 2681 2682 2683
	if ((BLCKSZ - EndOfLog.xrecoff % BLCKSZ) < SizeOfXLogRecord)
		EndOfLog.xrecoff += (BLCKSZ - EndOfLog.xrecoff % BLCKSZ);
	if (EndOfLog.xrecoff % BLCKSZ == 0)
	{
2684 2685 2686 2687
		XLogRecPtr	NewPageEndPtr;

		NewPageEndPtr = EndOfLog;
		if (NewPageEndPtr.xrecoff >= XLogFileSize)
2688
		{
2689 2690 2691
			/* crossing a logid boundary */
			NewPageEndPtr.xlogid += 1;
			NewPageEndPtr.xrecoff = BLCKSZ;
2692 2693
		}
		else
2694 2695
			NewPageEndPtr.xrecoff += BLCKSZ;
		XLogCtl->xlblocks[0] = NewPageEndPtr;
2696 2697 2698 2699 2700
		Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC;
		if (InRecovery)
			Insert->currpage->xlp_sui = ThisStartUpID;
		else
			Insert->currpage->xlp_sui = ThisStartUpID + 1;
2701 2702
		Insert->currpage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
		Insert->currpage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
2703
		/* rest of buffer was zeroed in XLOGShmemInit */
2704
		Insert->currpos = (char *) Insert->currpage + SizeOfXLogPHD;
2705 2706 2707 2708 2709 2710
	}
	else
	{
		XLogCtl->xlblocks[0].xlogid = openLogId;
		XLogCtl->xlblocks[0].xrecoff =
			((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
2711

2712 2713
		/*
		 * Tricky point here: readBuf contains the *last* block that the
2714
		 * LastRec record spans, not the one it starts in.	The last block
2715
		 * is indeed the one we want to use.
2716 2717 2718 2719 2720 2721 2722 2723
		 */
		Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
		memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
		Insert->currpos = (char *) Insert->currpage +
			(EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
		/* Make sure rest of page is zero */
		memset(Insert->currpos, 0, INSERT_FREESPACE(Insert));
	}
V
WAL  
Vadim B. Mikheev 已提交
2724

T
Tom Lane 已提交
2725
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
2726

T
Tom Lane 已提交
2727 2728 2729
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
2730

T
Tom Lane 已提交
2731 2732
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
2733

V
Vadim B. Mikheev 已提交
2734
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
2735 2736 2737
	/* UNDO */
	if (InRecovery)
	{
2738 2739 2740
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
2741
			elog(LOG, "undo starts at %X/%X",
2742
				 RecPtr.xlogid, RecPtr.xrecoff);
2743 2744
			do
			{
2745
				record = ReadRecord(&RecPtr, PANIC, buffer);
2746
				if (TransactionIdIsValid(record->xl_xid) &&
2747
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
2748
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
2749 2750
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
2751
			elog(LOG, "undo done at %X/%X",
2752
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2753 2754
		}
		else
2755
			elog(LOG, "undo is not required");
2756
	}
V
WAL  
Vadim B. Mikheev 已提交
2757
#endif
2758

V
WAL  
Vadim B. Mikheev 已提交
2759
	if (InRecovery)
2760
	{
2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776
		int		rmid;

		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

		/* suppress in-transaction check in CreateCheckPoint */
		MyLastRecPtr.xrecoff = 0;
		MyXactMadeXLogEntry = false;
		MyXactMadeTempRelUpdate = false;

T
Tom Lane 已提交
2777
		/*
2778 2779
		 * Perform a new checkpoint to update our recovery activity to disk.
		 *
T
Tom Lane 已提交
2780 2781 2782 2783 2784 2785
		 * In case we had to use the secondary checkpoint, make sure that
		 * it will still be shown as the secondary checkpoint after this
		 * CreateCheckPoint operation; we don't want the broken primary
		 * checkpoint to become prevCheckPoint...
		 */
		ControlFile->checkPoint = checkPointLoc;
2786
		CreateCheckPoint(true, true);
2787 2788 2789 2790

		/*
		 * Close down recovery environment
		 */
V
WAL  
Vadim B. Mikheev 已提交
2791
		XLogCloseRelationCache();
2792
	}
2793

T
Tom Lane 已提交
2794 2795 2796 2797
	/*
	 * Preallocate additional log files, if wanted.
	 */
	PreallocXlogFiles(EndOfLog);
2798

V
WAL  
Vadim B. Mikheev 已提交
2799
	InRecovery = false;
2800 2801 2802 2803 2804

	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2805 2806 2807
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

2808 2809 2810
	/* Start up the commit log, too */
	StartupCLOG();

2811
	elog(LOG, "database system is ready");
2812
	CritSectionCount--;
2813

T
Tom Lane 已提交
2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	free(buffer);
}

2829 2830 2831 2832
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 * whichChkpt = 1 for "primary", 2 for "secondary", merely informative
 */
T
Tom Lane 已提交
2833 2834
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr,
2835
					 int whichChkpt,
T
Tom Lane 已提交
2836 2837 2838 2839 2840 2841
					 char *buffer)
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
2842 2843 2844
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint link in control file" :
				   "invalid secondary checkpoint link in control file"));
T
Tom Lane 已提交
2845 2846 2847 2848 2849 2850 2851
		return NULL;
	}

	record = ReadRecord(&RecPtr, LOG, buffer);

	if (record == NULL)
	{
2852 2853 2854
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint record" :
				   "invalid secondary checkpoint record"));
T
Tom Lane 已提交
2855 2856 2857 2858
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
2859
		elog(LOG, (whichChkpt == 1 ?
2860 2861
			 "invalid resource manager id in primary checkpoint record" :
		  "invalid resource manager id in secondary checkpoint record"));
T
Tom Lane 已提交
2862 2863 2864 2865 2866
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
2867 2868 2869
		elog(LOG, (whichChkpt == 1 ?
				   "invalid xl_info in primary checkpoint record" :
				   "invalid xl_info in secondary checkpoint record"));
T
Tom Lane 已提交
2870 2871 2872 2873
		return NULL;
	}
	if (record->xl_len != sizeof(CheckPoint))
	{
2874 2875 2876
		elog(LOG, (whichChkpt == 1 ?
				   "invalid length of primary checkpoint record" :
				   "invalid length of secondary checkpoint record"));
T
Tom Lane 已提交
2877 2878 2879
		return NULL;
	}
	return record;
2880 2881
}

V
WAL  
Vadim B. Mikheev 已提交
2882
/*
T
Tom Lane 已提交
2883
 * Postmaster uses this to initialize ThisStartUpID & RedoRecPtr from
2884
 * XLogCtlData located in shmem after successful startup.
V
WAL  
Vadim B. Mikheev 已提交
2885 2886 2887 2888 2889
 */
void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
2890
	RedoRecPtr = XLogCtl->SavedRedoRecPtr;
2891 2892 2893
}

/*
T
Tom Lane 已提交
2894
 * CheckPoint process called by postmaster saves copy of new RedoRecPtr
2895 2896 2897 2898 2899 2900
 * in shmem (using SetSavedRedoRecPtr).  When checkpointer completes,
 * postmaster calls GetSavedRedoRecPtr to update its own copy of RedoRecPtr,
 * so that subsequently-spawned backends will start out with a reasonably
 * up-to-date local RedoRecPtr.  Since these operations are not protected by
 * any lock and copying an XLogRecPtr isn't atomic, it's unsafe to use either
 * of these routines at other times!
2901 2902
 */
void
2903
SetSavedRedoRecPtr(void)
2904
{
2905
	XLogCtl->SavedRedoRecPtr = RedoRecPtr;
2906 2907 2908
}

void
2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919
GetSavedRedoRecPtr(void)
{
	RedoRecPtr = XLogCtl->SavedRedoRecPtr;
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
2920 2921
GetRedoRecPtr(void)
{
2922 2923 2924 2925 2926 2927 2928 2929 2930
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
	SpinLockRelease_NoHoldoff(&xlogctl->info_lck);

	return RedoRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2931 2932
}

2933
/*
T
Tom Lane 已提交
2934
 * This must be called ONCE during postmaster or standalone-backend shutdown
2935 2936
 */
void
T
Tom Lane 已提交
2937
ShutdownXLOG(void)
2938
{
2939
	elog(LOG, "shutting down");
2940

T
Tom Lane 已提交
2941 2942
	/* suppress in-transaction check in CreateCheckPoint */
	MyLastRecPtr.xrecoff = 0;
2943
	MyXactMadeXLogEntry = false;
2944
	MyXactMadeTempRelUpdate = false;
T
Tom Lane 已提交
2945

2946
	CritSectionCount++;
V
Vadim B. Mikheev 已提交
2947
	CreateDummyCaches();
2948
	CreateCheckPoint(true, true);
2949
	ShutdownCLOG();
2950
	CritSectionCount--;
2951

2952
	elog(LOG, "database system is shut down");
2953 2954
}

T
Tom Lane 已提交
2955 2956
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
2957 2958 2959
 *
 * If force is true, we force a checkpoint regardless of whether any XLOG
 * activity has occurred since the last one.
T
Tom Lane 已提交
2960
 */
2961
void
2962
CreateCheckPoint(bool shutdown, bool force)
2963
{
2964 2965 2966
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
2967
	XLogRecData rdata;
2968
	uint32		freespace;
V
Vadim B. Mikheev 已提交
2969 2970 2971
	uint32		_logId;
	uint32		_logSeg;

2972
	if (MyXactMadeXLogEntry)
V
Vadim B. Mikheev 已提交
2973
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
B
Bruce Momjian 已提交
2974

2975 2976
	/*
	 * The CheckpointLock can be held for quite a while, which is not good
2977 2978 2979 2980 2981
	 * because we won't respond to a cancel/die request while waiting for
	 * an LWLock.  (But the alternative of using a regular lock won't work
	 * for background checkpoint processes, which are not regular
	 * backends.) So, rather than use a plain LWLockAcquire, use this
	 * kluge to allow an interrupt to be accepted while we are waiting:
2982 2983
	 */
	while (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
V
Vadim B. Mikheev 已提交
2984
	{
2985 2986
		CHECK_FOR_INTERRUPTS();
		sleep(1);
V
Vadim B. Mikheev 已提交
2987
	}
2988

2989 2990
	START_CRIT_SECTION();

2991 2992 2993 2994 2995 2996
	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
T
Tom Lane 已提交
2997 2998

	memset(&checkPoint, 0, sizeof(checkPoint));
V
WAL  
Vadim B. Mikheev 已提交
2999
	checkPoint.ThisStartUpID = ThisStartUpID;
T
Tom Lane 已提交
3000
	checkPoint.time = time(NULL);
3001

3002
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
3003 3004

	/*
3005 3006 3007 3008
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
B
Bruce Momjian 已提交
3009 3010
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
3011 3012
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
3013 3014
	 *
	 * We have to make two tests to determine that nothing has happened since
B
Bruce Momjian 已提交
3015 3016 3017
	 * the start of the last checkpoint: current insertion point must
	 * match the end of the last checkpoint record, and its redo pointer
	 * must point to itself.
T
Tom Lane 已提交
3018
	 */
3019
	if (!shutdown && !force)
T
Tom Lane 已提交
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
3032 3033
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're
	 * off doing the buffer flush work.  Those XLOG records are logically
B
Bruce Momjian 已提交
3045
	 * after the checkpoint, even though physically before it.	Got that?
T
Tom Lane 已提交
3046 3047
	 */
	freespace = INSERT_FREESPACE(Insert);
3048 3049
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
3050 3051
		(void) AdvanceXLInsertBuffer();
		/* OK to ignore update return flag, since we will do flush anyway */
3052 3053
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
T
Tom Lane 已提交
3054
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
3055

T
Tom Lane 已提交
3056 3057
	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls;
3058
	 * this must be done while holding the insert lock AND the info_lck.
T
Tom Lane 已提交
3059
	 */
3060 3061 3062 3063 3064 3065 3066 3067
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
B
Bruce Momjian 已提交
3068

T
Tom Lane 已提交
3069
	/*
B
Bruce Momjian 已提交
3070 3071
	 * Get UNDO record ptr - this is oldest of PGPROC->logRec values. We
	 * do this while holding insert lock to ensure that we won't miss any
B
Bruce Momjian 已提交
3072 3073
	 * about-to-commit transactions (UNDO must include all xacts that have
	 * commits after REDO point).
3074 3075 3076
	 *
	 * XXX temporarily ifdef'd out to avoid three-way deadlock condition:
	 * GetUndoRecPtr needs to grab SInvalLock to ensure that it is looking
B
Bruce Momjian 已提交
3077 3078 3079 3080 3081 3082
	 * at a stable set of proc records, but grabbing SInvalLock while
	 * holding WALInsertLock is no good.  GetNewTransactionId may cause a
	 * WAL record to be written while holding XidGenLock, and
	 * GetSnapshotData needs to get XidGenLock while holding SInvalLock,
	 * so there's a risk of deadlock. Need to find a better solution.  See
	 * pgsql-hackers discussion of 17-Dec-01.
T
Tom Lane 已提交
3083
	 */
3084
#ifdef NOT_USED
T
Tom Lane 已提交
3085 3086 3087
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
3088
		elog(PANIC, "active transaction while database system is shutting down");
3089
#endif
T
Tom Lane 已提交
3090 3091 3092 3093 3094

	/*
	 * Now we can release insert lock, allowing other xacts to proceed
	 * even while we are flushing disk buffers.
	 */
3095
	LWLockRelease(WALInsertLock);
3096

3097
	LWLockAcquire(XidGenLock, LW_SHARED);
3098
	checkPoint.nextXid = ShmemVariableCache->nextXid;
3099
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
3100

3101
	LWLockAcquire(OidGenLock, LW_SHARED);
3102
	checkPoint.nextOid = ShmemVariableCache->nextOid;
3103 3104
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
3105
	LWLockRelease(OidGenLock);
3106

T
Tom Lane 已提交
3107
	/*
B
Bruce Momjian 已提交
3108
	 * Having constructed the checkpoint record, ensure all shmem disk
3109
	 * buffers and commit-log buffers are flushed to disk.
T
Tom Lane 已提交
3110
	 */
3111
	CheckPointCLOG();
3112
	FlushBufferPool();
3113

T
Tom Lane 已提交
3114 3115 3116
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
3117
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3118
	rdata.data = (char *) (&checkPoint);
3119 3120 3121
	rdata.len = sizeof(checkPoint);
	rdata.next = NULL;

T
Tom Lane 已提交
3122 3123 3124 3125 3126 3127
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
3128

T
Tom Lane 已提交
3129 3130 3131 3132 3133
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record,
	 * recptr = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
3134
		elog(PANIC, "concurrent transaction log activity while database system is shutting down");
3135

T
Tom Lane 已提交
3136
	/*
3137 3138 3139 3140 3141 3142 3143
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
	 *
	 * With UNDO support: oldest item is redo or undo, whichever is older;
	 * but watch out for case that undo = 0.
	 *
	 * Without UNDO support: just use the redo pointer.  This allows xlog
3144 3145
	 * space to be freed much faster when there are long-running
	 * transactions.
T
Tom Lane 已提交
3146
	 */
3147
#ifdef NOT_USED
B
Bruce Momjian 已提交
3148
	if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
T
Tom Lane 已提交
3149 3150 3151 3152
		XLByteLT(ControlFile->checkPointCopy.undo,
				 ControlFile->checkPointCopy.redo))
		XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
	else
3153
#endif
T
Tom Lane 已提交
3154
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
3155

T
Tom Lane 已提交
3156 3157 3158
	/*
	 * Update the control file.
	 */
3159
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3160 3161
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
3162 3163 3164
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
3165 3166
	ControlFile->time = time(NULL);
	UpdateControlFile();
3167
	LWLockRelease(ControlFileLock);
3168

V
Vadim B. Mikheev 已提交
3169
	/*
T
Tom Lane 已提交
3170 3171
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
V
Vadim B. Mikheev 已提交
3172 3173 3174
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
3175
		PrevLogSeg(_logId, _logSeg);
3176
		MoveOfflineLogs(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
3177 3178
	}

T
Tom Lane 已提交
3179 3180 3181 3182 3183 3184 3185 3186
	/*
	 * Make more log segments if needed.  (Do this after deleting offline
	 * log segments, to avoid having peak disk space usage higher than
	 * necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

3187
	LWLockRelease(CheckpointLock);
V
Vadim B. Mikheev 已提交
3188

3189
	END_CRIT_SECTION();
3190
}
V
WAL  
Vadim B. Mikheev 已提交
3191

T
Tom Lane 已提交
3192 3193 3194
/*
 * Write a NEXTOID log record
 */
3195 3196 3197
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
3198
	XLogRecData rdata;
3199

3200
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3201
	rdata.data = (char *) (&nextOid);
3202 3203 3204 3205
	rdata.len = sizeof(Oid);
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
}
V
WAL  
Vadim B. Mikheev 已提交
3206

T
Tom Lane 已提交
3207 3208 3209
/*
 * XLOG resource manager's routines
 */
V
WAL  
Vadim B. Mikheev 已提交
3210 3211 3212
/*
 * Redo routine of the XLOG resource manager.
 *
 * Replays NEXTOID and checkpoint records by adjusting the nextXid, nextOid
 * and oidCount fields of ShmemVariableCache.  Any other record type is
 * ignored, and lsn is not used.
 */
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		rectype = record->xl_info & ~XLR_INFO_MASK;
	CheckPoint	ckpt;

	if (rectype == XLOG_NEXTOID)
	{
		Oid			loggedOid;

		memcpy(&loggedOid, XLogRecGetData(record), sizeof(Oid));
		/* the OID counter is only ever moved forward */
		if (loggedOid > ShmemVariableCache->nextOid)
		{
			ShmemVariableCache->nextOid = loggedOid;
			ShmemVariableCache->oidCount = 0;
		}
		return;
	}

	if (rectype != XLOG_CHECKPOINT_SHUTDOWN &&
		rectype != XLOG_CHECKPOINT_ONLINE)
		return;					/* not a record type handled here */

	memcpy(&ckpt, XLogRecGetData(record), sizeof(CheckPoint));

	if (rectype == XLOG_CHECKPOINT_SHUTDOWN)
	{
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = ckpt.nextXid;
		ShmemVariableCache->nextOid = ckpt.nextOid;
		ShmemVariableCache->oidCount = 0;
	}
	else
	{
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  ckpt.nextXid))
			ShmemVariableCache->nextXid = ckpt.nextXid;
		if (ckpt.nextOid > ShmemVariableCache->nextOid)
		{
			ShmemVariableCache->nextOid = ckpt.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
	}
}
B
Bruce Momjian 已提交
3252

V
WAL  
Vadim B. Mikheev 已提交
3253 3254 3255 3256
/*
 * Undo routine of the XLOG resource manager.  It performs no action;
 * neither argument is examined.
 */
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
	/* nothing to undo */
}
B
Bruce Momjian 已提交
3257

V
WAL  
Vadim B. Mikheev 已提交
3258
void
B
Bruce Momjian 已提交
3259
xlog_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
3260
{
B
Bruce Momjian 已提交
3261
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
3262

T
Tom Lane 已提交
3263 3264
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
3265
	{
B
Bruce Momjian 已提交
3266 3267
		CheckPoint *checkpoint = (CheckPoint *) rec;

3268
		sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; "
B
Bruce Momjian 已提交
3269 3270 3271 3272 3273 3274
				"sui %u; xid %u; oid %u; %s",
				checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
				checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
				checkpoint->ThisStartUpID, checkpoint->nextXid,
				checkpoint->nextOid,
			 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
3275
	}
3276 3277
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
3278
		Oid			nextOid;
3279 3280 3281 3282

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
V
WAL  
Vadim B. Mikheev 已提交
3283 3284 3285 3286 3287 3288 3289
	else
		strcat(buf, "UNKNOWN");
}

/*
 * Append a summary of an XLOG record header to buf: the previous-record
 * and previous-record-of-transaction pointers, the xid, the number of
 * backup-block flags set in xl_info (only if nonzero), and the name of the
 * record's resource manager.
 *
 * buf must already hold a NUL-terminated string; no bounds checking is done.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	int			nbkp = 0;
	int			blk;

	sprintf(buf + strlen(buf), "prev %X/%X; xprev %X/%X; xid %u",
			record->xl_prev.xlogid, record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid, record->xl_xact_prev.xrecoff,
			record->xl_xid);

	/* count the backup-block flags that are set */
	for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
	{
		if (record->xl_info & (XLR_SET_BKP_BLOCK(blk)))
			nbkp++;
	}

	if (nbkp != 0)
		sprintf(buf + strlen(buf), "; bkpb %d", nbkp);

	sprintf(buf + strlen(buf), ": %s",
			RmgrTable[record->xl_rmid].rm_name);
}
3311 3312 3313


/*
3314
 * GUC support
3315
 */
3316 3317
/*
 * Validate, and optionally install, a new XLOG sync method.
 *
 * method:		name of the requested method, compared case-insensitively.
 *				"fsync" is always accepted; "fdatasync", "open_sync" and
 *				"open_datasync" are accepted only when the corresponding
 *				HAVE_FDATASYNC / OPEN_SYNC_FLAG / OPEN_DATASYNC_FLAG symbol
 *				is defined.
 * doit:		if false, only validate the name; change nothing.
 * interactive:	not used here.
 *
 * Returns method if the name is acceptable, NULL otherwise.
 */
const char *
assign_xlog_sync_method(const char *method, bool doit, bool interactive)
{
	int			wanted_method;
	int			wanted_bit;

	/* Translate the name into a method code plus open() flag bit */
	if (strcasecmp(method, "fsync") == 0)
	{
		wanted_method = SYNC_METHOD_FSYNC;
		wanted_bit = 0;
	}
#ifdef HAVE_FDATASYNC
	else if (strcasecmp(method, "fdatasync") == 0)
	{
		wanted_method = SYNC_METHOD_FDATASYNC;
		wanted_bit = 0;
	}
#endif
#ifdef OPEN_SYNC_FLAG
	else if (strcasecmp(method, "open_sync") == 0)
	{
		wanted_method = SYNC_METHOD_OPEN;
		wanted_bit = OPEN_SYNC_FLAG;
	}
#endif
#ifdef OPEN_DATASYNC_FLAG
	else if (strcasecmp(method, "open_datasync") == 0)
	{
		wanted_method = SYNC_METHOD_OPEN;
		wanted_bit = OPEN_DATASYNC_FLAG;
	}
#endif
	else
		return NULL;

	/* Done if we were only asked to validate, or if nothing would change */
	if (!doit ||
		(sync_method == wanted_method && open_sync_bit == wanted_bit))
		return method;

	/*
	 * To ensure that no blocks escape unsynced, force an fsync on the
	 * currently open log segment (if any).  Also, if the open flag is
	 * changing, close the log file so it will be reopened (with new
	 * flag bit) at next use.
	 */
	if (openLogFile >= 0)
	{
		if (pg_fsync(openLogFile) != 0)
			elog(PANIC, "fsync of log file %u, segment %u failed: %m",
				 openLogId, openLogSeg);

		if (open_sync_bit != wanted_bit)
		{
			if (close(openLogFile) != 0)
				elog(PANIC, "close of log file %u, segment %u failed: %m",
					 openLogId, openLogSeg);
			openLogFile = -1;
		}
	}

	sync_method = wanted_method;
	open_sync_bit = wanted_bit;

	return method;
}


/*
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 */
static void
issue_xlog_fsync(void)
{
	switch (sync_method)
	{
3391
		case SYNC_METHOD_FSYNC:
3392
			if (pg_fsync(openLogFile) != 0)
3393
				elog(PANIC, "fsync of log file %u, segment %u failed: %m",
3394 3395 3396 3397 3398
					 openLogId, openLogSeg);
			break;
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(openLogFile) != 0)
3399
				elog(PANIC, "fdatasync of log file %u, segment %u failed: %m",
3400 3401 3402 3403 3404 3405 3406
					 openLogId, openLogSeg);
			break;
#endif
		case SYNC_METHOD_OPEN:
			/* write synced it already */
			break;
		default:
3407
			elog(PANIC, "bogus wal_sync_method %d", sync_method);
3408 3409 3410
			break;
	}
}