xlog.c 96.0 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 *
 * xlog.c
4
 *		PostgreSQL transaction log manager
5 6
 *
 *
7
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.86 2002/01/14 17:55:57 tgl Exp $
11 12 13
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <fcntl.h>
T
Tom Lane 已提交
18
#include <signal.h>
19 20 21
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
22
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
23 24
#include <sys/types.h>
#include <dirent.h>
25 26 27
#ifdef USE_LOCALE
#include <locale.h>
#endif
28

29
#include "access/clog.h"
30
#include "access/transam.h"
31
#include "access/xact.h"
32 33
#include "access/xlog.h"
#include "access/xlogutils.h"
34
#include "catalog/catversion.h"
T
Tom Lane 已提交
35
#include "catalog/pg_control.h"
36 37
#include "storage/bufpage.h"
#include "storage/lwlock.h"
38
#include "storage/pmsignal.h"
39
#include "storage/proc.h"
40
#include "storage/sinval.h"
41
#include "storage/spin.h"
42
#include "utils/builtins.h"
43
#include "utils/relcache.h"
44
#include "utils/selfuncs.h"
V
WAL  
Vadim B. Mikheev 已提交
45 46
#include "miscadmin.h"

47

48 49 50
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.  We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * Outputs of this section:
 *	 OPEN_SYNC_FLAG		 - O_SYNC or O_FSYNC, if the platform has either
 *	 OPEN_DATASYNC_FLAG	 - O_DSYNC, if it exists and differs from the above
 *	 DEFAULT_SYNC_METHOD* - the default method, its name, and its open() bit
 */
#define SYNC_METHOD_FSYNC		0
#define SYNC_METHOD_FDATASYNC	1
#define SYNC_METHOD_OPEN		2		/* used for both O_SYNC and
										 * O_DSYNC */

#if defined(O_SYNC)
#define OPEN_SYNC_FLAG	   O_SYNC
#elif defined(O_FSYNC)
#define OPEN_SYNC_FLAG	   O_FSYNC
#endif

/*
 * O_DSYNC is only considered when an open-sync flag exists at all, and only
 * when it is a distinct bit pattern from that flag.
 */
#if defined(OPEN_SYNC_FLAG)
#if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
#define OPEN_DATASYNC_FLAG	  O_DSYNC
#endif
#endif

/* Preference order for the default: open_datasync, fdatasync, fsync */
#if defined(OPEN_DATASYNC_FLAG)
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_OPEN
#define DEFAULT_SYNC_FLAGBIT	   OPEN_DATASYNC_FLAG
#elif defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD_STR   "fdatasync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FDATASYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#else
#define DEFAULT_SYNC_METHOD_STR   "fsync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FSYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#endif


T
Tom Lane 已提交
90 91
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
92
int			XLOGbuffers = 8;
93
int			XLOGfiles = 0;		/* # of files to preallocate during ckpt */
T
Tom Lane 已提交
94
int			XLOG_DEBUG = 0;
95 96
char	   *XLOG_sync_method = NULL;
const char	XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
B
Bruce Momjian 已提交
97 98
char		XLOG_archive_dir[MAXPGPATH];		/* null string means
												 * delete 'em */
T
Tom Lane 已提交
99

100
/*
101
 * XLOGfileslop is used in the code as the allowed "fuzz" in the number of
102 103 104 105 106 107 108 109 110 111 112 113
 * preallocated XLOG segments --- we try to have at least XLOGfiles advance
 * segments but no more than XLOGfiles+XLOGfileslop segments.  This could
 * be made a separate GUC variable, but at present I think it's sufficient
 * to hardwire it as 2*CheckPointSegments+1.  Under normal conditions, a
 * checkpoint will free no more than 2*CheckPointSegments log segments, and
 * we want to recycle all of them; the +1 allows boundary cases to happen
 * without wasting a delete/create-segment cycle.
 */

#define XLOGfileslop	(2*CheckPointSegments + 1)


114 115 116 117 118 119
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int	sync_method = DEFAULT_SYNC_METHOD;
static int	open_sync_bit = DEFAULT_SYNC_FLAGBIT;

#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)

120 121
#define MinXLOGbuffers	4

T
Tom Lane 已提交
122 123 124 125 126

/*
 * ThisStartUpID will be same in all backends --- it identifies current
 * instance of the database system.
 */
V
WAL  
Vadim B. Mikheev 已提交
127 128
StartUpID	ThisStartUpID = 0;

T
Tom Lane 已提交
129 130
/* Are we doing recovery by reading XLOG? */
bool		InRecovery = false;
131

T
Tom Lane 已提交
132 133 134 135 136 137 138 139 140
/*
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then we are not in
 * a transaction or the transaction has not yet made any loggable changes.
 *
 * Note that XLOG records inserted outside transaction control are not
 * reflected into MyLastRecPtr.
 */
XLogRecPtr	MyLastRecPtr = {0, 0};
V
Vadim B. Mikheev 已提交
141

T
Tom Lane 已提交
142 143 144 145 146 147
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts, transaction-controlled
 * or not.
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
148

T
Tom Lane 已提交
149 150 151
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
152
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
153
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
154
 * hold the Insert lock).  See XLogInsert for details.
T
Tom Lane 已提交
155 156
 */
static XLogRecPtr RedoRecPtr;
157

T
Tom Lane 已提交
158 159 160 161 162 163 164 165 166
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
167
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
168 169 170
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
171 172 173 174
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
175 176 177
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
178 179
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
180 181 182
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
183
 * but is updated when convenient.	Again, it exists for the convenience of
184
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
185 186 187 188 189 190 191 192 193 194
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint (ensures only one
 * checkpointer at a time; even though the postmaster won't launch
 * parallel checkpoint processes, we need this because manual checkpoints
 * could be launched simultaneously).
 *
T
Tom Lane 已提交
213 214 215
 *----------
 */
typedef struct XLogwrtRqst
216
{
T
Tom Lane 已提交
217 218
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
219
} XLogwrtRqst;
220

T
Tom Lane 已提交
221
typedef struct XLogwrtResult
222
{
T
Tom Lane 已提交
223 224
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
225
} XLogwrtResult;
226

T
Tom Lane 已提交
227 228 229
/*
 * Shared state data for XLogInsert.
 */
230 231
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
232 233 234 235 236 237
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
238 239
} XLogCtlInsert;

T
Tom Lane 已提交
240 241 242
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
243 244
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
245 246
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	uint16		curridx;		/* cache index of next block to write */
247 248
} XLogCtlWrite;

T
Tom Lane 已提交
249 250 251
/*
 * Total shared-memory state for XLOG.
 */
252 253
typedef struct XLogCtlData
{
254
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
255
	XLogCtlInsert Insert;
T
Tom Lane 已提交
256
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
257 258
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
259
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
260 261
	XLogCtlWrite Write;

T
Tom Lane 已提交
262 263
	/*
	 * These values do not change after startup, although the pointed-to
264 265 266
	 * pages and xlblocks values certainly do.	Permission to read/write
	 * the pages and xlblocks values depends on WALInsertLock and
	 * WALWriteLock.
T
Tom Lane 已提交
267
	 */
B
Bruce Momjian 已提交
268 269 270 271 272
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32		XLogCacheByte;	/* # bytes in xlog buffers */
	uint32		XLogCacheBlck;	/* highest allocated xlog buffer index */
	StartUpID	ThisStartUpID;
T
Tom Lane 已提交
273

274
	/* This value is not protected by *any* lock... */
B
Bruce Momjian 已提交
275
	XLogRecPtr	RedoRecPtr;		/* see SetRedoRecPtr/GetRedoRecPtr */
T
Tom Lane 已提交
276

B
Bruce Momjian 已提交
277
	slock_t		info_lck;		/* locks shared LogwrtRqst/LogwrtResult */
278 279
} XLogCtlData;

280
static XLogCtlData *XLogCtl = NULL;
281

282
/*
T
Tom Lane 已提交
283
 * We maintain an image of pg_control in shared memory.
284
 */
285
static ControlFileData *ControlFile = NULL;
286

T
Tom Lane 已提交
287 288 289 290 291
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has a local pointer to XLogCtl->Insert and/or a local copy of
 * XLogCtl->Insert.curridx, so these are passed as parameters instead of
 * being fetched via XLogCtl.
 */
292

T
Tom Lane 已提交
293 294 295 296 297 298 299 300 301
/*
 * Free space remaining in the current xlog page buffer: BLCKSZ minus the
 * distance of the insertion point from the start of the page.
 */
#define INSERT_FREESPACE(Insert)  \
	(BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * xlblocks[curridx] is taken as the position of the end of the current
 * page, so subtracting the free space left on the page gives the position
 * of the insertion point.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)


/*
 * Step an xlogid/segment pair forward by one segment, rolling over into
 * segment 0 of the next log file after the last segment of a file.
 */
#define NextLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) < XLogSegsPerFile-1) \
			(logSeg)++; \
		else \
		{ \
			(logId)++; \
			(logSeg) = 0; \
		} \
	} while (0)

/*
 * Step an xlogid/segment pair back by one segment, moving to the last
 * segment of the previous log file when at segment 0.  The caller must
 * not pass the pair 0,0.
 */
#define PrevLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) == 0) \
		{ \
			(logId)--; \
			(logSeg) = XLogSegsPerFile-1; \
		} \
		else \
			(logSeg)--; \
	} while (0)
V
WAL  
Vadim B. Mikheev 已提交
329

T
Tom Lane 已提交
330 331 332 333
/*
 * Compute ID and segment from an XLogRecPtr.
 *
 * For XLByteToSeg, do the computation at face value.  For XLByteToPrevSeg,
 * a boundary byte is taken to be in the previous segment.  This is suitable
 * for deciding which segment to write given a pointer to a record end,
 * for example.
 */
/* Store into logId/logSeg the log file and segment containing byte xlrp */
#define XLByteToSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = (xlrp).xrecoff / XLogSegSize \
	)
/*
 * Like XLByteToSeg, but a position lying exactly on a segment boundary is
 * assigned to the segment that ends there.
 */
#define XLByteToPrevSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = ((xlrp).xrecoff - 1) / XLogSegSize \
	)
346

347
/*
 * Is an XLogRecPtr within a particular XLOG segment?
 *
 * For XLByteInSeg, do the computation at face value.  For XLByteInPrevSeg,
 * a boundary byte is taken to be in the previous segment.
 */
T
Tom Lane 已提交
353 354 355 356 357 358 359
/* True if byte position xlrp lies in segment logSeg of log file logId */
#define XLByteInSeg(xlrp, logId, logSeg)	\
	((logId) == (xlrp).xlogid && \
	 (logSeg) == (xlrp).xrecoff / XLogSegSize)

/*
 * Same test as XLByteInSeg, except that a position exactly on a segment
 * boundary counts as belonging to the segment that ends there.
 */
#define XLByteInPrevSeg(xlrp, logId, logSeg)	\
	((logId) == (xlrp).xlogid && \
	 (logSeg) == ((xlrp).xrecoff - 1) / XLogSegSize)
360 361


362
/*
 * Build the path of an XLOG segment file into path (a buffer of at least
 * MAXPGPATH bytes): XLogDir, a slash, then log and seg each printed as
 * eight upper-case hex digits.
 */
#define XLogFileName(path, log, seg)	\
			snprintf(path, MAXPGPATH, "%s/%08X%08X",	\
					 XLogDir, log, seg)
365

T
Tom Lane 已提交
366 367 368 369 370
/* Index of the XLOG buffer before idx, wrapping from 0 to the highest index */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/* Index of the XLOG buffer after idx, wrapping from the highest index to 0 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
371

372
/*
 * Can a record header start at this offset?  The offset within its page
 * must lie past the page header and leave room for at least a record
 * header before the end of the page.
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)
375

T
Tom Lane 已提交
376 377 378 379 380 381
/*
 * _INTL_MAXLOGRECSZ: upper bound on the space one record can occupy,
 * counting its header, the largest allowed rmgr payload, and the maximum
 * number of backup blocks (each a BkpBlock header plus a full page).
 */
#define _INTL_MAXLOGRECSZ	(XLR_MAX_BKP_BLOCKS * (BLCKSZ + sizeof(BkpBlock)) + \
							 MAXLOGRECSZ + SizeOfXLogRecord)
382

383

T
Tom Lane 已提交
384
/* File path names */
B
Bruce Momjian 已提交
385 386
static char XLogDir[MAXPGPATH];
static char ControlFilePath[MAXPGPATH];
T
Tom Lane 已提交
387 388 389 390 391 392

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
393

T
Tom Lane 已提交
394 395 396 397 398 399 400 401 402 403
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
404

T
Tom Lane 已提交
405 406 407 408 409 410
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.
 */
411 412 413 414
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
B
Bruce Momjian 已提交
415

T
Tom Lane 已提交
416 417
/* Buffer for currently read page (BLCKSZ bytes) */
static char *readBuf = NULL;
B
Bruce Momjian 已提交
418

T
Tom Lane 已提交
419 420 421
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
422
static XLogRecord *nextRecord = NULL;
423
static StartUpID lastReadSUI;
424

V
WAL  
Vadim B. Mikheev 已提交
425 426
static bool InRedo = false;

T
Tom Lane 已提交
427 428 429

static bool AdvanceXLInsertBuffer(void);
static void XLogWrite(XLogwrtRqst WriteRqst);
B
Bruce Momjian 已提交
430 431
static int XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
432
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
433 434
					   bool find_free, int max_advance,
					   bool use_lock);
T
Tom Lane 已提交
435 436
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static void PreallocXlogFiles(XLogRecPtr endptr);
437
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr);
T
Tom Lane 已提交
438
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer);
439
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI);
T
Tom Lane 已提交
440
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr,
441
					 int whichChkpt,
B
Bruce Momjian 已提交
442
					 char *buffer);
T
Tom Lane 已提交
443 444 445 446
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
447
static void issue_xlog_fsync(void);
T
Tom Lane 已提交
448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata list (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
465
XLogRecPtr
466
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
467
{
B
Bruce Momjian 已提交
468 469
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
470
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
471 472 473 474 475 476 477 478 479 480 481 482 483 484
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	uint16		curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
	crc64		rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
485
	XLogwrtRqst LogwrtRqst;
B
Bruce Momjian 已提交
486 487
	bool		updrqst;
	bool		no_tran = (rmid == RM_XLOG_ID) ? true : false;
V
Vadim B. Mikheev 已提交
488 489 490 491

	if (info & XLR_INFO_MASK)
	{
		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
B
Bruce Momjian 已提交
492
			elog(STOP, "XLogInsert: invalid info mask %02X",
T
Tom Lane 已提交
493
				 (info & XLR_INFO_MASK));
V
Vadim B. Mikheev 已提交
494 495 496 497
		no_tran = true;
		info &= ~XLR_INFO_MASK;
	}

T
Tom Lane 已提交
498
	/*
B
Bruce Momjian 已提交
499 500
	 * In bootstrap mode, we don't actually log anything but XLOG
	 * resources; return a phony record pointer.
T
Tom Lane 已提交
501
	 */
V
Vadim B. Mikheev 已提交
502
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
503 504
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
505
		RecPtr.xrecoff = SizeOfXLogPHD; /* start of 1st checkpoint record */
V
WAL  
Vadim B. Mikheev 已提交
506 507 508
		return (RecPtr);
	}

T
Tom Lane 已提交
509 510 511 512 513 514
	/*
	 * Here we scan the rdata list, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC yet since we don't know the final
	 * length or info bits quite yet.
	 *
B
Bruce Momjian 已提交
515 516
	 * We may have to loop back to here if a race condition is detected
	 * below. We could prevent the race by doing all this work while
517
	 * holding the insert lock, but it seems better to avoid doing CRC
B
Bruce Momjian 已提交
518 519 520 521 522 523 524 525
	 * calculations while holding the lock.  This means we have to be
	 * careful about modifying the rdata list until we know we aren't
	 * going to loop back again.  The only change we allow ourselves to
	 * make earlier is to set rdt->data = NULL in list items we have
	 * decided we will have to back up the whole buffer for.  This is OK
	 * because we will certainly decide the same thing again for those
	 * items if we do it over; doing it here saves an extra pass over the
	 * list later.
T
Tom Lane 已提交
526
	 */
527
begin:;
T
Tom Lane 已提交
528 529 530 531 532 533
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

534
	INIT_CRC64(rdata_crc);
T
Tom Lane 已提交
535
	len = 0;
B
Bruce Momjian 已提交
536
	for (rdt = rdata;;)
537 538 539
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
540
			/* Simple data, just include it */
541 542 543
			len += rdt->len;
			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
		}
T
Tom Lane 已提交
544
		else
545
		{
T
Tom Lane 已提交
546 547
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
548
			{
T
Tom Lane 已提交
549
				if (rdt->buffer == dtbuf[i])
550
				{
T
Tom Lane 已提交
551 552 553 554 555 556 557 558 559
					/* Buffer already referenced by earlier list item */
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
560
				}
T
Tom Lane 已提交
561
				if (dtbuf[i] == InvalidBuffer)
562
				{
T
Tom Lane 已提交
563 564
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
B
Bruce Momjian 已提交
565

T
Tom Lane 已提交
566 567 568
					/*
					 * XXX We assume page LSN is first data on page
					 */
B
Bruce Momjian 已提交
569
					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
T
Tom Lane 已提交
570 571
					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
					{
B
Bruce Momjian 已提交
572
						crc64		dtcrc;
T
Tom Lane 已提交
573 574 575 576 577 578 579 580 581 582

						dtbuf_bkp[i] = true;
						rdt->data = NULL;
						INIT_CRC64(dtcrc);
						COMP_CRC64(dtcrc,
								   BufferGetBlock(dtbuf[i]),
								   BLCKSZ);
						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
						COMP_CRC64(dtcrc,
B
Bruce Momjian 已提交
583
								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
T
Tom Lane 已提交
584 585 586 587 588 589 590 591 592 593
								   sizeof(BkpBlock) - sizeof(crc64));
						FIN_CRC64(dtcrc);
						dtbuf_xlg[i].crc = dtcrc;
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
594 595
				}
			}
T
Tom Lane 已提交
596 597 598
			if (i >= XLR_MAX_BKP_BLOCKS)
				elog(STOP, "XLogInsert: can backup %d blocks at most",
					 XLR_MAX_BKP_BLOCKS);
599
		}
T
Tom Lane 已提交
600
		/* Break out of loop when rdt points to last list item */
601 602 603 604 605
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

T
Tom Lane 已提交
606 607 608
	/*
	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
	 * all of the rmgr data might have been suppressed in favor of backup
B
Bruce Momjian 已提交
609
	 * blocks.	Currently, all callers of XLogInsert provide at least some
T
Tom Lane 已提交
610 611 612 613
	 * not-in-a-buffer data and so len == 0 should never happen, but that
	 * may not be true forever.  If you need to remove the len == 0 check,
	 * also remove the check for xl_len == 0 in ReadRecord, below.
	 */
614
	if (len == 0 || len > MAXLOGRECSZ)
615
		elog(STOP, "XLogInsert: invalid record length %u", len);
616

617
	START_CRIT_SECTION();
618

619
	/* update LogwrtResult before doing cache fill check */
620 621 622 623 624 625 626 627 628
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		LogwrtRqst = xlogctl->LogwrtRqst;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
629

630
	/*
631 632
	 * If cache is half filled then try to acquire write lock and do
	 * XLogWrite. Ignore any fractional blocks in performing this check.
633 634 635 636 637
	 */
	LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
	if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
		(LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
		 XLogCtl->XLogCacheByte / 2))
T
Tom Lane 已提交
638
	{
639
		if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
640
		{
641 642 643 644
			LogwrtResult = XLogCtl->Write.LogwrtResult;
			if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
				XLogWrite(LogwrtRqst);
			LWLockRelease(WALWriteLock);
645 646 647
		}
	}

648 649 650
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
651 652
	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to
B
Bruce Momjian 已提交
653 654 655
	 * go back and recompute everything.  This can only happen just after
	 * a checkpoint, so it's better to be slow in this case and fast
	 * otherwise.
T
Tom Lane 已提交
656 657
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
658
	{
T
Tom Lane 已提交
659 660 661 662
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
663
		{
T
Tom Lane 已提交
664 665 666 667 668 669
			if (dtbuf[i] == InvalidBuffer)
				continue;
			if (dtbuf_bkp[i] == false &&
				XLByteLE(dtbuf_lsn[i], RedoRecPtr))
			{
				/*
B
Bruce Momjian 已提交
670 671
				 * Oops, this buffer now needs to be backed up, but we
				 * didn't think so above.  Start over.
T
Tom Lane 已提交
672
				 */
673
				LWLockRelease(WALInsertLock);
T
Tom Lane 已提交
674 675 676
				END_CRIT_SECTION();
				goto begin;
			}
677 678 679
		}
	}

T
Tom Lane 已提交
680 681 682 683 684 685 686
	/*
	 * Make additional rdata list entries for the backup blocks, so that
	 * we don't need to special-case them in the write loop.  Note that we
	 * have now irrevocably changed the input rdata list.  At the exit of
	 * this loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
B
Bruce Momjian 已提交
687 688 689
	 * up.	The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
	 * distinct buffer value (ignoring InvalidBuffer) appearing in the
	 * rdata list.
T
Tom Lane 已提交
690 691 692
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
693 694 695 696
	{
		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
			continue;

T
Tom Lane 已提交
697
		info |= XLR_SET_BKP_BLOCK(i);
698 699 700

		rdt->next = &(dtbuf_rdt[2 * i]);

B
Bruce Momjian 已提交
701
		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
702
		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
T
Tom Lane 已提交
703
		write_len += sizeof(BkpBlock);
704 705 706

		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);

B
Bruce Momjian 已提交
707
		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
708
		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
T
Tom Lane 已提交
709
		write_len += BLCKSZ;
710 711 712
		dtbuf_rdt[2 * i + 1].next = NULL;
	}

T
Tom Lane 已提交
713
	/* Insert record header */
714

T
Tom Lane 已提交
715 716
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
717 718
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
719
		updrqst = AdvanceXLInsertBuffer();
720 721 722
		freespace = BLCKSZ - SizeOfXLogPHD;
	}

T
Tom Lane 已提交
723
	curridx = Insert->curridx;
724
	record = (XLogRecord *) Insert->currpos;
T
Tom Lane 已提交
725

726
	record->xl_prev = Insert->PrevRecord;
V
Vadim B. Mikheev 已提交
727
	if (no_tran)
728 729 730 731
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
V
Vadim B. Mikheev 已提交
732 733 734
	else
		record->xl_xact_prev = MyLastRecPtr;

735
	record->xl_xid = GetCurrentTransactionId();
T
Tom Lane 已提交
736
	record->xl_len = len;		/* doesn't include backup blocks */
737
	record->xl_info = info;
738
	record->xl_rmid = rmid;
739

T
Tom Lane 已提交
740
	/* Now we can finish computing the main CRC */
B
Bruce Momjian 已提交
741
	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
742
			   SizeOfXLogRecord - sizeof(crc64));
743 744 745
	FIN_CRC64(rdata_crc);
	record->xl_crc = rdata_crc;

T
Tom Lane 已提交
746 747 748 749
	/* Compute record's XLOG location */
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/* If first XLOG record of transaction, save it in PROC array */
V
Vadim B. Mikheev 已提交
750
	if (MyLastRecPtr.xrecoff == 0 && !no_tran)
751
	{
752 753 754 755 756 757
		/*
		 * We do not acquire SInvalLock here because of possible deadlock.
		 * Anyone who wants to inspect other procs' logRec must acquire
		 * WALInsertLock, instead.  A better solution would be a per-PROC
		 * spinlock, but no time for that before 7.2 --- tgl 12/19/01.
		 */
758 759
		MyProc->logRec = RecPtr;
	}
V
WAL  
Vadim B. Mikheev 已提交
760 761 762

	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
763
		char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
764

765
		sprintf(buf, "INSERT @ %X/%X: ", RecPtr.xlogid, RecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
766
		xlog_outrec(buf, record);
767
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
768 769
		{
			strcat(buf, " - ");
770
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
771
		}
772
		elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
773 774
	}

T
Tom Lane 已提交
775 776 777 778 779 780
	/* Record begin of record in appropriate places */
	if (!no_tran)
		MyLastRecPtr = RecPtr;
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

781
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
782
	freespace -= SizeOfXLogRecord;
783

T
Tom Lane 已提交
784 785 786 787
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
788
	{
789 790 791 792
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
793
		{
794 795 796 797 798
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
799
				write_len -= freespace;
800 801 802 803 804
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
805
				write_len -= rdata->len;
806 807 808 809
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
810 811
		}

812
		/* Use next buffer */
T
Tom Lane 已提交
813 814 815 816 817 818 819 820
		updrqst = AdvanceXLInsertBuffer();
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
821
	}
822

T
Tom Lane 已提交
823 824
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
825
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
826
	freespace = INSERT_FREESPACE(Insert);
827

V
Vadim B. Mikheev 已提交
828
	/*
B
Bruce Momjian 已提交
829 830
	 * The recptr I return is the beginning of the *next* record. This
	 * will be stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
831
	 */
T
Tom Lane 已提交
832
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
833

T
Tom Lane 已提交
834
	/* Need to update shared LogwrtRqst if some block was filled up */
835
	if (freespace < SizeOfXLogRecord)
B
Bruce Momjian 已提交
836 837
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
838 839
	else
		curridx = PrevBufIdx(curridx);
T
Tom Lane 已提交
840
	WriteRqst = XLogCtl->xlblocks[curridx];
841

842
	LWLockRelease(WALInsertLock);
843 844 845

	if (updrqst)
	{
846 847 848 849
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
T
Tom Lane 已提交
850
		/* advance global request to include new block(s) */
851 852
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
853
		/* update local result copy while I have the chance */
854 855
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
856 857
	}

858
	END_CRIT_SECTION();
859

860
	return (RecPtr);
861
}
862

T
Tom Lane 已提交
863 864 865 866 867
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
868
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
869 870 871
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
872
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
873 874 875
 */
static bool
AdvanceXLInsertBuffer(void)
876
{
T
Tom Lane 已提交
877 878 879 880 881 882
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
883 884
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
885

T
Tom Lane 已提交
886 887 888
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
889

T
Tom Lane 已提交
890
	/*
B
Bruce Momjian 已提交
891 892 893
	 * Get ending-offset of the buffer page we need to replace (this may
	 * be zero if the buffer hasn't been used yet).  Fall through if it's
	 * already written out.
T
Tom Lane 已提交
894 895 896 897 898 899
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
900

T
Tom Lane 已提交
901
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
902

903
		/* Before waiting, get info_lck and update LogwrtResult */
904 905 906 907 908 909 910 911 912 913
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
			SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
		}
914 915 916 917 918 919 920 921 922

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
923
		{
924 925 926 927
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
928
			{
929 930 931
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
932
			}
933
			else
T
Tom Lane 已提交
934 935
			{
				/*
B
Bruce Momjian 已提交
936 937
				 * Have to write buffers while holding insert lock. This
				 * is not good, so only write as much as we absolutely
T
Tom Lane 已提交
938 939 940 941 942 943
				 * must.
				 */
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst);
944
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
945
				Insert->LogwrtResult = LogwrtResult;
946 947 948 949
			}
		}
	}

T
Tom Lane 已提交
950 951 952 953
	/*
	 * Now the next buffer slot is free and we can set it up to be the
	 * next output page.
	 */
954 955
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
956
	{
T
Tom Lane 已提交
957
		/* crossing a logid boundary */
958 959
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = BLCKSZ;
960
	}
T
Tom Lane 已提交
961
	else
962 963 964
		NewPageEndPtr.xrecoff += BLCKSZ;
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
T
Tom Lane 已提交
965
	Insert->curridx = nextidx;
966 967
	Insert->currpage = NewPage;
	Insert->currpos = ((char *) NewPage) + SizeOfXLogPHD;
B
Bruce Momjian 已提交
968

T
Tom Lane 已提交
969
	/*
B
Bruce Momjian 已提交
970 971
	 * Be sure to re-zero the buffer so that bytes beyond what we've
	 * written will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
972
	 */
973 974 975 976
	MemSet((char *) NewPage, 0, BLCKSZ);

	/* And fill the new page's header */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
977
	/* NewPage->xlp_info = 0; */	/* done by memset */
978 979 980
	NewPage->xlp_sui = ThisStartUpID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
T
Tom Lane 已提交
981 982

	return update_needed;
983 984
}

T
Tom Lane 已提交
985 986 987
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
988
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
989
 */
990
static void
T
Tom Lane 已提交
991
XLogWrite(XLogwrtRqst WriteRqst)
992
{
993 994
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
T
Tom Lane 已提交
995
	bool		ispartialpage;
996
	bool		use_existent;
997

B
Bruce Momjian 已提交
998 999 1000 1001
	/*
	 * Update local LogwrtResult (caller probably did this already,
	 * but...)
	 */
T
Tom Lane 已提交
1002 1003 1004
	LogwrtResult = Write->LogwrtResult;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1005
	{
1006 1007 1008 1009 1010 1011 1012
		/*
		 * Make sure we're not ahead of the insert process.  This could
		 * happen if we're passed a bogus WriteRqst.Write that is past the
		 * end of the last page that's been initialized by
		 * AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
1013 1014 1015 1016
			elog(STOP, "XLogWrite: write request %X/%X is past end of log %X/%X",
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
				 XLogCtl->xlblocks[Write->curridx].xlogid,
				 XLogCtl->xlblocks[Write->curridx].xrecoff);
1017

T
Tom Lane 已提交
1018 1019 1020 1021 1022
		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1023
		{
T
Tom Lane 已提交
1024 1025 1026 1027
			/*
			 * Switch to new logfile segment.
			 */
			if (openLogFile >= 0)
1028
			{
T
Tom Lane 已提交
1029
				if (close(openLogFile) != 0)
1030
					elog(STOP, "close of log file %u, segment %u failed: %m",
T
Tom Lane 已提交
1031 1032
						 openLogId, openLogSeg);
				openLogFile = -1;
1033
			}
T
Tom Lane 已提交
1034 1035
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1036 1037 1038 1039
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1040
			openLogOff = 0;
1041 1042 1043 1044 1045

			if (!use_existent)	/* there was no precreated file */
				elog(LOG, "XLogWrite: new log file created - "
					 "consider increasing WAL_FILES");

T
Tom Lane 已提交
1046
			/* update pg_control, unless someone else already did */
1047
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1048 1049 1050
			if (ControlFile->logId < openLogId ||
				(ControlFile->logId == openLogId &&
				 ControlFile->logSeg < openLogSeg + 1))
T
Tom Lane 已提交
1051 1052 1053 1054 1055
			{
				ControlFile->logId = openLogId;
				ControlFile->logSeg = openLogSeg + 1;
				ControlFile->time = time(NULL);
				UpdateControlFile();
B
Bruce Momjian 已提交
1056

1057
				/*
B
Bruce Momjian 已提交
1058 1059 1060 1061
				 * Signal postmaster to start a checkpoint if it's been
				 * too long since the last one.  (We look at local copy of
				 * RedoRecPtr which might be a little out of date, but
				 * should be close enough for this purpose.)
1062 1063 1064 1065 1066 1067 1068
				 */
				if (IsUnderPostmaster &&
					(openLogId != RedoRecPtr.xlogid ||
					 openLogSeg >= (RedoRecPtr.xrecoff / XLogSegSize) +
					 (uint32) CheckPointSegments))
				{
					if (XLOG_DEBUG)
1069
						elog(DEBUG, "XLogWrite: time for a checkpoint, signaling postmaster");
1070
					SendPostmasterSignal(PMSIGNAL_DO_CHECKPOINT);
1071
				}
T
Tom Lane 已提交
1072
			}
1073
			LWLockRelease(ControlFileLock);
1074 1075
		}

T
Tom Lane 已提交
1076
		if (openLogFile < 0)
1077
		{
T
Tom Lane 已提交
1078 1079 1080
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
			openLogOff = 0;
1081 1082
		}

T
Tom Lane 已提交
1083 1084
		/* Need to seek in the file? */
		if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1085
		{
T
Tom Lane 已提交
1086 1087
			openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
			if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1088
				elog(STOP, "lseek of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1089
					 openLogId, openLogSeg, openLogOff);
1090 1091
		}

T
Tom Lane 已提交
1092 1093
		/* OK to write the page */
		from = XLogCtl->pages + Write->curridx * BLCKSZ;
1094
		errno = 0;
T
Tom Lane 已提交
1095
		if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1096 1097 1098 1099
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
1100
			elog(STOP, "write of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1101
				 openLogId, openLogSeg, openLogOff);
1102
		}
T
Tom Lane 已提交
1103
		openLogOff += BLCKSZ;
1104

T
Tom Lane 已提交
1105 1106 1107
		/*
		 * If we just wrote the whole last page of a logfile segment,
		 * fsync the segment immediately.  This avoids having to go back
B
Bruce Momjian 已提交
1108 1109 1110
		 * and re-open prior segments when an fsync request comes along
		 * later. Doing it here ensures that one and only one backend will
		 * perform this fsync.
T
Tom Lane 已提交
1111 1112 1113
		 */
		if (openLogOff >= XLogSegSize && !ispartialpage)
		{
1114
			issue_xlog_fsync();
B
Bruce Momjian 已提交
1115
			LogwrtResult.Flush = LogwrtResult.Write;	/* end of current page */
T
Tom Lane 已提交
1116
		}
1117

T
Tom Lane 已提交
1118 1119 1120 1121 1122 1123 1124
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		Write->curridx = NextBufIdx(Write->curridx);
1125 1126
	}

T
Tom Lane 已提交
1127 1128 1129 1130 1131
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1132
	{
T
Tom Lane 已提交
1133
		/*
B
Bruce Momjian 已提交
1134 1135 1136
		 * Could get here without iterating above loop, in which case we
		 * might have no open file or the wrong one.  However, we do not
		 * need to fsync more than one file.
T
Tom Lane 已提交
1137
		 */
1138
		if (sync_method != SYNC_METHOD_OPEN)
T
Tom Lane 已提交
1139
		{
1140
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1141
			 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1142 1143
			{
				if (close(openLogFile) != 0)
1144
					elog(STOP, "close of log file %u, segment %u failed: %m",
1145 1146 1147 1148 1149 1150 1151 1152 1153 1154
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
				openLogOff = 0;
			}
			issue_xlog_fsync();
T
Tom Lane 已提交
1155 1156
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1157 1158
	}

T
Tom Lane 已提交
1159 1160 1161
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1162 1163
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves
T
Tom Lane 已提交
1164 1165
	 * some code in a couple of places.
	 */
1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
1178

T
Tom Lane 已提交
1179 1180 1181 1182 1183 1184
	Write->LogwrtResult = LogwrtResult;
}

/*
 * Make sure all XLOG data up to the given record position is on disk.
 *
 * NOTE: unlike XLogWrite, this is entered without WALWriteLock held, and
 * it tries to get by without acquiring the lock whenever it can.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;

	if (XLOG_DEBUG)
	{
		elog(DEBUG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X\n",
			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			 (InRedo) ? "(redo)" : "",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
		fflush(stderr);
	}

	/* Nothing is flushed while replaying the log */
	if (InRedo)
		return;

	/* Fast path: our local copy already shows the position as flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

	START_CRIT_SECTION();

	/*
	 * fsync is usually very expensive, so each one should cover as much
	 * data as possible.  Any additional data found in the xlog buffer is
	 * therefore written and fsync'd as well, pushing LogwrtResult.Flush as
	 * far forward as we can and improving the odds that the next caller
	 * needs no fsync at all.
	 */

	/* start from the caller's target; it may be raised below */
	WriteRqstPtr = record;

	/* pick up the shared request/result state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
			WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}

	/* still not flushed far enough? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/*
		 * If the insert lock is available without waiting, extend the
		 * write target to whatever has been inserted so far.
		 */
		if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
		{
			XLogCtlInsert *Insert = &XLogCtl->Insert;
			uint32		freespace = INSERT_FREESPACE(Insert);

			/* end of the current page, less unused space if not full */
			WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
			if (freespace >= SizeOfXLogRecord)
				WriteRqstPtr.xrecoff -= freespace;

			LWLockRelease(WALInsertLock);
		}

		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			XLogwrtRqst request;

			request.Write = WriteRqstPtr;
			request.Flush = record;
			XLogWrite(request);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * Not being flushed up to the request point at this stage indicates a
	 * problem, most likely a flush request lying beyond the end of XLOG.
	 * That has been observed when a disk page carries a corrupted LSN.
	 *
	 * This used to be a STOP condition, but that makes the system less
	 * robust rather than more: one corrupt data page should not take the
	 * whole system down.  Worse, meeting the same bad page again during
	 * recovery would make it impossible to restart the database.  (This
	 * has happened in the field several times with 7.1 releases.  We
	 * cannot get here while InRedo is true, but a bad page that is read in
	 * and dirtied during recovery will be flushed by CreateCheckpoint at
	 * the end of recovery.)
	 *
	 * So the policy is ERROR in normal operation and only NOTICE during
	 * recovery, which lets the system come up despite a corrupt LSN.
	 * Calls from xact.c are made inside a critical section, so there the
	 * ERROR is promoted to STOP.  Calls from bufmgr.c are not, so a bad
	 * LSN on a data page does not force a restart.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(InRecovery ? NOTICE : ERROR,
			 "XLogFlush: request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

T
Tom Lane 已提交
1301 1302 1303
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
1304 1305 1306
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
1307
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
1308 1309
 * file was used.
 *
1310
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
1311
 * place.  This should be TRUE except during bootstrap log creation.  The
1312
 * caller must *not* hold the lock at call.
1313
 *
T
Tom Lane 已提交
1314 1315
 * Returns FD of opened file.
 */
1316
static int
1317 1318
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
1319
{
1320
	char		path[MAXPGPATH];
1321
	char		tmppath[MAXPGPATH];
1322
	char		zbuffer[BLCKSZ];
1323
	int			fd;
1324
	int			nbytes;
1325 1326

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
1327 1328

	/*
B
Bruce Momjian 已提交
1329 1330
	 * Try to use existent file (checkpoint maker may have created it
	 * already)
V
Vadim B. Mikheev 已提交
1331
	 */
1332
	if (*use_existent)
V
Vadim B. Mikheev 已提交
1333
	{
1334 1335
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
1336 1337 1338
		if (fd < 0)
		{
			if (errno != ENOENT)
1339 1340
				elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
					 path, log, seg);
V
Vadim B. Mikheev 已提交
1341 1342
		}
		else
B
Bruce Momjian 已提交
1343
			return (fd);
V
Vadim B. Mikheev 已提交
1344 1345
	}

1346
	/*
B
Bruce Momjian 已提交
1347 1348 1349
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible
	 * that another process is doing the same thing.  If so, we will end
	 * up pre-creating an extra log segment.  That seems OK, and better
1350
	 * than holding the lock throughout this lengthy process.
1351
	 */
1352 1353
	snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
			 XLogDir, (int) getpid());
1354 1355

	unlink(tmppath);
1356

1357
	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
1358
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
1359
					   S_IRUSR | S_IWUSR);
1360
	if (fd < 0)
1361
		elog(STOP, "creation of file %s failed: %m", tmppath);
1362

1363
	/*
B
Bruce Momjian 已提交
1364
	 * Zero-fill the file.	We have to do this the hard way to ensure that
1365 1366
	 * all the file space has really been allocated --- on platforms that
	 * allow "holes" in files, just seeking to the end doesn't allocate
B
Bruce Momjian 已提交
1367
	 * intermediate space.	This way, we know that we have all the space
1368
	 * and (after the fsync below) that all the indirect blocks are down
1369 1370
	 * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
	 * sync future writes to the log file.
1371 1372 1373 1374
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
	{
1375
		errno = 0;
1376
		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
T
Tom Lane 已提交
1377
		{
B
Bruce Momjian 已提交
1378
			int			save_errno = errno;
T
Tom Lane 已提交
1379

B
Bruce Momjian 已提交
1380 1381 1382 1383
			/*
			 * If we fail to make the file, delete it to release disk
			 * space
			 */
1384
			unlink(tmppath);
1385 1386
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
1387

T
Tom Lane 已提交
1388
			elog(STOP, "ZeroFill failed to write %s: %m", tmppath);
T
Tom Lane 已提交
1389
		}
1390
	}
1391

1392
	if (pg_fsync(fd) != 0)
1393
		elog(STOP, "fsync of file %s failed: %m", tmppath);
1394

V
Vadim B. Mikheev 已提交
1395
	close(fd);
T
Tom Lane 已提交
1396

1397
	/*
1398 1399
	 * Now move the segment into place with its final name.
	 *
1400 1401 1402 1403 1404
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone
	 * else has created the file while we were filling ours: if so, use
	 * ours to pre-create a future log segment.
	 */
1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443
	if (!InstallXLogFileSegment(log, seg, tmppath,
								*use_existent, XLOGfiles + XLOGfileslop,
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);

	return (fd);
}

/*
 * Put an XLOG segment file in place as a current or future log segment.
 *
 * Serves two purposes: installing a freshly created segment (which lives
 * under a temporary name while it is being filled) and recycling an old
 * segment.
 *
 * log, seg: the segment to install as, or the first candidate position.
 *
 * tmppath: current name of the file; it gets renamed into place.
 *
 * find_free: if TRUE, the file goes to the first unoccupied log/seg number
 * at or after the one given.  If FALSE, it goes exactly where specified and
 * any segment file already there is deleted.
 *
 * max_advance: how many log/seg slots past the starting point may be tried
 * before giving up.  (Has no effect when find_free is FALSE.)
 *
 * use_lock: if TRUE, ControlFileLock is held while the file is moved into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns TRUE if the file was installed, FALSE if no free slot was found
 * within max_advance.  (Any other kind of failure causes elog().)
 */
static bool
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
					   bool find_free, int max_advance,
					   bool use_lock)
{
	char		target[MAXPGPATH];

	XLogFileName(target, log, seg);

	/*
	 * Only one process at a time should be doing this.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (find_free)
	{
		/* Step forward until we hit a segment name that cannot be opened */
		for (;;)
		{
			int			probe = BasicOpenFile(target, O_RDWR | PG_BINARY,
											  S_IRUSR | S_IWUSR);

			if (probe < 0)
				break;			/* slot is free */

			close(probe);
			if (--max_advance < 0)
			{
				/* Ran out of candidate slots within the allowed range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			NextLogSeg(log, seg);
			XLogFileName(target, log, seg);
		}
	}
	else
	{
		/* Forced installation: discard whatever segment file is there */
		unlink(target);
	}

	/*
	 * link() is preferred over rename() purely to be certain an existing
	 * logfile is never overwritten.  There should not be one, though, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#ifndef __BEOS__
	if (link(tmppath, target) < 0)
		elog(STOP, "link from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, target, log, seg);
	unlink(tmppath);
#else
	if (rename(tmppath, target) < 0)
		elog(STOP, "rename from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, target, log, seg);
#endif

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}

T
Tom Lane 已提交
1513 1514 1515
/*
 * Open a pre-existing logfile segment.
 */
1516 1517 1518
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
1519 1520
	char		path[MAXPGPATH];
	int			fd;
1521 1522 1523

	XLogFileName(path, log, seg);

1524 1525
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
1526 1527 1528 1529
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
1530 1531
			elog(LOG, "open of %s (log file %u, segment %u) failed: %m",
				 path, log, seg);
1532 1533
			return (fd);
		}
1534 1535
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);
1536 1537
	}

1538
	return (fd);
1539 1540
}

V
Vadim B. Mikheev 已提交
1541
/*
 * Create log segment files ahead of the given log endpoint.
 *
 * When XLOGfiles is positive, that many segments following the one that
 * contains endptr are created (or found already present).  Otherwise a
 * single following segment is prepared, and only once endptr lies in the
 * last quarter of its segment.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		nextId;
	uint32		nextSeg;
	bool		reuse;
	int			fd;
	int			n;

	XLByteToPrevSeg(endptr, nextId, nextSeg);

	if (XLOGfiles > 0)
	{
		for (n = 0; n < XLOGfiles; n++)
		{
			NextLogSeg(nextId, nextSeg);
			reuse = true;		/* an existing file is fine */
			fd = XLogFileInit(nextId, nextSeg, &reuse, true);
			close(fd);
		}
	}
	else if ((endptr.xrecoff - 1) % XLogSegSize >=
			 (uint32) (0.75 * XLogSegSize))
	{
		NextLogSeg(nextId, nextSeg);
		reuse = true;			/* an existing file is fine */
		fd = XLogFileInit(nextId, nextSeg, &reuse, true);
		close(fd);
	}
}

/*
 * Remove or move offline all log files older or equal to passed log/seg#
1577 1578 1579
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
1580 1581
 */
static void
1582
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
1583
{
1584 1585
	uint32		endlogId;
	uint32		endlogSeg;
B
Bruce Momjian 已提交
1586 1587 1588 1589
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[32];
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
1590

1591
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
V
Vadim B. Mikheev 已提交
1592 1593 1594

	xldir = opendir(XLogDir);
	if (xldir == NULL)
1595 1596
		elog(STOP, "could not open transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1597

T
Tom Lane 已提交
1598
	sprintf(lastoff, "%08X%08X", log, seg);
V
Vadim B. Mikheev 已提交
1599 1600 1601 1602

	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
T
Tom Lane 已提交
1603 1604 1605
		if (strlen(xlde->d_name) == 16 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 16 &&
			strcmp(xlde->d_name, lastoff) <= 0)
V
Vadim B. Mikheev 已提交
1606
		{
1607
			snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlde->d_name);
1608
			if (XLOG_archive_dir[0])
1609 1610 1611 1612 1613
			{
				elog(LOG, "archiving transaction log file %s",
					 xlde->d_name);
				elog(NOTICE, "archiving log files is not implemented!");
			}
1614
			else
1615 1616 1617
			{
				/*
				 * Before deleting the file, see if it can be recycled as
1618 1619
				 * a future log segment.  We allow recycling segments up
				 * to XLOGfiles + XLOGfileslop segments beyond the current
1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636
				 * XLOG location.
				 */
				if (InstallXLogFileSegment(endlogId, endlogSeg, path,
										   true, XLOGfiles + XLOGfileslop,
										   true))
				{
					elog(LOG, "recycled transaction log file %s",
						 xlde->d_name);
				}
				else
				{
					/* No need for any more future segments... */
					elog(LOG, "removing transaction log file %s",
						 xlde->d_name);
					unlink(path);
				}
			}
V
Vadim B. Mikheev 已提交
1637 1638 1639 1640
		}
		errno = 0;
	}
	if (errno)
1641 1642
		elog(STOP, "could not read transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1643 1644 1645
	closedir(xldir);
}

T
Tom Lane 已提交
1646 1647 1648 1649 1650
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 */
1651 1652 1653 1654 1655 1656 1657 1658 1659 1660
static void
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
1661
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
1662
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1663
	{
T
Tom Lane 已提交
1664
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1665 1666
			continue;

B
Bruce Momjian 已提交
1667
		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
1668 1669 1670 1671 1672 1673 1674 1675 1676 1677
		blk += sizeof(BkpBlock);

		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);

		if (reln)
		{
			buffer = XLogReadBuffer(true, reln, bkpb.block);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
B
Bruce Momjian 已提交
1678
				memcpy((char *) page, blk, BLCKSZ);
1679 1680 1681 1682 1683 1684 1685 1686 1687 1688
				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}

		blk += BLCKSZ;
	}
}

T
Tom Lane 已提交
1689 1690 1691 1692 1693 1694 1695
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
1696 1697 1698 1699 1700 1701 1702 1703 1704
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	crc64		crc;
	crc64		cbuf;
	int			i;
	uint32		len = record->xl_len;
	char	   *blk;

T
Tom Lane 已提交
1705
	/* Check CRC of rmgr data and record header */
1706
	INIT_CRC64(crc);
T
Tom Lane 已提交
1707
	COMP_CRC64(crc, XLogRecGetData(record), len);
B
Bruce Momjian 已提交
1708
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
1709
			   SizeOfXLogRecord - sizeof(crc64));
1710 1711
	FIN_CRC64(crc);

T
Tom Lane 已提交
1712
	if (!EQ_CRC64(record->xl_crc, crc))
1713
	{
1714
		elog(emode, "ReadRecord: bad resource manager data checksum in record at %X/%X",
T
Tom Lane 已提交
1715
			 recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1716
		return (false);
1717 1718
	}

T
Tom Lane 已提交
1719
	/* Check CRCs of backup blocks, if any */
B
Bruce Momjian 已提交
1720
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
1721
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1722
	{
T
Tom Lane 已提交
1723
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1724 1725 1726
			continue;

		INIT_CRC64(crc);
T
Tom Lane 已提交
1727 1728 1729
		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
		COMP_CRC64(crc, blk + sizeof(crc64),
				   sizeof(BkpBlock) - sizeof(crc64));
1730
		FIN_CRC64(crc);
B
Bruce Momjian 已提交
1731 1732
		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
														 * alignment */
1733

T
Tom Lane 已提交
1734
		if (!EQ_CRC64(cbuf, crc))
1735
		{
1736
			elog(emode, "ReadRecord: bad checksum of backup block %d in record at %X/%X",
T
Tom Lane 已提交
1737
				 i + 1, recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1738
			return (false);
1739
		}
T
Tom Lane 已提交
1740
		blk += sizeof(BkpBlock) + BLCKSZ;
1741 1742
	}

B
Bruce Momjian 已提交
1743
	return (true);
1744 1745
}

T
Tom Lane 已提交
1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is STOP.
 * (emode must be either STOP or LOG.)
 *
 * buffer is a workspace at least _INTL_MAXLOGRECSZ bytes long.  It is needed
 * to reassemble a record that crosses block boundaries.  Note that on
 * successful return, the returned record pointer always points at buffer.
 */
1759
static XLogRecord *
T
Tom Lane 已提交
1760
ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer)
1761
{
1762 1763
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
T
Tom Lane 已提交
1764 1765 1766 1767
	uint32		len,
				total_len;
	uint32		targetPageOff;
	unsigned	i;
1768
	bool		nextmode = false;
T
Tom Lane 已提交
1769 1770 1771 1772 1773 1774

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it
		 * this way, rather than just making a static array, for two
B
Bruce Momjian 已提交
1775 1776 1777 1778
		 * reasons: (1) no need to waste the storage in most
		 * instantiations of the backend; (2) a static char array isn't
		 * guaranteed to have any particular alignment, whereas malloc()
		 * will provide MAXALIGN'd storage.
T
Tom Lane 已提交
1779 1780 1781 1782
		 */
		readBuf = (char *) malloc(BLCKSZ);
		Assert(readBuf != NULL);
	}
1783

T
Tom Lane 已提交
1784
	if (RecPtr == NULL)
1785
	{
1786
		RecPtr = &tmpRecPtr;
1787
		nextmode = true;
T
Tom Lane 已提交
1788
		/* fast case if next record is on same page */
1789 1790 1791 1792 1793
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
T
Tom Lane 已提交
1794
		/* align old recptr to next page */
1795 1796 1797 1798 1799 1800 1801 1802
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
1803
	}
1804
	else if (!XRecOffIsValid(RecPtr->xrecoff))
1805
		elog(STOP, "ReadRecord: invalid record offset at %X/%X",
1806
			 RecPtr->xlogid, RecPtr->xrecoff);
1807

T
Tom Lane 已提交
1808
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
1809
	{
1810 1811
		close(readFile);
		readFile = -1;
1812
	}
T
Tom Lane 已提交
1813
	XLByteToSeg(*RecPtr, readId, readSeg);
1814
	if (readFile < 0)
1815
	{
T
Tom Lane 已提交
1816
		readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1817 1818
		if (readFile < 0)
			goto next_record_is_invalid;
1819
		readOff = (uint32) (-1);	/* force read to occur below */
1820 1821
	}

T
Tom Lane 已提交
1822 1823
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
	if (readOff != targetPageOff)
1824
	{
T
Tom Lane 已提交
1825 1826 1827
		readOff = targetPageOff;
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
		{
1828
			elog(emode, "ReadRecord: lseek of log file %u, segment %u, offset %u failed: %m",
1829
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1830 1831
			goto next_record_is_invalid;
		}
1832
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1833
		{
1834
			elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1835
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1836 1837
			goto next_record_is_invalid;
		}
1838
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, nextmode))
1839 1840
			goto next_record_is_invalid;
	}
T
Tom Lane 已提交
1841
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
1842 1843
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
1844
		elog(emode, "ReadRecord: contrecord is requested by %X/%X",
1845
			 RecPtr->xlogid, RecPtr->xrecoff);
1846 1847
		goto next_record_is_invalid;
	}
1848
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1849 1850

got_record:;
B
Bruce Momjian 已提交
1851

T
Tom Lane 已提交
1852
	/*
B
Bruce Momjian 已提交
1853 1854
	 * Currently, xl_len == 0 must be bad data, but that might not be true
	 * forever.  See note in XLogInsert.
T
Tom Lane 已提交
1855
	 */
1856 1857
	if (record->xl_len == 0)
	{
1858
		elog(emode, "ReadRecord: record with zero length at %X/%X",
T
Tom Lane 已提交
1859
			 RecPtr->xlogid, RecPtr->xrecoff);
1860 1861
		goto next_record_is_invalid;
	}
B
Bruce Momjian 已提交
1862

T
Tom Lane 已提交
1863
	/*
B
Bruce Momjian 已提交
1864 1865
	 * Compute total length of record including any appended backup
	 * blocks.
T
Tom Lane 已提交
1866 1867 1868 1869 1870 1871 1872 1873
	 */
	total_len = SizeOfXLogRecord + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;
		total_len += sizeof(BkpBlock) + BLCKSZ;
	}
B
Bruce Momjian 已提交
1874

T
Tom Lane 已提交
1875 1876 1877 1878 1879 1880
	/*
	 * Make sure it will fit in buffer (currently, it is mechanically
	 * impossible for this test to fail, but it seems like a good idea
	 * anyway).
	 */
	if (total_len > _INTL_MAXLOGRECSZ)
1881
	{
1882
		elog(emode, "ReadRecord: record length %u at %X/%X too long",
T
Tom Lane 已提交
1883
			 total_len, RecPtr->xlogid, RecPtr->xrecoff);
1884 1885 1886 1887
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
1888
		elog(emode, "ReadRecord: invalid resource manager id %u at %X/%X",
1889
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1890 1891 1892
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
T
Tom Lane 已提交
1893 1894
	len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
	if (total_len > len)
1895
	{
T
Tom Lane 已提交
1896 1897
		/* Need to reassemble record */
		XLogContRecord *contrecord;
B
Bruce Momjian 已提交
1898
		uint32		gotlen = len;
1899

T
Tom Lane 已提交
1900
		memcpy(buffer, record, len);
1901
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
1902
		buffer += len;
1903
		for (;;)
1904
		{
T
Tom Lane 已提交
1905 1906
			readOff += BLCKSZ;
			if (readOff >= XLogSegSize)
1907 1908
			{
				close(readFile);
T
Tom Lane 已提交
1909 1910 1911
				readFile = -1;
				NextLogSeg(readId, readSeg);
				readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1912 1913
				if (readFile < 0)
					goto next_record_is_invalid;
T
Tom Lane 已提交
1914
				readOff = 0;
1915 1916
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1917
			{
1918
				elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1919
					 readId, readSeg, readOff);
T
Tom Lane 已提交
1920 1921
				goto next_record_is_invalid;
			}
1922
			if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1923
				goto next_record_is_invalid;
T
Tom Lane 已提交
1924
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
1925
			{
1926
				elog(emode, "ReadRecord: there is no ContRecord flag in log file %u, segment %u, offset %u",
1927
					 readId, readSeg, readOff);
1928 1929
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1930
			contrecord = (XLogContRecord *) ((char *) readBuf + SizeOfXLogPHD);
B
Bruce Momjian 已提交
1931
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
1932
				total_len != (contrecord->xl_rem_len + gotlen))
1933
			{
1934
				elog(emode, "ReadRecord: invalid ContRecord length %u in log file %u, segment %u, offset %u",
T
Tom Lane 已提交
1935
					 contrecord->xl_rem_len, readId, readSeg, readOff);
1936 1937
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1938 1939
			len = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
1940
			{
B
Bruce Momjian 已提交
1941
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		if (BLCKSZ - SizeOfXLogRecord >= SizeOfXLogPHD +
			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
		{
B
Bruce Momjian 已提交
1955
			nextRecord = (XLogRecord *) ((char *) contrecord +
T
Tom Lane 已提交
1956 1957 1958 1959
				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
B
Bruce Momjian 已提交
1960
			SizeOfXLogPHD + SizeOfXLogContRecord +
T
Tom Lane 已提交
1961 1962 1963
			MAXALIGN(contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		return record;
1964 1965
	}

T
Tom Lane 已提交
1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
	return (XLogRecord *) buffer;
1977

T
Tom Lane 已提交
1978 1979 1980 1981 1982
next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	return NULL;
1983 1984
}

1985 1986 1987 1988
/*
 * Sanity-check the page header of an xlog page that was just read in.
 *
 * Returns true if the header looks plausible; otherwise reports the problem
 * at level emode and returns false.  The expected page address is derived
 * from the file-level readId/readSeg/readOff state, and lastReadSUI is
 * updated on success.
 *
 * This exists only to share code between the two page-read sites in
 * ReadRecord; it is not meant to be called from anywhere else.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI)
{
	XLogRecPtr	expected;

	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		elog(emode, "ReadRecord: invalid magic number %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_magic, readId, readSeg, readOff);
		return false;
	}

	/* no flag bits outside the known set may be on */
	if (hdr->xlp_info & ~XLP_ALL_FLAGS)
	{
		elog(emode, "ReadRecord: invalid info bits %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_info, readId, readSeg, readOff);
		return false;
	}

	/* the page must claim the address we believe we read it from */
	expected.xlogid = readId;
	expected.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, expected))
	{
		elog(emode, "ReadRecord: unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
			 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
			 readId, readSeg, readOff);
		return false;
	}

	/*
	 * A startup id lower than the previous page's, or more than a few
	 * counts higher, is not believable.  In theory up to 512 shutdown
	 * checkpoint records could sit on one 32K xlog page, so that is the
	 * largest legitimate jump.
	 *
	 * The comparison only makes sense when pages are read in sequence,
	 * which is why the caller tells us whether to apply it.
	 */
	if (checkSUI &&
		(hdr->xlp_sui < lastReadSUI || hdr->xlp_sui > lastReadSUI + 512))
	{
		/* translator: SUI = startup id */
		elog(emode, "ReadRecord: out-of-sequence SUI %u (after %u) in log file %u, segment %u, offset %u",
			 hdr->xlp_sui, lastReadSUI, readId, readSeg, readOff);
		return false;
	}

	lastReadSUI = hdr->xlp_sui;
	return true;
}

2043 2044 2045 2046
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */

void
XLOGPathInit(void)
{
	/* Init XLOG file paths */
2062 2063
	snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
2064 2065 2066 2067 2068 2069
}

static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
2070 2071
	char		buffer[BLCKSZ]; /* need not be aligned */

2072 2073 2074 2075 2076
#ifdef USE_LOCALE
	char	   *localeptr;
#endif

	/*
T
Tom Lane 已提交
2077
	 * Initialize version and compatibility-check fields
2078
	 */
T
Tom Lane 已提交
2079 2080
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
2081 2082 2083 2084 2085
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
#ifdef USE_LOCALE
	localeptr = setlocale(LC_COLLATE, NULL);
	if (!localeptr)
2086
		elog(STOP, "invalid LC_COLLATE setting");
2087 2088 2089
	StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
	localeptr = setlocale(LC_CTYPE, NULL);
	if (!localeptr)
2090
		elog(STOP, "invalid LC_CTYPE setting");
2091
	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
B
Bruce Momjian 已提交
2092

2093 2094
	/*
	 * Issue warning notice if initdb'ing in a locale that will not permit
B
Bruce Momjian 已提交
2095 2096
	 * LIKE index optimization.  This is not a clean place to do it, but I
	 * don't see a better place either...
2097 2098 2099 2100 2101
	 */
	if (!locale_is_like_safe())
		elog(NOTICE, "Initializing database with %s collation order."
			 "\n\tThis locale setting will prevent use of index optimization for"
			 "\n\tLIKE and regexp searches.  If you are concerned about speed of"
B
Bruce Momjian 已提交
2102
		  "\n\tsuch queries, you may wish to set LC_COLLATE to \"C\" and"
2103 2104
			 "\n\tre-initdb.  For more information see the Administrator's Guide.",
			 ControlFile->lc_collate);
2105
#else							/* not USE_LOCALE */
2106 2107
	strcpy(ControlFile->lc_collate, "C");
	strcpy(ControlFile->lc_ctype, "C");
2108
#endif   /* not USE_LOCALE */
2109

T
Tom Lane 已提交
2110 2111
	/* Contents are protected with a CRC */
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2112 2113
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2114 2115 2116
			   sizeof(ControlFileData) - sizeof(crc64));
	FIN_CRC64(ControlFile->crc);

2117
	/*
B
Bruce Momjian 已提交
2118 2119 2120 2121 2122
	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
	 * over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail
	 * when we check the contents of the file, but hopefully with a more
	 * specific error than "couldn't read pg_control".
2123 2124
	 */
	if (sizeof(ControlFileData) > BLCKSZ)
2125
		elog(STOP, "sizeof(ControlFileData) is larger than BLCKSZ; fix either one");
2126

2127 2128 2129
	memset(buffer, 0, BLCKSZ);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

2130 2131
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
2132
	if (fd < 0)
2133
		elog(STOP, "WriteControlFile: could not create control file (%s): %m",
2134 2135
			 ControlFilePath);

2136
	errno = 0;
2137
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
2138 2139 2140 2141
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2142
		elog(STOP, "WriteControlFile: write to control file failed: %m");
2143
	}
2144

2145
	if (pg_fsync(fd) != 0)
2146
		elog(STOP, "WriteControlFile: fsync of control file failed: %m");
2147 2148 2149 2150 2151 2152 2153

	close(fd);
}

static void
ReadControlFile(void)
{
2154
	crc64		crc;
2155 2156 2157 2158 2159 2160 2161
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
2162
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2163 2164

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2165
		elog(STOP, "read from control file failed: %m");
2166 2167 2168

	close(fd);

T
Tom Lane 已提交
2169 2170 2171 2172 2173 2174 2175
	/*
	 * Check for expected pg_control format version.  If this is wrong,
	 * the CRC check will likely fail because we'll be checking the wrong
	 * number of bytes.  Complaining about wrong version will probably be
	 * more enlightening than complaining about wrong CRC.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
2176 2177 2178 2179
		elog(STOP,
			 "The database cluster was initialized with PG_CONTROL_VERSION %d,\n"
			 "\tbut the server was compiled with PG_CONTROL_VERSION %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2180 2181 2182
			 ControlFile->pg_control_version, PG_CONTROL_VERSION);

	/* Now check the CRC. */
2183
	INIT_CRC64(crc);
B
Bruce Momjian 已提交
2184 2185
	COMP_CRC64(crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2186
			   sizeof(ControlFileData) - sizeof(crc64));
2187 2188
	FIN_CRC64(crc);

T
Tom Lane 已提交
2189
	if (!EQ_CRC64(crc, ControlFile->crc))
2190
		elog(STOP, "invalid checksum in control file");
2191

2192
	/*
B
Bruce Momjian 已提交
2193 2194
	 * Do compatibility checking immediately.  We do this here for 2
	 * reasons:
2195
	 *
B
Bruce Momjian 已提交
2196 2197
	 * (1) if the database isn't compatible with the backend executable, we
	 * want to abort before we can possibly do any damage;
2198 2199 2200
	 *
	 * (2) this code is executed in the postmaster, so the setlocale() will
	 * propagate to forked backends, which aren't going to read this file
B
Bruce Momjian 已提交
2201
	 * for themselves.	(These locale settings are considered critical
2202 2203
	 * compatibility items because they can affect sort order of indexes.)
	 */
T
Tom Lane 已提交
2204
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
2205 2206
		elog(STOP,
			 "The database cluster was initialized with CATALOG_VERSION_NO %d,\n"
2207
		   "\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n"
2208
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2209
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
2210
	if (ControlFile->blcksz != BLCKSZ)
2211 2212 2213 2214
		elog(STOP,
			 "The database cluster was initialized with BLCKSZ %d,\n"
			 "\tbut the backend was compiled with BLCKSZ %d.\n"
			 "\tIt looks like you need to initdb.",
2215 2216
			 ControlFile->blcksz, BLCKSZ);
	if (ControlFile->relseg_size != RELSEG_SIZE)
2217 2218 2219 2220
		elog(STOP,
			 "The database cluster was initialized with RELSEG_SIZE %d,\n"
			 "\tbut the backend was compiled with RELSEG_SIZE %d.\n"
			 "\tIt looks like you need to initdb.",
2221 2222 2223
			 ControlFile->relseg_size, RELSEG_SIZE);
#ifdef USE_LOCALE
	if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
2224
		elog(STOP,
2225
		   "The database cluster was initialized with LC_COLLATE '%s',\n"
2226 2227
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2228 2229
			 ControlFile->lc_collate);
	if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
2230 2231 2232 2233
		elog(STOP,
			 "The database cluster was initialized with LC_CTYPE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2234
			 ControlFile->lc_ctype);
2235
#else							/* not USE_LOCALE */
2236 2237
	if (strcmp(ControlFile->lc_collate, "C") != 0 ||
		strcmp(ControlFile->lc_ctype, "C") != 0)
2238
		elog(STOP,
2239
		"The database cluster was initialized with LC_COLLATE '%s' and\n"
2240 2241
			 "\tLC_CTYPE '%s', but the server was compiled without locale support.\n"
			 "\tIt looks like you need to initdb or recompile.",
2242
			 ControlFile->lc_collate, ControlFile->lc_ctype);
2243
#endif   /* not USE_LOCALE */
2244 2245
}

2246
void
2247
UpdateControlFile(void)
2248
{
2249
	int			fd;
2250

2251
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2252 2253
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2254
			   sizeof(ControlFileData) - sizeof(crc64));
2255 2256
	FIN_CRC64(ControlFile->crc);

2257
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
2258
	if (fd < 0)
2259
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2260

2261
	errno = 0;
2262
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2263 2264 2265 2266
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2267
		elog(STOP, "write to control file failed: %m");
2268
	}
2269

2270
	if (pg_fsync(fd) != 0)
2271
		elog(STOP, "fsync of control file failed: %m");
2272 2273 2274 2275

	close(fd);
}

2276
/*
T
Tom Lane 已提交
2277
 * Initialization of shared memory for XLOG
2278 2279
 */

2280
int
2281
XLOGShmemSize(void)
2282 2283 2284 2285
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

T
Tom Lane 已提交
2286 2287 2288
	return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
		+ BLCKSZ * XLOGbuffers +
		MAXALIGN(sizeof(ControlFileData));
2289 2290 2291 2292 2293
}

void
XLOGShmemInit(void)
{
2294
	bool		found;
2295

2296
	/* this must agree with space requested by XLOGShmemSize() */
2297 2298 2299
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

2300
	XLogCtl = (XLogCtlData *)
T
Tom Lane 已提交
2301 2302 2303 2304 2305
		ShmemInitStruct("XLOG Ctl",
						MAXALIGN(sizeof(XLogCtlData) +
								 sizeof(XLogRecPtr) * XLOGbuffers)
						+ BLCKSZ * XLOGbuffers,
						&found);
2306
	Assert(!found);
2307 2308 2309 2310
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &found);
	Assert(!found);

T
Tom Lane 已提交
2311
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
2312

T
Tom Lane 已提交
2313 2314 2315 2316 2317 2318 2319 2320
	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
	 * a multiple of the alignment for same, so no extra alignment padding
	 * is needed here.
	 */
	XLogCtl->xlblocks = (XLogRecPtr *)
		(((char *) XLogCtl) + sizeof(XLogCtlData));
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
B
Bruce Momjian 已提交
2321

T
Tom Lane 已提交
2322
	/*
B
Bruce Momjian 已提交
2323 2324
	 * Here, on the other hand, we must MAXALIGN to ensure the page
	 * buffers have worst-case alignment.
T
Tom Lane 已提交
2325 2326 2327 2328 2329 2330 2331
	 */
	XLogCtl->pages =
		((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
									  sizeof(XLogRecPtr) * XLOGbuffers);
	memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);

	/*
B
Bruce Momjian 已提交
2332 2333
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will
	 * fill in additional info.)
T
Tom Lane 已提交
2334 2335 2336 2337
	 */
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
2338
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
2339

2340 2341 2342 2343 2344 2345 2346
	/*
	 * If we are not in bootstrap mode, pg_control should already exist.
	 * Read and validate it immediately (see comments in ReadControlFile()
	 * for the reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
2347 2348 2349
}

/*
T
Tom Lane 已提交
2350 2351
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
2352 2353
 */
void
T
Tom Lane 已提交
2354
BootStrapXLOG(void)
2355
{
2356
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2357 2358
	char	   *buffer;
	XLogPageHeader page;
2359
	XLogRecord *record;
B
Bruce Momjian 已提交
2360
	bool		use_existent;
2361
	crc64		crc;
2362

T
Tom Lane 已提交
2363 2364 2365 2366
	/* Use malloc() to ensure buffer is MAXALIGNED */
	buffer = (char *) malloc(BLCKSZ);
	page = (XLogPageHeader) buffer;

2367 2368 2369
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
T
Tom Lane 已提交
2370
	checkPoint.ThisStartUpID = 0;
2371
	checkPoint.nextXid = FirstNormalTransactionId;
2372
	checkPoint.nextOid = BootstrapObjectIdData;
T
Tom Lane 已提交
2373
	checkPoint.time = time(NULL);
2374

2375 2376 2377 2378
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;

2379 2380 2381
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
2382
	page->xlp_sui = checkPoint.ThisStartUpID;
2383 2384
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
2385 2386 2387
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
2388 2389 2390
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
2391
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
2392
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
2393
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
2394

2395
	INIT_CRC64(crc);
T
Tom Lane 已提交
2396
	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
B
Bruce Momjian 已提交
2397
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
2398
			   SizeOfXLogRecord - sizeof(crc64));
2399 2400 2401
	FIN_CRC64(crc);
	record->xl_crc = crc;

2402 2403
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
2404

2405
	errno = 0;
T
Tom Lane 已提交
2406
	if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
2407 2408 2409 2410
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2411
		elog(STOP, "BootStrapXLOG failed to write log file: %m");
2412
	}
2413

T
Tom Lane 已提交
2414
	if (pg_fsync(openLogFile) != 0)
2415
		elog(STOP, "BootStrapXLOG failed to fsync log file: %m");
2416

T
Tom Lane 已提交
2417 2418
	close(openLogFile);
	openLogFile = -1;
2419

2420
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
2421 2422 2423
	/* Initialize pg_control status fields */
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
2424 2425 2426
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
2427
	ControlFile->checkPointCopy = checkPoint;
2428
	/* some additional ControlFile fields are set in WriteControlFile() */
2429

2430
	WriteControlFile();
2431 2432 2433

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
2434 2435
}

2436
/*
 * Format a time_t as "YYYY-MM-DD HH:MM:SS ZONE" in local time.
 *
 * Returns a pointer to a static buffer, so the result is overwritten by
 * the next call and must not be freed.  If the time cannot be converted
 * or formatted, the placeholder string "(unknown time)" is returned
 * instead.
 */
static char *
str_time(time_t tnow)
{
	/* generous size: time zone names can be long on some platforms */
	static char buf[128];
	struct tm  *tm = localtime(&tnow);

	/*
	 * localtime() may return NULL for unrepresentable times, and strftime()
	 * returns 0 (leaving buf indeterminate) when the result does not fit.
	 */
	if (tm == NULL ||
		strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", tm) == 0)
		snprintf(buf, sizeof(buf), "(unknown time)");

	return buf;
}

/*
T
Tom Lane 已提交
2449
 * This must be called ONCE during postmaster or standalone-backend startup
2450 2451
 */
void
T
Tom Lane 已提交
2452
StartupXLOG(void)
2453
{
2454 2455
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2456
	bool		wasShutdown;
2457
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
2458 2459 2460
				LastRec,
				checkPointLoc,
				EndOfLog;
2461
	XLogRecord *record;
T
Tom Lane 已提交
2462
	char	   *buffer;
2463

T
Tom Lane 已提交
2464 2465
	/* Use malloc() to ensure record buffer is MAXALIGNED */
	buffer = (char *) malloc(_INTL_MAXLOGRECSZ);
2466

T
Tom Lane 已提交
2467
	CritSectionCount++;
2468 2469

	/*
2470 2471
	 * Read control file and check XLOG status looks valid.
	 *
B
Bruce Momjian 已提交
2472 2473
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
2474
	 */
2475
	ReadControlFile();
2476

2477 2478 2479
	if (ControlFile->logSeg == 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
2480
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
2481
		elog(STOP, "control file context is broken");
2482 2483

	if (ControlFile->state == DB_SHUTDOWNED)
2484
		elog(LOG, "database system was shut down at %s",
2485
			 str_time(ControlFile->time));
2486
	else if (ControlFile->state == DB_SHUTDOWNING)
2487
		elog(LOG, "database system shutdown was interrupted at %s",
2488
			 str_time(ControlFile->time));
2489
	else if (ControlFile->state == DB_IN_RECOVERY)
2490
		elog(LOG, "database system was interrupted being in recovery at %s\n"
T
Tom Lane 已提交
2491
			 "\tThis probably means that some data blocks are corrupted\n"
2492
			 "\tand you will have to use the last backup for recovery.",
2493
			 str_time(ControlFile->time));
2494
	else if (ControlFile->state == DB_IN_PRODUCTION)
2495
		elog(LOG, "database system was interrupted at %s",
2496
			 str_time(ControlFile->time));
2497

T
Tom Lane 已提交
2498 2499 2500 2501
	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
2502
	record = ReadCheckpointRecord(ControlFile->checkPoint, 1, buffer);
T
Tom Lane 已提交
2503 2504 2505
	if (record != NULL)
	{
		checkPointLoc = ControlFile->checkPoint;
2506
		elog(LOG, "checkpoint record is at %X/%X",
T
Tom Lane 已提交
2507 2508 2509 2510
			 checkPointLoc.xlogid, checkPointLoc.xrecoff);
	}
	else
	{
2511
		record = ReadCheckpointRecord(ControlFile->prevCheckPoint, 2, buffer);
T
Tom Lane 已提交
2512 2513 2514
		if (record != NULL)
		{
			checkPointLoc = ControlFile->prevCheckPoint;
2515
			elog(LOG, "using previous checkpoint record at %X/%X",
T
Tom Lane 已提交
2516 2517 2518 2519
				 checkPointLoc.xlogid, checkPointLoc.xrecoff);
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
2520
			elog(STOP, "unable to locate a valid checkpoint record");
T
Tom Lane 已提交
2521 2522 2523 2524
	}
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
2525

2526
	elog(LOG, "redo record is at %X/%X; undo record is at %X/%X; shutdown %s",
2527
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
2528
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
T
Tom Lane 已提交
2529
		 wasShutdown ? "TRUE" : "FALSE");
2530
	elog(LOG, "next transaction id: %u; next oid: %u",
2531
		 checkPoint.nextXid, checkPoint.nextOid);
2532
	if (!TransactionIdIsNormal(checkPoint.nextXid))
2533
		elog(STOP, "invalid next transaction id");
2534 2535 2536

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
2537
	ShmemVariableCache->oidCount = 0;
2538

V
WAL  
Vadim B. Mikheev 已提交
2539
	ThisStartUpID = checkPoint.ThisStartUpID;
B
Bruce Momjian 已提交
2540
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
2541
		XLogCtl->RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
2542

2543
	if (XLByteLT(RecPtr, checkPoint.redo))
2544
		elog(STOP, "invalid redo in checkpoint record");
2545 2546 2547
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;

B
Bruce Momjian 已提交
2548
	if (XLByteLT(checkPoint.undo, RecPtr) ||
V
Vadim B. Mikheev 已提交
2549
		XLByteLT(checkPoint.redo, RecPtr))
2550
	{
T
Tom Lane 已提交
2551
		if (wasShutdown)
2552
			elog(STOP, "invalid redo/undo record in shutdown checkpoint");
V
WAL  
Vadim B. Mikheev 已提交
2553
		InRecovery = true;
2554 2555
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
2556
		InRecovery = true;
2557

V
WAL  
Vadim B. Mikheev 已提交
2558 2559
	/* REDO */
	if (InRecovery)
2560
	{
2561
		elog(LOG, "database system was not properly shut down; "
2562
			 "automatic recovery in progress");
2563 2564 2565 2566
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2567
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
2568

2569 2570
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
T
Tom Lane 已提交
2571
			record = ReadRecord(&(checkPoint.redo), STOP, buffer);
B
Bruce Momjian 已提交
2572
		else
2573 2574
		{
			/* read past CheckPoint record */
T
Tom Lane 已提交
2575
			record = ReadRecord(NULL, LOG, buffer);
2576
		}
2577

T
Tom Lane 已提交
2578
		if (record != NULL)
2579
		{
V
WAL  
Vadim B. Mikheev 已提交
2580
			InRedo = true;
2581
			elog(LOG, "redo starts at %X/%X",
2582
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2583 2584
			do
			{
2585 2586
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
2587
											ShmemVariableCache->nextXid))
2588 2589 2590 2591
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}
V
WAL  
Vadim B. Mikheev 已提交
2592 2593
				if (XLOG_DEBUG)
				{
B
Bruce Momjian 已提交
2594
					char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
2595

2596
					sprintf(buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
2597 2598
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
							EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
2599 2600
					xlog_outrec(buf, record);
					strcat(buf, " - ");
B
Bruce Momjian 已提交
2601 2602
					RmgrTable[record->xl_rmid].rm_desc(buf,
								record->xl_info, XLogRecGetData(record));
2603
					elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
2604 2605
				}

T
Tom Lane 已提交
2606
				if (record->xl_info & XLR_BKP_BLOCK_MASK)
2607 2608
					RestoreBkpBlocks(record, EndRecPtr);

2609
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
T
Tom Lane 已提交
2610 2611
				record = ReadRecord(NULL, LOG, buffer);
			} while (record != NULL);
2612
			elog(LOG, "redo done at %X/%X",
2613
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2614
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2615
			InRedo = false;
2616 2617
		}
		else
2618
			elog(LOG, "redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
2619 2620
	}

T
Tom Lane 已提交
2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631
	/*
	 * Init xlog buffer cache using the block containing the last valid
	 * record from the previous incarnation.
	 */
	record = ReadRecord(&LastRec, STOP, buffer);
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, openLogId, openLogSeg);
	openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
	openLogOff = 0;
	ControlFile->logId = openLogId;
	ControlFile->logSeg = openLogSeg + 1;
V
WAL  
Vadim B. Mikheev 已提交
2632
	Insert = &XLogCtl->Insert;
2633
	Insert->PrevRecord = LastRec;
B
Bruce Momjian 已提交
2634 2635

	/*
2636 2637
	 * If the next record will go to the new page then initialize for that
	 * one.
T
Tom Lane 已提交
2638
	 */
2639 2640 2641 2642
	if ((BLCKSZ - EndOfLog.xrecoff % BLCKSZ) < SizeOfXLogRecord)
		EndOfLog.xrecoff += (BLCKSZ - EndOfLog.xrecoff % BLCKSZ);
	if (EndOfLog.xrecoff % BLCKSZ == 0)
	{
2643 2644 2645 2646
		XLogRecPtr	NewPageEndPtr;

		NewPageEndPtr = EndOfLog;
		if (NewPageEndPtr.xrecoff >= XLogFileSize)
2647
		{
2648 2649 2650
			/* crossing a logid boundary */
			NewPageEndPtr.xlogid += 1;
			NewPageEndPtr.xrecoff = BLCKSZ;
2651 2652
		}
		else
2653 2654
			NewPageEndPtr.xrecoff += BLCKSZ;
		XLogCtl->xlblocks[0] = NewPageEndPtr;
2655 2656 2657 2658 2659
		Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC;
		if (InRecovery)
			Insert->currpage->xlp_sui = ThisStartUpID;
		else
			Insert->currpage->xlp_sui = ThisStartUpID + 1;
2660 2661
		Insert->currpage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
		Insert->currpage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
2662
		/* rest of buffer was zeroed in XLOGShmemInit */
2663
		Insert->currpos = (char *) Insert->currpage + SizeOfXLogPHD;
2664 2665 2666 2667 2668 2669
	}
	else
	{
		XLogCtl->xlblocks[0].xlogid = openLogId;
		XLogCtl->xlblocks[0].xrecoff =
			((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
2670

2671 2672
		/*
		 * Tricky point here: readBuf contains the *last* block that the
2673
		 * LastRec record spans, not the one it starts in.	The last block
2674
		 * is indeed the one we want to use.
2675 2676 2677 2678 2679 2680 2681 2682
		 */
		Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
		memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
		Insert->currpos = (char *) Insert->currpage +
			(EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
		/* Make sure rest of page is zero */
		memset(Insert->currpos, 0, INSERT_FREESPACE(Insert));
	}
V
WAL  
Vadim B. Mikheev 已提交
2683

T
Tom Lane 已提交
2684
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
2685

T
Tom Lane 已提交
2686 2687 2688
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
2689

T
Tom Lane 已提交
2690 2691
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
2692

V
Vadim B. Mikheev 已提交
2693
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
2694 2695 2696
	/* UNDO */
	if (InRecovery)
	{
2697 2698 2699
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
2700
			elog(LOG, "undo starts at %X/%X",
2701
				 RecPtr.xlogid, RecPtr.xrecoff);
2702 2703
			do
			{
T
Tom Lane 已提交
2704
				record = ReadRecord(&RecPtr, STOP, buffer);
2705
				if (TransactionIdIsValid(record->xl_xid) &&
2706
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
2707
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
2708 2709
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
2710
			elog(LOG, "undo done at %X/%X",
2711
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2712 2713
		}
		else
2714
			elog(LOG, "undo is not required");
2715
	}
V
WAL  
Vadim B. Mikheev 已提交
2716
#endif
2717

V
WAL  
Vadim B. Mikheev 已提交
2718
	if (InRecovery)
2719
	{
T
Tom Lane 已提交
2720 2721 2722 2723 2724 2725 2726
		/*
		 * In case we had to use the secondary checkpoint, make sure that
		 * it will still be shown as the secondary checkpoint after this
		 * CreateCheckPoint operation; we don't want the broken primary
		 * checkpoint to become prevCheckPoint...
		 */
		ControlFile->checkPoint = checkPointLoc;
2727
		CreateCheckPoint(true);
V
WAL  
Vadim B. Mikheev 已提交
2728
		XLogCloseRelationCache();
2729
	}
2730

T
Tom Lane 已提交
2731 2732 2733 2734
	/*
	 * Preallocate additional log files, if wanted.
	 */
	PreallocXlogFiles(EndOfLog);
2735

V
WAL  
Vadim B. Mikheev 已提交
2736
	InRecovery = false;
2737 2738 2739 2740 2741

	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2742 2743 2744
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

2745 2746 2747
	/* Start up the commit log, too */
	StartupCLOG();

2748
	elog(LOG, "database system is ready");
2749
	CritSectionCount--;
2750

T
Tom Lane 已提交
2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	free(buffer);
}

2766 2767 2768 2769
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 * whichChkpt = 1 for "primary", 2 for "secondary", merely informative
 */
T
Tom Lane 已提交
2770 2771
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr,
2772
					 int whichChkpt,
T
Tom Lane 已提交
2773 2774 2775 2776 2777 2778
					 char *buffer)
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
2779 2780 2781
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint link in control file" :
				   "invalid secondary checkpoint link in control file"));
T
Tom Lane 已提交
2782 2783 2784 2785 2786 2787 2788
		return NULL;
	}

	record = ReadRecord(&RecPtr, LOG, buffer);

	if (record == NULL)
	{
2789 2790 2791
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint record" :
				   "invalid secondary checkpoint record"));
T
Tom Lane 已提交
2792 2793 2794 2795
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
2796
		elog(LOG, (whichChkpt == 1 ?
2797 2798
			 "invalid resource manager id in primary checkpoint record" :
		  "invalid resource manager id in secondary checkpoint record"));
T
Tom Lane 已提交
2799 2800 2801 2802 2803
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
2804 2805 2806
		elog(LOG, (whichChkpt == 1 ?
				   "invalid xl_info in primary checkpoint record" :
				   "invalid xl_info in secondary checkpoint record"));
T
Tom Lane 已提交
2807 2808 2809 2810
		return NULL;
	}
	if (record->xl_len != sizeof(CheckPoint))
	{
2811 2812 2813
		elog(LOG, (whichChkpt == 1 ?
				   "invalid length of primary checkpoint record" :
				   "invalid length of secondary checkpoint record"));
T
Tom Lane 已提交
2814 2815 2816
		return NULL;
	}
	return record;
2817 2818
}

V
WAL  
Vadim B. Mikheev 已提交
2819
/*
T
Tom Lane 已提交
2820
 * Postmaster uses this to initialize ThisStartUpID & RedoRecPtr from
2821
 * XLogCtlData located in shmem after successful startup.
V
WAL  
Vadim B. Mikheev 已提交
2822 2823 2824 2825 2826
 */
void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
2827 2828 2829 2830
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

/*
 * CheckPoint process called by postmaster saves copy of new RedoRecPtr
 * in shmem (using SetRedoRecPtr).  When checkpointer completes, postmaster
 * calls GetRedoRecPtr to update its own copy of RedoRecPtr, so that
 * subsequently-spawned backends will start out with a reasonably up-to-date
 * local RedoRecPtr.  Since these operations are not protected by any lock
 * and copying an XLogRecPtr isn't atomic, it's unsafe to use either of these
 * routines at other times!
 *
 * Note: once spawned, a backend must update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr while holding the insert lock.  This is
 * done in XLogInsert().
 */
void
SetRedoRecPtr(void)
{
	XLogCtl->RedoRecPtr = RedoRecPtr;
}

void
GetRedoRecPtr(void)
{
	/* Refresh this process's RedoRecPtr from shared memory (no locking). */
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

2855
/*
T
Tom Lane 已提交
2856
 * This must be called ONCE during postmaster or standalone-backend shutdown
2857 2858
 */
void
T
Tom Lane 已提交
2859
ShutdownXLOG(void)
2860
{
2861
	elog(LOG, "shutting down");
2862

T
Tom Lane 已提交
2863 2864 2865
	/* suppress in-transaction check in CreateCheckPoint */
	MyLastRecPtr.xrecoff = 0;

2866
	CritSectionCount++;
V
Vadim B. Mikheev 已提交
2867
	CreateDummyCaches();
2868
	CreateCheckPoint(true);
2869
	ShutdownCLOG();
2870
	CritSectionCount--;
2871

2872
	elog(LOG, "database system is shut down");
2873 2874
}

T
Tom Lane 已提交
2875 2876 2877
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
2878 2879 2880
void
CreateCheckPoint(bool shutdown)
{
2881 2882 2883
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
2884
	XLogRecData rdata;
2885
	uint32		freespace;
V
Vadim B. Mikheev 已提交
2886 2887 2888 2889 2890
	uint32		_logId;
	uint32		_logSeg;

	if (MyLastRecPtr.xrecoff != 0)
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
B
Bruce Momjian 已提交
2891

2892 2893
	/*
	 * The CheckpointLock can be held for quite a while, which is not good
2894 2895 2896 2897 2898
	 * because we won't respond to a cancel/die request while waiting for
	 * an LWLock.  (But the alternative of using a regular lock won't work
	 * for background checkpoint processes, which are not regular
	 * backends.) So, rather than use a plain LWLockAcquire, use this
	 * kluge to allow an interrupt to be accepted while we are waiting:
2899 2900
	 */
	while (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
V
Vadim B. Mikheev 已提交
2901
	{
2902 2903
		CHECK_FOR_INTERRUPTS();
		sleep(1);
V
Vadim B. Mikheev 已提交
2904
	}
2905

2906 2907
	START_CRIT_SECTION();

2908 2909 2910 2911 2912 2913
	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
T
Tom Lane 已提交
2914 2915

	memset(&checkPoint, 0, sizeof(checkPoint));
V
WAL  
Vadim B. Mikheev 已提交
2916
	checkPoint.ThisStartUpID = ThisStartUpID;
T
Tom Lane 已提交
2917
	checkPoint.time = time(NULL);
2918

2919
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
2920 2921 2922 2923

	/*
	 * If this isn't a shutdown, and we have not inserted any XLOG records
	 * since the start of the last checkpoint, skip the checkpoint.  The
B
Bruce Momjian 已提交
2924 2925 2926 2927 2928 2929
	 * idea here is to avoid inserting duplicate checkpoints when the
	 * system is idle.	That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the
	 * previous checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
2930 2931
	 *
	 * We have to make two tests to determine that nothing has happened since
B
Bruce Momjian 已提交
2932 2933 2934
	 * the start of the last checkpoint: current insertion point must
	 * match the end of the last checkpoint record, and its redo pointer
	 * must point to itself.
T
Tom Lane 已提交
2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948
	 */
	if (!shutdown)
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
2949 2950
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're
	 * off doing the buffer flush work.  Those XLOG records are logically
B
Bruce Momjian 已提交
2962
	 * after the checkpoint, even though physically before it.	Got that?
T
Tom Lane 已提交
2963 2964
	 */
	freespace = INSERT_FREESPACE(Insert);
2965 2966
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
2967 2968
		(void) AdvanceXLInsertBuffer();
		/* OK to ignore update return flag, since we will do flush anyway */
2969 2970
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
T
Tom Lane 已提交
2971
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
2972

T
Tom Lane 已提交
2973 2974 2975 2976
	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls;
	 * this must be done while holding the insert lock.
	 */
2977
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
B
Bruce Momjian 已提交
2978

T
Tom Lane 已提交
2979
	/*
B
Bruce Momjian 已提交
2980 2981 2982 2983
	 * Get UNDO record ptr - this is oldest of PROC->logRec values. We do
	 * this while holding insert lock to ensure that we won't miss any
	 * about-to-commit transactions (UNDO must include all xacts that have
	 * commits after REDO point).
2984 2985 2986 2987 2988 2989 2990 2991 2992
	 *
	 * XXX temporarily ifdef'd out to avoid three-way deadlock condition:
	 * GetUndoRecPtr needs to grab SInvalLock to ensure that it is looking
	 * at a stable set of proc records, but grabbing SInvalLock while holding
	 * WALInsertLock is no good.  GetNewTransactionId may cause a WAL record
	 * to be written while holding XidGenLock, and GetSnapshotData needs to
	 * get XidGenLock while holding SInvalLock, so there's a risk of deadlock.
	 * Need to find a better solution.  See pgsql-hackers discussion of
	 * 17-Dec-01.
T
Tom Lane 已提交
2993
	 */
2994
#ifdef NOT_USED
T
Tom Lane 已提交
2995 2996 2997
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
2998
		elog(STOP, "active transaction while database system is shutting down");
2999
#endif
T
Tom Lane 已提交
3000 3001 3002 3003 3004

	/*
	 * Now we can release insert lock, allowing other xacts to proceed
	 * even while we are flushing disk buffers.
	 */
3005
	LWLockRelease(WALInsertLock);
3006

3007
	LWLockAcquire(XidGenLock, LW_SHARED);
3008
	checkPoint.nextXid = ShmemVariableCache->nextXid;
3009
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
3010

3011
	LWLockAcquire(OidGenLock, LW_SHARED);
3012
	checkPoint.nextOid = ShmemVariableCache->nextOid;
3013 3014
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
3015
	LWLockRelease(OidGenLock);
3016

T
Tom Lane 已提交
3017
	/*
B
Bruce Momjian 已提交
3018 3019
	 * Having constructed the checkpoint record, ensure all shmem disk
	 * buffers are flushed to disk.
T
Tom Lane 已提交
3020
	 */
V
Vadim B. Mikheev 已提交
3021
	FlushBufferPool();
3022

3023 3024 3025
	/* And commit-log buffers, too */
	CheckPointCLOG();

T
Tom Lane 已提交
3026 3027 3028
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
3029
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3030
	rdata.data = (char *) (&checkPoint);
3031 3032 3033
	rdata.len = sizeof(checkPoint);
	rdata.next = NULL;

T
Tom Lane 已提交
3034 3035 3036 3037 3038 3039
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
3040

T
Tom Lane 已提交
3041 3042 3043 3044 3045
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record,
	 * recptr = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
3046
		elog(STOP, "concurrent transaction log activity while database system is shutting down");
3047

T
Tom Lane 已提交
3048
	/*
3049 3050 3051 3052 3053 3054 3055
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
	 *
	 * With UNDO support: oldest item is redo or undo, whichever is older;
	 * but watch out for case that undo = 0.
	 *
	 * Without UNDO support: just use the redo pointer.  This allows xlog
3056 3057
	 * space to be freed much faster when there are long-running
	 * transactions.
T
Tom Lane 已提交
3058
	 */
3059
#ifdef NOT_USED
B
Bruce Momjian 已提交
3060
	if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
T
Tom Lane 已提交
3061 3062 3063 3064
		XLByteLT(ControlFile->checkPointCopy.undo,
				 ControlFile->checkPointCopy.redo))
		XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
	else
3065
#endif
T
Tom Lane 已提交
3066
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
3067

T
Tom Lane 已提交
3068 3069 3070
	/*
	 * Update the control file.
	 */
3071
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3072 3073
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
3074 3075 3076
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
3077 3078
	ControlFile->time = time(NULL);
	UpdateControlFile();
3079
	LWLockRelease(ControlFileLock);
3080

V
Vadim B. Mikheev 已提交
3081
	/*
T
Tom Lane 已提交
3082 3083
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
V
Vadim B. Mikheev 已提交
3084 3085 3086
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
3087
		PrevLogSeg(_logId, _logSeg);
3088
		MoveOfflineLogs(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
3089 3090
	}

T
Tom Lane 已提交
3091 3092 3093 3094 3095 3096 3097 3098
	/*
	 * Make more log segments if needed.  (Do this after deleting offline
	 * log segments, to avoid having peak disk space usage higher than
	 * necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

3099
	LWLockRelease(CheckpointLock);
V
Vadim B. Mikheev 已提交
3100

3101
	END_CRIT_SECTION();
3102
}
V
WAL  
Vadim B. Mikheev 已提交
3103

T
Tom Lane 已提交
3104 3105 3106
/*
 * Write a NEXTOID log record
 */
3107 3108 3109
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
3110
	XLogRecData rdata;
3111

3112
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3113
	rdata.data = (char *) (&nextOid);
3114 3115 3116 3117
	rdata.len = sizeof(Oid);
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
}
V
WAL  
Vadim B. Mikheev 已提交
3118

T
Tom Lane 已提交
3119 3120 3121
/*
 * XLOG resource manager's routines
 */
V
WAL  
Vadim B. Mikheev 已提交
3122 3123 3124
/*
 * Redo routine for XLOG resource manager records.
 *
 * Replays NEXTOID and checkpoint records by updating the XID/OID counters
 * in ShmemVariableCache.  Record types other than the three handled below
 * are silently ignored.  The lsn argument is not used.
 */
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
	/* strip the XLR_INFO_MASK bits to get the record type */
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		/* only ever move the OID counter forward */
		if (ShmemVariableCache->nextOid < nextOid)
		{
			ShmemVariableCache->nextOid = nextOid;
			ShmemVariableCache->oidCount = 0;
		}
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		if (ShmemVariableCache->nextOid < checkPoint.nextOid)
		{
			ShmemVariableCache->nextOid = checkPoint.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
	}
}
B
Bruce Momjian 已提交
3164

V
WAL  
Vadim B. Mikheev 已提交
3165 3166 3167 3168
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
	/* Intentionally empty: this routine performs no undo work. */
	return;
}
B
Bruce Momjian 已提交
3169

V
WAL  
Vadim B. Mikheev 已提交
3170
void
B
Bruce Momjian 已提交
3171
xlog_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
3172
{
B
Bruce Momjian 已提交
3173
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
3174

T
Tom Lane 已提交
3175 3176
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
3177
	{
B
Bruce Momjian 已提交
3178 3179
		CheckPoint *checkpoint = (CheckPoint *) rec;

3180
		sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; "
B
Bruce Momjian 已提交
3181 3182 3183 3184 3185 3186
				"sui %u; xid %u; oid %u; %s",
				checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
				checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
				checkpoint->ThisStartUpID, checkpoint->nextXid,
				checkpoint->nextOid,
			 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
3187
	}
3188 3189
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
3190
		Oid			nextOid;
3191 3192 3193 3194

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
V
WAL  
Vadim B. Mikheev 已提交
3195 3196 3197 3198 3199 3200 3201
	else
		strcat(buf, "UNKNOWN");
}

/*
 * Append a description of an XLOG record's header to buf: its prev and
 * xact-prev pointers, its xid, the number of backup blocks flagged in
 * xl_info (only if nonzero), and the name of its resource manager.
 *
 * buf must already hold a NUL-terminated string; no bounds checking is
 * done, so the caller must supply a buffer with enough room.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	int			bkpb;
	int			i;

	sprintf(buf + strlen(buf), "prev %X/%X; xprev %X/%X; xid %u",
			record->xl_prev.xlogid, record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid, record->xl_xact_prev.xrecoff,
			record->xl_xid);

	/* count the backup-block flag bits set in xl_info */
	for (i = 0, bkpb = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & (XLR_SET_BKP_BLOCK(i))))
			continue;
		bkpb++;
	}

	if (bkpb)
		sprintf(buf + strlen(buf), "; bkpb %d", bkpb);

	sprintf(buf + strlen(buf), ": %s",
			RmgrTable[record->xl_rmid].rm_name);
}
3223 3224 3225 3226 3227 3228 3229 3230 3231


/*
 * GUC support routines
 */

/*
 * Check whether the given string names a sync method that is usable in
 * this build.
 *
 * The comparison is case-insensitive.  "fsync" is always accepted; the
 * other names are accepted only when the corresponding compile-time symbol
 * (HAVE_FDATASYNC, OPEN_SYNC_FLAG, OPEN_DATASYNC_FLAG) is defined.
 * Returns true if the name is accepted, false otherwise.
 */
bool
check_xlog_sync_method(const char *method)
{
	if (strcasecmp(method, "fsync") == 0)
		return true;
#ifdef HAVE_FDATASYNC
	if (strcasecmp(method, "fdatasync") == 0)
		return true;
#endif
#ifdef OPEN_SYNC_FLAG
	if (strcasecmp(method, "open_sync") == 0)
		return true;
#endif
#ifdef OPEN_DATASYNC_FLAG
	if (strcasecmp(method, "open_datasync") == 0)
		return true;
#endif
	return false;
}

void
assign_xlog_sync_method(const char *method)
{
B
Bruce Momjian 已提交
3252 3253
	int			new_sync_method;
	int			new_sync_bit;
3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283

	if (strcasecmp(method, "fsync") == 0)
	{
		new_sync_method = SYNC_METHOD_FSYNC;
		new_sync_bit = 0;
	}
#ifdef HAVE_FDATASYNC
	else if (strcasecmp(method, "fdatasync") == 0)
	{
		new_sync_method = SYNC_METHOD_FDATASYNC;
		new_sync_bit = 0;
	}
#endif
#ifdef OPEN_SYNC_FLAG
	else if (strcasecmp(method, "open_sync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_SYNC_FLAG;
	}
#endif
#ifdef OPEN_DATASYNC_FLAG
	else if (strcasecmp(method, "open_datasync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_DATASYNC_FLAG;
	}
#endif
	else
	{
		/* Can't get here unless guc.c screwed up */
3284
		elog(ERROR, "bogus wal_sync_method %s", method);
3285 3286 3287 3288 3289 3290 3291
		new_sync_method = 0;	/* keep compiler quiet */
		new_sync_bit = 0;
	}

	if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
	{
		/*
B
Bruce Momjian 已提交
3292 3293 3294 3295
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new
		 * flag bit) at next use.
3296 3297 3298 3299
		 */
		if (openLogFile >= 0)
		{
			if (pg_fsync(openLogFile) != 0)
3300
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3301 3302 3303 3304
					 openLogId, openLogSeg);
			if (open_sync_bit != new_sync_bit)
			{
				if (close(openLogFile) != 0)
3305
					elog(STOP, "close of log file %u, segment %u failed: %m",
3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
		}
		sync_method = new_sync_method;
		open_sync_bit = new_sync_bit;
	}
}


/*
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 */
static void
issue_xlog_fsync(void)
{
	switch (sync_method)
	{
3324
		case SYNC_METHOD_FSYNC:
3325
			if (pg_fsync(openLogFile) != 0)
3326
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3327 3328 3329 3330 3331
					 openLogId, openLogSeg);
			break;
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(openLogFile) != 0)
3332
				elog(STOP, "fdatasync of log file %u, segment %u failed: %m",
3333 3334 3335 3336 3337 3338 3339
					 openLogId, openLogSeg);
			break;
#endif
		case SYNC_METHOD_OPEN:
			/* write synced it already */
			break;
		default:
3340
			elog(STOP, "bogus wal_sync_method %d", sync_method);
3341 3342 3343
			break;
	}
}