/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.99 2002/08/04 06:53:10 thomas Exp $
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <fcntl.h>
T
Tom Lane 已提交
18
#include <signal.h>
19 20 21
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
22
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
23 24
#include <sys/types.h>
#include <dirent.h>
25
#include <locale.h>
26

27
#include "access/clog.h"
28
#include "access/transam.h"
29
#include "access/xact.h"
30 31
#include "access/xlog.h"
#include "access/xlogutils.h"
32
#include "catalog/catversion.h"
T
Tom Lane 已提交
33
#include "catalog/pg_control.h"
34 35
#include "storage/bufpage.h"
#include "storage/lwlock.h"
36
#include "storage/pmsignal.h"
37
#include "storage/proc.h"
38
#include "storage/sinval.h"
39
#include "storage/spin.h"
40
#include "utils/builtins.h"
41
#include "utils/relcache.h"
V
WAL  
Vadim B. Mikheev 已提交
42 43
#include "miscadmin.h"

44

45 46 47
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.  We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * Resulting macros:
 *	OPEN_SYNC_FLAG		 O_SYNC if defined, else O_FSYNC if defined
 *	OPEN_DATASYNC_FLAG	 O_DSYNC, only if it is distinct from OPEN_SYNC_FLAG
 *	DEFAULT_SYNC_METHOD* the preferred default: open_datasync, then
 *						 fdatasync, then fsync
 */
#define SYNC_METHOD_FSYNC		0
#define SYNC_METHOD_FDATASYNC	1
#define SYNC_METHOD_OPEN		2		/* used for both O_SYNC and
										 * O_DSYNC */

#if defined(O_SYNC)
#define OPEN_SYNC_FLAG	   O_SYNC
#elif defined(O_FSYNC)
#define OPEN_SYNC_FLAG	   O_FSYNC
#endif

#if defined(OPEN_SYNC_FLAG)
#if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
#define OPEN_DATASYNC_FLAG	  O_DSYNC
#endif
#endif

#if defined(OPEN_DATASYNC_FLAG)
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_OPEN
#define DEFAULT_SYNC_FLAGBIT	   OPEN_DATASYNC_FLAG
#elif defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD_STR    "fdatasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_FDATASYNC
#define DEFAULT_SYNC_FLAGBIT	   0
#else
#define DEFAULT_SYNC_METHOD_STR    "fsync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_FSYNC
#define DEFAULT_SYNC_FLAGBIT	   0
#endif


T
Tom Lane 已提交
87 88
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
89
int			XLOGbuffers = 8;
90
int			XLOGfiles = 0;		/* # of files to preallocate during ckpt */
T
Tom Lane 已提交
91
int			XLOG_DEBUG = 0;
92 93
char	   *XLOG_sync_method = NULL;
const char	XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
B
Bruce Momjian 已提交
94 95
char		XLOG_archive_dir[MAXPGPATH];		/* null string means
												 * delete 'em */
T
Tom Lane 已提交
96

97
/*
 * XLOGfileslop is used in the code as the allowed "fuzz" in the number of
 * preallocated XLOG segments --- we try to have at least XLOGfiles advance
 * segments but no more than XLOGfiles+XLOGfileslop segments.  This could
 * be made a separate GUC variable, but at present I think it's sufficient
 * to hardwire it as 2*CheckPointSegments+1.  Under normal conditions, a
 * checkpoint will free no more than 2*CheckPointSegments log segments, and
 * we want to recycle all of them; the +1 allows boundary cases to happen
 * without wasting a delete/create-segment cycle.
 *
 * Note the macro re-reads CheckPointSegments on every expansion.
 */

#define XLOGfileslop	(2*CheckPointSegments + 1)


111 112 113 114 115 116
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int	sync_method = DEFAULT_SYNC_METHOD;
static int	open_sync_bit = DEFAULT_SYNC_FLAGBIT;

/* open() flag bit to use for XLOG files; zero whenever enableFsync is off */
#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)

#define MinXLOGbuffers	4

T
Tom Lane 已提交
119 120 121 122 123

/*
 * ThisStartUpID will be same in all backends --- it identifies current
 * instance of the database system.
 */
V
WAL  
Vadim B. Mikheev 已提交
124 125
StartUpID	ThisStartUpID = 0;

T
Tom Lane 已提交
126 127
/* Are we doing recovery by reading XLOG? */
bool		InRecovery = false;
128

T
Tom Lane 已提交
129 130
/*
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
131 132
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then the current
 * xact hasn't yet inserted any transaction-controlled XLOG records.
T
Tom Lane 已提交
133 134
 *
 * Note that XLOG records inserted outside transaction control are not
135 136 137 138
 * reflected into MyLastRecPtr.  They do, however, cause MyXactMadeXLogEntry
 * to be set true.  The latter can be used to test whether the current xact
 * made any loggable changes (including out-of-xact changes, such as
 * sequence updates).
T
Tom Lane 已提交
139 140
 */
XLogRecPtr	MyLastRecPtr = {0, 0};
V
Vadim B. Mikheev 已提交
141

142 143
bool		MyXactMadeXLogEntry = false;

T
Tom Lane 已提交
144 145 146
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts, transaction-controlled
147
 * or not.  ProcLastRecEnd is similar but points to end+1 of last record.
T
Tom Lane 已提交
148 149
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
150

151 152
XLogRecPtr	ProcLastRecEnd = {0, 0};

T
Tom Lane 已提交
153 154 155
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
156
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
157
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
158 159 160
 * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.
T
Tom Lane 已提交
161 162
 */
static XLogRecPtr RedoRecPtr;
163

T
Tom Lane 已提交
164 165 166 167 168 169 170 171 172
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 * We do a lot of pushups to minimize the amount of access to lockable
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 * but is updated when convenient.  Again, it exists for the convenience of
 * code that is already holding WALInsertLock but not the other locks.
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint (ensures only one
 * checkpointer at a time; even though the postmaster won't launch
 * parallel checkpoint processes, we need this because manual checkpoints
 * could be launched simultaneously).
 *
 *----------
 */
typedef struct XLogwrtRqst
222
{
T
Tom Lane 已提交
223 224
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
225
} XLogwrtRqst;
226

T
Tom Lane 已提交
227
typedef struct XLogwrtResult
228
{
T
Tom Lane 已提交
229 230
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
231
} XLogwrtResult;
232

T
Tom Lane 已提交
233 234 235
/*
 * Shared state data for XLogInsert.
 */
236 237
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
238 239 240 241 242 243
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
244 245
} XLogCtlInsert;

T
Tom Lane 已提交
246 247 248
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
249 250
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
251 252
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	uint16		curridx;		/* cache index of next block to write */
253 254
} XLogCtlWrite;

T
Tom Lane 已提交
255 256 257
/*
 * Total shared-memory state for XLOG.
 */
258 259
typedef struct XLogCtlData
{
260
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
261
	XLogCtlInsert Insert;
T
Tom Lane 已提交
262
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
263 264
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
265
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
266 267
	XLogCtlWrite Write;

T
Tom Lane 已提交
268 269
	/*
	 * These values do not change after startup, although the pointed-to
270 271 272
	 * pages and xlblocks values certainly do.	Permission to read/write
	 * the pages and xlblocks values depends on WALInsertLock and
	 * WALWriteLock.
T
Tom Lane 已提交
273
	 */
B
Bruce Momjian 已提交
274 275 276 277 278
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32		XLogCacheByte;	/* # bytes in xlog buffers */
	uint32		XLogCacheBlck;	/* highest allocated xlog buffer index */
	StartUpID	ThisStartUpID;
T
Tom Lane 已提交
279

280
	/* This value is not protected by *any* lock... */
281 282
	/* see SetSavedRedoRecPtr/GetSavedRedoRecPtr */
	XLogRecPtr	SavedRedoRecPtr;
T
Tom Lane 已提交
283

B
Bruce Momjian 已提交
284
	slock_t		info_lck;		/* locks shared LogwrtRqst/LogwrtResult */
285 286
} XLogCtlData;

287
static XLogCtlData *XLogCtl = NULL;
288

289
/*
T
Tom Lane 已提交
290
 * We maintain an image of pg_control in shared memory.
291
 */
292
static ControlFileData *ControlFile = NULL;
293

T
Tom Lane 已提交
294 295 296 297 298
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */

/*
 * Free space remaining in the current xlog page buffer.
 * Insert is a pointer to the insert state; its currpos and currpage
 * fields are read.
 */
#define INSERT_FREESPACE(Insert)  \
	(BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * XLogCtl->xlblocks[curridx] supplies the log id and an offset; the
 * remaining free space on the page is subtracted from that offset to get
 * the insertion point.  recptr is assigned to, so it must be an lvalue.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)


/*
 * Step an xlogid/segment pair forward by one segment.  Past the last
 * segment of a log file the segment number wraps to 0 and the log id is
 * bumped.  Both arguments are modified in place.
 */
#define NextLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) < XLogSegsPerFile-1) \
			(logSeg)++; \
		else \
		{ \
			(logSeg) = 0; \
			(logId)++; \
		} \
	} while (0)

/*
 * Step an xlogid/segment pair back by one segment (assume it's not 0,0).
 * From segment 0 we move to the last segment of the preceding log id.
 * Both arguments are modified in place.
 */
#define PrevLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) == 0) \
		{ \
			(logSeg) = XLogSegsPerFile-1; \
			(logId)--; \
		} \
		else \
			(logSeg)--; \
	} while (0)
V
WAL  
Vadim B. Mikheev 已提交
336

T
Tom Lane 已提交
337 338 339 340
/*
 * Compute ID and segment from an XLogRecPtr.
 *
 * For XLByteToSeg, do the computation at face value.  For XLByteToPrevSeg,
 * a boundary byte is taken to be in the previous segment.  This is suitable
 * for deciding which segment to write given a pointer to a record end,
 * for example.
 *
 * logId and logSeg are output arguments and must be lvalues.
 * NOTE(review): XLByteToPrevSeg computes xrecoff - 1, so xrecoff == 0 is
 * presumably never passed --- confirm against callers.
 */
#define XLByteToSeg(xlrp, logId, logSeg)	\
	( (logId) = (xlrp).xlogid, \
	  (logSeg) = (xlrp).xrecoff / XLogSegSize \
	)
#define XLByteToPrevSeg(xlrp, logId, logSeg)	\
	( (logId) = (xlrp).xlogid, \
	  (logSeg) = ((xlrp).xrecoff - 1) / XLogSegSize \
	)
353

354
/*
 * Is an XLogRecPtr within a particular XLOG segment?
 *
 * For XLByteInSeg, do the computation at face value.  For XLByteInPrevSeg,
 * a boundary byte is taken to be in the previous segment.
 */
#define XLByteInSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 (xlrp).xrecoff / XLogSegSize == (logSeg))

#define XLByteInPrevSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 ((xlrp).xrecoff - 1) / XLogSegSize == (logSeg))
367 368


369
/*
 * Build the path of an XLOG segment file into path (a buffer of at least
 * MAXPGPATH bytes): XLogDir, a slash, then log and seg each printed as
 * 8 upper-case hex digits.
 */
#define XLogFileName(path, log, seg)	\
			snprintf(path, MAXPGPATH, "%s/%08X%08X",	\
					 XLogDir, log, seg)
372

T
Tom Lane 已提交
373 374 375 376 377
/*
 * Step an xlog buffer index backward/forward, wrapping around at the
 * ends of the range 0 .. XLogCtl->XLogCacheBlck.
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
378

379
/*
 * Is xrecoff a possible record start?  Its offset within the page must
 * be at or past the page header (SizeOfXLogPHD) and must leave at least
 * SizeOfXLogRecord bytes before the end of the page.
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)
382

T
Tom Lane 已提交
383 384 385 386 387 388
/*
 * _INTL_MAXLOGRECSZ: upper bound on the space one record can occupy ---
 * the record header, the largest permitted rmgr payload, and a full set
 * of backup blocks (each a BkpBlock header plus one page image).
 */
#define _INTL_MAXLOGRECSZ \
	(XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ) + \
	 MAXLOGRECSZ + SizeOfXLogRecord)
389

390

T
Tom Lane 已提交
391
/* File path names */
392
static char XLogDir[MAXPGPATH] = "";
B
Bruce Momjian 已提交
393
static char ControlFilePath[MAXPGPATH];
T
Tom Lane 已提交
394 395 396 397 398 399

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
400

T
Tom Lane 已提交
401 402 403 404 405 406 407 408 409 410
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
411

T
Tom Lane 已提交
412 413 414 415 416 417
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.
 */
418 419 420 421
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
B
Bruce Momjian 已提交
422

T
Tom Lane 已提交
423 424
/* Buffer for currently read page (BLCKSZ bytes) */
static char *readBuf = NULL;
B
Bruce Momjian 已提交
425

T
Tom Lane 已提交
426 427 428
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
429
static XLogRecord *nextRecord = NULL;
430
static StartUpID lastReadSUI;
431

V
WAL  
Vadim B. Mikheev 已提交
432 433
static bool InRedo = false;

T
Tom Lane 已提交
434 435 436

static bool AdvanceXLInsertBuffer(void);
static void XLogWrite(XLogwrtRqst WriteRqst);
B
Bruce Momjian 已提交
437 438
static int XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
439
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
440 441
					   bool find_free, int max_advance,
					   bool use_lock);
T
Tom Lane 已提交
442 443
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static void PreallocXlogFiles(XLogRecPtr endptr);
444
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr);
T
Tom Lane 已提交
445
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer);
446
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI);
T
Tom Lane 已提交
447
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr,
448
					 int whichChkpt,
B
Bruce Momjian 已提交
449
					 char *buffer);
T
Tom Lane 已提交
450 451 452 453
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
454
static void issue_xlog_fsync(void);
T
Tom Lane 已提交
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata list (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
472
XLogRecPtr
473
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
474
{
B
Bruce Momjian 已提交
475 476
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
477
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
478 479 480 481 482 483 484 485 486 487 488 489 490 491
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	uint16		curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
	crc64		rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
492
	XLogwrtRqst LogwrtRqst;
B
Bruce Momjian 已提交
493 494
	bool		updrqst;
	bool		no_tran = (rmid == RM_XLOG_ID) ? true : false;
V
Vadim B. Mikheev 已提交
495 496 497 498

	if (info & XLR_INFO_MASK)
	{
		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
499
			elog(PANIC, "XLogInsert: invalid info mask %02X",
T
Tom Lane 已提交
500
				 (info & XLR_INFO_MASK));
V
Vadim B. Mikheev 已提交
501 502 503 504
		no_tran = true;
		info &= ~XLR_INFO_MASK;
	}

T
Tom Lane 已提交
505
	/*
B
Bruce Momjian 已提交
506 507
	 * In bootstrap mode, we don't actually log anything but XLOG
	 * resources; return a phony record pointer.
T
Tom Lane 已提交
508
	 */
V
Vadim B. Mikheev 已提交
509
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
510 511
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
512
		RecPtr.xrecoff = SizeOfXLogPHD; /* start of 1st checkpoint record */
V
WAL  
Vadim B. Mikheev 已提交
513 514 515
		return (RecPtr);
	}

T
Tom Lane 已提交
516 517 518 519 520 521
	/*
	 * Here we scan the rdata list, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC yet since we don't know the final
	 * length or info bits quite yet.
	 *
B
Bruce Momjian 已提交
522 523
	 * We may have to loop back to here if a race condition is detected
	 * below. We could prevent the race by doing all this work while
524
	 * holding the insert lock, but it seems better to avoid doing CRC
B
Bruce Momjian 已提交
525 526 527 528 529 530 531 532
	 * calculations while holding the lock.  This means we have to be
	 * careful about modifying the rdata list until we know we aren't
	 * going to loop back again.  The only change we allow ourselves to
	 * make earlier is to set rdt->data = NULL in list items we have
	 * decided we will have to back up the whole buffer for.  This is OK
	 * because we will certainly decide the same thing again for those
	 * items if we do it over; doing it here saves an extra pass over the
	 * list later.
T
Tom Lane 已提交
533
	 */
534
begin:;
T
Tom Lane 已提交
535 536 537 538 539 540
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

541
	INIT_CRC64(rdata_crc);
T
Tom Lane 已提交
542
	len = 0;
B
Bruce Momjian 已提交
543
	for (rdt = rdata;;)
544 545 546
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
547
			/* Simple data, just include it */
548 549 550
			len += rdt->len;
			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
		}
T
Tom Lane 已提交
551
		else
552
		{
T
Tom Lane 已提交
553 554
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
555
			{
T
Tom Lane 已提交
556
				if (rdt->buffer == dtbuf[i])
557
				{
T
Tom Lane 已提交
558 559 560 561 562 563 564 565 566
					/* Buffer already referenced by earlier list item */
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
567
				}
T
Tom Lane 已提交
568
				if (dtbuf[i] == InvalidBuffer)
569
				{
T
Tom Lane 已提交
570 571
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
B
Bruce Momjian 已提交
572

T
Tom Lane 已提交
573 574 575
					/*
					 * XXX We assume page LSN is first data on page
					 */
B
Bruce Momjian 已提交
576
					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
T
Tom Lane 已提交
577 578
					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
					{
B
Bruce Momjian 已提交
579
						crc64		dtcrc;
T
Tom Lane 已提交
580 581 582 583 584 585 586 587 588 589

						dtbuf_bkp[i] = true;
						rdt->data = NULL;
						INIT_CRC64(dtcrc);
						COMP_CRC64(dtcrc,
								   BufferGetBlock(dtbuf[i]),
								   BLCKSZ);
						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
						COMP_CRC64(dtcrc,
B
Bruce Momjian 已提交
590
								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
T
Tom Lane 已提交
591 592 593 594 595 596 597 598 599 600
								   sizeof(BkpBlock) - sizeof(crc64));
						FIN_CRC64(dtcrc);
						dtbuf_xlg[i].crc = dtcrc;
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
601 602
				}
			}
T
Tom Lane 已提交
603
			if (i >= XLR_MAX_BKP_BLOCKS)
604
				elog(PANIC, "XLogInsert: can backup %d blocks at most",
T
Tom Lane 已提交
605
					 XLR_MAX_BKP_BLOCKS);
606
		}
T
Tom Lane 已提交
607
		/* Break out of loop when rdt points to last list item */
608 609 610 611 612
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

T
Tom Lane 已提交
613 614 615
	/*
	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
	 * all of the rmgr data might have been suppressed in favor of backup
B
Bruce Momjian 已提交
616
	 * blocks.	Currently, all callers of XLogInsert provide at least some
T
Tom Lane 已提交
617 618 619 620
	 * not-in-a-buffer data and so len == 0 should never happen, but that
	 * may not be true forever.  If you need to remove the len == 0 check,
	 * also remove the check for xl_len == 0 in ReadRecord, below.
	 */
621
	if (len == 0 || len > MAXLOGRECSZ)
622
		elog(PANIC, "XLogInsert: invalid record length %u", len);
623

624
	START_CRIT_SECTION();
625

626
	/* update LogwrtResult before doing cache fill check */
627 628 629 630 631 632 633 634 635
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		LogwrtRqst = xlogctl->LogwrtRqst;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
636

637
	/*
638 639
	 * If cache is half filled then try to acquire write lock and do
	 * XLogWrite. Ignore any fractional blocks in performing this check.
640 641 642 643 644
	 */
	LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
	if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
		(LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
		 XLogCtl->XLogCacheByte / 2))
T
Tom Lane 已提交
645
	{
646
		if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
647
		{
648 649 650 651
			LogwrtResult = XLogCtl->Write.LogwrtResult;
			if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
				XLogWrite(LogwrtRqst);
			LWLockRelease(WALWriteLock);
652 653 654
		}
	}

655 656 657
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
658 659
	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to
B
Bruce Momjian 已提交
660 661 662
	 * go back and recompute everything.  This can only happen just after
	 * a checkpoint, so it's better to be slow in this case and fast
	 * otherwise.
T
Tom Lane 已提交
663 664
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
665
	{
T
Tom Lane 已提交
666 667 668 669
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
670
		{
T
Tom Lane 已提交
671 672 673 674 675 676
			if (dtbuf[i] == InvalidBuffer)
				continue;
			if (dtbuf_bkp[i] == false &&
				XLByteLE(dtbuf_lsn[i], RedoRecPtr))
			{
				/*
B
Bruce Momjian 已提交
677 678
				 * Oops, this buffer now needs to be backed up, but we
				 * didn't think so above.  Start over.
T
Tom Lane 已提交
679
				 */
680
				LWLockRelease(WALInsertLock);
T
Tom Lane 已提交
681 682 683
				END_CRIT_SECTION();
				goto begin;
			}
684 685 686
		}
	}

T
Tom Lane 已提交
687 688 689 690 691 692 693
	/*
	 * Make additional rdata list entries for the backup blocks, so that
	 * we don't need to special-case them in the write loop.  Note that we
	 * have now irrevocably changed the input rdata list.  At the exit of
	 * this loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
B
Bruce Momjian 已提交
694 695 696
	 * up.	The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
	 * distinct buffer value (ignoring InvalidBuffer) appearing in the
	 * rdata list.
T
Tom Lane 已提交
697 698 699
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
700 701 702 703
	{
		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
			continue;

T
Tom Lane 已提交
704
		info |= XLR_SET_BKP_BLOCK(i);
705 706 707

		rdt->next = &(dtbuf_rdt[2 * i]);

B
Bruce Momjian 已提交
708
		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
709
		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
T
Tom Lane 已提交
710
		write_len += sizeof(BkpBlock);
711 712 713

		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);

B
Bruce Momjian 已提交
714
		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
715
		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
T
Tom Lane 已提交
716
		write_len += BLCKSZ;
717 718 719
		dtbuf_rdt[2 * i + 1].next = NULL;
	}

T
Tom Lane 已提交
720
	/* Insert record header */
721

T
Tom Lane 已提交
722 723
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
724 725
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
726
		updrqst = AdvanceXLInsertBuffer();
727 728 729
		freespace = BLCKSZ - SizeOfXLogPHD;
	}

T
Tom Lane 已提交
730
	curridx = Insert->curridx;
731
	record = (XLogRecord *) Insert->currpos;
T
Tom Lane 已提交
732

733
	record->xl_prev = Insert->PrevRecord;
V
Vadim B. Mikheev 已提交
734
	if (no_tran)
735 736 737 738
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
V
Vadim B. Mikheev 已提交
739 740 741
	else
		record->xl_xact_prev = MyLastRecPtr;

742
	record->xl_xid = GetCurrentTransactionId();
T
Tom Lane 已提交
743
	record->xl_len = len;		/* doesn't include backup blocks */
744
	record->xl_info = info;
745
	record->xl_rmid = rmid;
746

T
Tom Lane 已提交
747
	/* Now we can finish computing the main CRC */
B
Bruce Momjian 已提交
748
	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
749
			   SizeOfXLogRecord - sizeof(crc64));
750 751 752
	FIN_CRC64(rdata_crc);
	record->xl_crc = rdata_crc;

T
Tom Lane 已提交
753 754 755
	/* Compute record's XLOG location */
	INSERT_RECPTR(RecPtr, Insert, curridx);

J
Jan Wieck 已提交
756
	/* If first XLOG record of transaction, save it in PGPROC array */
V
Vadim B. Mikheev 已提交
757
	if (MyLastRecPtr.xrecoff == 0 && !no_tran)
758
	{
759 760 761 762 763 764
		/*
		 * We do not acquire SInvalLock here because of possible deadlock.
		 * Anyone who wants to inspect other procs' logRec must acquire
		 * WALInsertLock, instead.  A better solution would be a per-PROC
		 * spinlock, but no time for that before 7.2 --- tgl 12/19/01.
		 */
765 766
		MyProc->logRec = RecPtr;
	}
V
WAL  
Vadim B. Mikheev 已提交
767 768 769

	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
770
		char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
771

772
		sprintf(buf, "INSERT @ %X/%X: ", RecPtr.xlogid, RecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
773
		xlog_outrec(buf, record);
774
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
775 776
		{
			strcat(buf, " - ");
777
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
778
		}
779
		elog(LOG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
780 781
	}

T
Tom Lane 已提交
782 783 784 785 786
	/* Record begin of record in appropriate places */
	if (!no_tran)
		MyLastRecPtr = RecPtr;
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;
787
	MyXactMadeXLogEntry = true;
T
Tom Lane 已提交
788

789
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
790
	freespace -= SizeOfXLogRecord;
791

T
Tom Lane 已提交
792 793 794 795
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
796
	{
797 798 799 800
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
801
		{
802 803 804 805 806
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
807
				write_len -= freespace;
808 809 810 811 812
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
813
				write_len -= rdata->len;
814 815 816 817
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
818 819
		}

820
		/* Use next buffer */
T
Tom Lane 已提交
821 822 823 824 825 826 827 828
		updrqst = AdvanceXLInsertBuffer();
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
829
	}
830

T
Tom Lane 已提交
831 832
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
833
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
834
	freespace = INSERT_FREESPACE(Insert);
835

V
Vadim B. Mikheev 已提交
836
	/*
B
Bruce Momjian 已提交
837 838
	 * The recptr I return is the beginning of the *next* record. This
	 * will be stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
839
	 */
T
Tom Lane 已提交
840
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
841

T
Tom Lane 已提交
842
	/* Need to update shared LogwrtRqst if some block was filled up */
843
	if (freespace < SizeOfXLogRecord)
B
Bruce Momjian 已提交
844 845
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
846 847
	else
		curridx = PrevBufIdx(curridx);
T
Tom Lane 已提交
848
	WriteRqst = XLogCtl->xlblocks[curridx];
849

850
	LWLockRelease(WALInsertLock);
851 852 853

	if (updrqst)
	{
854 855 856 857
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
T
Tom Lane 已提交
858
		/* advance global request to include new block(s) */
859 860
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
861
		/* update local result copy while I have the chance */
862 863
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
864 865
	}

866 867
	ProcLastRecEnd = RecPtr;

868
	END_CRIT_SECTION();
869

870
	return (RecPtr);
871
}
872

T
Tom Lane 已提交
873 874 875 876 877
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
 * just-filled page.  If we can do this for free (without an extra lock),
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
 * Must be called with WALInsertLock held.
 */
static bool
AdvanceXLInsertBuffer(void)
886
{
T
Tom Lane 已提交
887 888 889 890 891 892
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
893 894
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
895

T
Tom Lane 已提交
896 897 898
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
899

T
Tom Lane 已提交
900
	/*
B
Bruce Momjian 已提交
901 902 903
	 * Get ending-offset of the buffer page we need to replace (this may
	 * be zero if the buffer hasn't been used yet).  Fall through if it's
	 * already written out.
T
Tom Lane 已提交
904 905 906 907 908 909
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
910

T
Tom Lane 已提交
911
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
912

913
		/* Before waiting, get info_lck and update LogwrtResult */
914 915 916 917 918 919 920 921 922 923
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
			SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
		}
924 925 926 927 928 929 930 931 932

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
933
		{
934 935 936 937
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
938
			{
939 940 941
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
942
			}
943
			else
T
Tom Lane 已提交
944 945
			{
				/*
B
Bruce Momjian 已提交
946 947
				 * Have to write buffers while holding insert lock. This
				 * is not good, so only write as much as we absolutely
T
Tom Lane 已提交
948 949 950 951 952 953
				 * must.
				 */
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst);
954
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
955
				Insert->LogwrtResult = LogwrtResult;
956 957 958 959
			}
		}
	}

T
Tom Lane 已提交
960 961 962 963
	/*
	 * Now the next buffer slot is free and we can set it up to be the
	 * next output page.
	 */
964 965
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
966
	{
T
Tom Lane 已提交
967
		/* crossing a logid boundary */
968 969
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = BLCKSZ;
970
	}
T
Tom Lane 已提交
971
	else
972 973 974
		NewPageEndPtr.xrecoff += BLCKSZ;
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
T
Tom Lane 已提交
975
	Insert->curridx = nextidx;
976 977
	Insert->currpage = NewPage;
	Insert->currpos = ((char *) NewPage) + SizeOfXLogPHD;
B
Bruce Momjian 已提交
978

T
Tom Lane 已提交
979
	/*
B
Bruce Momjian 已提交
980 981
	 * Be sure to re-zero the buffer so that bytes beyond what we've
	 * written will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
982
	 */
983 984 985 986
	MemSet((char *) NewPage, 0, BLCKSZ);

	/* And fill the new page's header */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
987
	/* NewPage->xlp_info = 0; */	/* done by memset */
988 989 990
	NewPage->xlp_sui = ThisStartUpID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
T
Tom Lane 已提交
991 992

	return update_needed;
993 994
}

T
Tom Lane 已提交
995 996 997
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
998
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
999
 */
1000
static void
T
Tom Lane 已提交
1001
XLogWrite(XLogwrtRqst WriteRqst)
1002
{
1003 1004
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
T
Tom Lane 已提交
1005
	bool		ispartialpage;
1006
	bool		use_existent;
1007

B
Bruce Momjian 已提交
1008 1009 1010 1011
	/*
	 * Update local LogwrtResult (caller probably did this already,
	 * but...)
	 */
T
Tom Lane 已提交
1012 1013 1014
	LogwrtResult = Write->LogwrtResult;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1015
	{
1016 1017 1018 1019 1020 1021 1022
		/*
		 * Make sure we're not ahead of the insert process.  This could
		 * happen if we're passed a bogus WriteRqst.Write that is past the
		 * end of the last page that's been initialized by
		 * AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
1023
			elog(PANIC, "XLogWrite: write request %X/%X is past end of log %X/%X",
1024 1025 1026
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
				 XLogCtl->xlblocks[Write->curridx].xlogid,
				 XLogCtl->xlblocks[Write->curridx].xrecoff);
1027

T
Tom Lane 已提交
1028 1029 1030 1031 1032
		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1033
		{
T
Tom Lane 已提交
1034 1035 1036 1037
			/*
			 * Switch to new logfile segment.
			 */
			if (openLogFile >= 0)
1038
			{
T
Tom Lane 已提交
1039
				if (close(openLogFile) != 0)
1040
					elog(PANIC, "close of log file %u, segment %u failed: %m",
T
Tom Lane 已提交
1041 1042
						 openLogId, openLogSeg);
				openLogFile = -1;
1043
			}
T
Tom Lane 已提交
1044 1045
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1046 1047 1048 1049
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1050
			openLogOff = 0;
1051 1052 1053

			if (!use_existent)	/* there was no precreated file */
				elog(LOG, "XLogWrite: new log file created - "
B
Bruce Momjian 已提交
1054
					 "consider increasing 'wal_files' in postgresql.conf.");
1055

T
Tom Lane 已提交
1056
			/* update pg_control, unless someone else already did */
1057
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1058 1059 1060
			if (ControlFile->logId < openLogId ||
				(ControlFile->logId == openLogId &&
				 ControlFile->logSeg < openLogSeg + 1))
T
Tom Lane 已提交
1061 1062 1063 1064 1065
			{
				ControlFile->logId = openLogId;
				ControlFile->logSeg = openLogSeg + 1;
				ControlFile->time = time(NULL);
				UpdateControlFile();
B
Bruce Momjian 已提交
1066

1067
				/*
B
Bruce Momjian 已提交
1068 1069 1070 1071
				 * Signal postmaster to start a checkpoint if it's been
				 * too long since the last one.  (We look at local copy of
				 * RedoRecPtr which might be a little out of date, but
				 * should be close enough for this purpose.)
1072 1073 1074 1075 1076 1077 1078
				 */
				if (IsUnderPostmaster &&
					(openLogId != RedoRecPtr.xlogid ||
					 openLogSeg >= (RedoRecPtr.xrecoff / XLogSegSize) +
					 (uint32) CheckPointSegments))
				{
					if (XLOG_DEBUG)
1079
						elog(LOG, "XLogWrite: time for a checkpoint, signaling postmaster");
1080
					SendPostmasterSignal(PMSIGNAL_DO_CHECKPOINT);
1081
				}
T
Tom Lane 已提交
1082
			}
1083
			LWLockRelease(ControlFileLock);
1084 1085
		}

T
Tom Lane 已提交
1086
		if (openLogFile < 0)
1087
		{
T
Tom Lane 已提交
1088 1089 1090
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
			openLogOff = 0;
1091 1092
		}

T
Tom Lane 已提交
1093 1094
		/* Need to seek in the file? */
		if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1095
		{
T
Tom Lane 已提交
1096 1097
			openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
			if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1098
				elog(PANIC, "lseek of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1099
					 openLogId, openLogSeg, openLogOff);
1100 1101
		}

T
Tom Lane 已提交
1102 1103
		/* OK to write the page */
		from = XLogCtl->pages + Write->curridx * BLCKSZ;
1104
		errno = 0;
T
Tom Lane 已提交
1105
		if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1106 1107 1108 1109
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
1110
			elog(PANIC, "write of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1111
				 openLogId, openLogSeg, openLogOff);
1112
		}
T
Tom Lane 已提交
1113
		openLogOff += BLCKSZ;
1114

T
Tom Lane 已提交
1115 1116 1117
		/*
		 * If we just wrote the whole last page of a logfile segment,
		 * fsync the segment immediately.  This avoids having to go back
B
Bruce Momjian 已提交
1118 1119 1120
		 * and re-open prior segments when an fsync request comes along
		 * later. Doing it here ensures that one and only one backend will
		 * perform this fsync.
T
Tom Lane 已提交
1121 1122 1123
		 */
		if (openLogOff >= XLogSegSize && !ispartialpage)
		{
1124
			issue_xlog_fsync();
B
Bruce Momjian 已提交
1125
			LogwrtResult.Flush = LogwrtResult.Write;	/* end of current page */
T
Tom Lane 已提交
1126
		}
1127

T
Tom Lane 已提交
1128 1129 1130 1131 1132 1133 1134
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		Write->curridx = NextBufIdx(Write->curridx);
1135 1136
	}

T
Tom Lane 已提交
1137 1138 1139 1140 1141
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1142
	{
T
Tom Lane 已提交
1143
		/*
B
Bruce Momjian 已提交
1144 1145 1146
		 * Could get here without iterating above loop, in which case we
		 * might have no open file or the wrong one.  However, we do not
		 * need to fsync more than one file.
T
Tom Lane 已提交
1147
		 */
1148
		if (sync_method != SYNC_METHOD_OPEN)
T
Tom Lane 已提交
1149
		{
1150
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1151
			 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1152 1153
			{
				if (close(openLogFile) != 0)
1154
					elog(PANIC, "close of log file %u, segment %u failed: %m",
1155 1156 1157 1158 1159 1160 1161 1162 1163 1164
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
				openLogOff = 0;
			}
			issue_xlog_fsync();
T
Tom Lane 已提交
1165 1166
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1167 1168
	}

T
Tom Lane 已提交
1169 1170 1171
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1172 1173
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves
T
Tom Lane 已提交
1174 1175
	 * some code in a couple of places.
	 */
1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
1188

T
Tom Lane 已提交
1189 1190 1191 1192 1193 1194
	Write->LogwrtResult = LogwrtResult;
}

/*
 * Make sure all XLOG data up to the given position is flushed to disk.
 *
 * NOTE: unlike XLogWrite, this is called without WALWriteLock held, and it
 * tries to get by without acquiring the lock at all when it can.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;

	if (XLOG_DEBUG)
	{
		elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X\n",
			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			 (InRedo) ? "(redo)" : "",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
		fflush(stderr);
	}

	/* Flushing is disabled during REDO */
	if (InRedo)
		return;

	/* Nothing to do if our local copy already shows it flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

	START_CRIT_SECTION();

	/*
	 * fsync is usually very expensive, so each one should cover as much
	 * data as possible: any further data found in the xlog buffers is
	 * written and synced along with the requested record.  Pushing
	 * LogwrtResult.Flush as far as possible improves the odds that the
	 * next flush request needs no fsync at all.
	 */

	/* start from the given target; it may be raised below */
	WriteRqstPtr = record;

	/* pick up the shared request/result state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
			WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}

	/* still not flushed according to the shared state? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/*
		 * If the insert lock is free right now, extend the write target
		 * to the current insert position so that data is flushed too.
		 */
		if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
		{
			XLogCtlInsert *Insert = &XLogCtl->Insert;
			uint32		freespace = INSERT_FREESPACE(Insert);

			/* end of the current insert page ... */
			WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
			/* ... backed up by the unused tail unless the page is full */
			if (freespace >= SizeOfXLogRecord)
				WriteRqstPtr.xrecoff -= freespace;
			LWLockRelease(WALInsertLock);
		}

		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			XLogwrtRqst WriteRqst;

			WriteRqst.Write = WriteRqstPtr;
			WriteRqst.Flush = record;
			XLogWrite(WriteRqst);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * If the flush point still has not reached the request, something is
	 * wrong; most likely the requested position lies past the end of XLOG.
	 * This has been observed when a disk page carries a corrupted LSN.
	 *
	 * This used to be a PANIC, but that makes the system less robust, not
	 * more: corruption on one data page should not take the whole system
	 * down.  In particular, if the bad page were encountered again during
	 * recovery the database could not be restarted at all!  (This has
	 * happened in the field several times with 7.1 releases.  We cannot
	 * get here while InRedo is true, but a bad page brought in and marked
	 * dirty during recovery will be flushed by CreateCheckpoint at the end
	 * of recovery.)
	 *
	 * So the policy is ERROR in normal operation but only WARNING during
	 * recovery, which lets the system come up despite a corrupt LSN.  For
	 * calls from xact.c the ERROR is promoted to PANIC because xact.c
	 * calls this routine inside a critical section.  Calls from bufmgr.c
	 * are not inside critical sections, so a bad LSN on a data page does
	 * not force a restart.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(InRecovery ? WARNING : ERROR,
			 "XLogFlush: request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

T
Tom Lane 已提交
1311 1312 1313
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
1314 1315 1316
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
1317
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
1318 1319
 * file was used.
 *
1320
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
1321
 * place.  This should be TRUE except during bootstrap log creation.  The
1322
 * caller must *not* hold the lock at call.
1323
 *
T
Tom Lane 已提交
1324 1325
 * Returns FD of opened file.
 */
1326
static int
1327 1328
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
1329
{
1330
	char		path[MAXPGPATH];
1331
	char		tmppath[MAXPGPATH];
1332
	char		zbuffer[BLCKSZ];
1333
	int			fd;
1334
	int			nbytes;
1335 1336

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
1337 1338

	/*
B
Bruce Momjian 已提交
1339 1340
	 * Try to use existent file (checkpoint maker may have created it
	 * already)
V
Vadim B. Mikheev 已提交
1341
	 */
1342
	if (*use_existent)
V
Vadim B. Mikheev 已提交
1343
	{
1344 1345
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
1346 1347 1348
		if (fd < 0)
		{
			if (errno != ENOENT)
1349
				elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1350
					 path, log, seg);
V
Vadim B. Mikheev 已提交
1351 1352
		}
		else
B
Bruce Momjian 已提交
1353
			return (fd);
V
Vadim B. Mikheev 已提交
1354 1355
	}

1356
	/*
B
Bruce Momjian 已提交
1357 1358 1359
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible
	 * that another process is doing the same thing.  If so, we will end
	 * up pre-creating an extra log segment.  That seems OK, and better
1360
	 * than holding the lock throughout this lengthy process.
1361
	 */
1362 1363
	snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
			 XLogDir, (int) getpid());
1364 1365

	unlink(tmppath);
1366

1367
	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
1368
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
1369
					   S_IRUSR | S_IWUSR);
1370
	if (fd < 0)
1371
		elog(PANIC, "creation of file %s failed: %m", tmppath);
1372

1373
	/*
B
Bruce Momjian 已提交
1374
	 * Zero-fill the file.	We have to do this the hard way to ensure that
1375 1376
	 * all the file space has really been allocated --- on platforms that
	 * allow "holes" in files, just seeking to the end doesn't allocate
B
Bruce Momjian 已提交
1377
	 * intermediate space.	This way, we know that we have all the space
1378
	 * and (after the fsync below) that all the indirect blocks are down
1379 1380
	 * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
	 * sync future writes to the log file.
1381 1382 1383 1384
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
	{
1385
		errno = 0;
1386
		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
T
Tom Lane 已提交
1387
		{
B
Bruce Momjian 已提交
1388
			int			save_errno = errno;
T
Tom Lane 已提交
1389

B
Bruce Momjian 已提交
1390 1391 1392 1393
			/*
			 * If we fail to make the file, delete it to release disk
			 * space
			 */
1394
			unlink(tmppath);
1395 1396
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
1397

1398
			elog(PANIC, "ZeroFill failed to write %s: %m", tmppath);
T
Tom Lane 已提交
1399
		}
1400
	}
1401

1402
	if (pg_fsync(fd) != 0)
1403
		elog(PANIC, "fsync of file %s failed: %m", tmppath);
1404

V
Vadim B. Mikheev 已提交
1405
	close(fd);
T
Tom Lane 已提交
1406

1407
	/*
1408 1409
	 * Now move the segment into place with its final name.
	 *
1410 1411 1412 1413 1414
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone
	 * else has created the file while we were filling ours: if so, use
	 * ours to pre-create a future log segment.
	 */
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
	if (!InstallXLogFileSegment(log, seg, tmppath,
								*use_existent, XLOGfiles + XLOGfileslop,
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
1430
		elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453
			 path, log, seg);

	return (fd);
}

/*
 * Put an XLOG segment file in place as a current or future log segment.
 *
 * Two callers use this: one installs a freshly created segment (which lives
 * under a temporary name while being built), the other recycles an old
 * segment.
 *
 * log, seg: segment to install as (or the first candidate position).
 *
 * tmppath: current name of the file; it gets renamed into place.
 *
 * find_free: if TRUE, use the first unoccupied log/seg number at or after
 * the one given.  If FALSE, install exactly at the given position, removing
 * whatever segment file is already there.
 *
 * max_advance: how many log/seg slots past the starting point may be tried
 * before giving up.  (Ignored when find_free is FALSE.)
 *
 * use_lock: if TRUE, hold ControlFileLock while the file is moved into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns TRUE if the file was installed, FALSE if no free slot was found
 * within max_advance.  (All other failures are reported through elog().)
 */
static bool
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
					   bool find_free, int max_advance,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	int			probe;

	XLogFileName(path, log, seg);

	/*
	 * Only one process at a time may be doing this.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (find_free)
	{
		/* Step forward until a segment name that cannot be opened is found */
		for (;;)
		{
			probe = BasicOpenFile(path, O_RDWR | PG_BINARY,
								  S_IRUSR | S_IWUSR);
			if (probe < 0)
				break;			/* this slot is free */
			close(probe);

			if (--max_advance < 0)
			{
				/* No free slot within the permitted range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			NextLogSeg(log, seg);
			XLogFileName(path, log, seg);
		}
	}
	else
	{
		/* Forced installation: remove any segment file already there */
		unlink(path);
	}

	/*
	 * link() is preferred over rename() simply to be completely sure that
	 * an existing logfile is never overwritten.  There should not be one,
	 * though, so rename() is an acceptable substitute except for the truly
	 * paranoid.
	 */
#if !defined(__BEOS__) && !defined(N_PLAT_NLM) && !defined(__CYGWIN__)
	if (link(tmppath, path) < 0)
		elog(PANIC, "link from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		elog(PANIC, "rename from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
#endif

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}

T
Tom Lane 已提交
1523 1524 1525
/*
 * Open a pre-existing logfile segment.
 */
1526 1527 1528
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
1529 1530
	char		path[MAXPGPATH];
	int			fd;
1531 1532 1533

	XLogFileName(path, log, seg);

1534 1535
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
1536 1537 1538 1539
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
1540 1541
			elog(LOG, "open of %s (log file %u, segment %u) failed: %m",
				 path, log, seg);
1542 1543
			return (fd);
		}
1544
		elog(PANIC, "open of %s (log file %u, segment %u) failed: %m",
1545
			 path, log, seg);
1546 1547
	}

1548
	return (fd);
1549 1550
}

V
Vadim B. Mikheev 已提交
1551
/*
 * Make sure log segments following the given log endpoint exist.
 *
 * With XLOGfiles > 0, the XLOGfiles segments after the one containing
 * endptr are created if missing.  Otherwise a single following segment is
 * created, and only when endptr is at least three quarters of the way
 * through its segment.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
	int			remaining;

	XLByteToPrevSeg(endptr, _logId, _logSeg);

	/* Decide how many segments past the current one are wanted */
	if (XLOGfiles > 0)
		remaining = XLOGfiles;
	else if ((endptr.xrecoff - 1) % XLogSegSize >=
			 (uint32) (0.75 * XLogSegSize))
		remaining = 1;
	else
		remaining = 0;

	while (remaining-- > 0)
	{
		bool		use_existent = true;
		int			lf;

		NextLogSeg(_logId, _logSeg);
		/* reuses the segment if it already exists, else creates it */
		lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
		close(lf);
	}
}

/*
 * Remove or move offline all log files older or equal to passed log/seg#
1587 1588 1589
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
1590 1591
 */
static void
1592
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
1593
{
1594 1595
	uint32		endlogId;
	uint32		endlogSeg;
B
Bruce Momjian 已提交
1596 1597 1598 1599
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[32];
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
1600

1601
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
V
Vadim B. Mikheev 已提交
1602 1603 1604

	xldir = opendir(XLogDir);
	if (xldir == NULL)
1605
		elog(PANIC, "could not open transaction log directory (%s): %m",
1606
			 XLogDir);
V
Vadim B. Mikheev 已提交
1607

T
Tom Lane 已提交
1608
	sprintf(lastoff, "%08X%08X", log, seg);
V
Vadim B. Mikheev 已提交
1609 1610 1611 1612

	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
T
Tom Lane 已提交
1613 1614 1615
		if (strlen(xlde->d_name) == 16 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 16 &&
			strcmp(xlde->d_name, lastoff) <= 0)
V
Vadim B. Mikheev 已提交
1616
		{
1617
			snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlde->d_name);
1618
			if (XLOG_archive_dir[0])
1619 1620 1621
			{
				elog(LOG, "archiving transaction log file %s",
					 xlde->d_name);
B
Bruce Momjian 已提交
1622
				elog(WARNING, "archiving log files is not implemented!");
1623
			}
1624
			else
1625 1626 1627
			{
				/*
				 * Before deleting the file, see if it can be recycled as
1628 1629
				 * a future log segment.  We allow recycling segments up
				 * to XLOGfiles + XLOGfileslop segments beyond the current
1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646
				 * XLOG location.
				 */
				if (InstallXLogFileSegment(endlogId, endlogSeg, path,
										   true, XLOGfiles + XLOGfileslop,
										   true))
				{
					elog(LOG, "recycled transaction log file %s",
						 xlde->d_name);
				}
				else
				{
					/* No need for any more future segments... */
					elog(LOG, "removing transaction log file %s",
						 xlde->d_name);
					unlink(path);
				}
			}
V
Vadim B. Mikheev 已提交
1647 1648 1649 1650
		}
		errno = 0;
	}
	if (errno)
1651
		elog(PANIC, "could not read transaction log directory (%s): %m",
1652
			 XLogDir);
V
Vadim B. Mikheev 已提交
1653 1654 1655
	closedir(xldir);
}

T
Tom Lane 已提交
1656 1657 1658 1659 1660
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 */
1661 1662 1663 1664 1665 1666 1667 1668 1669 1670
static void
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
1671
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
1672
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1673
	{
T
Tom Lane 已提交
1674
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1675 1676
			continue;

B
Bruce Momjian 已提交
1677
		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
1678 1679 1680 1681 1682 1683 1684 1685 1686 1687
		blk += sizeof(BkpBlock);

		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);

		if (reln)
		{
			buffer = XLogReadBuffer(true, reln, bkpb.block);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
B
Bruce Momjian 已提交
1688
				memcpy((char *) page, blk, BLCKSZ);
1689 1690 1691 1692 1693 1694 1695 1696 1697 1698
				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}

		blk += BLCKSZ;
	}
}

T
Tom Lane 已提交
1699 1700 1701 1702 1703 1704 1705
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
1706 1707 1708 1709 1710 1711 1712 1713 1714
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	crc64		crc;
	crc64		cbuf;
	int			i;
	uint32		len = record->xl_len;
	char	   *blk;

T
Tom Lane 已提交
1715
	/* Check CRC of rmgr data and record header */
1716
	INIT_CRC64(crc);
T
Tom Lane 已提交
1717
	COMP_CRC64(crc, XLogRecGetData(record), len);
B
Bruce Momjian 已提交
1718
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
1719
			   SizeOfXLogRecord - sizeof(crc64));
1720 1721
	FIN_CRC64(crc);

T
Tom Lane 已提交
1722
	if (!EQ_CRC64(record->xl_crc, crc))
1723
	{
1724
		elog(emode, "ReadRecord: bad resource manager data checksum in record at %X/%X",
T
Tom Lane 已提交
1725
			 recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1726
		return (false);
1727 1728
	}

T
Tom Lane 已提交
1729
	/* Check CRCs of backup blocks, if any */
B
Bruce Momjian 已提交
1730
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
1731
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1732
	{
T
Tom Lane 已提交
1733
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1734 1735 1736
			continue;

		INIT_CRC64(crc);
T
Tom Lane 已提交
1737 1738 1739
		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
		COMP_CRC64(crc, blk + sizeof(crc64),
				   sizeof(BkpBlock) - sizeof(crc64));
1740
		FIN_CRC64(crc);
B
Bruce Momjian 已提交
1741 1742
		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
														 * alignment */
1743

T
Tom Lane 已提交
1744
		if (!EQ_CRC64(cbuf, crc))
1745
		{
1746
			elog(emode, "ReadRecord: bad checksum of backup block %d in record at %X/%X",
T
Tom Lane 已提交
1747
				 i + 1, recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1748
			return (false);
1749
		}
T
Tom Lane 已提交
1750
		blk += sizeof(BkpBlock) + BLCKSZ;
1751 1752
	}

B
Bruce Momjian 已提交
1753
	return (true);
1754 1755
}

T
Tom Lane 已提交
1756 1757 1758 1759 1760 1761
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
1762 1763
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG.)
T
Tom Lane 已提交
1764 1765 1766 1767 1768
 *
 * buffer is a workspace at least _INTL_MAXLOGRECSZ bytes long.  It is needed
 * to reassemble a record that crosses block boundaries.  Note that on
 * successful return, the returned record pointer always points at buffer.
 */
1769
static XLogRecord *
T
Tom Lane 已提交
1770
ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer)
1771
{
1772 1773
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
T
Tom Lane 已提交
1774 1775 1776 1777
	uint32		len,
				total_len;
	uint32		targetPageOff;
	unsigned	i;
1778
	bool		nextmode = false;
T
Tom Lane 已提交
1779 1780 1781 1782 1783 1784

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it
		 * this way, rather than just making a static array, for two
B
Bruce Momjian 已提交
1785 1786 1787 1788
		 * reasons: (1) no need to waste the storage in most
		 * instantiations of the backend; (2) a static char array isn't
		 * guaranteed to have any particular alignment, whereas malloc()
		 * will provide MAXALIGN'd storage.
T
Tom Lane 已提交
1789 1790 1791 1792
		 */
		readBuf = (char *) malloc(BLCKSZ);
		Assert(readBuf != NULL);
	}
1793

T
Tom Lane 已提交
1794
	if (RecPtr == NULL)
1795
	{
1796
		RecPtr = &tmpRecPtr;
1797
		nextmode = true;
T
Tom Lane 已提交
1798
		/* fast case if next record is on same page */
1799 1800 1801 1802 1803
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
T
Tom Lane 已提交
1804
		/* align old recptr to next page */
1805 1806 1807 1808 1809 1810 1811 1812
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
1813
	}
1814
	else if (!XRecOffIsValid(RecPtr->xrecoff))
1815
		elog(PANIC, "ReadRecord: invalid record offset at %X/%X",
1816
			 RecPtr->xlogid, RecPtr->xrecoff);
1817

T
Tom Lane 已提交
1818
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
1819
	{
1820 1821
		close(readFile);
		readFile = -1;
1822
	}
T
Tom Lane 已提交
1823
	XLByteToSeg(*RecPtr, readId, readSeg);
1824
	if (readFile < 0)
1825
	{
T
Tom Lane 已提交
1826
		readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1827 1828
		if (readFile < 0)
			goto next_record_is_invalid;
1829
		readOff = (uint32) (-1);	/* force read to occur below */
1830 1831
	}

T
Tom Lane 已提交
1832 1833
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
	if (readOff != targetPageOff)
1834
	{
T
Tom Lane 已提交
1835 1836 1837
		readOff = targetPageOff;
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
		{
1838
			elog(emode, "ReadRecord: lseek of log file %u, segment %u, offset %u failed: %m",
1839
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1840 1841
			goto next_record_is_invalid;
		}
1842
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1843
		{
1844
			elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1845
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1846 1847
			goto next_record_is_invalid;
		}
1848
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, nextmode))
1849 1850
			goto next_record_is_invalid;
	}
T
Tom Lane 已提交
1851
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
1852 1853
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
1854
		elog(emode, "ReadRecord: contrecord is requested by %X/%X",
1855
			 RecPtr->xlogid, RecPtr->xrecoff);
1856 1857
		goto next_record_is_invalid;
	}
1858
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1859 1860

got_record:;
B
Bruce Momjian 已提交
1861

T
Tom Lane 已提交
1862
	/*
B
Bruce Momjian 已提交
1863 1864
	 * Currently, xl_len == 0 must be bad data, but that might not be true
	 * forever.  See note in XLogInsert.
T
Tom Lane 已提交
1865
	 */
1866 1867
	if (record->xl_len == 0)
	{
1868
		elog(emode, "ReadRecord: record with zero length at %X/%X",
T
Tom Lane 已提交
1869
			 RecPtr->xlogid, RecPtr->xrecoff);
1870 1871
		goto next_record_is_invalid;
	}
B
Bruce Momjian 已提交
1872

T
Tom Lane 已提交
1873
	/*
B
Bruce Momjian 已提交
1874 1875
	 * Compute total length of record including any appended backup
	 * blocks.
T
Tom Lane 已提交
1876 1877 1878 1879 1880 1881 1882 1883
	 */
	total_len = SizeOfXLogRecord + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;
		total_len += sizeof(BkpBlock) + BLCKSZ;
	}
B
Bruce Momjian 已提交
1884

T
Tom Lane 已提交
1885 1886 1887 1888 1889 1890
	/*
	 * Make sure it will fit in buffer (currently, it is mechanically
	 * impossible for this test to fail, but it seems like a good idea
	 * anyway).
	 */
	if (total_len > _INTL_MAXLOGRECSZ)
1891
	{
1892
		elog(emode, "ReadRecord: record length %u at %X/%X too long",
T
Tom Lane 已提交
1893
			 total_len, RecPtr->xlogid, RecPtr->xrecoff);
1894 1895 1896 1897
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
1898
		elog(emode, "ReadRecord: invalid resource manager id %u at %X/%X",
1899
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1900 1901 1902
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
T
Tom Lane 已提交
1903 1904
	len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
	if (total_len > len)
1905
	{
T
Tom Lane 已提交
1906 1907
		/* Need to reassemble record */
		XLogContRecord *contrecord;
B
Bruce Momjian 已提交
1908
		uint32		gotlen = len;
1909

T
Tom Lane 已提交
1910
		memcpy(buffer, record, len);
1911
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
1912
		buffer += len;
1913
		for (;;)
1914
		{
T
Tom Lane 已提交
1915 1916
			readOff += BLCKSZ;
			if (readOff >= XLogSegSize)
1917 1918
			{
				close(readFile);
T
Tom Lane 已提交
1919 1920 1921
				readFile = -1;
				NextLogSeg(readId, readSeg);
				readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1922 1923
				if (readFile < 0)
					goto next_record_is_invalid;
T
Tom Lane 已提交
1924
				readOff = 0;
1925 1926
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1927
			{
1928
				elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1929
					 readId, readSeg, readOff);
T
Tom Lane 已提交
1930 1931
				goto next_record_is_invalid;
			}
1932
			if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1933
				goto next_record_is_invalid;
T
Tom Lane 已提交
1934
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
1935
			{
1936
				elog(emode, "ReadRecord: there is no ContRecord flag in log file %u, segment %u, offset %u",
1937
					 readId, readSeg, readOff);
1938 1939
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1940
			contrecord = (XLogContRecord *) ((char *) readBuf + SizeOfXLogPHD);
B
Bruce Momjian 已提交
1941
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
1942
				total_len != (contrecord->xl_rem_len + gotlen))
1943
			{
1944
				elog(emode, "ReadRecord: invalid ContRecord length %u in log file %u, segment %u, offset %u",
T
Tom Lane 已提交
1945
					 contrecord->xl_rem_len, readId, readSeg, readOff);
1946 1947
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1948 1949
			len = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
1950
			{
B
Bruce Momjian 已提交
1951
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		if (BLCKSZ - SizeOfXLogRecord >= SizeOfXLogPHD +
			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
		{
B
Bruce Momjian 已提交
1965
			nextRecord = (XLogRecord *) ((char *) contrecord +
T
Tom Lane 已提交
1966 1967 1968 1969
				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
B
Bruce Momjian 已提交
1970
			SizeOfXLogPHD + SizeOfXLogContRecord +
T
Tom Lane 已提交
1971 1972 1973
			MAXALIGN(contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		return record;
1974 1975
	}

T
Tom Lane 已提交
1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
	return (XLogRecord *) buffer;
1987

T
Tom Lane 已提交
1988 1989 1990 1991 1992
next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	return NULL;
1993 1994
}

1995 1996 1997 1998
/*
 * Sanity-check the header of an xlog page that has just been read.
 *
 * The page is identified in messages by the current readId/readSeg/readOff.
 * Any problem is reported at level emode and yields false.  When the header
 * passes, its startup id is remembered in lastReadSUI and true is returned.
 *
 * This exists only to keep ReadRecord free of duplicated code; it is not
 * meant to be called from anywhere else.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI)
{
	XLogRecPtr	expected;

	/* Magic number must match */
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		elog(emode, "ReadRecord: invalid magic number %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_magic, readId, readSeg, readOff);
		return false;
	}

	/* No flag bits outside the known set may be present */
	if (hdr->xlp_info & ~XLP_ALL_FLAGS)
	{
		elog(emode, "ReadRecord: invalid info bits %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_info, readId, readSeg, readOff);
		return false;
	}

	/* The address stored in the page must be the one we read it from */
	expected.xlogid = readId;
	expected.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, expected))
	{
		elog(emode, "ReadRecord: unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
			 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
			 readId, readSeg, readOff);
		return false;
	}

	/*
	 * A SUI below the previous page's SUI, or more than a few counts
	 * above it, is not believable.  In theory as many as 512 shutdown
	 * checkpoint records could appear on a 32K-sized xlog page, so that
	 * is the largest legitimate jump.
	 *
	 * The comparison only makes sense when this page is the next one in
	 * sequence, which is why ReadRecord tells us whether to apply it.
	 */
	if (checkSUI &&
		(hdr->xlp_sui < lastReadSUI ||
		 hdr->xlp_sui > lastReadSUI + 512))
	{
		/* translator: SUI = startup id */
		elog(emode, "ReadRecord: out-of-sequence SUI %u (after %u) in log file %u, segment %u, offset %u",
			 hdr->xlp_sui, lastReadSUI, readId, readSeg, readOff);
		return false;
	}

	lastReadSUI = hdr->xlp_sui;
	return true;
}

2053 2054 2055 2056
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */

2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083
/*
 * Set the XLOG directory path.
 *
 * With path == NULL the directory defaults to "pg_xlog" underneath DataDir.
 * Otherwise path is copied into XLogDir; a path that would not fit in
 * MAXPGPATH bytes (including the terminator) is reported at FATAL level.
 */
void
SetXLogDir(char *path)
{
	if (path == NULL)
	{
		snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
		return;
	}

	/* length was verified against MAXPGPATH before the unbounded copy */
	if (strlen(path) >= MAXPGPATH)
		elog(FATAL, "XLOG path '%s' is too long"
			 "; maximum length is %d characters", path, MAXPGPATH-1);
	strcpy(XLogDir, path);
}

2084 2085 2086 2087
void
XLOGPathInit(void)
{
	/* Init XLOG file paths */
2088 2089
	if (strlen(XLogDir) <= 0)
		SetXLogDir(NULL);
2090
	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
2091 2092 2093 2094 2095 2096
}

static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
2097
	char		buffer[BLCKSZ]; /* need not be aligned */
2098 2099 2100
	char	   *localeptr;

	/*
T
Tom Lane 已提交
2101
	 * Initialize version and compatibility-check fields
2102
	 */
T
Tom Lane 已提交
2103 2104
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
2105 2106
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->funcMaxArgs = FUNC_MAX_ARGS;

#ifdef HAVE_INT64_TIMESTAMP
	ControlFile->enableIntTimes = TRUE;
#else
	ControlFile->enableIntTimes = FALSE;
#endif

	ControlFile->localeBuflen = LOCALE_NAME_BUFLEN;
2118 2119
	localeptr = setlocale(LC_COLLATE, NULL);
	if (!localeptr)
2120
		elog(PANIC, "invalid LC_COLLATE setting");
2121 2122 2123
	StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
	localeptr = setlocale(LC_CTYPE, NULL);
	if (!localeptr)
2124
		elog(PANIC, "invalid LC_CTYPE setting");
2125
	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
B
Bruce Momjian 已提交
2126

T
Tom Lane 已提交
2127 2128
	/* Contents are protected with a CRC */
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2129 2130
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2131 2132 2133
			   sizeof(ControlFileData) - sizeof(crc64));
	FIN_CRC64(ControlFile->crc);

2134
	/*
B
Bruce Momjian 已提交
2135 2136 2137 2138 2139
	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
	 * over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail
	 * when we check the contents of the file, but hopefully with a more
	 * specific error than "couldn't read pg_control".
2140 2141
	 */
	if (sizeof(ControlFileData) > BLCKSZ)
2142
		elog(PANIC, "sizeof(ControlFileData) is larger than BLCKSZ; fix either one");
2143

2144 2145 2146
	memset(buffer, 0, BLCKSZ);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

2147 2148
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
2149
	if (fd < 0)
2150
		elog(PANIC, "WriteControlFile: could not create control file (%s): %m",
2151 2152
			 ControlFilePath);

2153
	errno = 0;
2154
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
2155 2156 2157 2158
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2159
		elog(PANIC, "WriteControlFile: write to control file failed: %m");
2160
	}
2161

2162
	if (pg_fsync(fd) != 0)
2163
		elog(PANIC, "WriteControlFile: fsync of control file failed: %m");
2164 2165 2166 2167 2168 2169 2170

	close(fd);
}

static void
ReadControlFile(void)
{
2171
	crc64		crc;
2172 2173 2174 2175 2176 2177 2178
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
2179
		elog(PANIC, "could not open control file (%s): %m", ControlFilePath);
2180 2181

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2182
		elog(PANIC, "read from control file failed: %m");
2183 2184 2185

	close(fd);

T
Tom Lane 已提交
2186 2187 2188 2189 2190 2191 2192
	/*
	 * Check for expected pg_control format version.  If this is wrong,
	 * the CRC check will likely fail because we'll be checking the wrong
	 * number of bytes.  Complaining about wrong version will probably be
	 * more enlightening than complaining about wrong CRC.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
2193
		elog(PANIC,
2194 2195 2196
			 "The database cluster was initialized with PG_CONTROL_VERSION %d,\n"
			 "\tbut the server was compiled with PG_CONTROL_VERSION %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2197 2198 2199
			 ControlFile->pg_control_version, PG_CONTROL_VERSION);

	/* Now check the CRC. */
2200
	INIT_CRC64(crc);
B
Bruce Momjian 已提交
2201 2202
	COMP_CRC64(crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2203
			   sizeof(ControlFileData) - sizeof(crc64));
2204 2205
	FIN_CRC64(crc);

T
Tom Lane 已提交
2206
	if (!EQ_CRC64(crc, ControlFile->crc))
2207
		elog(PANIC, "invalid checksum in control file");
2208

2209
	/*
B
Bruce Momjian 已提交
2210 2211
	 * Do compatibility checking immediately.  We do this here for 2
	 * reasons:
2212
	 *
B
Bruce Momjian 已提交
2213 2214
	 * (1) if the database isn't compatible with the backend executable, we
	 * want to abort before we can possibly do any damage;
2215 2216 2217
	 *
	 * (2) this code is executed in the postmaster, so the setlocale() will
	 * propagate to forked backends, which aren't going to read this file
B
Bruce Momjian 已提交
2218
	 * for themselves.	(These locale settings are considered critical
2219 2220
	 * compatibility items because they can affect sort order of indexes.)
	 */
T
Tom Lane 已提交
2221
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
2222
		elog(PANIC,
2223
			 "The database cluster was initialized with CATALOG_VERSION_NO %d,\n"
2224
			 "\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n"
2225
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2226
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
2227
	if (ControlFile->blcksz != BLCKSZ)
2228
		elog(PANIC,
2229 2230 2231
			 "The database cluster was initialized with BLCKSZ %d,\n"
			 "\tbut the backend was compiled with BLCKSZ %d.\n"
			 "\tIt looks like you need to initdb.",
2232 2233
			 ControlFile->blcksz, BLCKSZ);
	if (ControlFile->relseg_size != RELSEG_SIZE)
2234
		elog(PANIC,
2235 2236
			 "The database cluster was initialized with RELSEG_SIZE %d,\n"
			 "\tbut the backend was compiled with RELSEG_SIZE %d.\n"
2237
			 "\tIt looks like you need to recompile or initdb.",
2238
			 ControlFile->relseg_size, RELSEG_SIZE);
2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274

	if (ControlFile->nameDataLen != NAMEDATALEN)
		elog(PANIC,
			 "The database cluster was initialized with NAMEDATALEN %d,\n"
			 "\tbut the backend was compiled with NAMEDATALEN %d.\n"
			 "\tIt looks like you need to recompile or initdb.",
			 ControlFile->nameDataLen, NAMEDATALEN);

	if (ControlFile->funcMaxArgs != FUNC_MAX_ARGS)
		elog(PANIC,
			 "The database cluster was initialized with FUNC_MAX_ARGS %d,\n"
			 "\tbut the backend was compiled with FUNC_MAX_ARGS %d.\n"
			 "\tIt looks like you need to recompile or initdb.",
			 ControlFile->funcMaxArgs, FUNC_MAX_ARGS);

#ifdef HAVE_INT64_TIMESTAMP
	if (ControlFile->enableIntTimes != TRUE)
		elog(PANIC,
			 "The database cluster was initialized without HAVE_INT64_TIMESTAMP\n"
			 "\tbut the backend was compiled with HAVE_INT64_TIMESTAMP.\n"
			 "\tIt looks like you need to recompile or initdb.");
#else
	if (ControlFile->enableIntTimes != FALSE)
		elog(PANIC,
			 "The database cluster was initialized with HAVE_INT64_TIMESTAMP\n"
			 "\tbut the backend was compiled without HAVE_INT64_TIMESTAMP.\n"
			 "\tIt looks like you need to recompile or initdb.");
#endif

	if (ControlFile->localeBuflen != LOCALE_NAME_BUFLEN)
		elog(PANIC,
			 "The database cluster was initialized with LOCALE_NAME_BUFLEN %d,\n"
			 "\tbut the backend was compiled with LOCALE_NAME_BUFLEN %d.\n"
			 "\tIt looks like you need to initdb.",
			 ControlFile->localeBuflen, LOCALE_NAME_BUFLEN);

2275
	if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
2276
		elog(PANIC,
2277
			 "The database cluster was initialized with LC_COLLATE '%s',\n"
2278 2279
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2280 2281
			 ControlFile->lc_collate);
	if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
2282
		elog(PANIC,
2283 2284 2285
			 "The database cluster was initialized with LC_CTYPE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2286 2287 2288
			 ControlFile->lc_ctype);
}

2289
void
2290
UpdateControlFile(void)
2291
{
2292
	int			fd;
2293

2294
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2295 2296
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2297
			   sizeof(ControlFileData) - sizeof(crc64));
2298 2299
	FIN_CRC64(ControlFile->crc);

2300
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
2301
	if (fd < 0)
2302
		elog(PANIC, "could not open control file (%s): %m", ControlFilePath);
2303

2304
	errno = 0;
2305
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2306 2307 2308 2309
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2310
		elog(PANIC, "write to control file failed: %m");
2311
	}
2312

2313
	if (pg_fsync(fd) != 0)
2314
		elog(PANIC, "fsync of control file failed: %m");
2315 2316 2317 2318

	close(fd);
}

2319
/*
 * Initialization of shared memory for XLOG
 */

2323
int
2324
XLOGShmemSize(void)
2325 2326 2327 2328
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

T
Tom Lane 已提交
2329 2330 2331
	return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
		+ BLCKSZ * XLOGbuffers +
		MAXALIGN(sizeof(ControlFileData));
2332 2333 2334 2335 2336
}

void
XLOGShmemInit(void)
{
2337
	bool		found;
2338

2339
	/* this must agree with space requested by XLOGShmemSize() */
2340 2341 2342
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

2343
	XLogCtl = (XLogCtlData *)
T
Tom Lane 已提交
2344 2345 2346 2347 2348
		ShmemInitStruct("XLOG Ctl",
						MAXALIGN(sizeof(XLogCtlData) +
								 sizeof(XLogRecPtr) * XLOGbuffers)
						+ BLCKSZ * XLOGbuffers,
						&found);
2349
	Assert(!found);
2350 2351 2352 2353
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &found);
	Assert(!found);

T
Tom Lane 已提交
2354
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
2355

T
Tom Lane 已提交
2356 2357 2358 2359 2360 2361 2362 2363
	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
	 * a multiple of the alignment for same, so no extra alignment padding
	 * is needed here.
	 */
	XLogCtl->xlblocks = (XLogRecPtr *)
		(((char *) XLogCtl) + sizeof(XLogCtlData));
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
B
Bruce Momjian 已提交
2364

T
Tom Lane 已提交
2365
	/*
B
Bruce Momjian 已提交
2366 2367
	 * Here, on the other hand, we must MAXALIGN to ensure the page
	 * buffers have worst-case alignment.
T
Tom Lane 已提交
2368 2369 2370 2371 2372 2373 2374
	 */
	XLogCtl->pages =
		((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
									  sizeof(XLogRecPtr) * XLOGbuffers);
	memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);

	/*
B
Bruce Momjian 已提交
2375 2376
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will
	 * fill in additional info.)
T
Tom Lane 已提交
2377 2378 2379 2380
	 */
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
2381
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
2382

2383 2384 2385 2386 2387 2388 2389
	/*
	 * If we are not in bootstrap mode, pg_control should already exist.
	 * Read and validate it immediately (see comments in ReadControlFile()
	 * for the reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
2390 2391 2392
}

/*
T
Tom Lane 已提交
2393 2394
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
2395 2396
 */
void
T
Tom Lane 已提交
2397
BootStrapXLOG(void)
2398
{
2399
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2400 2401
	char	   *buffer;
	XLogPageHeader page;
2402
	XLogRecord *record;
B
Bruce Momjian 已提交
2403
	bool		use_existent;
2404
	crc64		crc;
2405

T
Tom Lane 已提交
2406 2407 2408 2409
	/* Use malloc() to ensure buffer is MAXALIGNED */
	buffer = (char *) malloc(BLCKSZ);
	page = (XLogPageHeader) buffer;

2410 2411 2412
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
T
Tom Lane 已提交
2413
	checkPoint.ThisStartUpID = 0;
2414
	checkPoint.nextXid = FirstNormalTransactionId;
2415
	checkPoint.nextOid = BootstrapObjectIdData;
T
Tom Lane 已提交
2416
	checkPoint.time = time(NULL);
2417

2418 2419 2420 2421
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;

2422 2423 2424
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
2425
	page->xlp_sui = checkPoint.ThisStartUpID;
2426 2427
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
2428 2429 2430
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
2431 2432 2433
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
2434
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
2435
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
2436
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
2437

2438
	INIT_CRC64(crc);
T
Tom Lane 已提交
2439
	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
B
Bruce Momjian 已提交
2440
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
2441
			   SizeOfXLogRecord - sizeof(crc64));
2442 2443 2444
	FIN_CRC64(crc);
	record->xl_crc = crc;

2445 2446
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
2447

2448
	errno = 0;
T
Tom Lane 已提交
2449
	if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
2450 2451 2452 2453
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2454
		elog(PANIC, "BootStrapXLOG failed to write log file: %m");
2455
	}
2456

T
Tom Lane 已提交
2457
	if (pg_fsync(openLogFile) != 0)
2458
		elog(PANIC, "BootStrapXLOG failed to fsync log file: %m");
2459

T
Tom Lane 已提交
2460 2461
	close(openLogFile);
	openLogFile = -1;
2462

2463
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
2464 2465 2466
	/* Initialize pg_control status fields */
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
2467 2468 2469
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
2470
	ControlFile->checkPointCopy = checkPoint;
2471
	/* some additional ControlFile fields are set in WriteControlFile() */
2472

2473
	WriteControlFile();
2474 2475 2476

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
2477 2478
}

2479
/*
 * Format a time_t as "YYYY-MM-DD HH:MM:SS TZ" in local time, for log
 * messages.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call.  If the time cannot be broken down or formatted, the raw numeric
 * value is returned instead, so the buffer always holds a valid string.
 */
static char *
str_time(time_t tnow)
{
	/* generous size: the %Z zone name can be long on some platforms */
	static char buf[128];
	struct tm  *tm = localtime(&tnow);

	/*
	 * localtime() may return NULL, and strftime() returns 0 (leaving the
	 * buffer contents unspecified) when the result does not fit.
	 */
	if (tm == NULL ||
		strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", tm) == 0)
		snprintf(buf, sizeof(buf), "%ld", (long) tnow);

	return buf;
}

/*
T
Tom Lane 已提交
2492
 * This must be called ONCE during postmaster or standalone-backend startup
2493 2494
 */
void
T
Tom Lane 已提交
2495
StartupXLOG(void)
2496
{
2497 2498
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2499
	bool		wasShutdown;
2500
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
2501 2502 2503
				LastRec,
				checkPointLoc,
				EndOfLog;
2504
	XLogRecord *record;
T
Tom Lane 已提交
2505
	char	   *buffer;
2506

T
Tom Lane 已提交
2507 2508
	/* Use malloc() to ensure record buffer is MAXALIGNED */
	buffer = (char *) malloc(_INTL_MAXLOGRECSZ);
2509

T
Tom Lane 已提交
2510
	CritSectionCount++;
2511 2512

	/*
2513 2514
	 * Read control file and check XLOG status looks valid.
	 *
B
Bruce Momjian 已提交
2515 2516
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
2517
	 */
2518
	ReadControlFile();
2519

2520 2521 2522
	if (ControlFile->logSeg == 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
2523
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
2524
		elog(PANIC, "control file context is broken");
2525 2526

	if (ControlFile->state == DB_SHUTDOWNED)
2527
		elog(LOG, "database system was shut down at %s",
2528
			 str_time(ControlFile->time));
2529
	else if (ControlFile->state == DB_SHUTDOWNING)
2530
		elog(LOG, "database system shutdown was interrupted at %s",
2531
			 str_time(ControlFile->time));
2532
	else if (ControlFile->state == DB_IN_RECOVERY)
2533
		elog(LOG, "database system was interrupted being in recovery at %s\n"
T
Tom Lane 已提交
2534
			 "\tThis probably means that some data blocks are corrupted\n"
2535
			 "\tand you will have to use the last backup for recovery.",
2536
			 str_time(ControlFile->time));
2537
	else if (ControlFile->state == DB_IN_PRODUCTION)
2538
		elog(LOG, "database system was interrupted at %s",
2539
			 str_time(ControlFile->time));
2540

T
Tom Lane 已提交
2541 2542 2543 2544
	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
2545
	record = ReadCheckpointRecord(ControlFile->checkPoint, 1, buffer);
T
Tom Lane 已提交
2546 2547 2548
	if (record != NULL)
	{
		checkPointLoc = ControlFile->checkPoint;
2549
		elog(LOG, "checkpoint record is at %X/%X",
T
Tom Lane 已提交
2550 2551 2552 2553
			 checkPointLoc.xlogid, checkPointLoc.xrecoff);
	}
	else
	{
2554
		record = ReadCheckpointRecord(ControlFile->prevCheckPoint, 2, buffer);
T
Tom Lane 已提交
2555 2556 2557
		if (record != NULL)
		{
			checkPointLoc = ControlFile->prevCheckPoint;
2558
			elog(LOG, "using previous checkpoint record at %X/%X",
T
Tom Lane 已提交
2559 2560 2561 2562
				 checkPointLoc.xlogid, checkPointLoc.xrecoff);
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
2563
			elog(PANIC, "unable to locate a valid checkpoint record");
T
Tom Lane 已提交
2564 2565 2566 2567
	}
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
2568

2569
	elog(LOG, "redo record is at %X/%X; undo record is at %X/%X; shutdown %s",
2570
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
2571
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
T
Tom Lane 已提交
2572
		 wasShutdown ? "TRUE" : "FALSE");
2573
	elog(LOG, "next transaction id: %u; next oid: %u",
2574
		 checkPoint.nextXid, checkPoint.nextOid);
2575
	if (!TransactionIdIsNormal(checkPoint.nextXid))
2576
		elog(PANIC, "invalid next transaction id");
2577 2578 2579

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
2580
	ShmemVariableCache->oidCount = 0;
2581

V
WAL  
Vadim B. Mikheev 已提交
2582
	ThisStartUpID = checkPoint.ThisStartUpID;
B
Bruce Momjian 已提交
2583
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
2584
		XLogCtl->SavedRedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
2585

2586
	if (XLByteLT(RecPtr, checkPoint.redo))
2587
		elog(PANIC, "invalid redo in checkpoint record");
2588 2589 2590
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;

B
Bruce Momjian 已提交
2591
	if (XLByteLT(checkPoint.undo, RecPtr) ||
V
Vadim B. Mikheev 已提交
2592
		XLByteLT(checkPoint.redo, RecPtr))
2593
	{
T
Tom Lane 已提交
2594
		if (wasShutdown)
2595
			elog(PANIC, "invalid redo/undo record in shutdown checkpoint");
V
WAL  
Vadim B. Mikheev 已提交
2596
		InRecovery = true;
2597 2598
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
2599
		InRecovery = true;
2600

V
WAL  
Vadim B. Mikheev 已提交
2601 2602
	/* REDO */
	if (InRecovery)
2603
	{
2604
		elog(LOG, "database system was not properly shut down; "
2605
			 "automatic recovery in progress");
2606 2607 2608 2609
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2610
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
2611

2612 2613
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
2614
			record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
B
Bruce Momjian 已提交
2615
		else
2616 2617
		{
			/* read past CheckPoint record */
T
Tom Lane 已提交
2618
			record = ReadRecord(NULL, LOG, buffer);
2619
		}
2620

T
Tom Lane 已提交
2621
		if (record != NULL)
2622
		{
V
WAL  
Vadim B. Mikheev 已提交
2623
			InRedo = true;
2624
			elog(LOG, "redo starts at %X/%X",
2625
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2626 2627
			do
			{
2628 2629
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
2630
											ShmemVariableCache->nextXid))
2631 2632 2633 2634
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}
V
WAL  
Vadim B. Mikheev 已提交
2635 2636
				if (XLOG_DEBUG)
				{
B
Bruce Momjian 已提交
2637
					char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
2638

2639
					sprintf(buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
2640 2641
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
							EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
2642 2643
					xlog_outrec(buf, record);
					strcat(buf, " - ");
B
Bruce Momjian 已提交
2644 2645
					RmgrTable[record->xl_rmid].rm_desc(buf,
								record->xl_info, XLogRecGetData(record));
2646
					elog(LOG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
2647 2648
				}

T
Tom Lane 已提交
2649
				if (record->xl_info & XLR_BKP_BLOCK_MASK)
2650 2651
					RestoreBkpBlocks(record, EndRecPtr);

2652
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
T
Tom Lane 已提交
2653 2654
				record = ReadRecord(NULL, LOG, buffer);
			} while (record != NULL);
2655
			elog(LOG, "redo done at %X/%X",
2656
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2657
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2658
			InRedo = false;
2659 2660
		}
		else
2661
			elog(LOG, "redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
2662 2663
	}

T
Tom Lane 已提交
2664 2665 2666 2667
	/*
	 * Init xlog buffer cache using the block containing the last valid
	 * record from the previous incarnation.
	 */
2668
	record = ReadRecord(&LastRec, PANIC, buffer);
T
Tom Lane 已提交
2669 2670 2671 2672 2673 2674
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, openLogId, openLogSeg);
	openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
	openLogOff = 0;
	ControlFile->logId = openLogId;
	ControlFile->logSeg = openLogSeg + 1;
V
WAL  
Vadim B. Mikheev 已提交
2675
	Insert = &XLogCtl->Insert;
2676
	Insert->PrevRecord = LastRec;
B
Bruce Momjian 已提交
2677 2678

	/*
2679 2680
	 * If the next record will go to the new page then initialize for that
	 * one.
T
Tom Lane 已提交
2681
	 */
2682 2683 2684 2685
	if ((BLCKSZ - EndOfLog.xrecoff % BLCKSZ) < SizeOfXLogRecord)
		EndOfLog.xrecoff += (BLCKSZ - EndOfLog.xrecoff % BLCKSZ);
	if (EndOfLog.xrecoff % BLCKSZ == 0)
	{
2686 2687 2688 2689
		XLogRecPtr	NewPageEndPtr;

		NewPageEndPtr = EndOfLog;
		if (NewPageEndPtr.xrecoff >= XLogFileSize)
2690
		{
2691 2692 2693
			/* crossing a logid boundary */
			NewPageEndPtr.xlogid += 1;
			NewPageEndPtr.xrecoff = BLCKSZ;
2694 2695
		}
		else
2696 2697
			NewPageEndPtr.xrecoff += BLCKSZ;
		XLogCtl->xlblocks[0] = NewPageEndPtr;
2698 2699 2700 2701 2702
		Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC;
		if (InRecovery)
			Insert->currpage->xlp_sui = ThisStartUpID;
		else
			Insert->currpage->xlp_sui = ThisStartUpID + 1;
2703 2704
		Insert->currpage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
		Insert->currpage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
2705
		/* rest of buffer was zeroed in XLOGShmemInit */
2706
		Insert->currpos = (char *) Insert->currpage + SizeOfXLogPHD;
2707 2708 2709 2710 2711 2712
	}
	else
	{
		XLogCtl->xlblocks[0].xlogid = openLogId;
		XLogCtl->xlblocks[0].xrecoff =
			((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
2713

2714 2715
		/*
		 * Tricky point here: readBuf contains the *last* block that the
2716
		 * LastRec record spans, not the one it starts in.	The last block
2717
		 * is indeed the one we want to use.
2718 2719 2720 2721 2722 2723 2724 2725
		 */
		Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
		memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
		Insert->currpos = (char *) Insert->currpage +
			(EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
		/* Make sure rest of page is zero */
		memset(Insert->currpos, 0, INSERT_FREESPACE(Insert));
	}
V
WAL  
Vadim B. Mikheev 已提交
2726

T
Tom Lane 已提交
2727
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
2728

T
Tom Lane 已提交
2729 2730 2731
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
2732

T
Tom Lane 已提交
2733 2734
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
2735

V
Vadim B. Mikheev 已提交
2736
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
2737 2738 2739
	/* UNDO */
	if (InRecovery)
	{
2740 2741 2742
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
2743
			elog(LOG, "undo starts at %X/%X",
2744
				 RecPtr.xlogid, RecPtr.xrecoff);
2745 2746
			do
			{
2747
				record = ReadRecord(&RecPtr, PANIC, buffer);
2748
				if (TransactionIdIsValid(record->xl_xid) &&
2749
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
2750
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
2751 2752
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
2753
			elog(LOG, "undo done at %X/%X",
2754
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2755 2756
		}
		else
2757
			elog(LOG, "undo is not required");
2758
	}
V
WAL  
Vadim B. Mikheev 已提交
2759
#endif
2760

V
WAL  
Vadim B. Mikheev 已提交
2761
	if (InRecovery)
2762
	{
T
Tom Lane 已提交
2763 2764 2765 2766 2767 2768 2769
		/*
		 * In case we had to use the secondary checkpoint, make sure that
		 * it will still be shown as the secondary checkpoint after this
		 * CreateCheckPoint operation; we don't want the broken primary
		 * checkpoint to become prevCheckPoint...
		 */
		ControlFile->checkPoint = checkPointLoc;
2770
		CreateCheckPoint(true);
V
WAL  
Vadim B. Mikheev 已提交
2771
		XLogCloseRelationCache();
2772
	}
2773

T
Tom Lane 已提交
2774 2775 2776 2777
	/*
	 * Preallocate additional log files, if wanted.
	 */
	PreallocXlogFiles(EndOfLog);
2778

V
WAL  
Vadim B. Mikheev 已提交
2779
	InRecovery = false;
2780 2781 2782 2783 2784

	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2785 2786 2787
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

2788 2789 2790
	/* Start up the commit log, too */
	StartupCLOG();

2791
	elog(LOG, "database system is ready");
2792
	CritSectionCount--;
2793

T
Tom Lane 已提交
2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	free(buffer);
}

2809 2810 2811 2812
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 * whichChkpt = 1 for "primary", 2 for "secondary", merely informative
 */
T
Tom Lane 已提交
2813 2814
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr,
2815
					 int whichChkpt,
T
Tom Lane 已提交
2816 2817 2818 2819 2820 2821
					 char *buffer)
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
2822 2823 2824
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint link in control file" :
				   "invalid secondary checkpoint link in control file"));
T
Tom Lane 已提交
2825 2826 2827 2828 2829 2830 2831
		return NULL;
	}

	record = ReadRecord(&RecPtr, LOG, buffer);

	if (record == NULL)
	{
2832 2833 2834
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint record" :
				   "invalid secondary checkpoint record"));
T
Tom Lane 已提交
2835 2836 2837 2838
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
2839
		elog(LOG, (whichChkpt == 1 ?
2840 2841
			 "invalid resource manager id in primary checkpoint record" :
		  "invalid resource manager id in secondary checkpoint record"));
T
Tom Lane 已提交
2842 2843 2844 2845 2846
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
2847 2848 2849
		elog(LOG, (whichChkpt == 1 ?
				   "invalid xl_info in primary checkpoint record" :
				   "invalid xl_info in secondary checkpoint record"));
T
Tom Lane 已提交
2850 2851 2852 2853
		return NULL;
	}
	if (record->xl_len != sizeof(CheckPoint))
	{
2854 2855 2856
		elog(LOG, (whichChkpt == 1 ?
				   "invalid length of primary checkpoint record" :
				   "invalid length of secondary checkpoint record"));
T
Tom Lane 已提交
2857 2858 2859
		return NULL;
	}
	return record;
2860 2861
}

V
WAL  
Vadim B. Mikheev 已提交
2862
/*
T
Tom Lane 已提交
2863
 * Postmaster uses this to initialize ThisStartUpID & RedoRecPtr from
2864
 * XLogCtlData located in shmem after successful startup.
V
WAL  
Vadim B. Mikheev 已提交
2865 2866 2867 2868 2869
 */
void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
2870
	RedoRecPtr = XLogCtl->SavedRedoRecPtr;
2871 2872 2873
}

/*
T
Tom Lane 已提交
2874
 * CheckPoint process called by postmaster saves copy of new RedoRecPtr
2875 2876 2877 2878 2879 2880
 * in shmem (using SetSavedRedoRecPtr).  When checkpointer completes,
 * postmaster calls GetSavedRedoRecPtr to update its own copy of RedoRecPtr,
 * so that subsequently-spawned backends will start out with a reasonably
 * up-to-date local RedoRecPtr.  Since these operations are not protected by
 * any lock and copying an XLogRecPtr isn't atomic, it's unsafe to use either
 * of these routines at other times!
2881 2882
 */
void
2883
SetSavedRedoRecPtr(void)
2884
{
2885
	XLogCtl->SavedRedoRecPtr = RedoRecPtr;
2886 2887 2888
}

void
2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899
GetSavedRedoRecPtr(void)
{
	RedoRecPtr = XLogCtl->SavedRedoRecPtr;
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
2900 2901
GetRedoRecPtr(void)
{
2902 2903 2904 2905 2906 2907 2908 2909 2910
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
	SpinLockRelease_NoHoldoff(&xlogctl->info_lck);

	return RedoRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2911 2912
}

2913
/*
T
Tom Lane 已提交
2914
 * This must be called ONCE during postmaster or standalone-backend shutdown
2915 2916
 */
void
T
Tom Lane 已提交
2917
ShutdownXLOG(void)
2918
{
2919
	elog(LOG, "shutting down");
2920

T
Tom Lane 已提交
2921 2922
	/* suppress in-transaction check in CreateCheckPoint */
	MyLastRecPtr.xrecoff = 0;
2923
	MyXactMadeXLogEntry = false;
T
Tom Lane 已提交
2924

2925
	CritSectionCount++;
V
Vadim B. Mikheev 已提交
2926
	CreateDummyCaches();
2927
	CreateCheckPoint(true);
2928
	ShutdownCLOG();
2929
	CritSectionCount--;
2930

2931
	elog(LOG, "database system is shut down");
2932 2933
}

T
Tom Lane 已提交
2934 2935 2936
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
2937 2938 2939
void
CreateCheckPoint(bool shutdown)
{
2940 2941 2942
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
2943
	XLogRecData rdata;
2944
	uint32		freespace;
V
Vadim B. Mikheev 已提交
2945 2946 2947
	uint32		_logId;
	uint32		_logSeg;

2948
	if (MyXactMadeXLogEntry)
V
Vadim B. Mikheev 已提交
2949
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
B
Bruce Momjian 已提交
2950

2951 2952
	/*
	 * The CheckpointLock can be held for quite a while, which is not good
2953 2954 2955 2956 2957
	 * because we won't respond to a cancel/die request while waiting for
	 * an LWLock.  (But the alternative of using a regular lock won't work
	 * for background checkpoint processes, which are not regular
	 * backends.) So, rather than use a plain LWLockAcquire, use this
	 * kluge to allow an interrupt to be accepted while we are waiting:
2958 2959
	 */
	while (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
V
Vadim B. Mikheev 已提交
2960
	{
2961 2962
		CHECK_FOR_INTERRUPTS();
		sleep(1);
V
Vadim B. Mikheev 已提交
2963
	}
2964

2965 2966
	START_CRIT_SECTION();

2967 2968 2969 2970 2971 2972
	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
T
Tom Lane 已提交
2973 2974

	memset(&checkPoint, 0, sizeof(checkPoint));
V
WAL  
Vadim B. Mikheev 已提交
2975
	checkPoint.ThisStartUpID = ThisStartUpID;
T
Tom Lane 已提交
2976
	checkPoint.time = time(NULL);
2977

2978
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
2979 2980 2981 2982

	/*
	 * If this isn't a shutdown, and we have not inserted any XLOG records
	 * since the start of the last checkpoint, skip the checkpoint.  The
B
Bruce Momjian 已提交
2983 2984 2985 2986 2987 2988
	 * idea here is to avoid inserting duplicate checkpoints when the
	 * system is idle.	That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the
	 * previous checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
2989 2990
	 *
	 * We have to make two tests to determine that nothing has happened since
B
Bruce Momjian 已提交
2991 2992 2993
	 * the start of the last checkpoint: current insertion point must
	 * match the end of the last checkpoint record, and its redo pointer
	 * must point to itself.
T
Tom Lane 已提交
2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007
	 */
	if (!shutdown)
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
3008 3009
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're
	 * off doing the buffer flush work.  Those XLOG records are logically
B
Bruce Momjian 已提交
3021
	 * after the checkpoint, even though physically before it.	Got that?
T
Tom Lane 已提交
3022 3023
	 */
	freespace = INSERT_FREESPACE(Insert);
3024 3025
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
3026 3027
		(void) AdvanceXLInsertBuffer();
		/* OK to ignore update return flag, since we will do flush anyway */
3028 3029
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
T
Tom Lane 已提交
3030
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
3031

T
Tom Lane 已提交
3032 3033
	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls;
3034
	 * this must be done while holding the insert lock AND the info_lck.
T
Tom Lane 已提交
3035
	 */
3036 3037 3038 3039 3040 3041 3042 3043
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
		SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
	}
B
Bruce Momjian 已提交
3044

T
Tom Lane 已提交
3045
	/*
J
Jan Wieck 已提交
3046
	 * Get UNDO record ptr - this is oldest of PGPROC->logRec values. We do
B
Bruce Momjian 已提交
3047 3048 3049
	 * this while holding insert lock to ensure that we won't miss any
	 * about-to-commit transactions (UNDO must include all xacts that have
	 * commits after REDO point).
3050 3051 3052 3053 3054 3055 3056 3057 3058
	 *
	 * XXX temporarily ifdef'd out to avoid three-way deadlock condition:
	 * GetUndoRecPtr needs to grab SInvalLock to ensure that it is looking
	 * at a stable set of proc records, but grabbing SInvalLock while holding
	 * WALInsertLock is no good.  GetNewTransactionId may cause a WAL record
	 * to be written while holding XidGenLock, and GetSnapshotData needs to
	 * get XidGenLock while holding SInvalLock, so there's a risk of deadlock.
	 * Need to find a better solution.  See pgsql-hackers discussion of
	 * 17-Dec-01.
T
Tom Lane 已提交
3059
	 */
3060
#ifdef NOT_USED
T
Tom Lane 已提交
3061 3062 3063
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
3064
		elog(PANIC, "active transaction while database system is shutting down");
3065
#endif
T
Tom Lane 已提交
3066 3067 3068 3069 3070

	/*
	 * Now we can release insert lock, allowing other xacts to proceed
	 * even while we are flushing disk buffers.
	 */
3071
	LWLockRelease(WALInsertLock);
3072

3073
	LWLockAcquire(XidGenLock, LW_SHARED);
3074
	checkPoint.nextXid = ShmemVariableCache->nextXid;
3075
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
3076

3077
	LWLockAcquire(OidGenLock, LW_SHARED);
3078
	checkPoint.nextOid = ShmemVariableCache->nextOid;
3079 3080
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
3081
	LWLockRelease(OidGenLock);
3082

T
Tom Lane 已提交
3083
	/*
B
Bruce Momjian 已提交
3084 3085
	 * Having constructed the checkpoint record, ensure all shmem disk
	 * buffers are flushed to disk.
T
Tom Lane 已提交
3086
	 */
V
Vadim B. Mikheev 已提交
3087
	FlushBufferPool();
3088

3089 3090 3091
	/* And commit-log buffers, too */
	CheckPointCLOG();

T
Tom Lane 已提交
3092 3093 3094
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
3095
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3096
	rdata.data = (char *) (&checkPoint);
3097 3098 3099
	rdata.len = sizeof(checkPoint);
	rdata.next = NULL;

T
Tom Lane 已提交
3100 3101 3102 3103 3104 3105
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
3106

T
Tom Lane 已提交
3107 3108 3109 3110 3111
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record,
	 * recptr = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
3112
		elog(PANIC, "concurrent transaction log activity while database system is shutting down");
3113

T
Tom Lane 已提交
3114
	/*
3115 3116 3117 3118 3119 3120 3121
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
	 *
	 * With UNDO support: oldest item is redo or undo, whichever is older;
	 * but watch out for case that undo = 0.
	 *
	 * Without UNDO support: just use the redo pointer.  This allows xlog
3122 3123
	 * space to be freed much faster when there are long-running
	 * transactions.
T
Tom Lane 已提交
3124
	 */
3125
#ifdef NOT_USED
B
Bruce Momjian 已提交
3126
	if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
T
Tom Lane 已提交
3127 3128 3129 3130
		XLByteLT(ControlFile->checkPointCopy.undo,
				 ControlFile->checkPointCopy.redo))
		XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
	else
3131
#endif
T
Tom Lane 已提交
3132
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
3133

T
Tom Lane 已提交
3134 3135 3136
	/*
	 * Update the control file.
	 */
3137
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3138 3139
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
3140 3141 3142
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
3143 3144
	ControlFile->time = time(NULL);
	UpdateControlFile();
3145
	LWLockRelease(ControlFileLock);
3146

V
Vadim B. Mikheev 已提交
3147
	/*
T
Tom Lane 已提交
3148 3149
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
V
Vadim B. Mikheev 已提交
3150 3151 3152
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
3153
		PrevLogSeg(_logId, _logSeg);
3154
		MoveOfflineLogs(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
3155 3156
	}

T
Tom Lane 已提交
3157 3158 3159 3160 3161 3162 3163 3164
	/*
	 * Make more log segments if needed.  (Do this after deleting offline
	 * log segments, to avoid having peak disk space usage higher than
	 * necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

3165
	LWLockRelease(CheckpointLock);
V
Vadim B. Mikheev 已提交
3166

3167
	END_CRIT_SECTION();
3168
}
V
WAL  
Vadim B. Mikheev 已提交
3169

T
Tom Lane 已提交
3170 3171 3172
/*
 * Write a NEXTOID log record
 */
3173 3174 3175
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
3176
	XLogRecData rdata;
3177

3178
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3179
	rdata.data = (char *) (&nextOid);
3180 3181 3182 3183
	rdata.len = sizeof(Oid);
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
}
V
WAL  
Vadim B. Mikheev 已提交
3184

T
Tom Lane 已提交
3185 3186 3187
/*
 * XLOG resource manager's routines
 */
V
WAL  
Vadim B. Mikheev 已提交
3188 3189 3190
/*
 * xlog_redo
 *		Replay one XLOG-resource-manager record by updating the XID/OID
 *		counters in ShmemVariableCache.
 *
 * The record type is taken from xl_info with the XLR_INFO_MASK bits
 * removed.  Record types other than NEXTOID and the two checkpoint kinds
 * are ignored.  lsn is not used.
 */
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_NEXTOID:
			{
				Oid			loggedOid;

				memcpy(&loggedOid, XLogRecGetData(record), sizeof(Oid));
				/* only ever move the OID counter forward */
				if (ShmemVariableCache->nextOid < loggedOid)
				{
					ShmemVariableCache->nextOid = loggedOid;
					ShmemVariableCache->oidCount = 0;
				}
				break;
			}

		case XLOG_CHECKPOINT_SHUTDOWN:
			{
				CheckPoint	chkpt;

				memcpy(&chkpt, XLogRecGetData(record), sizeof(CheckPoint));
				/* In a SHUTDOWN checkpoint, believe the counters exactly */
				ShmemVariableCache->nextXid = chkpt.nextXid;
				ShmemVariableCache->nextOid = chkpt.nextOid;
				ShmemVariableCache->oidCount = 0;
				break;
			}

		case XLOG_CHECKPOINT_ONLINE:
			{
				CheckPoint	chkpt;

				memcpy(&chkpt, XLogRecGetData(record), sizeof(CheckPoint));
				/* In an ONLINE checkpoint, treat the counters like NEXTOID */
				if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
										  chkpt.nextXid))
					ShmemVariableCache->nextXid = chkpt.nextXid;
				if (ShmemVariableCache->nextOid < chkpt.nextOid)
				{
					ShmemVariableCache->nextOid = chkpt.nextOid;
					ShmemVariableCache->oidCount = 0;
				}
				break;
			}

		default:
			/* nothing to do for any other record type */
			break;
	}
}
B
Bruce Momjian 已提交
3230

V
WAL  
Vadim B. Mikheev 已提交
3231 3232 3233 3234
/*
 * xlog_undo
 *		Undo routine of the XLOG resource manager.  The body is empty, so
 *		calling it has no effect; both arguments are ignored.
 */
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
	/* intentionally empty */
}
B
Bruce Momjian 已提交
3235

V
WAL  
Vadim B. Mikheev 已提交
3236
void
B
Bruce Momjian 已提交
3237
xlog_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
3238
{
B
Bruce Momjian 已提交
3239
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
3240

T
Tom Lane 已提交
3241 3242
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
3243
	{
B
Bruce Momjian 已提交
3244 3245
		CheckPoint *checkpoint = (CheckPoint *) rec;

3246
		sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; "
B
Bruce Momjian 已提交
3247 3248 3249 3250 3251 3252
				"sui %u; xid %u; oid %u; %s",
				checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
				checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
				checkpoint->ThisStartUpID, checkpoint->nextXid,
				checkpoint->nextOid,
			 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
3253
	}
3254 3255
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
3256
		Oid			nextOid;
3257 3258 3259 3260

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
V
WAL  
Vadim B. Mikheev 已提交
3261 3262 3263 3264 3265 3266 3267
	else
		strcat(buf, "UNKNOWN");
}

/*
 * xlog_outrec
 *		Append a summary of an XLOG record header to buf: the previous
 *		record pointer, the transaction's previous record pointer, the
 *		xid, the number of backup blocks (only when nonzero) and the
 *		resource manager name.
 *
 * buf must already hold a NUL-terminated string; no length is passed in,
 * so the caller has to supply a buffer with enough room.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	int			nbkp = 0;
	int			blk;

	sprintf(buf + strlen(buf), "prev %X/%X; xprev %X/%X; xid %u",
			record->xl_prev.xlogid, record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid, record->xl_xact_prev.xrecoff,
			record->xl_xid);

	/* count the backup-block flag bits that are set in xl_info */
	for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
	{
		if (record->xl_info & (XLR_SET_BKP_BLOCK(blk)))
			nbkp++;
	}

	if (nbkp != 0)
		sprintf(buf + strlen(buf), "; bkpb %d", nbkp);

	sprintf(buf + strlen(buf), ": %s",
			RmgrTable[record->xl_rmid].rm_name);
}
3289 3290 3291


/*
 * GUC support
 */
3294 3295
/*
 * assign_xlog_sync_method
 *		Validate a sync-method name and, if doit is true, make it the
 *		active XLOG sync method.
 *
 * method is matched case-insensitively against "fsync" and, where the
 * platform provides them, "fdatasync", "open_sync" and "open_datasync".
 *
 * Returns method if the name is recognized, NULL otherwise.  When doit is
 * false nothing is changed.  interactive is not used here.
 */
const char *
assign_xlog_sync_method(const char *method, bool doit, bool interactive)
{
	int			wantedMethod;
	int			wantedBit = 0;	/* only the open_* methods need a flag */

	if (strcasecmp(method, "fsync") == 0)
		wantedMethod = SYNC_METHOD_FSYNC;
#ifdef HAVE_FDATASYNC
	else if (strcasecmp(method, "fdatasync") == 0)
		wantedMethod = SYNC_METHOD_FDATASYNC;
#endif
#ifdef OPEN_SYNC_FLAG
	else if (strcasecmp(method, "open_sync") == 0)
	{
		wantedMethod = SYNC_METHOD_OPEN;
		wantedBit = OPEN_SYNC_FLAG;
	}
#endif
#ifdef OPEN_DATASYNC_FLAG
	else if (strcasecmp(method, "open_datasync") == 0)
	{
		wantedMethod = SYNC_METHOD_OPEN;
		wantedBit = OPEN_DATASYNC_FLAG;
	}
#endif
	else
		return NULL;

	/* validation only */
	if (!doit)
		return method;

	/* nothing to do if the setting is not actually changing */
	if (sync_method == wantedMethod && open_sync_bit == wantedBit)
		return method;

	/*
	 * To ensure that no blocks escape unsynced, force an fsync on the
	 * currently open log segment (if any).  If the open flag is changing
	 * as well, close the log file so that it gets reopened with the new
	 * flag bit at next use.
	 */
	if (openLogFile >= 0)
	{
		if (pg_fsync(openLogFile) != 0)
			elog(PANIC, "fsync of log file %u, segment %u failed: %m",
				 openLogId, openLogSeg);
		if (open_sync_bit != wantedBit)
		{
			if (close(openLogFile) != 0)
				elog(PANIC, "close of log file %u, segment %u failed: %m",
					 openLogId, openLogSeg);
			openLogFile = -1;
		}
	}

	sync_method = wantedMethod;
	open_sync_bit = wantedBit;

	return method;
}


/*
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 */
static void
issue_xlog_fsync(void)
{
	switch (sync_method)
	{
3371
		case SYNC_METHOD_FSYNC:
3372
			if (pg_fsync(openLogFile) != 0)
3373
				elog(PANIC, "fsync of log file %u, segment %u failed: %m",
3374 3375 3376 3377 3378
					 openLogId, openLogSeg);
			break;
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(openLogFile) != 0)
3379
				elog(PANIC, "fdatasync of log file %u, segment %u failed: %m",
3380 3381 3382 3383 3384 3385 3386
					 openLogId, openLogSeg);
			break;
#endif
		case SYNC_METHOD_OPEN:
			/* write synced it already */
			break;
		default:
3387
			elog(PANIC, "bogus wal_sync_method %d", sync_method);
3388 3389 3390
			break;
	}
}