/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.80 2001/10/28 06:25:42 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <fcntl.h>
T
Tom Lane 已提交
18
#include <signal.h>
19 20 21
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
22
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
23 24
#include <sys/types.h>
#include <dirent.h>
25 26 27
#ifdef USE_LOCALE
#include <locale.h>
#endif
28

29
#include "access/clog.h"
30
#include "access/transam.h"
31
#include "access/xact.h"
32 33
#include "access/xlog.h"
#include "access/xlogutils.h"
34
#include "catalog/catversion.h"
T
Tom Lane 已提交
35
#include "catalog/pg_control.h"
36 37
#include "storage/bufpage.h"
#include "storage/lwlock.h"
38
#include "storage/proc.h"
39
#include "storage/sinval.h"
40
#include "storage/spin.h"
41
#include "utils/builtins.h"
42
#include "utils/relcache.h"
43
#include "utils/selfuncs.h"
V
WAL  
Vadim B. Mikheev 已提交
44 45
#include "miscadmin.h"

46

47 48 49
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.	We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * The default is picked in this order: open with the data-sync flag if
 * the platform has a distinct one, else fdatasync() if configure found it,
 * else plain fsync().
 */
#define SYNC_METHOD_FSYNC		0
#define SYNC_METHOD_FDATASYNC	1
#define SYNC_METHOD_OPEN		2		/* used for both O_SYNC and
										 * O_DSYNC */

/* OPEN_SYNC_FLAG: O_SYNC where available, otherwise O_FSYNC if that exists */
#if defined(O_SYNC)
#define OPEN_SYNC_FLAG	   O_SYNC
#elif defined(O_FSYNC)
#define OPEN_SYNC_FLAG	   O_FSYNC
#endif

/*
 * OPEN_DATASYNC_FLAG is defined only when there is an OPEN_SYNC_FLAG and
 * O_DSYNC exists with a different value from it.
 */
#if defined(OPEN_SYNC_FLAG)
#if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
#define OPEN_DATASYNC_FLAG	  O_DSYNC
#endif
#endif

#if defined(OPEN_DATASYNC_FLAG)
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_OPEN
#define DEFAULT_SYNC_FLAGBIT	   OPEN_DATASYNC_FLAG
#elif defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD_STR   "fdatasync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FDATASYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#else
#define DEFAULT_SYNC_METHOD_STR   "fsync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FSYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#endif


T
Tom Lane 已提交
89 90
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
91
int			XLOGbuffers = 8;
92
int			XLOGfiles = 0;		/* # of files to preallocate during ckpt */
T
Tom Lane 已提交
93
int			XLOG_DEBUG = 0;
94 95
char	   *XLOG_sync_method = NULL;
const char	XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
B
Bruce Momjian 已提交
96 97
char		XLOG_archive_dir[MAXPGPATH];		/* null string means
												 * delete 'em */
T
Tom Lane 已提交
98

99
/*
 * XLOGfileslop is used in the code as the allowed "fuzz" in the number of
 * preallocated XLOG segments --- we try to have at least XLOGfiles advance
 * segments but no more than XLOGfiles+XLOGfileslop segments.  This could
 * be made a separate GUC variable, but at present I think it's sufficient
 * to hardwire it as 2*CheckPointSegments+1.  Under normal conditions, a
 * checkpoint will free no more than 2*CheckPointSegments log segments, and
 * we want to recycle all of them; the +1 allows boundary cases to happen
 * without wasting a delete/create-segment cycle.
 *
 * Note that this is a macro over the CheckPointSegments variable, so its
 * value follows whatever CheckPointSegments is at the point of use.
 */

#define XLOGfileslop	(2*CheckPointSegments + 1)


113 114 115 116 117 118
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int	sync_method = DEFAULT_SYNC_METHOD;
static int	open_sync_bit = DEFAULT_SYNC_FLAGBIT;

/* open() flag bit to use for XLOG files; zero whenever enableFsync is off */
#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)

#define MinXLOGbuffers	4

T
Tom Lane 已提交
121 122 123 124 125

/*
 * ThisStartUpID will be same in all backends --- it identifies current
 * instance of the database system.
 */
V
WAL  
Vadim B. Mikheev 已提交
126 127
StartUpID	ThisStartUpID = 0;

T
Tom Lane 已提交
128 129
/* Are we doing recovery by reading XLOG? */
bool		InRecovery = false;
130

T
Tom Lane 已提交
131 132 133 134 135 136 137 138 139
/*
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then we are not in
 * a transaction or the transaction has not yet made any loggable changes.
 *
 * Note that XLOG records inserted outside transaction control are not
 * reflected into MyLastRecPtr.
 */
XLogRecPtr	MyLastRecPtr = {0, 0};
V
Vadim B. Mikheev 已提交
140

T
Tom Lane 已提交
141 142 143 144 145 146
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts, transaction-controlled
 * or not.
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
147

T
Tom Lane 已提交
148 149 150
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
151
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
152
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
153
 * hold the Insert lock).  See XLogInsert for details.
T
Tom Lane 已提交
154 155
 */
static XLogRecPtr RedoRecPtr;
156

T
Tom Lane 已提交
157 158 159 160 161 162 163 164 165
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
166
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
167 168 169
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
170 171 172 173
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
174 175 176
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
177 178
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
179 180 181
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
182
 * but is updated when convenient.	Again, it exists for the convenience of
183
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
184 185 186 187 188 189 190 191 192 193
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint (ensures only one
 * checkpointer at a time; even though the postmaster won't launch
 * parallel checkpoint processes, we need this because manual checkpoints
 * could be launched simultaneously).
 *
T
Tom Lane 已提交
212 213 214
 *----------
 */
typedef struct XLogwrtRqst
215
{
T
Tom Lane 已提交
216 217
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
218
} XLogwrtRqst;
219

T
Tom Lane 已提交
220
typedef struct XLogwrtResult
221
{
T
Tom Lane 已提交
222 223
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
224
} XLogwrtResult;
225

T
Tom Lane 已提交
226 227 228
/*
 * Shared state data for XLogInsert.
 */
229 230
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
231 232 233 234 235 236
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
237 238
} XLogCtlInsert;

T
Tom Lane 已提交
239 240 241
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
242 243
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
244 245
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	uint16		curridx;		/* cache index of next block to write */
246 247
} XLogCtlWrite;

T
Tom Lane 已提交
248 249 250
/*
 * Total shared-memory state for XLOG.
 */
251 252
typedef struct XLogCtlData
{
253
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
254
	XLogCtlInsert Insert;
T
Tom Lane 已提交
255
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
256 257
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
258
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
259 260
	XLogCtlWrite Write;

T
Tom Lane 已提交
261 262
	/*
	 * These values do not change after startup, although the pointed-to
263 264 265
	 * pages and xlblocks values certainly do.	Permission to read/write
	 * the pages and xlblocks values depends on WALInsertLock and
	 * WALWriteLock.
T
Tom Lane 已提交
266
	 */
B
Bruce Momjian 已提交
267 268 269 270 271
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32		XLogCacheByte;	/* # bytes in xlog buffers */
	uint32		XLogCacheBlck;	/* highest allocated xlog buffer index */
	StartUpID	ThisStartUpID;
T
Tom Lane 已提交
272

273
	/* This value is not protected by *any* lock... */
B
Bruce Momjian 已提交
274
	XLogRecPtr	RedoRecPtr;		/* see SetRedoRecPtr/GetRedoRecPtr */
T
Tom Lane 已提交
275

B
Bruce Momjian 已提交
276
	slock_t		info_lck;		/* locks shared LogwrtRqst/LogwrtResult */
277 278
} XLogCtlData;

279
static XLogCtlData *XLogCtl = NULL;
280

281
/*
T
Tom Lane 已提交
282
 * We maintain an image of pg_control in shared memory.
283
 */
284
static ControlFileData *ControlFile = NULL;
285

T
Tom Lane 已提交
286 287 288 289 290
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
291

T
Tom Lane 已提交
292 293 294 295 296 297 298 299 300
/*
 * Free space remaining in the current xlog page buffer: BLCKSZ minus the
 * distance of the insertion point (currpos) from the page start (currpage).
 */
#define INSERT_FREESPACE(Insert)  \
	(BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 *
 * xlblocks[curridx] is used as the position of the end of the current
 * page, so the insertion point is that value minus the free space left.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)


/*
 * Step an xlogid/segment pair forward by one segment; stepping past the
 * last segment of a log file moves to segment 0 of the next xlogid.
 */
#define NextLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) < XLogSegsPerFile-1) \
			(logSeg)++; \
		else \
		{ \
			(logId)++; \
			(logSeg) = 0; \
		} \
	} while (0)

/*
 * Step an xlogid/segment pair back by one segment (assume it's not 0,0);
 * stepping back from segment 0 moves to the last segment of the previous
 * xlogid.
 */
#define PrevLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) == 0) \
		{ \
			(logId)--; \
			(logSeg) = XLogSegsPerFile-1; \
		} \
		else \
			(logSeg)--; \
	} while (0)
V
WAL  
Vadim B. Mikheev 已提交
328

T
Tom Lane 已提交
329 330 331 332
/*
 * Compute ID and segment from an XLogRecPtr.
 *
 * For XLByteToSeg, do the computation at face value.  For XLByteToPrevSeg,
 * a boundary byte is taken to be in the previous segment.	This is suitable
 * for deciding which segment to write given a pointer to a record end,
 * for example.
 *
 * logId and logSeg are assigned to, so they must be lvalues.
 * NOTE(review): XLByteToPrevSeg computes xrecoff - 1; if xrecoff is an
 * unsigned field, xrecoff == 0 would wrap around --- confirm that callers
 * never pass such a pointer.
 */
#define XLByteToSeg(xlrp, logId, logSeg)	\
	( (logId) = (xlrp).xlogid, \
	  (logSeg) = (xlrp).xrecoff / XLogSegSize \
	)
#define XLByteToPrevSeg(xlrp, logId, logSeg)	\
	( (logId) = (xlrp).xlogid, \
	  (logSeg) = ((xlrp).xrecoff - 1) / XLogSegSize \
	)
345

346
/*
 * Is an XLogRecPtr within a particular XLOG segment?
 *
 * For XLByteInSeg, do the computation at face value.  For XLByteInPrevSeg,
 * a boundary byte is taken to be in the previous segment.
 *
 * Both macros yield a nonzero value when the pointer's xlogid equals logId
 * and its computed segment number equals logSeg.
 */
#define XLByteInSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 (xlrp).xrecoff / XLogSegSize == (logSeg))

#define XLByteInPrevSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 ((xlrp).xrecoff - 1) / XLogSegSize == (logSeg))
359 360


361
/*
 * Build the path of an XLOG segment file into path (a buffer of at least
 * MAXPGPATH bytes): XLogDir, a slash, then log and seg each printed as
 * eight upper-case hex digits.
 */
#define XLogFileName(path, log, seg)	\
			snprintf((path), MAXPGPATH, "%s/%08X%08X",	\
					 XLogDir, (log), (seg))
364

T
Tom Lane 已提交
365 366 367 368 369
/*
 * Step an xlog buffer index backward or forward, wrapping around between
 * 0 and XLogCtl->XLogCacheBlck (the highest valid index).
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
370

371
/*
 * Could a record start at this offset?  The offset within its page must
 * lie beyond the page header (SizeOfXLogPHD), and the space remaining on
 * the page must be able to hold a record header (SizeOfXLogRecord).
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)
374

T
Tom Lane 已提交
375 376 377 378 379 380
/*
 * _INTL_MAXLOGRECSZ: max space needed for a record including header and
 * any backup-block data.
 *
 * That is the fixed record header (SizeOfXLogRecord), at most MAXLOGRECSZ
 * bytes of resource-manager data, and up to XLR_MAX_BKP_BLOCKS backup
 * blocks, each taking a BkpBlock header plus a BLCKSZ page image.
 */
#define _INTL_MAXLOGRECSZ	(SizeOfXLogRecord + MAXLOGRECSZ + \
							 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
381

382

T
Tom Lane 已提交
383
/* File path names */
B
Bruce Momjian 已提交
384 385
static char XLogDir[MAXPGPATH];
static char ControlFilePath[MAXPGPATH];
T
Tom Lane 已提交
386 387 388 389 390 391

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
392

T
Tom Lane 已提交
393 394 395 396 397 398 399 400 401 402
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;	/* -1 when no segment is open */
static uint32 openLogId = 0;	/* xlogid of the open segment */
static uint32 openLogSeg = 0;	/* segment number of the open segment */
static uint32 openLogOff = 0;	/* current seek offset within the segment */
403

T
Tom Lane 已提交
404 405 406 407 408 409
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.
 */
410 411 412 413
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
B
Bruce Momjian 已提交
414

T
Tom Lane 已提交
415 416
/* Buffer for currently read page (BLCKSZ bytes) */
static char *readBuf = NULL;
B
Bruce Momjian 已提交
417

T
Tom Lane 已提交
418 419 420
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
421
static XLogRecord *nextRecord = NULL;
422
static StartUpID lastReadSUI;
423

V
WAL  
Vadim B. Mikheev 已提交
424 425
static bool InRedo = false;

T
Tom Lane 已提交
426 427 428

static bool AdvanceXLInsertBuffer(void);
static void XLogWrite(XLogwrtRqst WriteRqst);
B
Bruce Momjian 已提交
429 430
static int XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
431
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
432 433
					   bool find_free, int max_advance,
					   bool use_lock);
T
Tom Lane 已提交
434 435
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static void PreallocXlogFiles(XLogRecPtr endptr);
436
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr);
T
Tom Lane 已提交
437
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer);
438
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI);
T
Tom Lane 已提交
439
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr,
440
					 int whichChkpt,
B
Bruce Momjian 已提交
441
					 char *buffer);
T
Tom Lane 已提交
442 443 444 445
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
446
static void issue_xlog_fsync(void);
T
Tom Lane 已提交
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata list (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
464
XLogRecPtr
465
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
466
{
B
Bruce Momjian 已提交
467 468
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
469
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
470 471 472 473 474 475 476 477 478 479 480 481 482 483
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	uint16		curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
	crc64		rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
484
	XLogwrtRqst LogwrtRqst;
B
Bruce Momjian 已提交
485 486
	bool		updrqst;
	bool		no_tran = (rmid == RM_XLOG_ID) ? true : false;
V
Vadim B. Mikheev 已提交
487 488 489 490

	if (info & XLR_INFO_MASK)
	{
		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
B
Bruce Momjian 已提交
491
			elog(STOP, "XLogInsert: invalid info mask %02X",
T
Tom Lane 已提交
492
				 (info & XLR_INFO_MASK));
V
Vadim B. Mikheev 已提交
493 494 495 496
		no_tran = true;
		info &= ~XLR_INFO_MASK;
	}

T
Tom Lane 已提交
497
	/*
B
Bruce Momjian 已提交
498 499
	 * In bootstrap mode, we don't actually log anything but XLOG
	 * resources; return a phony record pointer.
T
Tom Lane 已提交
500
	 */
V
Vadim B. Mikheev 已提交
501
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
502 503
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
504
		RecPtr.xrecoff = SizeOfXLogPHD; /* start of 1st checkpoint record */
V
WAL  
Vadim B. Mikheev 已提交
505 506 507
		return (RecPtr);
	}

T
Tom Lane 已提交
508 509 510 511 512 513
	/*
	 * Here we scan the rdata list, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC yet since we don't know the final
	 * length or info bits quite yet.
	 *
B
Bruce Momjian 已提交
514 515
	 * We may have to loop back to here if a race condition is detected
	 * below. We could prevent the race by doing all this work while
516
	 * holding the insert lock, but it seems better to avoid doing CRC
B
Bruce Momjian 已提交
517 518 519 520 521 522 523 524
	 * calculations while holding the lock.  This means we have to be
	 * careful about modifying the rdata list until we know we aren't
	 * going to loop back again.  The only change we allow ourselves to
	 * make earlier is to set rdt->data = NULL in list items we have
	 * decided we will have to back up the whole buffer for.  This is OK
	 * because we will certainly decide the same thing again for those
	 * items if we do it over; doing it here saves an extra pass over the
	 * list later.
T
Tom Lane 已提交
525
	 */
526
begin:;
T
Tom Lane 已提交
527 528 529 530 531 532
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

533
	INIT_CRC64(rdata_crc);
T
Tom Lane 已提交
534
	len = 0;
B
Bruce Momjian 已提交
535
	for (rdt = rdata;;)
536 537 538
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
539
			/* Simple data, just include it */
540 541 542
			len += rdt->len;
			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
		}
T
Tom Lane 已提交
543
		else
544
		{
T
Tom Lane 已提交
545 546
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
547
			{
T
Tom Lane 已提交
548
				if (rdt->buffer == dtbuf[i])
549
				{
T
Tom Lane 已提交
550 551 552 553 554 555 556 557 558
					/* Buffer already referenced by earlier list item */
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
559
				}
T
Tom Lane 已提交
560
				if (dtbuf[i] == InvalidBuffer)
561
				{
T
Tom Lane 已提交
562 563
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
B
Bruce Momjian 已提交
564

T
Tom Lane 已提交
565 566 567
					/*
					 * XXX We assume page LSN is first data on page
					 */
B
Bruce Momjian 已提交
568
					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
T
Tom Lane 已提交
569 570
					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
					{
B
Bruce Momjian 已提交
571
						crc64		dtcrc;
T
Tom Lane 已提交
572 573 574 575 576 577 578 579 580 581

						dtbuf_bkp[i] = true;
						rdt->data = NULL;
						INIT_CRC64(dtcrc);
						COMP_CRC64(dtcrc,
								   BufferGetBlock(dtbuf[i]),
								   BLCKSZ);
						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
						COMP_CRC64(dtcrc,
B
Bruce Momjian 已提交
582
								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
T
Tom Lane 已提交
583 584 585 586 587 588 589 590 591 592
								   sizeof(BkpBlock) - sizeof(crc64));
						FIN_CRC64(dtcrc);
						dtbuf_xlg[i].crc = dtcrc;
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
593 594
				}
			}
T
Tom Lane 已提交
595 596 597
			if (i >= XLR_MAX_BKP_BLOCKS)
				elog(STOP, "XLogInsert: can backup %d blocks at most",
					 XLR_MAX_BKP_BLOCKS);
598
		}
T
Tom Lane 已提交
599
		/* Break out of loop when rdt points to last list item */
600 601 602 603 604
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

T
Tom Lane 已提交
605 606 607
	/*
	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
	 * all of the rmgr data might have been suppressed in favor of backup
B
Bruce Momjian 已提交
608
	 * blocks.	Currently, all callers of XLogInsert provide at least some
T
Tom Lane 已提交
609 610 611 612
	 * not-in-a-buffer data and so len == 0 should never happen, but that
	 * may not be true forever.  If you need to remove the len == 0 check,
	 * also remove the check for xl_len == 0 in ReadRecord, below.
	 */
613
	if (len == 0 || len > MAXLOGRECSZ)
614
		elog(STOP, "XLogInsert: invalid record length %u", len);
615

616
	START_CRIT_SECTION();
617

618 619 620 621 622
	/* update LogwrtResult before doing cache fill check */
	SpinLockAcquire_NoHoldoff(&XLogCtl->info_lck);
	LogwrtRqst = XLogCtl->LogwrtRqst;
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease_NoHoldoff(&XLogCtl->info_lck);
623

624
	/*
625 626
	 * If cache is half filled then try to acquire write lock and do
	 * XLogWrite. Ignore any fractional blocks in performing this check.
627 628 629 630 631
	 */
	LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
	if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
		(LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
		 XLogCtl->XLogCacheByte / 2))
T
Tom Lane 已提交
632
	{
633
		if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
634
		{
635 636 637 638
			LogwrtResult = XLogCtl->Write.LogwrtResult;
			if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
				XLogWrite(LogwrtRqst);
			LWLockRelease(WALWriteLock);
639 640 641
		}
	}

642 643 644
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
645 646
	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to
B
Bruce Momjian 已提交
647 648 649
	 * go back and recompute everything.  This can only happen just after
	 * a checkpoint, so it's better to be slow in this case and fast
	 * otherwise.
T
Tom Lane 已提交
650 651
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
652
	{
T
Tom Lane 已提交
653 654 655 656
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
657
		{
T
Tom Lane 已提交
658 659 660 661 662 663
			if (dtbuf[i] == InvalidBuffer)
				continue;
			if (dtbuf_bkp[i] == false &&
				XLByteLE(dtbuf_lsn[i], RedoRecPtr))
			{
				/*
B
Bruce Momjian 已提交
664 665
				 * Oops, this buffer now needs to be backed up, but we
				 * didn't think so above.  Start over.
T
Tom Lane 已提交
666
				 */
667
				LWLockRelease(WALInsertLock);
T
Tom Lane 已提交
668 669 670
				END_CRIT_SECTION();
				goto begin;
			}
671 672 673
		}
	}

T
Tom Lane 已提交
674 675 676 677 678 679 680
	/*
	 * Make additional rdata list entries for the backup blocks, so that
	 * we don't need to special-case them in the write loop.  Note that we
	 * have now irrevocably changed the input rdata list.  At the exit of
	 * this loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
B
Bruce Momjian 已提交
681 682 683
	 * up.	The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
	 * distinct buffer value (ignoring InvalidBuffer) appearing in the
	 * rdata list.
T
Tom Lane 已提交
684 685 686
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
687 688 689 690
	{
		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
			continue;

T
Tom Lane 已提交
691
		info |= XLR_SET_BKP_BLOCK(i);
692 693 694

		rdt->next = &(dtbuf_rdt[2 * i]);

B
Bruce Momjian 已提交
695
		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
696
		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
T
Tom Lane 已提交
697
		write_len += sizeof(BkpBlock);
698 699 700

		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);

B
Bruce Momjian 已提交
701
		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
702
		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
T
Tom Lane 已提交
703
		write_len += BLCKSZ;
704 705 706
		dtbuf_rdt[2 * i + 1].next = NULL;
	}

T
Tom Lane 已提交
707
	/* Insert record header */
708

T
Tom Lane 已提交
709 710
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
711 712
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
713
		updrqst = AdvanceXLInsertBuffer();
714 715 716
		freespace = BLCKSZ - SizeOfXLogPHD;
	}

T
Tom Lane 已提交
717
	curridx = Insert->curridx;
718
	record = (XLogRecord *) Insert->currpos;
T
Tom Lane 已提交
719

720
	record->xl_prev = Insert->PrevRecord;
V
Vadim B. Mikheev 已提交
721
	if (no_tran)
722 723 724 725
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
V
Vadim B. Mikheev 已提交
726 727 728
	else
		record->xl_xact_prev = MyLastRecPtr;

729
	record->xl_xid = GetCurrentTransactionId();
T
Tom Lane 已提交
730
	record->xl_len = len;		/* doesn't include backup blocks */
731
	record->xl_info = info;
732
	record->xl_rmid = rmid;
733

T
Tom Lane 已提交
734
	/* Now we can finish computing the main CRC */
B
Bruce Momjian 已提交
735
	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
736
			   SizeOfXLogRecord - sizeof(crc64));
737 738 739
	FIN_CRC64(rdata_crc);
	record->xl_crc = rdata_crc;

T
Tom Lane 已提交
740 741 742 743
	/* Compute record's XLOG location */
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/* If first XLOG record of transaction, save it in PROC array */
V
Vadim B. Mikheev 已提交
744
	if (MyLastRecPtr.xrecoff == 0 && !no_tran)
745
	{
746
		LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
747
		MyProc->logRec = RecPtr;
748
		LWLockRelease(SInvalLock);
749
	}
V
WAL  
Vadim B. Mikheev 已提交
750 751 752

	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
753
		char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
754

755
		sprintf(buf, "INSERT @ %X/%X: ", RecPtr.xlogid, RecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
756
		xlog_outrec(buf, record);
757
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
758 759
		{
			strcat(buf, " - ");
760
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
761
		}
762
		elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
763 764
	}

T
Tom Lane 已提交
765 766 767 768 769 770
	/* Record begin of record in appropriate places */
	if (!no_tran)
		MyLastRecPtr = RecPtr;
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

771
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
772
	freespace -= SizeOfXLogRecord;
773

T
Tom Lane 已提交
774 775 776 777
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
778
	{
779 780 781 782
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
783
		{
784 785 786 787 788
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
789
				write_len -= freespace;
790 791 792 793 794
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
795
				write_len -= rdata->len;
796 797 798 799
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
800 801
		}

802
		/* Use next buffer */
T
Tom Lane 已提交
803 804 805 806 807 808 809 810
		updrqst = AdvanceXLInsertBuffer();
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
811
	}
812

T
Tom Lane 已提交
813 814
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
815
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
816
	freespace = INSERT_FREESPACE(Insert);
817

V
Vadim B. Mikheev 已提交
818
	/*
B
Bruce Momjian 已提交
819 820
	 * The recptr I return is the beginning of the *next* record. This
	 * will be stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
821
	 */
T
Tom Lane 已提交
822
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
823

T
Tom Lane 已提交
824
	/* Need to update shared LogwrtRqst if some block was filled up */
825
	if (freespace < SizeOfXLogRecord)
B
Bruce Momjian 已提交
826 827
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
828 829
	else
		curridx = PrevBufIdx(curridx);
T
Tom Lane 已提交
830
	WriteRqst = XLogCtl->xlblocks[curridx];
831

832
	LWLockRelease(WALInsertLock);
833 834 835

	if (updrqst)
	{
836
		SpinLockAcquire_NoHoldoff(&XLogCtl->info_lck);
T
Tom Lane 已提交
837 838 839 840 841
		/* advance global request to include new block(s) */
		if (XLByteLT(XLogCtl->LogwrtRqst.Write, WriteRqst))
			XLogCtl->LogwrtRqst.Write = WriteRqst;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
842
		SpinLockRelease_NoHoldoff(&XLogCtl->info_lck);
843 844
	}

845
	END_CRIT_SECTION();
846
	return (RecPtr);
847
}
848

T
Tom Lane 已提交
849 850 851 852 853
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
854
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
855 856 857
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
858
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
859 860 861
 */
static bool
AdvanceXLInsertBuffer(void)
862
{
T
Tom Lane 已提交
863 864 865 866 867 868
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
869 870
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
871

T
Tom Lane 已提交
872 873 874
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
875

T
Tom Lane 已提交
876
	/*
B
Bruce Momjian 已提交
877 878 879
	 * Get ending-offset of the buffer page we need to replace (this may
	 * be zero if the buffer hasn't been used yet).  Fall through if it's
	 * already written out.
T
Tom Lane 已提交
880 881 882 883 884 885
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
886

T
Tom Lane 已提交
887
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
888

889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
		/* Before waiting, get info_lck and update LogwrtResult */
		SpinLockAcquire_NoHoldoff(&XLogCtl->info_lck);
		if (XLByteLT(XLogCtl->LogwrtRqst.Write, FinishedPageRqstPtr))
			XLogCtl->LogwrtRqst.Write = FinishedPageRqstPtr;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease_NoHoldoff(&XLogCtl->info_lck);

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
904
		{
905 906 907 908
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
909
			{
910 911 912
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
913
			}
914
			else
T
Tom Lane 已提交
915 916
			{
				/*
B
Bruce Momjian 已提交
917 918
				 * Have to write buffers while holding insert lock. This
				 * is not good, so only write as much as we absolutely
T
Tom Lane 已提交
919 920 921 922 923 924
				 * must.
				 */
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst);
925
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
926
				Insert->LogwrtResult = LogwrtResult;
927 928 929 930
			}
		}
	}

T
Tom Lane 已提交
931 932 933 934
	/*
	 * Now the next buffer slot is free and we can set it up to be the
	 * next output page.
	 */
935 936
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
937
	{
T
Tom Lane 已提交
938
		/* crossing a logid boundary */
939 940
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = BLCKSZ;
941
	}
T
Tom Lane 已提交
942
	else
943 944 945
		NewPageEndPtr.xrecoff += BLCKSZ;
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
T
Tom Lane 已提交
946
	Insert->curridx = nextidx;
947 948
	Insert->currpage = NewPage;
	Insert->currpos = ((char *) NewPage) + SizeOfXLogPHD;
B
Bruce Momjian 已提交
949

T
Tom Lane 已提交
950
	/*
B
Bruce Momjian 已提交
951 952
	 * Be sure to re-zero the buffer so that bytes beyond what we've
	 * written will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
953
	 */
954 955 956 957
	MemSet((char *) NewPage, 0, BLCKSZ);

	/* And fill the new page's header */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
958
	/* NewPage->xlp_info = 0; */	/* done by memset */
959 960 961
	NewPage->xlp_sui = ThisStartUpID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
T
Tom Lane 已提交
962 963

	return update_needed;
964 965
}

T
Tom Lane 已提交
966 967 968
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
969
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
970
 */
971
static void
T
Tom Lane 已提交
972
XLogWrite(XLogwrtRqst WriteRqst)
973
{
974 975
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
T
Tom Lane 已提交
976
	bool		ispartialpage;
977
	bool		use_existent;
978

B
Bruce Momjian 已提交
979 980 981 982
	/*
	 * Update local LogwrtResult (caller probably did this already,
	 * but...)
	 */
T
Tom Lane 已提交
983 984 985
	LogwrtResult = Write->LogwrtResult;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
986
	{
987 988 989 990 991 992 993
		/*
		 * Make sure we're not ahead of the insert process.  This could
		 * happen if we're passed a bogus WriteRqst.Write that is past the
		 * end of the last page that's been initialized by
		 * AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
994 995 996 997
			elog(STOP, "XLogWrite: write request %X/%X is past end of log %X/%X",
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
				 XLogCtl->xlblocks[Write->curridx].xlogid,
				 XLogCtl->xlblocks[Write->curridx].xrecoff);
998

T
Tom Lane 已提交
999 1000 1001 1002 1003
		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1004
		{
T
Tom Lane 已提交
1005 1006 1007 1008
			/*
			 * Switch to new logfile segment.
			 */
			if (openLogFile >= 0)
1009
			{
T
Tom Lane 已提交
1010
				if (close(openLogFile) != 0)
1011
					elog(STOP, "close of log file %u, segment %u failed: %m",
T
Tom Lane 已提交
1012 1013
						 openLogId, openLogSeg);
				openLogFile = -1;
1014
			}
T
Tom Lane 已提交
1015 1016
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1017 1018 1019 1020
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1021
			openLogOff = 0;
1022 1023 1024 1025 1026

			if (!use_existent)	/* there was no precreated file */
				elog(LOG, "XLogWrite: new log file created - "
					 "consider increasing WAL_FILES");

T
Tom Lane 已提交
1027
			/* update pg_control, unless someone else already did */
1028
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1029 1030 1031
			if (ControlFile->logId < openLogId ||
				(ControlFile->logId == openLogId &&
				 ControlFile->logSeg < openLogSeg + 1))
T
Tom Lane 已提交
1032 1033 1034 1035 1036
			{
				ControlFile->logId = openLogId;
				ControlFile->logSeg = openLogSeg + 1;
				ControlFile->time = time(NULL);
				UpdateControlFile();
B
Bruce Momjian 已提交
1037

1038
				/*
B
Bruce Momjian 已提交
1039 1040 1041 1042
				 * Signal postmaster to start a checkpoint if it's been
				 * too long since the last one.  (We look at local copy of
				 * RedoRecPtr which might be a little out of date, but
				 * should be close enough for this purpose.)
1043 1044 1045 1046 1047 1048 1049
				 */
				if (IsUnderPostmaster &&
					(openLogId != RedoRecPtr.xlogid ||
					 openLogSeg >= (RedoRecPtr.xrecoff / XLogSegSize) +
					 (uint32) CheckPointSegments))
				{
					if (XLOG_DEBUG)
1050
						elog(DEBUG, "XLogWrite: time for a checkpoint, signaling postmaster");
1051 1052
					kill(getppid(), SIGUSR1);
				}
T
Tom Lane 已提交
1053
			}
1054
			LWLockRelease(ControlFileLock);
1055 1056
		}

T
Tom Lane 已提交
1057
		if (openLogFile < 0)
1058
		{
T
Tom Lane 已提交
1059 1060 1061
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
			openLogOff = 0;
1062 1063
		}

T
Tom Lane 已提交
1064 1065
		/* Need to seek in the file? */
		if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1066
		{
T
Tom Lane 已提交
1067 1068
			openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
			if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1069
				elog(STOP, "lseek of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1070
					 openLogId, openLogSeg, openLogOff);
1071 1072
		}

T
Tom Lane 已提交
1073 1074
		/* OK to write the page */
		from = XLogCtl->pages + Write->curridx * BLCKSZ;
1075
		errno = 0;
T
Tom Lane 已提交
1076
		if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1077 1078 1079 1080
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
1081
			elog(STOP, "write of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1082
				 openLogId, openLogSeg, openLogOff);
1083
		}
T
Tom Lane 已提交
1084
		openLogOff += BLCKSZ;
1085

T
Tom Lane 已提交
1086 1087 1088
		/*
		 * If we just wrote the whole last page of a logfile segment,
		 * fsync the segment immediately.  This avoids having to go back
B
Bruce Momjian 已提交
1089 1090 1091
		 * and re-open prior segments when an fsync request comes along
		 * later. Doing it here ensures that one and only one backend will
		 * perform this fsync.
T
Tom Lane 已提交
1092 1093 1094
		 */
		if (openLogOff >= XLogSegSize && !ispartialpage)
		{
1095
			issue_xlog_fsync();
B
Bruce Momjian 已提交
1096
			LogwrtResult.Flush = LogwrtResult.Write;	/* end of current page */
T
Tom Lane 已提交
1097
		}
1098

T
Tom Lane 已提交
1099 1100 1101 1102 1103 1104 1105
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		Write->curridx = NextBufIdx(Write->curridx);
1106 1107
	}

T
Tom Lane 已提交
1108 1109 1110 1111 1112
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1113
	{
T
Tom Lane 已提交
1114
		/*
B
Bruce Momjian 已提交
1115 1116 1117
		 * Could get here without iterating above loop, in which case we
		 * might have no open file or the wrong one.  However, we do not
		 * need to fsync more than one file.
T
Tom Lane 已提交
1118
		 */
1119
		if (sync_method != SYNC_METHOD_OPEN)
T
Tom Lane 已提交
1120
		{
1121
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1122
			 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1123 1124
			{
				if (close(openLogFile) != 0)
1125
					elog(STOP, "close of log file %u, segment %u failed: %m",
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
				openLogOff = 0;
			}
			issue_xlog_fsync();
T
Tom Lane 已提交
1136 1137
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1138 1139
	}

T
Tom Lane 已提交
1140 1141 1142
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1143 1144
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves
T
Tom Lane 已提交
1145 1146
	 * some code in a couple of places.
	 */
1147
	SpinLockAcquire_NoHoldoff(&XLogCtl->info_lck);
T
Tom Lane 已提交
1148 1149 1150 1151 1152
	XLogCtl->LogwrtResult = LogwrtResult;
	if (XLByteLT(XLogCtl->LogwrtRqst.Write, LogwrtResult.Write))
		XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
	if (XLByteLT(XLogCtl->LogwrtRqst.Flush, LogwrtResult.Flush))
		XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
1153
	SpinLockRelease_NoHoldoff(&XLogCtl->info_lck);
1154

T
Tom Lane 已提交
1155 1156 1157 1158 1159 1160
	Write->LogwrtResult = LogwrtResult;
}

/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * record: log position that must be durable before we return.  Returns
 * immediately during REDO or when the local LogwrtResult already shows the
 * position as flushed.  If the flush cannot be satisfied, elog(STOP).
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	if (XLOG_DEBUG)
	{
		elog(DEBUG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X\n",
			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			 (InRedo) ? "(redo)" : "",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
		fflush(stderr);
	}

	/* Disabled during REDO */
	if (InRedo)
		return;

	/* Quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more
	 * data entered into the xlog buffer, we'll write and fsync that too,
	 * so that the final value of LogwrtResult.Flush is as large as
	 * possible. This gives us some chance of avoiding another fsync
	 * immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/* read LogwrtResult and update local state */
	SpinLockAcquire_NoHoldoff(&XLogCtl->info_lck);
	if (XLByteLT(WriteRqstPtr, XLogCtl->LogwrtRqst.Write))
		WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease_NoHoldoff(&XLogCtl->info_lck);

	/* done already? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/*
		 * If something was added to log cache then try to flush this too.
		 * Only peek at the insert state if the lock is free right now; we
		 * do not want to wait for it.
		 */
		if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
		{
			XLogCtlInsert *Insert = &XLogCtl->Insert;
			uint32		freespace = INSERT_FREESPACE(Insert);

			/*
			 * Start from the end of the current insert page; unless the
			 * page is full (no room for another record header), back up
			 * over the unused tail to the current insert position.
			 */
			WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
			if (freespace >= SizeOfXLogRecord)
				WriteRqstPtr.xrecoff -= freespace;
			LWLockRelease(WALInsertLock);
		}
		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		/* recheck: someone else may have flushed while we waited */
		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			WriteRqst.Write = WriteRqstPtr;
			WriteRqst.Flush = record;
			XLogWrite(WriteRqst);
			if (XLByteLT(LogwrtResult.Flush, record))
				elog(STOP, "XLogFlush: request %X/%X is not satisfied --- flushed only to %X/%X",
					 record.xlogid, record.xrecoff,
					 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
		}
		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();
}

T
Tom Lane 已提交
1247 1248 1249
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
1250 1251 1252
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
1253
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
1254 1255
 * file was used.
 *
1256
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
1257
 * place.  This should be TRUE except during bootstrap log creation.  The
1258
 * caller must *not* hold the lock at call.
1259
 *
T
Tom Lane 已提交
1260 1261
 * Returns FD of opened file.
 */
1262
static int
1263 1264
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
1265
{
1266
	char		path[MAXPGPATH];
1267
	char		tmppath[MAXPGPATH];
1268
	char		zbuffer[BLCKSZ];
1269
	int			fd;
1270
	int			nbytes;
1271 1272

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
1273 1274

	/*
B
Bruce Momjian 已提交
1275 1276
	 * Try to use existent file (checkpoint maker may have created it
	 * already)
V
Vadim B. Mikheev 已提交
1277
	 */
1278
	if (*use_existent)
V
Vadim B. Mikheev 已提交
1279
	{
1280 1281
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
1282 1283 1284
		if (fd < 0)
		{
			if (errno != ENOENT)
1285 1286
				elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
					 path, log, seg);
V
Vadim B. Mikheev 已提交
1287 1288
		}
		else
B
Bruce Momjian 已提交
1289
			return (fd);
V
Vadim B. Mikheev 已提交
1290 1291
	}

1292
	/*
B
Bruce Momjian 已提交
1293 1294 1295
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible
	 * that another process is doing the same thing.  If so, we will end
	 * up pre-creating an extra log segment.  That seems OK, and better
1296
	 * than holding the lock throughout this lengthy process.
1297
	 */
1298 1299
	snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
			 XLogDir, (int) getpid());
1300 1301

	unlink(tmppath);
1302

1303
	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
1304
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
1305
					   S_IRUSR | S_IWUSR);
1306
	if (fd < 0)
1307
		elog(STOP, "creation of file %s failed: %m", tmppath);
1308

1309
	/*
B
Bruce Momjian 已提交
1310
	 * Zero-fill the file.	We have to do this the hard way to ensure that
1311 1312
	 * all the file space has really been allocated --- on platforms that
	 * allow "holes" in files, just seeking to the end doesn't allocate
B
Bruce Momjian 已提交
1313
	 * intermediate space.	This way, we know that we have all the space
1314
	 * and (after the fsync below) that all the indirect blocks are down
1315 1316
	 * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
	 * sync future writes to the log file.
1317 1318 1319 1320
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
	{
1321
		errno = 0;
1322
		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
T
Tom Lane 已提交
1323
		{
B
Bruce Momjian 已提交
1324
			int			save_errno = errno;
T
Tom Lane 已提交
1325

B
Bruce Momjian 已提交
1326 1327 1328 1329
			/*
			 * If we fail to make the file, delete it to release disk
			 * space
			 */
1330
			unlink(tmppath);
1331 1332
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
1333

T
Tom Lane 已提交
1334
			elog(STOP, "ZeroFill failed to write %s: %m", tmppath);
T
Tom Lane 已提交
1335
		}
1336
	}
1337

1338
	if (pg_fsync(fd) != 0)
1339
		elog(STOP, "fsync of file %s failed: %m", tmppath);
1340

V
Vadim B. Mikheev 已提交
1341
	close(fd);
T
Tom Lane 已提交
1342

1343
	/*
1344 1345
	 * Now move the segment into place with its final name.
	 *
1346 1347 1348 1349 1350
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone
	 * else has created the file while we were filling ours: if so, use
	 * ours to pre-create a future log segment.
	 */
1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
	if (!InstallXLogFileSegment(log, seg, tmppath,
								*use_existent, XLOGfiles + XLOGfileslop,
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);

	return (fd);
}

/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * log, seg: identify segment to install as (or first possible target).
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  (Irrelevant if
 * find_free is FALSE.)
 *
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns TRUE if file installed, FALSE if not installed because of
 * exceeding max_advance limit.  (Any other kind of failure causes elog().)
 * When FALSE is returned, tmppath is left in place for the caller to
 * dispose of.
 */
static bool
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
					   bool find_free, int max_advance,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(path, log, seg);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		unlink(path);
	}
	else
	{
		/*
		 * Find a free slot to put it in.  A slot counts as occupied when
		 * its file can be opened; keep stepping to the next log/seg until
		 * the open fails or we run out of allowed advances.
		 */
		while ((fd = BasicOpenFile(path, O_RDWR | PG_BINARY,
								   S_IRUSR | S_IWUSR)) >= 0)
		{
			close(fd);
			if (--max_advance < 0)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			NextLogSeg(log, seg);
			XLogFileName(path, log, seg);
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#ifndef __BEOS__
	if (link(tmppath, path) < 0)
		elog(STOP, "link from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		elog(STOP, "rename from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
#endif

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}

T
Tom Lane 已提交
1459 1460 1461
/*
 * Open a pre-existing logfile segment.
 */
1462 1463 1464
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
1465 1466
	char		path[MAXPGPATH];
	int			fd;
1467 1468 1469

	XLogFileName(path, log, seg);

1470 1471
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
1472 1473 1474 1475
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
1476 1477
			elog(LOG, "open of %s (log file %u, segment %u) failed: %m",
				 path, log, seg);
1478 1479
			return (fd);
		}
1480 1481
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);
1482 1483
	}

1484
	return (fd);
1485 1486
}

V
Vadim B. Mikheev 已提交
1487
/*
 * Preallocate log files beyond the specified log endpoint, according to
 * the XLOGfiles user parameter.
 *
 * If XLOGfiles > 0, make sure each of the next XLOGfiles segments after the
 * one containing endptr exists.  Otherwise, precreate just the next segment,
 * and only when endptr is at least 75% of the way through its segment.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
	int			lf;
	bool		use_existent;
	int			i;

	XLByteToPrevSeg(endptr, _logId, _logSeg);
	if (XLOGfiles > 0)
	{
		for (i = 1; i <= XLOGfiles; i++)
		{
			NextLogSeg(_logId, _logSeg);
			/* an already-present segment is fine; we only need it to exist */
			use_existent = true;
			lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
			close(lf);
		}
	}
	else if ((endptr.xrecoff - 1) % XLogSegSize >=
			 (uint32) (0.75 * XLogSegSize))
	{
		NextLogSeg(_logId, _logSeg);
		use_existent = true;
		lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
		close(lf);
	}
}

/*
 * Remove or move offline all log files older or equal to passed log/seg#
1523 1524 1525
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
1526 1527
 */
static void
1528
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
1529
{
1530 1531
	uint32		endlogId;
	uint32		endlogSeg;
B
Bruce Momjian 已提交
1532 1533 1534 1535
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[32];
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
1536

1537
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
V
Vadim B. Mikheev 已提交
1538 1539 1540

	xldir = opendir(XLogDir);
	if (xldir == NULL)
1541 1542
		elog(STOP, "could not open transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1543

T
Tom Lane 已提交
1544
	sprintf(lastoff, "%08X%08X", log, seg);
V
Vadim B. Mikheev 已提交
1545 1546 1547 1548

	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
T
Tom Lane 已提交
1549 1550 1551
		if (strlen(xlde->d_name) == 16 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 16 &&
			strcmp(xlde->d_name, lastoff) <= 0)
V
Vadim B. Mikheev 已提交
1552
		{
1553
			snprintf(path, MAXPGPATH, "%s/%s", XLogDir, xlde->d_name);
1554
			if (XLOG_archive_dir[0])
1555 1556 1557 1558 1559
			{
				elog(LOG, "archiving transaction log file %s",
					 xlde->d_name);
				elog(NOTICE, "archiving log files is not implemented!");
			}
1560
			else
1561 1562 1563
			{
				/*
				 * Before deleting the file, see if it can be recycled as
1564 1565
				 * a future log segment.  We allow recycling segments up
				 * to XLOGfiles + XLOGfileslop segments beyond the current
1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582
				 * XLOG location.
				 */
				if (InstallXLogFileSegment(endlogId, endlogSeg, path,
										   true, XLOGfiles + XLOGfileslop,
										   true))
				{
					elog(LOG, "recycled transaction log file %s",
						 xlde->d_name);
				}
				else
				{
					/* No need for any more future segments... */
					elog(LOG, "removing transaction log file %s",
						 xlde->d_name);
					unlink(path);
				}
			}
V
Vadim B. Mikheev 已提交
1583 1584 1585 1586
		}
		errno = 0;
	}
	if (errno)
1587 1588
		elog(STOP, "could not read transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1589 1590 1591
	closedir(xldir);
}

T
Tom Lane 已提交
1592 1593 1594 1595 1596
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 */
1597 1598 1599 1600 1601 1602 1603 1604 1605 1606
static void
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
1607
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
1608
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1609
	{
T
Tom Lane 已提交
1610
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1611 1612
			continue;

B
Bruce Momjian 已提交
1613
		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
1614 1615 1616 1617 1618 1619 1620 1621 1622 1623
		blk += sizeof(BkpBlock);

		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);

		if (reln)
		{
			buffer = XLogReadBuffer(true, reln, bkpb.block);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
B
Bruce Momjian 已提交
1624
				memcpy((char *) page, blk, BLCKSZ);
1625 1626 1627 1628 1629 1630 1631 1632 1633 1634
				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}

		blk += BLCKSZ;
	}
}

T
Tom Lane 已提交
1635 1636 1637 1638 1639 1640 1641
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
1642 1643 1644 1645 1646 1647 1648 1649 1650
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	crc64		crc;
	crc64		cbuf;
	int			i;
	uint32		len = record->xl_len;
	char	   *blk;

T
Tom Lane 已提交
1651
	/* Check CRC of rmgr data and record header */
1652
	INIT_CRC64(crc);
T
Tom Lane 已提交
1653
	COMP_CRC64(crc, XLogRecGetData(record), len);
B
Bruce Momjian 已提交
1654
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
1655
			   SizeOfXLogRecord - sizeof(crc64));
1656 1657
	FIN_CRC64(crc);

T
Tom Lane 已提交
1658
	if (!EQ_CRC64(record->xl_crc, crc))
1659
	{
1660
		elog(emode, "ReadRecord: bad resource manager data checksum in record at %X/%X",
T
Tom Lane 已提交
1661
			 recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1662
		return (false);
1663 1664
	}

T
Tom Lane 已提交
1665
	/* Check CRCs of backup blocks, if any */
B
Bruce Momjian 已提交
1666
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
1667
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1668
	{
T
Tom Lane 已提交
1669
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1670 1671 1672
			continue;

		INIT_CRC64(crc);
T
Tom Lane 已提交
1673 1674 1675
		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
		COMP_CRC64(crc, blk + sizeof(crc64),
				   sizeof(BkpBlock) - sizeof(crc64));
1676
		FIN_CRC64(crc);
B
Bruce Momjian 已提交
1677 1678
		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
														 * alignment */
1679

T
Tom Lane 已提交
1680
		if (!EQ_CRC64(cbuf, crc))
1681
		{
1682
			elog(emode, "ReadRecord: bad checksum of backup block %d in record at %X/%X",
T
Tom Lane 已提交
1683
				 i + 1, recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1684
			return (false);
1685
		}
T
Tom Lane 已提交
1686
		blk += sizeof(BkpBlock) + BLCKSZ;
1687 1688
	}

B
Bruce Momjian 已提交
1689
	return (true);
1690 1691
}

T
Tom Lane 已提交
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is STOP.
 * (emode must be either STOP or LOG.)
 *
 * buffer is a workspace at least _INTL_MAXLOGRECSZ bytes long.  It is needed
 * to reassemble a record that crosses block boundaries.  Note that on
 * successful return, the returned record pointer always points at buffer.
 */
1705
static XLogRecord *
T
Tom Lane 已提交
1706
ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer)
1707
{
1708 1709
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
T
Tom Lane 已提交
1710 1711 1712 1713
	uint32		len,
				total_len;
	uint32		targetPageOff;
	unsigned	i;
1714
	bool		nextmode = false;
T
Tom Lane 已提交
1715 1716 1717 1718 1719 1720

	if (readBuf == NULL)
	{
		/*
		 * First time through, permanently allocate readBuf.  We do it
		 * this way, rather than just making a static array, for two
B
Bruce Momjian 已提交
1721 1722 1723 1724
		 * reasons: (1) no need to waste the storage in most
		 * instantiations of the backend; (2) a static char array isn't
		 * guaranteed to have any particular alignment, whereas malloc()
		 * will provide MAXALIGN'd storage.
T
Tom Lane 已提交
1725 1726 1727 1728
		 */
		readBuf = (char *) malloc(BLCKSZ);
		Assert(readBuf != NULL);
	}
1729

T
Tom Lane 已提交
1730
	if (RecPtr == NULL)
1731
	{
1732
		RecPtr = &tmpRecPtr;
1733
		nextmode = true;
T
Tom Lane 已提交
1734
		/* fast case if next record is on same page */
1735 1736 1737 1738 1739
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
T
Tom Lane 已提交
1740
		/* align old recptr to next page */
1741 1742 1743 1744 1745 1746 1747 1748
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
1749
	}
1750
	else if (!XRecOffIsValid(RecPtr->xrecoff))
1751
		elog(STOP, "ReadRecord: invalid record offset at %X/%X",
1752
			 RecPtr->xlogid, RecPtr->xrecoff);
1753

T
Tom Lane 已提交
1754
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
1755
	{
1756 1757
		close(readFile);
		readFile = -1;
1758
	}
T
Tom Lane 已提交
1759
	XLByteToSeg(*RecPtr, readId, readSeg);
1760
	if (readFile < 0)
1761
	{
T
Tom Lane 已提交
1762
		readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1763 1764
		if (readFile < 0)
			goto next_record_is_invalid;
1765
		readOff = (uint32) (-1);	/* force read to occur below */
1766 1767
	}

T
Tom Lane 已提交
1768 1769
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
	if (readOff != targetPageOff)
1770
	{
T
Tom Lane 已提交
1771 1772 1773
		readOff = targetPageOff;
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
		{
1774
			elog(emode, "ReadRecord: lseek of log file %u, segment %u, offset %u failed: %m",
1775
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1776 1777
			goto next_record_is_invalid;
		}
1778
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1779
		{
1780
			elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1781
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1782 1783
			goto next_record_is_invalid;
		}
1784
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, nextmode))
1785 1786
			goto next_record_is_invalid;
	}
T
Tom Lane 已提交
1787
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
1788 1789
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
1790
		elog(emode, "ReadRecord: contrecord is requested by %X/%X",
1791
			 RecPtr->xlogid, RecPtr->xrecoff);
1792 1793
		goto next_record_is_invalid;
	}
1794
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1795 1796

got_record:;
B
Bruce Momjian 已提交
1797

T
Tom Lane 已提交
1798
	/*
B
Bruce Momjian 已提交
1799 1800
	 * Currently, xl_len == 0 must be bad data, but that might not be true
	 * forever.  See note in XLogInsert.
T
Tom Lane 已提交
1801
	 */
1802 1803
	if (record->xl_len == 0)
	{
1804
		elog(emode, "ReadRecord: record with zero length at %X/%X",
T
Tom Lane 已提交
1805
			 RecPtr->xlogid, RecPtr->xrecoff);
1806 1807
		goto next_record_is_invalid;
	}
B
Bruce Momjian 已提交
1808

T
Tom Lane 已提交
1809
	/*
B
Bruce Momjian 已提交
1810 1811
	 * Compute total length of record including any appended backup
	 * blocks.
T
Tom Lane 已提交
1812 1813 1814 1815 1816 1817 1818 1819
	 */
	total_len = SizeOfXLogRecord + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;
		total_len += sizeof(BkpBlock) + BLCKSZ;
	}
B
Bruce Momjian 已提交
1820

T
Tom Lane 已提交
1821 1822 1823 1824 1825 1826
	/*
	 * Make sure it will fit in buffer (currently, it is mechanically
	 * impossible for this test to fail, but it seems like a good idea
	 * anyway).
	 */
	if (total_len > _INTL_MAXLOGRECSZ)
1827
	{
1828
		elog(emode, "ReadRecord: record length %u at %X/%X too long",
T
Tom Lane 已提交
1829
			 total_len, RecPtr->xlogid, RecPtr->xrecoff);
1830 1831 1832 1833
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
1834
		elog(emode, "ReadRecord: invalid resource manager id %u at %X/%X",
1835
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1836 1837 1838
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
T
Tom Lane 已提交
1839 1840
	len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
	if (total_len > len)
1841
	{
T
Tom Lane 已提交
1842 1843
		/* Need to reassemble record */
		XLogContRecord *contrecord;
B
Bruce Momjian 已提交
1844
		uint32		gotlen = len;
1845

T
Tom Lane 已提交
1846
		memcpy(buffer, record, len);
1847
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
1848
		buffer += len;
1849
		for (;;)
1850
		{
T
Tom Lane 已提交
1851 1852
			readOff += BLCKSZ;
			if (readOff >= XLogSegSize)
1853 1854
			{
				close(readFile);
T
Tom Lane 已提交
1855 1856 1857
				readFile = -1;
				NextLogSeg(readId, readSeg);
				readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1858 1859
				if (readFile < 0)
					goto next_record_is_invalid;
T
Tom Lane 已提交
1860
				readOff = 0;
1861 1862
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1863
			{
1864
				elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1865
					 readId, readSeg, readOff);
T
Tom Lane 已提交
1866 1867
				goto next_record_is_invalid;
			}
1868
			if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1869
				goto next_record_is_invalid;
T
Tom Lane 已提交
1870
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
1871
			{
1872
				elog(emode, "ReadRecord: there is no ContRecord flag in log file %u, segment %u, offset %u",
1873
					 readId, readSeg, readOff);
1874 1875
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1876
			contrecord = (XLogContRecord *) ((char *) readBuf + SizeOfXLogPHD);
B
Bruce Momjian 已提交
1877
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
1878
				total_len != (contrecord->xl_rem_len + gotlen))
1879
			{
1880
				elog(emode, "ReadRecord: invalid ContRecord length %u in log file %u, segment %u, offset %u",
T
Tom Lane 已提交
1881
					 contrecord->xl_rem_len, readId, readSeg, readOff);
1882 1883
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1884 1885
			len = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
1886
			{
B
Bruce Momjian 已提交
1887
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		if (BLCKSZ - SizeOfXLogRecord >= SizeOfXLogPHD +
			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
		{
B
Bruce Momjian 已提交
1901
			nextRecord = (XLogRecord *) ((char *) contrecord +
T
Tom Lane 已提交
1902 1903 1904 1905
				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
B
Bruce Momjian 已提交
1906
			SizeOfXLogPHD + SizeOfXLogContRecord +
T
Tom Lane 已提交
1907 1908 1909
			MAXALIGN(contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		return record;
1910 1911
	}

T
Tom Lane 已提交
1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
	return (XLogRecord *) buffer;
1923

T
Tom Lane 已提交
1924 1925 1926 1927 1928
next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	return NULL;
1929 1930
}

1931 1932 1933 1934
/*
 * Sanity-check the header of an xlog page that has just been read.
 *
 * Returns true if the header is acceptable; otherwise reports the problem
 * at level emode and returns false.  The page is expected to be the one
 * located at readId/readSeg/readOff.  On success lastReadSUI is updated
 * from the page.
 *
 * This is a private helper for ReadRecord and is not meant to be called
 * from anywhere else.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI)
{
	XLogRecPtr	expected;

	/* The page must start with the xlog page magic number. */
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		elog(emode, "ReadRecord: invalid magic number %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_magic, readId, readSeg, readOff);
		return false;
	}

	/* No info bits outside the defined flag set may be on. */
	if (hdr->xlp_info & ~XLP_ALL_FLAGS)
	{
		elog(emode, "ReadRecord: invalid info bits %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_info, readId, readSeg, readOff);
		return false;
	}

	/* The address stored in the page must match where we read it from. */
	expected.xlogid = readId;
	expected.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, expected))
	{
		elog(emode, "ReadRecord: unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
			 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
			 readId, readSeg, readOff);
		return false;
	}

	/*
	 * A startup id (SUI) smaller than the one on the previously read page,
	 * or larger by more than a few counts, is not believable.  In theory
	 * up to 512 shutdown checkpoint records could fit on a 32K xlog page,
	 * so that is the largest jump we accept.
	 *
	 * The comparison only makes sense when pages are being read in
	 * sequence, which is why the caller tells us whether to apply it.
	 */
	if (checkSUI &&
		(hdr->xlp_sui < lastReadSUI ||
		 hdr->xlp_sui > lastReadSUI + 512))
	{
		/* translator: SUI = startup id */
		elog(emode, "ReadRecord: out-of-sequence SUI %u (after %u) in log file %u, segment %u, offset %u",
			 hdr->xlp_sui, lastReadSUI, readId, readSeg, readOff);
		return false;
	}

	lastReadSUI = hdr->xlp_sui;
	return true;
}

1989 1990 1991 1992
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */

void
XLOGPathInit(void)
{
	/* Init XLOG file paths */
2008 2009
	snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
2010 2011 2012 2013 2014 2015
}

static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
2016 2017
	char		buffer[BLCKSZ]; /* need not be aligned */

2018 2019 2020 2021 2022
#ifdef USE_LOCALE
	char	   *localeptr;
#endif

	/*
T
Tom Lane 已提交
2023
	 * Initialize version and compatibility-check fields
2024
	 */
T
Tom Lane 已提交
2025 2026
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
2027 2028 2029 2030 2031
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
#ifdef USE_LOCALE
	localeptr = setlocale(LC_COLLATE, NULL);
	if (!localeptr)
2032
		elog(STOP, "invalid LC_COLLATE setting");
2033 2034 2035
	StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
	localeptr = setlocale(LC_CTYPE, NULL);
	if (!localeptr)
2036
		elog(STOP, "invalid LC_CTYPE setting");
2037
	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
B
Bruce Momjian 已提交
2038

2039 2040
	/*
	 * Issue warning notice if initdb'ing in a locale that will not permit
B
Bruce Momjian 已提交
2041 2042
	 * LIKE index optimization.  This is not a clean place to do it, but I
	 * don't see a better place either...
2043 2044 2045 2046 2047
	 */
	if (!locale_is_like_safe())
		elog(NOTICE, "Initializing database with %s collation order."
			 "\n\tThis locale setting will prevent use of index optimization for"
			 "\n\tLIKE and regexp searches.  If you are concerned about speed of"
B
Bruce Momjian 已提交
2048
		  "\n\tsuch queries, you may wish to set LC_COLLATE to \"C\" and"
2049 2050
			 "\n\tre-initdb.  For more information see the Administrator's Guide.",
			 ControlFile->lc_collate);
2051
#else							/* not USE_LOCALE */
2052 2053
	strcpy(ControlFile->lc_collate, "C");
	strcpy(ControlFile->lc_ctype, "C");
2054
#endif	 /* not USE_LOCALE */
2055

T
Tom Lane 已提交
2056 2057
	/* Contents are protected with a CRC */
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2058 2059
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2060 2061 2062
			   sizeof(ControlFileData) - sizeof(crc64));
	FIN_CRC64(ControlFile->crc);

2063
	/*
B
Bruce Momjian 已提交
2064 2065 2066 2067 2068
	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
	 * over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail
	 * when we check the contents of the file, but hopefully with a more
	 * specific error than "couldn't read pg_control".
2069 2070
	 */
	if (sizeof(ControlFileData) > BLCKSZ)
2071
		elog(STOP, "sizeof(ControlFileData) is larger than BLCKSZ; fix either one");
2072

2073 2074 2075
	memset(buffer, 0, BLCKSZ);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

2076 2077
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
2078
	if (fd < 0)
2079
		elog(STOP, "WriteControlFile: could not create control file (%s): %m",
2080 2081
			 ControlFilePath);

2082
	errno = 0;
2083
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
2084 2085 2086 2087
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2088
		elog(STOP, "WriteControlFile: write to control file failed: %m");
2089
	}
2090

2091
	if (pg_fsync(fd) != 0)
2092
		elog(STOP, "WriteControlFile: fsync of control file failed: %m");
2093 2094 2095 2096 2097 2098 2099

	close(fd);
}

static void
ReadControlFile(void)
{
2100
	crc64		crc;
2101 2102 2103 2104 2105 2106 2107
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
2108
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2109 2110

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2111
		elog(STOP, "read from control file failed: %m");
2112 2113 2114

	close(fd);

T
Tom Lane 已提交
2115 2116 2117 2118 2119 2120 2121
	/*
	 * Check for expected pg_control format version.  If this is wrong,
	 * the CRC check will likely fail because we'll be checking the wrong
	 * number of bytes.  Complaining about wrong version will probably be
	 * more enlightening than complaining about wrong CRC.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
2122 2123 2124 2125
		elog(STOP,
			 "The database cluster was initialized with PG_CONTROL_VERSION %d,\n"
			 "\tbut the server was compiled with PG_CONTROL_VERSION %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2126 2127 2128
			 ControlFile->pg_control_version, PG_CONTROL_VERSION);

	/* Now check the CRC. */
2129
	INIT_CRC64(crc);
B
Bruce Momjian 已提交
2130 2131
	COMP_CRC64(crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2132
			   sizeof(ControlFileData) - sizeof(crc64));
2133 2134
	FIN_CRC64(crc);

T
Tom Lane 已提交
2135
	if (!EQ_CRC64(crc, ControlFile->crc))
2136
		elog(STOP, "invalid checksum in control file");
2137

2138
	/*
B
Bruce Momjian 已提交
2139 2140
	 * Do compatibility checking immediately.  We do this here for 2
	 * reasons:
2141
	 *
B
Bruce Momjian 已提交
2142 2143
	 * (1) if the database isn't compatible with the backend executable, we
	 * want to abort before we can possibly do any damage;
2144 2145 2146
	 *
	 * (2) this code is executed in the postmaster, so the setlocale() will
	 * propagate to forked backends, which aren't going to read this file
B
Bruce Momjian 已提交
2147
	 * for themselves.	(These locale settings are considered critical
2148 2149
	 * compatibility items because they can affect sort order of indexes.)
	 */
T
Tom Lane 已提交
2150
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
2151 2152
		elog(STOP,
			 "The database cluster was initialized with CATALOG_VERSION_NO %d,\n"
2153
		   "\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n"
2154
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2155
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
2156
	if (ControlFile->blcksz != BLCKSZ)
2157 2158 2159 2160
		elog(STOP,
			 "The database cluster was initialized with BLCKSZ %d,\n"
			 "\tbut the backend was compiled with BLCKSZ %d.\n"
			 "\tIt looks like you need to initdb.",
2161 2162
			 ControlFile->blcksz, BLCKSZ);
	if (ControlFile->relseg_size != RELSEG_SIZE)
2163 2164 2165 2166
		elog(STOP,
			 "The database cluster was initialized with RELSEG_SIZE %d,\n"
			 "\tbut the backend was compiled with RELSEG_SIZE %d.\n"
			 "\tIt looks like you need to initdb.",
2167 2168 2169
			 ControlFile->relseg_size, RELSEG_SIZE);
#ifdef USE_LOCALE
	if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
2170
		elog(STOP,
2171
		   "The database cluster was initialized with LC_COLLATE '%s',\n"
2172 2173
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2174 2175
			 ControlFile->lc_collate);
	if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
2176 2177 2178 2179
		elog(STOP,
			 "The database cluster was initialized with LC_CTYPE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2180
			 ControlFile->lc_ctype);
2181
#else							/* not USE_LOCALE */
2182 2183
	if (strcmp(ControlFile->lc_collate, "C") != 0 ||
		strcmp(ControlFile->lc_ctype, "C") != 0)
2184
		elog(STOP,
2185
		"The database cluster was initialized with LC_COLLATE '%s' and\n"
2186 2187
			 "\tLC_CTYPE '%s', but the server was compiled without locale support.\n"
			 "\tIt looks like you need to initdb or recompile.",
2188
			 ControlFile->lc_collate, ControlFile->lc_ctype);
2189
#endif	 /* not USE_LOCALE */
2190 2191
}

2192
void
2193
UpdateControlFile(void)
2194
{
2195
	int			fd;
2196

2197
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2198 2199
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2200
			   sizeof(ControlFileData) - sizeof(crc64));
2201 2202
	FIN_CRC64(ControlFile->crc);

2203
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
2204
	if (fd < 0)
2205
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2206

2207
	errno = 0;
2208
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2209 2210 2211 2212
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2213
		elog(STOP, "write to control file failed: %m");
2214
	}
2215

2216
	if (pg_fsync(fd) != 0)
2217
		elog(STOP, "fsync of control file failed: %m");
2218 2219 2220 2221

	close(fd);
}

2222
/*
 * Initialization of shared memory for XLOG
 */

2226
int
2227
XLOGShmemSize(void)
2228 2229 2230 2231
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

T
Tom Lane 已提交
2232 2233 2234
	return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
		+ BLCKSZ * XLOGbuffers +
		MAXALIGN(sizeof(ControlFileData));
2235 2236 2237 2238 2239
}

void
XLOGShmemInit(void)
{
2240
	bool		found;
2241

2242
	/* this must agree with space requested by XLOGShmemSize() */
2243 2244 2245
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

2246
	XLogCtl = (XLogCtlData *)
T
Tom Lane 已提交
2247 2248 2249 2250 2251
		ShmemInitStruct("XLOG Ctl",
						MAXALIGN(sizeof(XLogCtlData) +
								 sizeof(XLogRecPtr) * XLOGbuffers)
						+ BLCKSZ * XLOGbuffers,
						&found);
2252
	Assert(!found);
2253 2254 2255 2256
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &found);
	Assert(!found);

T
Tom Lane 已提交
2257
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
2258

T
Tom Lane 已提交
2259 2260 2261 2262 2263 2264 2265 2266
	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
	 * a multiple of the alignment for same, so no extra alignment padding
	 * is needed here.
	 */
	XLogCtl->xlblocks = (XLogRecPtr *)
		(((char *) XLogCtl) + sizeof(XLogCtlData));
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
B
Bruce Momjian 已提交
2267

T
Tom Lane 已提交
2268
	/*
B
Bruce Momjian 已提交
2269 2270
	 * Here, on the other hand, we must MAXALIGN to ensure the page
	 * buffers have worst-case alignment.
T
Tom Lane 已提交
2271 2272 2273 2274 2275 2276 2277
	 */
	XLogCtl->pages =
		((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
									  sizeof(XLogRecPtr) * XLOGbuffers);
	memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);

	/*
B
Bruce Momjian 已提交
2278 2279
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will
	 * fill in additional info.)
T
Tom Lane 已提交
2280 2281 2282 2283
	 */
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
2284
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
2285

2286 2287 2288 2289 2290 2291 2292
	/*
	 * If we are not in bootstrap mode, pg_control should already exist.
	 * Read and validate it immediately (see comments in ReadControlFile()
	 * for the reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
2293 2294 2295
}

/*
T
Tom Lane 已提交
2296 2297
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
2298 2299
 */
void
T
Tom Lane 已提交
2300
BootStrapXLOG(void)
2301
{
2302
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2303 2304
	char	   *buffer;
	XLogPageHeader page;
2305
	XLogRecord *record;
B
Bruce Momjian 已提交
2306
	bool		use_existent;
2307
	crc64		crc;
2308

T
Tom Lane 已提交
2309 2310 2311 2312
	/* Use malloc() to ensure buffer is MAXALIGNED */
	buffer = (char *) malloc(BLCKSZ);
	page = (XLogPageHeader) buffer;

2313 2314 2315
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
T
Tom Lane 已提交
2316
	checkPoint.ThisStartUpID = 0;
2317
	checkPoint.nextXid = FirstNormalTransactionId;
2318
	checkPoint.nextOid = BootstrapObjectIdData;
T
Tom Lane 已提交
2319
	checkPoint.time = time(NULL);
2320

2321 2322 2323 2324
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;

2325 2326 2327
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
2328
	page->xlp_sui = checkPoint.ThisStartUpID;
2329 2330
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
2331 2332 2333
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
2334 2335 2336
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
2337
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
2338
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
2339
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
2340

2341
	INIT_CRC64(crc);
T
Tom Lane 已提交
2342
	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
B
Bruce Momjian 已提交
2343
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
2344
			   SizeOfXLogRecord - sizeof(crc64));
2345 2346 2347
	FIN_CRC64(crc);
	record->xl_crc = crc;

2348 2349
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
2350

2351
	errno = 0;
T
Tom Lane 已提交
2352
	if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
2353 2354 2355 2356
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2357
		elog(STOP, "BootStrapXLOG failed to write log file: %m");
2358
	}
2359

T
Tom Lane 已提交
2360
	if (pg_fsync(openLogFile) != 0)
2361
		elog(STOP, "BootStrapXLOG failed to fsync log file: %m");
2362

T
Tom Lane 已提交
2363 2364
	close(openLogFile);
	openLogFile = -1;
2365

2366
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
2367 2368 2369
	/* Initialize pg_control status fields */
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
2370 2371 2372
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
2373
	ControlFile->checkPointCopy = checkPoint;
2374
	/* some additional ControlFile fields are set in WriteControlFile() */
2375

2376
	WriteControlFile();
2377 2378 2379

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
2380 2381
}

2382
/*
 * Format a time_t as local time, "YYYY-MM-DD HH:MM:SS TZ".
 *
 * The result lives in a static buffer that is overwritten by the next
 * call, so callers must use it before calling str_time() again.  If the
 * time cannot be converted or formatted, "(unknown)" is returned instead.
 */
static char *
str_time(time_t tnow)
{
	/*
	 * %Z expands to a platform-defined time zone name of no fixed length,
	 * so leave generous room for it.
	 */
	static char buf[128];
	struct tm  *tm = localtime(&tnow);

	/*
	 * strftime() returns 0 and leaves the buffer contents indeterminate
	 * when the result does not fit, and localtime() may return NULL; in
	 * either case hand back a well-defined string.
	 */
	if (tm == NULL ||
		strftime(buf, sizeof(buf),
				 "%Y-%m-%d %H:%M:%S %Z",
				 tm) == 0)
		strcpy(buf, "(unknown)");

	return buf;
}

/*
T
Tom Lane 已提交
2395
 * This must be called ONCE during postmaster or standalone-backend startup
2396 2397
 */
void
T
Tom Lane 已提交
2398
StartupXLOG(void)
2399
{
2400 2401
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2402
	bool		wasShutdown;
2403
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
2404 2405 2406
				LastRec,
				checkPointLoc,
				EndOfLog;
2407
	XLogRecord *record;
T
Tom Lane 已提交
2408
	char	   *buffer;
2409

T
Tom Lane 已提交
2410 2411
	/* Use malloc() to ensure record buffer is MAXALIGNED */
	buffer = (char *) malloc(_INTL_MAXLOGRECSZ);
2412

T
Tom Lane 已提交
2413
	CritSectionCount++;
2414 2415

	/*
2416 2417
	 * Read control file and check XLOG status looks valid.
	 *
B
Bruce Momjian 已提交
2418 2419
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
2420
	 */
2421
	ReadControlFile();
2422

2423 2424 2425
	if (ControlFile->logSeg == 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
2426
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
2427
		elog(STOP, "control file context is broken");
2428 2429

	if (ControlFile->state == DB_SHUTDOWNED)
2430
		elog(LOG, "database system was shut down at %s",
2431
			 str_time(ControlFile->time));
2432
	else if (ControlFile->state == DB_SHUTDOWNING)
2433
		elog(LOG, "database system shutdown was interrupted at %s",
2434
			 str_time(ControlFile->time));
2435
	else if (ControlFile->state == DB_IN_RECOVERY)
2436
		elog(LOG, "database system was interrupted being in recovery at %s\n"
2437
			 "\tThis propably means that some data blocks are corrupted\n"
2438
			 "\tand you will have to use the last backup for recovery.",
2439
			 str_time(ControlFile->time));
2440
	else if (ControlFile->state == DB_IN_PRODUCTION)
2441
		elog(LOG, "database system was interrupted at %s",
2442
			 str_time(ControlFile->time));
2443

T
Tom Lane 已提交
2444 2445 2446 2447
	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
2448
	record = ReadCheckpointRecord(ControlFile->checkPoint, 1, buffer);
T
Tom Lane 已提交
2449 2450 2451
	if (record != NULL)
	{
		checkPointLoc = ControlFile->checkPoint;
2452
		elog(LOG, "checkpoint record is at %X/%X",
T
Tom Lane 已提交
2453 2454 2455 2456
			 checkPointLoc.xlogid, checkPointLoc.xrecoff);
	}
	else
	{
2457
		record = ReadCheckpointRecord(ControlFile->prevCheckPoint, 2, buffer);
T
Tom Lane 已提交
2458 2459 2460
		if (record != NULL)
		{
			checkPointLoc = ControlFile->prevCheckPoint;
2461
			elog(LOG, "using previous checkpoint record at %X/%X",
T
Tom Lane 已提交
2462 2463 2464 2465
				 checkPointLoc.xlogid, checkPointLoc.xrecoff);
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
2466
			elog(STOP, "unable to locate a valid checkpoint record");
T
Tom Lane 已提交
2467 2468 2469 2470
	}
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
2471

2472
	elog(LOG, "redo record is at %X/%X; undo record is at %X/%X; shutdown %s",
2473
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
2474
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
T
Tom Lane 已提交
2475
		 wasShutdown ? "TRUE" : "FALSE");
2476
	elog(LOG, "next transaction id: %u; next oid: %u",
2477
		 checkPoint.nextXid, checkPoint.nextOid);
2478
	if (!TransactionIdIsNormal(checkPoint.nextXid))
2479
		elog(STOP, "invalid next transaction id");
2480 2481 2482

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
2483
	ShmemVariableCache->oidCount = 0;
2484

V
WAL  
Vadim B. Mikheev 已提交
2485
	ThisStartUpID = checkPoint.ThisStartUpID;
B
Bruce Momjian 已提交
2486
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
2487
		XLogCtl->RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
2488

2489
	if (XLByteLT(RecPtr, checkPoint.redo))
2490
		elog(STOP, "invalid redo in checkpoint record");
2491 2492 2493
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;

B
Bruce Momjian 已提交
2494
	if (XLByteLT(checkPoint.undo, RecPtr) ||
V
Vadim B. Mikheev 已提交
2495
		XLByteLT(checkPoint.redo, RecPtr))
2496
	{
T
Tom Lane 已提交
2497
		if (wasShutdown)
2498
			elog(STOP, "invalid redo/undo record in shutdown checkpoint");
V
WAL  
Vadim B. Mikheev 已提交
2499
		InRecovery = true;
2500 2501
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
2502
		InRecovery = true;
2503

V
WAL  
Vadim B. Mikheev 已提交
2504 2505
	/* REDO */
	if (InRecovery)
2506
	{
2507
		elog(LOG, "database system was not properly shut down; "
2508
			 "automatic recovery in progress");
2509 2510 2511 2512
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2513
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
2514

2515 2516
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
T
Tom Lane 已提交
2517
			record = ReadRecord(&(checkPoint.redo), STOP, buffer);
B
Bruce Momjian 已提交
2518
		else
2519 2520
		{
			/* read past CheckPoint record */
T
Tom Lane 已提交
2521
			record = ReadRecord(NULL, LOG, buffer);
2522
		}
2523

T
Tom Lane 已提交
2524
		if (record != NULL)
2525
		{
V
WAL  
Vadim B. Mikheev 已提交
2526
			InRedo = true;
2527
			elog(LOG, "redo starts at %X/%X",
2528
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2529 2530
			do
			{
2531 2532
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
2533
											ShmemVariableCache->nextXid))
2534 2535 2536 2537
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}
V
WAL  
Vadim B. Mikheev 已提交
2538 2539
				if (XLOG_DEBUG)
				{
B
Bruce Momjian 已提交
2540
					char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
2541

2542
					sprintf(buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
2543 2544
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
							EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
2545 2546
					xlog_outrec(buf, record);
					strcat(buf, " - ");
B
Bruce Momjian 已提交
2547 2548
					RmgrTable[record->xl_rmid].rm_desc(buf,
								record->xl_info, XLogRecGetData(record));
2549
					elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
2550 2551
				}

T
Tom Lane 已提交
2552
				if (record->xl_info & XLR_BKP_BLOCK_MASK)
2553 2554
					RestoreBkpBlocks(record, EndRecPtr);

2555
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
T
Tom Lane 已提交
2556 2557
				record = ReadRecord(NULL, LOG, buffer);
			} while (record != NULL);
2558
			elog(LOG, "redo done at %X/%X",
2559
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2560
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2561
			InRedo = false;
2562 2563
		}
		else
2564
			elog(LOG, "redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
2565 2566
	}

T
Tom Lane 已提交
2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577
	/*
	 * Init xlog buffer cache using the block containing the last valid
	 * record from the previous incarnation.
	 */
	record = ReadRecord(&LastRec, STOP, buffer);
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, openLogId, openLogSeg);
	openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
	openLogOff = 0;
	ControlFile->logId = openLogId;
	ControlFile->logSeg = openLogSeg + 1;
V
WAL  
Vadim B. Mikheev 已提交
2578
	Insert = &XLogCtl->Insert;
2579
	Insert->PrevRecord = LastRec;
B
Bruce Momjian 已提交
2580 2581

	/*
2582 2583
	 * If the next record will go to the new page then initialize for that
	 * one.
T
Tom Lane 已提交
2584
	 */
2585 2586 2587 2588
	if ((BLCKSZ - EndOfLog.xrecoff % BLCKSZ) < SizeOfXLogRecord)
		EndOfLog.xrecoff += (BLCKSZ - EndOfLog.xrecoff % BLCKSZ);
	if (EndOfLog.xrecoff % BLCKSZ == 0)
	{
2589 2590 2591 2592
		XLogRecPtr	NewPageEndPtr;

		NewPageEndPtr = EndOfLog;
		if (NewPageEndPtr.xrecoff >= XLogFileSize)
2593
		{
2594 2595 2596
			/* crossing a logid boundary */
			NewPageEndPtr.xlogid += 1;
			NewPageEndPtr.xrecoff = BLCKSZ;
2597 2598
		}
		else
2599 2600
			NewPageEndPtr.xrecoff += BLCKSZ;
		XLogCtl->xlblocks[0] = NewPageEndPtr;
2601 2602 2603 2604 2605
		Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC;
		if (InRecovery)
			Insert->currpage->xlp_sui = ThisStartUpID;
		else
			Insert->currpage->xlp_sui = ThisStartUpID + 1;
2606 2607
		Insert->currpage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
		Insert->currpage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
2608
		/* rest of buffer was zeroed in XLOGShmemInit */
2609
		Insert->currpos = (char *) Insert->currpage + SizeOfXLogPHD;
2610 2611 2612 2613 2614 2615
	}
	else
	{
		XLogCtl->xlblocks[0].xlogid = openLogId;
		XLogCtl->xlblocks[0].xrecoff =
			((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
2616

2617 2618
		/*
		 * Tricky point here: readBuf contains the *last* block that the
2619
		 * LastRec record spans, not the one it starts in.	The last block
2620
		 * is indeed the one we want to use.
2621 2622 2623 2624 2625 2626 2627 2628
		 */
		Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
		memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
		Insert->currpos = (char *) Insert->currpage +
			(EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
		/* Make sure rest of page is zero */
		memset(Insert->currpos, 0, INSERT_FREESPACE(Insert));
	}
V
WAL  
Vadim B. Mikheev 已提交
2629

T
Tom Lane 已提交
2630
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
2631

T
Tom Lane 已提交
2632 2633 2634
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
2635

T
Tom Lane 已提交
2636 2637
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
2638

V
Vadim B. Mikheev 已提交
2639
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
2640 2641 2642
	/* UNDO */
	if (InRecovery)
	{
2643 2644 2645
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
2646
			elog(LOG, "undo starts at %X/%X",
2647
				 RecPtr.xlogid, RecPtr.xrecoff);
2648 2649
			do
			{
T
Tom Lane 已提交
2650
				record = ReadRecord(&RecPtr, STOP, buffer);
2651
				if (TransactionIdIsValid(record->xl_xid) &&
2652
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
2653
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
2654 2655
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
2656
			elog(LOG, "undo done at %X/%X",
2657
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2658 2659
		}
		else
2660
			elog(LOG, "undo is not required");
2661
	}
V
WAL  
Vadim B. Mikheev 已提交
2662
#endif
2663

V
WAL  
Vadim B. Mikheev 已提交
2664
	if (InRecovery)
2665
	{
T
Tom Lane 已提交
2666 2667 2668 2669 2670 2671 2672
		/*
		 * In case we had to use the secondary checkpoint, make sure that
		 * it will still be shown as the secondary checkpoint after this
		 * CreateCheckPoint operation; we don't want the broken primary
		 * checkpoint to become prevCheckPoint...
		 */
		ControlFile->checkPoint = checkPointLoc;
2673
		CreateCheckPoint(true);
V
WAL  
Vadim B. Mikheev 已提交
2674
		XLogCloseRelationCache();
2675
	}
2676

T
Tom Lane 已提交
2677 2678 2679 2680
	/*
	 * Preallocate additional log files, if wanted.
	 */
	PreallocXlogFiles(EndOfLog);
2681

V
WAL  
Vadim B. Mikheev 已提交
2682
	InRecovery = false;
2683 2684 2685 2686 2687

	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2688 2689 2690
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

2691 2692 2693
	/* Start up the commit log, too */
	StartupCLOG();

2694
	elog(LOG, "database system is ready");
2695
	CritSectionCount--;
2696

T
Tom Lane 已提交
2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	free(buffer);
}

2712 2713 2714 2715
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 * whichChkpt = 1 for "primary", 2 for "secondary", merely informative
 */
T
Tom Lane 已提交
2716 2717
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr,
2718
					 int whichChkpt,
T
Tom Lane 已提交
2719 2720 2721 2722 2723 2724
					 char *buffer)
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
2725 2726 2727
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint link in control file" :
				   "invalid secondary checkpoint link in control file"));
T
Tom Lane 已提交
2728 2729 2730 2731 2732 2733 2734
		return NULL;
	}

	record = ReadRecord(&RecPtr, LOG, buffer);

	if (record == NULL)
	{
2735 2736 2737
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint record" :
				   "invalid secondary checkpoint record"));
T
Tom Lane 已提交
2738 2739 2740 2741
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
2742
		elog(LOG, (whichChkpt == 1 ?
2743 2744
			 "invalid resource manager id in primary checkpoint record" :
		  "invalid resource manager id in secondary checkpoint record"));
T
Tom Lane 已提交
2745 2746 2747 2748 2749
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
2750 2751 2752
		elog(LOG, (whichChkpt == 1 ?
				   "invalid xl_info in primary checkpoint record" :
				   "invalid xl_info in secondary checkpoint record"));
T
Tom Lane 已提交
2753 2754 2755 2756
		return NULL;
	}
	if (record->xl_len != sizeof(CheckPoint))
	{
2757 2758 2759
		elog(LOG, (whichChkpt == 1 ?
				   "invalid length of primary checkpoint record" :
				   "invalid length of secondary checkpoint record"));
T
Tom Lane 已提交
2760 2761 2762
		return NULL;
	}
	return record;
2763 2764
}

V
WAL  
Vadim B. Mikheev 已提交
2765
/*
T
Tom Lane 已提交
2766
 * Postmaster uses this to initialize ThisStartUpID & RedoRecPtr from
2767
 * XLogCtlData located in shmem after successful startup.
V
WAL  
Vadim B. Mikheev 已提交
2768 2769 2770 2771 2772
 */
void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
2773 2774 2775 2776
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

/*
 * CheckPoint process called by postmaster saves copy of new RedoRecPtr
 * in shmem (using SetRedoRecPtr).  When checkpointer completes, postmaster
 * calls GetRedoRecPtr to update its own copy of RedoRecPtr, so that
 * subsequently-spawned backends will start out with a reasonably up-to-date
 * local RedoRecPtr.  Since these operations are not protected by any lock
 * and copying an XLogRecPtr isn't atomic, it's unsafe to use either of these
 * routines at other times!
 *
 * Note: once spawned, a backend must update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr while holding the insert lock.  This is
 * done in XLogInsert().
 */
void
SetRedoRecPtr(void)
{
	/* publish this process's RedoRecPtr into XLogCtl; no lock is taken */
	XLogCtl->RedoRecPtr = RedoRecPtr;
}

/*
 * Refresh this process's local RedoRecPtr from the copy stored in XLogCtl.
 * The copy is made without taking any lock.
 */
void
GetRedoRecPtr(void)
{
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

2801
/*
T
Tom Lane 已提交
2802
 * This must be called ONCE during postmaster or standalone-backend shutdown
2803 2804
 */
void
T
Tom Lane 已提交
2805
ShutdownXLOG(void)
2806
{
2807
	elog(LOG, "shutting down");
2808

T
Tom Lane 已提交
2809 2810 2811
	/* suppress in-transaction check in CreateCheckPoint */
	MyLastRecPtr.xrecoff = 0;

2812
	CritSectionCount++;
V
Vadim B. Mikheev 已提交
2813
	CreateDummyCaches();
2814
	CreateCheckPoint(true);
2815
	ShutdownCLOG();
2816
	CritSectionCount--;
2817

2818
	elog(LOG, "database system is shut down");
2819 2820
}

T
Tom Lane 已提交
2821 2822 2823
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
2824 2825 2826
void
CreateCheckPoint(bool shutdown)
{
2827 2828 2829
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
2830
	XLogRecData rdata;
2831
	uint32		freespace;
V
Vadim B. Mikheev 已提交
2832 2833 2834 2835 2836
	uint32		_logId;
	uint32		_logSeg;

	if (MyLastRecPtr.xrecoff != 0)
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
B
Bruce Momjian 已提交
2837

2838 2839
	/*
	 * The CheckpointLock can be held for quite a while, which is not good
2840 2841 2842 2843 2844
	 * because we won't respond to a cancel/die request while waiting for
	 * an LWLock.  (But the alternative of using a regular lock won't work
	 * for background checkpoint processes, which are not regular
	 * backends.) So, rather than use a plain LWLockAcquire, use this
	 * kluge to allow an interrupt to be accepted while we are waiting:
2845 2846
	 */
	while (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
V
Vadim B. Mikheev 已提交
2847
	{
2848 2849
		CHECK_FOR_INTERRUPTS();
		sleep(1);
V
Vadim B. Mikheev 已提交
2850
	}
2851

2852 2853
	START_CRIT_SECTION();

2854 2855 2856 2857 2858 2859
	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
T
Tom Lane 已提交
2860 2861

	memset(&checkPoint, 0, sizeof(checkPoint));
V
WAL  
Vadim B. Mikheev 已提交
2862
	checkPoint.ThisStartUpID = ThisStartUpID;
T
Tom Lane 已提交
2863
	checkPoint.time = time(NULL);
2864

2865
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
2866 2867 2868 2869

	/*
	 * If this isn't a shutdown, and we have not inserted any XLOG records
	 * since the start of the last checkpoint, skip the checkpoint.  The
B
Bruce Momjian 已提交
2870 2871 2872 2873 2874 2875
	 * idea here is to avoid inserting duplicate checkpoints when the
	 * system is idle.	That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the
	 * previous checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
2876 2877
	 *
	 * We have to make two tests to determine that nothing has happened since
B
Bruce Momjian 已提交
2878 2879 2880
	 * the start of the last checkpoint: current insertion point must
	 * match the end of the last checkpoint record, and its redo pointer
	 * must point to itself.
T
Tom Lane 已提交
2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894
	 */
	if (!shutdown)
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
2895 2896
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're
	 * off doing the buffer flush work.  Those XLOG records are logically
B
Bruce Momjian 已提交
2908
	 * after the checkpoint, even though physically before it.	Got that?
T
Tom Lane 已提交
2909 2910
	 */
	freespace = INSERT_FREESPACE(Insert);
2911 2912
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
2913 2914
		(void) AdvanceXLInsertBuffer();
		/* OK to ignore update return flag, since we will do flush anyway */
2915 2916
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
T
Tom Lane 已提交
2917
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
2918

T
Tom Lane 已提交
2919 2920 2921 2922
	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls;
	 * this must be done while holding the insert lock.
	 */
2923
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
B
Bruce Momjian 已提交
2924

T
Tom Lane 已提交
2925
	/*
B
Bruce Momjian 已提交
2926 2927 2928 2929
	 * Get UNDO record ptr - this is oldest of PROC->logRec values. We do
	 * this while holding insert lock to ensure that we won't miss any
	 * about-to-commit transactions (UNDO must include all xacts that have
	 * commits after REDO point).
T
Tom Lane 已提交
2930 2931 2932 2933
	 */
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
2934
		elog(STOP, "active transaction while database system is shutting down");
T
Tom Lane 已提交
2935 2936 2937 2938 2939

	/*
	 * Now we can release insert lock, allowing other xacts to proceed
	 * even while we are flushing disk buffers.
	 */
2940
	LWLockRelease(WALInsertLock);
2941

2942
	LWLockAcquire(XidGenLock, LW_SHARED);
2943
	checkPoint.nextXid = ShmemVariableCache->nextXid;
2944
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
2945

2946
	LWLockAcquire(OidGenLock, LW_SHARED);
2947
	checkPoint.nextOid = ShmemVariableCache->nextOid;
2948 2949
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
2950
	LWLockRelease(OidGenLock);
2951

T
Tom Lane 已提交
2952
	/*
B
Bruce Momjian 已提交
2953 2954
	 * Having constructed the checkpoint record, ensure all shmem disk
	 * buffers are flushed to disk.
T
Tom Lane 已提交
2955
	 */
V
Vadim B. Mikheev 已提交
2956
	FlushBufferPool();
2957

2958 2959 2960
	/* And commit-log buffers, too */
	CheckPointCLOG();

T
Tom Lane 已提交
2961 2962 2963
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
2964
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
2965
	rdata.data = (char *) (&checkPoint);
2966 2967 2968
	rdata.len = sizeof(checkPoint);
	rdata.next = NULL;

T
Tom Lane 已提交
2969 2970 2971 2972 2973 2974
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
2975

T
Tom Lane 已提交
2976 2977 2978 2979 2980
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record,
	 * recptr = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
2981
		elog(STOP, "concurrent transaction log activity while database system is shutting down");
2982

T
Tom Lane 已提交
2983
	/*
2984 2985 2986 2987 2988 2989 2990
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
	 *
	 * With UNDO support: oldest item is redo or undo, whichever is older;
	 * but watch out for case that undo = 0.
	 *
	 * Without UNDO support: just use the redo pointer.  This allows xlog
2991 2992
	 * space to be freed much faster when there are long-running
	 * transactions.
T
Tom Lane 已提交
2993
	 */
2994
#ifdef NOT_USED
B
Bruce Momjian 已提交
2995
	if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
T
Tom Lane 已提交
2996 2997 2998 2999
		XLByteLT(ControlFile->checkPointCopy.undo,
				 ControlFile->checkPointCopy.redo))
		XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
	else
3000
#endif
T
Tom Lane 已提交
3001
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
3002

T
Tom Lane 已提交
3003 3004 3005
	/*
	 * Update the control file.
	 */
3006
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3007 3008
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
3009 3010 3011
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
3012 3013
	ControlFile->time = time(NULL);
	UpdateControlFile();
3014
	LWLockRelease(ControlFileLock);
3015

V
Vadim B. Mikheev 已提交
3016
	/*
T
Tom Lane 已提交
3017 3018
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
V
Vadim B. Mikheev 已提交
3019 3020 3021
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
3022
		PrevLogSeg(_logId, _logSeg);
3023
		MoveOfflineLogs(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
3024 3025
	}

T
Tom Lane 已提交
3026 3027 3028 3029 3030 3031 3032 3033
	/*
	 * Make more log segments if needed.  (Do this after deleting offline
	 * log segments, to avoid having peak disk space usage higher than
	 * necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

3034
	LWLockRelease(CheckpointLock);
V
Vadim B. Mikheev 已提交
3035

3036
	END_CRIT_SECTION();
3037
}
V
WAL  
Vadim B. Mikheev 已提交
3038

T
Tom Lane 已提交
3039 3040 3041
/*
 * Write a NEXTOID log record
 */
3042 3043 3044
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
3045
	XLogRecData rdata;
3046

3047
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3048
	rdata.data = (char *) (&nextOid);
3049 3050 3051 3052
	rdata.len = sizeof(Oid);
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
}
V
WAL  
Vadim B. Mikheev 已提交
3053

T
Tom Lane 已提交
3054 3055 3056
/*
 * XLOG resource manager's routines
 */
V
WAL  
Vadim B. Mikheev 已提交
3057 3058 3059
/*
 * Redo routine for XLOG resource manager records.
 *
 * NEXTOID and ONLINE checkpoint records only ever move the shared
 * counters forward; a SHUTDOWN checkpoint record overwrites them.
 * Records of any other type are ignored.  The lsn argument is not used.
 */
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		rectype = record->xl_info & ~XLR_INFO_MASK;

	if (rectype == XLOG_NEXTOID)
	{
		Oid			loggedOid;

		memcpy(&loggedOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < loggedOid)
		{
			ShmemVariableCache->nextOid = loggedOid;
			ShmemVariableCache->oidCount = 0;
		}
		return;
	}

	if (rectype == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	ckpt;

		memcpy(&ckpt, XLogRecGetData(record), sizeof(CheckPoint));

		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = ckpt.nextXid;
		ShmemVariableCache->nextOid = ckpt.nextOid;
		ShmemVariableCache->oidCount = 0;
		return;
	}

	if (rectype == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	ckpt;

		memcpy(&ckpt, XLogRecGetData(record), sizeof(CheckPoint));

		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  ckpt.nextXid))
			ShmemVariableCache->nextXid = ckpt.nextXid;
		if (ShmemVariableCache->nextOid < ckpt.nextOid)
		{
			ShmemVariableCache->nextOid = ckpt.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
	}
}
B
Bruce Momjian 已提交
3099

V
WAL  
Vadim B. Mikheev 已提交
3100 3101 3102 3103
/*
 * Undo routine for XLOG resource manager records.
 *
 * The body is empty, so no undo action is taken for any record; both
 * arguments are ignored.
 */
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
}
B
Bruce Momjian 已提交
3104

V
WAL  
Vadim B. Mikheev 已提交
3105
void
B
Bruce Momjian 已提交
3106
xlog_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
3107
{
B
Bruce Momjian 已提交
3108
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
3109

T
Tom Lane 已提交
3110 3111
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
3112
	{
B
Bruce Momjian 已提交
3113 3114
		CheckPoint *checkpoint = (CheckPoint *) rec;

3115
		sprintf(buf + strlen(buf), "checkpoint: redo %X/%X; undo %X/%X; "
B
Bruce Momjian 已提交
3116 3117 3118 3119 3120 3121
				"sui %u; xid %u; oid %u; %s",
				checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
				checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
				checkpoint->ThisStartUpID, checkpoint->nextXid,
				checkpoint->nextOid,
			 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
3122
	}
3123 3124
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
3125
		Oid			nextOid;
3126 3127 3128 3129

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
V
WAL  
Vadim B. Mikheev 已提交
3130 3131 3132 3133 3134 3135 3136
	else
		strcat(buf, "UNKNOWN");
}

/*
 * Append a summary of an XLOG record header to buf: the prev and xact-prev
 * links, the xid, the number of backup blocks flagged in xl_info (omitted
 * when zero), and the name of the record's resource manager.
 *
 * buf must already hold a NUL-terminated string; nothing here limits how
 * much is appended.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	int			blk;
	int			nbackup = 0;

	sprintf(buf + strlen(buf), "prev %X/%X; xprev %X/%X; xid %u",
			record->xl_prev.xlogid, record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid, record->xl_xact_prev.xrecoff,
			record->xl_xid);

	/* count the backup-block flag bits that are set */
	for (blk = 0; blk < XLR_MAX_BKP_BLOCKS; blk++)
	{
		if (record->xl_info & (XLR_SET_BKP_BLOCK(blk)))
			nbackup++;
	}

	if (nbackup != 0)
		sprintf(buf + strlen(buf), "; bkpb %d", nbackup);

	sprintf(buf + strlen(buf), ": %s",
			RmgrTable[record->xl_rmid].rm_name);
}
3158 3159 3160 3161 3162 3163 3164 3165 3166


/*
 * GUC support routines
 */

/*
 * Report whether "method" names a WAL sync method usable in this build.
 *
 * The comparison ignores case.  "fsync" is always accepted; the other
 * names are accepted only when the corresponding symbol (HAVE_FDATASYNC,
 * OPEN_SYNC_FLAG, OPEN_DATASYNC_FLAG) is defined at compile time.
 */
bool
check_xlog_sync_method(const char *method)
{
	static const char *const accepted[] = {
		"fsync",
#ifdef HAVE_FDATASYNC
		"fdatasync",
#endif
#ifdef OPEN_SYNC_FLAG
		"open_sync",
#endif
#ifdef OPEN_DATASYNC_FLAG
		"open_datasync",
#endif
	};
	const int	naccepted = (int) (sizeof(accepted) / sizeof(accepted[0]));
	int			idx;

	for (idx = 0; idx < naccepted; idx++)
	{
		if (strcasecmp(method, accepted[idx]) == 0)
			return true;
	}
	return false;
}

void
assign_xlog_sync_method(const char *method)
{
B
Bruce Momjian 已提交
3187 3188
	int			new_sync_method;
	int			new_sync_bit;
3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218

	if (strcasecmp(method, "fsync") == 0)
	{
		new_sync_method = SYNC_METHOD_FSYNC;
		new_sync_bit = 0;
	}
#ifdef HAVE_FDATASYNC
	else if (strcasecmp(method, "fdatasync") == 0)
	{
		new_sync_method = SYNC_METHOD_FDATASYNC;
		new_sync_bit = 0;
	}
#endif
#ifdef OPEN_SYNC_FLAG
	else if (strcasecmp(method, "open_sync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_SYNC_FLAG;
	}
#endif
#ifdef OPEN_DATASYNC_FLAG
	else if (strcasecmp(method, "open_datasync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_DATASYNC_FLAG;
	}
#endif
	else
	{
		/* Can't get here unless guc.c screwed up */
3219
		elog(ERROR, "bogus wal_sync_method %s", method);
3220 3221 3222 3223 3224 3225 3226
		new_sync_method = 0;	/* keep compiler quiet */
		new_sync_bit = 0;
	}

	if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
	{
		/*
B
Bruce Momjian 已提交
3227 3228 3229 3230
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new
		 * flag bit) at next use.
3231 3232 3233 3234
		 */
		if (openLogFile >= 0)
		{
			if (pg_fsync(openLogFile) != 0)
3235
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3236 3237 3238 3239
					 openLogId, openLogSeg);
			if (open_sync_bit != new_sync_bit)
			{
				if (close(openLogFile) != 0)
3240
					elog(STOP, "close of log file %u, segment %u failed: %m",
3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
		}
		sync_method = new_sync_method;
		open_sync_bit = new_sync_bit;
	}
}


/*
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 */
static void
issue_xlog_fsync(void)
{
	switch (sync_method)
	{
3259
		case SYNC_METHOD_FSYNC:
3260
			if (pg_fsync(openLogFile) != 0)
3261
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3262 3263 3264 3265 3266
					 openLogId, openLogSeg);
			break;
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(openLogFile) != 0)
3267
				elog(STOP, "fdatasync of log file %u, segment %u failed: %m",
3268 3269 3270 3271 3272 3273 3274
					 openLogId, openLogSeg);
			break;
#endif
		case SYNC_METHOD_OPEN:
			/* write synced it already */
			break;
		default:
3275
			elog(STOP, "bogus wal_sync_method %d", sync_method);
3276 3277 3278
			break;
	}
}