/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.71 2001/07/19 02:12:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <fcntl.h>
T
Tom Lane 已提交
18
#include <signal.h>
19 20 21
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
22
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
23 24
#include <sys/types.h>
#include <dirent.h>
25 26 27
#ifdef USE_LOCALE
#include <locale.h>
#endif
28

29
#include "access/transam.h"
30
#include "access/xact.h"
31
#include "catalog/catversion.h"
T
Tom Lane 已提交
32
#include "catalog/pg_control.h"
33 34 35 36
#include "storage/sinval.h"
#include "storage/proc.h"
#include "storage/spin.h"
#include "storage/s_lock.h"
37
#include "storage/bufpage.h"
V
Vadim B. Mikheev 已提交
38 39
#include "access/xlog.h"
#include "access/xlogutils.h"
40
#include "utils/builtins.h"
41
#include "utils/relcache.h"
V
WAL  
Vadim B. Mikheev 已提交
42 43
#include "miscadmin.h"

44

45 46 47
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.  We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * Macros produced here:
 *	OPEN_SYNC_FLAG		O_SYNC if the platform defines it, else O_FSYNC;
 *						left undefined if neither exists.
 *	OPEN_DATASYNC_FLAG	O_DSYNC, but only if OPEN_SYNC_FLAG exists and
 *						O_DSYNC is a different value from it.
 *	DEFAULT_SYNC_*		the default method: open_datasync if
 *						OPEN_DATASYNC_FLAG is defined, else fdatasync if
 *						HAVE_FDATASYNC is defined, else fsync.
 */
#define SYNC_METHOD_FSYNC		0
#define SYNC_METHOD_FDATASYNC	1
#define SYNC_METHOD_OPEN		2		/* used for both O_SYNC and
										 * O_DSYNC */

#if defined(O_SYNC)
#define OPEN_SYNC_FLAG	   O_SYNC
#else
#if defined(O_FSYNC)
#define OPEN_SYNC_FLAG	  O_FSYNC
#endif
#endif

#if defined(OPEN_SYNC_FLAG)
#if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
#define OPEN_DATASYNC_FLAG	  O_DSYNC
#endif
#endif

#if defined(OPEN_DATASYNC_FLAG)
#define DEFAULT_SYNC_METHOD_STR    "open_datasync"
#define DEFAULT_SYNC_METHOD		   SYNC_METHOD_OPEN
#define DEFAULT_SYNC_FLAGBIT	   OPEN_DATASYNC_FLAG
#else
#if defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD_STR   "fdatasync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FDATASYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#else
#define DEFAULT_SYNC_METHOD_STR   "fsync"
#define DEFAULT_SYNC_METHOD		  SYNC_METHOD_FSYNC
#define DEFAULT_SYNC_FLAGBIT	  0
#endif
#endif


87
/*
 * Lock-wait limits.  Both values are written as minutes * 60 * 1000000,
 * i.e. they are expressed in microseconds.
 */

/* Max time to wait to acquire XLog activity locks */
#define XLOG_LOCK_TIMEOUT			(5*60*1000000)		/* 5 minutes */
/* Max time to wait to acquire checkpoint lock */
#define CHECKPOINT_LOCK_TIMEOUT		(20*60*1000000)		/* 20 minutes */
91

T
Tom Lane 已提交
92 93
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
94
int			XLOGbuffers = 8;
95
int			XLOGfiles = 0;		/* # of files to preallocate during ckpt */
T
Tom Lane 已提交
96
int			XLOG_DEBUG = 0;
97 98
char	   *XLOG_sync_method = NULL;
const char	XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
B
Bruce Momjian 已提交
99 100
char		XLOG_archive_dir[MAXPGPATH];		/* null string means
												 * delete 'em */
T
Tom Lane 已提交
101

102 103 104 105 106 107 108 109 110 111 112 113 114 115
/*
 * XLOGfileslop is the allowed "fuzz" in the number of preallocated XLOG
 * segments: we try to keep at least XLOGfiles segments in advance, but no
 * more than XLOGfiles+XLOGfileslop.  It could become a separate GUC
 * variable, but for now it is hardwired to 2*CheckPointSegments+1.  Under
 * normal conditions a checkpoint frees no more than 2*CheckPointSegments
 * log segments, and we want to recycle all of them; the +1 lets boundary
 * cases happen without wasting a delete/create-segment cycle.
 */

#define XLOGfileslop	((2 * CheckPointSegments) + 1)


116 117 118 119 120 121
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int	sync_method = DEFAULT_SYNC_METHOD;
static int	open_sync_bit = DEFAULT_SYNC_FLAGBIT;

/* Evaluates to open_sync_bit when enableFsync is set, otherwise to 0 */
#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)

/* NOTE(review): presumably the smallest permitted XLOGbuffers -- confirm */
#define MinXLOGbuffers	4

T
Tom Lane 已提交
124 125 126 127 128

/*
 * ThisStartUpID will be same in all backends --- it identifies current
 * instance of the database system.
 */
V
WAL  
Vadim B. Mikheev 已提交
129 130
StartUpID	ThisStartUpID = 0;

T
Tom Lane 已提交
131 132
/* Are we doing recovery by reading XLOG? */
bool		InRecovery = false;
133

T
Tom Lane 已提交
134 135 136 137 138 139 140 141 142
/*
 * MyLastRecPtr points to the start of the last XLOG record inserted by the
 * current transaction.  If MyLastRecPtr.xrecoff == 0, then we are not in
 * a transaction or the transaction has not yet made any loggable changes.
 *
 * Note that XLOG records inserted outside transaction control are not
 * reflected into MyLastRecPtr.
 */
XLogRecPtr	MyLastRecPtr = {0, 0};
V
Vadim B. Mikheev 已提交
143

T
Tom Lane 已提交
144 145 146 147 148 149
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts, transaction-controlled
 * or not.
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
150

T
Tom Lane 已提交
151 152 153
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
154
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
155 156 157 158
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold the Insert spinlock).  See XLogInsert for details.
 */
static XLogRecPtr RedoRecPtr;
159

T
Tom Lane 已提交
160 161
/* This lock must be held to read/update control file or create new log file */
SPINLOCK	ControlFileLockId;
162

T
Tom Lane 已提交
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 * We do a lot of pushups to minimize the amount of access to spinlocked
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
 *		XLogCtl->Write.LogwrtResult is protected by logwrt_lck
 *		XLogCtl->Insert.LogwrtResult is protected by insert_lck
 * One must hold the associated spinlock to read or write any of these, but
 * of course no spinlock is needed to read/write the unshared LogwrtResult.
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
B
Bruce Momjian 已提交
183
 * it releases logwrt_lck.	The point of keeping XLogCtl->Write.LogwrtResult
T
Tom Lane 已提交
184 185 186 187
 * is that it can be examined/modified by code that already holds logwrt_lck
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
188
 * but is updated when convenient.	Again, it exists for the convenience of
T
Tom Lane 已提交
189 190 191 192 193 194 195 196 197 198 199 200 201 202
 * code that is already holding insert_lck but not the other locks.
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
 *----------
 */
typedef struct XLogwrtRqst
203
{
T
Tom Lane 已提交
204 205
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
B
Bruce Momjian 已提交
206
}			XLogwrtRqst;
207

T
Tom Lane 已提交
208
typedef struct XLogwrtResult
209
{
T
Tom Lane 已提交
210 211
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
B
Bruce Momjian 已提交
212
}			XLogwrtResult;
213

T
Tom Lane 已提交
214 215 216
/*
 * Shared state data for XLogInsert.
 */
217 218
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
219 220 221 222 223 224
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
225 226
} XLogCtlInsert;

T
Tom Lane 已提交
227 228 229
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
230 231
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
232 233
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	uint16		curridx;		/* cache index of next block to write */
234 235
} XLogCtlWrite;

T
Tom Lane 已提交
236 237 238
/*
 * Total shared-memory state for XLOG.
 */
239 240
typedef struct XLogCtlData
{
T
Tom Lane 已提交
241
	/* Protected by insert_lck: */
B
Bruce Momjian 已提交
242
	XLogCtlInsert Insert;
T
Tom Lane 已提交
243
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
244 245
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
T
Tom Lane 已提交
246
	/* Protected by logwrt_lck: */
B
Bruce Momjian 已提交
247 248
	XLogCtlWrite Write;

T
Tom Lane 已提交
249 250
	/*
	 * These values do not change after startup, although the pointed-to
B
Bruce Momjian 已提交
251
	 * pages and xlblocks values certainly do.	Permission to read/write
T
Tom Lane 已提交
252 253
	 * the pages and xlblocks values depends on insert_lck and logwrt_lck.
	 */
B
Bruce Momjian 已提交
254 255 256 257 258
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32		XLogCacheByte;	/* # bytes in xlog buffers */
	uint32		XLogCacheBlck;	/* highest allocated xlog buffer index */
	StartUpID	ThisStartUpID;
T
Tom Lane 已提交
259 260

	/* This value is not protected by *any* spinlock... */
B
Bruce Momjian 已提交
261
	XLogRecPtr	RedoRecPtr;		/* see SetRedoRecPtr/GetRedoRecPtr */
T
Tom Lane 已提交
262

B
Bruce Momjian 已提交
263 264 265 266
	slock_t		insert_lck;		/* XLogInsert lock */
	slock_t		info_lck;		/* locks shared LogwrtRqst/LogwrtResult */
	slock_t		logwrt_lck;		/* XLogWrite/XLogFlush lock */
	slock_t		chkp_lck;		/* checkpoint lock */
267 268
} XLogCtlData;

269
static XLogCtlData *XLogCtl = NULL;
270

271
/*
T
Tom Lane 已提交
272
 * We maintain an image of pg_control in shared memory.
273
 */
274
static ControlFileData *ControlFile = NULL;
275

T
Tom Lane 已提交
276 277 278 279 280
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */

/*
 * Free space remaining in the current xlog page buffer.
 * "Insert" is a pointer to the insert state; the result is BLCKSZ minus
 * the distance from the start of the current page to the insertion point.
 */
#define INSERT_FREESPACE(Insert)  \
	(BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

/*
 * Construct XLogRecPtr value for current insertion point.
 * xlblocks[curridx] holds the position just past the end of the page, so
 * subtracting the page's remaining free space gives the insertion point.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)


/*
 * Step an xlogid/segment pair forward by one segment, rolling over into
 * the next log id after the last segment of a log file.
 */
#define NextLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) < XLogSegsPerFile-1) \
			(logSeg)++; \
		else \
		{ \
			(logSeg) = 0; \
			(logId)++; \
		} \
	} while (0)

/*
 * Step an xlogid/segment pair back by one segment, borrowing from the
 * log id when already at segment 0 (caller must not pass 0,0).
 */
#define PrevLogSeg(logId, logSeg)	\
	do { \
		if ((logSeg) == 0) \
		{ \
			(logSeg) = XLogSegsPerFile-1; \
			(logId)--; \
		} \
		else \
			(logSeg)--; \
	} while (0)
V
WAL  
Vadim B. Mikheev 已提交
318

T
Tom Lane 已提交
319 320 321 322
/*
 * Split an XLogRecPtr into its log ID and segment number.
 *
 * XLByteToSeg does the computation at face value.  XLByteToPrevSeg treats
 * a boundary byte as belonging to the previous segment, which is what you
 * want when deciding which segment to write given a pointer to a record
 * end, for example.
 */
#define XLByteToSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = (xlrp).xrecoff / XLogSegSize \
	)
#define XLByteToPrevSeg(xlrp, logId, logSeg)	\
	( \
	  (logId) = (xlrp).xlogid, \
	  (logSeg) = ((xlrp).xrecoff - 1) / XLogSegSize \
	)
335

336
/*
 * Is an XLogRecPtr within a particular XLOG segment?
 *
 * For XLByteInSeg, do the computation at face value.  For XLByteInPrevSeg,
 * a boundary byte is taken to be in the previous segment.
 */
#define XLByteInSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 (xlrp).xrecoff / XLogSegSize == (logSeg))

#define XLByteInPrevSeg(xlrp, logId, logSeg)	\
	((xlrp).xlogid == (logId) && \
	 ((xlrp).xrecoff - 1) / XLogSegSize == (logSeg))
349 350


351
/*
 * Build the path of a log segment file into "path" (a buffer of at least
 * MAXPGPATH bytes): XLogDir, a slash, then the log id and segment number
 * each printed as 8 upper-case hex digits.
 */
#define XLogFileName(path, log, seg)	\
			snprintf(path, MAXPGPATH, "%s/%08X%08X",	\
					 XLogDir, log, seg)
354

T
Tom Lane 已提交
355 356 357 358 359
/*
 * Step a buffer index backward/forward through the XLOG page cache,
 * wrapping around at index 0 and at XLogCtl->XLogCacheBlck.
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
360

361
/*
 * True if a record could start at byte offset "xrecoff": the position
 * within its page must be at or past the page header (SizeOfXLogPHD) and
 * must leave at least SizeOfXLogRecord bytes before the end of the page.
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)
364

T
Tom Lane 已提交
365 366 367 368 369 370
/*
 * _INTL_MAXLOGRECSZ: upper bound on the space one record can need, i.e.
 * the record header, the largest rmgr payload (MAXLOGRECSZ), and
 * XLR_MAX_BKP_BLOCKS backup blocks each made of a BkpBlock header plus a
 * full BLCKSZ page.
 */
#define _INTL_MAXLOGRECSZ	\
	(SizeOfXLogRecord + MAXLOGRECSZ + \
	 (XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ)))
371

372

T
Tom Lane 已提交
373
/* File path names */
B
Bruce Momjian 已提交
374 375
static char XLogDir[MAXPGPATH];
static char ControlFilePath[MAXPGPATH];
T
Tom Lane 已提交
376 377 378 379 380 381

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
382

T
Tom Lane 已提交
383 384 385 386 387 388 389 390 391 392
/*
 * State of the log segment currently open for writing.
 *
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static uint32 openLogId = 0;
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;
393

T
Tom Lane 已提交
394 395 396 397 398 399
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.
 */
400 401 402 403
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
B
Bruce Momjian 已提交
404

T
Tom Lane 已提交
405 406
/* Buffer for currently read page (BLCKSZ bytes) */
static char *readBuf = NULL;
B
Bruce Momjian 已提交
407

T
Tom Lane 已提交
408 409 410
/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
411
static XLogRecord *nextRecord = NULL;
412
static StartUpID lastReadSUI;
413

V
WAL  
Vadim B. Mikheev 已提交
414 415
static bool InRedo = false;

T
Tom Lane 已提交
416 417 418

static bool AdvanceXLInsertBuffer(void);
static void XLogWrite(XLogwrtRqst WriteRqst);
B
Bruce Momjian 已提交
419 420
static int XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock);
421 422 423
static bool InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
								   bool find_free, int max_advance,
								   bool use_lock);
T
Tom Lane 已提交
424 425
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static void PreallocXlogFiles(XLogRecPtr endptr);
426
static void MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr);
T
Tom Lane 已提交
427
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer);
428
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI);
T
Tom Lane 已提交
429
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr,
430
					 int whichChkpt,
B
Bruce Momjian 已提交
431
					 char *buffer);
T
Tom Lane 已提交
432 433 434 435
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
436
static void issue_xlog_fsync(void);
T
Tom Lane 已提交
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
 * the rdata list (see xlog.h for notes about rdata).
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
454
XLogRecPtr
455
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
456
{
B
Bruce Momjian 已提交
457 458
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
459
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
	uint16		curridx;
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
	crc64		rdata_crc;
	uint32		len,
				write_len;
	unsigned	i;
	bool		do_logwrt;
	bool		updrqst;
	bool		no_tran = (rmid == RM_XLOG_ID) ? true : false;
V
Vadim B. Mikheev 已提交
477 478 479 480

	if (info & XLR_INFO_MASK)
	{
		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
B
Bruce Momjian 已提交
481
			elog(STOP, "XLogInsert: invalid info mask %02X",
T
Tom Lane 已提交
482
				 (info & XLR_INFO_MASK));
V
Vadim B. Mikheev 已提交
483 484 485 486
		no_tran = true;
		info &= ~XLR_INFO_MASK;
	}

T
Tom Lane 已提交
487
	/*
B
Bruce Momjian 已提交
488 489
	 * In bootstrap mode, we don't actually log anything but XLOG
	 * resources; return a phony record pointer.
T
Tom Lane 已提交
490
	 */
V
Vadim B. Mikheev 已提交
491
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
492 493
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
494
		RecPtr.xrecoff = SizeOfXLogPHD; /* start of 1st checkpoint record */
V
WAL  
Vadim B. Mikheev 已提交
495 496 497
		return (RecPtr);
	}

T
Tom Lane 已提交
498 499 500 501 502 503
	/*
	 * Here we scan the rdata list, determine which buffers must be backed
	 * up, and compute the CRC values for the data.  Note that the record
	 * header isn't added into the CRC yet since we don't know the final
	 * length or info bits quite yet.
	 *
B
Bruce Momjian 已提交
504 505 506 507 508 509 510 511 512 513 514
	 * We may have to loop back to here if a race condition is detected
	 * below. We could prevent the race by doing all this work while
	 * holding the insert spinlock, but it seems better to avoid doing CRC
	 * calculations while holding the lock.  This means we have to be
	 * careful about modifying the rdata list until we know we aren't
	 * going to loop back again.  The only change we allow ourselves to
	 * make earlier is to set rdt->data = NULL in list items we have
	 * decided we will have to back up the whole buffer for.  This is OK
	 * because we will certainly decide the same thing again for those
	 * items if we do it over; doing it here saves an extra pass over the
	 * list later.
T
Tom Lane 已提交
515
	 */
516
begin:;
T
Tom Lane 已提交
517 518 519 520 521 522
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

523
	INIT_CRC64(rdata_crc);
T
Tom Lane 已提交
524
	len = 0;
B
Bruce Momjian 已提交
525
	for (rdt = rdata;;)
526 527 528
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
529
			/* Simple data, just include it */
530 531 532
			len += rdt->len;
			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
		}
T
Tom Lane 已提交
533
		else
534
		{
T
Tom Lane 已提交
535 536
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
537
			{
T
Tom Lane 已提交
538
				if (rdt->buffer == dtbuf[i])
539
				{
T
Tom Lane 已提交
540 541 542 543 544 545 546 547 548
					/* Buffer already referenced by earlier list item */
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
549
				}
T
Tom Lane 已提交
550
				if (dtbuf[i] == InvalidBuffer)
551
				{
T
Tom Lane 已提交
552 553
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
B
Bruce Momjian 已提交
554

T
Tom Lane 已提交
555 556 557
					/*
					 * XXX We assume page LSN is first data on page
					 */
B
Bruce Momjian 已提交
558
					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
T
Tom Lane 已提交
559 560
					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
					{
B
Bruce Momjian 已提交
561
						crc64		dtcrc;
T
Tom Lane 已提交
562 563 564 565 566 567 568 569 570 571

						dtbuf_bkp[i] = true;
						rdt->data = NULL;
						INIT_CRC64(dtcrc);
						COMP_CRC64(dtcrc,
								   BufferGetBlock(dtbuf[i]),
								   BLCKSZ);
						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
						COMP_CRC64(dtcrc,
B
Bruce Momjian 已提交
572
								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
T
Tom Lane 已提交
573 574 575 576 577 578 579 580 581 582
								   sizeof(BkpBlock) - sizeof(crc64));
						FIN_CRC64(dtcrc);
						dtbuf_xlg[i].crc = dtcrc;
					}
					else if (rdt->data)
					{
						len += rdt->len;
						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
					}
					break;
583 584
				}
			}
T
Tom Lane 已提交
585 586 587
			if (i >= XLR_MAX_BKP_BLOCKS)
				elog(STOP, "XLogInsert: can backup %d blocks at most",
					 XLR_MAX_BKP_BLOCKS);
588
		}
T
Tom Lane 已提交
589
		/* Break out of loop when rdt points to last list item */
590 591 592 593 594
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

T
Tom Lane 已提交
595 596 597
	/*
	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
	 * all of the rmgr data might have been suppressed in favor of backup
B
Bruce Momjian 已提交
598
	 * blocks.	Currently, all callers of XLogInsert provide at least some
T
Tom Lane 已提交
599 600 601 602
	 * not-in-a-buffer data and so len == 0 should never happen, but that
	 * may not be true forever.  If you need to remove the len == 0 check,
	 * also remove the check for xl_len == 0 in ReadRecord, below.
	 */
603
	if (len == 0 || len > MAXLOGRECSZ)
604
		elog(STOP, "XLogInsert: invalid record length %u", len);
605

606
	START_CRIT_SECTION();
607

T
Tom Lane 已提交
608 609
	/* wait to obtain xlog insert lock */
	do_logwrt = true;
610

T
Tom Lane 已提交
611 612 613 614
	for (i = 0;;)
	{
		/* try to update LogwrtResult while waiting for insert lock */
		if (!TAS(&(XLogCtl->info_lck)))
615
		{
B
Bruce Momjian 已提交
616
			XLogwrtRqst LogwrtRqst;
617

T
Tom Lane 已提交
618 619 620 621 622
			LogwrtRqst = XLogCtl->LogwrtRqst;
			LogwrtResult = XLogCtl->LogwrtResult;
			S_UNLOCK(&(XLogCtl->info_lck));

			/*
B
Bruce Momjian 已提交
623 624 625
			 * If cache is half filled then try to acquire logwrt lock and
			 * do LOGWRT work, but only once per XLogInsert call. Ignore
			 * any fractional blocks in performing this check.
T
Tom Lane 已提交
626 627 628 629 630 631 632 633
			 */
			LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % BLCKSZ;
			if (do_logwrt &&
				(LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
				 (LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
				  XLogCtl->XLogCacheByte / 2)))
			{
				if (!TAS(&(XLogCtl->logwrt_lck)))
634
				{
T
Tom Lane 已提交
635 636
					LogwrtResult = XLogCtl->Write.LogwrtResult;
					if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
637
					{
T
Tom Lane 已提交
638 639
						XLogWrite(LogwrtRqst);
						do_logwrt = false;
640
					}
T
Tom Lane 已提交
641
					S_UNLOCK(&(XLogCtl->logwrt_lck));
642 643 644
				}
			}
		}
T
Tom Lane 已提交
645 646 647
		if (!TAS(&(XLogCtl->insert_lck)))
			break;
		S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++, XLOG_LOCK_TIMEOUT);
648 649
	}

T
Tom Lane 已提交
650 651
	/*
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to
B
Bruce Momjian 已提交
652 653 654
	 * go back and recompute everything.  This can only happen just after
	 * a checkpoint, so it's better to be slow in this case and fast
	 * otherwise.
T
Tom Lane 已提交
655 656
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
657
	{
T
Tom Lane 已提交
658 659 660 661
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

		for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
662
		{
T
Tom Lane 已提交
663 664 665 666 667
			if (dtbuf[i] == InvalidBuffer)
				continue;
			if (dtbuf_bkp[i] == false &&
				XLByteLE(dtbuf_lsn[i], RedoRecPtr))
			{
B
Bruce Momjian 已提交
668

T
Tom Lane 已提交
669
				/*
B
Bruce Momjian 已提交
670 671
				 * Oops, this buffer now needs to be backed up, but we
				 * didn't think so above.  Start over.
T
Tom Lane 已提交
672 673 674 675 676
				 */
				S_UNLOCK(&(XLogCtl->insert_lck));
				END_CRIT_SECTION();
				goto begin;
			}
677 678 679
		}
	}

T
Tom Lane 已提交
680 681 682 683 684 685 686
	/*
	 * Make additional rdata list entries for the backup blocks, so that
	 * we don't need to special-case them in the write loop.  Note that we
	 * have now irrevocably changed the input rdata list.  At the exit of
	 * this loop, write_len includes the backup block data.
	 *
	 * Also set the appropriate info bits to show which buffers were backed
B
Bruce Momjian 已提交
687 688 689
	 * up.	The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th
	 * distinct buffer value (ignoring InvalidBuffer) appearing in the
	 * rdata list.
T
Tom Lane 已提交
690 691 692
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
693 694 695 696
	{
		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
			continue;

T
Tom Lane 已提交
697
		info |= XLR_SET_BKP_BLOCK(i);
698 699 700

		rdt->next = &(dtbuf_rdt[2 * i]);

B
Bruce Momjian 已提交
701
		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
702
		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
T
Tom Lane 已提交
703
		write_len += sizeof(BkpBlock);
704 705 706

		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);

B
Bruce Momjian 已提交
707
		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
708
		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
T
Tom Lane 已提交
709
		write_len += BLCKSZ;
710 711 712
		dtbuf_rdt[2 * i + 1].next = NULL;
	}

T
Tom Lane 已提交
713
	/* Insert record header */
714

T
Tom Lane 已提交
715 716
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
717 718
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
719
		updrqst = AdvanceXLInsertBuffer();
720 721 722
		freespace = BLCKSZ - SizeOfXLogPHD;
	}

T
Tom Lane 已提交
723
	curridx = Insert->curridx;
724
	record = (XLogRecord *) Insert->currpos;
T
Tom Lane 已提交
725

726
	record->xl_prev = Insert->PrevRecord;
V
Vadim B. Mikheev 已提交
727
	if (no_tran)
728 729 730 731
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
V
Vadim B. Mikheev 已提交
732 733 734
	else
		record->xl_xact_prev = MyLastRecPtr;

735
	record->xl_xid = GetCurrentTransactionId();
T
Tom Lane 已提交
736
	record->xl_len = len;		/* doesn't include backup blocks */
737
	record->xl_info = info;
738
	record->xl_rmid = rmid;
739

T
Tom Lane 已提交
740
	/* Now we can finish computing the main CRC */
B
Bruce Momjian 已提交
741
	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
742
			   SizeOfXLogRecord - sizeof(crc64));
743 744 745
	FIN_CRC64(rdata_crc);
	record->xl_crc = rdata_crc;

T
Tom Lane 已提交
746 747 748 749
	/* Compute record's XLOG location */
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/* If first XLOG record of transaction, save it in PROC array */
V
Vadim B. Mikheev 已提交
750
	if (MyLastRecPtr.xrecoff == 0 && !no_tran)
751 752 753 754 755
	{
		SpinAcquire(SInvalLock);
		MyProc->logRec = RecPtr;
		SpinRelease(SInvalLock);
	}
V
WAL  
Vadim B. Mikheev 已提交
756 757 758

	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
759
		char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
760 761 762

		sprintf(buf, "INSERT @ %u/%u: ", RecPtr.xlogid, RecPtr.xrecoff);
		xlog_outrec(buf, record);
763
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
764 765
		{
			strcat(buf, " - ");
766
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
767
		}
768
		elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
769 770
	}

T
Tom Lane 已提交
771 772 773 774 775 776
	/* Record begin of record in appropriate places */
	if (!no_tran)
		MyLastRecPtr = RecPtr;
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

777
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
778
	freespace -= SizeOfXLogRecord;
779

T
Tom Lane 已提交
780 781 782 783
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
784
	{
785 786 787 788
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
789
		{
790 791 792 793 794
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
795
				write_len -= freespace;
796 797 798 799 800
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
801
				write_len -= rdata->len;
802 803 804 805
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
806 807
		}

808
		/* Use next buffer */
T
Tom Lane 已提交
809 810 811 812 813 814 815 816
		updrqst = AdvanceXLInsertBuffer();
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
817
	}
818

T
Tom Lane 已提交
819 820
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
821
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
822
	freespace = INSERT_FREESPACE(Insert);
823

V
Vadim B. Mikheev 已提交
824
	/*
B
Bruce Momjian 已提交
825 826
	 * The recptr I return is the beginning of the *next* record. This
	 * will be stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
827
	 */
T
Tom Lane 已提交
828
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
829

T
Tom Lane 已提交
830
	/* Need to update shared LogwrtRqst if some block was filled up */
831
	if (freespace < SizeOfXLogRecord)
B
Bruce Momjian 已提交
832 833
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
834 835
	else
		curridx = PrevBufIdx(curridx);
T
Tom Lane 已提交
836
	WriteRqst = XLogCtl->xlblocks[curridx];
837 838 839 840 841

	S_UNLOCK(&(XLogCtl->insert_lck));

	if (updrqst)
	{
842
		S_LOCK(&(XLogCtl->info_lck));
T
Tom Lane 已提交
843 844 845 846 847
		/* advance global request to include new block(s) */
		if (XLByteLT(XLogCtl->LogwrtRqst.Write, WriteRqst))
			XLogCtl->LogwrtRqst.Write = WriteRqst;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
848
		S_UNLOCK(&(XLogCtl->info_lck));
849 850
	}

851
	END_CRIT_SECTION();
852
	return (RecPtr);
853
}
854

T
Tom Lane 已提交
855 856 857 858 859 860 861 862 863 864 865 866 867
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
 * just-filled page.  If we can do this for free (without an extra spinlock),
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
 * Must be called with insert_lck held.
 */
static bool
AdvanceXLInsertBuffer(void)
868
{
T
Tom Lane 已提交
869 870 871 872 873 874
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		nextidx = NextBufIdx(Insert->curridx);
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
875 876
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
877

T
Tom Lane 已提交
878 879 880
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
881

T
Tom Lane 已提交
882
	/*
B
Bruce Momjian 已提交
883 884 885
	 * Get ending-offset of the buffer page we need to replace (this may
	 * be zero if the buffer hasn't been used yet).  Fall through if it's
	 * already written out.
T
Tom Lane 已提交
886 887 888 889 890 891 892
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		unsigned	spins = 0;
		XLogRecPtr	FinishedPageRqstPtr;
893

T
Tom Lane 已提交
894
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
895

T
Tom Lane 已提交
896
		for (;;)
897
		{
T
Tom Lane 已提交
898 899
			/* While waiting, try to get info_lck and update LogwrtResult */
			if (!TAS(&(XLogCtl->info_lck)))
900
			{
T
Tom Lane 已提交
901 902
				if (XLByteLT(XLogCtl->LogwrtRqst.Write, FinishedPageRqstPtr))
					XLogCtl->LogwrtRqst.Write = FinishedPageRqstPtr;
B
Bruce Momjian 已提交
903
				update_needed = false;	/* Did the shared-request update */
T
Tom Lane 已提交
904
				LogwrtResult = XLogCtl->LogwrtResult;
905 906
				S_UNLOCK(&(XLogCtl->info_lck));

T
Tom Lane 已提交
907
				if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
908
				{
T
Tom Lane 已提交
909 910 911
					/* OK, someone wrote it already */
					Insert->LogwrtResult = LogwrtResult;
					break;
912
				}
T
Tom Lane 已提交
913 914 915
			}

			/*
B
Bruce Momjian 已提交
916 917
			 * LogwrtResult lock is busy or we know the page is still
			 * dirty. Try to acquire logwrt lock and write full blocks.
T
Tom Lane 已提交
918 919 920 921 922
			 */
			if (!TAS(&(XLogCtl->logwrt_lck)))
			{
				LogwrtResult = Write->LogwrtResult;
				if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
923
				{
T
Tom Lane 已提交
924 925 926 927
					S_UNLOCK(&(XLogCtl->logwrt_lck));
					/* OK, someone wrote it already */
					Insert->LogwrtResult = LogwrtResult;
					break;
928
				}
B
Bruce Momjian 已提交
929

T
Tom Lane 已提交
930
				/*
B
Bruce Momjian 已提交
931 932
				 * Have to write buffers while holding insert lock. This
				 * is not good, so only write as much as we absolutely
T
Tom Lane 已提交
933 934 935 936 937 938 939 940
				 * must.
				 */
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
				XLogWrite(WriteRqst);
				S_UNLOCK(&(XLogCtl->logwrt_lck));
				Insert->LogwrtResult = LogwrtResult;
941 942
				break;
			}
T
Tom Lane 已提交
943
			S_LOCK_SLEEP(&(XLogCtl->logwrt_lck), spins++, XLOG_LOCK_TIMEOUT);
944 945 946
		}
	}

T
Tom Lane 已提交
947 948 949 950
	/*
	 * Now the next buffer slot is free and we can set it up to be the
	 * next output page.
	 */
951 952
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
953
	{
T
Tom Lane 已提交
954
		/* crossing a logid boundary */
955 956
		NewPageEndPtr.xlogid += 1;
		NewPageEndPtr.xrecoff = BLCKSZ;
957
	}
T
Tom Lane 已提交
958
	else
959
	{
960
		NewPageEndPtr.xrecoff += BLCKSZ;
961
	}
962 963
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * BLCKSZ);
T
Tom Lane 已提交
964
	Insert->curridx = nextidx;
965 966
	Insert->currpage = NewPage;
	Insert->currpos = ((char *) NewPage) + SizeOfXLogPHD;
B
Bruce Momjian 已提交
967

T
Tom Lane 已提交
968
	/*
B
Bruce Momjian 已提交
969 970
	 * Be sure to re-zero the buffer so that bytes beyond what we've
	 * written will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
971
	 */
972 973 974 975 976 977 978 979
	MemSet((char *) NewPage, 0, BLCKSZ);

	/* And fill the new page's header */
	NewPage->xlp_magic = XLOG_PAGE_MAGIC;
	/* NewPage->xlp_info = 0; */			/* done by memset */
	NewPage->xlp_sui = ThisStartUpID;
	NewPage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
	NewPage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
T
Tom Lane 已提交
980 981

	return update_needed;
982 983
}

T
Tom Lane 已提交
984 985 986 987 988
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
 * Must be called with logwrt_lck held.
 */
989
static void
T
Tom Lane 已提交
990
XLogWrite(XLogwrtRqst WriteRqst)
991
{
992 993
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
T
Tom Lane 已提交
994
	bool		ispartialpage;
995
	bool		use_existent;
996

B
Bruce Momjian 已提交
997 998 999 1000
	/*
	 * Update local LogwrtResult (caller probably did this already,
	 * but...)
	 */
T
Tom Lane 已提交
1001 1002 1003
	LogwrtResult = Write->LogwrtResult;

	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1004
	{
B
Bruce Momjian 已提交
1005

1006 1007 1008 1009 1010 1011 1012 1013 1014
		/*
		 * Make sure we're not ahead of the insert process.  This could
		 * happen if we're passed a bogus WriteRqst.Write that is past the
		 * end of the last page that's been initialized by
		 * AdvanceXLInsertBuffer.
		 */
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
			elog(STOP, "XLogWrite: write request is past end of log");

T
Tom Lane 已提交
1015 1016 1017 1018 1019
		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1020
		{
B
Bruce Momjian 已提交
1021

T
Tom Lane 已提交
1022 1023 1024 1025
			/*
			 * Switch to new logfile segment.
			 */
			if (openLogFile >= 0)
1026
			{
T
Tom Lane 已提交
1027
				if (close(openLogFile) != 0)
1028
					elog(STOP, "close of log file %u, segment %u failed: %m",
T
Tom Lane 已提交
1029 1030
						 openLogId, openLogSeg);
				openLogFile = -1;
1031
			}
T
Tom Lane 已提交
1032 1033
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1034 1035 1036 1037
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1038
			openLogOff = 0;
1039 1040 1041 1042 1043

			if (!use_existent)	/* there was no precreated file */
				elog(LOG, "XLogWrite: new log file created - "
					 "consider increasing WAL_FILES");

T
Tom Lane 已提交
1044
			/* update pg_control, unless someone else already did */
1045
			SpinAcquire(ControlFileLockId);
1046 1047 1048
			if (ControlFile->logId < openLogId ||
				(ControlFile->logId == openLogId &&
				 ControlFile->logSeg < openLogSeg + 1))
T
Tom Lane 已提交
1049 1050 1051 1052 1053
			{
				ControlFile->logId = openLogId;
				ControlFile->logSeg = openLogSeg + 1;
				ControlFile->time = time(NULL);
				UpdateControlFile();
B
Bruce Momjian 已提交
1054

1055
				/*
B
Bruce Momjian 已提交
1056 1057 1058 1059
				 * Signal postmaster to start a checkpoint if it's been
				 * too long since the last one.  (We look at local copy of
				 * RedoRecPtr which might be a little out of date, but
				 * should be close enough for this purpose.)
1060 1061 1062 1063 1064 1065 1066
				 */
				if (IsUnderPostmaster &&
					(openLogId != RedoRecPtr.xlogid ||
					 openLogSeg >= (RedoRecPtr.xrecoff / XLogSegSize) +
					 (uint32) CheckPointSegments))
				{
					if (XLOG_DEBUG)
1067
						elog(DEBUG, "XLogWrite: time for a checkpoint, signaling postmaster");
1068 1069
					kill(getppid(), SIGUSR1);
				}
T
Tom Lane 已提交
1070
			}
1071 1072 1073
			SpinRelease(ControlFileLockId);
		}

T
Tom Lane 已提交
1074
		if (openLogFile < 0)
1075
		{
T
Tom Lane 已提交
1076 1077 1078
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
			openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
			openLogOff = 0;
1079 1080
		}

T
Tom Lane 已提交
1081 1082
		/* Need to seek in the file? */
		if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1083
		{
T
Tom Lane 已提交
1084 1085
			openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
			if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1086
				elog(STOP, "lseek of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1087
					 openLogId, openLogSeg, openLogOff);
1088 1089
		}

T
Tom Lane 已提交
1090 1091
		/* OK to write the page */
		from = XLogCtl->pages + Write->curridx * BLCKSZ;
1092
		errno = 0;
T
Tom Lane 已提交
1093
		if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1094 1095 1096 1097
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
1098
			elog(STOP, "write of log file %u, segment %u, offset %u failed: %m",
T
Tom Lane 已提交
1099
				 openLogId, openLogSeg, openLogOff);
1100
		}
T
Tom Lane 已提交
1101
		openLogOff += BLCKSZ;
1102

T
Tom Lane 已提交
1103 1104 1105
		/*
		 * If we just wrote the whole last page of a logfile segment,
		 * fsync the segment immediately.  This avoids having to go back
B
Bruce Momjian 已提交
1106 1107 1108
		 * and re-open prior segments when an fsync request comes along
		 * later. Doing it here ensures that one and only one backend will
		 * perform this fsync.
T
Tom Lane 已提交
1109 1110 1111
		 */
		if (openLogOff >= XLogSegSize && !ispartialpage)
		{
1112
			issue_xlog_fsync();
B
Bruce Momjian 已提交
1113
			LogwrtResult.Flush = LogwrtResult.Write;	/* end of current page */
T
Tom Lane 已提交
1114
		}
1115

T
Tom Lane 已提交
1116 1117 1118 1119 1120 1121 1122
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		Write->curridx = NextBufIdx(Write->curridx);
1123 1124
	}

T
Tom Lane 已提交
1125 1126 1127 1128 1129
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1130
	{
B
Bruce Momjian 已提交
1131

T
Tom Lane 已提交
1132
		/*
B
Bruce Momjian 已提交
1133 1134 1135
		 * Could get here without iterating above loop, in which case we
		 * might have no open file or the wrong one.  However, we do not
		 * need to fsync more than one file.
T
Tom Lane 已提交
1136
		 */
1137
		if (sync_method != SYNC_METHOD_OPEN)
T
Tom Lane 已提交
1138
		{
1139
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1140
			 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1141 1142
			{
				if (close(openLogFile) != 0)
1143
					elog(STOP, "close of log file %u, segment %u failed: %m",
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
				openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
				openLogOff = 0;
			}
			issue_xlog_fsync();
T
Tom Lane 已提交
1154 1155
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1156 1157
	}

T
Tom Lane 已提交
1158 1159 1160
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1161 1162
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves
T
Tom Lane 已提交
1163 1164
	 * some code in a couple of places.
	 */
1165
	S_LOCK(&(XLogCtl->info_lck));
T
Tom Lane 已提交
1166 1167 1168 1169 1170
	XLogCtl->LogwrtResult = LogwrtResult;
	if (XLByteLT(XLogCtl->LogwrtRqst.Write, LogwrtResult.Write))
		XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
	if (XLByteLT(XLogCtl->LogwrtRqst.Flush, LogwrtResult.Flush))
		XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
1171 1172
	S_UNLOCK(&(XLogCtl->info_lck));

T
Tom Lane 已提交
1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
	Write->LogwrtResult = LogwrtResult;
}

/*
 * Make sure all XLOG data up to and including position 'record' has been
 * flushed to disk.
 *
 * NOTE: unlike XLogWrite, this is entered without logwrt_lck held, and it
 * tries to get away without acquiring that lock at all.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	writeUpTo;
	unsigned	spins = 0;

	if (XLOG_DEBUG)
	{
		elog(DEBUG, "XLogFlush%s%s: request %u/%u; write %u/%u; flush %u/%u\n",
			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			 (InRedo) ? "(redo)" : "",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
		fflush(stderr);
	}

	/* No-op during REDO, and when our local state says it is flushed */
	if (InRedo || XLByteLE(record, LogwrtResult.Flush))
		return;

	START_CRIT_SECTION();

	/*
	 * fsync tends to be very expensive, so each one should cover as much
	 * data as possible.  Any further data found in the xlog buffer is
	 * written and fsync'd along with the requested record, leaving
	 * LogwrtResult.Flush as far ahead as we can get it and possibly saving
	 * another fsync shortly afterwards.
	 */

	/* start with the caller's target; it may be raised below */
	writeUpTo = record;

	for (;;)
	{
		/* pick up the shared request/result state if info_lck is free */
		if (!TAS(&(XLogCtl->info_lck)))
		{
			if (XLByteLT(writeUpTo, XLogCtl->LogwrtRqst.Write))
				writeUpTo = XLogCtl->LogwrtRqst.Write;
			LogwrtResult = XLogCtl->LogwrtResult;
			S_UNLOCK(&(XLogCtl->info_lck));
			if (XLByteLE(record, LogwrtResult.Flush))
				break;			/* someone else already flushed it */
		}

		/* if insert_lck is free, extend the target to the insert position */
		if (!TAS(&(XLogCtl->insert_lck)))
		{
			XLogCtlInsert *Insert = &XLogCtl->Insert;
			uint32		freespace = INSERT_FREESPACE(Insert);

			/* end of the current page, less the unused tail unless full */
			writeUpTo = XLogCtl->xlblocks[Insert->curridx];
			if (freespace >= SizeOfXLogRecord)
				writeUpTo.xrecoff -= freespace;
			S_UNLOCK(&(XLogCtl->insert_lck));
		}

		/* now try for the write lock and do the work */
		if (!TAS(&(XLogCtl->logwrt_lck)))
		{
			XLogwrtRqst rqst;

			LogwrtResult = XLogCtl->Write.LogwrtResult;
			if (XLByteLE(record, LogwrtResult.Flush))
			{
				/* flushed while we were getting the lock */
				S_UNLOCK(&(XLogCtl->logwrt_lck));
				break;
			}
			rqst.Write = writeUpTo;
			rqst.Flush = record;
			XLogWrite(rqst);
			S_UNLOCK(&(XLogCtl->logwrt_lck));
			if (XLByteLT(LogwrtResult.Flush, record))
				elog(STOP, "XLogFlush: request is not satisfied");
			break;
		}

		S_LOCK_SLEEP(&(XLogCtl->logwrt_lck), spins++, XLOG_LOCK_TIMEOUT);
	}

	END_CRIT_SECTION();
}

T
Tom Lane 已提交
1276 1277 1278
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
1279 1280 1281
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
1282
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
1283 1284 1285 1286 1287 1288
 * file was used.
 *
 * use_lock: if TRUE, acquire ControlFileLock spinlock while moving file into
 * place.  This should be TRUE except during bootstrap log creation.  The
 * caller must *not* hold the spinlock at call.
 *
T
Tom Lane 已提交
1289 1290
 * Returns FD of opened file.
 */
1291
static int
1292 1293
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
1294
{
1295
	char		path[MAXPGPATH];
1296
	char		tmppath[MAXPGPATH];
1297
	char		zbuffer[BLCKSZ];
1298
	int			fd;
1299
	int			nbytes;
1300 1301

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
1302 1303

	/*
B
Bruce Momjian 已提交
1304 1305
	 * Try to use existent file (checkpoint maker may have created it
	 * already)
V
Vadim B. Mikheev 已提交
1306
	 */
1307
	if (*use_existent)
V
Vadim B. Mikheev 已提交
1308
	{
1309 1310
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
1311 1312 1313
		if (fd < 0)
		{
			if (errno != ENOENT)
1314 1315
				elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
					 path, log, seg);
V
Vadim B. Mikheev 已提交
1316 1317
		}
		else
B
Bruce Momjian 已提交
1318
			return (fd);
V
Vadim B. Mikheev 已提交
1319 1320
	}

1321
	/*
B
Bruce Momjian 已提交
1322 1323 1324 1325
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible
	 * that another process is doing the same thing.  If so, we will end
	 * up pre-creating an extra log segment.  That seems OK, and better
	 * than holding the spinlock throughout this lengthy process.
1326
	 */
1327 1328
	snprintf(tmppath, MAXPGPATH, "%s/xlogtemp.%d",
			 XLogDir, (int) getpid());
1329 1330

	unlink(tmppath);
1331

1332
	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
1333
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
1334
					   S_IRUSR | S_IWUSR);
1335
	if (fd < 0)
1336
		elog(STOP, "creation of file %s failed: %m", tmppath);
1337

1338
	/*
B
Bruce Momjian 已提交
1339
	 * Zero-fill the file.	We have to do this the hard way to ensure that
1340 1341
	 * all the file space has really been allocated --- on platforms that
	 * allow "holes" in files, just seeking to the end doesn't allocate
B
Bruce Momjian 已提交
1342
	 * intermediate space.	This way, we know that we have all the space
1343
	 * and (after the fsync below) that all the indirect blocks are down
1344 1345
	 * on disk.  Therefore, fdatasync(2) or O_DSYNC will be sufficient to
	 * sync future writes to the log file.
1346 1347 1348 1349
	 */
	MemSet(zbuffer, 0, sizeof(zbuffer));
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
	{
1350
		errno = 0;
1351
		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
T
Tom Lane 已提交
1352
		{
B
Bruce Momjian 已提交
1353
			int			save_errno = errno;
T
Tom Lane 已提交
1354

B
Bruce Momjian 已提交
1355 1356 1357 1358
			/*
			 * If we fail to make the file, delete it to release disk
			 * space
			 */
1359
			unlink(tmppath);
1360 1361
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
1362

T
Tom Lane 已提交
1363
			elog(STOP, "ZeroFill failed to write %s: %m", tmppath);
T
Tom Lane 已提交
1364
		}
1365
	}
1366

1367
	if (pg_fsync(fd) != 0)
1368
		elog(STOP, "fsync of file %s failed: %m", tmppath);
1369

V
Vadim B. Mikheev 已提交
1370
	close(fd);
T
Tom Lane 已提交
1371

1372
	/*
1373 1374
	 * Now move the segment into place with its final name.
	 *
1375 1376 1377 1378 1379
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone
	 * else has created the file while we were filling ours: if so, use
	 * ours to pre-create a future log segment.
	 */
1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440
	if (!InstallXLogFileSegment(log, seg, tmppath,
								*use_existent, XLOGfiles + XLOGfileslop,
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);

	return (fd);
}

/*
 * Put an XLOG segment file in place as a current or future log segment.
 *
 * Serves two purposes: installing a freshly created segment (which carries
 * a temporary name while being filled) and recycling an old segment.
 *
 * log, seg: the segment to install as, or the first candidate position.
 *
 * tmppath: present name of the file; it is renamed into place.
 *
 * find_free: if TRUE, the file goes into the first unoccupied log/seg
 * position at or after the given one.  If FALSE, it goes exactly where
 * specified, and any segment file already there is deleted first.
 *
 * max_advance: how many log/seg positions past the starting point may be
 * tried before giving up.  (Ignored when find_free is FALSE.)
 *
 * use_lock: if TRUE, hold the ControlFileLock spinlock while moving the file
 * into place.  Should be TRUE except during bootstrap log creation.  The
 * caller must *not* already hold the spinlock.
 *
 * Returns TRUE if the file was installed, FALSE if no free position was
 * found within max_advance.  (Any other failure ends in elog().)
 */
static bool
InstallXLogFileSegment(uint32 log, uint32 seg, char *tmppath,
					   bool find_free, int max_advance,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(path, log, seg);

	/*
	 * Only one process at a time should be doing this.
	 */
	if (use_lock)
		SpinAcquire(ControlFileLockId);

	if (find_free)
	{
		/* Step forward until a position whose file cannot be opened */
		for (;;)
		{
			fd = BasicOpenFile(path, O_RDWR | PG_BINARY,
							   S_IRUSR | S_IWUSR);
			if (fd < 0)
				break;			/* nothing openable here: take this slot */
			close(fd);

			if (--max_advance < 0)
			{
				/* Ran out of allowed positions without finding a gap */
				if (use_lock)
					SpinRelease(ControlFileLockId);
				return false;
			}
			NextLogSeg(log, seg);
			XLogFileName(path, log, seg);
		}
	}
	else
	{
		/* Forced installation: remove whatever segment file is there */
		unlink(path);
	}

	/*
	 * link() is preferred over rename() purely as extra insurance against
	 * overwriting an existing logfile.  There should not be one, so
	 * rename() is an acceptable substitute for all but the truly paranoid.
	 */
#ifndef __BEOS__
	if (link(tmppath, path) < 0)
		elog(STOP, "link from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
		elog(STOP, "rename from %s to %s (initialization of log file %u, segment %u) failed: %m",
			 tmppath, path, log, seg);
#endif

	if (use_lock)
		SpinRelease(ControlFileLockId);

	return true;
}

T
Tom Lane 已提交
1488 1489 1490
/*
 * Open a pre-existing logfile segment.
 */
1491 1492 1493
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
1494 1495
	char		path[MAXPGPATH];
	int			fd;
1496 1497 1498

	XLogFileName(path, log, seg);

1499 1500
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
1501 1502 1503 1504
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
1505 1506
			elog(LOG, "open of %s (log file %u, segment %u) failed: %m",
				 path, log, seg);
1507 1508
			return (fd);
		}
1509 1510
		elog(STOP, "open of %s (log file %u, segment %u) failed: %m",
			 path, log, seg);
1511 1512
	}

1513
	return (fd);
1514 1515
}

V
Vadim B. Mikheev 已提交
1516
/*
 * Make sure log segments following the given log endpoint exist, creating
 * them when necessary.
 *
 * With XLOGfiles > 0, that many segments after the one containing endptr
 * are prepared.  Otherwise a single following segment is prepared, and only
 * when endptr lies in the last quarter of its segment.
 */
static void
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		segId;
	uint32		segNo;
	int			remaining;

	XLByteToPrevSeg(endptr, segId, segNo);

	/* Decide how many segments beyond the current one to prepare */
	if (XLOGfiles > 0)
		remaining = XLOGfiles;
	else if ((endptr.xrecoff - 1) % XLogSegSize >=
			 (uint32) (0.75 * XLogSegSize))
		remaining = 1;
	else
		remaining = 0;

	while (remaining-- > 0)
	{
		bool		reuse_ok = true;	/* an existing file is acceptable */
		int			fd;

		NextLogSeg(segId, segNo);
		fd = XLogFileInit(segId, segNo, &reuse_ok, true);
		/* only the file's existence matters, not the descriptor */
		close(fd);
	}
}

/*
 * Remove or move offline all log files older or equal to passed log/seg#
1552 1553 1554
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
1555 1556
 */
static void
1557
MoveOfflineLogs(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
1558
{
1559 1560
	uint32		endlogId;
	uint32		endlogSeg;
B
Bruce Momjian 已提交
1561 1562 1563 1564
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[32];
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
1565

1566
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
V
Vadim B. Mikheev 已提交
1567 1568 1569

	xldir = opendir(XLogDir);
	if (xldir == NULL)
1570 1571
		elog(STOP, "could not open transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1572

T
Tom Lane 已提交
1573
	sprintf(lastoff, "%08X%08X", log, seg);
V
Vadim B. Mikheev 已提交
1574 1575 1576 1577

	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
T
Tom Lane 已提交
1578 1579 1580
		if (strlen(xlde->d_name) == 16 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 16 &&
			strcmp(xlde->d_name, lastoff) <= 0)
V
Vadim B. Mikheev 已提交
1581
		{
1582
			sprintf(path, "%s/%s", XLogDir, xlde->d_name);
1583
			if (XLOG_archive_dir[0])
1584 1585 1586 1587 1588
			{
				elog(LOG, "archiving transaction log file %s",
					 xlde->d_name);
				elog(NOTICE, "archiving log files is not implemented!");
			}
1589
			else
1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611
			{
				/*
				 * Before deleting the file, see if it can be recycled as
				 * a future log segment.  We allow recycling segments up to
				 * XLOGfiles + XLOGfileslop segments beyond the current
				 * XLOG location.
				 */
				if (InstallXLogFileSegment(endlogId, endlogSeg, path,
										   true, XLOGfiles + XLOGfileslop,
										   true))
				{
					elog(LOG, "recycled transaction log file %s",
						 xlde->d_name);
				}
				else
				{
					/* No need for any more future segments... */
					elog(LOG, "removing transaction log file %s",
						 xlde->d_name);
					unlink(path);
				}
			}
V
Vadim B. Mikheev 已提交
1612 1613 1614 1615
		}
		errno = 0;
	}
	if (errno)
1616 1617
		elog(STOP, "could not read transaction log directory (%s): %m",
			 XLogDir);
V
Vadim B. Mikheev 已提交
1618 1619 1620
	closedir(xldir);
}

T
Tom Lane 已提交
1621 1622 1623 1624 1625
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
 */
1626 1627 1628 1629 1630 1631 1632 1633 1634 1635
static void
RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

B
Bruce Momjian 已提交
1636
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
1637
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1638
	{
T
Tom Lane 已提交
1639
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1640 1641
			continue;

B
Bruce Momjian 已提交
1642
		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
1643 1644 1645 1646 1647 1648 1649 1650 1651 1652
		blk += sizeof(BkpBlock);

		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);

		if (reln)
		{
			buffer = XLogReadBuffer(true, reln, bkpb.block);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
B
Bruce Momjian 已提交
1653
				memcpy((char *) page, blk, BLCKSZ);
1654 1655 1656 1657 1658 1659 1660 1661 1662 1663
				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}

		blk += BLCKSZ;
	}
}

T
Tom Lane 已提交
1664 1665 1666 1667 1668 1669 1670
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
1671 1672 1673 1674 1675 1676 1677 1678 1679
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
	crc64		crc;
	crc64		cbuf;
	int			i;
	uint32		len = record->xl_len;
	char	   *blk;

T
Tom Lane 已提交
1680
	/* Check CRC of rmgr data and record header */
1681
	INIT_CRC64(crc);
T
Tom Lane 已提交
1682
	COMP_CRC64(crc, XLogRecGetData(record), len);
B
Bruce Momjian 已提交
1683
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
1684
			   SizeOfXLogRecord - sizeof(crc64));
1685 1686
	FIN_CRC64(crc);

T
Tom Lane 已提交
1687
	if (!EQ_CRC64(record->xl_crc, crc))
1688
	{
1689
		elog(emode, "ReadRecord: bad resource manager data checksum in record at %u/%u",
T
Tom Lane 已提交
1690
			 recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1691
		return (false);
1692 1693
	}

T
Tom Lane 已提交
1694
	/* Check CRCs of backup blocks, if any */
B
Bruce Momjian 已提交
1695
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
1696
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
1697
	{
T
Tom Lane 已提交
1698
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
1699 1700 1701
			continue;

		INIT_CRC64(crc);
T
Tom Lane 已提交
1702 1703 1704
		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
		COMP_CRC64(crc, blk + sizeof(crc64),
				   sizeof(BkpBlock) - sizeof(crc64));
1705
		FIN_CRC64(crc);
B
Bruce Momjian 已提交
1706 1707
		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
														 * alignment */
1708

T
Tom Lane 已提交
1709
		if (!EQ_CRC64(cbuf, crc))
1710
		{
1711
			elog(emode, "ReadRecord: bad checksum of backup block %d in record at %u/%u",
T
Tom Lane 已提交
1712
				 i + 1, recptr.xlogid, recptr.xrecoff);
B
Bruce Momjian 已提交
1713
			return (false);
1714
		}
T
Tom Lane 已提交
1715
		blk += sizeof(BkpBlock) + BLCKSZ;
1716 1717
	}

B
Bruce Momjian 已提交
1718
	return (true);
1719 1720
}

T
Tom Lane 已提交
1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is STOP.
 * (emode must be either STOP or LOG.)
 *
 * buffer is a workspace at least _INTL_MAXLOGRECSZ bytes long.  It is needed
 * to reassemble a record that crosses block boundaries.  Note that on
 * successful return, the returned record pointer always points at buffer.
 */
1734
static XLogRecord *
T
Tom Lane 已提交
1735
ReadRecord(XLogRecPtr *RecPtr, int emode, char *buffer)
1736
{
1737 1738
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
T
Tom Lane 已提交
1739 1740 1741 1742
	uint32		len,
				total_len;
	uint32		targetPageOff;
	unsigned	i;
1743
	bool		nextmode = false;
T
Tom Lane 已提交
1744 1745 1746

	if (readBuf == NULL)
	{
B
Bruce Momjian 已提交
1747

T
Tom Lane 已提交
1748 1749 1750
		/*
		 * First time through, permanently allocate readBuf.  We do it
		 * this way, rather than just making a static array, for two
B
Bruce Momjian 已提交
1751 1752 1753 1754
		 * reasons: (1) no need to waste the storage in most
		 * instantiations of the backend; (2) a static char array isn't
		 * guaranteed to have any particular alignment, whereas malloc()
		 * will provide MAXALIGN'd storage.
T
Tom Lane 已提交
1755 1756 1757 1758
		 */
		readBuf = (char *) malloc(BLCKSZ);
		Assert(readBuf != NULL);
	}
1759

T
Tom Lane 已提交
1760
	if (RecPtr == NULL)
1761
	{
1762
		RecPtr = &tmpRecPtr;
1763
		nextmode = true;
T
Tom Lane 已提交
1764
		/* fast case if next record is on same page */
1765 1766 1767 1768 1769
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
T
Tom Lane 已提交
1770
		/* align old recptr to next page */
1771 1772 1773 1774 1775 1776 1777 1778
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
1779
	}
1780
	else if (!XRecOffIsValid(RecPtr->xrecoff))
1781
		elog(STOP, "ReadRecord: invalid record offset at (%u, %u)",
1782
			 RecPtr->xlogid, RecPtr->xrecoff);
1783

T
Tom Lane 已提交
1784
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
1785
	{
1786 1787
		close(readFile);
		readFile = -1;
1788
	}
T
Tom Lane 已提交
1789
	XLByteToSeg(*RecPtr, readId, readSeg);
1790
	if (readFile < 0)
1791
	{
T
Tom Lane 已提交
1792
		readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1793 1794
		if (readFile < 0)
			goto next_record_is_invalid;
B
Bruce Momjian 已提交
1795
		readOff = (uint32) (-1);/* force read to occur below */
1796 1797
	}

T
Tom Lane 已提交
1798 1799
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / BLCKSZ) * BLCKSZ;
	if (readOff != targetPageOff)
1800
	{
T
Tom Lane 已提交
1801 1802 1803
		readOff = targetPageOff;
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
		{
1804
			elog(emode, "ReadRecord: lseek of log file %u, segment %u, offset %u failed: %m",
1805
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1806 1807
			goto next_record_is_invalid;
		}
1808
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1809
		{
1810
			elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1811
				 readId, readSeg, readOff);
T
Tom Lane 已提交
1812 1813
			goto next_record_is_invalid;
		}
1814
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, nextmode))
1815 1816
			goto next_record_is_invalid;
	}
T
Tom Lane 已提交
1817
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
1818 1819
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
T
Tom Lane 已提交
1820
		elog(emode, "ReadRecord: contrecord is requested by (%u, %u)",
1821
			 RecPtr->xlogid, RecPtr->xrecoff);
1822 1823
		goto next_record_is_invalid;
	}
1824
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1825 1826

got_record:;
B
Bruce Momjian 已提交
1827

T
Tom Lane 已提交
1828
	/*
B
Bruce Momjian 已提交
1829 1830
	 * Currently, xl_len == 0 must be bad data, but that might not be true
	 * forever.  See note in XLogInsert.
T
Tom Lane 已提交
1831
	 */
1832 1833
	if (record->xl_len == 0)
	{
1834
		elog(emode, "ReadRecord: record with zero length at (%u, %u)",
T
Tom Lane 已提交
1835
			 RecPtr->xlogid, RecPtr->xrecoff);
1836 1837
		goto next_record_is_invalid;
	}
B
Bruce Momjian 已提交
1838

T
Tom Lane 已提交
1839
	/*
B
Bruce Momjian 已提交
1840 1841
	 * Compute total length of record including any appended backup
	 * blocks.
T
Tom Lane 已提交
1842 1843 1844 1845 1846 1847 1848 1849
	 */
	total_len = SizeOfXLogRecord + record->xl_len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
			continue;
		total_len += sizeof(BkpBlock) + BLCKSZ;
	}
B
Bruce Momjian 已提交
1850

T
Tom Lane 已提交
1851 1852 1853 1854 1855 1856
	/*
	 * Make sure it will fit in buffer (currently, it is mechanically
	 * impossible for this test to fail, but it seems like a good idea
	 * anyway).
	 */
	if (total_len > _INTL_MAXLOGRECSZ)
1857
	{
1858
		elog(emode, "ReadRecord: record length %u at (%u, %u) too long",
T
Tom Lane 已提交
1859
			 total_len, RecPtr->xlogid, RecPtr->xrecoff);
1860 1861 1862 1863
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
T
Tom Lane 已提交
1864
		elog(emode, "ReadRecord: invalid resource manager id %u at (%u, %u)",
1865
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1866 1867 1868
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
T
Tom Lane 已提交
1869 1870
	len = BLCKSZ - RecPtr->xrecoff % BLCKSZ;
	if (total_len > len)
1871
	{
T
Tom Lane 已提交
1872 1873
		/* Need to reassemble record */
		XLogContRecord *contrecord;
B
Bruce Momjian 已提交
1874
		uint32		gotlen = len;
1875

T
Tom Lane 已提交
1876
		memcpy(buffer, record, len);
1877
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
1878
		buffer += len;
1879
		for (;;)
1880
		{
T
Tom Lane 已提交
1881 1882
			readOff += BLCKSZ;
			if (readOff >= XLogSegSize)
1883 1884
			{
				close(readFile);
T
Tom Lane 已提交
1885 1886 1887
				readFile = -1;
				NextLogSeg(readId, readSeg);
				readFile = XLogFileOpen(readId, readSeg, (emode == LOG));
1888 1889
				if (readFile < 0)
					goto next_record_is_invalid;
T
Tom Lane 已提交
1890
				readOff = 0;
1891 1892
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
T
Tom Lane 已提交
1893
			{
1894
				elog(emode, "ReadRecord: read of log file %u, segment %u, offset %u failed: %m",
1895
					 readId, readSeg, readOff);
T
Tom Lane 已提交
1896 1897
				goto next_record_is_invalid;
			}
1898
			if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1899
				goto next_record_is_invalid;
T
Tom Lane 已提交
1900
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
1901
			{
1902
				elog(emode, "ReadRecord: there is no ContRecord flag in log file %u, segment %u, offset %u",
1903
					 readId, readSeg, readOff);
1904 1905
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1906
			contrecord = (XLogContRecord *) ((char *) readBuf + SizeOfXLogPHD);
B
Bruce Momjian 已提交
1907
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
1908
				total_len != (contrecord->xl_rem_len + gotlen))
1909
			{
1910
				elog(emode, "ReadRecord: invalid ContRecord length %u in log file %u, segment %u, offset %u",
T
Tom Lane 已提交
1911
					 contrecord->xl_rem_len, readId, readSeg, readOff);
1912 1913
				goto next_record_is_invalid;
			}
T
Tom Lane 已提交
1914 1915
			len = BLCKSZ - SizeOfXLogPHD - SizeOfXLogContRecord;
			if (contrecord->xl_rem_len > len)
1916
			{
B
Bruce Momjian 已提交
1917
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
		if (BLCKSZ - SizeOfXLogRecord >= SizeOfXLogPHD +
			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
		{
B
Bruce Momjian 已提交
1931
			nextRecord = (XLogRecord *) ((char *) contrecord +
T
Tom Lane 已提交
1932 1933 1934 1935
				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
		}
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
B
Bruce Momjian 已提交
1936
			SizeOfXLogPHD + SizeOfXLogContRecord +
T
Tom Lane 已提交
1937 1938 1939
			MAXALIGN(contrecord->xl_rem_len);
		ReadRecPtr = *RecPtr;
		return record;
1940 1941
	}

T
Tom Lane 已提交
1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	if (BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % BLCKSZ +
		MAXALIGN(total_len))
		nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
	return (XLogRecord *) buffer;
1953

T
Tom Lane 已提交
1954 1955 1956 1957 1958
next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	return NULL;
1959 1960
}

1961 1962 1963 1964
/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
 * ReadRecord.  It's not intended for use from anywhere else.
 *
 * hdr is the page header to inspect; the page is assumed to have been read
 * from the location described by the file-level readId/readSeg/readOff.
 * emode is the elog level used to report a problem.  checkSUI says whether
 * the startup-id sequence check may be applied (see below).
 *
 * Returns true if the header passes all checks, in which case lastReadSUI
 * is updated to this page's SUI; returns false otherwise.
 */
static bool
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool checkSUI)
{
	XLogRecPtr	recaddr;

	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
		elog(emode, "ReadRecord: invalid magic number %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_magic, readId, readSeg, readOff);
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
		elog(emode, "ReadRecord: invalid info bits %04X in log file %u, segment %u, offset %u",
			 hdr->xlp_info, readId, readSeg, readOff);
		return false;
	}

	/* The page must claim to live at the address we read it from */
	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
		elog(emode, "ReadRecord: unexpected pageaddr (%u, %u) in log file %u, segment %u, offset %u",
			 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
			 readId, readSeg, readOff);
		return false;
	}

	/*
	 * We disbelieve a SUI less than the previous page's SUI, or more than
	 * a few counts greater.  In theory as many as 512 shutdown checkpoint
	 * records could appear on a 32K-sized xlog page, so that's the most
	 * differential there could legitimately be.
	 *
	 * Note this check can only be applied when we are reading the next page
	 * in sequence, so ReadRecord passes a flag indicating whether to
	 * check.
	 */
	if (checkSUI)
	{
		if (hdr->xlp_sui < lastReadSUI ||
			hdr->xlp_sui > lastReadSUI + 512)
		{
			/* translator: SUI = startup id */
			elog(emode, "ReadRecord: out-of-sequence SUI %u (after %u) in log file %u, segment %u, offset %u",
				 hdr->xlp_sui, lastReadSUI, readId, readSeg, readOff);
			return false;
		}
	}
	lastReadSUI = hdr->xlp_sui;
	return true;
}

2019 2020 2021 2022
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */

void
XLOGPathInit(void)
{
	/* Init XLOG file paths */
2038 2039
	snprintf(XLogDir, MAXPGPATH, "%s/pg_xlog", DataDir);
	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
2040 2041 2042 2043 2044 2045
}

static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
2046 2047
	char		buffer[BLCKSZ]; /* need not be aligned */

2048 2049
#ifdef USE_LOCALE
	char	   *localeptr;
B
Bruce Momjian 已提交
2050

2051 2052 2053
#endif

	/*
T
Tom Lane 已提交
2054
	 * Initialize version and compatibility-check fields
2055
	 */
T
Tom Lane 已提交
2056 2057
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
2058 2059 2060 2061 2062
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
#ifdef USE_LOCALE
	localeptr = setlocale(LC_COLLATE, NULL);
	if (!localeptr)
2063
		elog(STOP, "invalid LC_COLLATE setting");
2064 2065 2066
	StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
	localeptr = setlocale(LC_CTYPE, NULL);
	if (!localeptr)
2067
		elog(STOP, "invalid LC_CTYPE setting");
2068
	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
B
Bruce Momjian 已提交
2069

2070 2071
	/*
	 * Issue warning notice if initdb'ing in a locale that will not permit
B
Bruce Momjian 已提交
2072 2073
	 * LIKE index optimization.  This is not a clean place to do it, but I
	 * don't see a better place either...
2074 2075 2076 2077 2078
	 */
	if (!locale_is_like_safe())
		elog(NOTICE, "Initializing database with %s collation order."
			 "\n\tThis locale setting will prevent use of index optimization for"
			 "\n\tLIKE and regexp searches.  If you are concerned about speed of"
B
Bruce Momjian 已提交
2079
		  "\n\tsuch queries, you may wish to set LC_COLLATE to \"C\" and"
2080 2081
			 "\n\tre-initdb.  For more information see the Administrator's Guide.",
			 ControlFile->lc_collate);
2082
#else /* not USE_LOCALE */
2083 2084
	strcpy(ControlFile->lc_collate, "C");
	strcpy(ControlFile->lc_ctype, "C");
2085
#endif /* not USE_LOCALE */
2086

T
Tom Lane 已提交
2087 2088
	/* Contents are protected with a CRC */
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2089 2090
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2091 2092 2093
			   sizeof(ControlFileData) - sizeof(crc64));
	FIN_CRC64(ControlFile->crc);

2094
	/*
B
Bruce Momjian 已提交
2095 2096 2097 2098 2099
	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
	 * over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail
	 * when we check the contents of the file, but hopefully with a more
	 * specific error than "couldn't read pg_control".
2100 2101
	 */
	if (sizeof(ControlFileData) > BLCKSZ)
2102
		elog(STOP, "sizeof(ControlFileData) is larger than BLCKSZ; fix either one");
2103

2104 2105 2106
	memset(buffer, 0, BLCKSZ);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

2107 2108
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
2109
	if (fd < 0)
2110
		elog(STOP, "WriteControlFile: could not create control file (%s): %m",
2111 2112
			 ControlFilePath);

2113
	errno = 0;
2114
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
2115 2116 2117 2118
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2119
		elog(STOP, "WriteControlFile: write to control file failed: %m");
2120
	}
2121

2122
	if (pg_fsync(fd) != 0)
2123
		elog(STOP, "WriteControlFile: fsync of control file failed: %m");
2124 2125 2126 2127 2128 2129 2130

	close(fd);
}

static void
ReadControlFile(void)
{
2131
	crc64		crc;
2132 2133 2134 2135 2136 2137 2138
	int			fd;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
2139
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2140 2141

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2142
		elog(STOP, "read from control file failed: %m");
2143 2144 2145

	close(fd);

T
Tom Lane 已提交
2146 2147 2148 2149 2150 2151 2152
	/*
	 * Check for expected pg_control format version.  If this is wrong,
	 * the CRC check will likely fail because we'll be checking the wrong
	 * number of bytes.  Complaining about wrong version will probably be
	 * more enlightening than complaining about wrong CRC.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
2153 2154 2155 2156
		elog(STOP,
			 "The database cluster was initialized with PG_CONTROL_VERSION %d,\n"
			 "\tbut the server was compiled with PG_CONTROL_VERSION %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2157 2158 2159
			 ControlFile->pg_control_version, PG_CONTROL_VERSION);

	/* Now check the CRC. */
2160
	INIT_CRC64(crc);
B
Bruce Momjian 已提交
2161 2162
	COMP_CRC64(crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2163
			   sizeof(ControlFileData) - sizeof(crc64));
2164 2165
	FIN_CRC64(crc);

T
Tom Lane 已提交
2166
	if (!EQ_CRC64(crc, ControlFile->crc))
2167
		elog(STOP, "invalid checksum in control file");
2168

2169
	/*
B
Bruce Momjian 已提交
2170 2171
	 * Do compatibility checking immediately.  We do this here for 2
	 * reasons:
2172
	 *
B
Bruce Momjian 已提交
2173 2174
	 * (1) if the database isn't compatible with the backend executable, we
	 * want to abort before we can possibly do any damage;
2175 2176 2177
	 *
	 * (2) this code is executed in the postmaster, so the setlocale() will
	 * propagate to forked backends, which aren't going to read this file
B
Bruce Momjian 已提交
2178
	 * for themselves.	(These locale settings are considered critical
2179 2180
	 * compatibility items because they can affect sort order of indexes.)
	 */
T
Tom Lane 已提交
2181
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
2182 2183 2184 2185
		elog(STOP,
			 "The database cluster was initialized with CATALOG_VERSION_NO %d,\n"
			 "\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n"
			 "\tIt looks like you need to initdb.",
T
Tom Lane 已提交
2186
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
2187
	if (ControlFile->blcksz != BLCKSZ)
2188 2189 2190 2191
		elog(STOP,
			 "The database cluster was initialized with BLCKSZ %d,\n"
			 "\tbut the backend was compiled with BLCKSZ %d.\n"
			 "\tIt looks like you need to initdb.",
2192 2193
			 ControlFile->blcksz, BLCKSZ);
	if (ControlFile->relseg_size != RELSEG_SIZE)
2194 2195 2196 2197
		elog(STOP,
			 "The database cluster was initialized with RELSEG_SIZE %d,\n"
			 "\tbut the backend was compiled with RELSEG_SIZE %d.\n"
			 "\tIt looks like you need to initdb.",
2198 2199 2200
			 ControlFile->relseg_size, RELSEG_SIZE);
#ifdef USE_LOCALE
	if (setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
2201 2202 2203 2204
		elog(STOP,
			 "The database cluster was initialized with LC_COLLATE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2205 2206
			 ControlFile->lc_collate);
	if (setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
2207 2208 2209 2210
		elog(STOP,
			 "The database cluster was initialized with LC_CTYPE '%s',\n"
			 "\twhich is not recognized by setlocale().\n"
			 "\tIt looks like you need to initdb.",
2211
			 ControlFile->lc_ctype);
2212
#else /* not USE_LOCALE */
2213 2214
	if (strcmp(ControlFile->lc_collate, "C") != 0 ||
		strcmp(ControlFile->lc_ctype, "C") != 0)
2215 2216 2217 2218
		elog(STOP,
			 "The database cluster was initialized with LC_COLLATE '%s' and\n"
			 "\tLC_CTYPE '%s', but the server was compiled without locale support.\n"
			 "\tIt looks like you need to initdb or recompile.",
2219
			 ControlFile->lc_collate, ControlFile->lc_ctype);
2220
#endif /* not USE_LOCALE */
2221 2222
}

2223
void
2224
UpdateControlFile(void)
2225
{
2226
	int			fd;
2227

2228
	INIT_CRC64(ControlFile->crc);
B
Bruce Momjian 已提交
2229 2230
	COMP_CRC64(ControlFile->crc,
			   (char *) ControlFile + sizeof(crc64),
T
Tom Lane 已提交
2231
			   sizeof(ControlFileData) - sizeof(crc64));
2232 2233
	FIN_CRC64(ControlFile->crc);

2234
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
2235
	if (fd < 0)
2236
		elog(STOP, "could not open control file (%s): %m", ControlFilePath);
2237

2238
	errno = 0;
2239
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
2240 2241 2242 2243
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2244
		elog(STOP, "write to control file failed: %m");
2245
	}
2246

2247
	if (pg_fsync(fd) != 0)
2248
		elog(STOP, "fsync of control file failed: %m");
2249 2250 2251 2252

	close(fd);
}

2253
/*
T
Tom Lane 已提交
2254
 * Initialization of shared memory for XLOG
2255 2256
 */

2257
int
2258
XLOGShmemSize(void)
2259 2260 2261 2262
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

T
Tom Lane 已提交
2263 2264 2265
	return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
		+ BLCKSZ * XLOGbuffers +
		MAXALIGN(sizeof(ControlFileData));
2266 2267 2268 2269 2270
}

void
XLOGShmemInit(void)
{
2271
	bool		found;
2272

2273
	/* this must agree with space requested by XLOGShmemSize() */
2274 2275 2276
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

2277
	XLogCtl = (XLogCtlData *)
T
Tom Lane 已提交
2278 2279 2280 2281 2282
		ShmemInitStruct("XLOG Ctl",
						MAXALIGN(sizeof(XLogCtlData) +
								 sizeof(XLogRecPtr) * XLOGbuffers)
						+ BLCKSZ * XLOGbuffers,
						&found);
2283
	Assert(!found);
2284 2285 2286 2287
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &found);
	Assert(!found);

T
Tom Lane 已提交
2288
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
2289

T
Tom Lane 已提交
2290 2291 2292 2293 2294 2295 2296 2297
	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be
	 * a multiple of the alignment for same, so no extra alignment padding
	 * is needed here.
	 */
	XLogCtl->xlblocks = (XLogRecPtr *)
		(((char *) XLogCtl) + sizeof(XLogCtlData));
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
B
Bruce Momjian 已提交
2298

T
Tom Lane 已提交
2299
	/*
B
Bruce Momjian 已提交
2300 2301
	 * Here, on the other hand, we must MAXALIGN to ensure the page
	 * buffers have worst-case alignment.
T
Tom Lane 已提交
2302 2303 2304 2305 2306 2307 2308
	 */
	XLogCtl->pages =
		((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
									  sizeof(XLogRecPtr) * XLOGbuffers);
	memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);

	/*
B
Bruce Momjian 已提交
2309 2310
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will
	 * fill in additional info.)
T
Tom Lane 已提交
2311 2312 2313 2314 2315 2316 2317 2318 2319
	 */
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
	S_INIT_LOCK(&(XLogCtl->insert_lck));
	S_INIT_LOCK(&(XLogCtl->info_lck));
	S_INIT_LOCK(&(XLogCtl->logwrt_lck));
	S_INIT_LOCK(&(XLogCtl->chkp_lck));

2320 2321 2322 2323 2324 2325 2326
	/*
	 * If we are not in bootstrap mode, pg_control should already exist.
	 * Read and validate it immediately (see comments in ReadControlFile()
	 * for the reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
2327 2328 2329
}

/*
T
Tom Lane 已提交
2330 2331
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
2332 2333
 */
void
T
Tom Lane 已提交
2334
BootStrapXLOG(void)
2335
{
2336
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2337 2338
	char	   *buffer;
	XLogPageHeader page;
2339
	XLogRecord *record;
B
Bruce Momjian 已提交
2340
	bool		use_existent;
2341
	crc64		crc;
2342

T
Tom Lane 已提交
2343 2344 2345 2346
	/* Use malloc() to ensure buffer is MAXALIGNED */
	buffer = (char *) malloc(BLCKSZ);
	page = (XLogPageHeader) buffer;

2347 2348 2349
	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
T
Tom Lane 已提交
2350
	checkPoint.ThisStartUpID = 0;
2351
	checkPoint.nextXid = FirstTransactionId;
2352
	checkPoint.nextOid = BootstrapObjectIdData;
T
Tom Lane 已提交
2353
	checkPoint.time = time(NULL);
2354

2355 2356 2357 2358
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;

2359 2360 2361
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
2362
	page->xlp_sui = checkPoint.ThisStartUpID;
2363 2364
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
2365 2366 2367
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
2368 2369 2370
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
2371
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
2372
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
2373
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
2374

2375
	INIT_CRC64(crc);
T
Tom Lane 已提交
2376
	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
B
Bruce Momjian 已提交
2377
	COMP_CRC64(crc, (char *) record + sizeof(crc64),
T
Tom Lane 已提交
2378
			   SizeOfXLogRecord - sizeof(crc64));
2379 2380 2381
	FIN_CRC64(crc);
	record->xl_crc = crc;

2382 2383
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
2384

2385
	errno = 0;
T
Tom Lane 已提交
2386
	if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
2387 2388 2389 2390
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
2391
		elog(STOP, "BootStrapXLOG failed to write log file: %m");
2392
	}
2393

T
Tom Lane 已提交
2394
	if (pg_fsync(openLogFile) != 0)
2395
		elog(STOP, "BootStrapXLOG failed to fsync log file: %m");
2396

T
Tom Lane 已提交
2397 2398
	close(openLogFile);
	openLogFile = -1;
2399

2400
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
2401 2402 2403
	/* Initialize pg_control status fields */
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
2404 2405 2406
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
2407
	ControlFile->checkPointCopy = checkPoint;
2408
	/* some additional ControlFile fields are set in WriteControlFile() */
2409

2410
	WriteControlFile();
2411 2412
}

2413
/*
 * Format a time_t as "YYYY-MM-DD HH:MM:SS ZONE" in local time.
 *
 * Returns a pointer to a static buffer, so the result is overwritten by the
 * next call and must not be freed.  If the zone name makes the result too
 * long for the buffer, the zone is omitted; if the time cannot be converted
 * at all, an empty string is returned.
 */
static char *
str_time(time_t tnow)
{
	static char buf[32];
	struct tm  *tm = localtime(&tnow);

	/* localtime() can fail for unrepresentable times */
	if (tm == NULL)
	{
		buf[0] = '\0';
		return buf;
	}

	/*
	 * strftime() returns 0 and leaves the buffer contents indeterminate
	 * when the result doesn't fit, so never hand back the buffer untested.
	 */
	if (strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", tm) == 0 &&
		strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", tm) == 0)
		buf[0] = '\0';

	return buf;
}

/*
T
Tom Lane 已提交
2426
 * This must be called ONCE during postmaster or standalone-backend startup
2427 2428
 */
void
T
Tom Lane 已提交
2429
StartupXLOG(void)
2430
{
2431 2432
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
2433
	bool		wasShutdown;
2434
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
2435 2436 2437
				LastRec,
				checkPointLoc,
				EndOfLog;
2438
	XLogRecord *record;
T
Tom Lane 已提交
2439
	char	   *buffer;
2440

T
Tom Lane 已提交
2441 2442
	/* Use malloc() to ensure record buffer is MAXALIGNED */
	buffer = (char *) malloc(_INTL_MAXLOGRECSZ);
2443

T
Tom Lane 已提交
2444
	CritSectionCount++;
2445 2446

	/*
2447 2448
	 * Read control file and check XLOG status looks valid.
	 *
B
Bruce Momjian 已提交
2449 2450
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
2451
	 */
2452
	ReadControlFile();
2453

2454 2455 2456 2457
	if (ControlFile->logSeg == 0 ||
		ControlFile->time <= 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
2458
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
2459
		elog(STOP, "control file context is broken");
2460 2461

	if (ControlFile->state == DB_SHUTDOWNED)
2462
		elog(LOG, "database system was shut down at %s",
2463
			 str_time(ControlFile->time));
2464
	else if (ControlFile->state == DB_SHUTDOWNING)
2465
		elog(LOG, "database system shutdown was interrupted at %s",
2466
			 str_time(ControlFile->time));
2467
	else if (ControlFile->state == DB_IN_RECOVERY)
2468
		elog(LOG, "database system was interrupted being in recovery at %s\n"
2469
			 "\tThis propably means that some data blocks are corrupted\n"
2470
			 "\tand you will have to use the last backup for recovery.",
2471
			 str_time(ControlFile->time));
2472
	else if (ControlFile->state == DB_IN_PRODUCTION)
2473
		elog(LOG, "database system was interrupted at %s",
2474
			 str_time(ControlFile->time));
2475

T
Tom Lane 已提交
2476 2477 2478 2479
	/*
	 * Get the last valid checkpoint record.  If the latest one according
	 * to pg_control is broken, try the next-to-last one.
	 */
2480
	record = ReadCheckpointRecord(ControlFile->checkPoint, 1, buffer);
T
Tom Lane 已提交
2481 2482 2483
	if (record != NULL)
	{
		checkPointLoc = ControlFile->checkPoint;
2484
		elog(LOG, "checkpoint record is at (%u, %u)",
T
Tom Lane 已提交
2485 2486 2487 2488
			 checkPointLoc.xlogid, checkPointLoc.xrecoff);
	}
	else
	{
2489
		record = ReadCheckpointRecord(ControlFile->prevCheckPoint, 2, buffer);
T
Tom Lane 已提交
2490 2491 2492
		if (record != NULL)
		{
			checkPointLoc = ControlFile->prevCheckPoint;
2493
			elog(LOG, "using previous checkpoint record at (%u, %u)",
T
Tom Lane 已提交
2494 2495 2496 2497
				 checkPointLoc.xlogid, checkPointLoc.xrecoff);
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
2498
			elog(STOP, "unable to locate a valid checkpoint record");
T
Tom Lane 已提交
2499 2500 2501 2502
	}
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
2503

2504
	elog(LOG, "redo record is at (%u, %u); undo record is at (%u, %u); shutdown %s",
2505
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
2506
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
T
Tom Lane 已提交
2507
		 wasShutdown ? "TRUE" : "FALSE");
2508
	elog(LOG, "next transaction id: %u; next oid: %u",
2509
		 checkPoint.nextXid, checkPoint.nextOid);
2510 2511 2512 2513
	if (checkPoint.nextXid < FirstTransactionId)
		elog(STOP, "invalid next transaction id");
	if (checkPoint.nextOid < BootstrapObjectIdData)
		elog(STOP, "invalid next oid");
2514 2515 2516

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
2517
	ShmemVariableCache->oidCount = 0;
2518

V
WAL  
Vadim B. Mikheev 已提交
2519
	ThisStartUpID = checkPoint.ThisStartUpID;
B
Bruce Momjian 已提交
2520
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
2521
		XLogCtl->RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
2522

2523
	if (XLByteLT(RecPtr, checkPoint.redo))
2524
		elog(STOP, "invalid redo in checkpoint record");
2525 2526 2527
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;

B
Bruce Momjian 已提交
2528
	if (XLByteLT(checkPoint.undo, RecPtr) ||
V
Vadim B. Mikheev 已提交
2529
		XLByteLT(checkPoint.redo, RecPtr))
2530
	{
T
Tom Lane 已提交
2531
		if (wasShutdown)
2532
			elog(STOP, "invalid redo/undo record in shutdown checkpoint");
V
WAL  
Vadim B. Mikheev 已提交
2533
		InRecovery = true;
2534 2535
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
2536
		InRecovery = true;
2537

V
WAL  
Vadim B. Mikheev 已提交
2538 2539
	/* REDO */
	if (InRecovery)
2540
	{
2541
		elog(LOG, "database system was not properly shut down; "
2542
			 "automatic recovery in progress");
2543 2544 2545 2546
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

V
Vadim B. Mikheev 已提交
2547
		XLogOpenLogRelation();	/* open pg_log */
V
WAL  
Vadim B. Mikheev 已提交
2548
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
2549

2550 2551
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
T
Tom Lane 已提交
2552
			record = ReadRecord(&(checkPoint.redo), STOP, buffer);
B
Bruce Momjian 已提交
2553 2554
		else
/* read past CheckPoint record */
T
Tom Lane 已提交
2555
			record = ReadRecord(NULL, LOG, buffer);
2556

T
Tom Lane 已提交
2557
		if (record != NULL)
2558
		{
V
WAL  
Vadim B. Mikheev 已提交
2559
			InRedo = true;
2560
			elog(LOG, "redo starts at (%u, %u)",
2561
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2562 2563 2564 2565
			do
			{
				if (record->xl_xid >= ShmemVariableCache->nextXid)
					ShmemVariableCache->nextXid = record->xl_xid + 1;
V
WAL  
Vadim B. Mikheev 已提交
2566 2567
				if (XLOG_DEBUG)
				{
B
Bruce Momjian 已提交
2568
					char		buf[8192];
V
WAL  
Vadim B. Mikheev 已提交
2569

B
Bruce Momjian 已提交
2570 2571 2572
					sprintf(buf, "REDO @ %u/%u; LSN %u/%u: ",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
							EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
2573 2574
					xlog_outrec(buf, record);
					strcat(buf, " - ");
B
Bruce Momjian 已提交
2575 2576
					RmgrTable[record->xl_rmid].rm_desc(buf,
								record->xl_info, XLogRecGetData(record));
2577
					elog(DEBUG, "%s", buf);
V
WAL  
Vadim B. Mikheev 已提交
2578 2579
				}

T
Tom Lane 已提交
2580
				if (record->xl_info & XLR_BKP_BLOCK_MASK)
2581 2582
					RestoreBkpBlocks(record, EndRecPtr);

2583
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
T
Tom Lane 已提交
2584 2585
				record = ReadRecord(NULL, LOG, buffer);
			} while (record != NULL);
2586
			elog(LOG, "redo done at (%u, %u)",
2587
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2588
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
2589
			InRedo = false;
2590 2591
		}
		else
2592
			elog(LOG, "redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
2593 2594
	}

T
Tom Lane 已提交
2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605
	/*
	 * Init xlog buffer cache using the block containing the last valid
	 * record from the previous incarnation.
	 */
	record = ReadRecord(&LastRec, STOP, buffer);
	EndOfLog = EndRecPtr;
	XLByteToPrevSeg(EndOfLog, openLogId, openLogSeg);
	openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
	openLogOff = 0;
	ControlFile->logId = openLogId;
	ControlFile->logSeg = openLogSeg + 1;
V
WAL  
Vadim B. Mikheev 已提交
2606
	Insert = &XLogCtl->Insert;
2607
	Insert->PrevRecord = LastRec;
B
Bruce Momjian 已提交
2608 2609

	/*
2610 2611
	 * If the next record will go to the new page
	 * then initialize for that one.
T
Tom Lane 已提交
2612
	 */
2613 2614 2615 2616
	if ((BLCKSZ - EndOfLog.xrecoff % BLCKSZ) < SizeOfXLogRecord)
		EndOfLog.xrecoff += (BLCKSZ - EndOfLog.xrecoff % BLCKSZ);
	if (EndOfLog.xrecoff % BLCKSZ == 0)
	{
2617 2618 2619 2620
		XLogRecPtr	NewPageEndPtr;

		NewPageEndPtr = EndOfLog;
		if (NewPageEndPtr.xrecoff >= XLogFileSize)
2621
		{
2622 2623 2624
			/* crossing a logid boundary */
			NewPageEndPtr.xlogid += 1;
			NewPageEndPtr.xrecoff = BLCKSZ;
2625 2626 2627
		}
		else
		{
2628
			NewPageEndPtr.xrecoff += BLCKSZ;
2629
		}
2630
		XLogCtl->xlblocks[0] = NewPageEndPtr;
2631 2632 2633 2634 2635
		Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC;
		if (InRecovery)
			Insert->currpage->xlp_sui = ThisStartUpID;
		else
			Insert->currpage->xlp_sui = ThisStartUpID + 1;
2636 2637
		Insert->currpage->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
		Insert->currpage->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - BLCKSZ;
2638
		/* rest of buffer was zeroed in XLOGShmemInit */
2639
		Insert->currpos = (char *) Insert->currpage + SizeOfXLogPHD;
2640 2641 2642 2643 2644 2645 2646 2647
	}
	else
	{
		XLogCtl->xlblocks[0].xlogid = openLogId;
		XLogCtl->xlblocks[0].xrecoff =
			((EndOfLog.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
		/*
		 * Tricky point here: readBuf contains the *last* block that the
2648 2649
		 * LastRec record spans, not the one it starts in.  The last block
		 * is indeed the one we want to use.
2650 2651 2652 2653 2654 2655 2656 2657
		 */
		Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - BLCKSZ) % XLogSegSize);
		memcpy((char *) Insert->currpage, readBuf, BLCKSZ);
		Insert->currpos = (char *) Insert->currpage +
			(EndOfLog.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
		/* Make sure rest of page is zero */
		memset(Insert->currpos, 0, INSERT_FREESPACE(Insert));
	}
V
WAL  
Vadim B. Mikheev 已提交
2658

T
Tom Lane 已提交
2659
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
2660

T
Tom Lane 已提交
2661 2662 2663
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
2664

T
Tom Lane 已提交
2665 2666
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
2667

V
Vadim B. Mikheev 已提交
2668
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
2669 2670 2671
	/* UNDO */
	if (InRecovery)
	{
2672 2673 2674
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
2675
			elog(LOG, "undo starts at (%u, %u)",
2676
				 RecPtr.xlogid, RecPtr.xrecoff);
2677 2678
			do
			{
T
Tom Lane 已提交
2679
				record = ReadRecord(&RecPtr, STOP, buffer);
2680
				if (TransactionIdIsValid(record->xl_xid) &&
2681
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
2682
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
2683 2684
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
2685
			elog(LOG, "undo done at (%u, %u)",
2686
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
2687 2688
		}
		else
2689
			elog(LOG, "undo is not required");
2690
	}
V
WAL  
Vadim B. Mikheev 已提交
2691
#endif
2692

V
WAL  
Vadim B. Mikheev 已提交
2693
	if (InRecovery)
2694
	{
B
Bruce Momjian 已提交
2695

T
Tom Lane 已提交
2696 2697 2698 2699 2700 2701 2702
		/*
		 * In case we had to use the secondary checkpoint, make sure that
		 * it will still be shown as the secondary checkpoint after this
		 * CreateCheckPoint operation; we don't want the broken primary
		 * checkpoint to become prevCheckPoint...
		 */
		ControlFile->checkPoint = checkPointLoc;
2703
		CreateCheckPoint(true);
V
WAL  
Vadim B. Mikheev 已提交
2704
		XLogCloseRelationCache();
2705
	}
2706

T
Tom Lane 已提交
2707 2708 2709 2710
	/*
	 * Preallocate additional log files, if wanted.
	 */
	PreallocXlogFiles(EndOfLog);
2711

V
WAL  
Vadim B. Mikheev 已提交
2712
	InRecovery = false;
2713 2714 2715 2716 2717

	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
2718 2719 2720
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

2721
	elog(LOG, "database system is ready");
2722
	CritSectionCount--;
2723

T
Tom Lane 已提交
2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}

	free(buffer);
}

2739 2740 2741 2742
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 * whichChkpt = 1 for "primary", 2 for "secondary", merely informative
 */
T
Tom Lane 已提交
2743 2744
static XLogRecord *
ReadCheckpointRecord(XLogRecPtr RecPtr,
2745
					 int whichChkpt,
T
Tom Lane 已提交
2746 2747 2748 2749 2750 2751
					 char *buffer)
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
2752 2753 2754
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint link in control file" :
				   "invalid secondary checkpoint link in control file"));
T
Tom Lane 已提交
2755 2756 2757 2758 2759 2760 2761
		return NULL;
	}

	record = ReadRecord(&RecPtr, LOG, buffer);

	if (record == NULL)
	{
2762 2763 2764
		elog(LOG, (whichChkpt == 1 ?
				   "invalid primary checkpoint record" :
				   "invalid secondary checkpoint record"));
T
Tom Lane 已提交
2765 2766 2767 2768
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
2769 2770 2771
		elog(LOG, (whichChkpt == 1 ?
				   "invalid resource manager id in primary checkpoint record" :
				   "invalid resource manager id in secondary checkpoint record"));
T
Tom Lane 已提交
2772 2773 2774 2775 2776
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
2777 2778 2779
		elog(LOG, (whichChkpt == 1 ?
				   "invalid xl_info in primary checkpoint record" :
				   "invalid xl_info in secondary checkpoint record"));
T
Tom Lane 已提交
2780 2781 2782 2783
		return NULL;
	}
	if (record->xl_len != sizeof(CheckPoint))
	{
2784 2785 2786
		elog(LOG, (whichChkpt == 1 ?
				   "invalid length of primary checkpoint record" :
				   "invalid length of secondary checkpoint record"));
T
Tom Lane 已提交
2787 2788 2789
		return NULL;
	}
	return record;
2790 2791
}

V
WAL  
Vadim B. Mikheev 已提交
2792
/*
T
Tom Lane 已提交
2793
 * Postmaster uses this to initialize ThisStartUpID & RedoRecPtr from
2794
 * XLogCtlData located in shmem after successful startup.
V
WAL  
Vadim B. Mikheev 已提交
2795 2796 2797 2798 2799
 */
void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
2800 2801 2802 2803
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

/*
 * The checkpoint process started by the postmaster saves a copy of the new
 * RedoRecPtr in shmem (using SetRedoRecPtr).  When the checkpointer
 * completes, the postmaster calls GetRedoRecPtr to update its own copy of
 * RedoRecPtr, so that subsequently-spawned backends will start out with a
 * reasonably up-to-date local RedoRecPtr.  Since these operations are not
 * protected by any spinlock and copying an XLogRecPtr isn't atomic, it's
 * unsafe to use either of these routines at other times!
 *
 * Note: once spawned, a backend must update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr while holding the insert spinlock.  This is
 * done in XLogInsert().
 */
/* Store this process's RedoRecPtr into the shared XLogCtl structure. */
void
SetRedoRecPtr(void)
{
	/* plain struct assignment; no lock is taken here */
	XLogCtl->RedoRecPtr = RedoRecPtr;
}

/* Refresh this process's RedoRecPtr from the shared XLogCtl structure. */
void
GetRedoRecPtr(void)
{
	/* plain struct assignment; no lock is taken here */
	RedoRecPtr = XLogCtl->RedoRecPtr;
}

2828
/*
T
Tom Lane 已提交
2829
 * This must be called ONCE during postmaster or standalone-backend shutdown
2830 2831
 */
void
T
Tom Lane 已提交
2832
ShutdownXLOG(void)
2833
{
2834
	elog(LOG, "shutting down");
2835

T
Tom Lane 已提交
2836 2837 2838
	/* suppress in-transaction check in CreateCheckPoint */
	MyLastRecPtr.xrecoff = 0;

2839
	CritSectionCount++;
V
Vadim B. Mikheev 已提交
2840
	CreateDummyCaches();
2841
	CreateCheckPoint(true);
2842
	CritSectionCount--;
2843

2844
	elog(LOG, "database system is shut down");
2845 2846
}

T
Tom Lane 已提交
2847 2848 2849
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 */
2850 2851 2852
void
CreateCheckPoint(bool shutdown)
{
2853 2854 2855
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
2856
	XLogRecData rdata;
2857
	uint32		freespace;
V
Vadim B. Mikheev 已提交
2858 2859
	uint32		_logId;
	uint32		_logSeg;
2860
	unsigned	spins = 0;
V
Vadim B. Mikheev 已提交
2861 2862 2863

	if (MyLastRecPtr.xrecoff != 0)
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
B
Bruce Momjian 已提交
2864

2865
	START_CRIT_SECTION();
2866 2867

	/* Grab lock, using larger than normal sleep between tries (1 sec) */
V
Vadim B. Mikheev 已提交
2868 2869
	while (TAS(&(XLogCtl->chkp_lck)))
	{
2870 2871
		S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++,
							  CHECKPOINT_LOCK_TIMEOUT, 1000000);
V
Vadim B. Mikheev 已提交
2872
	}
2873 2874 2875 2876 2877 2878 2879

	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
T
Tom Lane 已提交
2880 2881

	memset(&checkPoint, 0, sizeof(checkPoint));
V
WAL  
Vadim B. Mikheev 已提交
2882
	checkPoint.ThisStartUpID = ThisStartUpID;
T
Tom Lane 已提交
2883
	checkPoint.time = time(NULL);
2884

2885
	S_LOCK(&(XLogCtl->insert_lck));
T
Tom Lane 已提交
2886 2887 2888 2889

	/*
	 * If this isn't a shutdown, and we have not inserted any XLOG records
	 * since the start of the last checkpoint, skip the checkpoint.  The
B
Bruce Momjian 已提交
2890 2891 2892 2893 2894 2895
	 * idea here is to avoid inserting duplicate checkpoints when the
	 * system is idle.	That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the
	 * previous checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
2896 2897
	 *
	 * We have to make two tests to determine that nothing has happened since
B
Bruce Momjian 已提交
2898 2899 2900
	 * the start of the last checkpoint: current insertion point must
	 * match the end of the last checkpoint record, and its redo pointer
	 * must point to itself.
T
Tom Lane 已提交
2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927
	 */
	if (!shutdown)
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
			S_UNLOCK(&(XLogCtl->insert_lck));
			S_UNLOCK(&(XLogCtl->chkp_lck));
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're
	 * off doing the buffer flush work.  Those XLOG records are logically
B
Bruce Momjian 已提交
2928
	 * after the checkpoint, even though physically before it.	Got that?
T
Tom Lane 已提交
2929 2930
	 */
	freespace = INSERT_FREESPACE(Insert);
2931 2932
	if (freespace < SizeOfXLogRecord)
	{
T
Tom Lane 已提交
2933 2934
		(void) AdvanceXLInsertBuffer();
		/* OK to ignore update return flag, since we will do flush anyway */
2935 2936
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
T
Tom Lane 已提交
2937
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
2938

T
Tom Lane 已提交
2939 2940 2941 2942
	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls;
	 * this must be done while holding the insert lock.
	 */
2943
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
B
Bruce Momjian 已提交
2944

T
Tom Lane 已提交
2945
	/*
B
Bruce Momjian 已提交
2946 2947 2948 2949
	 * Get UNDO record ptr - this is oldest of PROC->logRec values. We do
	 * this while holding insert lock to ensure that we won't miss any
	 * about-to-commit transactions (UNDO must include all xacts that have
	 * commits after REDO point).
T
Tom Lane 已提交
2950 2951 2952 2953
	 */
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
2954
		elog(STOP, "active transaction while database system is shutting down");
T
Tom Lane 已提交
2955 2956 2957 2958 2959

	/*
	 * Now we can release insert lock, allowing other xacts to proceed
	 * even while we are flushing disk buffers.
	 */
2960 2961 2962 2963 2964
	S_UNLOCK(&(XLogCtl->insert_lck));

	SpinAcquire(XidGenLockId);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	SpinRelease(XidGenLockId);
T
Tom Lane 已提交
2965

2966 2967
	SpinAcquire(OidGenLockId);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
2968 2969
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
2970 2971
	SpinRelease(OidGenLockId);

T
Tom Lane 已提交
2972
	/*
B
Bruce Momjian 已提交
2973 2974
	 * Having constructed the checkpoint record, ensure all shmem disk
	 * buffers are flushed to disk.
T
Tom Lane 已提交
2975
	 */
V
Vadim B. Mikheev 已提交
2976
	FlushBufferPool();
2977

T
Tom Lane 已提交
2978 2979 2980
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
2981
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
2982
	rdata.data = (char *) (&checkPoint);
2983 2984 2985
	rdata.len = sizeof(checkPoint);
	rdata.next = NULL;

T
Tom Lane 已提交
2986 2987 2988 2989 2990 2991
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
2992

T
Tom Lane 已提交
2993 2994 2995 2996 2997
	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record,
	 * recptr = end of actual checkpoint record.
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
2998
		elog(STOP, "concurrent transaction log activity while database system is shutting down");
2999

T
Tom Lane 已提交
3000
	/*
3001 3002 3003 3004 3005 3006 3007 3008
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
	 *
	 * With UNDO support: oldest item is redo or undo, whichever is older;
	 * but watch out for case that undo = 0.
	 *
	 * Without UNDO support: just use the redo pointer.  This allows xlog
	 * space to be freed much faster when there are long-running transactions.
T
Tom Lane 已提交
3009
	 */
3010
#ifdef NOT_USED
B
Bruce Momjian 已提交
3011
	if (ControlFile->checkPointCopy.undo.xrecoff != 0 &&
T
Tom Lane 已提交
3012 3013 3014 3015
		XLByteLT(ControlFile->checkPointCopy.undo,
				 ControlFile->checkPointCopy.redo))
		XLByteToSeg(ControlFile->checkPointCopy.undo, _logId, _logSeg);
	else
3016
#endif
T
Tom Lane 已提交
3017
		XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
3018

T
Tom Lane 已提交
3019 3020 3021
	/*
	 * Update the control file.
	 */
3022 3023 3024
	SpinAcquire(ControlFileLockId);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
3025 3026 3027
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
3028 3029 3030 3031
	ControlFile->time = time(NULL);
	UpdateControlFile();
	SpinRelease(ControlFileLockId);

V
Vadim B. Mikheev 已提交
3032
	/*
T
Tom Lane 已提交
3033 3034
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
V
Vadim B. Mikheev 已提交
3035 3036 3037
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
3038
		PrevLogSeg(_logId, _logSeg);
3039
		MoveOfflineLogs(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
3040 3041
	}

T
Tom Lane 已提交
3042 3043 3044 3045 3046 3047 3048 3049
	/*
	 * Make more log segments if needed.  (Do this after deleting offline
	 * log segments, to avoid having peak disk space usage higher than
	 * necessary.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

V
Vadim B. Mikheev 已提交
3050 3051
	S_UNLOCK(&(XLogCtl->chkp_lck));

3052
	END_CRIT_SECTION();
3053
}
V
WAL  
Vadim B. Mikheev 已提交
3054

T
Tom Lane 已提交
3055 3056 3057
/*
 * Write a NEXTOID log record
 */
3058 3059 3060
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
3061
	XLogRecData rdata;
3062

3063
	rdata.buffer = InvalidBuffer;
B
Bruce Momjian 已提交
3064
	rdata.data = (char *) (&nextOid);
3065 3066 3067 3068
	rdata.len = sizeof(Oid);
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
}
V
WAL  
Vadim B. Mikheev 已提交
3069

T
Tom Lane 已提交
3070 3071 3072
/*
 * XLOG resource manager's routines
 */
V
WAL  
Vadim B. Mikheev 已提交
3073 3074 3075
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
B
Bruce Momjian 已提交
3076
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
3077

3078
	if (info == XLOG_NEXTOID)
3079
	{
B
Bruce Momjian 已提交
3080
		Oid			nextOid;
3081 3082 3083

		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < nextOid)
T
Tom Lane 已提交
3084
		{
3085
			ShmemVariableCache->nextOid = nextOid;
T
Tom Lane 已提交
3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103
			ShmemVariableCache->oidCount = 0;
		}
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
3104
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
T
Tom Lane 已提交
3105 3106 3107 3108 3109 3110 3111
		if (ShmemVariableCache->nextXid < checkPoint.nextXid)
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		if (ShmemVariableCache->nextOid < checkPoint.nextOid)
		{
			ShmemVariableCache->nextOid = checkPoint.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
3112
	}
V
WAL  
Vadim B. Mikheev 已提交
3113
}
B
Bruce Momjian 已提交
3114

V
WAL  
Vadim B. Mikheev 已提交
3115 3116 3117 3118
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
}
B
Bruce Momjian 已提交
3119

V
WAL  
Vadim B. Mikheev 已提交
3120
void
B
Bruce Momjian 已提交
3121
xlog_desc(char *buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
3122
{
B
Bruce Momjian 已提交
3123
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
3124

T
Tom Lane 已提交
3125 3126
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
3127
	{
B
Bruce Momjian 已提交
3128 3129
		CheckPoint *checkpoint = (CheckPoint *) rec;

V
WAL  
Vadim B. Mikheev 已提交
3130
		sprintf(buf + strlen(buf), "checkpoint: redo %u/%u; undo %u/%u; "
B
Bruce Momjian 已提交
3131 3132 3133 3134 3135 3136
				"sui %u; xid %u; oid %u; %s",
				checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
				checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
				checkpoint->ThisStartUpID, checkpoint->nextXid,
				checkpoint->nextOid,
			 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
3137
	}
3138 3139
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
3140
		Oid			nextOid;
3141 3142 3143 3144

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
V
WAL  
Vadim B. Mikheev 已提交
3145 3146 3147 3148 3149 3150 3151
	else
		strcat(buf, "UNKNOWN");
}

/*
 * Append a summary of an XLOG record header to buf: previous-record and
 * previous-record-of-transaction pointers, xid, the number of backup
 * blocks flagged in xl_info (only when nonzero), and the name of the
 * owning resource manager.
 *
 * buf must already hold a NUL-terminated string and be large enough for
 * the appended text; no length is passed in.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	int			bkpb;
	int			i;

	sprintf(buf + strlen(buf), "prev %u/%u; xprev %u/%u; xid %u",
			record->xl_prev.xlogid, record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid, record->xl_xact_prev.xrecoff,
			record->xl_xid);

	/* Count how many backup-block flag bits are set in xl_info */
	for (i = 0, bkpb = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (!(record->xl_info & (XLR_SET_BKP_BLOCK(i))))
			continue;
		bkpb++;
	}

	if (bkpb)
		sprintf(buf + strlen(buf), "; bkpb %d", bkpb);

	sprintf(buf + strlen(buf), ": %s",
			RmgrTable[record->xl_rmid].rm_name);
}


/*
 * GUC support routines
 */

/*
 * Report whether "method" names a sync method this build supports.
 *
 * The comparison is case-insensitive.  "fsync" is always accepted;
 * "fdatasync", "open_sync" and "open_datasync" are accepted only when the
 * corresponding HAVE_FDATASYNC / OPEN_SYNC_FLAG / OPEN_DATASYNC_FLAG macro
 * is defined.  A NULL pointer is rejected rather than passed to
 * strcasecmp().
 *
 * Returns true if the name is acceptable, false otherwise.
 */
bool
check_xlog_sync_method(const char *method)
{
	/* No string at all can never name a valid method */
	if (method == NULL)
		return false;

	if (strcasecmp(method, "fsync") == 0)
		return true;
#ifdef HAVE_FDATASYNC
	if (strcasecmp(method, "fdatasync") == 0)
		return true;
#endif
#ifdef OPEN_SYNC_FLAG
	if (strcasecmp(method, "open_sync") == 0)
		return true;
#endif
#ifdef OPEN_DATASYNC_FLAG
	if (strcasecmp(method, "open_datasync") == 0)
		return true;
#endif
	return false;
}

void
assign_xlog_sync_method(const char *method)
{
B
Bruce Momjian 已提交
3202 3203
	int			new_sync_method;
	int			new_sync_bit;
3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233

	if (strcasecmp(method, "fsync") == 0)
	{
		new_sync_method = SYNC_METHOD_FSYNC;
		new_sync_bit = 0;
	}
#ifdef HAVE_FDATASYNC
	else if (strcasecmp(method, "fdatasync") == 0)
	{
		new_sync_method = SYNC_METHOD_FDATASYNC;
		new_sync_bit = 0;
	}
#endif
#ifdef OPEN_SYNC_FLAG
	else if (strcasecmp(method, "open_sync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_SYNC_FLAG;
	}
#endif
#ifdef OPEN_DATASYNC_FLAG
	else if (strcasecmp(method, "open_datasync") == 0)
	{
		new_sync_method = SYNC_METHOD_OPEN;
		new_sync_bit = OPEN_DATASYNC_FLAG;
	}
#endif
	else
	{
		/* Can't get here unless guc.c screwed up */
3234
		elog(ERROR, "bogus wal_sync_method %s", method);
3235 3236 3237 3238 3239 3240
		new_sync_method = 0;	/* keep compiler quiet */
		new_sync_bit = 0;
	}

	if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
	{
B
Bruce Momjian 已提交
3241

3242
		/*
B
Bruce Momjian 已提交
3243 3244 3245 3246
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new
		 * flag bit) at next use.
3247 3248 3249 3250
		 */
		if (openLogFile >= 0)
		{
			if (pg_fsync(openLogFile) != 0)
3251
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3252 3253 3254 3255
					 openLogId, openLogSeg);
			if (open_sync_bit != new_sync_bit)
			{
				if (close(openLogFile) != 0)
3256
					elog(STOP, "close of log file %u, segment %u failed: %m",
3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274
						 openLogId, openLogSeg);
				openLogFile = -1;
			}
		}
		sync_method = new_sync_method;
		open_sync_bit = new_sync_bit;
	}
}


/*
 * Issue appropriate kind of fsync (if any) on the current XLOG output file
 */
static void
issue_xlog_fsync(void)
{
	switch (sync_method)
	{
B
Bruce Momjian 已提交
3275
			case SYNC_METHOD_FSYNC:
3276
			if (pg_fsync(openLogFile) != 0)
3277
				elog(STOP, "fsync of log file %u, segment %u failed: %m",
3278 3279 3280 3281 3282
					 openLogId, openLogSeg);
			break;
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(openLogFile) != 0)
3283
				elog(STOP, "fdatasync of log file %u, segment %u failed: %m",
3284 3285 3286 3287 3288 3289 3290
					 openLogId, openLogSeg);
			break;
#endif
		case SYNC_METHOD_OPEN:
			/* write synced it already */
			break;
		default:
3291
			elog(STOP, "bogus wal_sync_method %d", sync_method);
3292 3293 3294
			break;
	}
}