/*-------------------------------------------------------------------------
 *
 * xlog.c
 *
 *
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.24 2000/11/05 22:50:19 vadim Exp $
 *
 *-------------------------------------------------------------------------
 */

14 15 16 17
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/stat.h>
V
Vadim B. Mikheev 已提交
18
#include <sys/time.h>
V
Vadim B. Mikheev 已提交
19 20
#include <sys/types.h>
#include <dirent.h>
21 22

#include "postgres.h"
23

24
#include "access/xact.h"
25
#include "catalog/catversion.h"
26 27 28 29
#include "storage/sinval.h"
#include "storage/proc.h"
#include "storage/spin.h"
#include "storage/s_lock.h"
V
Vadim B. Mikheev 已提交
30 31
#include "access/xlog.h"
#include "access/xlogutils.h"
32

V
WAL  
Vadim B. Mikheev 已提交
33 34
#include "miscadmin.h"

35 36
void		UpdateControlFile(void);
int			XLOGShmemSize(void);
37
void		XLOGShmemInit(void);
38 39
void		BootStrapXLOG(void);
void		StartupXLOG(void);
40
void		ShutdownXLOG(void);
41 42
void		CreateCheckPoint(bool shutdown);

43 44
char		XLogDir[MAXPGPATH];
char		ControlFilePath[MAXPGPATH];
45 46 47
uint32		XLOGbuffers = 0;
XLogRecPtr	MyLastRecPtr = {0, 0};
bool		StopIfError = false;
48
bool		InRecovery = false;
V
WAL  
Vadim B. Mikheev 已提交
49 50 51
StartUpID	ThisStartUpID = 0;

int			XLOG_DEBUG = 1;
52

V
Vadim B. Mikheev 已提交
53
/* To read/update control file and create new log file */
54
SPINLOCK	ControlFileLockId;
V
Vadim B. Mikheev 已提交
55 56

/* To generate new xid */
57 58
SPINLOCK	XidGenLockId;

59
extern VariableCache ShmemVariableCache;
60

61
#define MinXLOGbuffers	4
62 63 64

typedef struct XLgwrRqst
{
65 66
	XLogRecPtr	Write;			/* byte (1-based) to write out */
	XLogRecPtr	Flush;			/* byte (1-based) to flush */
67 68 69 70
} XLgwrRqst;

typedef struct XLgwrResult
{
71 72
	XLogRecPtr	Write;			/* bytes written out */
	XLogRecPtr	Flush;			/* bytes flushed */
73 74 75 76
} XLgwrResult;

typedef struct XLogCtlInsert
{
77 78 79 80 81
	XLgwrResult LgwrResult;
	XLogRecPtr	PrevRecord;
	uint16		curridx;		/* current block index in cache */
	XLogPageHeader currpage;
	char	   *currpos;
82 83 84 85
} XLogCtlInsert;

typedef struct XLogCtlWrite
{
86 87
	XLgwrResult LgwrResult;
	uint16		curridx;		/* index of next block to write */
88 89
} XLogCtlWrite;

90 91

/*
 * Without a test-and-set primitive the lock macros used in this file
 * degenerate to no-ops: TAS() always reports the lock as acquired (0)
 * and the unlock/init macros expand to nothing.
 */
#ifndef HAS_TEST_AND_SET
#define TAS(lck)		0
#define S_UNLOCK(lck)
#define S_INIT_LOCK(lck)
#endif

97 98
typedef struct XLogCtlData
{
V
Vadim B. Mikheev 已提交
99 100 101 102 103 104 105 106 107
	XLogCtlInsert	Insert;
	XLgwrRqst		LgwrRqst;
	XLgwrResult		LgwrResult;
	XLogCtlWrite	Write;
	char		   *pages;
	XLogRecPtr	   *xlblocks;		/* 1st byte ptr-s + BLCKSZ */
	uint32			XLogCacheByte;
	uint32			XLogCacheBlck;
	StartUpID		ThisStartUpID;
108
#ifdef HAS_TEST_AND_SET
V
Vadim B. Mikheev 已提交
109 110 111 112
	slock_t			insert_lck;
	slock_t			info_lck;
	slock_t			lgwr_lck;
	slock_t			chkp_lck;		/* checkpoint lock */
113 114 115
#endif
} XLogCtlData;

116
static XLogCtlData *XLogCtl = NULL;
117 118 119

/* Database state as recorded in ControlFileData.state */
typedef enum DBState
{
	DB_STARTUP = 0,
	DB_SHUTDOWNED,
	DB_SHUTDOWNING,
	DB_IN_RECOVERY,
	DB_IN_PRODUCTION
} DBState;

typedef struct ControlFileData
{
129 130 131 132 133
	uint32		logId;			/* current log file id */
	uint32		logSeg;			/* current log file segment (1-based) */
	XLogRecPtr	checkPoint;		/* last check point record ptr */
	time_t		time;			/* time stamp of last modification */
	DBState		state;			/* */
134 135

	/*
136 137
	 * this data is used to make sure that configuration of this DB is
	 * compatible with the current backend
138
	 */
139 140 141
	uint32		blcksz;			/* block size for this DB */
	uint32		relseg_size;	/* blocks per segment of large relation */
	uint32		catalog_version_no;		/* internal version number */
V
Vadim B. Mikheev 已提交
142
	char		archdir[MAXPGPATH];		/* where to move offline log files */
143 144

	/*
145 146
	 * MORE DATA FOLLOWS AT THE END OF THIS STRUCTURE - locations of data
	 * dirs
147 148 149
	 */
} ControlFileData;

150
static ControlFileData *ControlFile = NULL;
151 152 153

typedef struct CheckPoint
{
V
WAL  
Vadim B. Mikheev 已提交
154 155 156 157 158 159 160 161 162 163
	XLogRecPtr		redo;		/* next RecPtr available when we */
								/* began to create CheckPoint */
								/* (i.e. REDO start point) */
	XLogRecPtr		undo;		/* first record of oldest in-progress */
								/* transaction when we started */
								/* (i.e. UNDO end point) */
	StartUpID		ThisStartUpID;
	TransactionId	nextXid;
	Oid				nextOid;
	bool			Shutdown;
164 165
} CheckPoint;

V
WAL  
Vadim B. Mikheev 已提交
166
/*
 * NOTE(review): presumably the `info` codes passed to XLogInsert for
 * RM_XLOG_ID records -- confirm against the callers.
 */
#define XLOG_CHECKPOINT		0x00
#define XLOG_NEXTOID		0x10
V
WAL  
Vadim B. Mikheev 已提交
168

169 170
/*
 * We break each log file in 16Mb segments.
 *
 * XLogLastSeg is the number of whole segments that fit below 0xffffffff
 * and XLogFileSize the number of bytes they cover.
 */
#define XLogSegSize		(16*1024*1024)
#define XLogLastSeg		(0xffffffff / XLogSegSize)
#define XLogFileSize	(XLogLastSeg * XLogSegSize)
175

176
/*
 * Build the path of a log segment file ("<XLogDir><SEP>LLLLLLLLSSSSSSSS",
 * log id and segment number as 8 hex digits each) into `path`, which must
 * have room for MAXPGPATH bytes.
 */
#define XLogFileName(path, log, seg)	\
			snprintf((path), MAXPGPATH, "%s%c%08X%08X",	\
					 XLogDir, SEP_CHAR, (log), (seg))

/* Same as XLogFileName, but the file name is prefixed with 'T' */
#define XLogTempFileName(path, log, seg)	\
			snprintf((path), MAXPGPATH, "%s%cT%08X%08X",	\
					 XLogDir, SEP_CHAR, (log), (seg))

184
/*
 * Step backward / forward through the circular log cache, whose highest
 * valid index is XLogCtl->XLogCacheBlck.  Note: the argument is evaluated
 * more than once.
 */
#define PrevBufIdx(curridx)		\
		(((curridx) == 0) ? XLogCtl->XLogCacheBlck : ((curridx) - 1))

#define NextBufIdx(curridx)		\
		(((curridx) == XLogCtl->XLogCacheBlck) ? 0 : ((curridx) + 1))

190
#define InitXLBuffer(curridx)	(\
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
				XLogCtl->xlblocks[curridx].xrecoff = \
				(XLogCtl->xlblocks[Insert->curridx].xrecoff == XLogFileSize) ? \
				BLCKSZ : (XLogCtl->xlblocks[Insert->curridx].xrecoff + BLCKSZ), \
				XLogCtl->xlblocks[curridx].xlogid = \
				(XLogCtl->xlblocks[Insert->curridx].xrecoff == XLogFileSize) ? \
				(XLogCtl->xlblocks[Insert->curridx].xlogid + 1) : \
				XLogCtl->xlblocks[Insert->curridx].xlogid, \
				Insert->curridx = curridx, \
				Insert->currpage = (XLogPageHeader) (XLogCtl->pages + curridx * BLCKSZ), \
				Insert->currpos = \
					((char*) Insert->currpage) + SizeOfXLogPHD, \
				Insert->currpage->xlp_magic = XLOG_PAGE_MAGIC, \
				Insert->currpage->xlp_info = 0 \
				)

206
/*
 * A record offset is valid when it lies past the page header of its block
 * and leaves room for at least a record header before the end of the block.
 * The argument is parenthesized so that expression arguments (e.g. a + b)
 * are reduced modulo BLCKSZ as a whole.
 */
#define XRecOffIsValid(xrecoff) \
		((xrecoff) % BLCKSZ >= SizeOfXLogPHD && \
		(BLCKSZ - (xrecoff) % BLCKSZ) >= SizeOfXLogRecord)

210 211
static void GetFreeXLBuffer(void);
static void XLogWrite(char *buffer);
V
Vadim B. Mikheev 已提交
212
static int	XLogFileInit(uint32 log, uint32 seg, bool *usexistent);
213 214 215
static int	XLogFileOpen(uint32 log, uint32 seg, bool econt);
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, char *buffer);
static char *str_time(time_t tnow);
V
WAL  
Vadim B. Mikheev 已提交
216
static void xlog_outrec(char *buf, XLogRecord *record);
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233

static XLgwrResult LgwrResult = {{0, 0}, {0, 0}};
static XLgwrRqst LgwrRqst = {{0, 0}, {0, 0}};

static int	logFile = -1;
static uint32 logId = 0;
static uint32 logSeg = 0;
static uint32 logOff = 0;

static XLogRecPtr ReadRecPtr;
static XLogRecPtr EndRecPtr;
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
static char readBuf[BLCKSZ];
static XLogRecord *nextRecord = NULL;
234

V
WAL  
Vadim B. Mikheev 已提交
235 236
static bool InRedo = false;

237
XLogRecPtr
V
Vadim B. Mikheev 已提交
238
XLogInsert(RmgrId rmid, uint8 info, char *hdr, uint32 hdrlen, char *buf, uint32 buflen)
239
{
240 241 242 243 244 245 246 247 248
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
	XLogSubRecord *subrecord;
	XLogRecPtr	RecPtr;
	uint32		len = hdrlen + buflen,
				freespace,
				wlen;
	uint16		curridx;
	bool		updrqst = false;
249

V
Vadim B. Mikheev 已提交
250
	Assert(!(info & XLR_INFO_MASK));
251 252 253
	if (len == 0 || len > MAXLOGRECSZ)
		elog(STOP, "XLogInsert: invalid record len %u", len);

V
WAL  
Vadim B. Mikheev 已提交
254 255 256 257 258 259 260
	if (IsBootstrapProcessingMode())
	{
		RecPtr.xlogid = 0;
		RecPtr.xrecoff = SizeOfXLogPHD;	/* start of 1st checkpoint record */
		return (RecPtr);
	}

261 262 263 264 265 266
	/* obtain xlog insert lock */
	if (TAS(&(XLogCtl->insert_lck)))	/* busy */
	{
		bool		do_lgwr = true;
		unsigned	i = 0;

267
		for (;;)
268 269 270 271 272 273 274
		{
			/* try to read LgwrResult while waiting for insert lock */
			if (!TAS(&(XLogCtl->info_lck)))
			{
				LgwrRqst = XLogCtl->LgwrRqst;
				LgwrResult = XLogCtl->LgwrResult;
				S_UNLOCK(&(XLogCtl->info_lck));
275

276 277 278 279
				/*
				 * If cache is half filled then try to acquire lgwr lock
				 * and do LGWR work, but only once.
				 */
280 281 282 283
				if (do_lgwr &&
					(LgwrRqst.Write.xlogid != LgwrResult.Write.xlogid ||
					 (LgwrRqst.Write.xrecoff - LgwrResult.Write.xrecoff >=
					  XLogCtl->XLogCacheByte / 2)))
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307
				{
					if (!TAS(&(XLogCtl->lgwr_lck)))
					{
						LgwrResult = XLogCtl->Write.LgwrResult;
						if (!TAS(&(XLogCtl->info_lck)))
						{
							LgwrRqst = XLogCtl->LgwrRqst;
							S_UNLOCK(&(XLogCtl->info_lck));
						}
						if (XLByteLT(LgwrResult.Write, LgwrRqst.Write))
						{
							XLogWrite(NULL);
							do_lgwr = false;
						}
						S_UNLOCK(&(XLogCtl->lgwr_lck));
					}
				}
			}
			s_lock_sleep(i++);
			if (!TAS(&(XLogCtl->insert_lck)))
				break;
		}
	}

308
	freespace = ((char *) Insert->currpage) + BLCKSZ - Insert->currpos;
309 310 311 312 313
	if (freespace < SizeOfXLogRecord)
	{
		curridx = NextBufIdx(Insert->curridx);
		if (XLByteLE(XLogCtl->xlblocks[curridx], LgwrResult.Write))
			InitXLBuffer(curridx);
314
		else
315 316 317 318 319 320 321
			GetFreeXLBuffer();
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
	else
		curridx = Insert->curridx;

	freespace -= SizeOfXLogRecord;
322
	record = (XLogRecord *) Insert->currpos;
323
	record->xl_prev = Insert->PrevRecord;
324 325 326 327 328 329 330
	if (rmid != RM_XLOG_ID)
		record->xl_xact_prev = MyLastRecPtr;
	else
	{
		record->xl_xact_prev.xlogid = 0;
		record->xl_xact_prev.xrecoff = 0;
	}
331 332
	record->xl_xid = GetCurrentTransactionId();
	record->xl_len = (len > freespace) ? freespace : len;
V
Vadim B. Mikheev 已提交
333 334
	record->xl_info = (len > freespace) ? 
		(info | XLR_TO_BE_CONTINUED) : info;
335 336
	record->xl_rmid = rmid;
	RecPtr.xlogid = XLogCtl->xlblocks[curridx].xlogid;
337 338 339
	RecPtr.xrecoff =
		XLogCtl->xlblocks[curridx].xrecoff - BLCKSZ +
		Insert->currpos - ((char *) Insert->currpage);
340
	if (MyLastRecPtr.xrecoff == 0 && rmid != RM_XLOG_ID)
341 342 343 344 345
	{
		SpinAcquire(SInvalLock);
		MyProc->logRec = RecPtr;
		SpinRelease(SInvalLock);
	}
V
WAL  
Vadim B. Mikheev 已提交
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
	Insert->PrevRecord = RecPtr;

	if (XLOG_DEBUG)
	{
		char	buf[8192];

		sprintf(buf, "INSERT @ %u/%u: ", RecPtr.xlogid, RecPtr.xrecoff);
		xlog_outrec(buf, record);
		if (hdr != NULL)
		{
			strcat(buf, " - ");
			RmgrTable[record->xl_rmid].rm_desc(buf, record->xl_info, hdr);
		}
		strcat(buf, "\n");
		write(2, buf, strlen(buf));
	}

V
Vadim B. Mikheev 已提交
363
	MyLastRecPtr = RecPtr;	/* begin of record */
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
	Insert->currpos += SizeOfXLogRecord;
	if (freespace > 0)
	{
		wlen = (hdrlen > freespace) ? freespace : hdrlen;
		memcpy(Insert->currpos, hdr, wlen);
		freespace -= wlen;
		hdrlen -= wlen;
		hdr += wlen;
		Insert->currpos += wlen;
		if (buflen > 0 && freespace > 0)
		{
			wlen = (buflen > freespace) ? freespace : buflen;
			memcpy(Insert->currpos, buf, wlen);
			freespace -= wlen;
			buflen -= wlen;
			buf += wlen;
			Insert->currpos += wlen;
		}
382
		Insert->currpos = ((char *) Insert->currpage) +
V
WAL  
Vadim B. Mikheev 已提交
383
			MAXALIGN(Insert->currpos - ((char *) Insert->currpage));
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
		len = hdrlen + buflen;
	}

	if (len != 0)
	{
nbuf:
		curridx = NextBufIdx(curridx);
		if (XLByteLE(XLogCtl->xlblocks[curridx], LgwrResult.Write))
		{
			InitXLBuffer(curridx);
			updrqst = true;
		}
		else
		{
			GetFreeXLBuffer();
			updrqst = false;
		}
		freespace = BLCKSZ - SizeOfXLogPHD - SizeOfXLogSubRecord;
		Insert->currpage->xlp_info |= XLP_FIRST_IS_SUBRECORD;
403
		subrecord = (XLogSubRecord *) Insert->currpos;
404 405 406 407
		Insert->currpos += SizeOfXLogSubRecord;
		if (hdrlen > freespace)
		{
			subrecord->xl_len = freespace;
V
Vadim B. Mikheev 已提交
408
			/* we don't store info in subrecord' xl_info */
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
			subrecord->xl_info = XLR_TO_BE_CONTINUED;
			memcpy(Insert->currpos, hdr, freespace);
			hdrlen -= freespace;
			hdr += freespace;
			goto nbuf;
		}
		else if (hdrlen > 0)
		{
			subrecord->xl_len = hdrlen;
			memcpy(Insert->currpos, hdr, hdrlen);
			Insert->currpos += hdrlen;
			freespace -= hdrlen;
			hdrlen = 0;
		}
		else
			subrecord->xl_len = 0;
		if (buflen > freespace)
		{
			subrecord->xl_len += freespace;
V
Vadim B. Mikheev 已提交
428
			/* we don't store info in subrecord' xl_info */
429 430 431 432 433 434 435 436 437 438 439 440
			subrecord->xl_info = XLR_TO_BE_CONTINUED;
			memcpy(Insert->currpos, buf, freespace);
			buflen -= freespace;
			buf += freespace;
			goto nbuf;
		}
		else if (buflen > 0)
		{
			subrecord->xl_len += buflen;
			memcpy(Insert->currpos, buf, buflen);
			Insert->currpos += buflen;
		}
V
Vadim B. Mikheev 已提交
441
		/* we don't store info in subrecord' xl_info */
442
		subrecord->xl_info = 0;
443
		Insert->currpos = ((char *) Insert->currpage) +
V
WAL  
Vadim B. Mikheev 已提交
444
			MAXALIGN(Insert->currpos - ((char *) Insert->currpage));
445
	}
446 447
	freespace = ((char *) Insert->currpage) + BLCKSZ - Insert->currpos;

V
Vadim B. Mikheev 已提交
448 449 450 451 452 453 454 455 456
	/*
	 * Begin of the next record will be stored as LSN for
	 * changed data page...
	 */
	RecPtr.xlogid = XLogCtl->xlblocks[curridx].xlogid;
	RecPtr.xrecoff =
		XLogCtl->xlblocks[curridx].xrecoff - BLCKSZ +
		Insert->currpos - ((char *) Insert->currpage);

457 458 459 460
	/*
	 * All done! Update global LgwrRqst if some block was filled up.
	 */
	if (freespace < SizeOfXLogRecord)
461 462
		updrqst = true;			/* curridx is filled and available for
								 * writing out */
463 464 465 466 467 468 469 470 471 472
	else
		curridx = PrevBufIdx(curridx);
	LgwrRqst.Write = XLogCtl->xlblocks[curridx];

	S_UNLOCK(&(XLogCtl->insert_lck));

	if (updrqst)
	{
		unsigned	i = 0;

473
		for (;;)
474 475 476 477 478 479 480 481 482 483 484 485 486
		{
			if (!TAS(&(XLogCtl->info_lck)))
			{
				if (XLByteLT(XLogCtl->LgwrRqst.Write, LgwrRqst.Write))
					XLogCtl->LgwrRqst.Write = LgwrRqst.Write;
				S_UNLOCK(&(XLogCtl->info_lck));
				break;
			}
			s_lock_sleep(i++);
		}
	}

	return (RecPtr);
487
}
488 489 490 491

/*
 * XLogFlush
 *		Make sure the log is flushed to disk at least up to `record`.
 *
 * Returns immediately in bootstrap mode, during redo, or when the known
 * flush position already covers `record`.  Otherwise loops until it either
 * sees that the request got satisfied or acquires lgwr_lck and does the
 * work itself: XLogWrite() if there is still something to write out,
 * else just an fsync() of the current log segment.
 *
 * If the current insertion page is only partially filled, a zero-padded
 * copy of it is taken (under insert_lck) and handed to XLogWrite() so
 * that the tail of the log can be written out as well.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqst;
	char		buffer[BLCKSZ];
	char	   *usebuf = NULL;
	unsigned	i = 0;
	bool		force_lgwr = false;

	if (XLOG_DEBUG)
	{
		fprintf(stderr, "XLogFlush%s%s: rqst %u/%u; wrt %u/%u; flsh %u/%u\n",
			(IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
			(InRedo) ? "(redo)" : "",
			record.xlogid, record.xrecoff,
			LgwrResult.Write.xlogid, LgwrResult.Write.xrecoff,
			LgwrResult.Flush.xlogid, LgwrResult.Flush.xrecoff);
		fflush(stderr);
	}

	if (IsBootstrapProcessingMode() || InRedo)
		return;
	if (XLByteLE(record, LgwrResult.Flush))
		return;
	WriteRqst = LgwrRqst.Write;
	for (;;)
	{
		/* try to read LgwrResult */
		if (!TAS(&(XLogCtl->info_lck)))
		{
			LgwrResult = XLogCtl->LgwrResult;
			if (XLByteLE(record, LgwrResult.Flush))
			{
				S_UNLOCK(&(XLogCtl->info_lck));
				return;
			}
			if (XLByteLT(XLogCtl->LgwrRqst.Flush, record))
				XLogCtl->LgwrRqst.Flush = record;
			if (XLByteLT(WriteRqst, XLogCtl->LgwrRqst.Write))
			{
				WriteRqst = XLogCtl->LgwrRqst.Write;
				usebuf = NULL;
			}
			S_UNLOCK(&(XLogCtl->info_lck));
		}
		/* if something was added to log cache then try to flush this too */
		if (!TAS(&(XLogCtl->insert_lck)))
		{
			XLogCtlInsert *Insert = &XLogCtl->Insert;
			uint32		freespace =
			((char *) Insert->currpage) + BLCKSZ - Insert->currpos;

			if (freespace < SizeOfXLogRecord)	/* buffer is full */
			{
				usebuf = NULL;
				LgwrRqst.Write = WriteRqst = XLogCtl->xlblocks[Insert->curridx];
			}
			else
			{
				/* private, zero-padded copy of the partially filled page */
				usebuf = buffer;
				memcpy(usebuf, Insert->currpage, BLCKSZ - freespace);
				memset(usebuf + BLCKSZ - freespace, 0, freespace);
				WriteRqst = XLogCtl->xlblocks[Insert->curridx];
				WriteRqst.xrecoff = WriteRqst.xrecoff - BLCKSZ +
					Insert->currpos - ((char *) Insert->currpage);
			}
			S_UNLOCK(&(XLogCtl->insert_lck));
			force_lgwr = true;
		}
		if (force_lgwr || WriteRqst.xlogid > record.xlogid ||
			(WriteRqst.xlogid == record.xlogid &&
			 WriteRqst.xrecoff >= record.xrecoff + BLCKSZ))
		{
			if (!TAS(&(XLogCtl->lgwr_lck)))
			{
				LgwrResult = XLogCtl->Write.LgwrResult;
				if (XLByteLE(record, LgwrResult.Flush))
				{
					S_UNLOCK(&(XLogCtl->lgwr_lck));
					return;
				}
				if (XLByteLT(LgwrResult.Write, WriteRqst))
				{
					LgwrRqst.Flush = LgwrRqst.Write = WriteRqst;
					XLogWrite(usebuf);
					S_UNLOCK(&(XLogCtl->lgwr_lck));
					if (XLByteLT(LgwrResult.Flush, record))
						elog(STOP, "XLogFlush: request is not satisfyed");
					return;
				}
				/* nothing left to write: leave the loop holding lgwr_lck */
				break;
			}
		}
		s_lock_sleep(i++);
	}

	/*
	 * Here we hold lgwr_lck and everything requested is written out but
	 * not yet flushed: fsync the segment containing the last written byte.
	 */
	if (logFile >= 0 && (LgwrResult.Write.xlogid != logId ||
				 (LgwrResult.Write.xrecoff - 1) / XLogSegSize != logSeg))
	{
		if (close(logFile) != 0)
			elog(STOP, "Close(logfile %u seg %u) failed: %d",
				 logId, logSeg, errno);
		logFile = -1;
	}

	if (logFile < 0)
	{
		logId = LgwrResult.Write.xlogid;
		logSeg = (LgwrResult.Write.xrecoff - 1) / XLogSegSize;
		logOff = 0;
		logFile = XLogFileOpen(logId, logSeg, false);
	}

	if (fsync(logFile) != 0)
		elog(STOP, "Fsync(logfile %u seg %u) failed: %d",
			 logId, logSeg, errno);
	LgwrResult.Flush = LgwrResult.Write;

	/* publish the new result */
	for (i = 0;;)
	{
		if (!TAS(&(XLogCtl->info_lck)))
		{
			XLogCtl->LgwrResult = LgwrResult;
			if (XLByteLT(XLogCtl->LgwrRqst.Write, LgwrResult.Write))
				XLogCtl->LgwrRqst.Write = LgwrResult.Write;
			S_UNLOCK(&(XLogCtl->info_lck));
			break;
		}
		s_lock_sleep(i++);
	}
	XLogCtl->Write.LgwrResult = LgwrResult;

	S_UNLOCK(&(XLogCtl->lgwr_lck));
}

static void
GetFreeXLBuffer()
{
629 630 631
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
	uint16		curridx = NextBufIdx(Insert->curridx);
632 633

	LgwrRqst.Write = XLogCtl->xlblocks[Insert->curridx];
634
	for (;;)
635 636 637 638 639 640 641 642 643 644 645 646 647
	{
		if (!TAS(&(XLogCtl->info_lck)))
		{
			LgwrResult = XLogCtl->LgwrResult;
			XLogCtl->LgwrRqst.Write = LgwrRqst.Write;
			S_UNLOCK(&(XLogCtl->info_lck));
			if (XLByteLE(XLogCtl->xlblocks[curridx], LgwrResult.Write))
			{
				Insert->LgwrResult = LgwrResult;
				InitXLBuffer(curridx);
				return;
			}
		}
648

649 650 651 652 653 654 655 656 657 658 659 660 661 662
		/*
		 * LgwrResult lock is busy or un-updated. Try to acquire lgwr lock
		 * and write full blocks.
		 */
		if (!TAS(&(XLogCtl->lgwr_lck)))
		{
			LgwrResult = Write->LgwrResult;
			if (XLByteLE(XLogCtl->xlblocks[curridx], LgwrResult.Write))
			{
				S_UNLOCK(&(XLogCtl->lgwr_lck));
				Insert->LgwrResult = LgwrResult;
				InitXLBuffer(curridx);
				return;
			}
663 664 665 666

			/*
			 * Have to write buffers while holding insert lock - not
			 * good...
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
			 */
			XLogWrite(NULL);
			S_UNLOCK(&(XLogCtl->lgwr_lck));
			Insert->LgwrResult = LgwrResult;
			InitXLBuffer(curridx);
			return;
		}
	}

	return;
}

static void
XLogWrite(char *buffer)
{
682 683 684 685
	XLogCtlWrite *Write = &XLogCtl->Write;
	char	   *from;
	uint32		wcnt = 0;
	int			i = 0;
V
Vadim B. Mikheev 已提交
686
	bool		usexistent;
687

688
	for (; XLByteLT(LgwrResult.Write, LgwrRqst.Write);)
689 690
	{
		LgwrResult.Write = XLogCtl->xlblocks[Write->curridx];
691
		if (LgwrResult.Write.xlogid != logId ||
692 693 694 695 696
			(LgwrResult.Write.xrecoff - 1) / XLogSegSize != logSeg)
		{
			if (wcnt > 0)
			{
				if (fsync(logFile) != 0)
697 698
					elog(STOP, "Fsync(logfile %u seg %u) failed: %d",
						 logId, logSeg, errno);
699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717
				if (LgwrResult.Write.xlogid != logId)
					LgwrResult.Flush.xrecoff = XLogFileSize;
				else
					LgwrResult.Flush.xrecoff = LgwrResult.Write.xrecoff - BLCKSZ;
				LgwrResult.Flush.xlogid = logId;
				if (!TAS(&(XLogCtl->info_lck)))
				{
					XLogCtl->LgwrResult.Flush = LgwrResult.Flush;
					XLogCtl->LgwrResult.Write = LgwrResult.Flush;
					if (XLByteLT(XLogCtl->LgwrRqst.Write, LgwrResult.Flush))
						XLogCtl->LgwrRqst.Write = LgwrResult.Flush;
					if (XLByteLT(XLogCtl->LgwrRqst.Flush, LgwrResult.Flush))
						XLogCtl->LgwrRqst.Flush = LgwrResult.Flush;
					S_UNLOCK(&(XLogCtl->info_lck));
				}
			}
			if (logFile >= 0)
			{
				if (close(logFile) != 0)
718 719
					elog(STOP, "Close(logfile %u seg %u) failed: %d",
						 logId, logSeg, errno);
720 721 722 723
				logFile = -1;
			}
			logId = LgwrResult.Write.xlogid;
			logSeg = (LgwrResult.Write.xrecoff - 1) / XLogSegSize;
724
			logOff = 0;
725
			SpinAcquire(ControlFileLockId);
V
Vadim B. Mikheev 已提交
726 727 728
			/* create/use new log file */
			usexistent = true;
			logFile = XLogFileInit(logId, logSeg, &usexistent);
729 730 731 732 733
			ControlFile->logId = logId;
			ControlFile->logSeg = logSeg + 1;
			ControlFile->time = time(NULL);
			UpdateControlFile();
			SpinRelease(ControlFileLockId);
V
Vadim B. Mikheev 已提交
734 735 736
			if (!usexistent)	/* there was no file */
				elog(LOG, "XLogWrite: had to create new log file - "
					"you probably should do checkpoints more often");
737 738 739 740 741 742
		}

		if (logFile < 0)
		{
			logId = LgwrResult.Write.xlogid;
			logSeg = (LgwrResult.Write.xrecoff - 1) / XLogSegSize;
743
			logOff = 0;
744 745 746 747 748 749
			logFile = XLogFileOpen(logId, logSeg, false);
		}

		if (logOff != (LgwrResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
		{
			logOff = (LgwrResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
750 751 752
			if (lseek(logFile, (off_t) logOff, SEEK_SET) < 0)
				elog(STOP, "Lseek(logfile %u seg %u off %u) failed: %d",
					 logId, logSeg, logOff, errno);
753 754 755 756 757 758 759 760
		}

		if (buffer != NULL && XLByteLT(LgwrRqst.Write, LgwrResult.Write))
			from = buffer;
		else
			from = XLogCtl->pages + Write->curridx * BLCKSZ;

		if (write(logFile, from, BLCKSZ) != BLCKSZ)
761 762
			elog(STOP, "Write(logfile %u seg %u off %u) failed: %d",
				 logId, logSeg, logOff, errno);
763 764 765 766 767 768 769 770 771 772 773 774

		wcnt++;
		logOff += BLCKSZ;

		if (from != buffer)
			Write->curridx = NextBufIdx(Write->curridx);
		else
			LgwrResult.Write = LgwrRqst.Write;
	}
	if (wcnt == 0)
		elog(STOP, "XLogWrite: nothing written");

775
	if (XLByteLT(LgwrResult.Flush, LgwrRqst.Flush) &&
776 777 778
		XLByteLE(LgwrRqst.Flush, LgwrResult.Write))
	{
		if (fsync(logFile) != 0)
779 780
			elog(STOP, "Fsync(logfile %u seg %u) failed: %d",
				 logId, logSeg, errno);
781 782 783
		LgwrResult.Flush = LgwrResult.Write;
	}

784
	for (;;)
785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
	{
		if (!TAS(&(XLogCtl->info_lck)))
		{
			XLogCtl->LgwrResult = LgwrResult;
			if (XLByteLT(XLogCtl->LgwrRqst.Write, LgwrResult.Write))
				XLogCtl->LgwrRqst.Write = LgwrResult.Write;
			S_UNLOCK(&(XLogCtl->info_lck));
			break;
		}
		s_lock_sleep(i++);
	}
	Write->LgwrResult = LgwrResult;
}

static int
V
Vadim B. Mikheev 已提交
800
XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
801
{
802
	char		path[MAXPGPATH];
V
Vadim B. Mikheev 已提交
803
	char		tpath[MAXPGPATH];
804
	int			fd;
805 806

	XLogFileName(path, log, seg);
V
Vadim B. Mikheev 已提交
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827

	/*
	 * Try to use existent file (checkpoint maker
	 * creates it sometime).
	 */
	if (*usexistent)
	{
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			if (errno != ENOENT)
				elog(STOP, "InitOpen(logfile %u seg %u) failed: %d",
					logId, logSeg, errno);
		}
		else
			return(fd);
		*usexistent = false;
	}

	XLogTempFileName(tpath, log, seg);
	unlink(tpath);
828 829
	unlink(path);

V
Vadim B. Mikheev 已提交
830
	fd = BasicOpenFile(tpath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
831
	if (fd < 0)
V
Vadim B. Mikheev 已提交
832
		elog(STOP, "InitCreate(logfile %u seg %u) failed: %d",
833
			 logId, logSeg, errno);
834 835

	if (lseek(fd, XLogSegSize - 1, SEEK_SET) != (off_t) (XLogSegSize - 1))
836 837
		elog(STOP, "Lseek(logfile %u seg %u) failed: %d",
			 logId, logSeg, errno);
838 839

	if (write(fd, "", 1) != 1)
840 841
		elog(STOP, "Init(logfile %u seg %u) failed: %d",
			 logId, logSeg, errno);
842 843

	if (fsync(fd) != 0)
844 845
		elog(STOP, "Fsync(logfile %u seg %u) failed: %d",
			 logId, logSeg, errno);
846

847
	if (lseek(fd, 0, SEEK_SET) < 0)
848 849
		elog(STOP, "Lseek(logfile %u seg %u off %u) failed: %d",
			 log, seg, 0, errno);
850

V
Vadim B. Mikheev 已提交
851 852 853 854 855 856 857 858 859
	close(fd);
	link(tpath, path);
	unlink(tpath);

	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
		elog(STOP, "InitReopen(logfile %u seg %u) failed: %d",
			 logId, logSeg, errno);

860
	return (fd);
861 862 863 864 865
}

/*
 * XLogFileOpen
 *		Open the existing log segment (log, seg) for read/write.
 *
 * econt - if true and the file does not exist, this is only logged and
 *		   the negative result of the open is returned to the caller;
 *		   every other failure is reported with elog(STOP).
 *
 * Returns the open file descriptor.
 */
static int
XLogFileOpen(uint32 log, uint32 seg, bool econt)
{
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(path, log, seg);

	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		if (econt && errno == ENOENT)
		{
			elog(LOG, "Open(logfile %u seg %u) failed: file doesn't exist",
				 log, seg);
			return (fd);
		}
		elog(STOP, "Open(logfile %u seg %u) failed: %d",
			 log, seg, errno);
	}

	return (fd);
}

V
Vadim B. Mikheev 已提交
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930
/*
 * (Re)move offline log files older or equal to passed one
 *
 * archdir			- archive directory; archiving is not implemented yet,
 *					  so it must be an empty string (asserted).
 * _logId, _logSeg	- newest log segment that may be removed.
 *
 * Scans XLogDir for segment files (names of exactly 16 upper-case hex
 * digits) and unlinks every one whose name sorts at or before the name
 * built from (_logId, _logSeg).
 */
static void
MoveOfflineLogs(char *archdir, uint32 _logId, uint32 _logSeg)
{
	DIR			   *xldir;
	struct dirent  *xlde;
	char			lastoff[32];
	char			path[MAXPGPATH];

	Assert(archdir[0] == 0);	/* ! implemented yet */

	xldir = opendir(XLogDir);
	if (xldir == NULL)
		elog(STOP, "MoveOfflineLogs: cannot open xlog dir: %d", errno);

	snprintf(lastoff, sizeof(lastoff), "%08X%08X", _logId, _logSeg);

	/* readdir() reports errors only through errno, so keep it clean */
	errno = 0;
	while ((xlde = readdir(xldir)) != NULL)
	{
		if (strlen(xlde->d_name) != 16 ||
			strspn(xlde->d_name, "0123456789ABCDEF") != 16)
			continue;
		if (strcmp(xlde->d_name, lastoff) > 0)
		{
			elog(LOG, "MoveOfflineLogs: skip %s", xlde->d_name);
			errno = 0;
			continue;
		}
		elog(LOG, "MoveOfflineLogs: %s %s", (archdir[0]) ?
			"archive" : "remove", xlde->d_name);
		snprintf(path, MAXPGPATH, "%s%c%s", XLogDir, SEP_CHAR, xlde->d_name);
		/* no archive directory: the file is simply removed */
		if (archdir[0] == 0)
			unlink(path);
		errno = 0;
	}
	if (errno)
		elog(STOP, "MoveOfflineLogs: cannot read xlog dir: %d", errno);
	closedir(xldir);
}

931
static XLogRecord *
932
ReadRecord(XLogRecPtr *RecPtr, char *buffer)
933
{
934 935 936 937 938
	XLogRecord *record;
	XLogRecPtr	tmpRecPtr = EndRecPtr;
	bool		nextmode = (RecPtr == NULL);
	int			emode = (nextmode) ? LOG : STOP;
	bool		noBlck = false;
939

940
	if (nextmode)
941
	{
942 943 944 945 946 947 948 949 950 951 952 953 954 955
		RecPtr = &tmpRecPtr;
		if (nextRecord != NULL)
		{
			record = nextRecord;
			goto got_record;
		}
		if (tmpRecPtr.xrecoff % BLCKSZ != 0)
			tmpRecPtr.xrecoff += (BLCKSZ - tmpRecPtr.xrecoff % BLCKSZ);
		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
		tmpRecPtr.xrecoff += SizeOfXLogPHD;
956
	}
957 958
	else if (!XRecOffIsValid(RecPtr->xrecoff))
		elog(STOP, "ReadRecord: invalid record offset in (%u, %u)",
959
			 RecPtr->xlogid, RecPtr->xrecoff);
960

961 962
	if (readFile >= 0 && (RecPtr->xlogid != readId ||
						  RecPtr->xrecoff / XLogSegSize != readSeg))
963
	{
964 965
		close(readFile);
		readFile = -1;
966
	}
967 968 969
	readId = RecPtr->xlogid;
	readSeg = RecPtr->xrecoff / XLogSegSize;
	if (readFile < 0)
970
	{
971 972 973 974
		noBlck = true;
		readFile = XLogFileOpen(readId, readSeg, nextmode);
		if (readFile < 0)
			goto next_record_is_invalid;
975 976
	}

977
	if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
978 979
	{
		readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
980 981 982
		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
983
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
984 985 986
			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
		if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
987 988
		{
			elog(emode, "ReadRecord: invalid magic number %u in logfile %u seg %u off %u",
989 990
				 ((XLogPageHeader) readBuf)->xlp_magic,
				 readId, readSeg, readOff);
991 992 993
			goto next_record_is_invalid;
		}
	}
994
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_SUBRECORD) &&
995 996 997
		RecPtr->xrecoff % BLCKSZ == SizeOfXLogPHD)
	{
		elog(emode, "ReadRecord: subrecord is requested by (%u, %u)",
998
			 RecPtr->xlogid, RecPtr->xrecoff);
999 1000
		goto next_record_is_invalid;
	}
1001
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % BLCKSZ);
1002 1003

got_record:;
V
WAL  
Vadim B. Mikheev 已提交
1004
	if (record->xl_len >
1005 1006 1007
		(BLCKSZ - RecPtr->xrecoff % BLCKSZ - SizeOfXLogRecord))
	{
		elog(emode, "ReadRecord: invalid record len %u in (%u, %u)",
1008
			 record->xl_len, RecPtr->xlogid, RecPtr->xrecoff);
1009 1010 1011 1012 1013
		goto next_record_is_invalid;
	}
	if (record->xl_rmid > RM_MAX_ID)
	{
		elog(emode, "ReadRecord: invalid resource managed id %u in (%u, %u)",
1014
			 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff);
1015 1016 1017 1018 1019
		goto next_record_is_invalid;
	}
	nextRecord = NULL;
	if (record->xl_info & XLR_TO_BE_CONTINUED)
	{
1020 1021
		XLogSubRecord *subrecord;
		uint32		len = record->xl_len;
1022

V
WAL  
Vadim B. Mikheev 已提交
1023
		if (MAXALIGN(record->xl_len) + RecPtr->xrecoff % BLCKSZ + 
V
Vadim B. Mikheev 已提交
1024
			SizeOfXLogRecord != BLCKSZ)
1025 1026
		{
			elog(emode, "ReadRecord: invalid fragmented record len %u in (%u, %u)",
1027
				 record->xl_len, RecPtr->xlogid, RecPtr->xrecoff);
1028 1029 1030
			goto next_record_is_invalid;
		}
		memcpy(buffer, record, record->xl_len + SizeOfXLogRecord);
1031
		record = (XLogRecord *) buffer;
1032
		buffer += record->xl_len + SizeOfXLogRecord;
1033
		for (;;)
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
		{
			readOff++;
			if (readOff == XLogSegSize / BLCKSZ)
			{
				readSeg++;
				if (readSeg == XLogLastSeg)
				{
					readSeg = 0;
					readId++;
				}
				close(readFile);
1045
				readOff = 0;
1046 1047 1048 1049 1050
				readFile = XLogFileOpen(readId, readSeg, nextmode);
				if (readFile < 0)
					goto next_record_is_invalid;
			}
			if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
1051 1052 1053
				elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %d",
					 readId, readSeg, readOff, errno);
			if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
1054 1055
			{
				elog(emode, "ReadRecord: invalid magic number %u in logfile %u seg %u off %u",
1056 1057
					 ((XLogPageHeader) readBuf)->xlp_magic,
					 readId, readSeg, readOff);
1058 1059
				goto next_record_is_invalid;
			}
1060
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_SUBRECORD))
1061 1062
			{
				elog(emode, "ReadRecord: there is no subrecord flag in logfile %u seg %u off %u",
1063
					 readId, readSeg, readOff);
1064 1065
				goto next_record_is_invalid;
			}
1066 1067
			subrecord = (XLogSubRecord *) ((char *) readBuf + SizeOfXLogPHD);
			if (subrecord->xl_len == 0 || subrecord->xl_len >
1068 1069 1070
				(BLCKSZ - SizeOfXLogPHD - SizeOfXLogSubRecord))
			{
				elog(emode, "ReadRecord: invalid subrecord len %u in logfile %u seg %u off %u",
1071
					 subrecord->xl_len, readId, readSeg, readOff);
1072 1073 1074 1075 1076 1077
				goto next_record_is_invalid;
			}
			len += subrecord->xl_len;
			if (len > MAXLOGRECSZ)
			{
				elog(emode, "ReadRecord: too long record len %u in (%u, %u)",
1078
					 len, RecPtr->xlogid, RecPtr->xrecoff);
1079 1080
				goto next_record_is_invalid;
			}
1081
			memcpy(buffer, (char *) subrecord + SizeOfXLogSubRecord, subrecord->xl_len);
1082 1083 1084
			buffer += subrecord->xl_len;
			if (subrecord->xl_info & XLR_TO_BE_CONTINUED)
			{
V
WAL  
Vadim B. Mikheev 已提交
1085
				if (MAXALIGN(subrecord->xl_len) +
1086 1087 1088
					SizeOfXLogPHD + SizeOfXLogSubRecord != BLCKSZ)
				{
					elog(emode, "ReadRecord: invalid fragmented subrecord len %u in logfile %u seg %u off %u",
1089
						 subrecord->xl_len, readId, readSeg, readOff);
1090 1091 1092 1093 1094 1095
					goto next_record_is_invalid;
				}
				continue;
			}
			break;
		}
V
WAL  
Vadim B. Mikheev 已提交
1096
		if (BLCKSZ - SizeOfXLogRecord >= MAXALIGN(subrecord->xl_len) + 
V
Vadim B. Mikheev 已提交
1097
			SizeOfXLogPHD + SizeOfXLogSubRecord)
1098
		{
V
Vadim B. Mikheev 已提交
1099
			nextRecord = (XLogRecord *) ((char *) subrecord + 
V
WAL  
Vadim B. Mikheev 已提交
1100
				MAXALIGN(subrecord->xl_len) + SizeOfXLogSubRecord);
1101
		}
1102
		record->xl_len = len;
1103
		EndRecPtr.xlogid = readId;
1104
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff * BLCKSZ +
V
Vadim B. Mikheev 已提交
1105
			SizeOfXLogPHD + SizeOfXLogSubRecord + 
V
WAL  
Vadim B. Mikheev 已提交
1106
			MAXALIGN(subrecord->xl_len);
1107
		ReadRecPtr = *RecPtr;
1108
		return (record);
1109
	}
V
WAL  
Vadim B. Mikheev 已提交
1110
	if (BLCKSZ - SizeOfXLogRecord >= MAXALIGN(record->xl_len) + 
V
Vadim B. Mikheev 已提交
1111 1112
		RecPtr->xrecoff % BLCKSZ + SizeOfXLogRecord)
		nextRecord = (XLogRecord *) ((char *) record + 
V
WAL  
Vadim B. Mikheev 已提交
1113
			MAXALIGN(record->xl_len) + SizeOfXLogRecord);
1114
	EndRecPtr.xlogid = RecPtr->xlogid;
V
Vadim B. Mikheev 已提交
1115
	EndRecPtr.xrecoff = RecPtr->xrecoff + 
V
WAL  
Vadim B. Mikheev 已提交
1116
		MAXALIGN(record->xl_len) + SizeOfXLogRecord;
1117 1118
	ReadRecPtr = *RecPtr;

1119
	return (record);
1120 1121 1122 1123 1124 1125

next_record_is_invalid:;
	close(readFile);
	readFile = -1;
	nextRecord = NULL;
	memset(buffer, 0, SizeOfXLogRecord);
1126 1127
	record = (XLogRecord *) buffer;

1128 1129 1130 1131 1132 1133
	/*
	 * If we assumed that next record began on the same page where
	 * previous one ended - zero end of page.
	 */
	if (XLByteEQ(tmpRecPtr, EndRecPtr))
	{
1134 1135
		Assert(EndRecPtr.xrecoff % BLCKSZ > (SizeOfXLogPHD + SizeOfXLogSubRecord) &&
			   BLCKSZ - EndRecPtr.xrecoff % BLCKSZ >= SizeOfXLogRecord);
1136 1137 1138
		readId = EndRecPtr.xlogid;
		readSeg = EndRecPtr.xrecoff / XLogSegSize;
		readOff = (EndRecPtr.xrecoff % XLogSegSize) / BLCKSZ;
1139
		elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
1140
			 readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
1141
		readFile = XLogFileOpen(readId, readSeg, false);
1142 1143 1144
		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
1145
		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
1146 1147 1148 1149 1150 1151 1152
			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
		memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
			   BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
1153
		if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
1154 1155
			elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
1156 1157 1158 1159
		readOff++;
	}
	else
	{
1160 1161
		Assert(EndRecPtr.xrecoff % BLCKSZ == 0 ||
			   BLCKSZ - EndRecPtr.xrecoff % BLCKSZ < SizeOfXLogRecord);
1162 1163 1164
		readId = tmpRecPtr.xlogid;
		readSeg = tmpRecPtr.xrecoff / XLogSegSize;
		readOff = (tmpRecPtr.xrecoff % XLogSegSize) / BLCKSZ;
1165
		Assert(readOff > 0);
1166 1167 1168
	}
	if (readOff > 0)
	{
1169
		if (!XLByteEQ(tmpRecPtr, EndRecPtr))
1170
			elog(LOG, "Formatting logfile %u seg %u block %u at offset 0",
1171
				 readId, readSeg, readOff);
1172 1173 1174
		readOff *= BLCKSZ;
		memset(readBuf, 0, BLCKSZ);
		readFile = XLogFileOpen(readId, readSeg, false);
1175 1176 1177
		if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %d",
				 readId, readSeg, readOff, errno);
1178 1179 1180
		while (readOff < XLogSegSize)
		{
			if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
1181 1182
				elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %d",
					 readId, readSeg, readOff, errno);
1183 1184 1185 1186 1187 1188
			readOff += BLCKSZ;
		}
	}
	if (readFile >= 0)
	{
		if (fsync(readFile) < 0)
1189 1190
			elog(STOP, "ReadRecord: fsync(logfile %u seg %u) failed: %d",
				 readId, readSeg, errno);
1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211
		close(readFile);
		readFile = -1;
	}

	readId = EndRecPtr.xlogid;
	readSeg = (EndRecPtr.xrecoff - 1) / XLogSegSize + 1;
	elog(LOG, "The last logId/logSeg is (%u, %u)", readId, readSeg - 1);
	if (ControlFile->logId != readId || ControlFile->logSeg != readSeg)
	{
		elog(LOG, "Set logId/logSeg in control file");
		ControlFile->logId = readId;
		ControlFile->logSeg = readSeg;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
	if (readSeg == XLogLastSeg)
	{
		readSeg = 0;
		readId++;
	}
	{
1212
		char		path[MAXPGPATH];
1213 1214 1215 1216 1217

		XLogFileName(path, readId, readSeg);
		unlink(path);
	}

1218
	return (record);
1219 1220
}

void
UpdateControlFile()
{
1224
	int			fd;
1225

1226
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246
	if (fd < 0)
		elog(STOP, "Open(cntlfile) failed: %d", errno);

	if (write(fd, ControlFile, BLCKSZ) != BLCKSZ)
		elog(STOP, "Write(cntlfile) failed: %d", errno);

	if (fsync(fd) != 0)
		elog(STOP, "Fsync(cntlfile) failed: %d", errno);

	close(fd);

	return;
}

int
XLOGShmemSize()
{
	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

1247
	return (sizeof(XLogCtlData) + BLCKSZ * XLOGbuffers +
1248 1249 1250 1251 1252 1253
			sizeof(XLogRecPtr) * XLOGbuffers + BLCKSZ);
}

void
XLOGShmemInit(void)
{
1254
	bool		found;
1255 1256 1257 1258

	if (XLOGbuffers < MinXLOGbuffers)
		XLOGbuffers = MinXLOGbuffers;

1259
	ControlFile = (ControlFileData *)
1260 1261
		ShmemInitStruct("Control File", BLCKSZ, &found);
	Assert(!found);
1262 1263
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", sizeof(XLogCtlData) + BLCKSZ * XLOGbuffers +
1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
						sizeof(XLogRecPtr) * XLOGbuffers, &found);
	Assert(!found);
}

/*
 * This func must be called ONCE on system install
 */
void
BootStrapXLOG()
{
1274 1275 1276
	int			fd;
	char		buffer[BLCKSZ];
	CheckPoint	checkPoint;
V
Vadim B. Mikheev 已提交
1277
	bool		usexistent = false;
1278

V
WAL  
Vadim B. Mikheev 已提交
1279
#ifdef XLOG
1280 1281 1282
	XLogPageHeader page = (XLogPageHeader) buffer;
	XLogRecord *record;

1283
#endif
1284

1285
	fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
1286
	if (fd < 0)
1287 1288
		elog(STOP, "BootStrapXLOG failed to create control file (%s): %d",
			 ControlFilePath, errno);
1289 1290 1291 1292 1293

	checkPoint.redo.xlogid = 0;
	checkPoint.redo.xrecoff = SizeOfXLogPHD;
	checkPoint.undo = checkPoint.redo;
	checkPoint.nextXid = FirstTransactionId;
1294
	checkPoint.nextOid = BootstrapObjectIdData;
V
WAL  
Vadim B. Mikheev 已提交
1295
	checkPoint.ThisStartUpID = 0;
1296

V
WAL  
Vadim B. Mikheev 已提交
1297
#ifdef XLOG
1298

1299 1300 1301
	memset(buffer, 0, BLCKSZ);
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = 0;
1302 1303 1304
	record = (XLogRecord *) ((char *) page + SizeOfXLogPHD);
	record->xl_prev.xlogid = 0;
	record->xl_prev.xrecoff = 0;
1305 1306 1307 1308 1309
	record->xl_xact_prev = record->xl_prev;
	record->xl_xid = InvalidTransactionId;
	record->xl_len = sizeof(checkPoint);
	record->xl_info = 0;
	record->xl_rmid = RM_XLOG_ID;
1310
	memcpy((char *) record + SizeOfXLogRecord, &checkPoint, sizeof(checkPoint));
1311

V
Vadim B. Mikheev 已提交
1312
	logFile = XLogFileInit(0, 0, &usexistent);
1313

1314 1315 1316 1317 1318 1319 1320 1321 1322
	if (write(logFile, buffer, BLCKSZ) != BLCKSZ)
		elog(STOP, "BootStrapXLOG failed to write logfile: %d", errno);

	if (fsync(logFile) != 0)
		elog(STOP, "BootStrapXLOG failed to fsync logfile: %d", errno);

	close(logFile);
	logFile = -1;

1323 1324
#endif

1325
	memset(buffer, 0, BLCKSZ);
1326
	ControlFile = (ControlFileData *) buffer;
1327 1328 1329 1330 1331
	ControlFile->logId = 0;
	ControlFile->logSeg = 1;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->time = time(NULL);
	ControlFile->state = DB_SHUTDOWNED;
1332
	ControlFile->blcksz = BLCKSZ;
1333
	ControlFile->relseg_size = RELSEG_SIZE;
1334
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
1335 1336 1337 1338 1339 1340 1341 1342 1343 1344

	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
		elog(STOP, "BootStrapXLOG failed to write control file: %d", errno);

	if (fsync(fd) != 0)
		elog(STOP, "BootStrapXLOG failed to fsync control file: %d", errno);

	close(fd);
}

/*
 * Format a time value for log messages.
 *
 * Returns the ctime() rendering of tnow with the trailing newline removed.
 * The result lives in storage owned by ctime() (or in a static fallback
 * string when ctime() cannot represent the value), so it is overwritten by
 * the next call and must not be freed.
 */
static char *
str_time(time_t tnow)
{
	static char	badtime[] = "(invalid time)";
	char	   *result = ctime(&tnow);
	char	   *p;

	/* ctime() returns NULL for values it cannot convert */
	if (result == NULL)
		return (badtime);

	p = strchr(result, '\n');
	if (p != NULL)
		*p = 0;

	return (result);
}

/*
 * This func must be called ONCE on system startup
 */
void
StartupXLOG()
{
V
WAL  
Vadim B. Mikheev 已提交
1363
#ifdef XLOG
1364 1365 1366 1367 1368 1369 1370 1371
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
	XLogRecPtr	RecPtr,
				LastRec;
	XLogRecord *record;
	char		buffer[MAXLOGRECSZ + SizeOfXLogRecord];
	bool		sie_saved = false;

1372
#endif
1373
	int			fd;
1374 1375 1376

	elog(LOG, "Data Base System is starting up at %s", str_time(time(NULL)));

1377 1378
	XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));
	XLogCtl->pages = ((char *) XLogCtl->xlblocks + sizeof(XLogRecPtr) * XLOGbuffers);
1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391
	XLogCtl->XLogCacheByte = BLCKSZ * XLOGbuffers;
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	XLogCtl->LgwrRqst = LgwrRqst;
	XLogCtl->LgwrResult = LgwrResult;
	XLogCtl->Insert.LgwrResult = LgwrResult;
	XLogCtl->Insert.curridx = 0;
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
	XLogCtl->Write.LgwrResult = LgwrResult;
	XLogCtl->Write.curridx = 0;
	S_INIT_LOCK(&(XLogCtl->insert_lck));
	S_INIT_LOCK(&(XLogCtl->info_lck));
	S_INIT_LOCK(&(XLogCtl->lgwr_lck));
V
Vadim B. Mikheev 已提交
1392
	S_INIT_LOCK(&(XLogCtl->chkp_lck));
1393 1394 1395 1396

	/*
	 * Open/read Control file
	 */
1397
	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
1398
	if (fd < 0)
1399
		elog(STOP, "Open(\"%s\") failed: %d", ControlFilePath, errno);
1400 1401

	if (read(fd, ControlFile, BLCKSZ) != BLCKSZ)
1402
		elog(STOP, "Read(\"%s\") failed: %d", ControlFilePath, errno);
1403 1404 1405

	close(fd);

1406 1407 1408 1409
	if (ControlFile->logSeg == 0 ||
		ControlFile->time <= 0 ||
		ControlFile->state < DB_SHUTDOWNED ||
		ControlFile->state > DB_IN_PRODUCTION ||
1410 1411 1412
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
		elog(STOP, "Control file context is broken");

1413
	/* Check for incompatible database */
1414
	if (ControlFile->blcksz != BLCKSZ)
1415 1416
		elog(STOP, "database was initialized with BLCKSZ %d,\n\tbut the backend was compiled with BLCKSZ %d.\n\tlooks like you need to initdb.",
			 ControlFile->blcksz, BLCKSZ);
1417
	if (ControlFile->relseg_size != RELSEG_SIZE)
1418 1419 1420 1421 1422
		elog(STOP, "database was initialized with RELSEG_SIZE %d,\n\tbut the backend was compiled with RELSEG_SIZE %d.\n\tlooks like you need to initdb.",
			 ControlFile->relseg_size, RELSEG_SIZE);
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		elog(STOP, "database was initialized with CATALOG_VERSION_NO %d,\n\tbut the backend was compiled with CATALOG_VERSION_NO %d.\n\tlooks like you need to initdb.",
			 ControlFile->catalog_version_no, CATALOG_VERSION_NO);
1423

1424
	if (ControlFile->state == DB_SHUTDOWNED)
1425
		elog(LOG, "Data Base System was shut down at %s",
1426
			 str_time(ControlFile->time));
1427 1428
	else if (ControlFile->state == DB_SHUTDOWNING)
		elog(LOG, "Data Base System was interrupted when shutting down at %s",
1429
			 str_time(ControlFile->time));
1430 1431 1432
	else if (ControlFile->state == DB_IN_RECOVERY)
	{
		elog(LOG, "Data Base System was interrupted being in recovery at %s\n"
1433 1434 1435
			 "\tThis propably means that some data blocks are corrupted\n"
			 "\tAnd you will have to use last backup for recovery",
			 str_time(ControlFile->time));
1436 1437 1438
	}
	else if (ControlFile->state == DB_IN_PRODUCTION)
		elog(LOG, "Data Base System was interrupted being in production at %s",
1439
			 str_time(ControlFile->time));
1440

V
WAL  
Vadim B. Mikheev 已提交
1441
#ifdef XLOG
1442

1443 1444 1445 1446 1447 1448 1449 1450 1451 1452
	LastRec = RecPtr = ControlFile->checkPoint;
	if (!XRecOffIsValid(RecPtr.xrecoff))
		elog(STOP, "Invalid checkPoint in control file");
	elog(LOG, "CheckPoint record at (%u, %u)", RecPtr.xlogid, RecPtr.xrecoff);

	record = ReadRecord(&RecPtr, buffer);
	if (record->xl_rmid != RM_XLOG_ID)
		elog(STOP, "Invalid RMID in checkPoint record");
	if (record->xl_len != sizeof(checkPoint))
		elog(STOP, "Invalid length of checkPoint record");
1453
	checkPoint = *((CheckPoint *) ((char *) record + SizeOfXLogRecord));
1454

V
Vadim B. Mikheev 已提交
1455
	elog(LOG, "Redo record at (%u, %u); Undo record at (%u, %u); Shutdown %s",
1456
		 checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
V
Vadim B. Mikheev 已提交
1457 1458
		 checkPoint.undo.xlogid, checkPoint.undo.xrecoff,
		 (checkPoint.Shutdown) ? "TRUE" : "FALSE");
1459
	elog(LOG, "NextTransactionId: %u; NextOid: %u",
1460 1461
		 checkPoint.nextXid, checkPoint.nextOid);
	if (checkPoint.nextXid < FirstTransactionId ||
1462
		checkPoint.nextOid < BootstrapObjectIdData)
V
WAL  
Vadim B. Mikheev 已提交
1463 1464

#ifdef XLOG_2
1465 1466 1467 1468 1469 1470 1471
		elog(STOP, "Invalid NextTransactionId/NextOid");
#else
		elog(LOG, "Invalid NextTransactionId/NextOid");
#endif

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
1472
	ShmemVariableCache->oidCount = 0;
1473

V
WAL  
Vadim B. Mikheev 已提交
1474 1475
	ThisStartUpID = checkPoint.ThisStartUpID;

1476 1477 1478 1479 1480 1481 1482
	if (XLByteLT(RecPtr, checkPoint.redo))
		elog(STOP, "Invalid redo in checkPoint record");
	if (checkPoint.undo.xrecoff == 0)
		checkPoint.undo = RecPtr;
	if (XLByteLT(RecPtr, checkPoint.undo))
		elog(STOP, "Invalid undo in checkPoint record");

V
Vadim B. Mikheev 已提交
1483 1484
	if (XLByteLT(checkPoint.undo, RecPtr) || 
		XLByteLT(checkPoint.redo, RecPtr))
1485
	{
V
Vadim B. Mikheev 已提交
1486 1487
		if (checkPoint.Shutdown)
			elog(STOP, "Invalid Redo/Undo record in shutdown checkpoint");
1488 1489
		if (ControlFile->state == DB_SHUTDOWNED)
			elog(STOP, "Invalid Redo/Undo record in Shutdowned state");
V
WAL  
Vadim B. Mikheev 已提交
1490
		InRecovery = true;
1491 1492
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
Vadim B. Mikheev 已提交
1493
	{
V
WAL  
Vadim B. Mikheev 已提交
1494
		InRecovery = true;
V
Vadim B. Mikheev 已提交
1495
	}
1496

V
WAL  
Vadim B. Mikheev 已提交
1497 1498
	/* REDO */
	if (InRecovery)
1499
	{
1500
		elog(LOG, "The DataBase system was not properly shut down\n"
1501
			 "\tAutomatic recovery is in progress...");
1502 1503 1504 1505 1506 1507 1508
		ControlFile->state = DB_IN_RECOVERY;
		ControlFile->time = time(NULL);
		UpdateControlFile();

		sie_saved = StopIfError;
		StopIfError = true;

V
Vadim B. Mikheev 已提交
1509
		XLogOpenLogRelation();	/* open pg_log */
V
WAL  
Vadim B. Mikheev 已提交
1510
		XLogInitRelationCache();
V
Vadim B. Mikheev 已提交
1511

1512 1513 1514
		/* Is REDO required ? */
		if (XLByteLT(checkPoint.redo, RecPtr))
			record = ReadRecord(&(checkPoint.redo), buffer);
1515 1516
		else
/* read past CheckPoint record */
1517 1518 1519 1520
			record = ReadRecord(NULL, buffer);

		if (record->xl_len != 0)
		{
V
WAL  
Vadim B. Mikheev 已提交
1521
			InRedo = true;
1522 1523
			elog(LOG, "Redo starts at (%u, %u)",
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
1524 1525 1526 1527
			do
			{
				if (record->xl_xid >= ShmemVariableCache->nextXid)
					ShmemVariableCache->nextXid = record->xl_xid + 1;
V
WAL  
Vadim B. Mikheev 已提交
1528 1529 1530 1531
				if (XLOG_DEBUG)
				{
					char	buf[8192];

1532 1533 1534
					sprintf(buf, "REDO @ %u/%u; LSN %u/%u: ", 
						ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
						EndRecPtr.xlogid, EndRecPtr.xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
1535 1536 1537 1538 1539 1540 1541 1542
					xlog_outrec(buf, record);
					strcat(buf, " - ");
					RmgrTable[record->xl_rmid].rm_desc(buf, 
						record->xl_info, XLogRecGetData(record));
					strcat(buf, "\n");
					write(2, buf, strlen(buf));
				}

1543 1544 1545
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
				record = ReadRecord(NULL, buffer);
			} while (record->xl_len != 0);
1546 1547
			elog(LOG, "Redo done at (%u, %u)",
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
1548
			LastRec = ReadRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
1549
			InRedo = false;
1550 1551 1552
		}
		else
			elog(LOG, "Redo is not required");
V
WAL  
Vadim B. Mikheev 已提交
1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
	}

	/* Init xlog buffer cache */
	record = ReadRecord(&LastRec, buffer);
	logId = EndRecPtr.xlogid;
	logSeg = (EndRecPtr.xrecoff - 1) / XLogSegSize;
	logOff = 0;
	logFile = XLogFileOpen(logId, logSeg, false);
	XLogCtl->xlblocks[0].xlogid = logId;
	XLogCtl->xlblocks[0].xrecoff =
		((EndRecPtr.xrecoff - 1) / BLCKSZ + 1) * BLCKSZ;
	Insert = &XLogCtl->Insert;
	memcpy((char *) (Insert->currpage), readBuf, BLCKSZ);
	Insert->currpos = ((char *) Insert->currpage) +
		(EndRecPtr.xrecoff + BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
	Insert->PrevRecord = LastRec;

	LgwrRqst.Write = LgwrRqst.Flush =
	LgwrResult.Write = LgwrResult.Flush = EndRecPtr;

	XLogCtl->Write.LgwrResult = LgwrResult;
	Insert->LgwrResult = LgwrResult;

	XLogCtl->LgwrRqst = LgwrRqst;
	XLogCtl->LgwrResult = LgwrResult;
1578

V
Vadim B. Mikheev 已提交
1579
#ifdef NOT_USED
V
WAL  
Vadim B. Mikheev 已提交
1580 1581 1582
	/* UNDO */
	if (InRecovery)
	{
1583 1584 1585
		RecPtr = ReadRecPtr;
		if (XLByteLT(checkPoint.undo, RecPtr))
		{
1586 1587
			elog(LOG, "Undo starts at (%u, %u)",
				 RecPtr.xlogid, RecPtr.xrecoff);
1588 1589 1590
			do
			{
				record = ReadRecord(&RecPtr, buffer);
1591
				if (TransactionIdIsValid(record->xl_xid) &&
1592
					!TransactionIdDidCommit(record->xl_xid))
V
misc  
Vadim B. Mikheev 已提交
1593
					RmgrTable[record->xl_rmid].rm_undo(EndRecPtr, record);
1594 1595
				RecPtr = record->xl_prev;
			} while (XLByteLE(checkPoint.undo, RecPtr));
1596 1597
			elog(LOG, "Undo done at (%u, %u)",
				 ReadRecPtr.xlogid, ReadRecPtr.xrecoff);
1598 1599 1600 1601
		}
		else
			elog(LOG, "Undo is not required");
	}
V
WAL  
Vadim B. Mikheev 已提交
1602
#endif
1603

V
WAL  
Vadim B. Mikheev 已提交
1604
	if (InRecovery)
1605 1606 1607
	{
		CreateCheckPoint(true);
		StopIfError = sie_saved;
V
WAL  
Vadim B. Mikheev 已提交
1608
		XLogCloseRelationCache();
1609
	}
V
WAL  
Vadim B. Mikheev 已提交
1610
	InRecovery = false;
1611

V
WAL  
Vadim B. Mikheev 已提交
1612
#endif	 /* XLOG */
1613

1614 1615 1616 1617
	ControlFile->state = DB_IN_PRODUCTION;
	ControlFile->time = time(NULL);
	UpdateControlFile();

V
WAL  
Vadim B. Mikheev 已提交
1618 1619 1620
	ThisStartUpID++;
	XLogCtl->ThisStartUpID = ThisStartUpID;

1621 1622 1623 1624 1625
	elog(LOG, "Data Base System is in production state at %s", str_time(time(NULL)));

	return;
}

/*
 * Postmaster uses it to set ThisStartUpID from XLogCtlData
 * located in shmem after successful startup.
 */
void	SetThisStartUpID(void);

void
SetThisStartUpID(void)
{
	ThisStartUpID = XLogCtl->ThisStartUpID;
}

/*
 * This func must be called ONCE on system shutdown
 */
void
ShutdownXLOG()
{

1645
	elog(LOG, "Data Base System shutting down at %s", str_time(time(NULL)));
1646 1647 1648

	CreateCheckPoint(true);

1649
	elog(LOG, "Data Base System shut down at %s", str_time(time(NULL)));
1650 1651
}

/* Not defined in this part of the file; CreateCheckPoint() stores its result as the checkpoint's undo pointer. */
extern XLogRecPtr	GetUndoRecPtr(void);

/*
 * Create a checkpoint.
 *
 * shutdown - true for the checkpoint written while the system is shutting
 *		down (also used at the end of recovery).  In that mode any busy
 *		lock, active transaction or concurrent log activity is reported at
 *		STOP level, and the control file state goes DB_SHUTDOWNING ->
 *		DB_SHUTDOWNED.
 *
 * When built with XLOG: records the current insert position as the redo
 * pointer, captures nextXid/nextOid, flushes the buffer pool, inserts and
 * flushes the checkpoint record, stores its location in the control file
 * and finally (re)moves log files that are no longer needed.  For a
 * non-shutdown checkpoint the next log segment file is pre-created once
 * the current one is at least 75% used.
 *
 * Without XLOG only the control file is updated.
 */
void
CreateCheckPoint(bool shutdown)
{
#ifdef XLOG
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	uint16		curridx;
	uint32		_logId;
	uint32		_logSeg;
	char		archdir[MAXPGPATH];

	if (MyLastRecPtr.xrecoff != 0)
		elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");

	/* only one checkpoint at a time: retry every 2 seconds until free */
	while (TAS(&(XLogCtl->chkp_lck)))
	{
		struct timeval delay = {2, 0};

		if (shutdown)
			elog(STOP, "Checkpoint lock is busy while data base is shutting down");
		(void) select(0, NULL, NULL, NULL, &delay);
	}

	memset(&checkPoint, 0, sizeof(checkPoint));
	if (shutdown)
	{
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = time(NULL);
		UpdateControlFile();
	}
	checkPoint.ThisStartUpID = ThisStartUpID;
	checkPoint.Shutdown = shutdown;

	/* Get REDO record ptr */
	while (TAS(&(XLogCtl->insert_lck)))
	{
		struct timeval delay = {1, 0};

		if (shutdown)
			elog(STOP, "XLog insert lock is busy while data base is shutting down");
		(void) select(0, NULL, NULL, NULL, &delay);
	}
	freespace = ((char *) Insert->currpage) + BLCKSZ - Insert->currpos;
	if (freespace < SizeOfXLogRecord)
	{
		/* no room for a record header on this page: move to the next buffer */
		curridx = NextBufIdx(Insert->curridx);
		if (XLByteLE(XLogCtl->xlblocks[curridx], LgwrResult.Write))
			InitXLBuffer(curridx);
		else
			GetFreeXLBuffer();
		freespace = BLCKSZ - SizeOfXLogPHD;
	}
	else
		curridx = Insert->curridx;
	checkPoint.redo.xlogid = XLogCtl->xlblocks[curridx].xlogid;
	checkPoint.redo.xrecoff = XLogCtl->xlblocks[curridx].xrecoff - BLCKSZ +
		Insert->currpos - ((char *) Insert->currpage);
	S_UNLOCK(&(XLogCtl->insert_lck));

	SpinAcquire(XidGenLockId);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	SpinRelease(XidGenLockId);
	SpinAcquire(OidGenLockId);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;

	SpinRelease(OidGenLockId);

	FlushBufferPool();

	/* Get UNDO record ptr - should use oldest of PROC->logRec */
	checkPoint.undo = GetUndoRecPtr();

	if (shutdown && checkPoint.undo.xrecoff != 0)
		elog(STOP, "Active transaction while data base is shutting down");

	recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT, (char *) &checkPoint,
			sizeof(checkPoint), NULL, 0);

	if (shutdown && !XLByteEQ(checkPoint.redo, MyLastRecPtr))
		elog(STOP, "XLog concurrent activity while data base is shutting down");

	XLogFlush(recptr);

#endif	 /* XLOG */

	SpinAcquire(ControlFileLockId);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
#ifdef XLOG
	else	/* create new log file */
	{
		if (recptr.xrecoff % XLogSegSize >=
			(uint32) (0.75 * XLogSegSize))
		{
			int		lf;
			bool	usexistent = true;

			/*
			 * NOTE(review): ReadRecord() wraps to the next log id when the
			 * *incremented* segment number equals XLogLastSeg, whereas this
			 * code tests the segment number before incrementing it.
			 * Confirm against the definition of XLogLastSeg which of the
			 * two is intended.
			 */
			_logId = recptr.xlogid;
			_logSeg = recptr.xrecoff / XLogSegSize;
			if (_logSeg >= XLogLastSeg)
			{
				_logId++;
				_logSeg = 0;
			}
			else
				_logSeg++;
			lf = XLogFileInit(_logId, _logSeg, &usexistent);
			close(lf);
		}
	}

	ControlFile->checkPoint = MyLastRecPtr;

	/* copy what is needed for log file cleanup while the lock is held */
	_logId = ControlFile->logId;
	_logSeg = ControlFile->logSeg - 1;
	strcpy(archdir, ControlFile->archdir);

#else
	ControlFile->checkPoint.xlogid = 0;
	ControlFile->checkPoint.xrecoff = SizeOfXLogPHD;
#endif

	ControlFile->time = time(NULL);
	UpdateControlFile();
	SpinRelease(ControlFileLockId);

#ifdef XLOG
	/*
	 * Delete offline log files. Get oldest online
	 * log file from undo rec if it's valid.
	 */
	if (checkPoint.undo.xrecoff != 0)
	{
		_logId = checkPoint.undo.xlogid;
		_logSeg = checkPoint.undo.xrecoff / XLogSegSize;
	}
	if (_logId || _logSeg)
	{
		/* step back to the file preceding the oldest one still needed */
		if (_logSeg)
			_logSeg--;
		else
		{
			_logId--;
			_logSeg = 0;
		}
		MoveOfflineLogs(archdir, _logId, _logSeg);
	}

	S_UNLOCK(&(XLogCtl->chkp_lck));

	MyLastRecPtr.xrecoff = 0;	/* to avoid commit record */
#endif

	return;
}

void XLogPutNextOid(Oid nextOid);

/*
 * Insert an XLOG_NEXTOID record carrying the given OID into the log.
 * The record position returned by XLogInsert() is not used.
 */
void
XLogPutNextOid(Oid nextOid)
{
	char	   *payload = (char *) &nextOid;

	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, payload, sizeof(Oid), NULL, 0);
}

/* Forward declarations of the redo / undo / describe routines for XLOG records defined below. */
void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
void xlog_undo(XLogRecPtr lsn, XLogRecord *record);
void xlog_desc(char *buf, uint8 xl_info, char* rec);

/*
 * Redo routine for XLOG records.
 *
 * Only XLOG_NEXTOID records are acted upon: the OID stored in the record
 * replaces ShmemVariableCache->nextOid when it is larger.  Other record
 * types are ignored here, and lsn is not used.
 */
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8	info = record->xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_NEXTOID)
	{
		Oid		nextOid;

		/* memcpy: the record data is not assumed to be aligned for Oid */
		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < nextOid)
			ShmemVariableCache->nextOid = nextOid;
	}
}
 
/*
 * Undo routine for XLOG records: currently does nothing; both arguments
 * are ignored.
 */
void
xlog_undo(XLogRecPtr lsn, XLogRecord *record)
{
}
 
/*
 * Append a textual description of an XLOG record to buf.
 *
 * buf - NUL-terminated string to append to; the caller must provide
 *		enough room, no bound is checked here.
 * xl_info - the record's info byte; the bits outside XLR_INFO_MASK select
 *		the record type.
 * rec - the record's data: a CheckPoint struct for XLOG_CHECKPOINT, an Oid
 *		for XLOG_NEXTOID.
 *
 * Any other record type is described as "UNKNOWN".
 */
void
xlog_desc(char *buf, uint8 xl_info, char* rec)
{
	uint8	info = xl_info & ~XLR_INFO_MASK;

	if (info == XLOG_CHECKPOINT)
	{
		CheckPoint	*checkpoint = (CheckPoint*) rec;
		sprintf(buf + strlen(buf), "checkpoint: redo %u/%u; undo %u/%u; "
		"sui %u; xid %u; oid %u; %s",
			checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
			checkpoint->undo.xlogid, checkpoint->undo.xrecoff,
			checkpoint->ThisStartUpID, checkpoint->nextXid,
			checkpoint->nextOid,
			(checkpoint->Shutdown) ? "shutdown" : "online");
	}
	else if (info == XLOG_NEXTOID)
	{
		Oid		nextOid;

		memcpy(&nextOid, rec, sizeof(Oid));
		sprintf(buf + strlen(buf), "nextOid: %u", nextOid);
	}
	else
		strcat(buf, "UNKNOWN");
}

/*
 * Append the generic header fields of a log record to buf: the previous
 * record pointer, the previous record pointer of the same transaction, the
 * transaction id and the name of the record's resource manager.
 * The caller must provide enough room in buf.
 */
static void
xlog_outrec(char *buf, XLogRecord *record)
{
	char	   *tail = buf + strlen(buf);

	sprintf(tail, "prev %u/%u; xprev %u/%u; xid %u: %s",
			record->xl_prev.xlogid,
			record->xl_prev.xrecoff,
			record->xl_xact_prev.xlogid,
			record->xl_xact_prev.xrecoff,
			record->xl_xid,
			RmgrTable[record->xl_rmid].rm_name);
}