From 4a78cdeb6b598940e9d9adb92deca6494628802a Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 1 Aug 2007 22:45:09 +0000
Subject: [PATCH] Support an optional asynchronous commit mode, in which we
 don't flush WAL before reporting a transaction committed.  Data consistency
 is still guaranteed (unlike setting fsync = off), but a crash may lose the
 effects of the last few transactions.  Patch by Simon, some editorialization
 by Tom.

---
 doc/src/sgml/config.sgml                      |  41 ++-
 doc/src/sgml/wal.sgml                         | 187 ++++++++--
 src/backend/access/transam/README             | 113 +++++-
 src/backend/access/transam/clog.c             |  78 +++-
 src/backend/access/transam/multixact.c        |  24 +-
 src/backend/access/transam/slru.c             | 112 +++++-
 src/backend/access/transam/subtrans.c         |   8 +-
 src/backend/access/transam/transam.c          | 122 +++++--
 src/backend/access/transam/twophase.c         |   8 +-
 src/backend/access/transam/xact.c             |  90 ++++-
 src/backend/access/transam/xlog.c             |  48 ++-
 src/backend/commands/dbcommands.c             |  14 +-
 src/backend/commands/tablespace.c             |  18 +-
 src/backend/commands/vacuum.c                 |  22 +-
 src/backend/utils/init/flatfiles.c            |  10 +-
 src/backend/utils/misc/guc.c                  |  14 +-
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/backend/utils/time/tqual.c                | 334 +++++++++---------
 src/include/access/clog.h                     |   6 +-
 src/include/access/gist_private.h             |   4 +-
 src/include/access/slru.h                     |  25 +-
 src/include/access/transam.h                  |   7 +-
 src/include/access/xact.h                     |   6 +-
 src/include/access/xlog.h                     |   5 +-
 src/include/access/xlogdefs.h                 |   4 +-
 25 files changed, 998 insertions(+), 303 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0e49ba3217..37afd9845a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.133 2007/07/24 04:54:08 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.134 2007/08/01 22:45:07 tgl Exp $ -->
 
 <chapter Id="runtime-config">
   <title>Server Configuration</title>
@@ -1307,6 +1307,13 @@ SET ENABLE_SEQSCAN TO OFF;
         disabling <varname>fsync</varname>.
        </para>
 
+       <para>
+        In many situations, turning off <xref linkend="guc-synchronous-commit">
+        for noncritical transactions can provide much of the potential
+        performance benefit of turning off <varname>fsync</varname>, without
+        the attendant risks of data corruption.
+       </para>
+
        <para>
         This parameter can only be set in the <filename>postgresql.conf</>
         file or on the server command line.
@@ -1315,6 +1322,38 @@ SET ENABLE_SEQSCAN TO OFF;
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry id="guc-synchronous-commit" xreflabel="synchronous_commit">
+      <term><varname>synchronous_commit</varname> (<type>boolean</type>)</term>
+      <indexterm>
+       <primary><varname>synchronous_commit</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Specifies whether transaction commit will wait for WAL records
+        to be written to disk before the command returns a <quote>success</>
+        indication to the client.  The default, and safe, setting is
+        <literal>on</>.  When <literal>off</>, there can be a delay between
+        when success is reported to the client and when the transaction is
+        really guaranteed to be safe against a server crash.  (The maximum
+        delay is three times <xref linkend="guc-wal-writer-delay">.)  Unlike
+        <xref linkend="guc-fsync">, setting this parameter to <literal>off</>
+        does not create any risk of database inconsistency: a crash might
+        result in some recent allegedly-committed transactions being lost, but
+        the database state will be just the same as if those transactions had
+        been aborted cleanly.  So, turning <varname>synchronous_commit</> off
+        can be a useful alternative when performance is more important than
+        exact certainty about the durability of a transaction.  For more
+        discussion see <xref linkend="wal-async-commit">.
+       </para>
+       <para>
+        This parameter can be changed at any time; the behavior for any
+        one transaction is determined by the setting in effect when it
+        commits.  It is therefore possible, and useful, to have some
+        transactions commit synchronously and others asynchronously.
+       </para>
+      </listitem>
+     </varlistentry>
      
      <varlistentry id="guc-wal-sync-method" xreflabel="wal_sync_method">
       <term><varname>wal_sync_method</varname> (<type>string</type>)</term>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index aaf1d0c71e..d1fa700ace 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -1,10 +1,10 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/wal.sgml,v 1.44 2007/06/28 00:02:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/wal.sgml,v 1.45 2007/08/01 22:45:07 tgl Exp $ -->
 
 <chapter id="wal">
  <title>Reliability and the Write-Ahead Log</title>
 
  <para>
-  This chapter explain how the Write-Ahead Log is used to obtain
+  This chapter explains how the Write-Ahead Log is used to obtain
   efficient, reliable operation.
  </para>
 
@@ -71,7 +71,7 @@
    write caches.  At the drive level, disable write-back caching if the
    drive cannot guarantee the data will be written before shutdown.
   </para>
-  
+
   <para>
    Another risk of data loss is posed by the disk platter write
    operations themselves. Disk platters are divided into sectors,
@@ -86,11 +86,11 @@
    disk. By doing this, during crash recovery <productname>PostgreSQL</> can
    restore partially-written pages.  If you have a battery-backed disk
    controller or file-system software that prevents partial page writes
-   (e.g., ReiserFS 4),  you can turn off this page imaging by using the 
+   (e.g., ReiserFS 4),  you can turn off this page imaging by using the
    <xref linkend="guc-full-page-writes"> parameter.
   </para>
  </sect1>
- 
+
   <sect1 id="wal-intro">
    <title>Write-Ahead Logging (<acronym>WAL</acronym>)</title>
 
@@ -105,12 +105,12 @@
 
    <para>
     <firstterm>Write-Ahead Logging</firstterm> (<acronym>WAL</acronym>)
-    is a standard approach to transaction logging.  Its detailed
+    is a standard method for ensuring data integrity.  A detailed
     description can be found in most (if not all) books about
     transaction processing. Briefly, <acronym>WAL</acronym>'s central
     concept is that changes to data files (where tables and indexes
     reside) must be written only after those changes have been logged,
-    that is, when log records describing the changes have been flushed
+    that is, after log records describing the changes have been flushed
     to permanent storage. If we follow this procedure, we do not need
     to flush data pages to disk on every transaction commit, because we
     know that in the event of a crash we will be able to recover the
@@ -120,17 +120,17 @@
    </para>
 
    <para>
-    A major benefit of using <acronym>WAL</acronym> is a
+    Using <acronym>WAL</acronym> results in a
     significantly reduced number of disk writes, because only the log
-    file needs to be flushed to disk at the time of transaction
-    commit, rather than every data file changed by the transaction.
-    In multiuser environments, commits of many transactions
-    can be accomplished with a single <function>fsync</function> of
-    the log file. Furthermore, the log file is written sequentially,
+    file needs to be flushed to disk to guarantee that a transaction is
+    committed, rather than every data file changed by the transaction.
+    The log file is written sequentially,
     and so the cost of syncing the log is much less than the cost of
-    flushing the data pages.   This is especially true for servers
+    flushing the data pages.  This is especially true for servers
     handling many small transactions touching different parts of the data
-    store.
+    store.  Furthermore, when the server is processing many small concurrent
+    transactions, one <function>fsync</function> of the log file may
+    suffice to commit many transactions.
    </para>
 
    <para>
@@ -147,6 +147,139 @@
    </para>
   </sect1>
 
+ <sect1 id="wal-async-commit">
+  <title>Asynchronous Commit</title>
+
+   <indexterm>
+    <primary>synchronous commit</primary>
+   </indexterm>
+
+   <indexterm>
+    <primary>asynchronous commit</primary>
+   </indexterm>
+
+  <para>
+   <firstterm>Asynchronous commit</> is an option that allows transactions
+   to complete more quickly, at the cost that the most recent transactions may
+   be lost if the database should crash.  In many applications this is an
+   acceptable tradeoff.
+  </para>
+
+  <para>
+   As described in the previous section, transaction commit is normally
+   <firstterm>synchronous</>: the server waits for the transaction's
+   <acronym>WAL</acronym> records to be flushed to permanent storage
+   before returning a success indication to the client.  The client is
+   therefore guaranteed that a transaction reported to be committed will
+   be preserved, even in the event of a server crash immediately after.
+   However, for short transactions this delay is a major component of the
+   total transaction time.  Selecting asynchronous commit mode means that
+   the server returns success as soon as the transaction is logically
+   completed, before the <acronym>WAL</acronym> records it generated have
+   actually made their way to disk.  This can provide a significant boost
+   in throughput for small transactions.
+  </para>
+
+  <para>
+   Asynchronous commit introduces the risk of data loss. There is a short
+   time window between the report of transaction completion to the client
+   and the time that the transaction is truly committed (that is, it is
+   guaranteed not to be lost if the server crashes).  Thus asynchronous
+   commit should not be used if the client will take external actions
+   relying on the assumption that the transaction will be remembered.
+   As an example, a bank would certainly not use asynchronous commit for
+   a transaction recording an ATM's dispensing of cash.  But in many
+   scenarios, such as event logging, there is no need for a strong
+   guarantee of this kind.
+  </para>
+
+  <para>
+   The risk that is taken by using asynchronous commit is of data loss,
+   not data corruption.  If the database should crash, it will recover
+   by replaying <acronym>WAL</acronym> up to the last record that was
+   flushed.  The database will therefore be restored to a self-consistent
+   state, but any transactions that were not yet flushed to disk will
+   not be reflected in that state.  The net effect is therefore loss of
+   the last few transactions.  Because the transactions are replayed in
+   commit order, no inconsistency can be introduced &mdash; for example,
+   if transaction B made changes relying on the effects of a previous
+   transaction A, it is not possible for A's effects to be lost while B's
+   effects are preserved.
+  </para>
+
+  <para>
+   The user can select the commit mode of each transaction, so that
+   it is possible to have both synchronous and asynchronous commit
+   transactions running concurrently.  This allows flexible tradeoffs
+   between performance and certainty of transaction durability.
+   The commit mode is controlled by the user-settable parameter
+   <xref linkend="guc-synchronous-commit">, which can be changed in any of
+   the ways that a configuration parameter can be set.  The mode used for
+   any one transaction depends on the value of
+   <varname>synchronous_commit</varname> when transaction commit begins.
+  </para>
+
+  <para>
+   Certain utility commands, for instance <command>DROP TABLE</>, are
+   forced to commit synchronously regardless of the setting of
+   <varname>synchronous_commit</varname>.  This is to ensure consistency
+   between the server's filesystem and the logical state of the database.
+   The commands supporting two-phase commit, such as <command>PREPARE
+   TRANSACTION</>, are also always synchronous.
+  </para>
+
+  <para>
+   If the database crashes during the risk window between an
+   asynchronous commit and the writing of the transaction's
+   <acronym>WAL</acronym> records,
+   then changes made during that transaction <emphasis>will</> be lost.
+   The duration of the
+   risk window is limited because a background process (the <quote>wal
+   writer</>) flushes unwritten <acronym>WAL</acronym> records to disk
+   every <xref linkend="guc-wal-writer-delay"> milliseconds.
+   The actual maximum duration of the risk window is three times
+   <varname>wal_writer_delay</varname> because the wal writer is
+   designed to favor writing whole pages at a time during busy periods.
+  </para>
+
+  <caution>
+   <para>
+    An immediate-mode shutdown is equivalent to a server crash, and will
+    therefore cause loss of any unflushed asynchronous commits.
+   </para>
+  </caution>
+
+  <para>
+   Asynchronous commit provides behavior different from setting
+   <xref linkend="guc-fsync"> = off.
+   <varname>fsync</varname> is a server-wide
+   setting that will alter the behavior of all transactions.  It disables
+   all logic within <productname>PostgreSQL</> that attempts to synchronize
+   writes to different portions of the database, and therefore a system
+   crash (that is, a hardware or operating system crash, not a failure of
+   <productname>PostgreSQL</> itself) could result in arbitrarily bad
+   corruption of the database state.  In many scenarios, asynchronous
+   commit provides most of the performance improvement that could be
+   obtained by turning off <varname>fsync</varname>, but without the risk
+   of data corruption.
+  </para>
+
+  <para>
+   <xref linkend="guc-commit-delay"> also sounds very similar to
+   asynchronous commit, but it is actually a synchronous commit method
+   (in fact, <varname>commit_delay</varname> is ignored during an
+   asynchronous commit).  <varname>commit_delay</varname> causes a delay
+   just before a synchronous commit attempts to flush
+   <acronym>WAL</acronym> to disk, in the hope that a single flush
+   executed by one such transaction can also serve other transactions
+   committing at about the same time.  Setting <varname>commit_delay</varname>
+   can only help when there are many concurrently committing transactions,
+   and it is difficult to tune it to a value that actually helps rather
+   than hurts throughput.
+  </para>
+
+ </sect1>
+
  <sect1 id="wal-configuration">
   <title><acronym>WAL</acronym> Configuration</title>
 
@@ -188,13 +321,13 @@
    <varname>checkpoint_timeout</varname> causes checkpoints to be done
    more often. This allows faster after-crash recovery (since less work
    will need to be redone). However, one must balance this against the
-   increased cost of flushing dirty data pages more often. If 
-   <xref linkend="guc-full-page-writes"> is set (as is the default), there is 
-   another factor to consider. To ensure data page consistency, 
-   the first modification of a data page after each checkpoint results in 
+   increased cost of flushing dirty data pages more often. If
+   <xref linkend="guc-full-page-writes"> is set (as is the default), there is
+   another factor to consider. To ensure data page consistency,
+   the first modification of a data page after each checkpoint results in
    logging the entire page content. In that case,
    a smaller checkpoint interval increases the volume of output to the WAL log,
-   partially negating the goal of using a smaller interval, 
+   partially negating the goal of using a smaller interval,
    and in any case causing more disk I/O.
   </para>
 
@@ -206,8 +339,8 @@
    don't happen too often.  As a simple sanity check on your checkpointing
    parameters, you can set the <xref linkend="guc-checkpoint-warning">
    parameter.  If checkpoints happen closer together than
-   <varname>checkpoint_warning</> seconds, 
-   a message will be output to the server log recommending increasing 
+   <varname>checkpoint_warning</> seconds,
+   a message will be output to the server log recommending increasing
    <varname>checkpoint_segments</varname>.  Occasional appearance of such
    a message is not cause for alarm, but if it appears often then the
    checkpoint control parameters should be increased. Bulk operations such
@@ -280,9 +413,9 @@
    modifying the configuration parameter <xref
    linkend="guc-wal-buffers">.  The default number of <acronym>WAL</acronym>
    buffers is 8.  Increasing this value will
-   correspondingly increase shared memory usage.  When 
-   <xref linkend="guc-full-page-writes"> is set and the system is very busy, 
-   setting this value higher will help smooth response times during the 
+   correspondingly increase shared memory usage.  When
+   <xref linkend="guc-full-page-writes"> is set and the system is very busy,
+   setting this value higher will help smooth response times during the
    period immediately following each checkpoint.
   </para>
 
@@ -307,7 +440,7 @@
   <para>
    The <xref linkend="guc-wal-sync-method"> parameter determines how
    <productname>PostgreSQL</productname> will ask the kernel to force
-    <acronym>WAL</acronym> updates out to disk. 
+    <acronym>WAL</acronym> updates out to disk.
    All the options should be the same as far as reliability goes,
    but it's quite platform-specific which one will be the fastest.
    Note that this parameter is irrelevant if <varname>fsync</varname>
@@ -360,7 +493,7 @@
    The aim of <acronym>WAL</acronym>, to ensure that the log is
    written before database records are altered, can be subverted by
    disk drives<indexterm><primary>disk drive</></> that falsely report a
-   successful write to the kernel, 
+   successful write to the kernel,
    when in fact they have only cached the data and not yet stored it
    on the disk.  A power failure in such a situation might still lead to
    irrecoverable data corruption.  Administrators should try to ensure
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index cec93e6f76..6e7e132aca 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.5 2006/03/31 23:32:05 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.6 2007/08/01 22:45:07 tgl Exp $
 
 The Transaction System
 ----------------------
@@ -409,4 +409,113 @@ two separate WAL records.  The replay code has to remember "unfinished" split
 operations, and match them up to subsequent insertions in the parent level.
 If no matching insert has been found by the time the WAL replay ends, the
 replay code has to do the insertion on its own to restore the index to
-consistency.
+consistency.  Such insertions occur after WAL is operational, so they can
+and should write WAL records for the additional generated actions.
+
+
+Asynchronous Commit
+-------------------
+
+As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e.,
+we don't wait while the WAL record for the commit is fsync'ed.
+We perform an asynchronous commit when synchronous_commit = off.  Instead
+of performing an XLogFlush() up to the LSN of the commit, we merely note
+the LSN in shared memory.  The backend then continues with other work.
+We record the LSN only for an asynchronous commit, not an abort; there's
+never any need to flush an abort record, since the presumption after a
+crash would be that the transaction aborted anyway.
+
+We always force synchronous commit when the transaction is deleting
+relations, to ensure the commit record is down to disk before the relations
+are removed from the filesystem.  Also, certain utility commands that have
+non-roll-backable side effects (such as filesystem changes) force sync
+commit to minimize the window in which the filesystem change has been made
+but the transaction isn't guaranteed committed.
+
+Every wal_writer_delay milliseconds, the walwriter process performs an
+XLogBackgroundFlush().  This checks the location of the last completely
+filled WAL page.  If that has moved forwards, then we write all the changed
+buffers up to that point, so that under full load we write only whole
+buffers.  If there has been a break in activity and the current WAL page is
+the same as before, then we find out the LSN of the most recent
+asynchronous commit, and flush up to that point, if required (i.e.,
+if it's in the current WAL page).  This arrangement in itself would
+guarantee that an async commit record reaches disk no later than the
+second walwriter cycle after the transaction completes.  However, we also
+allow XLogFlush to flush full buffers "flexibly" (i.e., not wrapping around
+at the end of the circular WAL buffer area), so as to minimize the number
+of writes issued under high load when multiple WAL pages are filled per
+walwriter cycle.  This makes the worst-case delay three walwriter cycles.
+
+There are some other subtle points to consider with asynchronous commits.
+First, for each page of CLOG we must remember the LSN of the latest commit
+affecting the page, so that we can enforce the same flush-WAL-before-write
+rule that we do for ordinary relation pages.  Otherwise the record of the
+commit might reach disk before the WAL record does.  Again, abort records
+need not factor into this consideration.
+
+In fact, we store more than one LSN for each clog page.  This relates to
+the way we set transaction status hint bits during visibility tests.
+We must not set a transaction-committed hint bit on a relation page and
+have that record make it to disk prior to the WAL record of the commit.
+Since visibility tests are normally made while holding buffer share locks,
+we do not have the option of changing the page's LSN to guarantee WAL
+synchronization.  Instead, we defer the setting of the hint bit if we have
+not yet flushed WAL as far as the LSN associated with the transaction.
+This requires tracking the LSN of each unflushed async commit.  It is
+convenient to associate this data with clog buffers: because we will flush
+WAL before writing a clog page, we know that we do not need to remember a
+transaction's LSN longer than the clog page holding its commit status
+remains in memory.  However, the naive approach of storing an LSN for each
+clog position is unattractive: the LSNs are 32x bigger than the two-bit
+commit status fields, and so we'd need 256K of additional shared memory for
+each 8K clog buffer page.  We choose instead to store a smaller number of
+LSNs per page, where each LSN is the highest LSN associated with any
+transaction commit in a contiguous range of transaction IDs on that page.
+This saves storage at the price of some possibly-unnecessary delay in
+setting transaction hint bits.
+
+How many transactions should share the same cached LSN (N)?  If the
+system's workload consists only of small async-commit transactions, then
+it's reasonable to have N similar to the number of transactions per
+walwriter cycle, since that is the granularity with which transactions will
+become truly committed (and thus hintable) anyway.  The worst case is where
+a sync-commit xact shares a cached LSN with an async-commit xact that
+commits a bit later; even though we paid to sync the first xact to disk,
+we won't be able to hint its outputs until the second xact is sync'd, up to
+three walwriter cycles later.  This argues for keeping N (the group size)
+as small as possible.  For the moment we are setting the group size to 32,
+which makes the LSN cache space the same size as the actual clog buffer
+space (independently of BLCKSZ).
+
+It is useful that we can run both synchronous and asynchronous commit
+transactions concurrently, but the safety of this is perhaps not
+immediately obvious.  Assume we have two transactions, T1 and T2.  The Log
+Sequence Number (LSN) is the point in the WAL sequence where a transaction
+commit is recorded, so LSN1 and LSN2 locate the commit records of those
+transactions.  If T2 can see changes made by T1 then when T2 commits it
+must be true that LSN2 follows LSN1.  Thus when T2 commits it is certain
+that all of the changes made by T1 are also now recorded in the WAL.  This
+is true whether T1 was asynchronous or synchronous.  As a result, it is
+safe for asynchronous commits and synchronous commits to work concurrently
+without endangering data written by synchronous commits.  Sub-transactions
+are not important here since the final write to disk only occurs at the
+commit of the top level transaction.
+
+Changes to data blocks cannot reach disk unless WAL is flushed up to the
+point of the LSN of the data blocks.  Any attempt to write unsafe data to
+disk will first trigger a WAL flush, which ensures the safety of all data
+written by that and prior transactions.  Data blocks and clog pages are both
+protected by LSNs.
+
+Changes to a temp table are not WAL-logged, hence could reach disk in
+advance of T1's commit, but we don't care since temp table contents don't
+survive crashes anyway.
+
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe.  In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update.  However, all these paths are designed to write data that
+no other transaction can see until after T1 commits.  The situation is thus
+not different from ordinary WAL-logged updates.
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 5bafef1be3..9665d12954 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -14,17 +14,19 @@
  * CLOG page is initialized to zeroes.	Other writes of CLOG come from
  * recording of transaction commit or abort in xact.c, which generates its
  * own XLOG records for these events and will re-perform the status update
- * on redo; so we need make no additional XLOG entry here.	Also, the XLOG
- * is guaranteed flushed through the XLOG commit record before we are called
- * to log a commit, so the WAL rule "write xlog before data" is satisfied
- * automatically for commits, and we don't really care for aborts.  Therefore,
- * we don't need to mark CLOG pages with LSN information; we have enough
- * synchronization already.
+ * on redo; so we need make no additional XLOG entry here.  For synchronous
+ * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
+ * record before we are called to log a commit, so the WAL rule "write xlog
+ * before data" is satisfied automatically.  However, for async commits we
+ * must track the latest LSN affecting each CLOG page, so that we can flush
+ * XLOG that far and satisfy the WAL rule.  We don't have to worry about this
+ * for aborts (whether sync or async), since the post-crash assumption would
+ * be that such transactions failed anyway.
  *
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.42 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.43 2007/08/01 22:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,6 +59,13 @@
 #define TransactionIdToByte(xid)	(TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
 #define TransactionIdToBIndex(xid)	((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
 
+/* We store the latest async LSN for each group of transactions */
+#define CLOG_XACTS_PER_LSN_GROUP	32		/* keep this a power of 2 */
+#define CLOG_LSNS_PER_PAGE  (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+#define GetLSNIndex(slotno, xid)	((slotno) * CLOG_LSNS_PER_PAGE + \
+	((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
+
 
 /*
  * Link to shared-memory data structures for CLOG control
@@ -75,11 +84,16 @@ static void WriteTruncateXlogRec(int pageno);
 /*
  * Record the final state of a transaction in the commit log.
  *
+ * lsn must be the WAL location of the commit record when recording an async
+ * commit.  For a synchronous commit it can be InvalidXLogRecPtr, since the
+ * caller guarantees the commit record is already flushed in that case.  It
+ * should be InvalidXLogRecPtr for abort cases, too.
+ *
  * NB: this is a low-level routine and is NOT the preferred entry point
  * for most uses; TransactionLogUpdate() in transam.c is the intended caller.
  */
 void
-TransactionIdSetStatus(TransactionId xid, XidStatus status)
+TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
 {
 	int			pageno = TransactionIdToPage(xid);
 	int			byteno = TransactionIdToByte(xid);
@@ -94,7 +108,16 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 
 	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
-	slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+	/*
+	 * If we're doing an async commit (ie, lsn is valid), then we must wait
+	 * for any active write on the page slot to complete.  Otherwise our
+	 * update could reach disk in that write, which will not do since we
+	 * mustn't let it reach disk until we've done the appropriate WAL flush.
+	 * But when lsn is invalid, it's OK to scribble on a page while it is
+	 * write-busy, since we don't care if the update reaches disk sooner than
+	 * we think.  Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
+	 */
+	slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
 	byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
 
 	/* Current state should be 0, subcommitted or target state */
@@ -110,22 +133,48 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 
 	ClogCtl->shared->page_dirty[slotno] = true;
 
+	/*
+	 * Update the group LSN if the transaction completion LSN is higher.
+	 *
+	 * Note: lsn will be invalid when supplied during InRecovery processing,
+	 * so we don't need to do anything special to avoid LSN updates during
+	 * recovery. After recovery completes the next clog change will set the
+	 * LSN correctly.
+	 */
+	if (!XLogRecPtrIsInvalid(lsn))
+	{
+		int			lsnindex = GetLSNIndex(slotno, xid);
+
+		if (XLByteLT(ClogCtl->shared->group_lsn[lsnindex], lsn))
+			ClogCtl->shared->group_lsn[lsnindex] = lsn;
+	}
+
 	LWLockRelease(CLogControlLock);
 }
 
 /*
  * Interrogate the state of a transaction in the commit log.
  *
+ * Aside from the actual commit status, this function returns (into *lsn)
+ * an LSN that is late enough to be able to guarantee that if we flush up to
+ * that LSN then we will have flushed the transaction's commit record to disk.
+ * The result is not necessarily the exact LSN of the transaction's commit
+ * record!  For example, for long-past transactions (those whose clog pages
+ * already migrated to disk), we'll return InvalidXLogRecPtr.  Also, because
+ * we group transactions on the same clog page to conserve storage, we might
+ * return the LSN of a later transaction that falls into the same group.
+ *
  * NB: this is a low-level routine and is NOT the preferred entry point
  * for most uses; TransactionLogFetch() in transam.c is the intended caller.
  */
 XidStatus
-TransactionIdGetStatus(TransactionId xid)
+TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
 {
 	int			pageno = TransactionIdToPage(xid);
 	int			byteno = TransactionIdToByte(xid);
 	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
 	int			slotno;
+	int			lsnindex;
 	char	   *byteptr;
 	XidStatus	status;
 
@@ -136,6 +185,9 @@ TransactionIdGetStatus(TransactionId xid)
 
 	status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
 
+	lsnindex = GetLSNIndex(slotno, xid);
+	*lsn = ClogCtl->shared->group_lsn[lsnindex];
+
 	LWLockRelease(CLogControlLock);
 
 	return status;
@@ -148,14 +200,14 @@ TransactionIdGetStatus(TransactionId xid)
 Size
 CLOGShmemSize(void)
 {
-	return SimpleLruShmemSize(NUM_CLOG_BUFFERS);
+	return SimpleLruShmemSize(NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE);
 }
 
 void
 CLOGShmemInit(void)
 {
 	ClogCtl->PagePrecedes = CLOGPagePrecedes;
-	SimpleLruInit(ClogCtl, "CLOG Ctl", NUM_CLOG_BUFFERS,
+	SimpleLruInit(ClogCtl, "CLOG Ctl", NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE,
 				  CLogControlLock, "pg_clog");
 }
 
@@ -240,7 +292,7 @@ StartupCLOG(void)
 		int			slotno;
 		char	   *byteptr;
 
-		slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+		slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid);
 		byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
 
 		/* Zero so-far-unused positions in the current byte */
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 704bf6a0ba..3ce6f14bcf 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.23 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.24 2007/08/01 22:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -749,7 +749,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	 * enough that a MultiXactId is really involved.  Perhaps someday we'll
 	 * take the trouble to generalize the slru.c error reporting code.
 	 */
-	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
@@ -773,7 +773,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 
 		if (pageno != prev_pageno)
 		{
-			slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi);
+			slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
 			prev_pageno = pageno;
 		}
 
@@ -993,7 +993,7 @@ retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
-	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+	slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 	offset = *offptr;
@@ -1025,7 +1025,7 @@ retry:
 		entryno = MultiXactIdToOffsetEntry(tmpMXact);
 
 		if (pageno != prev_pageno)
-			slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, tmpMXact);
+			slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
 
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
@@ -1061,7 +1061,7 @@ retry:
 
 		if (pageno != prev_pageno)
 		{
-			slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi);
+			slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
 			prev_pageno = pageno;
 		}
 
@@ -1289,8 +1289,8 @@ MultiXactShmemSize(void)
 			 mul_size(sizeof(MultiXactId) * 2, MaxBackends))
 
 	size = SHARED_MULTIXACT_STATE_SIZE;
-	size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS));
-	size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS));
+	size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
 
 	return size;
 }
@@ -1306,10 +1306,10 @@ MultiXactShmemInit(void)
 	MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
 
 	SimpleLruInit(MultiXactOffsetCtl,
-				  "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS,
+				  "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS, 0,
 				  MultiXactOffsetControlLock, "pg_multixact/offsets");
 	SimpleLruInit(MultiXactMemberCtl,
-				  "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS,
+				  "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS, 0,
 				  MultiXactMemberControlLock, "pg_multixact/members");
 
 	/* Initialize our shared state struct */
@@ -1442,7 +1442,7 @@ StartupMultiXact(void)
 		int			slotno;
 		MultiXactOffset *offptr;
 
-		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
 
@@ -1472,7 +1472,7 @@ StartupMultiXact(void)
 		int			slotno;
 		TransactionId *xidptr;
 
-		slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, offset);
+		slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
 		xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno];
 		xidptr += entryno;
 
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index e68ed7e331..bf3990bc29 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -41,7 +41,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.40 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.41 2007/08/01 22:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -140,6 +140,8 @@ static SlruErrorCause slru_errcause;
 static int	slru_errno;
 
 
+static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
+static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
 static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
 static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
 					  SlruFlush fdata);
@@ -152,7 +154,7 @@ static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);
  */
 
 Size
-SimpleLruShmemSize(int nslots)
+SimpleLruShmemSize(int nslots, int nlsns)
 {
 	Size		sz;
 
@@ -165,18 +167,21 @@ SimpleLruShmemSize(int nslots)
 	sz += MAXALIGN(nslots * sizeof(int));		/* page_lru_count[] */
 	sz += MAXALIGN(nslots * sizeof(LWLockId));	/* buffer_locks[] */
 
+	if (nlsns > 0)
+		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
+
 	return BUFFERALIGN(sz) + BLCKSZ * nslots;
 }
 
 void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 			  LWLockId ctllock, const char *subdir)
 {
 	SlruShared	shared;
 	bool		found;
 
 	shared = (SlruShared) ShmemInitStruct(name,
-										  SimpleLruShmemSize(nslots),
+										  SimpleLruShmemSize(nslots, nlsns),
 										  &found);
 
 	if (!IsUnderPostmaster)
@@ -193,6 +198,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
 		shared->ControlLock = ctllock;
 
 		shared->num_slots = nslots;
+		shared->lsn_groups_per_page = nlsns;
 
 		shared->cur_lru_count = 0;
 
@@ -212,8 +218,14 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
 		offset += MAXALIGN(nslots * sizeof(int));
 		shared->buffer_locks = (LWLockId *) (ptr + offset);
 		offset += MAXALIGN(nslots * sizeof(LWLockId));
-		ptr += BUFFERALIGN(offset);
 
+		if (nlsns > 0)
+		{
+			shared->group_lsn = (XLogRecPtr *) (ptr + offset);
+			offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
+		}
+
+		ptr += BUFFERALIGN(offset);
 		for (slotno = 0; slotno < nslots; slotno++)
 		{
 			shared->page_buffer[slotno] = ptr;
@@ -266,15 +278,37 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
 	/* Set the buffer to zeroes */
 	MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
 
+	/* Set the LSNs for this new page to zero */
+	SimpleLruZeroLSNs(ctl, slotno);
+
 	/* Assume this page is now the latest active page */
 	shared->latest_page_number = pageno;
 
 	return slotno;
 }
 
+/*
+ * Zero all the LSNs stored for this slru page (a no-op if none are kept).
+ *
+ * This should be called each time we create a new page, and each time we read
+ * in a page from disk into an existing buffer.  (Such an old page cannot
+ * have any interesting LSNs, since we'd have flushed them before writing
+ * the page in the first place.)
+ */
+static void
+SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
+{
+	SlruShared	shared = ctl->shared;
+
+	if (shared->lsn_groups_per_page > 0)
+		MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
+			   shared->lsn_groups_per_page * sizeof(XLogRecPtr));
+}
+
 /*
  * Wait for any active I/O on a page slot to finish.  (This does not
- * guarantee that new I/O hasn't been started before we return, though.)
+ * guarantee that new I/O hasn't been started before we return, though.
+ * In fact the slot might not even contain the same page anymore.)
  *
  * Control lock must be held at entry, and will be held at exit.
  */
@@ -305,8 +339,7 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
 			/* indeed, the I/O must have failed */
 			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
 				shared->page_status[slotno] = SLRU_PAGE_EMPTY;
-			else
-				/* write_in_progress */
+			else				/* write_in_progress */
 			{
 				shared->page_status[slotno] = SLRU_PAGE_VALID;
 				shared->page_dirty[slotno] = true;
@@ -320,6 +353,11 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
  * Find a page in a shared buffer, reading it in if necessary.
  * The page number must correspond to an already-initialized page.
  *
+ * If write_ok is true then it is OK to return a page that is in
+ * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
+ * that modification of the page is safe.  If write_ok is false then we
+ * will not return the page until it is not undergoing active I/O.
+ *
  * The passed-in xid is used only for error reporting, and may be
  * InvalidTransactionId if no specific xid is associated with the action.
  *
@@ -329,7 +367,8 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
  * Control lock must be held at entry, and will be held at exit.
  */
 int
-SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
+SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+				  TransactionId xid)
 {
 	SlruShared	shared = ctl->shared;
 
@@ -346,8 +385,13 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
 		if (shared->page_number[slotno] == pageno &&
 			shared->page_status[slotno] != SLRU_PAGE_EMPTY)
 		{
-			/* If page is still being read in, we must wait for I/O */
-			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
+			/*
+			 * If page is still being read in, we must wait for I/O.  Likewise
+			 * if the page is being written and the caller said that's not OK.
+			 */
+			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
+				(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
+				 !write_ok))
 			{
 				SimpleLruWaitIO(ctl, slotno);
 				/* Now we must recheck state from the top */
@@ -383,6 +427,9 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
 		/* Do the read */
 		ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
+		/* Set the LSNs for this newly read-in page to zero */
+		SimpleLruZeroLSNs(ctl, slotno);
+
 		/* Re-acquire control lock and update page state */
 		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
@@ -443,7 +490,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
 	LWLockRelease(shared->ControlLock);
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
-	return SimpleLruReadPage(ctl, pageno, xid);
+	return SimpleLruReadPage(ctl, pageno, true, xid);
 }
 
 /*
@@ -621,6 +668,47 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	char		path[MAXPGPATH];
 	int			fd = -1;
 
+	/*
+	 * Honor the write-WAL-before-data rule, if appropriate, so that we do
+	 * not write out data before associated WAL records.  This is the same
+	 * action performed during FlushBuffer() in the main buffer manager.
+	 */
+	if (shared->group_lsn != NULL)
+	{
+		/*
+		 * We must determine the largest async-commit LSN for the page.
+		 * This is a bit tedious, but since this entire function is a slow
+		 * path anyway, it seems better to do this here than to maintain
+		 * a per-page LSN variable (which'd need an extra comparison in the
+		 * transaction-commit path).
+		 */
+		XLogRecPtr	max_lsn;
+		int			lsnindex, lsnoff;
+
+		lsnindex = slotno * shared->lsn_groups_per_page;
+		max_lsn = shared->group_lsn[lsnindex++];
+		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
+		{
+			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];
+
+			if (XLByteLT(max_lsn, this_lsn))
+				max_lsn = this_lsn;
+		}
+
+		if (!XLogRecPtrIsInvalid(max_lsn))
+		{
+			/*
+			 * As noted above, elog(ERROR) is not acceptable here, so if
+			 * XLogFlush were to fail, we must PANIC.  This isn't much of
+			 * a restriction because XLogFlush is just about all critical
+			 * section anyway, but let's make sure.
+			 */
+			START_CRIT_SECTION();
+			XLogFlush(max_lsn);
+			END_CRIT_SECTION();
+		}
+	}
+
 	/*
 	 * During a Flush, we may already have the desired file open.
 	 */
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 6205d43820..b3836c5231 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -22,7 +22,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.18 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.19 2007/08/01 22:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -78,7 +78,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
 
 	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-	slotno = SimpleLruReadPage(SubTransCtl, pageno, xid);
+	slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
 	ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
 	ptr += entryno;
 
@@ -165,14 +165,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
 Size
 SUBTRANSShmemSize(void)
 {
-	return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
+	return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
 }
 
 void
 SUBTRANSShmemInit(void)
 {
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
-	SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", NUM_SUBTRANS_BUFFERS,
+	SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", NUM_SUBTRANS_BUFFERS, 0,
 				  SubtransControlLock, "pg_subtrans");
 	/* Override default assumption that writes should be fsync'd */
 	SubTransCtl->do_fsync = false;
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index c2ad0c11a0..3466b50ef2 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.69 2007/01/05 22:19:23 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.70 2007/08/01 22:45:07 tgl Exp $
  *
  * NOTES
  *	  This file contains the high level access-method interface to the
@@ -27,14 +27,17 @@
 
 static XidStatus TransactionLogFetch(TransactionId transactionId);
 static void TransactionLogUpdate(TransactionId transactionId,
-					 XidStatus status);
+					 XidStatus status, XLogRecPtr lsn);
 
-/* ----------------
- *		Single-item cache for results of TransactionLogFetch.
- * ----------------
+/*
+ * Single-item cache for results of TransactionLogFetch.
  */
 static TransactionId cachedFetchXid = InvalidTransactionId;
 static XidStatus cachedFetchXidStatus;
+static XLogRecPtr cachedCommitLSN;
+
+/* Handy constant for an invalid xlog recptr */
+static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
 
 
 /* ----------------------------------------------------------------
@@ -52,6 +55,7 @@ static XidStatus
 TransactionLogFetch(TransactionId transactionId)
 {
 	XidStatus	xidstatus;
+	XLogRecPtr	xidlsn;
 
 	/*
 	 * Before going to the commit log manager, check our single item cache to
@@ -73,9 +77,9 @@ TransactionLogFetch(TransactionId transactionId)
 	}
 
 	/*
-	 * Get the status.
+	 * Get the transaction status.
 	 */
-	xidstatus = TransactionIdGetStatus(transactionId);
+	xidstatus = TransactionIdGetStatus(transactionId, &xidlsn);
 
 	/*
 	 * DO NOT cache status for unfinished or sub-committed transactions! We
@@ -84,8 +88,9 @@ TransactionLogFetch(TransactionId transactionId)
 	if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
 		xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
 	{
-		TransactionIdStore(transactionId, &cachedFetchXid);
+		cachedFetchXid = transactionId;
 		cachedFetchXidStatus = xidstatus;
+		cachedCommitLSN = xidlsn;
 	}
 
 	return xidstatus;
@@ -93,16 +98,19 @@ TransactionLogFetch(TransactionId transactionId)
 
 /* --------------------------------
  *		TransactionLogUpdate
+ *
+ * Store the new status of a transaction.  The commit record LSN must be
+ * passed when recording an async commit; else it should be InvalidXLogRecPtr.
  * --------------------------------
  */
-static void
-TransactionLogUpdate(TransactionId transactionId,		/* trans id to update */
-					 XidStatus status)	/* new trans status */
+static inline void
+TransactionLogUpdate(TransactionId transactionId,
+					 XidStatus status, XLogRecPtr lsn)
 {
 	/*
 	 * update the commit log
 	 */
-	TransactionIdSetStatus(transactionId, status);
+	TransactionIdSetStatus(transactionId, status, lsn);
 }
 
 /*
@@ -111,15 +119,16 @@ TransactionLogUpdate(TransactionId transactionId,		/* trans id to update */
  * Update multiple transaction identifiers to a given status.
  * Don't depend on this being atomic; it's not.
  */
-static void
-TransactionLogMultiUpdate(int nxids, TransactionId *xids, XidStatus status)
+static inline void
+TransactionLogMultiUpdate(int nxids, TransactionId *xids,
+						  XidStatus status, XLogRecPtr lsn)
 {
 	int			i;
 
 	Assert(nxids != 0);
 
 	for (i = 0; i < nxids; i++)
-		TransactionIdSetStatus(xids[i], status);
+		TransactionIdSetStatus(xids[i], status, lsn);
 }
 
 /* ----------------------------------------------------------------
@@ -269,31 +278,49 @@ TransactionIdDidAbort(TransactionId transactionId)
 void
 TransactionIdCommit(TransactionId transactionId)
 {
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED);
+	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED,
+						 InvalidXLogRecPtr);
+}
+
+/*
+ * TransactionIdAsyncCommit
+ *		Async variant of TransactionIdCommit; lsn is the commit record's LSN.
+ */
+void
+TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn)
+{
+	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED, lsn);
+}
 
+
 /*
  * TransactionIdAbort
  *		Aborts the transaction associated with the identifier.
  *
  * Note:
  *		Assumes transaction identifier is valid.
+ *		No async version of this is needed.
  */
 void
 TransactionIdAbort(TransactionId transactionId)
 {
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED);
+	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED,
+						 InvalidXLogRecPtr);
 }
 
 /*
  * TransactionIdSubCommit
  *		Marks the subtransaction associated with the identifier as
  *		sub-committed.
+ *
+ * Note:
+ *		No async version of this is needed.
  */
 void
 TransactionIdSubCommit(TransactionId transactionId)
 {
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED);
+	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED,
+						 InvalidXLogRecPtr);
 }
 
 /*
@@ -309,9 +336,23 @@ void
 TransactionIdCommitTree(int nxids, TransactionId *xids)
 {
 	if (nxids > 0)
-		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED);
+		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
+								  InvalidXLogRecPtr);
 }
 
+/*
+ * TransactionIdAsyncCommitTree
+ *		Async variant of TransactionIdCommitTree; lsn is the commit record's LSN.
+ */
+void
+TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn)
+{
+	if (nxids > 0)
+		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
+								  lsn);
+}
+
+
 /*
  * TransactionIdAbortTree
  *		Marks all the given transaction ids as aborted.
@@ -323,7 +364,8 @@ void
 TransactionIdAbortTree(int nxids, TransactionId *xids)
 {
 	if (nxids > 0)
-		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED);
+		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED,
+								  InvalidXLogRecPtr);
 }
 
 /*
@@ -389,3 +431,43 @@ TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
 	diff = (int32) (id1 - id2);
 	return (diff >= 0);
 }
+
+/*
+ * TransactionIdGetCommitLSN
+ *
+ * This function returns an LSN that is late enough to be able
+ * to guarantee that if we flush up to the LSN returned then we
+ * will have flushed the transaction's commit record to disk.
+ *
+ * The result is not necessarily the exact LSN of the transaction's
+ * commit record!  For example, for long-past transactions (those whose
+ * clog pages already migrated to disk), we'll return InvalidXLogRecPtr.
+ * Also, because we group transactions on the same clog page to conserve
+ * storage, we might return the LSN of a later transaction that falls into
+ * the same group.
+ */
+XLogRecPtr
+TransactionIdGetCommitLSN(TransactionId xid)
+{
+	XLogRecPtr	result;
+
+	/*
+	 * Currently, all uses of this function are for xids that were just
+	 * reported to be committed by TransactionLogFetch, so we expect that
+	 * checking TransactionLogFetch's cache will usually succeed and avoid an
+	 * extra trip to shared memory.
+	 */
+	if (TransactionIdEquals(xid, cachedFetchXid))
+		return cachedCommitLSN;
+
+	/* Special XIDs are always known committed */
+	if (!TransactionIdIsNormal(xid))
+		return InvalidXLogRecPtr;
+
+	/*
+	 * Fetch the LSN from clog; the status it also returns is not needed here.
+	 */
+	(void) TransactionIdGetStatus(xid, &result);
+
+	return result;
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 7fdf5a7eed..2ae81e823d 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.31 2007/05/27 03:50:39 tgl Exp $
+ *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.32 2007/08/01 22:45:07 tgl Exp $
  *
  * NOTES
  *		Each global transaction is associated with a global transaction
@@ -1706,7 +1706,11 @@ RecordTransactionCommitPrepared(TransactionId xid,
 						XLOG_XACT_COMMIT_PREPARED | XLOG_NO_TRAN,
 						rdata);
 
-	/* we don't currently try to sleep before flush here ... */
+	/*
+	 * We don't currently try to sleep before flush here ... nor is there
+	 * any support for async commit of a prepared xact (the very idea is
+	 * probably a contradiction)
+	 */
 
 	/* Flush XLOG to disk */
 	XLogFlush(recptr);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 72a7cf40a6..117525b5ac 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.245 2007/06/07 21:45:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.246 2007/08/01 22:45:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,6 +55,8 @@ int			XactIsoLevel;
 bool		DefaultXactReadOnly = false;
 bool		XactReadOnly;
 
+bool		XactSyncCommit = true;
+
 int			CommitDelay = 0;	/* precommit delay in microseconds */
 int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 
@@ -174,6 +176,11 @@ static TimestampTz xactStopTimestamp;
  */
 static char *prepareGID;
 
+/*
+ * Some commands want to force synchronous commit.
+ */
+static bool forceSyncCommit = false;
+
 /*
  * Private context for transaction-abort work --- we reserve space for this
  * at startup to ensure that AbortTransaction and AbortSubTransaction can work
@@ -554,6 +561,18 @@ CommandCounterIncrement(void)
 	AtStart_Cache();
 }
 
+/*
+ * ForceSyncCommit
+ *
+ * Interface routine to allow commands to force a synchronous commit of the
+ * current top-level transaction.  StartTransaction clears the flag again.
+ */
+void
+ForceSyncCommit(void)
+{
+	forceSyncCommit = true;
+}
+
 
 /* ----------------------------------------------------------------
  *						StartTransaction stuff
@@ -724,6 +743,7 @@ RecordTransactionCommit(void)
 	{
 		TransactionId xid = GetCurrentTransactionId();
 		bool		madeTCentries;
+		bool		isAsyncCommit = false;
 		XLogRecPtr	recptr;
 
 		/* Tell bufmgr and smgr to prepare for commit */
@@ -810,21 +830,44 @@ RecordTransactionCommit(void)
 		if (MyXactMadeXLogEntry)
 		{
 			/*
-			 * Sleep before flush! So we can flush more than one commit
-			 * records per single fsync.  (The idea is some other backend may
-			 * do the XLogFlush while we're sleeping.  This needs work still,
-			 * because on most Unixen, the minimum select() delay is 10msec or
-			 * more, which is way too long.)
-			 *
-			 * We do not sleep if enableFsync is not turned on, nor if there
-			 * are fewer than CommitSiblings other backends with active
-			 * transactions.
+			 * If the user has set synchronous_commit = off, and we're
+			 * not doing cleanup of any rels nor committing any command
+			 * that wanted to force sync commit, then we can defer fsync.
 			 */
-			if (CommitDelay > 0 && enableFsync &&
-				CountActiveBackends() >= CommitSiblings)
-				pg_usleep(CommitDelay);
+			if (XactSyncCommit || forceSyncCommit || nrels > 0)
+			{
+				/*
+				 * Synchronous commit case.
+				 *
+				 * Sleep before flush! So we can flush more than one commit
+				 * record per single fsync.  (The idea is some other backend
+				 * may do the XLogFlush while we're sleeping.  This needs work
+				 * still, because on most Unixen, the minimum select() delay
+				 * is 10msec or more, which is way too long.)
+				 *
+				 * We do not sleep if enableFsync is not turned on, nor if
+				 * there are fewer than CommitSiblings other backends with
+				 * active transactions.
+				 */
+				if (CommitDelay > 0 && enableFsync &&
+					CountActiveBackends() >= CommitSiblings)
+					pg_usleep(CommitDelay);
 
-			XLogFlush(recptr);
+				XLogFlush(recptr);
+			}
+			else
+			{
+				/*
+				 * Asynchronous commit case.
+				 */
+				isAsyncCommit = true;
+
+				/*
+				 * Report the latest async commit LSN, so that
+				 * the WAL writer knows to flush this commit.
+				 */
+				XLogSetAsyncCommitLSN(recptr);
+			}
 		}
 
 		/*
@@ -835,12 +878,24 @@ RecordTransactionCommit(void)
 		 * emitted an XLOG record for our commit, and so in the event of a
 		 * crash the clog update might be lost.  This is okay because no one
 		 * else will ever care whether we committed.
+		 *
+		 * The recptr here refers to the last xlog entry by this transaction
+		 * so is the correct value to use for setting the clog.
 		 */
 		if (madeTCentries || MyXactMadeTempRelUpdate)
 		{
-			TransactionIdCommit(xid);
-			/* to avoid race conditions, the parent must commit first */
-			TransactionIdCommitTree(nchildren, children);
+			if (isAsyncCommit)
+			{
+				TransactionIdAsyncCommit(xid, recptr);
+				/* to avoid race conditions, the parent must commit first */
+				TransactionIdAsyncCommitTree(nchildren, children, recptr);
+			}
+			else
+			{
+				TransactionIdCommit(xid);
+				/* to avoid race conditions, the parent must commit first */
+				TransactionIdCommitTree(nchildren, children);
+			}
 		}
 
 		/* Checkpoint can proceed now */
@@ -1406,6 +1461,7 @@ StartTransaction(void)
 	FreeXactSnapshot();
 	XactIsoLevel = DefaultXactIsoLevel;
 	XactReadOnly = DefaultXactReadOnly;
+	forceSyncCommit = false;
 
 	/*
 	 * reinitialize within-transaction counters
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 25789ddaa6..4c7024baa3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.275 2007/07/24 04:54:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.276 2007/08/01 22:45:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -305,6 +305,7 @@ typedef struct XLogCtlData
 	XLogwrtResult LogwrtResult;
 	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
 	TransactionId ckptXid;
+	XLogRecPtr	asyncCommitLSN;	/* LSN of newest async commit */
 
 	/* Protected by WALWriteLock: */
 	XLogCtlWrite Write;
@@ -1643,6 +1644,22 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
 	Write->LogwrtResult = LogwrtResult;
 }
 
+/*
+ * Record the LSN of an asynchronous commit; the stored value only advances.
+ * (This should not be called for aborts, nor for synchronous commits.)
+ */
+void
+XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
+
+	SpinLockAcquire(&xlogctl->info_lck);
+	if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
+		xlogctl->asyncCommitLSN = asyncCommitLSN;
+	SpinLockRelease(&xlogctl->info_lck);
+}
+
 /*
  * Ensure that all XLOG data through the given position is flushed to disk.
  *
@@ -1797,19 +1814,17 @@ XLogBackgroundFlush(void)
 	/* back off to last completed page boundary */
 	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
 
-#ifdef NOT_YET					/* async commit patch is still to come */
 	/* if we have already flushed that far, consider async commit records */
 	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
 	{
 		/* use volatile pointer to prevent code rearrangement */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
-		SpinLockAcquire(&xlogctl->async_commit_lck);
+		SpinLockAcquire(&xlogctl->info_lck);
 		WriteRqstPtr = xlogctl->asyncCommitLSN;
-		SpinLockRelease(&xlogctl->async_commit_lck);
+		SpinLockRelease(&xlogctl->info_lck);
 		flexible = false;		/* ensure it all gets written */
 	}
-#endif
 
 	/* Done if already known flushed */
 	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
@@ -1841,6 +1856,23 @@ XLogBackgroundFlush(void)
 	END_CRIT_SECTION();
 }
 
+/*
+ * Flush any previous asynchronously-committed transactions' commit records.
+ */
+void
+XLogAsyncCommitFlush(void)
+{
+	XLogRecPtr	WriteRqstPtr;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
+
+	SpinLockAcquire(&xlogctl->info_lck);
+	WriteRqstPtr = xlogctl->asyncCommitLSN;		/* newest async commit */
+	SpinLockRelease(&xlogctl->info_lck);
+
+	XLogFlush(WriteRqstPtr);
+}
+
 /*
  * Test whether XLOG data has been flushed up to (at least) the given position.
  *
@@ -5466,7 +5498,7 @@ ShutdownXLOG(int code, Datum arg)
 			(errmsg("database system is shut down")));
 }
 
-/* 
+/*
  * Log start of a checkpoint.
  */
 static void
@@ -5481,7 +5513,7 @@ LogCheckpointStart(int flags)
 		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
 }
 
-/* 
+/*
  * Log end of a checkpoint.
  */
 static void
@@ -5523,7 +5555,7 @@ LogCheckpointEnd(void)
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
- *		ignoring checkpoint_completion_target parameter. 
+ *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
  *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
  *
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 4ceb962fb9..34b6da99df 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.196 2007/06/28 00:02:38 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.197 2007/08/01 22:45:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -515,7 +515,11 @@ createdb(const CreatedbStmt *stmt)
 		heap_close(pg_database_rel, NoLock);
 
 		/*
-		 * Set flag to update flat database file at commit.
+		 * Set flag to update flat database file at commit.  Note: this also
+		 * forces synchronous commit, which minimizes the window between
+		 * creation of the database files and committal of the transaction.
+		 * If we crash before committing, we'll have a DB that's taking up
+		 * disk space but is not in pg_database, which is not good.
 		 */
 		database_file_update_needed();
 	}
@@ -675,7 +679,11 @@ dropdb(const char *dbname, bool missing_ok)
 	heap_close(pgdbrel, NoLock);
 
 	/*
-	 * Set flag to update flat database file at commit.
+	 * Set flag to update flat database file at commit.  Note: this also
+	 * forces synchronous commit, which minimizes the window between
+	 * removal of the database files and committal of the transaction.
+	 * If we crash before committing, we'll have a DB that's gone on disk
+	 * but still there according to pg_database, which is not good.
 	 */
 	database_file_update_needed();
 }
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index d0dacf1078..f19e237315 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.48 2007/06/07 19:19:56 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -354,6 +354,14 @@ CreateTableSpace(CreateTableSpaceStmt *stmt)
 		(void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata);
 	}
 
+	/*
+	 * Force synchronous commit, to minimize the window between creating
+	 * the symlink on-disk and marking the transaction committed.  It's
+	 * not great that there is any window at all, but definitely we don't
+	 * want to make it larger than necessary.
+	 */
+	ForceSyncCommit();
+
 	pfree(linkloc);
 	pfree(location);
 
@@ -480,6 +488,14 @@ DropTableSpace(DropTableSpaceStmt *stmt)
 	 * entries for relations in the tablespace.
 	 */
 
+	/*
+	 * Force synchronous commit, to minimize the window between removing
+	 * the files on-disk and marking the transaction committed.  It's
+	 * not great that there is any window at all, but definitely we don't
+	 * want to make it larger than necessary.
+	 */
+	ForceSyncCommit();
+
 	/*
 	 * Allow TablespaceCreateDbspace again.
 	 */
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 8fa17ab235..41c3b86791 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.353 2007/06/14 13:53:14 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.354 2007/08/01 22:45:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -27,6 +27,7 @@
 #include "access/heapam.h"
 #include "access/transam.h"
 #include "access/xact.h"
+#include "access/xlog.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_database.h"
 #include "commands/dbcommands.h"
@@ -1162,6 +1163,16 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
 	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
 						  &OldestXmin, &FreezeLimit);
 
+	/*
+	 * VACUUM FULL assumes that all tuple states are well-known prior to
+	 * moving tuples around --- see comment "known dead" in repair_frag(),
+	 * as well as simplifications in tqual.c.  So before we start we must
+	 * ensure that any asynchronously-committed transactions with changes
+	 * against this table have been flushed to disk.  It's sufficient to do
+	 * this once after we've acquired AccessExclusiveLock.
+	 */
+	XLogAsyncCommitFlush();
+
 	/*
 	 * Set up statistics-gathering machinery.
 	 */
@@ -2373,8 +2384,15 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 		 * exclusive access to the relation.  However, that would require a
 		 * lot of extra code to close and re-open the relation, indexes, etc.
 		 * For now, a quick hack: record status of current transaction as
-		 * committed, and continue.
+		 * committed, and continue.  We force the commit to be synchronous
+		 * so that it's down to disk before we truncate.  (Note: tqual.c
+		 * knows that VACUUM FULL always uses sync commit, too.)
+		 *
+		 * XXX This desperately needs to be revisited.  Any failure after
+		 * this point will result in a PANIC "cannot abort transaction nnn,
+		 * it was already committed"!
 		 */
+		ForceSyncCommit();
 		RecordTransactionCommit();
 	}
 
diff --git a/src/backend/utils/init/flatfiles.c b/src/backend/utils/init/flatfiles.c
index 992fc70bb4..c9b1ac509c 100644
--- a/src/backend/utils/init/flatfiles.c
+++ b/src/backend/utils/init/flatfiles.c
@@ -23,7 +23,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.26 2007/06/12 17:16:52 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.27 2007/08/01 22:45:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -855,6 +855,14 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
 	 * Signal the postmaster to reload its caches.
 	 */
 	SendPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE);
+
+	/*
+	 * Force synchronous commit, to minimize the window between changing
+	 * the flat files on-disk and marking the transaction committed.  It's
+	 * not great that there is any window at all, but definitely we don't
+	 * want to make it larger than necessary.
+	 */
+	ForceSyncCommit();
 }
 
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b2d0ea9cae..c30d8b50a0 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.407 2007/07/24 04:54:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.408 2007/08/01 22:45:09 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -553,6 +553,14 @@ static struct config_bool ConfigureNamesBool[] =
 		&enableFsync,
 		true, NULL, NULL
 	},
+	{
+		{"synchronous_commit", PGC_USERSET, WAL_SETTINGS,
+			gettext_noop("Sets immediate fsync at commit."),
+			NULL
+		},
+		&XactSyncCommit,
+		true, NULL, NULL
+	},
 	{
 		{"zero_damaged_pages", PGC_SUSET, DEVELOPER_OPTIONS,
 			gettext_noop("Continues processing past damaged page headers."),
@@ -1521,7 +1529,7 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
-		{"commit_delay", PGC_USERSET, WAL_CHECKPOINTS,
+		{"commit_delay", PGC_USERSET, WAL_SETTINGS,
 			gettext_noop("Sets the delay in microseconds between transaction commit and "
 						 "flushing WAL to disk."),
 			NULL
@@ -1531,7 +1539,7 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
-		{"commit_siblings", PGC_USERSET, WAL_CHECKPOINTS,
+		{"commit_siblings", PGC_USERSET, WAL_SETTINGS,
 			gettext_noop("Sets the minimum concurrent open transactions before performing "
 						 "commit_delay."),
 			NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 8bfad997ff..c87e4baf43 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -149,6 +149,7 @@
 # - Settings -
 
 #fsync = on				# turns forced synchronization on or off
+#synchronous_commit = on		# immediate fsync at commit
 #wal_sync_method = fsync		# the default is the first option 
 					# supported by the operating system:
 					#   open_datasync
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 429005a843..edbaa4d6b1 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -31,7 +31,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.102 2007/03/25 19:45:14 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.103 2007/08/01 22:45:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -80,6 +80,44 @@ TransactionId RecentGlobalXmin = InvalidTransactionId;
 static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
 
 
+/*
+ * HeapTupleSetHintBits()
+ *
+ * Set commit/abort hint bits on a tuple, if appropriate at this time.
+ *
+ * We cannot change the LSN of the page here because we may hold only a share
+ * lock on the buffer, so it is only safe to set a transaction-committed hint
+ * bit if we know the transaction's commit record has been flushed to disk.
+ *
+ * We can always set hint bits when marking a transaction aborted.  Also,
+ * if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then
+ * we can always set the hint bits, since VACUUM FULL always uses synchronous
+ * commits.
+ *
+ * Normal commits may be asynchronous, so for those we need to get the commit
+ * LSN of the transaction and check whether XLOG still needs flushing to it.
+ *
+ * infomask is OR'd into the tuple's t_infomask.  Pass xid as the XID of the
+ * transaction to check, or InvalidTransactionId if no check is needed.
+ */
+static inline void
+HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
+					 uint16 infomask, TransactionId xid)
+{
+	if (TransactionIdIsValid(xid))
+	{
+		/* NB: xid must be known committed here! */
+		XLogRecPtr  commitLSN = TransactionIdGetCommitLSN(xid);
+
+		if (XLogNeedsFlush(commitLSN))
+			return;				/* not flushed yet, so don't set hint */
+	}
+
+	tuple->t_infomask |= infomask;	/* safe to record the hint now */
+	SetBufferCommitInfoNeedsSave(buffer);
+}
+
+
 /*
  * HeapTupleSatisfiesSelf
  *		True iff heap tuple is valid "for itself".
@@ -122,12 +160,12 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -139,14 +177,12 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 				if (TransactionIdIsInProgress(xvac))
 					return false;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
 			}
@@ -164,8 +200,8 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			/* deleting subtransaction aborted? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
 			{
-				tuple->t_infomask |= HEAP_XMAX_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+									 InvalidTransactionId);
 				return true;
 			}
 
@@ -176,15 +212,13 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
 			return false;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return false;
 		}
 	}
@@ -221,8 +255,8 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 	if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
 	{
 		/* it must have aborted or crashed */
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
@@ -230,13 +264,13 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	if (tuple->t_infomask & HEAP_IS_LOCKED)
 	{
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
-	tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-	SetBufferCommitInfoNeedsSave(buffer);
+	HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+						 HeapTupleHeaderGetXmax(tuple));
 	return false;
 }
 
@@ -299,12 +333,12 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -316,14 +350,12 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 				if (TransactionIdIsInProgress(xvac))
 					return false;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
 			}
@@ -344,8 +376,8 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 			/* deleting subtransaction aborted? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
 			{
-				tuple->t_infomask |= HEAP_XMAX_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+									 InvalidTransactionId);
 				return true;
 			}
 
@@ -359,15 +391,13 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
 			return false;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return false;
 		}
 	}
@@ -407,8 +437,8 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 	if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
 	{
 		/* it must have aborted or crashed */
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
@@ -416,13 +446,13 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
 	if (tuple->t_infomask & HEAP_IS_LOCKED)
 	{
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
-	tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-	SetBufferCommitInfoNeedsSave(buffer);
+	HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+						 HeapTupleHeaderGetXmax(tuple));
 	return false;
 }
 
@@ -469,12 +499,12 @@ HeapTupleSatisfiesToast(HeapTupleHeader tuple, Snapshot snapshot,
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -486,14 +516,12 @@ HeapTupleSatisfiesToast(HeapTupleHeader tuple, Snapshot snapshot,
 				if (TransactionIdIsInProgress(xvac))
 					return false;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
 			}
@@ -550,12 +578,12 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return HeapTupleInvisible;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -567,14 +595,12 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 				if (TransactionIdIsInProgress(xvac))
 					return HeapTupleInvisible;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return HeapTupleInvisible;
 				}
 			}
@@ -595,8 +621,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 			/* deleting subtransaction aborted? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
 			{
-				tuple->t_infomask |= HEAP_XMAX_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+									 InvalidTransactionId);
 				return HeapTupleMayBeUpdated;
 			}
 
@@ -610,15 +636,13 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
 			return HeapTupleInvisible;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return HeapTupleInvisible;
 		}
 	}
@@ -642,8 +666,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
 		if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
 			return HeapTupleBeingUpdated;
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return HeapTupleMayBeUpdated;
 	}
 
@@ -663,8 +687,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 	if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
 	{
 		/* it must have aborted or crashed */
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return HeapTupleMayBeUpdated;
 	}
 
@@ -672,13 +696,13 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
 	if (tuple->t_infomask & HEAP_IS_LOCKED)
 	{
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return HeapTupleMayBeUpdated;
 	}
 
-	tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-	SetBufferCommitInfoNeedsSave(buffer);
+	HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+						 HeapTupleHeaderGetXmax(tuple));
 	return HeapTupleUpdated;	/* updated by other */
 }
 
@@ -723,12 +747,12 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -740,14 +764,12 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 				if (TransactionIdIsInProgress(xvac))
 					return false;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
 			}
@@ -765,8 +787,8 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 			/* deleting subtransaction aborted? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
 			{
-				tuple->t_infomask |= HEAP_XMAX_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+									 InvalidTransactionId);
 				return true;
 			}
 
@@ -781,15 +803,13 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 			return true;		/* in insertion by other */
 		}
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return false;
 		}
 	}
@@ -829,8 +849,8 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 	if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
 	{
 		/* it must have aborted or crashed */
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
@@ -838,13 +858,13 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 
 	if (tuple->t_infomask & HEAP_IS_LOCKED)
 	{
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+							 InvalidTransactionId);
 		return true;
 	}
 
-	tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-	SetBufferCommitInfoNeedsSave(buffer);
+	HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+						 HeapTupleHeaderGetXmax(tuple));
 	return false;				/* updated by other */
 }
 
@@ -888,12 +908,12 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 			{
 				if (TransactionIdDidCommit(xvac))
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			}
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -905,14 +925,12 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 				if (TransactionIdIsInProgress(xvac))
 					return false;
 				if (TransactionIdDidCommit(xvac))
-				{
-					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-					SetBufferCommitInfoNeedsSave(buffer);
-				}
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+										 InvalidTransactionId);
 				else
 				{
-					tuple->t_infomask |= HEAP_XMIN_INVALID;
-					SetBufferCommitInfoNeedsSave(buffer);
+					HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+										 InvalidTransactionId);
 					return false;
 				}
 			}
@@ -934,8 +952,8 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 			/* FIXME -- is this correct w.r.t. the cmax of the tuple? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
 			{
-				tuple->t_infomask |= HEAP_XMAX_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+									 InvalidTransactionId);
 				return true;
 			}
 
@@ -949,15 +967,13 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
 			return false;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return false;
 		}
 	}
@@ -998,14 +1014,14 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 		if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
 		{
 			/* it must have aborted or crashed */
-			tuple->t_infomask |= HEAP_XMAX_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+								 InvalidTransactionId);
 			return true;
 		}
 
 		/* xmax transaction committed */
-		tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-		SetBufferCommitInfoNeedsSave(buffer);
+		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+							 HeapTupleHeaderGetXmax(tuple));
 	}
 
 	/*
@@ -1054,12 +1070,12 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 				return HEAPTUPLE_DELETE_IN_PROGRESS;
 			if (TransactionIdDidCommit(xvac))
 			{
-				tuple->t_infomask |= HEAP_XMIN_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+									 InvalidTransactionId);
 				return HEAPTUPLE_DEAD;
 			}
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 InvalidTransactionId);
 		}
 		else if (tuple->t_infomask & HEAP_MOVED_IN)
 		{
@@ -1070,14 +1086,12 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 			if (TransactionIdIsInProgress(xvac))
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
 			if (TransactionIdDidCommit(xvac))
-			{
-				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-				SetBufferCommitInfoNeedsSave(buffer);
-			}
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+									 InvalidTransactionId);
 			else
 			{
-				tuple->t_infomask |= HEAP_XMIN_INVALID;
-				SetBufferCommitInfoNeedsSave(buffer);
+				HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+									 InvalidTransactionId);
 				return HEAPTUPLE_DEAD;
 			}
 		}
@@ -1091,21 +1105,22 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 			return HEAPTUPLE_DELETE_IN_PROGRESS;
 		}
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+								 HeapTupleHeaderGetXmin(tuple));
 		else
 		{
 			/*
 			 * Not in Progress, Not Committed, so either Aborted or crashed
 			 */
-			tuple->t_infomask |= HEAP_XMIN_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+								 InvalidTransactionId);
 			return HEAPTUPLE_DEAD;
 		}
-		/* Should only get here if we set XMIN_COMMITTED */
-		Assert(tuple->t_infomask & HEAP_XMIN_COMMITTED);
+		/*
+		 * At this point the xmin is known committed, but we might not have
+		 * been able to set the hint bit yet; so we can no longer Assert
+		 * that it's set.
+		 */
 	}
 
 	/*
@@ -1143,8 +1158,8 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 			 * We know that xmax did lock the tuple, but it did not and will
 			 * never actually update it.
 			 */
-			tuple->t_infomask |= HEAP_XMAX_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+								 InvalidTransactionId);
 		}
 		return HEAPTUPLE_LIVE;
 	}
@@ -1161,21 +1176,22 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
 		if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
 			return HEAPTUPLE_DELETE_IN_PROGRESS;
 		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
-		{
-			tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-			SetBufferCommitInfoNeedsSave(buffer);
-		}
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+								 HeapTupleHeaderGetXmax(tuple));
 		else
 		{
 			/*
 			 * Not in Progress, Not Committed, so either Aborted or crashed
 			 */
-			tuple->t_infomask |= HEAP_XMAX_INVALID;
-			SetBufferCommitInfoNeedsSave(buffer);
+			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+								 InvalidTransactionId);
 			return HEAPTUPLE_LIVE;
 		}
-		/* Should only get here if we set XMAX_COMMITTED */
-		Assert(tuple->t_infomask & HEAP_XMAX_COMMITTED);
+		/*
+		 * At this point the xmax is known committed, but we might not have
+		 * been able to set the hint bit yet; so we can no longer Assert
+		 * that it's set.
+		 */
 	}
 
 	/*
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index f67eb2c048..5e6cabe194 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.19 2007/01/05 22:19:50 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.20 2007/08/01 22:45:09 tgl Exp $
  */
 #ifndef CLOG_H
 #define CLOG_H
@@ -32,8 +32,8 @@ typedef int XidStatus;
 #define NUM_CLOG_BUFFERS	8
 
 
-extern void TransactionIdSetStatus(TransactionId xid, XidStatus status);
-extern XidStatus TransactionIdGetStatus(TransactionId xid);
+extern void TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn);
+extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
 
 extern Size CLOGShmemSize(void);
 extern void CLOGShmemInit(void);
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 6cb2e5294e..d7fb404f4f 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/gist_private.h,v 1.26 2007/01/20 18:43:35 neilc Exp $
+ * $PostgreSQL: pgsql/src/include/access/gist_private.h,v 1.27 2007/08/01 22:45:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -200,8 +200,6 @@ typedef struct GistSplitVector
 								 * distributed between left and right pages */
 } GistSplitVector;
 
-#define XLogRecPtrIsInvalid( r )	( (r).xlogid == 0 && (r).xrecoff == 0 )
-
 typedef struct
 {
 	Relation	r;
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 3cc30e76b7..9e18b9608b 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -6,13 +6,14 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.20 2007/01/05 22:19:51 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.21 2007/08/01 22:45:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef SLRU_H
 #define SLRU_H
 
+#include "access/xlogdefs.h"
 #include "storage/lwlock.h"
 
 
@@ -51,6 +52,17 @@ typedef struct SlruSharedData
 	int		   *page_lru_count;
 	LWLockId   *buffer_locks;
 
+	/*
+	 * Optional array of WAL flush LSNs associated with entries in the SLRU
+	 * pages.  If not zero/NULL, we must flush WAL before writing pages (true
+	 * for pg_clog, false for multixact and pg_subtrans).  group_lsn[] has
+	 * lsn_groups_per_page entries per buffer slot, each containing the
+	 * highest LSN known for a contiguous group of SLRU entries on that slot's
+	 * page.
+	 */
+	XLogRecPtr *group_lsn;
+	int			lsn_groups_per_page;
+
 	/*----------
 	 * We mark a page "most recently used" by setting
 	 *		page_lru_count[slotno] = ++cur_lru_count;
@@ -81,8 +93,8 @@ typedef struct SlruCtlData
 	SlruShared	shared;
 
 	/*
-	 * This flag tells whether to fsync writes (true for pg_clog, false for
-	 * pg_subtrans).
+	 * This flag tells whether to fsync writes (true for pg_clog and
+	 * multixact, false for pg_subtrans).
 	 */
 	bool		do_fsync;
 
@@ -106,11 +118,12 @@ typedef SlruCtlData *SlruCtl;
 typedef struct SlruFlushData *SlruFlush;
 
 
-extern Size SimpleLruShmemSize(int nslots);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
+extern Size SimpleLruShmemSize(int nslots, int nlsns);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 			  LWLockId ctllock, const char *subdir);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern int	SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid);
+extern int	SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+							  TransactionId xid);
 extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
 						   TransactionId xid);
 extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index b6fadcd436..98850cc0d3 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -7,13 +7,15 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.60 2007/01/05 22:19:51 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.61 2007/08/01 22:45:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef TRANSAM_H
 #define TRANSAM_H
 
+#include "access/xlogdefs.h"
+
 
 /* ----------------
  *		Special transaction ID values
@@ -115,14 +117,17 @@ extern VariableCache ShmemVariableCache;
 extern bool TransactionIdDidCommit(TransactionId transactionId);
 extern bool TransactionIdDidAbort(TransactionId transactionId);
 extern void TransactionIdCommit(TransactionId transactionId);
+extern void TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn);
 extern void TransactionIdAbort(TransactionId transactionId);
 extern void TransactionIdSubCommit(TransactionId transactionId);
 extern void TransactionIdCommitTree(int nxids, TransactionId *xids);
+extern void TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn);
 extern void TransactionIdAbortTree(int nxids, TransactionId *xids);
 extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2);
 extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2);
 extern bool TransactionIdFollows(TransactionId id1, TransactionId id2);
 extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2);
+extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
 
 /* in transam/varsup.c */
 extern TransactionId GetNewTransactionId(bool isSubXact);
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 759eab1a3d..e8e2b08de4 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.87 2007/04/30 21:01:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.88 2007/08/01 22:45:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -41,6 +41,9 @@ extern int	XactIsoLevel;
 extern bool DefaultXactReadOnly;
 extern bool XactReadOnly;
 
+/* Synchronous commit requested?  (false selects asynchronous commit) */
+extern bool XactSyncCommit;
+
 /*
  *	start- and end-of-transaction callbacks for dynamically loaded modules
  */
@@ -147,6 +150,7 @@ extern void SetCurrentStatementStartTimestamp(void);
 extern int	GetCurrentTransactionNestLevel(void);
 extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
 extern void CommandCounterIncrement(void);
+extern void ForceSyncCommit(void);
 extern void StartTransactionCommand(void);
 extern void CommitTransactionCommand(void);
 extern void AbortCurrentTransaction(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index adc99a6eb0..2e1928dace 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.81 2007/07/24 04:54:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.82 2007/08/01 22:45:09 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -197,8 +197,11 @@ extern CheckpointStatsData CheckpointStats;
 extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
 extern void XLogFlush(XLogRecPtr RecPtr);
 extern void XLogBackgroundFlush(void);
+extern void XLogAsyncCommitFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 
+extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
+
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
 
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index ceca779432..843f078d65 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.17 2007/02/14 05:00:40 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.18 2007/08/01 22:45:09 tgl Exp $
  */
 #ifndef XLOG_DEFS_H
 #define XLOG_DEFS_H
@@ -33,6 +33,8 @@ typedef struct XLogRecPtr
 	uint32		xrecoff;		/* byte offset of location in log file */
 } XLogRecPtr;
 
+#define XLogRecPtrIsInvalid(r)	((r).xrecoff == 0)	/* tests xrecoff only; xlogid is not examined */
+
 
 /*
  * Macros for comparing XLogRecPtrs
-- 
GitLab