提交 88f0623e 编写于 作者: H Heikki Linnakangas

Fix relfilenode conflicts.

There was a race condition in the way relfilenodes were chosen, because
QE nodes chose relfilenodes for existing relations, e.g. at REINDEX or
TRUNCATE, independently of the master, while for newly created tables,
the relfilenode was chosen by the master. To fix:

1. If the OID of a newly-created table is already in use as relfilenode
   of a different table in a QE segment, use a different relfilenode.
   (This shouldn't happen in the dispatcher, for the same reasons it
   cannot happen in a single-server PostgreSQL instance.)

2. Use a small cache of values recently used for a relfilenode, to close
   a race condition between checking if a relfilenode is in use, and
   actually creating the file.
上级 7a1a98c9
......@@ -389,3 +389,60 @@ GetNewObjectId(void)
return result;
}
/*
* To avoid clashes on pg_class.relfilenode, we keep track of which
* OIDs have recently been used for a relfilenode. PostgreSQL doesn't
* need this, as they rely on the monotonicity of GetNewObjectId(),
* with checks for already-used relfilenodes in GetNewRelFileNode(),
* but that's not enough in GPDB. In a GPDB segment node, we try to use
* a table's OID as its relfilenode like in PostgreSQL, but because the
* OIDs are generated in the master node, it's possible that
* GetNewRelFileNode() chooses a value that has just been assigned for
* a different table in the master. To work around that, we have a
* small cache of values that have recently been used for relfilenodes,
* and refrain from choosing them again.
*
* If the given OID has recently been used as a relfilenode, returns
* false. Otherwise returns true, and marks the OID as used, so that
* subsequent calls with the same OID will return false.
*
* We use a small cache of 100 OIDs (NUM_RECENT_RELFILENODES). This is
* not bulletproof, but is good enough in practice to close the race
* condition. It is not enough by itself to ensure that a relfilenode
* value is unique, you still need to also check that there's no
* file in the data directory with the same name.
*/
bool
UseOidForRelFileNode(Oid oid)
{
	bool	found = false;
	int		slot;
	int		i;

	/*
	 * The recentRelfilenodes array and nextRecentRelfilenode counter live in
	 * shared memory; we rely on them being zero-initialized at postmaster
	 * startup.
	 */
	LWLockAcquire(OidGenLock, LW_EXCLUSIVE);

	/* Scan the ring buffer for a recent use of this OID. */
	for (i = 0; i < NUM_RECENT_RELFILENODES && !found; i++)
		found = (ShmemVariableCache->recentRelfilenodes[i] == oid);

	if (found)
	{
		/* OID was handed out recently; caller must choose another value. */
		LWLockRelease(OidGenLock);
		return false;
	}

	/* Remember this OID, overwriting the oldest entry in the ring buffer. */
	slot = ShmemVariableCache->nextRecentRelfilenode;
	ShmemVariableCache->recentRelfilenodes[slot] = oid;
	ShmemVariableCache->nextRecentRelfilenode =
		(slot + 1) % NUM_RECENT_RELFILENODES;

	LWLockRelease(OidGenLock);
	return true;
}
......@@ -925,6 +925,9 @@ GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class)
else
rnode.relNode = GetNewObjectId();
if (!UseOidForRelFileNode(rnode.relNode))
continue;
/* Check for existing file of same name */
rpath = relpath(rnode);
fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);
......@@ -957,20 +960,24 @@ GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class)
if (Gp_role == GP_ROLE_EXECUTE)
Insist(!PointerIsValid(pg_class));
elog(DEBUG1, "Calling GetNewRelFileNode in %s mode %s pg_class. "
"New relOid = %d",
elog(DEBUG1, "Calling GetNewRelFileNode in %s mode %s pg_class. New relOid = %d",
(Gp_role == GP_ROLE_EXECUTE ? "execute" :
Gp_role == GP_ROLE_UTILITY ? "utility" :
"dispatch"), pg_class ? "with" : "without",
rnode.relNode );
rnode.relNode);
return rnode.relNode;
}
/*
* Can the given OID be used as pg_class.relfilenode?
*
* As a side-effect, advances OID counter to the given OID and remembers
* that the OID has been used as a relfilenode, so that the same value
* doesn't get chosen again.
*/
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared,
Relation pg_class)
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared)
{
RelFileNode rnode;
char *rpath;
......@@ -978,50 +985,30 @@ CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared,
bool collides;
SnapshotData SnapshotDirty;
InitDirtySnapshot(SnapshotDirty);
if (pg_class)
{
Oid oidIndex;
Relation indexrel;
IndexScanDesc scan;
ScanKeyData key;
Assert(!IsBootstrapProcessingMode());
Assert(pg_class->rd_rel->relhasoids);
/* The relcache will cache the identity of the OID index for us */
oidIndex = RelationGetOidIndex(pg_class);
Assert(OidIsValid(oidIndex));
indexrel = index_open(oidIndex, AccessShareLock);
ScanKeyInit(&key,
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(newOid));
scan = index_beginscan(pg_class, indexrel, &SnapshotDirty, 1, &key);
/*
* Advance our current OID counter with the given value, to keep
* the counter roughly in sync across all nodes. This ensures
* that a GetNewRelFileNode() call after this will not choose the
* same OID, and won't have to loop excessively to retry. That
* still leaves a race condition, if GetNewRelFileNode() is called
* just before CheckNewRelFileNodeIsOk() - UseOidForRelFileNode()
* is called to plug that.
*
* FIXME: handle OID wraparound gracefully.
*/
while(GetNewObjectId() < newOid);
collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));
if (!UseOidForRelFileNode(newOid))
return false;
index_endscan(scan);
index_close(indexrel, AccessShareLock);
if (collides)
elog(ERROR, "relfilenode %d already in use in \"pg_class\"",
newOid);
}
InitDirtySnapshot(SnapshotDirty);
/* This should match RelationInitPhysicalAddr */
rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
rnode.relNode = newOid;
/* Check for existing file of same name */
rpath = relpath(rnode);
fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);
......@@ -1036,11 +1023,13 @@ CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared,
collides = false;
pfree(rpath);
if (collides && !relisshared)
elog(ERROR, "oid %d already in use", newOid);
while(GetNewObjectId() < newOid);
elog(DEBUG1, "Called CheckNewRelFileNodeIsOk in %s mode for %u / %u / %u. "
"collides = %s",
(Gp_role == GP_ROLE_EXECUTE ? "execute" :
Gp_role == GP_ROLE_UTILITY ? "utility" :
"dispatch"), newOid, reltablespace, relisshared,
collides ? "true" : "false");
return !collides;
}
......
......@@ -1576,6 +1576,9 @@ heap_create_with_catalog(const char *relname,
*
* The OID will be the relfilenode as well, so make sure it doesn't
* collide with either pg_class OIDs or existing physical files.
*
* (In GPDB, heap_create can choose a different relfilenode, in a QE node,
* if the one we choose is already in use.)
*/
if (!OidIsValid(relid) && Gp_role == GP_ROLE_EXECUTE)
relid = GetPreassignedOidForRelation(relnamespace, relname);
......@@ -1583,12 +1586,6 @@ heap_create_with_catalog(const char *relname,
if (!OidIsValid(relid))
relid = GetNewRelFileNode(reltablespace, shared_relation,
pg_class_desc);
else
if (IsUnderPostmaster)
{
CheckNewRelFileNodeIsOk(relid, reltablespace, shared_relation,
pg_class_desc);
}
/*
* Create the relcache entry (mostly dummy at this point) and the physical
......
......@@ -628,23 +628,18 @@ index_create(Oid heapRelationId,
*
* The OID will be the relfilenode as well, so make sure it doesn't
* collide with either pg_class OIDs or existing physical files.
*
* (In GPDB, heap_create can choose a different relfilenode, in a QE node,
* if the one we choose is already in use.)
*/
if (!OidIsValid(indexRelationId))
{
if (Gp_role == GP_ROLE_EXECUTE)
{
indexRelationId = GetPreassignedOidForRelation(namespaceId, indexRelationName);
CheckNewRelFileNodeIsOk(indexRelationId, tableSpaceId, shared_relation, pg_class);
}
else
indexRelationId = GetNewRelFileNode(tableSpaceId, shared_relation,
pg_class);
}
else
if (IsUnderPostmaster)
{
CheckNewRelFileNodeIsOk(indexRelationId, tableSpaceId, shared_relation, pg_class);
}
/*
* create the index relation's relcache entry and physical disk file. (If
......@@ -1422,7 +1417,7 @@ index_update_stats(Relation rel, bool hasindex, bool isprimary,
void
setNewRelfilenode(Relation relation, TransactionId freezeXid)
{
Oid newrelfilenode = InvalidOid;
Oid newrelfilenode;
RelFileNode newrnode;
SMgrRelation srel;
Relation pg_class;
......@@ -1445,26 +1440,10 @@ setNewRelfilenode(Relation relation, TransactionId freezeXid)
freezeXid == InvalidTransactionId) ||
TransactionIdIsNormal(freezeXid));
if (Gp_role == GP_ROLE_EXECUTE)
newrelfilenode = GetPreassignedRelfilenodeForRelation(RelationGetRelid(relation));
if (newrelfilenode == InvalidOid)
{
/* Allocate a new relfilenode */
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
relation->rd_rel->relisshared,
NULL);
AddDispatchRelfilenodeForRelation(RelationGetRelid(relation), newrelfilenode);
}
else
{
CheckNewRelFileNodeIsOk(newrelfilenode, relation->rd_rel->reltablespace,
relation->rd_rel->relisshared, NULL);
elog(DEBUG3, "setNewRelfilenodeToOid called. newrelfilenode = %d",
newrelfilenode);
}
/* Allocate a new relfilenode */
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
relation->rd_rel->relisshared,
NULL);
/*
* Find the pg_class tuple for the given relation. This is not used
......
......@@ -527,35 +527,6 @@ GetPreassignedOidForType(Oid namespaceOid, const char *typname)
return oid;
}
/*
 * Look up the relfilenode (not the OID!) that the master pre-assigned
 * for the given relation, or InvalidOid if none was dispatched.
 */
Oid
GetPreassignedRelfilenodeForRelation(Oid relid)
{
	OidAssignment key;
	Oid			result;

	memset(&key, 0, sizeof(OidAssignment));
	key.catalog = RelationRelationId;
	key.keyOid2 = relid;

	/*
	 * At least VACUUM of a bitmap index is implemented by reindexing, which
	 * allocates a new relfilenode. The reindexing is done only if there were
	 * any dead tuples, so it might fire on some segments but not others, and
	 * never on the master. Hence, don't insist that every segment request
	 * for a new relfilenode has a pre-assigned value. AFAICS nothing strictly
	 * requires relfilenodes to match across segments; if something does,
	 * we'll have to revisit this (perhaps by fixing whatever requires them
	 * to match, to not require that).
	 */
	result = GetPreassignedOid(&key);
	if (result == InvalidOid)
		elog(DEBUG1, "no pre-assigned relfilenode for relation %u", relid);

	return result;
}
/* ----------------------------------------------------------------
* Functions for use in the master node.
......@@ -603,35 +574,6 @@ AddDispatchOidFromTuple(Relation catalogrel, HeapTuple tuple)
#endif
}
/*
 * Record a newly assigned relfilenode for a relation, so that it gets
 * dispatched to the QE nodes along with the other OID assignments.
 */
void
AddDispatchRelfilenodeForRelation(Oid relid, Oid relfilenode)
{
	OidAssignment *a;
	MemoryContext saved;

	/* QE nodes and bootstrap mode don't dispatch OID assignments. */
	if (Gp_role == GP_ROLE_EXECUTE || IsBootstrapProcessingMode())
		return;

	/* The assignment must survive until end of transaction. */
	saved = MemoryContextSwitchTo(TopTransactionContext);

	a = makeNode(OidAssignment);
	a->catalog = RelationRelationId;
	a->keyOid2 = relid;
	a->oid = relfilenode;
	dispatch_oids = lappend(dispatch_oids, a);

	MemoryContextSwitchTo(saved);

#ifdef OID_DISPATCH_DEBUG
	elog(NOTICE, "adding relfilenode assignment: relid %u: %u",
		 relid, relfilenode);
#endif
}
/*
* Get list of OIDs assigned in this transaction, since the last call.
*/
......
......@@ -1204,10 +1204,9 @@ vacuumStatement_Relation(VacuumStmt *vacstmt, Oid relid,
*/
if (Gp_role == GP_ROLE_DISPATCH)
{
int i, j, nindexes;
int i, nindexes;
bool has_bitmap = false;
Relation *i_rel = NULL;
Oid newrelfilenode;
stats_context.ctx = vac_context;
stats_context.onerel = onerel;
......@@ -1219,24 +1218,10 @@ vacuumStatement_Relation(VacuumStmt *vacstmt, Oid relid,
{
for (i = 0; i < nindexes; i++)
{
if (!RelationIsBitmapIndex(i_rel[i]))
continue;
has_bitmap = true;
/*
* bitmap indexes require extra relfilenodes during vacuum,
* the exact number is unknown so we err on the side of
* caution. See comment on NUM_EXTRA_OIDS_FOR_BITMAP for
* more information.
*/
for (j = 0; j < NUM_EXTRA_OIDS_FOR_BITMAP; j++)
if (RelationIsBitmapIndex(i_rel[i]))
{
newrelfilenode = GetNewRelFileNode(i_rel[i]->rd_rel->reltablespace,
i_rel[i]->rd_rel->relisshared,
NULL);
AddDispatchRelfilenodeForRelation(RelationGetRelid(i_rel[i]),
newrelfilenode);
has_bitmap = true;
break;
}
}
}
......
......@@ -3064,6 +3064,7 @@ RelationBuildLocalRelation(const char *relname,
int i;
bool has_not_null;
bool nailit;
Oid relfilenode;
AssertArg(natts >= 0);
......@@ -3196,8 +3197,12 @@ RelationBuildLocalRelation(const char *relname,
/*
* Insert relation physical and logical identifiers (OIDs) into the right
* places. Note that the physical ID (relfilenode) is initially the same
* as the logical ID (OID).
* places.
*
* In PostgreSQL, the physical ID (relfilenode) is initially the same
* as the logical ID (OID). In GPDB, the table's logical OID is allocated
* in the master, and might already be in use as a relfilenode of an
* existing relation in a segment.
*/
rel->rd_rel->relisshared = shared_relation;
......@@ -3206,7 +3211,18 @@ RelationBuildLocalRelation(const char *relname,
for (i = 0; i < natts; i++)
rel->rd_att->attrs[i]->attrelid = relid;
rel->rd_rel->relfilenode = relid;
if (Gp_role != GP_ROLE_EXECUTE ||
CheckNewRelFileNodeIsOk(relid, reltablespace, shared_relation))
{
relfilenode = relid;
}
else
{
/* FIXME: should we pass pg_class here? */
relfilenode = GetNewRelFileNode(reltablespace, shared_relation, NULL);
}
rel->rd_rel->relfilenode = relfilenode;
rel->rd_rel->reltablespace = reltablespace;
RelationInitLockInfo(rel); /* see lmgr.c */
......
......@@ -59,6 +59,8 @@
(dest)--; \
} while ((dest) < FirstNormalTransactionId)
#define NUM_RECENT_RELFILENODES 100
/*
* VariableCache is a data structure in shared memory that is used to track
* OID and XID assignment state. For largely historical reasons, there is
......@@ -77,6 +79,10 @@ typedef struct VariableCacheData
Oid nextOid; /* next OID to assign */
uint32 oidCount; /* OIDs available before must do XLOG work */
/* cache of recently used relfilenodes, for UseOidForRelFileNode() */
Oid recentRelfilenodes[NUM_RECENT_RELFILENODES];
int nextRecentRelfilenode;
/*
* These fields are protected by XidGenLock.
*/
......@@ -137,5 +143,6 @@ extern TransactionId ReadNewTransactionId(void);
extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
Name oldest_datname);
extern Oid GetNewObjectId(void);
extern bool UseOidForRelFileNode(Oid oid);
#endif /* TRAMSAM_H */
......@@ -45,6 +45,6 @@ extern Oid GetNewOid(Relation relation);
extern Oid GetNewOidWithIndex(Relation relation, Relation indexrel);
extern Oid GetNewRelFileNode(Oid reltablespace, bool relisshared,
Relation pg_class);
extern bool CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared, Relation pg_class);
extern bool CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared);
#endif /* CATALOG_H */
......@@ -17,7 +17,6 @@
/* Functions used in master */
extern void AddDispatchOidFromTuple(Relation catalogrel, HeapTuple tuple);
extern void AddDispatchRelfilenodeForRelation(Oid relid, Oid relfilenode);
extern List *GetAssignedOidsForDispatch(void);
/* Functions used in QE nodes */
......@@ -26,7 +25,6 @@ extern Oid GetPreassignedOidForTuple(Relation catalogrel, HeapTuple tuple);
extern Oid GetPreassignedOidForRelation(Oid namespaceOid, const char *relname);
extern Oid GetPreassignedOidForType(Oid namespaceOid, const char *typname);
extern Oid GetPreassignedOidForDatabase(const char *datname);
extern Oid GetPreassignedRelfilenodeForRelation(Oid relid);
extern void AtEOXact_DispatchOids(bool isCommit);
......
......@@ -209,7 +209,7 @@ typedef struct
Oid namespaceOid; /* namespace OID for most objects */
Oid keyOid2; /* 2nd key OID field, meaning depends on object type */
Oid oid; /* OID (or relfilenode) to assign */
Oid oid; /* OID to assign */
} OidAssignment;
......
......@@ -1286,40 +1286,6 @@ create table atsdb (i int, j text) distributed by (j);
alter table atsdb set with(appendonly = true);
ERROR: option "appendonly" not supported
drop table atsdb;
-- MPP-8474: Index relfilenode mismatch: entry db to segment db.
--
-- XXX This really belongs in alter_table.sql but this is not
-- in use in current_good_schedule.
drop table if exists mpp8474 cascade; --ignore
NOTICE: table "mpp8474" does not exist, skipping
create table mpp8474(a int, b int, c text)
with (appendonly=true)
distributed by (b);
create index mpp8474_a
on mpp8474(a);
alter table mpp8474
add column d int default 10;
select
'Mismatched relfilenodes:' as oops,
e.oid::regclass as entry_oid,
e.relkind,
e.relfilenode as entry_relfilenode,
s.segid,
s.segfilenode as segment_relfilenode
from
pg_class e,
( select gp_execution_segment(), oid, relfilenode
from gp_dist_random('pg_class')
) s (segid, segoid, segfilenode)
where
e.oid = s.segoid
and e.relfilenode != s.segfilenode
and e.relname ~ '^mpp8474.*';
oops | entry_oid | relkind | entry_relfilenode | segid | segment_relfilenode
------+-----------+---------+-------------------+-------+---------------------
(0 rows)
drop table mpp8474;
-- MPP-18660: duplicate entry in gp_distribution_policy
set enable_indexscan=on;
set enable_seqscan=off;
......
......@@ -373,39 +373,6 @@ create table atsdb (i int, j text) distributed by (j);
alter table atsdb set with(appendonly = true);
drop table atsdb;
-- MPP-8474: Index relfilenode mismatch: entry db to segment db.
--
-- XXX This really belongs in alter_table.sql but this is not
-- in use in current_good_schedule.
drop table if exists mpp8474 cascade; --ignore
create table mpp8474(a int, b int, c text)
with (appendonly=true)
distributed by (b);
create index mpp8474_a
on mpp8474(a);
alter table mpp8474
add column d int default 10;
select
'Mismatched relfilenodes:' as oops,
e.oid::regclass as entry_oid,
e.relkind,
e.relfilenode as entry_relfilenode,
s.segid,
s.segfilenode as segment_relfilenode
from
pg_class e,
( select gp_execution_segment(), oid, relfilenode
from gp_dist_random('pg_class')
) s (segid, segoid, segfilenode)
where
e.oid = s.segoid
and e.relfilenode != s.segfilenode
and e.relname ~ '^mpp8474.*';
drop table mpp8474;
-- MPP-18660: duplicate entry in gp_distribution_policy
set enable_indexscan=on;
set enable_seqscan=off;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册