From b0a30de602b9c4b0758eafe8e7cad2f0ecb7aec4 Mon Sep 17 00:00:00 2001 From: Ashwin Agrawal Date: Fri, 29 Sep 2017 14:43:37 -0700 Subject: [PATCH] pgindent cdb directory (part-3). --- src/backend/cdb/cdbdatabaseinfo.c | 1085 ++-- src/backend/cdb/cdbdirectopen.c | 347 +- src/backend/cdb/cdbdistributedsnapshot.c | 116 +- src/backend/cdb/cdbdistributedxacts.c | 12 +- src/backend/cdb/cdbdistributedxid.c | 3 +- src/backend/cdb/cdbdoublylinked.c | 88 +- src/backend/cdb/cdbfilerepresyncmanager.c | 1453 +++--- src/backend/cdb/cdbfilerepresyncworker.c | 658 +-- src/backend/cdb/cdbfilerepservice.c | 720 +-- src/backend/cdb/cdbfts.c | 61 +- src/backend/cdb/cdbglobalsequence.c | 101 +- src/backend/cdb/cdbgroup.c | 4603 +++++++++-------- src/backend/cdb/cdbpartition.c | 4585 ++++++++-------- src/backend/cdb/cdbpath.c | 2043 ++++---- src/backend/cdb/cdbpathlocus.c | 859 +-- src/backend/cdb/cdbpathtoplan.c | 299 +- src/backend/cdb/cdbpersistentbuild.c | 745 +-- src/backend/cdb/cdbpersistentcheck.c | 743 +-- .../cdb/cdbresynchronizechangetracking.c | 3048 +++++------ src/backend/cdb/cdbsetop.c | 446 +- src/backend/cdb/cdbshareddoublylinked.c | 179 +- src/backend/cdb/cdbsharedoidsearch.c | 387 +- src/backend/cdb/cdbsreh.c | 214 +- src/backend/cdb/cdbsrlz.c | 68 +- src/backend/cdb/partitionselection.c | 217 +- 25 files changed, 11979 insertions(+), 11101 deletions(-) diff --git a/src/backend/cdb/cdbdatabaseinfo.c b/src/backend/cdb/cdbdatabaseinfo.c index 69a2c13952..ae46af5c9d 100755 --- a/src/backend/cdb/cdbdatabaseinfo.c +++ b/src/backend/cdb/cdbdatabaseinfo.c @@ -42,8 +42,8 @@ /* hash table entry for relation ids */ typedef struct RelationIdEntry { - Oid relationId; /* key */ - DbInfoRel *dbInfoRel; /* pointer */ + Oid relationId; /* key */ + DbInfoRel *dbInfoRel; /* pointer */ } RelationIdEntry; /*------------------------------------------------------------------------- @@ -56,10 +56,11 @@ typedef struct RelationIdEntry * DatabaseInfo_Check() * Validate that the DatabaseInfo returned is consistent. */ -void DatabaseInfo_Check(DatabaseInfo *info) +void +DatabaseInfo_Check(DatabaseInfo *info) { - int sr; - int rsf; + int sr; + int rsf; /* * Compare Stored Relations to Relation Segment Files. 
@@ -68,21 +69,21 @@ void DatabaseInfo_Check(DatabaseInfo *info) rsf = 0; while (true) { - int cmp; + int cmp; cmp = TablespaceRelFile_Compare( - &info->pgClassStoredRelations[sr].tablespaceRelFile, - &info->relSegFiles[rsf].tablespaceRelFile); + &info->pgClassStoredRelations[sr].tablespaceRelFile, + &info->relSegFiles[rsf].tablespaceRelFile); if (cmp != 0) { elog(WARNING, "In database %u, stored relation doesn't match relation file on disk " - "(stored tablespace %u, disk file tablespace %u, " - "stored relation %u, disk file relation %u)", - info->database, - info->pgClassStoredRelations[sr].tablespaceRelFile.tablespace, - info->relSegFiles[rsf].tablespaceRelFile.tablespace, - info->pgClassStoredRelations[sr].tablespaceRelFile.relation, - info->relSegFiles[rsf].tablespaceRelFile.relation); + "(stored tablespace %u, disk file tablespace %u, " + "stored relation %u, disk file relation %u)", + info->database, + info->pgClassStoredRelations[sr].tablespaceRelFile.tablespace, + info->relSegFiles[rsf].tablespaceRelFile.tablespace, + info->pgClassStoredRelations[sr].tablespaceRelFile.relation, + info->relSegFiles[rsf].tablespaceRelFile.relation); return; } @@ -95,8 +96,8 @@ void DatabaseInfo_Check(DatabaseInfo *info) if (rsf >= info->relSegFilesCount) break; if (TablespaceRelFile_Compare( - &info->pgClassStoredRelations[sr].tablespaceRelFile, - &info->relSegFiles[rsf].tablespaceRelFile) != 0) + &info->pgClassStoredRelations[sr].tablespaceRelFile, + &info->relSegFiles[rsf].tablespaceRelFile) != 0) break; } @@ -108,7 +109,7 @@ void DatabaseInfo_Check(DatabaseInfo *info) if (sr < info->pgClassStoredRelationsCount) { elog(WARNING, "In database %u, extra stored relation (tablespace %u, relation %u)", - info->database, + info->database, info->pgClassStoredRelations[sr].tablespaceRelFile.tablespace, info->pgClassStoredRelations[sr].tablespaceRelFile.relation); return; @@ -117,7 +118,7 @@ void DatabaseInfo_Check(DatabaseInfo *info) if (rsf < info->relSegFilesCount) { elog(WARNING, "In database %u, extra relation file on disk (tablespace %u, relation %u)", - info->database, + info->database, info->relSegFiles[rsf].tablespaceRelFile.tablespace, info->relSegFiles[rsf].tablespaceRelFile.relation); return; @@ -131,13 +132,14 @@ void DatabaseInfo_Check(DatabaseInfo *info) * DatabaseInfo_Trace() * Output debugging information about the DatabaseInfo */ -void DatabaseInfo_Trace(DatabaseInfo *info) +void +DatabaseInfo_Trace(DatabaseInfo *info) { - int t; - int sr; - int rsf; - int grn; - int m; + int t; + int sr; + int rsf; + int grn; + int m; for (t = 0; t < info->tablespacesCount; t++) elog(WARNING, "Database Info: Tablespace #%d is %u", @@ -145,27 +147,27 @@ void DatabaseInfo_Trace(DatabaseInfo *info) for (sr = 0; sr < info->pgClassStoredRelationsCount; sr++) elog(WARNING, "Database Info: Stored relation (tablespace %u, relation %u, isBufferPoolRealtion %s, TID %s)", - info->pgClassStoredRelations[sr].tablespaceRelFile.tablespace, + info->pgClassStoredRelations[sr].tablespaceRelFile.tablespace, info->pgClassStoredRelations[sr].tablespaceRelFile.relation, (info->pgClassStoredRelations[sr].isBufferPoolRelation ? 
"true" : "false"), ItemPointerToString(&info->pgClassStoredRelations[sr].pgClassTid)); for (rsf = 0; rsf < info->relSegFilesCount; rsf++) elog(WARNING, "Database Info: Relation segment file (tablespace %u, relation %u, segment file num %d)", - info->relSegFiles[rsf].tablespaceRelFile.tablespace, - info->relSegFiles[rsf].tablespaceRelFile.relation, + info->relSegFiles[rsf].tablespaceRelFile.tablespace, + info->relSegFiles[rsf].tablespaceRelFile.relation, info->relSegFiles[rsf].segmentFileNum); for (grn = 0; grn < info->gpRelationNodesCount; grn++) elog(WARNING, "Database Info: Tablespace %u, relation %u node information (persistent TID %s, perstent serial number " INT64_FORMAT ")", - info->gpRelationNodes[grn].tablespaceRelFile.tablespace, - info->gpRelationNodes[grn].tablespaceRelFile.relation, + info->gpRelationNodes[grn].tablespaceRelFile.tablespace, + info->gpRelationNodes[grn].tablespaceRelFile.relation, ItemPointerToString(&info->gpRelationNodes[grn].persistentTid), info->gpRelationNodes[grn].persistentSerialNum); for (m = 0; m < info->miscEntriesCount; m++) elog(WARNING, "Database Info: Misc entry #%d (tablespace %u, directory = %s, name '%s')", - m, + m, info->miscEntries[m].tablespace, (info->miscEntries[m].isDir ? "true" : "false"), info->miscEntries[m].name); @@ -175,30 +177,31 @@ void DatabaseInfo_Trace(DatabaseInfo *info) /* * DatabaseInfo_FindDbInfoRel() * Lookup an entry in the info hash table. - * + * * Note: called nowhere in the source, purely available for debugging. */ -static DbInfoRel *DatabaseInfo_FindDbInfoRel( - HTAB *dbInfoRelHashTable, - Oid reltablespaceOid, - Oid relfilenodeOid) +static DbInfoRel * +DatabaseInfo_FindDbInfoRel( + HTAB *dbInfoRelHashTable, + Oid reltablespaceOid, + Oid relfilenodeOid) { - DbInfoRel *dbInfoRel; + DbInfoRel *dbInfoRel; DbInfoRelKeyPair *dbInfoRelKey; - bool found; + bool found; Assert(reltablespaceOid != 0); - dbInfoRelKey = (DbInfoRelKeyPair *)palloc0(sizeof(DbInfoRelKeyPair)); + dbInfoRelKey = (DbInfoRelKeyPair *) palloc0(sizeof(DbInfoRelKeyPair)); dbInfoRelKey->reltablespace = reltablespaceOid; dbInfoRelKey->relfilenode = relfilenodeOid; - dbInfoRel = - (DbInfoRel*) - hash_search(dbInfoRelHashTable, - dbInfoRelKey, - HASH_FIND, - &found); + dbInfoRel = + (DbInfoRel *) + hash_search(dbInfoRelHashTable, + dbInfoRelKey, + HASH_FIND, + &found); if (!found) { elog(ERROR, "pg_class entry (tablespace %u, relfilenode %u) not found", @@ -222,20 +225,21 @@ static DbInfoRel *DatabaseInfo_FindDbInfoRel( * * XXX - Why not just use repalloc? */ -static void DatabaseInfo_Grow( - void **array, - int32 arrayCount, - int32 *arrayMaxCount, - int32 elementLen) +static void +DatabaseInfo_Grow( + void **array, + int32 arrayCount, + int32 *arrayMaxCount, + int32 elementLen) { - void *newArray; - + void *newArray; + (*arrayMaxCount) *= 2; - newArray = palloc((*arrayMaxCount)*elementLen); + newArray = palloc((*arrayMaxCount) * elementLen); memcpy( - newArray, - (*array), - arrayCount*elementLen); + newArray, + (*array), + arrayCount * elementLen); pfree(*array); *array = newArray; } @@ -244,11 +248,11 @@ static void DatabaseInfo_Grow( * DatabaseInfo_DbInfoRelHashTableInit() * Construct a hash table of DbInfoRel */ -static HTAB* +static HTAB * DatabaseInfo_DbInfoRelHashTableInit() { - HASHCTL info; - int hash_flags; + HASHCTL info; + int hash_flags; /* Set key and entry sizes. 
*/ MemSet(&info, 0, sizeof(info)); @@ -265,11 +269,11 @@ DatabaseInfo_DbInfoRelHashTableInit() * DatabaseInfo_RelationIdHashTableInit() * Construct a hash table of RelationIdEntry */ -static HTAB* +static HTAB * DatabaseInfo_RelationIdHashTableInit() { - HASHCTL info; - int hash_flags; + HASHCTL info; + int hash_flags; /* Set key and entry sizes. */ MemSet(&info, 0, sizeof(info)); @@ -290,8 +294,8 @@ DatabaseInfo_RelationIdHashTableInit() static HTAB * DatabaseInfo_PgAppendOnlyHashTableInit() { - HASHCTL info; - int hash_flags; + HASHCTL info; + int hash_flags; /* Set key and entry sizes. */ MemSet(&info, 0, sizeof(info)); @@ -309,20 +313,21 @@ DatabaseInfo_PgAppendOnlyHashTableInit() * DatabaseInfo_AddRelationId() * Add an entry to a dbInfoRel hash table */ -static void DatabaseInfo_AddRelationId( - HTAB *relationIdHashTable, - DbInfoRel *dbInfoRel) +static void +DatabaseInfo_AddRelationId( + HTAB *relationIdHashTable, + DbInfoRel *dbInfoRel) { RelationIdEntry *relationIdEntry; - bool found; - - relationIdEntry = - (RelationIdEntry*) - hash_search(relationIdHashTable, - (void *) &dbInfoRel->relationOid, - HASH_ENTER, - &found); + bool found; + + relationIdEntry = + (RelationIdEntry *) + hash_search(relationIdHashTable, + (void *) &dbInfoRel->relationOid, + HASH_ENTER, + &found); if (found) { elog(ERROR, "Duplicate pg_class entry (relation id %u)", @@ -336,20 +341,21 @@ static void DatabaseInfo_AddRelationId( * DatabaseInfo_FindRelationId() * Lookup an entry to a RelationIdEntry hash table */ -static DbInfoRel *DatabaseInfo_FindRelationId( - HTAB *relationIdHashTable, - Oid relationId) +static DbInfoRel * +DatabaseInfo_FindRelationId( + HTAB *relationIdHashTable, + Oid relationId) { RelationIdEntry *relationIdEntry; - bool found; - - relationIdEntry = - (RelationIdEntry*) - hash_search(relationIdHashTable, - (void *) &relationId, - HASH_FIND, - &found); + bool found; + + relationIdEntry = + (RelationIdEntry *) + hash_search(relationIdHashTable, + (void *) &relationId, + HASH_FIND, + &found); if (!found) { elog(ERROR, "pg_class entry (relation id %u) not found", @@ -364,21 +370,22 @@ static DbInfoRel *DatabaseInfo_FindRelationId( * DatabaseInfo_AddPgAppendOnly() * Add an entry to a pgAppendOnly hash table. 
*/ -static void DatabaseInfo_AddPgAppendOnly( - HTAB *pgAppendOnlyHashTable, - Oid relationId, - Form_pg_appendonly aoEntry) +static void +DatabaseInfo_AddPgAppendOnly( + HTAB *pgAppendOnlyHashTable, + Oid relationId, + Form_pg_appendonly aoEntry) { PgAppendOnlyHashEntry *pgAppendOnlyHashEntry; - bool found; - - pgAppendOnlyHashEntry = - (PgAppendOnlyHashEntry*) - hash_search(pgAppendOnlyHashTable, - (void *) &relationId, - HASH_ENTER, - &found); + bool found; + + pgAppendOnlyHashEntry = + (PgAppendOnlyHashEntry *) + hash_search(pgAppendOnlyHashTable, + (void *) &relationId, + HASH_ENTER, + &found); if (found) elog(ERROR, "More than one pg_appendonly entry (relation id %u)", relationId); @@ -393,19 +400,19 @@ static void DatabaseInfo_AddPgAppendOnly( */ static Form_pg_appendonly DatabaseInfo_FindPgAppendOnly( - HTAB *pgAppendOnlyHashTable, - Oid relationId) + HTAB *pgAppendOnlyHashTable, + Oid relationId) { PgAppendOnlyHashEntry *pgAppendOnlyHashEntry; - bool found; - - pgAppendOnlyHashEntry = - (PgAppendOnlyHashEntry*) - hash_search(pgAppendOnlyHashTable, - (void *) &relationId, - HASH_FIND, - &found); + bool found; + + pgAppendOnlyHashEntry = + (PgAppendOnlyHashEntry *) + hash_search(pgAppendOnlyHashTable, + (void *) &relationId, + HASH_FIND, + &found); if (!found) { elog(ERROR, "pg_appendonly entry (relation id %u) not found", @@ -421,11 +428,12 @@ DatabaseInfo_FindPgAppendOnly( * DatabaseInfo_AddTablespace() * Add a tablespace to the DatabaseInfo */ -static void DatabaseInfo_AddTablespace( - DatabaseInfo *info, - Oid tablespace) +static void +DatabaseInfo_AddTablespace( + DatabaseInfo *info, + Oid tablespace) { - int t; + int t; t = 0; while (true) @@ -436,74 +444,76 @@ static void DatabaseInfo_AddTablespace( if (t >= info->tablespacesMaxCount) { DatabaseInfo_Grow( - (void**)&info->tablespaces, - info->tablespacesCount, - &info->tablespacesMaxCount, - sizeof(Oid)); + (void **) &info->tablespaces, + info->tablespacesCount, + &info->tablespacesMaxCount, + sizeof(Oid)); } info->tablespaces[info->tablespacesCount++] = tablespace; break; } - + if (info->tablespaces[t] == tablespace) break; - + t++; } } -static void DatabaseInfo_AddExtraSegmentFile( - DatabaseInfo *info, - Oid tablespace, - Oid relfilenode, - int32 segmentFileNum, - int64 eof) +static void +DatabaseInfo_AddExtraSegmentFile( + DatabaseInfo *info, + Oid tablespace, + Oid relfilenode, + int32 segmentFileNum, + int64 eof) { - DbInfoExtraSegmentFile *dbInfoExtraSegmentFile; + DbInfoExtraSegmentFile *dbInfoExtraSegmentFile; - if (info->extraSegmentFilesCount>= info->extraSegmentFilesMaxCount) + if (info->extraSegmentFilesCount >= info->extraSegmentFilesMaxCount) { DatabaseInfo_Grow( - (void**)&info->extraSegmentFiles, - info->extraSegmentFilesCount, - &info->extraSegmentFilesMaxCount, - sizeof(DbInfoExtraSegmentFile)); + (void **) &info->extraSegmentFiles, + info->extraSegmentFilesCount, + &info->extraSegmentFilesMaxCount, + sizeof(DbInfoExtraSegmentFile)); } - dbInfoExtraSegmentFile = - &info->extraSegmentFiles[info->extraSegmentFilesCount]; + dbInfoExtraSegmentFile = + &info->extraSegmentFiles[info->extraSegmentFilesCount]; info->extraSegmentFilesCount++; - + dbInfoExtraSegmentFile->relfilenode = relfilenode; dbInfoExtraSegmentFile->segmentFileNum = segmentFileNum; dbInfoExtraSegmentFile->tablespaceOid = tablespace; dbInfoExtraSegmentFile->eof = eof; } -static void DatabaseInfo_AddAppendOnlyCatalogSegmentInfo( - DbInfoRel *dbInfoRel, - int32 segmentFileNum, - int64 logicalEof) +static void 
+DatabaseInfo_AddAppendOnlyCatalogSegmentInfo( + DbInfoRel *dbInfoRel, + int32 segmentFileNum, + int64 logicalEof) { - DbInfoAppendOnlyCatalogSegmentInfo *appendOnlyCatalogSegmentInfo; + DbInfoAppendOnlyCatalogSegmentInfo *appendOnlyCatalogSegmentInfo; if (dbInfoRel->appendOnlyCatalogSegmentInfoCount >= dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount) { DatabaseInfo_Grow( - (void**)&dbInfoRel->appendOnlyCatalogSegmentInfo, - dbInfoRel->appendOnlyCatalogSegmentInfoCount, - &dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount, - sizeof(DbInfoAppendOnlyCatalogSegmentInfo)); + (void **) &dbInfoRel->appendOnlyCatalogSegmentInfo, + dbInfoRel->appendOnlyCatalogSegmentInfoCount, + &dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount, + sizeof(DbInfoAppendOnlyCatalogSegmentInfo)); } appendOnlyCatalogSegmentInfo = &dbInfoRel->appendOnlyCatalogSegmentInfo[dbInfoRel->appendOnlyCatalogSegmentInfoCount]; dbInfoRel->appendOnlyCatalogSegmentInfoCount++; - + appendOnlyCatalogSegmentInfo->segmentFileNum = segmentFileNum; appendOnlyCatalogSegmentInfo->logicalEof = logicalEof; if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "DatabaseInfo_AddAppendOnlyCatalogSegmentInfo: relation id %u, relation name %s, tablespace %u, relfilenode %u, segment file #%d, EOF " INT64_FORMAT, dbInfoRel->relationOid, dbInfoRel->relname, @@ -515,36 +525,37 @@ static void DatabaseInfo_AddAppendOnlyCatalogSegmentInfo( } -static void DatabaseInfo_AddPgClassStoredRelation( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - HTAB *relationIdHashTable, - Oid relfilenode, - ItemPointer pgClassTid, - Oid relationOid, - char *relname, - Oid reltablespace, - char relkind, - char relstorage, - Oid relam, - int relnatts) +static void +DatabaseInfo_AddPgClassStoredRelation( + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + HTAB *relationIdHashTable, + Oid relfilenode, + ItemPointer pgClassTid, + Oid relationOid, + char *relname, + Oid reltablespace, + char relkind, + char relstorage, + Oid relam, + int relnatts) { - DbInfoRel *dbInfoRel; - bool found; + DbInfoRel *dbInfoRel; + bool found; DbInfoRelKeyPair *dbInfoRelKey; Assert(reltablespace != 0); - dbInfoRelKey = (DbInfoRelKeyPair *)palloc0(sizeof(DbInfoRelKeyPair)); + dbInfoRelKey = (DbInfoRelKeyPair *) palloc0(sizeof(DbInfoRelKeyPair)); dbInfoRelKey->reltablespace = reltablespace; dbInfoRelKey->relfilenode = relfilenode; - dbInfoRel = - (DbInfoRel*) - hash_search(dbInfoRelHashTable, - dbInfoRelKey, - HASH_ENTER, - &found); + dbInfoRel = + (DbInfoRel *) + hash_search(dbInfoRelHashTable, + dbInfoRelKey, + HASH_ENTER, + &found); if (found) elog(ERROR, "More than one pg_class entry ('%s' %u and '%s' %u) references the same tablespace %u and relfilenode %u", dbInfoRel->relname, @@ -564,47 +575,47 @@ static void DatabaseInfo_AddPgClassStoredRelation( dbInfoRel->relnatts = relnatts; dbInfoRel->gpRelationNodesMaxCount = 1; - dbInfoRel->gpRelationNodes = - palloc0(dbInfoRel->gpRelationNodesMaxCount * sizeof(DbInfoGpRelationNode)); + dbInfoRel->gpRelationNodes = + palloc0(dbInfoRel->gpRelationNodesMaxCount * sizeof(DbInfoGpRelationNode)); dbInfoRel->gpRelationNodesCount = 0; - - dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount= 1; - dbInfoRel->appendOnlyCatalogSegmentInfo = - palloc0(dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount * sizeof(DbInfoAppendOnlyCatalogSegmentInfo)); + + dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount = 1; + dbInfoRel->appendOnlyCatalogSegmentInfo = + palloc0(dbInfoRel->appendOnlyCatalogSegmentInfoMaxCount * 
sizeof(DbInfoAppendOnlyCatalogSegmentInfo)); dbInfoRel->appendOnlyCatalogSegmentInfoCount = 0; - + dbInfoRel->physicalSegmentFilesMaxCount = 1; - dbInfoRel->physicalSegmentFiles = - palloc0(dbInfoRel->physicalSegmentFilesMaxCount * sizeof(DbInfoSegmentFile)); + dbInfoRel->physicalSegmentFiles = + palloc0(dbInfoRel->physicalSegmentFilesMaxCount * sizeof(DbInfoSegmentFile)); dbInfoRel->physicalSegmentFilesCount = 0; DatabaseInfo_AddRelationId( - relationIdHashTable, - dbInfoRel); + relationIdHashTable, + dbInfoRel); } -static bool DatabaseInfo_AddGpRelationNode( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - Oid reltablespace, - Oid relfilenode, - int32 segmentFileNum, - ItemPointer persistentTid, - int64 persistentSerialNum, - ItemPointer gpRelationNodeTid) +static bool +DatabaseInfo_AddGpRelationNode( + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + Oid reltablespace, + Oid relfilenode, + int32 segmentFileNum, + ItemPointer persistentTid, + int64 persistentSerialNum, + ItemPointer gpRelationNodeTid) { - DbInfoRel *dbInfoRel; - bool found; + DbInfoRel *dbInfoRel; + bool found; DbInfoGpRelationNode *dbInfoGpRelationNode; DbInfoRelKeyPair *dbInfoRelKey; - dbInfoRelKey = (DbInfoRelKeyPair *)palloc0(sizeof(DbInfoRelKeyPair)); + dbInfoRelKey = (DbInfoRelKeyPair *) palloc0(sizeof(DbInfoRelKeyPair)); /* - * pg_class and gp_relation_node stores 0 for tablespace if - * relation uses default tablespace so get the value of the - * default tablespace here + * pg_class and gp_relation_node stores 0 for tablespace if relation uses + * default tablespace so get the value of the default tablespace here */ if (reltablespace == 0) dbInfoRelKey->reltablespace = info->defaultTablespace; @@ -612,30 +623,30 @@ static bool DatabaseInfo_AddGpRelationNode( dbInfoRelKey->reltablespace = reltablespace; dbInfoRelKey->relfilenode = relfilenode; - dbInfoRel = - (DbInfoRel*) - hash_search(dbInfoRelHashTable, - dbInfoRelKey, - HASH_FIND, - &found); + dbInfoRel = + (DbInfoRel *) + hash_search(dbInfoRelHashTable, + dbInfoRelKey, + HASH_FIND, + &found); - //Changes to solve MPP-16346 - if(!dbInfoRel) - return found; + /* Changes to solve MPP-16346 */ + if (!dbInfoRel) + return found; if (found) { if (dbInfoRel->gpRelationNodesCount >= dbInfoRel->gpRelationNodesMaxCount) { DatabaseInfo_Grow( - (void**)&dbInfoRel->gpRelationNodes, - dbInfoRel->gpRelationNodesCount, - &dbInfoRel->gpRelationNodesMaxCount, - sizeof(DbInfoGpRelationNode)); + (void **) &dbInfoRel->gpRelationNodes, + dbInfoRel->gpRelationNodesCount, + &dbInfoRel->gpRelationNodesMaxCount, + sizeof(DbInfoGpRelationNode)); } - dbInfoGpRelationNode = - &dbInfoRel->gpRelationNodes[dbInfoRel->gpRelationNodesCount]; + dbInfoGpRelationNode = + &dbInfoRel->gpRelationNodes[dbInfoRel->gpRelationNodesCount]; dbInfoRel->gpRelationNodesCount++; } else @@ -643,121 +654,125 @@ static bool DatabaseInfo_AddGpRelationNode( if (info->parentlessGpRelationNodesCount >= dbInfoRel->physicalSegmentFilesMaxCount) { DatabaseInfo_Grow( - (void**)&info->parentlessGpRelationNodes, - info->parentlessGpRelationNodesCount, - &info->parentlessGpRelationNodesMaxCount, - sizeof(DbInfoGpRelationNode)); + (void **) &info->parentlessGpRelationNodes, + info->parentlessGpRelationNodesCount, + &info->parentlessGpRelationNodesMaxCount, + sizeof(DbInfoGpRelationNode)); } - dbInfoGpRelationNode = - &info->parentlessGpRelationNodes[info->parentlessGpRelationNodesCount]; + dbInfoGpRelationNode = + &info->parentlessGpRelationNodes[info->parentlessGpRelationNodesCount]; 
info->parentlessGpRelationNodesCount++; } - + dbInfoGpRelationNode->gpRelationNodeTid = *gpRelationNodeTid; dbInfoGpRelationNode->relfilenodeOid = relfilenode; dbInfoGpRelationNode->segmentFileNum = segmentFileNum; dbInfoGpRelationNode->persistentTid = *persistentTid; dbInfoGpRelationNode->persistentSerialNum = persistentSerialNum; - dbInfoGpRelationNode->logicalEof = 0; // This will obtained from the other sources later (e.g. aoseg / aocsseg). + dbInfoGpRelationNode->logicalEof = 0; + //This will obtained from the other sources later(e.g.aoseg / aocsseg). - if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + if (Debug_persistent_print) + elog(Persistent_DebugPrintLevel(), "DatabaseInfo_AddGpRelationNode: gp_relation_node TID %s, relfilenode %u, segment file #%d, persistent serial number " INT64_FORMAT ", persistent TID %s", ItemPointerToString(gpRelationNodeTid), relfilenode, segmentFileNum, persistentSerialNum, ItemPointerToString(persistentTid)); - + return found; } -static void DatabaseInfo_AddMiscEntry( - DatabaseInfo *info, - Oid tablespace, - bool isDir, - char *name) +static void +DatabaseInfo_AddMiscEntry( + DatabaseInfo *info, + Oid tablespace, + bool isDir, + char *name) { - MiscEntry *miscEntry; + MiscEntry *miscEntry; if (info->miscEntriesCount >= info->miscEntriesMaxCount) { DatabaseInfo_Grow( - (void**)&info->miscEntries, - info->miscEntriesCount, - &info->miscEntriesMaxCount, - sizeof(MiscEntry)); + (void **) &info->miscEntries, + info->miscEntriesCount, + &info->miscEntriesMaxCount, + sizeof(MiscEntry)); } miscEntry = &info->miscEntries[info->miscEntriesCount]; info->miscEntriesCount++; - + miscEntry->tablespace = tablespace; miscEntry->isDir = isDir; miscEntry->name = pstrdup(name); } -static void DatabaseInfo_AddPhysicalSegmentFile( - DbInfoRel *dbInfoRel, - int32 segmentFileNum, - int64 eof) +static void +DatabaseInfo_AddPhysicalSegmentFile( + DbInfoRel *dbInfoRel, + int32 segmentFileNum, + int64 eof) { - DbInfoSegmentFile *dbInfoSegmentFile; + DbInfoSegmentFile *dbInfoSegmentFile; if (dbInfoRel->physicalSegmentFilesCount >= dbInfoRel->physicalSegmentFilesMaxCount) { DatabaseInfo_Grow( - (void**)&dbInfoRel->physicalSegmentFiles, - dbInfoRel->physicalSegmentFilesCount, - &dbInfoRel->physicalSegmentFilesMaxCount, - sizeof(DbInfoSegmentFile)); + (void **) &dbInfoRel->physicalSegmentFiles, + dbInfoRel->physicalSegmentFilesCount, + &dbInfoRel->physicalSegmentFilesMaxCount, + sizeof(DbInfoSegmentFile)); } dbInfoSegmentFile = &dbInfoRel->physicalSegmentFiles[dbInfoRel->physicalSegmentFilesCount]; dbInfoRel->physicalSegmentFilesCount++; - + dbInfoSegmentFile->segmentFileNum = segmentFileNum; dbInfoSegmentFile->eof = eof; } -static void DatabaseInfo_AddRelSegFile( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - Oid tablespace, - Oid relfilenode, - int32 segmentFileNum, - int64 eof) +static void +DatabaseInfo_AddRelSegFile( + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + Oid tablespace, + Oid relfilenode, + int32 segmentFileNum, + int64 eof) { - DbInfoRel *dbInfoRel; - bool found; + DbInfoRel *dbInfoRel; + bool found; DbInfoRelKeyPair *dbInfoRelKey; - dbInfoRelKey = (DbInfoRelKeyPair *)palloc0(sizeof(DbInfoRelKeyPair)); + dbInfoRelKey = (DbInfoRelKeyPair *) palloc0(sizeof(DbInfoRelKeyPair)); dbInfoRelKey->reltablespace = tablespace; dbInfoRelKey->relfilenode = relfilenode; /* Lookup the relfilenode in our catalog cache */ - dbInfoRel = (DbInfoRel*) \ - hash_search(dbInfoRelHashTable, + dbInfoRel = (DbInfoRel *) \ + hash_search(dbInfoRelHashTable, 
dbInfoRelKey, HASH_FIND, &found); - /* + /* * If the relfilenode doesn't exist in the catalog then add it to the list * of orphaned relfilenodes. */ if (!found || dbInfoRel->dbInfoRelKey.reltablespace != tablespace) { DatabaseInfo_AddExtraSegmentFile( - info, - tablespace, - relfilenode, - segmentFileNum, - eof); + info, + tablespace, + relfilenode, + segmentFileNum, + eof); return; } @@ -767,38 +782,41 @@ static void DatabaseInfo_AddRelSegFile( eof); } -static void DatabaseInfo_AddFile( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - Oid tablespace, - char *dbDirPath, - char *name) +static void +DatabaseInfo_AddFile( + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + Oid tablespace, + char *dbDirPath, + char *name) { - int64 eof; - int itemCount; - Oid relfilenode; - uint32 segmentFileNum; - char path[MAXPGPATH]; - int fileFlags = O_RDONLY | PG_BINARY; - int fileMode = 0400; - /* File mode is S_IRUSR 00400 user has read permission */ - File file; + int64 eof; + int itemCount; + Oid relfilenode; + uint32 segmentFileNum; + char path[MAXPGPATH]; + int fileFlags = O_RDONLY | PG_BINARY; + int fileMode = 0400; + + /* File mode is S_IRUSR 00400 user has read permission */ + File file; sprintf(path, "%s/%s", dbDirPath, name); - + /* * Open the file for read. - */ + */ file = PathNameOpenFile(path, fileFlags, fileMode); - if(file < 0) + if (file < 0) { ereport(ERROR, (errcode_for_file_access(), - errmsg("Could not open segment file '%s'", + errmsg("Could not open segment file '%s'", path))); } eof = FileSeek(file, 0L, SEEK_END); - if (eof < 0) { + if (eof < 0) + { ereport(ERROR, (errcode_for_file_access(), errmsg("Could not seek to end of file \"%s\" : %m", @@ -808,7 +826,7 @@ static void DatabaseInfo_AddFile( itemCount = sscanf(name, "%u.%u", &relfilenode, &segmentFileNum); - /* + /* * UNDONE: sscanf is a rather poor scanner. For right now, just assume * properly named files. */ @@ -831,23 +849,23 @@ static void DatabaseInfo_AddFile( * - miscEntry : non-relation database files * - physicalSegmentFiles : relation segment files */ -static void +static void DatabaseInfo_Scan( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - Oid tablespace, - Oid database) + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + Oid tablespace, + Oid database) { - char *dbDirPath; - DIR *xldir; - struct dirent *xlde; - char fromfile[MAXPGPATH]; + char *dbDirPath; + DIR *xldir; + struct dirent *xlde; + char fromfile[MAXPGPATH]; /* Lookup the database path and allocate a directory scan structure */ dbDirPath = GetDatabasePath( - (tablespace == GLOBALTABLESPACE_OID ? 0 : database), - tablespace); - + (tablespace == GLOBALTABLESPACE_OID ? 0 : database), + tablespace); + xldir = AllocateDir(dbDirPath); if (xldir == NULL) ereport(ERROR, @@ -863,7 +881,10 @@ DatabaseInfo_Scan( strcmp(xlde->d_name, "..") == 0) continue; - /* Odd... On snow leopard, we get back "/" as a subdir, which is wrong. Ingore it */ + /* + * Odd... On snow leopard, we get back "/" as a subdir, which is + * wrong. Ingore it + */ if (xlde->d_name[0] == '/' && xlde->d_name[1] == '\0') continue; @@ -875,10 +896,11 @@ DatabaseInfo_Scan( ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fromfile))); + /* - * If the file went away while scanning, it's no error. - * This could happen especillay with shared relcache init file - * that is stored in global tablespace. + * If the file went away while scanning, it's no error. This could + * happen especillay with shared relcache init file that is stored + * in global tablespace. 
*/ elog(LOG, "skipping missing file %s", fromfile); continue; @@ -887,19 +909,19 @@ DatabaseInfo_Scan( if (S_ISDIR(fst.st_mode)) { DatabaseInfo_AddMiscEntry( - info, - tablespace, - /* isDir */ true, - xlde->d_name); + info, + tablespace, + /* isDir */ true, + xlde->d_name); } else if (S_ISREG(fst.st_mode)) { DatabaseInfo_AddFile( - info, - dbInfoRelHashTable, - tablespace, - dbDirPath, - xlde->d_name); + info, + dbInfoRelHashTable, + tablespace, + dbDirPath, + xlde->d_name); } } @@ -912,10 +934,11 @@ DatabaseInfo_Scan( static int DbInfoRelPtrArray_Compare(const void *entry1, const void *entry2) { - const DbInfoRel *dbInfoRel1 = *((DbInfoRel**)entry1); - const DbInfoRel *dbInfoRel2 = *((DbInfoRel**)entry2); + const DbInfoRel *dbInfoRel1 = *((DbInfoRel **) entry1); + const DbInfoRel *dbInfoRel2 = *((DbInfoRel **) entry2); + + int compresult; - int compresult; compresult = memcmp(&dbInfoRel1->dbInfoRelKey, &dbInfoRel2->dbInfoRelKey, sizeof(DbInfoRelKeyPair)); @@ -930,8 +953,8 @@ DbInfoRelPtrArray_Compare(const void *entry1, const void *entry2) static int DbInfoGpRelationNode_Compare(const void *entry1, const void *entry2) { - const DbInfoGpRelationNode *info1 = (DbInfoGpRelationNode*) entry1; - const DbInfoGpRelationNode *info2 = (DbInfoGpRelationNode*) entry2; + const DbInfoGpRelationNode *info1 = (DbInfoGpRelationNode *) entry1; + const DbInfoGpRelationNode *info2 = (DbInfoGpRelationNode *) entry2; if (info1->relfilenodeOid == info2->relfilenodeOid) { @@ -965,10 +988,10 @@ DbInfoSegmentFile_Compare(const void *entry1, const void *entry2) static int DbInfoAppendOnlyCatalogSegmentInfo_Compare(const void *entry1, const void *entry2) { - const DbInfoAppendOnlyCatalogSegmentInfo *info1 = - (DbInfoAppendOnlyCatalogSegmentInfo *) entry1; - const DbInfoAppendOnlyCatalogSegmentInfo *info2 = - (DbInfoAppendOnlyCatalogSegmentInfo *) entry2; + const DbInfoAppendOnlyCatalogSegmentInfo *info1 = + (DbInfoAppendOnlyCatalogSegmentInfo *) entry1; + const DbInfoAppendOnlyCatalogSegmentInfo *info2 = + (DbInfoAppendOnlyCatalogSegmentInfo *) entry2; if (info1->segmentFileNum == info2->segmentFileNum) return 0; @@ -980,50 +1003,50 @@ DbInfoAppendOnlyCatalogSegmentInfo_Compare(const void *entry1, const void *entry static void DatabaseInfo_CollectGpRelationNode( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable) + DatabaseInfo *info, + HTAB *dbInfoRelHashTable) { - HeapScanDesc scan; - Relation gp_relation_node_rel; - HeapTuple tuple; - - gp_relation_node_rel = - DirectOpen_GpRelationNodeOpen( - info->defaultTablespace, - info->database); + HeapScanDesc scan; + Relation gp_relation_node_rel; + HeapTuple tuple; + + gp_relation_node_rel = + DirectOpen_GpRelationNodeOpen( + info->defaultTablespace, + info->database); scan = heap_beginscan(gp_relation_node_rel, SnapshotNow, 0, NULL); while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { - bool nulls[Natts_gp_relation_node]; - Datum values[Natts_gp_relation_node]; - - Oid reltablespace; - Oid relfilenode; - int32 segmentFileNum; - int64 createMirrorDataLossTrackingSessionNum; - ItemPointerData persistentTid; - int64 persistentSerialNum; - + bool nulls[Natts_gp_relation_node]; + Datum values[Natts_gp_relation_node]; + + Oid reltablespace; + Oid relfilenode; + int32 segmentFileNum; + int64 createMirrorDataLossTrackingSessionNum; + ItemPointerData persistentTid; + int64 persistentSerialNum; + heap_deform_tuple(tuple, RelationGetDescr(gp_relation_node_rel), values, nulls); GpRelationNode_GetValues( - values, - &reltablespace, - &relfilenode, - 
&segmentFileNum, - &createMirrorDataLossTrackingSessionNum, - &persistentTid, - &persistentSerialNum); - + values, + &reltablespace, + &relfilenode, + &segmentFileNum, + &createMirrorDataLossTrackingSessionNum, + &persistentTid, + &persistentSerialNum); + if (!DatabaseInfo_AddGpRelationNode( - info, - dbInfoRelHashTable, - reltablespace, - relfilenode, - segmentFileNum, - &persistentTid, - persistentSerialNum, - &tuple->t_self)) + info, + dbInfoRelHashTable, + reltablespace, + relfilenode, + segmentFileNum, + &persistentTid, + persistentSerialNum, + &tuple->t_self)) { elog(WARNING, "Did not find matching pg_class entry for gp_relation_node entry relfilenode %u (parentless!!!)", relfilenode); @@ -1036,10 +1059,10 @@ DatabaseInfo_CollectGpRelationNode( static void DatabaseInfo_HandleAppendOnly( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - HTAB *relationIdHashTable, - HTAB *pgAppendOnlyHashTable) + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + HTAB *relationIdHashTable, + HTAB *pgAppendOnlyHashTable) { HASH_SEQ_STATUS iterateStatus; @@ -1047,24 +1070,24 @@ DatabaseInfo_HandleAppendOnly( while (true) { - DbInfoRel *dbInfoRel; + DbInfoRel *dbInfoRel; - dbInfoRel = - (DbInfoRel*) - hash_seq_search(&iterateStatus); + dbInfoRel = + (DbInfoRel *) + hash_seq_search(&iterateStatus); if (dbInfoRel == NULL) break; - + if (dbInfoRel->relstorage == RELSTORAGE_AOROWS || dbInfoRel->relstorage == RELSTORAGE_AOCOLS) { - Form_pg_appendonly aoEntry; - DbInfoRel *aosegDbInfoRel; - int i; - + Form_pg_appendonly aoEntry; + DbInfoRel *aosegDbInfoRel; + int i; + aoEntry = DatabaseInfo_FindPgAppendOnly( - pgAppendOnlyHashTable, - dbInfoRel->relationOid); + pgAppendOnlyHashTable, + dbInfoRel->relationOid); if ((aoEntry->segrelid == 0) || (aoEntry->visimaprelid == 0) || (aoEntry->visimapidxid == 0)) @@ -1075,13 +1098,13 @@ DatabaseInfo_HandleAppendOnly( aoEntry->visimaprelid, aoEntry->visimapidxid); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "DatabaseInfo_AddPgClassStoredRelation: Append-Only entry for relation id %u, relation name %s, " - "blocksize %d, safefswritesize %d, compresslevel %d, " - " checksum %s, compresstype %s, columnstore %s, segrelid %u, blkdirrelid %u, blkdiridxid %u, " + "blocksize %d, safefswritesize %d, compresslevel %d, " + " checksum %s, compresstype %s, columnstore %s, segrelid %u, blkdirrelid %u, blkdiridxid %u, " " visimaprelid %u, visimapidxid %u", - dbInfoRel->relationOid, - dbInfoRel->relname, + dbInfoRel->relationOid, + dbInfoRel->relname, aoEntry->blocksize, aoEntry->safefswritesize, aoEntry->compresslevel, @@ -1099,36 +1122,36 @@ DatabaseInfo_HandleAppendOnly( */ aosegDbInfoRel = DatabaseInfo_FindRelationId( - relationIdHashTable, - aoEntry->segrelid); + relationIdHashTable, + aoEntry->segrelid); Assert(aosegDbInfoRel != NULL); if (dbInfoRel->relstorage == RELSTORAGE_AOROWS) { FileSegInfo **aoSegfileArray; - int totalAoSegFiles; - - Relation pg_aoseg_rel; - - pg_aoseg_rel = - DirectOpen_PgAoSegOpenDynamic( - aoEntry->segrelid, - dbInfoRel->dbInfoRelKey.reltablespace, - info->database, - aosegDbInfoRel->dbInfoRelKey.relfilenode); - - aoSegfileArray = - GetAllFileSegInfo_pg_aoseg_rel( - dbInfoRel->relname, - pg_aoseg_rel, - SnapshotNow, - &totalAoSegFiles); + int totalAoSegFiles; + + Relation pg_aoseg_rel; + + pg_aoseg_rel = + DirectOpen_PgAoSegOpenDynamic( + aoEntry->segrelid, + dbInfoRel->dbInfoRelKey.reltablespace, + info->database, + aosegDbInfoRel->dbInfoRelKey.relfilenode); + + aoSegfileArray = + 
GetAllFileSegInfo_pg_aoseg_rel( + dbInfoRel->relname, + pg_aoseg_rel, + SnapshotNow, + &totalAoSegFiles); for (i = 0; i < totalAoSegFiles; i++) { DatabaseInfo_AddAppendOnlyCatalogSegmentInfo( - dbInfoRel, - aoSegfileArray[i]->segno, - aoSegfileArray[i]->eof); + dbInfoRel, + aoSegfileArray[i]->segno, + aoSegfileArray[i]->eof); } DirectOpen_PgAoSegClose(pg_aoseg_rel); @@ -1136,40 +1159,40 @@ DatabaseInfo_HandleAppendOnly( else if (dbInfoRel->relstorage == RELSTORAGE_AOCOLS) { struct AOCSFileSegInfo **aocsSegfileArray; - int totalAocsSegFiles; + int totalAocsSegFiles; - Relation pg_aocsseg_rel; + Relation pg_aocsseg_rel; pg_aocsseg_rel = - DirectOpen_PgAoCsSegOpenDynamic( - aoEntry->segrelid, - dbInfoRel->dbInfoRelKey.reltablespace, - info->database, - aosegDbInfoRel->dbInfoRelKey.relfilenode); - + DirectOpen_PgAoCsSegOpenDynamic( + aoEntry->segrelid, + dbInfoRel->dbInfoRelKey.reltablespace, + info->database, + aosegDbInfoRel->dbInfoRelKey.relfilenode); + aocsSegfileArray = GetAllAOCSFileSegInfo_pg_aocsseg_rel( - dbInfoRel->relnatts, - dbInfoRel->relname, - pg_aocsseg_rel, - SnapshotNow, - &totalAocsSegFiles); + dbInfoRel->relnatts, + dbInfoRel->relname, + pg_aocsseg_rel, + SnapshotNow, + &totalAocsSegFiles); for (i = 0; i < totalAocsSegFiles; i++) { - int32 segmentFileNum; - int columnNum; + int32 segmentFileNum; + int columnNum; segmentFileNum = aocsSegfileArray[i]->segno; for (columnNum = 0; columnNum < dbInfoRel->relnatts; columnNum++) { - AOCSVPInfoEntry *entry; + AOCSVPInfoEntry *entry; entry = getAOCSVPEntry(aocsSegfileArray[i], columnNum); - + DatabaseInfo_AddAppendOnlyCatalogSegmentInfo( - dbInfoRel, - columnNum * AOTupleId_MultiplierSegmentFileNum + segmentFileNum, - entry->eof); + dbInfoRel, + columnNum * AOTupleId_MultiplierSegmentFileNum + segmentFileNum, + entry->eof); } } @@ -1181,17 +1204,17 @@ DatabaseInfo_HandleAppendOnly( static void DatabaseInfo_CollectPgAppendOnly( - DatabaseInfo *info, - HTAB *pgAppendOnlyHashTable) + DatabaseInfo *info, + HTAB *pgAppendOnlyHashTable) { Relation pg_appendonly_rel; HeapScanDesc scan; HeapTuple tuple; - pg_appendonly_rel = - DirectOpen_PgAppendOnlyOpen( - info->defaultTablespace, + pg_appendonly_rel = + DirectOpen_PgAppendOnlyOpen( + info->defaultTablespace, info->database); scan = heap_beginscan(pg_appendonly_rel, SnapshotNow, 0, NULL); while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) @@ -1207,7 +1230,7 @@ DatabaseInfo_CollectPgAppendOnly( aoEntry->relid); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "DatabaseInfo_Collect: Append-Only entry for relation id %u, " "blocksize %d, safefswritesize %d, compresslevel %d, " " checksum %s, compresstype %s, columnstore %s, segrelid %u, blkdirrelid %u, blkdiridxid %u", @@ -1223,23 +1246,23 @@ DatabaseInfo_CollectPgAppendOnly( aoEntry->blkdiridxid); DatabaseInfo_AddPgAppendOnly( - pgAppendOnlyHashTable, - aoEntry->relid, - aoEntry); + pgAppendOnlyHashTable, + aoEntry->relid, + aoEntry); } heap_endscan(scan); DirectOpen_PgAppendOnlyClose(pg_appendonly_rel); - + } static void DatabaseInfo_CollectPgClass( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - HTAB *relationIdHashTable, - Snapshot snapshot, - int *count) + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + HTAB *relationIdHashTable, + Snapshot snapshot, + int *count) { Relation pg_class_rel; @@ -1258,23 +1281,23 @@ DatabaseInfo_CollectPgClass( * database directories are active. I.e. Fill up the tablespaces array. 
*/ *count = 0; - pg_class_rel = - DirectOpen_PgClassOpen( - info->defaultTablespace, - info->database); + pg_class_rel = + DirectOpen_PgClassOpen( + info->defaultTablespace, + info->database); scan = heap_beginscan(pg_class_rel, snapshot, 0, NULL); while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { - Oid relationOid; + Oid relationOid; - Form_pg_class form_pg_class; + Form_pg_class form_pg_class; - Oid reltablespace; + Oid reltablespace; - char relkind; - char relstorage; + char relkind; + char relstorage; - int relnatts; + int relnatts; relationOid = HeapTupleGetOid(tuple); @@ -1286,7 +1309,7 @@ DatabaseInfo_CollectPgClass( reltablespace = info->defaultTablespace; /* - * Skip non-storage relations. + * Skip non-storage relations. */ relkind = form_pg_class->relkind; @@ -1302,22 +1325,22 @@ DatabaseInfo_CollectPgClass( relnatts = form_pg_class->relnatts; DatabaseInfo_AddTablespace( - info, - reltablespace); + info, + reltablespace); DatabaseInfo_AddPgClassStoredRelation( - info, - dbInfoRelHashTable, - relationIdHashTable, - form_pg_class->relfilenode, - &tuple->t_self, - relationOid, - form_pg_class->relname.data, - reltablespace, - relkind, - relstorage, - form_pg_class->relam, - relnatts); + info, + dbInfoRelHashTable, + relationIdHashTable, + form_pg_class->relfilenode, + &tuple->t_self, + relationOid, + form_pg_class->relname.data, + reltablespace, + relkind, + relstorage, + form_pg_class->relam, + relnatts); (*count)++; } @@ -1332,23 +1355,23 @@ DatabaseInfo_CollectPgClass( */ static void DatabaseInfo_SortRelArray( - DatabaseInfo *info, - HTAB *dbInfoRelHashTable, - int count) + DatabaseInfo *info, + HTAB *dbInfoRelHashTable, + int count) { - HASH_SEQ_STATUS iterateStatus; - DbInfoRel **dbInfoRelPtrArray; - int d; + HASH_SEQ_STATUS iterateStatus; + DbInfoRel **dbInfoRelPtrArray; + int d; /* This function will populate the dbInfoRelArray */ Assert(info->dbInfoRelArray == NULL); /* Construct an array of pointers by scanning through the hash table */ - dbInfoRelPtrArray = (DbInfoRel**) palloc(sizeof(DbInfoRel*) * count); + dbInfoRelPtrArray = (DbInfoRel **) palloc(sizeof(DbInfoRel *) * count); hash_seq_init(&iterateStatus, dbInfoRelHashTable); for (d = 0; d < count; d++) { - dbInfoRelPtrArray[d] = (DbInfoRel*) hash_seq_search(&iterateStatus); + dbInfoRelPtrArray[d] = (DbInfoRel *) hash_seq_search(&iterateStatus); /* should have as many entries in the hash scan as "count" */ if (dbInfoRelPtrArray[d] == NULL) @@ -1358,35 +1381,33 @@ DatabaseInfo_SortRelArray( /* double check that the hash contained the right number of elements */ if (hash_seq_search(&iterateStatus) != NULL) elog(ERROR, "too many entries in dbInfoRelHashTable"); - + /* sort the pointer array */ qsort(dbInfoRelPtrArray, - count, - sizeof(DbInfoRel*), + count, + sizeof(DbInfoRel *), DbInfoRelPtrArray_Compare); - + /* * Finally convert the sorted pointer array into a sorted record array. */ - info->dbInfoRelArray = (DbInfoRel*) palloc(sizeof(DbInfoRel)*count); + info->dbInfoRelArray = (DbInfoRel *) palloc(sizeof(DbInfoRel) * count); for (d = 0; d < count; d++) { info->dbInfoRelArray[d] = *(dbInfoRelPtrArray[d]); - + /* - * For each record in the array we have three lists: - * - gpRelationNodes - * - appendOnlyCatalogSegmentInfo - * - physicalSegmentFiles + * For each record in the array we have three lists: - gpRelationNodes + * - appendOnlyCatalogSegmentInfo - physicalSegmentFiles * - * All three of which need to be sorted on segmentFileNum otherwise - * we will not be able to merge the lists correctly. 
+ * All three of which need to be sorted on segmentFileNum otherwise we + * will not be able to merge the lists correctly. * * XXX - this seems like a bad design, it seems like we have three - * sources of information on the same thing, which should be able - * to be satisfied with a single Hash rather than trying to keep - * around three different lists and have code spread throughout the - * source trying to deal with merging the lists. + * sources of information on the same thing, which should be able to + * be satisfied with a single Hash rather than trying to keep around + * three different lists and have code spread throughout the source + * trying to deal with merging the lists. */ if (info->dbInfoRelArray[d].gpRelationNodes) qsort(info->dbInfoRelArray[d].gpRelationNodes, @@ -1418,59 +1439,53 @@ DatabaseInfo_SortRelArray( *------------------------------------------------------------------------- */ DatabaseInfo * DatabaseInfo_Collect( - Oid database, - Oid defaultTablespace, - Snapshot snapshot, - bool collectGpRelationNodeInfo, - bool collectAppendOnlyCatalogSegmentInfo, - bool scanFileSystem) + Oid database, + Oid defaultTablespace, + Snapshot snapshot, + bool collectGpRelationNodeInfo, + bool collectAppendOnlyCatalogSegmentInfo, + bool scanFileSystem) { - DatabaseInfo *info; - HTAB *dbInfoRelHashTable; - HTAB *relationIdHashTable; - HTAB *pgAppendOnlyHashTable; - int count; - int t; + DatabaseInfo *info; + HTAB *dbInfoRelHashTable; + HTAB *relationIdHashTable; + HTAB *pgAppendOnlyHashTable; + int count; + int t; /* Create local hash tables */ - dbInfoRelHashTable = DatabaseInfo_DbInfoRelHashTableInit(); - relationIdHashTable = DatabaseInfo_RelationIdHashTableInit(); + dbInfoRelHashTable = DatabaseInfo_DbInfoRelHashTableInit(); + relationIdHashTable = DatabaseInfo_RelationIdHashTableInit(); pgAppendOnlyHashTable = DatabaseInfo_PgAppendOnlyHashTableInit(); /* Setup an initial empty DatabaseInfo */ - info = (DatabaseInfo*)palloc0(sizeof(DatabaseInfo)); - info->database = database; - info->defaultTablespace = defaultTablespace; - info->collectGpRelationNodeInfo = collectGpRelationNodeInfo; + info = (DatabaseInfo *) palloc0(sizeof(DatabaseInfo)); + info->database = database; + info->defaultTablespace = defaultTablespace; + info->collectGpRelationNodeInfo = collectGpRelationNodeInfo; info->collectAppendOnlyCatalogSegmentInfo = collectAppendOnlyCatalogSegmentInfo; - /* - * Allocate the extensible arrays: - * - tablespaces - * - miscEntries - * - extraSegmentFiles - * - parentlessGpRelationNodes + /* + * Allocate the extensible arrays: - tablespaces - miscEntries - + * extraSegmentFiles - parentlessGpRelationNodes */ info->tablespacesMaxCount = 10; - info->tablespaces = palloc0(info->tablespacesMaxCount*sizeof(Oid)); + info->tablespaces = palloc0(info->tablespacesMaxCount * sizeof(Oid)); info->miscEntriesMaxCount = 50; - info->miscEntries = palloc0(info->miscEntriesMaxCount*sizeof(MiscEntry)); + info->miscEntries = palloc0(info->miscEntriesMaxCount * sizeof(MiscEntry)); info->extraSegmentFilesMaxCount = 10; - info->extraSegmentFiles = \ - palloc0(info->extraSegmentFilesMaxCount*sizeof(DbInfoExtraSegmentFile)); + info->extraSegmentFiles = \ + palloc0(info->extraSegmentFilesMaxCount * sizeof(DbInfoExtraSegmentFile)); info->parentlessGpRelationNodesMaxCount = 10; - info->parentlessGpRelationNodes = \ - palloc0(info->parentlessGpRelationNodesMaxCount*sizeof(DbInfoGpRelationNode)); - - /* - * Start Collecting information: - * - from pg_class - * - from pg_appendonly [if specified] - * - from 
gp_relation_node [if specified] - * - from file system + info->parentlessGpRelationNodes = \ + palloc0(info->parentlessGpRelationNodesMaxCount * sizeof(DbInfoGpRelationNode)); + + /* + * Start Collecting information: - from pg_class - from pg_appendonly [if + * specified] - from gp_relation_node [if specified] - from file system */ DatabaseInfo_CollectPgClass(info, dbInfoRelHashTable, relationIdHashTable, NULL, &count); DatabaseInfo_CollectPgAppendOnly(info, pgAppendOnlyHashTable); @@ -1478,12 +1493,12 @@ DatabaseInfo_Collect( if (info->collectAppendOnlyCatalogSegmentInfo) { /* - * We need the dbInfoRel hash table to translate pg_appendonly.segrelid - * to the ao[cs]seg relfilenode. + * We need the dbInfoRel hash table to translate + * pg_appendonly.segrelid to the ao[cs]seg relfilenode. */ - DatabaseInfo_HandleAppendOnly(info, - dbInfoRelHashTable, - relationIdHashTable, + DatabaseInfo_HandleAppendOnly(info, + dbInfoRelHashTable, + relationIdHashTable, pgAppendOnlyHashTable); } @@ -1500,9 +1515,10 @@ DatabaseInfo_Collect( /* * Scan each used directory for its relation segment files and misc - * files/dirs as found within the filesystem. This /may/ contain some files - * not referenced in gp_relation_node that are from crashed backends, but - * in general should agree with the set of entries in gp_relation_node. + * files/dirs as found within the filesystem. This /may/ contain some + * files not referenced in gp_relation_node that are from crashed + * backends, but in general should agree with the set of entries in + * gp_relation_node. * * Files not present in gp_relation_node will not be mirrored and probably * require removal to maintain database/filesystem consistency. @@ -1517,7 +1533,7 @@ DatabaseInfo_Collect( /* Convert the dbInfoRelHash into array and sort it. */ DatabaseInfo_SortRelArray(info, dbInfoRelHashTable, count); - + /* Cleanup memory */ hash_destroy(dbInfoRelHashTable); hash_destroy(relationIdHashTable); @@ -1527,14 +1543,14 @@ DatabaseInfo_Collect( return info; } -void +void DatabaseInfo_AlignAppendOnly( - DatabaseInfo *info, + DatabaseInfo *info, - DbInfoRel *dbInfoRel) + DbInfoRel *dbInfoRel) { - int a; - int g; + int a; + int g; /* * Process the ao[cs]seg entries against the gp_relation_node entries. @@ -1542,8 +1558,8 @@ DatabaseInfo_AlignAppendOnly( g = 0; for (a = 0; a < dbInfoRel->appendOnlyCatalogSegmentInfoCount; a++) { - DbInfoAppendOnlyCatalogSegmentInfo *appendOnlyCatalogSegmentInfo = - &dbInfoRel->appendOnlyCatalogSegmentInfo[a]; + DbInfoAppendOnlyCatalogSegmentInfo *appendOnlyCatalogSegmentInfo = + &dbInfoRel->appendOnlyCatalogSegmentInfo[a]; while (true) { @@ -1555,7 +1571,10 @@ DatabaseInfo_AlignAppendOnly( appendOnlyCatalogSegmentInfo->segmentFileNum, appendOnlyCatalogSegmentInfo->logicalEof); - // Otherwise, ignore ao[cs]seg entries with EOF == 0 and no gp_relation_node entry. + /* + * Otherwise, ignore ao[cs]seg entries with EOF == 0 and no + * gp_relation_node entry. + */ break; } @@ -1564,12 +1583,13 @@ DatabaseInfo_AlignAppendOnly( if (dbInfoRel->gpRelationNodes[g].segmentFileNum == 0) { /* - * Segment file #0 with always have a gp_relation_node entry, but often doesn't have an aoseg entry. + * Segment file #0 with always have a gp_relation_node + * entry, but often doesn't have an aoseg entry. 
*/ g++; continue; } - + elog(ERROR, "Append-Only relation '%s' gp_relation_node entry for segment file #%d without an aoseg /aocsseg entry (case #1)", dbInfoRel->relname, dbInfoRel->gpRelationNodes[g].segmentFileNum); @@ -1582,12 +1602,13 @@ DatabaseInfo_AlignAppendOnly( } else { - Assert (dbInfoRel->gpRelationNodes[g].segmentFileNum > appendOnlyCatalogSegmentInfo->segmentFileNum); + Assert(dbInfoRel->gpRelationNodes[g].segmentFileNum > appendOnlyCatalogSegmentInfo->segmentFileNum); elog(ERROR, "Append-Only relation '%s' gp_relation_node entry for segment file #%d without an aoseg /aocsseg entry", dbInfoRel->relname, dbInfoRel->gpRelationNodes[g].segmentFileNum); } - g++; // Not reached. Protect against overly smart compilers looking at exit conditions... + g++; + //Not reached.Protect against overly smart compilers looking at exit conditions... } } diff --git a/src/backend/cdb/cdbdirectopen.c b/src/backend/cdb/cdbdirectopen.c index cca37c38bb..c7eb900b6d 100755 --- a/src/backend/cdb/cdbdirectopen.c +++ b/src/backend/cdb/cdbdirectopen.c @@ -21,7 +21,7 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "catalog/pg_authid.h" -#include "utils/fmgroids.h" /* include this before pg_am.h, for Am_btree */ +#include "utils/fmgroids.h" /* include this before pg_am.h, for Am_btree */ #include "catalog/pg_am.h" #include "catalog/pg_class.h" #include "catalog/pg_index.h" @@ -34,133 +34,133 @@ /* * pg_class. */ -static FormData_pg_class - DatabaseInfo_PgClassPgClass = - {Class_pg_class}; +static FormData_pg_class + DatabaseInfo_PgClassPgClass = +{Class_pg_class}; -static FormData_pg_attribute +static FormData_pg_attribute DatabaseInfo_PgClassAttrArray[Natts_pg_class] = - {Schema_pg_class}; +{Schema_pg_class}; -DirectOpenDefineStatic(DirectOpen_PgClass,\ -DatabaseInfo_PgClassPgClass,\ -DatabaseInfo_PgClassAttrArray,\ -true); +DirectOpenDefineStatic(DirectOpen_PgClass, \ + DatabaseInfo_PgClassPgClass, \ + DatabaseInfo_PgClassAttrArray, \ + true); /* * gp_relation_node. */ - -static FormData_pg_class - DatabaseInfo_GpRelationNodePgClass = - {Class_gp_relation_node}; -static FormData_pg_attribute +static FormData_pg_class + DatabaseInfo_GpRelationNodePgClass = +{Class_gp_relation_node}; + +static FormData_pg_attribute DatabaseInfo_GpRelationNodeAttrArray[Natts_gp_relation_node] = - {Schema_gp_relation_node}; +{Schema_gp_relation_node}; + +DirectOpenDefineStatic(DirectOpen_GpRelationNode, \ + DatabaseInfo_GpRelationNodePgClass, \ + DatabaseInfo_GpRelationNodeAttrArray, \ + false); -DirectOpenDefineStatic(DirectOpen_GpRelationNode,\ -DatabaseInfo_GpRelationNodePgClass,\ -DatabaseInfo_GpRelationNodeAttrArray,\ -false); - /* * pg_appendonly. */ -static FormData_pg_class - DatabaseInfo_PgAppendOnlyPgClass = - {Class_pg_appendonly}; +static FormData_pg_class + DatabaseInfo_PgAppendOnlyPgClass = +{Class_pg_appendonly}; -static FormData_pg_attribute +static FormData_pg_attribute DatabaseInfo_PgAppendOnlyAttrArray[Natts_pg_appendonly] = - {Schema_pg_appendonly}; +{Schema_pg_appendonly}; -DirectOpenDefineStatic(DirectOpen_PgAppendOnly,\ -DatabaseInfo_PgAppendOnlyPgClass,\ -DatabaseInfo_PgAppendOnlyAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_PgAppendOnly, \ + DatabaseInfo_PgAppendOnlyPgClass, \ + DatabaseInfo_PgAppendOnlyAttrArray, \ + false); /* * pg_aogseg. 
*/ -static FormData_pg_class - DatabaseInfo_PgAoSegPgClass = - {Class_pg_aoseg}; +static FormData_pg_class + DatabaseInfo_PgAoSegPgClass = +{Class_pg_aoseg}; -static FormData_pg_attribute +static FormData_pg_attribute DatabaseInfo_PgAoSegAttrArray[Natts_pg_aoseg] = - {Schema_pg_aoseg}; +{Schema_pg_aoseg}; -DirectOpenDefineStatic(DirectOpen_PgAoSeg,\ -DatabaseInfo_PgAoSegPgClass,\ -DatabaseInfo_PgAoSegAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_PgAoSeg, \ + DatabaseInfo_PgAoSegPgClass, \ + DatabaseInfo_PgAoSegAttrArray, \ + false); /* * pg_aocsseg. */ -static FormData_pg_class - DatabaseInfo_PgAoCsSegPgClass = - {Class_pg_aocsseg}; +static FormData_pg_class + DatabaseInfo_PgAoCsSegPgClass = +{Class_pg_aocsseg}; -static FormData_pg_attribute +static FormData_pg_attribute DatabaseInfo_PgAoCsSegAttrArray[Natts_pg_aocsseg] = - {Schema_pg_aocsseg}; +{Schema_pg_aocsseg}; -DirectOpenDefineStatic(DirectOpen_PgAoCsSeg,\ -DatabaseInfo_PgAoCsSegPgClass,\ -DatabaseInfo_PgAoCsSegAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_PgAoCsSeg, \ + DatabaseInfo_PgAoCsSegPgClass, \ + DatabaseInfo_PgAoCsSegAttrArray, \ + false); /* * gp_global_sequence. */ -static FormData_pg_class - GlobalSequence_PgClass = - {Class_gp_global_sequence}; +static FormData_pg_class + GlobalSequence_PgClass = +{Class_gp_global_sequence}; -static FormData_pg_attribute +static FormData_pg_attribute GlobalSequence_AttrArray[Natts_gp_global_sequence] = - {Schema_gp_global_sequence}; +{Schema_gp_global_sequence}; -DirectOpenDefineStatic(DirectOpen_GpGlobalSequence,\ -GlobalSequence_PgClass,\ -GlobalSequence_AttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_GpGlobalSequence, \ + GlobalSequence_PgClass, \ + GlobalSequence_AttrArray, \ + false); /* * gp_persistent_relation_node. */ -static FormData_pg_class - PersistentFileSysObj_RelationPgClass = - {Class_gp_persistent_relation_node}; +static FormData_pg_class + PersistentFileSysObj_RelationPgClass = +{Class_gp_persistent_relation_node}; -static FormData_pg_attribute +static FormData_pg_attribute PersistentFileSysObj_RelationAttrArray[Natts_gp_persistent_relation_node] = - {Schema_gp_persistent_relation_node}; +{Schema_gp_persistent_relation_node}; -DirectOpenDefineStatic(DirectOpen_GpPersistentRelationNode,\ -PersistentFileSysObj_RelationPgClass,\ -PersistentFileSysObj_RelationAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_GpPersistentRelationNode, \ + PersistentFileSysObj_RelationPgClass, \ + PersistentFileSysObj_RelationAttrArray, \ + false); /* * gp_persistent_database_node. */ -static FormData_pg_class - PersistentFileSysObj_DatabasePgClass = - {Class_gp_persistent_database_node}; +static FormData_pg_class + PersistentFileSysObj_DatabasePgClass = +{Class_gp_persistent_database_node}; -static FormData_pg_attribute +static FormData_pg_attribute PersistentFileSysObj_DatabaseAttrArray[Natts_gp_persistent_database_node] = - {Schema_gp_persistent_database_node}; +{Schema_gp_persistent_database_node}; -DirectOpenDefineStatic(DirectOpen_GpPersistentDatabaseNode,\ -PersistentFileSysObj_DatabasePgClass,\ -PersistentFileSysObj_DatabaseAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_GpPersistentDatabaseNode, \ + PersistentFileSysObj_DatabasePgClass, \ + PersistentFileSysObj_DatabaseAttrArray, \ + false); /* * This HUGE MAGIC DEFINE expands into module globals and two routines: @@ -168,18 +168,18 @@ false); * PersistentFileSysObj_TablespaceClose * It allows for opening the relation without going through pg_class, etc. 
*/ -static FormData_pg_class - PersistentFileSysObj_TablespacePgClass = - {Class_gp_persistent_tablespace_node}; +static FormData_pg_class + PersistentFileSysObj_TablespacePgClass = +{Class_gp_persistent_tablespace_node}; -static FormData_pg_attribute +static FormData_pg_attribute PersistentFileSysObj_TablespaceAttrArray[Natts_gp_persistent_tablespace_node] = - {Schema_gp_persistent_tablespace_node}; +{Schema_gp_persistent_tablespace_node}; -DirectOpenDefineStatic(DirectOpen_GpPersistentTableSpaceNode,\ -PersistentFileSysObj_TablespacePgClass,\ -PersistentFileSysObj_TablespaceAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_GpPersistentTableSpaceNode, \ + PersistentFileSysObj_TablespacePgClass, \ + PersistentFileSysObj_TablespaceAttrArray, \ + false); /* @@ -188,93 +188,95 @@ false); * PersistentFileSysObj_FilespaceClose * It allows for opening the relation without going through pg_class, etc. */ -static FormData_pg_class - PersistentFileSysObj_FilespacePgClass = - {Class_gp_persistent_filespace_node}; +static FormData_pg_class + PersistentFileSysObj_FilespacePgClass = +{Class_gp_persistent_filespace_node}; -static FormData_pg_attribute +static FormData_pg_attribute PersistentFileSysObj_FilespaceAttrArray[Natts_gp_persistent_filespace_node] = - {Schema_gp_persistent_filespace_node}; +{Schema_gp_persistent_filespace_node}; -DirectOpenDefineStatic(DirectOpen_GpPersistentFileSpaceNode,\ -PersistentFileSysObj_FilespacePgClass,\ -PersistentFileSysObj_FilespaceAttrArray,\ -false); +DirectOpenDefineStatic(DirectOpen_GpPersistentFileSpaceNode, \ + PersistentFileSysObj_FilespacePgClass, \ + PersistentFileSysObj_FilespaceAttrArray, \ + false); -// INDEX Variants +/* INDEX Variants */ /* * gp_relation_node_index. */ - -static FormData_pg_class - DatabaseInfo_GpRelationNodeIndexPgClass = - {Class_gp_relation_node_index}; -static FormData_pg_attribute +static FormData_pg_class + DatabaseInfo_GpRelationNodeIndexPgClass = +{Class_gp_relation_node_index}; + +static FormData_pg_attribute DatabaseInfo_GpRelationNodeIndexAttrArray[Natts_gp_relation_node_index] = - {Schema_gp_relation_node_index}; +{Schema_gp_relation_node_index}; -static FormData_pg_am +static FormData_pg_am DatabaseInfo_GpRelationNodeIndexPgAm = - {Am_gp_relation_node_index}; +{Am_gp_relation_node_index}; -static FormData_pg_index +static FormData_pg_index DatabaseInfo_GpRelationNodeIndexPgIndex = - {Index_gp_relation_node_index}; +{Index_gp_relation_node_index}; static int2 DatabaseInfo_GpRelationNodeIndexIndKeyArray[Natts_gp_relation_node_index] = - {IndKey_gp_relation_node_index}; +{IndKey_gp_relation_node_index}; -static Oid DatabaseInfo_GpRelationNodeIndexIndClassArray[Natts_gp_relation_node_index] = - {IndClass_gp_relation_node_index}; +static Oid DatabaseInfo_GpRelationNodeIndexIndClassArray[Natts_gp_relation_node_index] = +{IndClass_gp_relation_node_index}; -DirectOpenIndexDefineStatic(DirectOpen_GpRelationNodeIndex,\ -DatabaseInfo_GpRelationNodeIndexPgClass,\ -DatabaseInfo_GpRelationNodeIndexAttrArray,\ -DatabaseInfo_GpRelationNodeIndexPgAm,\ -DatabaseInfo_GpRelationNodeIndexPgIndex,\ -DatabaseInfo_GpRelationNodeIndexIndKeyArray,\ -DatabaseInfo_GpRelationNodeIndexIndClassArray,\ -false); +DirectOpenIndexDefineStatic(DirectOpen_GpRelationNodeIndex, \ + DatabaseInfo_GpRelationNodeIndexPgClass, \ + DatabaseInfo_GpRelationNodeIndexAttrArray, \ + DatabaseInfo_GpRelationNodeIndexPgAm, \ + DatabaseInfo_GpRelationNodeIndexPgIndex, \ + DatabaseInfo_GpRelationNodeIndexIndKeyArray, \ + DatabaseInfo_GpRelationNodeIndexIndClassArray, \ + 
false); -Relation DirectOpen_Open( - DirectOpen *direct, +Relation +DirectOpen_Open( + DirectOpen *direct, - Oid relationId, + Oid relationId, - Oid tablespace, + Oid tablespace, - Oid database, + Oid database, - Oid relfilenode, + Oid relfilenode, - FormData_pg_class *pgClass, + FormData_pg_class *pgClass, - FormData_pg_attribute *attrArray, + FormData_pg_attribute *attrArray, - FormData_pg_am *pgAm, + FormData_pg_am *pgAm, - FormData_pg_index *pgIndex, + FormData_pg_index *pgIndex, - int2 *indKeyArray, + int2 *indKeyArray, - Oid *indClassArray, + Oid *indClassArray, - bool relHasOid) + bool relHasOid) { - int natts; - int i; + int natts; + int i; Assert(pgClass != NULL); natts = pgClass->relnatts; if (relationId == -1) - relationId = pgClass->relfilenode; // Assume it is ok to use the relfilenode as the relationId in our limited usage. + relationId = pgClass->relfilenode; + /* Assume it is ok to use the relfilenode as the relationId in our limited usage. */ - if (relfilenode == -1) + if (relfilenode == -1) relfilenode = pgClass->relfilenode; if (!direct->isInit) @@ -293,15 +295,15 @@ Relation DirectOpen_Open( if (pgIndex != NULL) { - int pgIndexFixedLen = offsetof(FormData_pg_index, indkey); - int indKeyVectorLen = Int2VectorSize(natts); + int pgIndexFixedLen = offsetof(FormData_pg_index, indkey); + int indKeyVectorLen = Int2VectorSize(natts); int2vector *indKeyVector; oidvector *indClassVector; uint16 amstrategies; uint16 amsupport; - Oid *operator; + Oid *operator; RegProcedure *support; FmgrInfo *supportinfo; @@ -310,21 +312,21 @@ Relation DirectOpen_Open( Assert(indClassArray != NULL); /* - * Allocate Formdata_pg_index with fields through indkey - * where indkey is a variable length int2vector with indKeyArray values. + * Allocate Formdata_pg_index with fields through indkey where + * indkey is a variable length int2vector with indKeyArray values. */ - direct->relationData.rd_index = - (FormData_pg_index*)palloc( - pgIndexFixedLen + indKeyVectorLen); + direct->relationData.rd_index = + (FormData_pg_index *) palloc( + pgIndexFixedLen + indKeyVectorLen); memcpy(direct->relationData.rd_index, pgIndex, pgIndexFixedLen); indKeyVector = buildint2vector( - indKeyArray, - natts); + indKeyArray, + natts); memcpy( - &direct->relationData.rd_index->indkey, - indKeyVector, - indKeyVectorLen); + &direct->relationData.rd_index->indkey, + indKeyVector, + indKeyVectorLen); pfree(indKeyVector); @@ -379,10 +381,11 @@ Relation DirectOpen_Open( * Create oidvector in rd_indclass with values from indClassArray. */ indClassVector = buildoidvector(indClassArray, natts); - + /* * Fill the operator and support procedure OID arrays. (aminfo and - * supportinfo are left as zeroes, and are filled on-the-fly when used) + * supportinfo are left as zeroes, and are filled on-the-fly when + * used) */ IndexSupportInitialize(indClassVector, operator, support, @@ -395,10 +398,10 @@ Relation DirectOpen_Open( */ direct->relationData.rd_indexprs = NIL; direct->relationData.rd_indpred = NIL; - direct->relationData.rd_amcache = NULL; + direct->relationData.rd_amcache = NULL; } - // Not much in terms of contraints. + /* Not much in terms of contraints. */ direct->constrData.has_not_null = true; /* @@ -406,24 +409,24 @@ Relation DirectOpen_Open( */ direct->descData.natts = pgClass->relnatts; - // Make the array of pointers. - direct->descData.attrs = - (Form_pg_attribute*) - MemoryContextAllocZero( - TopMemoryContext, - sizeof(Form_pg_attribute*) * pgClass->relnatts); + /* Make the array of pointers. 
*/ + direct->descData.attrs = + (Form_pg_attribute *) + MemoryContextAllocZero( + TopMemoryContext, + sizeof(Form_pg_attribute *) * pgClass->relnatts); for (i = 0; i < pgClass->relnatts; i++) { - direct->descData.attrs[i] = - (Form_pg_attribute) - MemoryContextAllocZero( - TopMemoryContext, - sizeof(FormData_pg_attribute)); + direct->descData.attrs[i] = + (Form_pg_attribute) + MemoryContextAllocZero( + TopMemoryContext, + sizeof(FormData_pg_attribute)); memcpy(direct->descData.attrs[i], &(attrArray[i]), sizeof(FormData_pg_attribute)); - // Patch up relation id. + /* Patch up relation id. */ direct->descData.attrs[i]->attrelid = relationId; } @@ -445,9 +448,12 @@ Relation DirectOpen_Open( direct->isInit = true; } - - // UNDONE: Should verify for NON-SHARED relations we don't open relations in different databases / or - // UNDONE: open different relations in same database at same time !!! + + /* + * UNDONE: Should verify for NON-SHARED relations we don't open relations + * in different databases or open different relations in same database at + * same time !!! + */ direct->relationData.rd_node.spcNode = tablespace; direct->relationData.rd_node.dbNode = database; direct->relationData.rd_node.relNode = relfilenode; @@ -457,8 +463,8 @@ Relation DirectOpen_Open( for (i = 0; i < direct->relationData.rd_rel->relnatts; i++) { Assert(direct->descData.attrs[i] != NULL); - - // Patch up relation id. + + /* Patch up relation id. */ direct->descData.attrs[i]->attrelid = direct->relationData.rd_id; } @@ -469,12 +475,13 @@ Relation DirectOpen_Open( return &direct->relationData; } -void DirectOpen_Close( - DirectOpen *direct, - Relation rel) +void +DirectOpen_Close( + DirectOpen *direct, + Relation rel) { Assert(rel == &direct->relationData); - Assert (direct->isInit); + Assert(direct->isInit); RelationCloseSmgr(&direct->relationData); diff --git a/src/backend/cdb/cdbdistributedsnapshot.c b/src/backend/cdb/cdbdistributedsnapshot.c index 3e1919da3a..adf2882c03 100644 --- a/src/backend/cdb/cdbdistributedsnapshot.c +++ b/src/backend/cdb/cdbdistributedsnapshot.c @@ -51,6 +51,7 @@ bool localXidSatisfiesAnyDistributedSnapshot(TransactionId localXid) { DistributedSnapshotCommitted distributedSnapshotCommitted; + Assert(TransactionIdIsNormal(localXid)); /* @@ -72,9 +73,9 @@ localXidSatisfiesAnyDistributedSnapshot(TransactionId localXid) { distributedSnapshotCommitted = DistributedSnapshotWithLocalMapping_CommittedTest( - &SerializableSnapshot->distribSnapshotWithLocalMapping, - localXid, - true); + &SerializableSnapshot->distribSnapshotWithLocalMapping, + localXid, + true); switch (distributedSnapshotCommitted) { @@ -108,15 +109,15 @@ localXidSatisfiesAnyDistributedSnapshot(TransactionId localXid) * The caller should've checked that the XID is committed (in clog), * otherwise the result of this function is undefined. 
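To make the stated contract concrete, a hypothetical caller might gate the test on clog commit status before consulting the distributed snapshot. The helper below is an illustrative sketch only; the type and function names are taken from the surrounding code, the helper itself is not part of this patch.

#include "postgres.h"
#include "access/transam.h"
#include "cdb/cdbdistributedsnapshot.h"

/*
 * Hypothetical sketch: honor the contract above by checking clog first.
 * Returns true only when the committed xid can be ignored by every
 * distributed snapshot.
 */
static bool
CommittedXidIsIgnorableSketch(DistributedSnapshotWithLocalMapping *dslm,
							  TransactionId localXid)
{
	if (!TransactionIdDidCommit(localXid))
		return false;			/* contract: only committed xids qualify */

	return DistributedSnapshotWithLocalMapping_CommittedTest(dslm,
															 localXid,
											 /* isVacuumCheck */ true) ==
		DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE;
}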
*/ -DistributedSnapshotCommitted +DistributedSnapshotCommitted DistributedSnapshotWithLocalMapping_CommittedTest( - DistributedSnapshotWithLocalMapping *dslm, - TransactionId localXid, - bool isVacuumCheck) + DistributedSnapshotWithLocalMapping *dslm, + TransactionId localXid, + bool isVacuumCheck) { DistributedSnapshot *ds = &dslm->ds; - uint32 i; - DistributedTransactionId distribXid = InvalidDistributedTransactionId; + uint32 i; + DistributedTransactionId distribXid = InvalidDistributedTransactionId; /* * Return early if local xid is not normal as it cannot have distributed @@ -163,8 +164,8 @@ DistributedSnapshotWithLocalMapping_CommittedTest( &distribXid)) { /* - * We cache local-only committed transactions for better - * performance, too. + * We cache local-only committed transactions for better performance, + * too. */ if (distribXid == InvalidDistributedTransactionId) return DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE; @@ -199,7 +200,8 @@ DistributedSnapshotWithLocalMapping_CommittedTest( return DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE; /* - * We have a distributed committed xid that corresponds to the local xid. + * We have a distributed committed xid that corresponds to the + * local xid. */ Assert(distribXid != InvalidDistributedTransactionId); @@ -207,9 +209,9 @@ DistributedSnapshotWithLocalMapping_CommittedTest( * Since we did not find it in our process local cache, add it. */ LocalDistribXactCache_AddCommitted( - localXid, - ds->distribTransactionTimeStamp, - distribXid); + localXid, + ds->distribTransactionTimeStamp, + distribXid); } else { @@ -220,19 +222,20 @@ DistributedSnapshotWithLocalMapping_CommittedTest( */ LocalDistribXactCache_AddCommitted(localXid, ds->distribTransactionTimeStamp, - /* distribXid */ InvalidDistributedTransactionId); + /* distribXid */ InvalidDistributedTransactionId); return DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE; } } Assert(ds->xminAllDistributedSnapshots != InvalidDistributedTransactionId); + /* * If this distributed transaction is older than all the distributed * snapshots, then we can ignore it from now on. */ Assert(ds->xmin >= ds->xminAllDistributedSnapshots); - + if (distribXid < ds->xminAllDistributedSnapshots) return DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE; @@ -247,8 +250,10 @@ DistributedSnapshotWithLocalMapping_CommittedTest( if (distribXid < ds->xmin) return DISTRIBUTEDSNAPSHOT_COMMITTED_VISIBLE; - /* Any xid >= xmax is in-progress, distributed xmax points to the - * committer, so it must be visible, so ">" instead of ">=" */ + /* + * Any xid >= xmax is in-progress, distributed xmax points to the + * committer, so it must be visible, so ">" instead of ">=" + */ if (distribXid > ds->xmax) { elog((Debug_print_snapshot_dtm ? LOG : DEBUG5), @@ -291,10 +296,10 @@ DistributedSnapshotWithLocalMapping_CommittedTest( } /* - * Leverage the fact that ds->inProgressXidArray is sorted in ascending - * order based on distribXid while creating the snapshot in - * createDtxSnapshot. So, can fail fast once known are lower than - * rest of them. + * Leverage the fact that ds->inProgressXidArray is sorted in + * ascending order based on distribXid while creating the snapshot in + * createDtxSnapshot. So, can fail fast once known are lower than rest + * of them. 
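Read top to bottom, the checks above form a single decision ladder. The following hypothetical restatement assumes distribXid is already mapped and committed; the cache handling, vacuum special case, and logging are omitted, and the in-progress result name follows the enum naming seen above.

static DistributedSnapshotCommitted
DistributedVisibilitySketch(DistributedSnapshot *ds,
							DistributedTransactionId distribXid)
{
	uint32		i;

	/* Older than every live distributed snapshot: ignorable forever. */
	if (distribXid < ds->xminAllDistributedSnapshots)
		return DISTRIBUTEDSNAPSHOT_COMMITTED_IGNORE;

	/* Committed before this snapshot's window opened: visible. */
	if (distribXid < ds->xmin)
		return DISTRIBUTEDSNAPSHOT_COMMITTED_VISIBLE;

	/* Beyond xmax: concurrent, so in-progress ('>' per the comment above). */
	if (distribXid > ds->xmax)
		return DISTRIBUTEDSNAPSHOT_COMMITTED_INPROGRESS;

	/* Inside the window: in-progress only if listed; array is sorted. */
	for (i = 0; i < ds->count; i++)
	{
		if (distribXid == ds->inProgressXidArray[i])
			return DISTRIBUTEDSNAPSHOT_COMMITTED_INPROGRESS;
		if (distribXid < ds->inProgressXidArray[i])
			break;				/* ascending order: fail fast */
	}
	return DISTRIBUTEDSNAPSHOT_COMMITTED_VISIBLE;
}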
*/ if (distribXid < ds->inProgressXidArray[i]) break; @@ -319,7 +324,7 @@ DistributedSnapshot_Reset(DistributedSnapshot *distributedSnapshot) distributedSnapshot->xmin = InvalidDistributedTransactionId; distributedSnapshot->xmax = InvalidDistributedTransactionId; distributedSnapshot->count = 0; - + /* maxCount and inProgressXidArray left untouched */ } @@ -329,45 +334,45 @@ DistributedSnapshot_Reset(DistributedSnapshot *distributedSnapshot) */ void DistributedSnapshot_Copy( - DistributedSnapshot *target, - DistributedSnapshot *source) + DistributedSnapshot *target, + DistributedSnapshot *source) { if (source->maxCount <= 0 || - source->count > source->maxCount) - elog(ERROR,"Invalid distributed snapshot (maxCount %d, count %d)", - source->maxCount, source->count); + source->count > source->maxCount) + elog(ERROR, "Invalid distributed snapshot (maxCount %d, count %d)", + source->maxCount, source->count); DistributedSnapshot_Reset(target); elog((Debug_print_full_dtm ? LOG : DEBUG5), "DistributedSnapshot_Copy target maxCount %d, inProgressXidArray %p, and " - "source maxCount %d, count %d, inProgressXidArray %p", + "source maxCount %d, count %d, inProgressXidArray %p", target->maxCount, - target->inProgressXidArray, + target->inProgressXidArray, source->maxCount, source->count, source->inProgressXidArray); /* * If we have allocated space for the in-progress distributed - * transactions, check against that space. Otherwise, - * use the source maxCount as guide in allocating space. + * transactions, check against that space. Otherwise, use the source + * maxCount as guide in allocating space. */ if (target->maxCount > 0) { Assert(target->inProgressXidArray != NULL); - - if(source->count > target->maxCount) - elog(ERROR,"Too many distributed transactions for snapshot (maxCount %d, count %d)", - target->maxCount, source->count); + + if (source->count > target->maxCount) + elog(ERROR, "Too many distributed transactions for snapshot (maxCount %d, count %d)", + target->maxCount, source->count); } else { Assert(target->inProgressXidArray == NULL); - - target->inProgressXidArray = - (DistributedTransactionId*) - malloc(source->maxCount * sizeof(DistributedTransactionId)); + + target->inProgressXidArray = + (DistributedTransactionId *) + malloc(source->maxCount * sizeof(DistributedTransactionId)); if (target->inProgressXidArray == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -384,9 +389,9 @@ DistributedSnapshot_Copy( target->count = source->count; memcpy( - target->inProgressXidArray, - source->inProgressXidArray, - source->count * sizeof(DistributedTransactionId)); + target->inProgressXidArray, + source->inProgressXidArray, + source->count * sizeof(DistributedTransactionId)); } int @@ -394,18 +399,18 @@ DistributedSnapshot_SerializeSize(DistributedSnapshot *ds) { return sizeof(DistributedTransactionTimeStamp) + sizeof(DistributedSnapshotId) + - /*xminAllDistributedSnapshots, xmin, xmax */ + /* xminAllDistributedSnapshots, xmin, xmax */ 3 * sizeof(DistributedTransactionId) + - /* count, maxCount */ + /* count, maxCount */ 2 * sizeof(int32) + - /* Size of inProgressXidArray */ + /* Size of inProgressXidArray */ sizeof(DistributedTransactionId) * ds->count; } int DistributedSnapshot_Serialize(DistributedSnapshot *ds, char *buf) { - char *p = buf; + char *p = buf; memcpy(p, &ds->distribTransactionTimeStamp, sizeof(DistributedTransactionTimeStamp)); p += sizeof(DistributedTransactionTimeStamp); @@ -422,8 +427,8 @@ DistributedSnapshot_Serialize(DistributedSnapshot *ds, char *buf) memcpy(p, 
&ds->maxCount, sizeof(int32)); p += sizeof(int32); - memcpy(p, ds->inProgressXidArray, sizeof(DistributedTransactionId)*ds->count); - p += sizeof(DistributedTransactionId)*ds->count; + memcpy(p, ds->inProgressXidArray, sizeof(DistributedTransactionId) * ds->count); + p += sizeof(DistributedTransactionId) * ds->count; Assert((p - buf) == DistributedSnapshot_SerializeSize(ds)); @@ -434,7 +439,7 @@ int DistributedSnapshot_Deserialize(const char *buf, DistributedSnapshot *ds) { const char *p = buf; - int32 maxCount; + int32 maxCount; memcpy(&ds->distribTransactionTimeStamp, p, sizeof(DistributedTransactionTimeStamp)); p += sizeof(DistributedTransactionTimeStamp); @@ -462,8 +467,8 @@ DistributedSnapshot_Deserialize(const char *buf, DistributedSnapshot *ds) /* * If we have allocated space for the in-progress distributed - * transactions, check against that space. Otherwise, - * use the received maxCount as guide in allocating space. + * transactions, check against that space. Otherwise, use the received + * maxCount as guide in allocating space. */ if (ds->inProgressXidArray != NULL) { @@ -491,7 +496,7 @@ DistributedSnapshot_Deserialize(const char *buf, DistributedSnapshot *ds) ds->maxCount = maxCount; } - ds->inProgressXidArray = (DistributedTransactionId *)malloc(maxCount * sizeof(DistributedTransactionId)); + ds->inProgressXidArray = (DistributedTransactionId *) malloc(maxCount * sizeof(DistributedTransactionId)); if (ds->inProgressXidArray == NULL) { ereport(ERROR, @@ -503,7 +508,8 @@ DistributedSnapshot_Deserialize(const char *buf, DistributedSnapshot *ds) if (ds->count > 0) { - int xipsize; + int xipsize; + Assert(ds->inProgressXidArray != NULL); xipsize = sizeof(DistributedTransactionId) * ds->count; diff --git a/src/backend/cdb/cdbdistributedxacts.c b/src/backend/cdb/cdbdistributedxacts.c index 0677ac520d..eda0c81c07 100644 --- a/src/backend/cdb/cdbdistributedxacts.c +++ b/src/backend/cdb/cdbdistributedxacts.c @@ -37,8 +37,7 @@ gp_distributed_xacts__(PG_FUNCTION_ARGS) funcctx = SRF_FIRSTCALL_INIT(); /* - * switch to memory context appropriate for multiple function - * calls + * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); @@ -59,8 +58,8 @@ gp_distributed_xacts__(PG_FUNCTION_ARGS) funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* - * Collect all the locking information that we will format and - * send out as a result set. + * Collect all the locking information that we will format and send + * out as a result set. 
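Because DistributedSnapshot_Serialize and DistributedSnapshot_Deserialize above must agree byte-for-byte on field order, a round trip is the natural sanity check. A hypothetical sketch: the target is zeroed so Deserialize takes the allocation path shown above, and the Asserts only spot-check the copy.

static void
DistributedSnapshot_RoundTripSketch(DistributedSnapshot *src)
{
	DistributedSnapshot dst;
	char	   *buf = palloc(DistributedSnapshot_SerializeSize(src));

	MemSet(&dst, 0, sizeof(dst));	/* NULL inProgressXidArray => malloc path */

	DistributedSnapshot_Serialize(src, buf);	/* fixed field order */
	DistributedSnapshot_Deserialize(buf, &dst); /* consumes the same order */

	Assert(dst.count == src->count);
	Assert(dst.xmin == src->xmin && dst.xmax == src->xmax);

	free(dst.inProgressXidArray);	/* Deserialize malloc'd it */
	pfree(buf);
}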
*/ getAllDistributedXactStatus(&allDistributedXactStatus); funcctx->user_fctx = (void *) allDistributedXactStatus; @@ -74,14 +73,14 @@ gp_distributed_xacts__(PG_FUNCTION_ARGS) while (true) { TMGXACTSTATUS *distributedXactStatus; - + Datum values[6]; bool nulls[6]; HeapTuple tuple; Datum result; if (!getNextDistributedXactStatus(allDistributedXactStatus, - &distributedXactStatus)) + &distributedXactStatus)) break; /* @@ -104,4 +103,3 @@ gp_distributed_xacts__(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } - diff --git a/src/backend/cdb/cdbdistributedxid.c b/src/backend/cdb/cdbdistributedxid.c index d25095903f..b62347f58c 100644 --- a/src/backend/cdb/cdbdistributedxid.c +++ b/src/backend/cdb/cdbdistributedxid.c @@ -20,11 +20,10 @@ Datum gp_distributed_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(gp_distributed_xid); Datum -gp_distributed_xid(PG_FUNCTION_ARGS __attribute__((unused)) ) +gp_distributed_xid(PG_FUNCTION_ARGS __attribute__((unused))) { DistributedTransactionId xid = getDistributedTransactionId(); PG_RETURN_XID(xid); } - diff --git a/src/backend/cdb/cdbdoublylinked.c b/src/backend/cdb/cdbdoublylinked.c index 334ef17761..3aada97f1a 100755 --- a/src/backend/cdb/cdbdoublylinked.c +++ b/src/backend/cdb/cdbdoublylinked.c @@ -11,26 +11,26 @@ * *------------------------------------------------------------------------- */ - + #include "postgres.h" #include "cdb/cdbdoublylinked.h" void DoublyLinkedHead_Init( - DoublyLinkedHead *head) + DoublyLinkedHead *head) { head->first = NULL; head->last = NULL; head->count = 0; } -void* +void * DoublyLinkedHead_First( - int offsetToDoubleLinks, - DoublyLinkedHead *head) + int offsetToDoubleLinks, + DoublyLinkedHead *head) { - DoubleLinks *doubleLinks; - + DoubleLinks *doubleLinks; + if (head->first == NULL) { Assert(head->last == NULL); @@ -43,16 +43,16 @@ DoublyLinkedHead_First( doubleLinks = head->first; Assert(doubleLinks->prev == NULL); - return ((uint8*)doubleLinks) - offsetToDoubleLinks; + return ((uint8 *) doubleLinks) - offsetToDoubleLinks; } -void* +void * DoublyLinkedHead_Last( - int offsetToDoubleLinks, - DoublyLinkedHead *head) + int offsetToDoubleLinks, + DoublyLinkedHead *head) { - DoubleLinks *doubleLinks; - + DoubleLinks *doubleLinks; + if (head->last == NULL) { Assert(head->first == NULL); @@ -65,18 +65,18 @@ DoublyLinkedHead_Last( doubleLinks = head->last; Assert(doubleLinks->next == NULL); - return ((uint8*)doubleLinks) - offsetToDoubleLinks; + return ((uint8 *) doubleLinks) - offsetToDoubleLinks; } -void* +void * DoublyLinkedHead_Next( - int offsetToDoubleLinks, - DoublyLinkedHead *head, - void *ele) + int offsetToDoubleLinks, + DoublyLinkedHead *head, + void *ele) { - DoubleLinks *doubleLinks; + DoubleLinks *doubleLinks; - doubleLinks = (DoubleLinks*)(((uint8*)ele) + offsetToDoubleLinks); + doubleLinks = (DoubleLinks *) (((uint8 *) ele) + offsetToDoubleLinks); if (head->last == doubleLinks) { @@ -97,19 +97,19 @@ DoublyLinkedHead_Next( doubleLinks = doubleLinks->next; Assert(doubleLinks != NULL); - - return ((uint8*)doubleLinks) - offsetToDoubleLinks; + + return ((uint8 *) doubleLinks) - offsetToDoubleLinks; } void DoublyLinkedHead_AddFirst( - int offsetToDoubleLinks, - DoublyLinkedHead *head, - void *ele) + int offsetToDoubleLinks, + DoublyLinkedHead *head, + void *ele) { - DoubleLinks *doubleLinks; + DoubleLinks *doubleLinks; - doubleLinks = (DoubleLinks*)(((uint8*)ele) + offsetToDoubleLinks); + doubleLinks = (DoubleLinks *) (((uint8 *) ele) + offsetToDoubleLinks); Assert(doubleLinks->prev == NULL); Assert(doubleLinks->next == NULL); 
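The helpers above implement an intrusive list: each element embeds a DoubleLinks node, and every call carries that node's byte offset, which is how the void * results are mapped back to the containing struct. A hypothetical element type makes the offsetof convention concrete (ResyncItem and the driver function are illustrative, not from this patch):

#include "postgres.h"
#include "cdb/cdbdoublylinked.h"

typedef struct ResyncItem				/* hypothetical element type */
{
	int			payload;
	DoubleLinks links;					/* embedded list node */
} ResyncItem;

static void
ResyncItemListSketch(DoublyLinkedHead *head, ResyncItem *item)
{
	ResyncItem *first;

	DoublyLinkedHead_Init(head);
	DoubleLinks_Init(&item->links);

	DoublyLinkedHead_AddFirst(offsetof(ResyncItem, links), head, item);

	first = (ResyncItem *)
		DoublyLinkedHead_First(offsetof(ResyncItem, links), head);
	Assert(first == item);

	DoubleLinks_Remove(offsetof(ResyncItem, links), head, item);
}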
@@ -131,16 +131,16 @@ DoublyLinkedHead_AddFirst( } head->count++; - + } -void* +void * DoublyLinkedHead_RemoveLast( - int offsetToDoubleLinks, - DoublyLinkedHead *head) + int offsetToDoubleLinks, + DoublyLinkedHead *head) { - DoubleLinks *doubleLinks; - + DoubleLinks *doubleLinks; + if (head->last == NULL) { Assert(head->first == NULL); @@ -166,12 +166,12 @@ DoublyLinkedHead_RemoveLast( head->count--; Assert(head->count >= 0); - return ((uint8*)doubleLinks) - offsetToDoubleLinks; + return ((uint8 *) doubleLinks) - offsetToDoubleLinks; } void DoubleLinks_Init( - DoubleLinks *doubleLinks) + DoubleLinks *doubleLinks) { doubleLinks->next = NULL; doubleLinks->prev = NULL; @@ -179,13 +179,13 @@ DoubleLinks_Init( void DoubleLinks_Remove( - int offsetToDoubleLinks, - DoublyLinkedHead *head, - void *ele) + int offsetToDoubleLinks, + DoublyLinkedHead *head, + void *ele) { - DoubleLinks *removeDoubleLinks; + DoubleLinks *removeDoubleLinks; - removeDoubleLinks = (DoubleLinks*)(((uint8*)ele) + offsetToDoubleLinks); + removeDoubleLinks = (DoubleLinks *) (((uint8 *) ele) + offsetToDoubleLinks); if (removeDoubleLinks->prev == NULL && removeDoubleLinks->next == NULL) @@ -204,22 +204,22 @@ DoubleLinks_Remove( * Removing the first element. */ Assert(head->first == removeDoubleLinks); - + Assert(removeDoubleLinks->next->prev == removeDoubleLinks); removeDoubleLinks->next->prev = NULL; - + head->first = removeDoubleLinks->next; } else if (removeDoubleLinks->next == NULL) { Assert(head->last == removeDoubleLinks); - + /* * Removing the last element. */ Assert(removeDoubleLinks->prev->next == removeDoubleLinks); removeDoubleLinks->prev->next = NULL; - + head->last = removeDoubleLinks->prev; } else diff --git a/src/backend/cdb/cdbfilerepresyncmanager.c b/src/backend/cdb/cdbfilerepresyncmanager.c index 862b16cf27..a853be009f 100644 --- a/src/backend/cdb/cdbfilerepresyncmanager.c +++ b/src/backend/cdb/cdbfilerepresyncmanager.c @@ -16,16 +16,16 @@ * INVARIANCES * * *) Change Tracking - * *) Change Tracking is turned ON when mirror segment is down + * *) Change Tracking is turned ON when mirror segment is down * (gp_segment_configuration is in CHANGE TRACKING after failover) * *) In the transition to Change Tracking recovery from the latest checkpoint is done - * *) Change tracking keeps track of changes to Heap Data Store + * *) Change tracking keeps track of changes to Heap Data Store * (RelFileNode, Block#, LSN, Persistent TID, Persistent Serial Number) - * *) Change Tracking is turned OFF (disabled) during resynchronization transition - * while primary segment is suspended - * *) That means that new changes are not tracked and + * *) Change Tracking is turned OFF (disabled) during resynchronization transition + * while primary segment is suspended + * *) That means that new changes are not tracked and * Change tracking log files are NOT deleted - * *) Change Tracking log files are deleted when primary and mirror segment + * *) Change Tracking log files are deleted when primary and mirror segment * transitions to InSYNC state * * *) Resynchronization (full copy, incremental file copy, incremental block copy) @@ -35,12 +35,12 @@ * *) Resync to mirror all file changes up to “last resync LSN”. 
* Three options available: * *) Full Copy - * For each segment file in a relation for each relation in a + * For each segment file in a relation for each relation in a * segment repeat the following steps: * *) Copy all blocks in segment file that has LSN < “last resync LSN” * *) Blocks that has LSN =>”last resync LSN” will be mirrored * *) After copy of each segment file take “resync CKPT” - * (persistently store the latest LSN, it identifies the start + * (persistently store the latest LSN, it identifies the start * resync point in case of restarting resync due to failures) * *) Incremental File Copy * For each segment file in a relation that was modified during change tracking @@ -54,32 +54,32 @@ * *) Blocks that has LSN =>”last resync LSN” will be mirrored * *) After copy of each segment file take “resync CKPT” * (persistently store the latest LSN) - * *) Crash Recovery of primary segment during resynchronization will + * *) Crash Recovery of primary segment during resynchronization will * restart copy from “resync CKPT” - * *) Mirror down during resynchronization will turn Change tracking ON and + * *) Mirror down during resynchronization will turn Change tracking ON and * turn Resynchronization OFF * *) When Mirror comes up and resynchronization is restarted * the resynchronization will be restarted from the “resync CKPT” - * *) Changes to be resynchronized are cumulative when Change Tracking is + * *) Changes to be resynchronized are cumulative when Change Tracking is * turned on multiple times * *) Catch up of pg_twophase, pg_xlog, pg_control, ... before transition to InSYNC mode - * + * * *) Ongoing IO to primary segment - * *) When resynchronization transition from its recovery to ready state + * *) When resynchronization transition from its recovery to ready state * changes to primary segment are mirrored (last resync LSN + 1) - * *) If primary segment crashes during resynchronization then during - * recovery any change applied during WAL replay on primary is applied + * *) If primary segment crashes during resynchronization then during + * recovery any change applied during WAL replay on primary is applied * also to mirror - * *) If mirror segment goes down then Change Tracking is turned ON on the + * *) If mirror segment goes down then Change Tracking is turned ON on the * primary. No disruption to IO on primary segment. * *) During resynchronization WAL and flat files are excluded from mirroring. 
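All three copy options above hinge on one per-block predicate: the resync scan copies a block only when its LSN predates the “last resync LSN”; changes at or past that point are mirrored in-line. A hypothetical restatement of that rule (XLByteLT is the standard XLogRecPtr comparison macro; the helper is illustrative):

#include "postgres.h"
#include "access/xlogdefs.h"

/* Hypothetical illustration of the copy rule above. */
static bool
BlockNeedsResyncCopy(XLogRecPtr pageLSN, XLogRecPtr lastResyncLSN)
{
	/*
	 * LSN <  last resync LSN: changed before mirroring resumed; copy.
	 * LSN >= last resync LSN: the change is mirrored in-line; skip.
	 */
	return XLByteLT(pageLSN, lastResyncLSN);
}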
*------------------------------------------------------------------------------ */ /*------------------------------------------------------------------------------ - * The following steps are performed during dataState == DataStateResynchronizing + * The following steps are performed during dataState == DataStateResynchronizing * *) FULL Copy - * *) Transition Segment State + * *) Transition Segment State * *) mark all relations re-create needed * *) mark all relations full copy request * @@ -90,7 +90,7 @@ * *) Transition Segment State * * *) Ready Segment State - * *) Scan Change Tracking to get all relations to be marked + * *) Scan Change Tracking to get all relations to be marked * in persistent file system object *------------------------------------------------------------------------------ */ @@ -98,52 +98,52 @@ /*------------------------------------------------------------------------------ Transition from Change Tracking to InResync on primary segment ============================================================== - 1) Shutdown FileRep Main process + 1) Shutdown FileRep Main process 2) postmaster set - *) dataState = DataStateInResync + *) dataState = DataStateInResync *) segmentState = SegmentStateInResyncTransition - 3) Start FileRep Main process + 3) Start FileRep Main process 4) Start 1 FileRep backend process as a Resync Manager *) FileRepMain -> FileRepPrimary_StartResyncManager() 5) Start N FileRep backend processes as a Resync Worker ( N should be GUC, Min=1, Max=8, Default =1 (Default will be increased later)) *) FileRepMain -> FileRepPrimary_StartResyncWorker() - 6) FileRep Resync backend processes exit when resynchronization is completed + 6) FileRep Resync backend processes exit when resynchronization is completed ResyncManager set - *) dataState == DataStateInSync + *) dataState == DataStateInSync *) segmentState = SegmentStateReady - + Crash Recovery when dataState = DataStateInResync on primary segment ==================================================================== - 1) postmaster sets - *) dataState = DataStateInResync + 1) postmaster sets + *) dataState = DataStateInResync *) segmentState = SegmentStateInitialization - 2) Start FileRep Main process + 2) Start FileRep Main process 3) Start 1 FileRep backend process as a Resync Manager *) FileRepMain() -> FileRepPrimary_StartResyncManager() 5) Start N FileRep backend processes as a Resync Worker ( N should be GUC, Min=1, Max=8, Default =1 (Default will be increased later)) *) FileRepMain() -> FileRepPrimary_StartResyncWorker() 6) Resync Manager process checks the state of resync transition If resync transition has not been completed yet then - a) Resync Manager sets - *) dataState = DataStateInChangeTracking + a) Resync Manager sets + *) dataState = DataStateInChangeTracking *) segmentState = SegmentStateReady or SegmentStateChangeTrackingDisabled - b) postmaster run XLOG recovery - c) postmaster sets - *) dataState = DataStateInResync + b) postmaster run XLOG recovery + c) postmaster sets + *) dataState = DataStateInResync *) segmentState = SegmentStateInResyncTransition else - a) Resync Manager sets - *) dataState = DataStateInResync + a) Resync Manager sets + *) dataState = DataStateInResync *) segmentState = SegmentStateReady - b) postmaster run XLOG recovery - *) dataState = DataStateInResync + b) postmaster run XLOG recovery + *) dataState = DataStateInResync *) segmentState = SegmentStateReady - 6) FileRep Resync backend processes exit when resynchronization is completed + 6) FileRep Resync backend 
processes exit when resynchronization is completed ResyncManager set - *) dataState == DataStateInSync + *) dataState == DataStateInSync *) segmentState = SegmentStateReady ------------------------------------------------------------------------------ -*/ +*/ #include "postgres.h" @@ -168,48 +168,55 @@ #include "utils/faultinjector.h" -typedef struct FileRepResyncShmem_s { - - volatile int appendOnlyCommitCount; - /* This counter is protected by FileRepAppendOnlyCommitCountLock */ - - bool reMirrorAllowed; - /* - * This flag is used just to enable FILEREP proces to perform some operation - * (like physical file drops/creates) on Mirror while InResyncTranstion phase. - * The flag set/unset has no use/impact for non-filerep processes. - */ - - int64 blocksSynchronized; - /* required to report number of blocks resynchronized */ - - int64 totalBlocksToSynchronize; - /* required to report total number of blocks to be resynchronized */ - - struct timeval startResyncTime; - /* resynchronization start time, required to report estimate time for resync to complete */ - - slock_t lock; - - XLogRecPtr endFullResyncLSN; - - XLogRecPtr endIncrResyncLSN; - - int writeCount; - /* number of relations waiting to be resynchronized */ +typedef struct FileRepResyncShmem_s +{ + + volatile int appendOnlyCommitCount; + /* This counter is protected by FileRepAppendOnlyCommitCountLock */ + + bool reMirrorAllowed; + + /* + * This flag is used just to enable FILEREP proces to perform some + * operation (like physical file drops/creates) on Mirror while + * InResyncTranstion phase. The flag set/unset has no use/impact for + * non-filerep processes. + */ + + int64 blocksSynchronized; + /* required to report number of blocks resynchronized */ + + int64 totalBlocksToSynchronize; + /* required to report total number of blocks to be resynchronized */ + + struct timeval startResyncTime; + + /* + * resynchronization start time, required to report estimate time for + * resync to complete + */ + + slock_t lock; + + XLogRecPtr endFullResyncLSN; + + XLogRecPtr endIncrResyncLSN; + + int writeCount; + /* number of relations waiting to be resynchronized */ int curFsobjCount; - /* Current number of file system objects to create/drop during resync*/ + /* Current number of file system objects to create/drop during resync */ int totalFsobjCount; - /* Total number of file system objects to create/drop during resync*/ - - int resyncInProgressCount; - /* number of relations in resynchronization */ - - int resyncCompletedCount; - - HTAB *fileRepResyncHash; + /* Total number of file system objects to create/drop during resync */ + + int resyncInProgressCount; + /* number of relations in resynchronization */ + + int resyncCompletedCount; + + HTAB *fileRepResyncHash; /* List of relations to be resynced */ } FileRepResyncShmem_s; @@ -218,31 +225,31 @@ static volatile FileRepResyncShmem_s *fileRepResyncShmem = NULL; static void FileRepResync_ShmemReInit(void); -static int FileRepResyncManager_InResyncTransition(void); -static int FileRepResyncManager_InSyncTransition(void); -static int FileRepPrimary_RunResyncManager(void); +static int FileRepResyncManager_InResyncTransition(void); +static int FileRepResyncManager_InSyncTransition(void); +static int FileRepPrimary_RunResyncManager(void); static void FileRepResync_LockAcquire(void); static void FileRepResync_LockRelease(void); static int FileRepResync_InsertEntry( - FileRepResyncHashEntry_s* entry); -static FileRepResyncHashEntry_s* FileRepResync_LookupEntry( - FileName fileName); + 
FileRepResyncHashEntry_s *entry); +static FileRepResyncHashEntry_s *FileRepResync_LookupEntry( + FileName fileName); static bool FileRepResync_RemoveEntry( - FileName fileName); + FileName fileName); -static int FileRepResync_CheckProgress(void); +static int FileRepResync_CheckProgress(void); static void -FileRepResync_LockAcquire(void) -{ +FileRepResync_LockAcquire(void) +{ SpinLockAcquire(&fileRepResyncShmem->lock); } static void -FileRepResync_LockRelease(void) -{ +FileRepResync_LockRelease(void) +{ SpinLockRelease(&fileRepResyncShmem->lock); } @@ -260,20 +267,20 @@ int FileRepResync_GetCurFsobjCount(void) { return (fileRepResyncShmem != NULL) ? - fileRepResyncShmem->curFsobjCount : -1; + fileRepResyncShmem->curFsobjCount : -1; } int FileRepResync_GetTotalFsobjCount(void) { return (fileRepResyncShmem != NULL) ? - fileRepResyncShmem->totalFsobjCount : -1; + fileRepResyncShmem->totalFsobjCount : -1; } int FileRepResync_IncAppendOnlyCommitCount(void) { - // This counter is protected by FileRepAppendOnlyCommitCountLock; + /* This counter is protected by FileRepAppendOnlyCommitCountLock; */ if (fileRepResyncShmem != NULL) { return ++(fileRepResyncShmem->appendOnlyCommitCount); @@ -285,13 +292,13 @@ FileRepResync_IncAppendOnlyCommitCount(void) } int -FileRepResync_DecAppendOnlyCommitCount(int count) +FileRepResync_DecAppendOnlyCommitCount(int count) { - // This counter is protected by FileRepAppendOnlyCommitCountLock; + /* This counter is protected by FileRepAppendOnlyCommitCountLock; */ if (fileRepResyncShmem != NULL) - { + { fileRepResyncShmem->appendOnlyCommitCount -= count; - + Assert(fileRepResyncShmem->appendOnlyCommitCount >= 0); return fileRepResyncShmem->appendOnlyCommitCount; @@ -305,7 +312,7 @@ FileRepResync_DecAppendOnlyCommitCount(int count) int FileRepResync_GetAppendOnlyCommitCount(void) { - // This counter is protected by FileRepAppendOnlyCommitCountLock; + /* This counter is protected by FileRepAppendOnlyCommitCountLock; */ if (fileRepResyncShmem != NULL) { return fileRepResyncShmem->appendOnlyCommitCount; @@ -331,10 +338,10 @@ FileRepResync_GetEndFullResyncLSN(void) void FileRepResyncManager_SetEndResyncLSN(XLogRecPtr endResyncLSN) { - char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN]; - - Assert(! (endResyncLSN.xlogid == 0 && endResyncLSN.xrecoff == 0)); - + char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN]; + + Assert(!(endResyncLSN.xlogid == 0 && endResyncLSN.xrecoff == 0)); + if (isFullResync()) { fileRepResyncShmem->endFullResyncLSN.xlogid = endResyncLSN.xlogid; @@ -343,16 +350,16 @@ FileRepResyncManager_SetEndResyncLSN(XLogRecPtr endResyncLSN) else { fileRepResyncShmem->endIncrResyncLSN.xlogid = endResyncLSN.xlogid; - fileRepResyncShmem->endIncrResyncLSN.xrecoff = endResyncLSN.xrecoff; + fileRepResyncShmem->endIncrResyncLSN.xrecoff = endResyncLSN.xrecoff; } - - + + snprintf(tmpBuf, sizeof(tmpBuf), "full resync '%s' resync lsn '%s(%u/%u)' ", (isFullResync() == TRUE) ? "true" : "false", XLogLocationToString(&endResyncLSN), endResyncLSN.xlogid, endResyncLSN.xrecoff); - + FileRep_InsertConfigLogEntry(tmpBuf); } @@ -371,23 +378,23 @@ FileRepResync_SetReMirrorAllowed(void) bool FileRepResync_IsReMirrorAllowed(void) { - return((fileRepResyncShmem != NULL)? - (fileRepResyncShmem->reMirrorAllowed == TRUE && - fileRepProcessType == FileRepProcessTypeResyncManager):false); + return ((fileRepResyncShmem != NULL) ? 
+ (fileRepResyncShmem->reMirrorAllowed == TRUE && + fileRepProcessType == FileRepProcessTypeResyncManager) : false); } int64 FileRepResync_GetBlocksSynchronized(void) { - return((fileRepResyncShmem != NULL)? - (fileRepResyncShmem->blocksSynchronized) : 0); + return ((fileRepResyncShmem != NULL) ? + (fileRepResyncShmem->blocksSynchronized) : 0); } int64 FileRepResync_GetTotalBlocksToSynchronize(void) { - return((fileRepResyncShmem != NULL)? - (fileRepResyncShmem->totalBlocksToSynchronize) : 0); + return ((fileRepResyncShmem != NULL) ? + (fileRepResyncShmem->totalBlocksToSynchronize) : 0); } void @@ -405,15 +412,18 @@ FileRepResync_AddToTotalBlocksToSynchronize(int64 moreBlocksToSynchronize) struct timeval FileRepResync_GetEstimateResyncCompletionTime(void) { - char temp[128]; - pg_time_t tt; - struct timeval currentResyncTime; - struct timeval estimateResyncCompletionTime = {0, 0}; - - /* pull values out of shared memory into local variables so we have consistent values for calculation here */ - int64 totalBlocksToSynchronize = fileRepResyncShmem == NULL ? 0L : fileRepResyncShmem->totalBlocksToSynchronize; - int64 blocksSynchronized = fileRepResyncShmem == NULL ? 0L : fileRepResyncShmem->blocksSynchronized; - + char temp[128]; + pg_time_t tt; + struct timeval currentResyncTime; + struct timeval estimateResyncCompletionTime = {0, 0}; + + /* + * pull values out of shared memory into local variables so we have + * consistent values for calculation here + */ + int64 totalBlocksToSynchronize = fileRepResyncShmem == NULL ? 0L : fileRepResyncShmem->totalBlocksToSynchronize; + int64 blocksSynchronized = fileRepResyncShmem == NULL ? 0L : fileRepResyncShmem->blocksSynchronized; + if (totalBlocksToSynchronize == 0L || blocksSynchronized == 0L) { return estimateResyncCompletionTime; @@ -421,12 +431,12 @@ FileRepResync_GetEstimateResyncCompletionTime(void) struct timeval startResyncTime = fileRepResyncShmem->startResyncTime; gettimeofday(¤tResyncTime, NULL); - + if (totalBlocksToSynchronize > blocksSynchronized) { - estimateResyncCompletionTime.tv_sec = + estimateResyncCompletionTime.tv_sec = (((currentResyncTime.tv_sec - startResyncTime.tv_sec) * - (totalBlocksToSynchronize - blocksSynchronized)) / + (totalBlocksToSynchronize - blocksSynchronized)) / blocksSynchronized) + currentResyncTime.tv_sec; } else @@ -434,94 +444,95 @@ FileRepResync_GetEstimateResyncCompletionTime(void) estimateResyncCompletionTime.tv_sec = 0; estimateResyncCompletionTime.tv_usec = 0; } - + if (Debug_filerep_print) { tt = (pg_time_t) estimateResyncCompletionTime.tv_sec; pg_strftime(temp, sizeof(temp), "%a %b %d %H:%M:%S.%%06d %Y %Z", pg_localtime(&tt, session_timezone)); - - elog(LOG, + + elog(LOG, "resynchronization info: " - "total blocks to synchronize " INT64_FORMAT " " - "blocks synchronized " INT64_FORMAT " " + "total blocks to synchronize " INT64_FORMAT " " + "blocks synchronized " INT64_FORMAT " " "estimate resync completion time '%s' ", totalBlocksToSynchronize, blocksSynchronized, temp); } - + return estimateResyncCompletionTime; } /**************************************************************** - * FILEREP_RESYNC SHARED MEMORY + * FILEREP_RESYNC SHARED MEMORY ****************************************************************/ Size FileRepResync_ShmemSize(void) { - Size size; - + Size size; + size = hash_estimate_size( - (Size)FILEREP_MAX_RESYNC_FILES, + (Size) FILEREP_MAX_RESYNC_FILES, sizeof(FileRepResyncHashEntry_s)); - + size = add_size(size, sizeof(FileRepResyncShmem_s)); - - return size; + + return size; } /* - * 
Hash table contains + * Hash table contains * FileName(identifier) is the key in the hash table. - * Hash table in shared memory is initialized only on primary segment. + * Hash table in shared memory is initialized only on primary segment. * It is not initialized on mirror and master host. */ void FileRepResync_ShmemInit(void) { - HASHCTL hash_ctl; - bool foundPtr; - struct timeval resyncTime; + HASHCTL hash_ctl; + bool foundPtr; + struct timeval resyncTime; - - fileRepResyncShmem = (FileRepResyncShmem_s *) ShmemInitStruct("filerep resync", + + fileRepResyncShmem = (FileRepResyncShmem_s *) ShmemInitStruct("filerep resync", sizeof(FileRepResyncShmem_s), &foundPtr); - - if (fileRepResyncShmem == NULL) { + + if (fileRepResyncShmem == NULL) + { ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - (errmsg("not enough shared memory to run resynchronization")))); - } - - if (! foundPtr) + (errcode(ERRCODE_OUT_OF_MEMORY), + (errmsg("not enough shared memory to run resynchronization")))); + } + + if (!foundPtr) { MemSet(fileRepResyncShmem, 0, sizeof(FileRepResyncShmem_s)); } - + fileRepResyncShmem->appendOnlyCommitCount = 0; - - fileRepResyncShmem->reMirrorAllowed = FALSE; - + + fileRepResyncShmem->reMirrorAllowed = FALSE; + fileRepResyncShmem->totalBlocksToSynchronize = 0; - + fileRepResyncShmem->blocksSynchronized = 0; - + gettimeofday(&resyncTime, NULL); - + fileRepResyncShmem->startResyncTime.tv_sec = resyncTime.tv_sec; fileRepResyncShmem->startResyncTime.tv_usec = resyncTime.tv_usec; - + SpinLockInit(&fileRepResyncShmem->lock); - + fileRepResyncShmem->endFullResyncLSN.xlogid = 0; fileRepResyncShmem->endFullResyncLSN.xrecoff = 0; - + fileRepResyncShmem->endIncrResyncLSN.xlogid = 0; fileRepResyncShmem->endIncrResyncLSN.xrecoff = 0; - + fileRepResyncShmem->writeCount = 0; fileRepResyncShmem->curFsobjCount = 0; @@ -529,26 +540,26 @@ FileRepResync_ShmemInit(void) fileRepResyncShmem->resyncInProgressCount = 0; fileRepResyncShmem->resyncCompletedCount = 0; - + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = MAXPGPATH; hash_ctl.entrysize = sizeof(FileRepResyncHashEntry_s); hash_ctl.hash = string_hash; - + fileRepResyncShmem->fileRepResyncHash = ShmemInitHash("filerep resync hash", FILEREP_MAX_RESYNC_FILES, FILEREP_MAX_RESYNC_FILES, &hash_ctl, HASH_ELEM | HASH_FUNCTION); - - if (fileRepResyncShmem->fileRepResyncHash == NULL) + + if (fileRepResyncShmem->fileRepResyncHash == NULL) { - ereport(ERROR, + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), (errmsg("not enough shared memory to run resynchronization")))); } - - return; + + return; } /* @@ -557,39 +568,40 @@ FileRepResync_ShmemInit(void) static void FileRepResync_ShmemReInit(void) { - HASH_SEQ_STATUS hash_status; - FileRepResyncHashEntry_s *entry; - struct timeval resyncTime; + HASH_SEQ_STATUS hash_status; + FileRepResyncHashEntry_s *entry; + struct timeval resyncTime; + - - if (fileRepResyncShmem == NULL) { + if (fileRepResyncShmem == NULL) + { ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - (errmsg("not enough shared memory to run resynchronization")))); - } + (errcode(ERRCODE_OUT_OF_MEMORY), + (errmsg("not enough shared memory to run resynchronization")))); + } /* * NOTE: Do not zero Commit Work Intent count for ReInit. 
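FileRepResync_InsertEntry and FileRepResync_LookupEntry are declared above but defined outside this hunk; for a table created with HASH_ELEM | HASH_FUNCTION and a MAXPGPATH string key, they presumably reduce to the standard dynahash idiom. A hypothetical lookup under that assumption (the caller is assumed to hold the resync spinlock):

static FileRepResyncHashEntry_s *
FileRepResync_LookupEntrySketch(FileName fileName)
{
	FileRepResyncHashEntry_s *entry;
	bool		found;

	Assert(fileRepResyncShmem->fileRepResyncHash != NULL);

	entry = (FileRepResyncHashEntry_s *)
		hash_search(fileRepResyncShmem->fileRepResyncHash,
					(void *) fileName,	/* MAXPGPATH string key */
					HASH_FIND,
					&found);

	return found ? entry : NULL;
}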
*/ - fileRepResyncShmem->reMirrorAllowed = FALSE; - + fileRepResyncShmem->reMirrorAllowed = FALSE; + fileRepResyncShmem->totalBlocksToSynchronize = 0; - + fileRepResyncShmem->blocksSynchronized = 0; - + gettimeofday(&resyncTime, NULL); - + fileRepResyncShmem->startResyncTime.tv_sec = resyncTime.tv_sec; - fileRepResyncShmem->startResyncTime.tv_usec = resyncTime.tv_usec; - + fileRepResyncShmem->startResyncTime.tv_usec = resyncTime.tv_usec; + SpinLockInit(&fileRepResyncShmem->lock); - + fileRepResyncShmem->endFullResyncLSN.xlogid = 0; fileRepResyncShmem->endFullResyncLSN.xrecoff = 0; - + fileRepResyncShmem->endIncrResyncLSN.xlogid = 0; fileRepResyncShmem->endIncrResyncLSN.xrecoff = 0; - + fileRepResyncShmem->writeCount = 0; fileRepResyncShmem->curFsobjCount = 0; @@ -597,26 +609,27 @@ FileRepResync_ShmemReInit(void) fileRepResyncShmem->resyncInProgressCount = 0; fileRepResyncShmem->resyncCompletedCount = 0; - - if (fileRepResyncShmem->fileRepResyncHash == NULL) + + if (fileRepResyncShmem->fileRepResyncHash == NULL) { - ereport(ERROR, + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), (errmsg("not enough shared memory to run resynchronization")))); } - + FileRepResync_LockAcquire(); - + hash_seq_init(&hash_status, fileRepResyncShmem->fileRepResyncHash); - - while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) { - + + while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) + { + FileRepResync_RemoveEntry(entry->fileName); } - + FileRepResync_LockRelease(); - - return; + + return; } /* @@ -625,112 +638,120 @@ FileRepResync_ShmemReInit(void) void FileRepResync_Cleanup(void) { - HASH_SEQ_STATUS hash_status; - FileRepResyncHashEntry_s *entry; - - if (fileRepResyncShmem == NULL) { + HASH_SEQ_STATUS hash_status; + FileRepResyncHashEntry_s *entry; + + if (fileRepResyncShmem == NULL) + { return; - } - - if (fileRepResyncShmem->fileRepResyncHash == NULL) + } + + if (fileRepResyncShmem->fileRepResyncHash == NULL) { return; } - + FileRepResync_LockAcquire(); - + hash_seq_init(&hash_status, fileRepResyncShmem->fileRepResyncHash); - - while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) { - + + while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) + { + FileRepResync_RemoveEntry(entry->fileName); UnlockRelationForResynchronize( - &entry->relFileNode, - AccessExclusiveLock); + &entry->relFileNode, + AccessExclusiveLock); } - + LockReleaseAll(DEFAULT_LOCKMETHOD, false); FileRepResync_LockRelease(); - - return; + + return; } /**************************************************************** * FILEREP SUB-PROCESS (FileRep Primary RECOVERY Process) ****************************************************************/ /* - * + * * FileRepPrimary_StartResyncManager() * * */ -void +void FileRepPrimary_StartResyncManager(void) -{ - int status = STATUS_OK; - bool isLastLocTracked = FALSE; - +{ + int status = STATUS_OK; + bool isLastLocTracked = FALSE; + FileRep_InsertConfigLogEntry("start resync manager"); Insist(fileRepRole == FileRepPrimaryRole); Insist(dataState == DataStateInResync); - + FileRepResync_ShmemReInit(); - while (1) { - - if (status != STATUS_OK) { + while (1) + { + + if (status != STATUS_OK) + { FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); FileRepSubProcess_SetState(FileRepStateFault); } - + while (FileRepSubProcess_GetState() == FileRepStateFault || - + (fileRepShmemArray[0]->state == FileRepStateNotInitialized && FileRepSubProcess_GetState() != 
FileRepStateShutdownBackends && - FileRepSubProcess_GetState() != FileRepStateShutdown)) { - + FileRepSubProcess_GetState() != FileRepStateShutdown)) + { + FileRepSubProcess_ProcessSignals(); - pg_usleep(50000L); /* 50 ms */ + pg_usleep(50000L); /* 50 ms */ } - + if (FileRepSubProcess_GetState() == FileRepStateShutdown || - FileRepSubProcess_GetState() == FileRepStateShutdownBackends) { - + FileRepSubProcess_GetState() == FileRepStateShutdownBackends) + { + break; } - + if (segmentState == SegmentStateInitialization) { - + if (ChangeTracking_RetrieveLastChangeTrackedLoc()) { isLastLocTracked = TRUE; FileRepSubProcess_SetState(FileRepStateReady); getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); - + while (FileRepSubProcess_GetState() != FileRepStateShutdown && FileRepSubProcess_GetState() != FileRepStateShutdownBackends && - isDatabaseRunning() == FALSE) { - + isDatabaseRunning() == FALSE) + { + FileRepSubProcess_ProcessSignals(); - pg_usleep(50000L); /* 50 ms */ - } - + pg_usleep(50000L); /* 50 ms */ + } + if (FileRepSubProcess_GetState() == FileRepStateShutdown || - FileRepSubProcess_GetState() == FileRepStateShutdownBackends) { - + FileRepSubProcess_GetState() == FileRepStateShutdownBackends) + { + break; - } - + } + } else { - - FileRep_SetDataState(DataStateInChangeTracking, FALSE /* signal postmaster */); - + + FileRep_SetDataState(DataStateInChangeTracking, FALSE /* signal postmaster */ ); + if (isFullResync()) { FileRep_SetSegmentState(SegmentStateChangeTrackingDisabled, FaultTypeNotInitialized); @@ -738,34 +759,36 @@ FileRepPrimary_StartResyncManager(void) else { ChangeTracking_CreateInitialFromPreviousCheckpoint( - /* lastChangeTrackingEndLoc */ NULL); + /* lastChangeTrackingEndLoc */ NULL); FileRepSubProcess_SetState(FileRepStateReady); } - + /* Wait that XLOG replay is done */ while (FileRepSubProcess_GetState() != FileRepStateShutdown && FileRepSubProcess_GetState() != FileRepStateShutdownBackends && - ! (segmentState == SegmentStateInResyncTransition && - dataState == DataStateInResync)) { - + !(segmentState == SegmentStateInResyncTransition && + dataState == DataStateInResync)) + { + FileRepSubProcess_ProcessSignals(); - pg_usleep(50000L); /* 50 ms */ - } + pg_usleep(50000L); /* 50 ms */ + } if (FileRepSubProcess_GetState() == FileRepStateShutdown || - FileRepSubProcess_GetState() == FileRepStateShutdownBackends) { - + FileRepSubProcess_GetState() == FileRepStateShutdownBackends) + { + break; } } - - /* - * Database was started. - * The local copies of ThisTimeLineID and RedoRecPtr has to be initialized. + + /* + * Database was started. The local copies of ThisTimeLineID and + * RedoRecPtr has to be initialized. */ InitXLOGAccess(); } - + Insist(segmentState == SegmentStateInResyncTransition || segmentState == SegmentStateReady); @@ -777,41 +800,41 @@ FileRepPrimary_StartResyncManager(void) } if (status != STATUS_OK || - ! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + !(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { continue; } - + status = FileRepPrimary_RunResyncManager(); if (status != STATUS_OK || - ! 
(FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + !(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { continue; - } - + } + status = FileRepResyncManager_InSyncTransition(); - - if (status != STATUS_OK) + + if (status != STATUS_OK) { continue; } - + ereport(LOG, (errmsg("mirror transition to sync completed, " "primary address(port) '%s(%d)' mirror address(port) '%s(%d)' ", - fileRepPrimaryHostAddress, - fileRepPrimaryPort, - fileRepMirrorHostAddress, - fileRepMirrorPort), - FileRep_errcontext())); + fileRepPrimaryHostAddress, + fileRepPrimaryPort, + fileRepMirrorHostAddress, + fileRepMirrorPort), + FileRep_errcontext())); break; - - } // while(1) - - FileRep_InsertConfigLogEntry("resync manager completed"); + + } //while (1) + + FileRep_InsertConfigLogEntry("resync manager completed"); } @@ -821,37 +844,40 @@ FileRepPrimary_StartResyncManager(void) static int FileRepResyncManager_InResyncTransition(void) { - int status = STATUS_OK; - + int status = STATUS_OK; + FileRep_InsertConfigLogEntry("run resync transition"); if (LWLockHeldByMe(MirroredLock)) - ereport(ERROR, + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), (errmsg("'MirroredLock' is already held by primary resync manager process")))); - + LWLockAcquire(MirroredLock, LW_EXCLUSIVE); - - /* database transitions to suspended state, IO activity on the segment is suspended */ + + /* + * database transitions to suspended state, IO activity on the segment is + * suspended + */ primaryMirrorSetIOSuspended(TRUE); - + FileRep_InsertConfigLogEntry("run resync transition, record last lsn in change tracking"); ChangeTracking_RecordLastChangeTrackedLoc(); - + if (FileRepSubProcess_GetState() == FileRepStateFault) { goto exit; } - + FileRepSubProcess_ProcessSignals(); - if (! ((segmentState == SegmentStateInResyncTransition || - segmentState == SegmentStateReady) && - dataState == DataStateInResync)) + if (!((segmentState == SegmentStateInResyncTransition || + segmentState == SegmentStateReady) && + dataState == DataStateInResync)) { goto exit; - } - + } + if (isFullResync()) { FileRep_InsertConfigLogEntry("run resync transition, mark full copy"); @@ -861,26 +887,27 @@ FileRepResyncManager_InResyncTransition(void) else { /* - * First, mark the special persistent tables and others as 'Scan Incremental'. + * First, mark the special persistent tables and others as 'Scan + * Incremental'. * - * These include OIDs {5090, 5091, 5092, 5093, and 5096} and others - * (see GpPersistent_SkipXLogInfo). + * These include OIDs {5090, 5091, 5092, 5093, and 5096} and others + * (see GpPersistent_SkipXLogInfo). * - * And, mark any Buffer Pool managed relations that were physically truncated as - * as 'Scan incremental' because we don't know how to process old changes in - * the change tracking log. + * And, mark any Buffer Pool managed relations that were physically + * truncated as as 'Scan incremental' because we don't know how to + * process old changes in the change tracking log. */ FileRep_InsertConfigLogEntry("run resync transition, mark scan incremental"); - + PersistentFileSysObj_MarkSpecialScanIncremental(); /* - * Second, now mark any relations that have changes in the change tracking - * log as 'Page Incremental', except those relations marked 'Scan Incremental' in the - * first step. + * Second, now mark any relations that have changes in the change + * tracking log as 'Page Incremental', except those relations marked + * 'Scan Incremental' in the first step. 
*/ FileRep_InsertConfigLogEntry("run resync transition, mark page incremental"); - + PersistentFileSysObj_MarkPageIncrementalFromChangeLog(); if (segmentState == SegmentStateChangeTrackingDisabled) { @@ -895,83 +922,85 @@ FileRepResyncManager_InResyncTransition(void) PersistentFileSysObj_MarkAppendOnlyCatchup(); } - + FileRepSubProcess_ProcessSignals(); - if (! ((segmentState == SegmentStateInResyncTransition || - segmentState == SegmentStateReady) && - dataState == DataStateInResync)) + if (!((segmentState == SegmentStateInResyncTransition || + segmentState == SegmentStateReady) && + dataState == DataStateInResync)) { goto exit; - } + } /* - * It is important to keep order of dropping objects before creating objects since - * it can happen that persistent table has two entries for the same object. - * For e.g Consider that an append only segment file was created (when mirror - * was active) and dropped (after mirror went down) since transaction - * was aborted. Later, if the same Append Only segment file was created and - * transaction was committed (with the Mirror still down). This situation - * can cause to have duplicate entries for the same append only segment file. - * Now during resync phase if the file is created on the mirror first (re-create phase) - * and later dropped (redrop phase for original file) you can loose the file - * created by re-created phase. + * It is important to keep order of dropping objects before creating + * objects since it can happen that persistent table has two entries for + * the same object. For e.g Consider that an append only segment file was + * created (when mirror was active) and dropped (after mirror went down) + * since transaction was aborted. Later, if the same Append Only segment + * file was created and transaction was committed (with the Mirror still + * down). This situation can cause to have duplicate entries for the same + * append only segment file. Now during resync phase if the file is + * created on the mirror first (re-create phase) and later dropped (redrop + * phase for original file) you can loose the file created by re-created + * phase. */ FileRep_InsertConfigLogEntry("run resync manager, mirror redrop"); fileRepResyncShmem->totalFsobjCount = fileRepResyncShmem->curFsobjCount - = PersistentFileSysObj_CountResyncObjects(); + = PersistentFileSysObj_CountResyncObjects(); /* - * Drop the objects on the mirror that were dropped on the primary when the - * mirror was down/inactive by looking at Persistent Tables (all object types). - * But note that the accounting information in Persistent Tables will be - * updated later. This is because at this stage the segment is still in - * resync transition state and not ready. As a result any operations for e.g. write() - * still assume that the mirror is inactive/down and so won't be mirrored. - * Once the segment changes its state to ready, all such operations will be - * correctly mirrored. + * Drop the objects on the mirror that were dropped on the primary when + * the mirror was down/inactive by looking at Persistent Tables (all + * object types). But note that the accounting information in Persistent + * Tables will be updated later. This is because at this stage the segment + * is still in resync transition state and not ready. As a result any + * operations for e.g. write() still assume that the mirror is + * inactive/down and so won't be mirrored. Once the segment changes its + * state to ready, all such operations will be correctly mirrored. 
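Condensed to its call order, the sequence this comment motivates is the one executed below (the state re-checks and fault-injection points between steps are elided here); the ordering itself is the invariant:

/* Skeleton of the redrop-before-recreate ordering, as executed below. */
PersistentFileSysObj_MirrorReDrop();		/* 1. replay drops done while mirror was down */
PersistentFileSysObj_MirrorReCreate();		/* 2. only then re-create surviving objects */
PersistentFileSysObj_MarkMirrorReDropped();	/* 3. flip drop-pending entries to 'free' */
PersistentFileSysObj_MarkMirrorReCreated();	/* 4. record the re-creates */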
*/ PersistentFileSysObj_MirrorReDrop(); FileRep_InsertConfigLogEntry("run resync manager, mirror redrop completed"); FileRepSubProcess_ProcessSignals(); - if (! ((segmentState == SegmentStateInResyncTransition || - segmentState == SegmentStateReady) && - dataState == DataStateInResync)) + if (!((segmentState == SegmentStateInResyncTransition || + segmentState == SegmentStateReady) && + dataState == DataStateInResync)) { goto exit; - } + } SIMPLE_FAULT_INJECTOR(FileRepTransitionToInResyncMirrorReCreate); - - FileRep_InsertConfigLogEntry("run resync transition, mirror recreate"); - - PersistentFileSysObj_MirrorReCreate(); - + + FileRep_InsertConfigLogEntry("run resync transition, mirror recreate"); + + PersistentFileSysObj_MirrorReCreate(); + FileRepSubProcess_ProcessSignals(); - if (! ((segmentState == SegmentStateInResyncTransition || - segmentState == SegmentStateReady) && - dataState == DataStateInResync)) + if (!((segmentState == SegmentStateInResyncTransition || + segmentState == SegmentStateReady) && + dataState == DataStateInResync)) { goto exit; - } + } FileRepSubProcess_SetState(FileRepStateReady); - + SIMPLE_FAULT_INJECTOR(FileRepTransitionToInResyncMarkReCreated); /* - * Mark Persistent Table entries as 'Dropped (i.e. Free)' to indicate the drops - * performed in PersistenFileSysObj_MirrorReDrop(). This module only updates - * drop pending/aborting create entries to 'free' in Persistent tables. + * Mark Persistent Table entries as 'Dropped (i.e. Free)' to indicate the + * drops performed in PersistenFileSysObj_MirrorReDrop(). This module only + * updates drop pending/aborting create entries to 'free' in Persistent + * tables. */ FileRep_InsertConfigLogEntry("run resync transition, mark mirror redropped"); PersistentFileSysObj_MarkMirrorReDropped(); FileRepSubProcess_ProcessSignals(); - if (! (segmentState == SegmentStateReady && - dataState == DataStateInResync)) + if (!(segmentState == SegmentStateReady && + dataState == DataStateInResync)) { goto exit; } @@ -979,105 +1008,108 @@ FileRepResyncManager_InResyncTransition(void) FileRep_InsertConfigLogEntry("run resync transition, mark mirror recreated"); PersistentFileSysObj_MarkMirrorReCreated(); - + FileRepSubProcess_ProcessSignals(); - if (! (segmentState == SegmentStateReady && - dataState == DataStateInResync)) + if (!(segmentState == SegmentStateReady && + dataState == DataStateInResync)) { goto exit; - } - + } + SIMPLE_FAULT_INJECTOR(FileRepTransitionToInResyncMarkCompleted); - FileRep_InsertConfigLogEntry("run resync transition, mark transition to resync completed"); + FileRep_InsertConfigLogEntry("run resync transition, mark transition to resync completed"); ChangeTracking_MarkTransitionToResyncCompleted(); - + FileRepSubProcess_ProcessSignals(); - if (! 
(segmentState == SegmentStateReady && - dataState == DataStateInResync)) + if (!(segmentState == SegmentStateReady && + dataState == DataStateInResync)) { goto exit; - } - + } + exit: LWLockRelease(MirroredLock); - + /* database is resumed */ primaryMirrorSetIOSuspended(FALSE); - + return status; } /* * FileRepPrimary_RunResyncManager() */ -static int +static int FileRepPrimary_RunResyncManager(void) { - int status = STATUS_OK; - bool retval; - int ii=0; - bool mirroredLock = FALSE; - struct timeval resyncTime; - - FileRepResyncHashEntry_s entry; - ResynchronizeScanToken token; - + int status = STATUS_OK; + bool retval; + int ii = 0; + bool mirroredLock = FALSE; + struct timeval resyncTime; + + FileRepResyncHashEntry_s entry; + ResynchronizeScanToken token; + FileRep_InsertConfigLogEntry("run resync manager"); ResynchronizeScanToken_Init(&token); - + /* set start resync time required for reporting */ gettimeofday(&resyncTime, NULL); - + fileRepResyncShmem->startResyncTime.tv_sec = resyncTime.tv_sec; fileRepResyncShmem->startResyncTime.tv_usec = resyncTime.tv_usec; - - while (1) { - + + while (1) + { + FileRepSubProcess_ProcessSignals(); - if (! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { break; } if (FileRepResync_CheckProgress() == FILEREP_MAX_RESYNC_FILES) { - pg_usleep(50000L); // 50ms - continue; + pg_usleep(50000L); + //50 ms + continue; } - + retval = PersistentFileSysObj_ResynchronizeScan( - &token, - &entry.relFileNode, - &entry.segmentFileNum, - &entry.relStorageMgr, - &entry.mirrorDataSynchronizationState, - &entry.mirrorBufpoolResyncChangedPageCount, - &entry.mirrorBufpoolResyncCkptLoc, - &entry.mirrorBufpoolResyncCkptBlockNum, - &entry.mirrorAppendOnlyLossEof, - &entry.mirrorAppendOnlyNewEof, - &entry.persistentTid, - &entry.persistentSerialNum); - + &token, + &entry.relFileNode, + &entry.segmentFileNum, + &entry.relStorageMgr, + &entry.mirrorDataSynchronizationState, + &entry.mirrorBufpoolResyncChangedPageCount, + &entry.mirrorBufpoolResyncCkptLoc, + &entry.mirrorBufpoolResyncCkptBlockNum, + &entry.mirrorAppendOnlyLossEof, + &entry.mirrorAppendOnlyNewEof, + &entry.persistentTid, + &entry.persistentSerialNum); + if (retval == FALSE) { /* wait that resync workers are completed */ while (FileRepResync_CheckProgress() > 0) { - pg_usleep(50000L); // 50ms - - FileRepSubProcess_ProcessSignals(); - if (! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + pg_usleep(50000L); + //50 ms + + FileRepSubProcess_ProcessSignals(); + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { break; - } + } } - + if (mirroredLock == TRUE) { LWLockRelease(MirroredLock); @@ -1085,62 +1117,68 @@ FileRepPrimary_RunResyncManager(void) mirroredLock = FALSE; break; } - - if (! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { break; - } - + } + LWLockAcquire(MirroredLock, LW_EXCLUSIVE); mirroredLock = TRUE; - + if (fileRepResyncShmem->appendOnlyCommitCount == 0) { - + FileRep_InsertConfigLogEntry("run resync transition, mark append only incremental"); - + PersistentFileSysObj_MarkAppendOnlyCatchup(); ResynchronizeScanToken_Init(&token); - - /* MirroredLock is not released. AppendOnly Resync has to finish under MirrorLock. 
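
Two pacing loops in this hunk keep the manager honest: a producer throttle that stops handing out work once FILEREP_MAX_RESYNC_FILES entries are outstanding, and a drain loop that waits for in-flight work to finish before MirroredLock is taken. Reduced to the pattern, with symbols as in the hunk (a sketch, not a drop-in):

    /* Producer throttle: cap outstanding resync entries. */
    if (FileRepResync_CheckProgress() == FILEREP_MAX_RESYNC_FILES)
    {
        pg_usleep(50000L);      /* 50 ms */
        continue;               /* re-check signals and progress */
    }

    /* Drain: wait until all queued entries have completed. */
    while (FileRepResync_CheckProgress() > 0)
    {
        pg_usleep(50000L);      /* 50 ms */
        FileRepSubProcess_ProcessSignals();     /* abort the wait on state change */
    }
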
*/ + + /* + * MirroredLock is not released. AppendOnly Resync has to + * finish under MirrorLock. + */ continue; } else { LWLockRelease(MirroredLock); mirroredLock = FALSE; - /* - * wait that all in progress AppendOnly transactions are committed - * and periodically check if more work to do for resync + + /* + * wait that all in progress AppendOnly transactions are + * committed and periodically check if more work to do for + * resync */ - pg_usleep(100000L); // 100ms - - FileRep_InsertConfigLogEntry("run resync transition, mark append only incremental"); + pg_usleep(100000L); + //100 ms + + FileRep_InsertConfigLogEntry("run resync transition, mark append only incremental"); LWLockAcquire(MirroredLock, LW_EXCLUSIVE); PersistentFileSysObj_MarkAppendOnlyCatchup(); LWLockRelease(MirroredLock); - + ResynchronizeScanToken_Init(&token); continue; } } - + if (entry.mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_DataSynchronized || entry.mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_None) { if (Debug_filerep_print) { - elog(LOG, "Not adding this entry to hash table %s as DataSynchronized/None %d", - entry.fileName, entry.mirrorDataSynchronizationState); + elog(LOG, "Not adding this entry to hash table %s as DataSynchronized/None %d", + entry.fileName, entry.mirrorDataSynchronizationState); } continue; } - + if (isFullResync()) { - if (entry.mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_BufferPoolPageIncremental) + if (entry.mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_BufferPoolPageIncremental) { ereport(WARNING, (errmsg("resync failure, " @@ -1151,7 +1189,7 @@ FileRepPrimary_RunResyncManager(void) "append only loss eof " INT64_FORMAT " " "append only new eof " INT64_FORMAT " " "mirror buffer pool resync changed page count " INT64_FORMAT " ", - entry.fileName, + entry.fileName, PersistentFileSysRelStorageMgr_Name(entry.relStorageMgr), entry.relStorageMgr, MirroredRelDataSynchronizationState_Name(entry.mirrorDataSynchronizationState), @@ -1159,20 +1197,21 @@ FileRepPrimary_RunResyncManager(void) entry.mirrorAppendOnlyLossEof, entry.mirrorAppendOnlyNewEof, entry.mirrorBufpoolResyncChangedPageCount), - errhint("run gprecoverseg -F again to re-establish mirror connectivity"))); - + errhint("run gprecoverseg -F again to re-establish mirror connectivity"))); + FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); FileRepSubProcess_ProcessSignals(); } } - + /* - * Resynchronize Lock is taken for particular relation to protect from drop and truncate for particular relation. + * Resynchronize Lock is taken for particular relation to protect from + * drop and truncate for particular relation. 
*/ LockRelationForResynchronize( - &entry.relFileNode, + &entry.relFileNode, AccessExclusiveLock); - + if (!PersistentFileSysObj_ResynchronizeRefetch( &entry.relFileNode, &entry.segmentFileNum, @@ -1182,37 +1221,38 @@ FileRepPrimary_RunResyncManager(void) &entry.mirrorDataSynchronizationState, &entry.mirrorBufpoolResyncCkptLoc, &entry.mirrorBufpoolResyncCkptBlockNum, - &entry.mirrorAppendOnlyLossEof, - &entry.mirrorAppendOnlyNewEof)) + &entry.mirrorAppendOnlyLossEof, + &entry.mirrorAppendOnlyNewEof)) { - UnlockRelationForResynchronize( - &entry.relFileNode, - AccessExclusiveLock); + UnlockRelationForResynchronize( + &entry.relFileNode, + AccessExclusiveLock); - if (Debug_filerep_print) - elog(LOG, "Not adding this entry to hash table %s", entry.fileName); - continue; + if (Debug_filerep_print) + elog(LOG, "Not adding this entry to hash table %s", entry.fileName); + continue; } SIMPLE_FAULT_INJECTOR(FileRepResync); FileRep_GetRelationPath( - entry.fileName, - entry.relFileNode, + entry.fileName, + entry.relFileNode, entry.segmentFileNum); - + status = FileRepResync_InsertEntry(&entry); - + if (status != STATUS_OK) { - /* - * UnlockRelationForResynchronize() will be issued in FileRepResync_Cleanup(). + /* + * UnlockRelationForResynchronize() will be issued in + * FileRepResync_Cleanup(). */ - FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); + FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); if (mirroredLock == TRUE) { LWLockRelease(MirroredLock); - + mirroredLock = FALSE; } } @@ -1238,55 +1278,58 @@ FileRepPrimary_RunResyncManager(void) entry.persistentSerialNum); } } - + FileRepResync_Cleanup(); - + return status; } /* * FileRepResyncManager_InSyncTransition() */ -static int +static int FileRepResyncManager_InSyncTransition(void) -{ +{ int status = STATUS_OK; - + FileRep_InsertConfigLogEntry("run resync sync transition"); SIMPLE_FAULT_INJECTOR(FileRepTransitionToInSyncBegin); - while (1) { + while (1) + { /* - * (MirroredLock, LW_EXCLUSIVE) is acquired and released in CreateCheckPoint + * (MirroredLock, LW_EXCLUSIVE) is acquired and released in + * CreateCheckPoint */ FileRep_InsertConfigLogEntry("run sync transition, request checkpoint"); - + MirroredFlatFile_DropFilesFromDir(); - + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_RESYNC_TO_INSYNC_TRANSITION); - - /* + + /* * The second checkpoint is required in order to mirror pg_control - * with last checkpoint position in the xlog file that is mirrored (XLogSwitch). + * with last checkpoint position in the xlog file that is mirrored + * (XLogSwitch). */ RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); FileRepSubProcess_ProcessSignals(); - if (! 
(FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInSync)) + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInSync)) { break; - } - + } + FileRep_InsertConfigLogEntry("run sync transition, mirror to sync transition"); FileRepPrimary_MirrorInSyncTransition(); - + FileRep_InsertConfigLogEntry("run sync transition, mark transition to sync completed"); ChangeTracking_MarkTransitionToInsyncCompleted(); - + FileRep_InsertConfigLogEntry("run sync transition, mark transition to sync completed on primary and mirror"); /* primary and mirror have now completed re-sync */ @@ -1297,7 +1340,7 @@ FileRepResyncManager_InSyncTransition(void) break; } - + return status; } @@ -1307,101 +1350,114 @@ FileRepResyncManager_InSyncTransition(void) void FileRepResyncManager_ResyncFlatFiles(void) { - int status = STATUS_OK; - + int status = STATUS_OK; + FileRep_SetSegmentState(SegmentStateInSyncTransition, FaultTypeNotInitialized); SIMPLE_FAULT_INJECTOR(FileRepTransitionToInSync); - while (1) + while (1) { FileRep_InsertConfigLogEntry("run sync transition, resync pg_control file"); status = XLogRecoverMirrorControlFile(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - + FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } + } MirroredFlatFile_MirrorDropTemporaryFiles(); FileRep_InsertConfigLogEntry("run sync transition, resync drop temporary files"); FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } - + } + status = FlatFilesTemporaryResynchronizeMirror(); FileRep_InsertConfigLogEntry("run sync transition, resync temporary files"); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - + FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } - + } + FileRep_InsertConfigLogEntry("run sync transition, resync pg_database and pg_auth files"); status = FlatFilesRecoverMirror(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - + FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } - + } + FileRep_InsertConfigLogEntry("run sync transition, resync pg_twophase files"); status = TwoPhaseRecoverMirror(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - + FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } - + } + FileRep_InsertConfigLogEntry("run sync transition, resync slru files"); status = SlruRecoverMirror(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - + FileRepSubProcess_ProcessSignals(); - if (segmentState != SegmentStateInSyncTransition) { + if (segmentState != SegmentStateInSyncTransition) + { break; - } - + } + FileRep_InsertConfigLogEntry("run sync transition, resync pgversion files"); status = PgVersionRecoverMirror(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { break; } - FileRepSubProcess_ProcessSignals(); - + FileRepSubProcess_ProcessSignals(); + FileRep_InsertConfigLogEntry("run sync transition, resync pgxlog files"); status = XLogRecoverMirror(); - - if (status != STATUS_OK) { - break; + 
+ if (status != STATUS_OK) + { + break; } FileRepSubProcess_ProcessSignals(); @@ -1409,211 +1465,221 @@ FileRepResyncManager_ResyncFlatFiles(void) break; } - if (status != STATUS_OK) + if (status != STATUS_OK) { FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); FileRepSubProcess_ProcessSignals(); } - + if (segmentState == SegmentStateInSyncTransition && status == STATUS_OK) { FileRep_SetDataState(DataStateInSync, FALSE); - - FileRep_SetSegmentState(SegmentStateReady, FaultTypeNotInitialized); + + FileRep_SetSegmentState(SegmentStateReady, FaultTypeNotInitialized); } - + return; } static int FileRepResync_InsertEntry( - FileRepResyncHashEntry_s* entry) + FileRepResyncHashEntry_s *entry) { - int status = STATUS_OK; - bool foundPtr; - FileRepResyncHashEntry_s *entryLocal; - char key[MAXPGPATH+1]; - + int status = STATUS_OK; + bool foundPtr; + FileRepResyncHashEntry_s *entryLocal; + char key[MAXPGPATH + 1]; + snprintf(key, sizeof(key), "%s", entry->fileName); if (Debug_filerep_print) elog(LOG, "FileRepResync_InsertEntry() identifier:'%s' ", key); - + FileRepResync_LockAcquire(); - + Assert(fileRepResyncShmem->fileRepResyncHash != NULL); - + entryLocal = (FileRepResyncHashEntry_s *) hash_search( - fileRepResyncShmem->fileRepResyncHash, - (void *) &key, - HASH_ENTER_NULL, + fileRepResyncShmem->fileRepResyncHash, + (void *) &key, + HASH_ENTER_NULL, &foundPtr); - - if (entryLocal == NULL) { - + + if (entryLocal == NULL) + { + status = STATUS_ERROR; - ereport(WARNING, + ereport(WARNING, (errmsg("resync failure, " "could not insert resync information into hash table identifier '%s', no memory " - "failover requested", - key), + "failover requested", + key), errhint("run gprecoverseg to re-establish mirror connectivity"), - FileRep_errcontext())); + FileRep_errcontext())); goto exit; } - - if (foundPtr) { - + + if (foundPtr) + { + status = STATUS_ERROR; - ereport(WARNING, + ereport(WARNING, (errmsg("resync failure, " "could not insert resync information into hash table identifier '%s', entry exists " - "failover requested", - key), + "failover requested", + key), errhint("run gprecoverseg to re-establish mirror connectivity"), - FileRep_errcontext())); - - - } else { - + FileRep_errcontext())); + + + } + else + { + entryLocal->relFileNode.relNode = entry->relFileNode.relNode; entryLocal->relFileNode.spcNode = entry->relFileNode.spcNode; entryLocal->relFileNode.dbNode = entry->relFileNode.dbNode; entryLocal->segmentFileNum = entry->segmentFileNum; - + entryLocal->relStorageMgr = entry->relStorageMgr; entryLocal->mirrorDataSynchronizationState = entry->mirrorDataSynchronizationState; entryLocal->mirrorBufpoolResyncCkptLoc = entry->mirrorBufpoolResyncCkptLoc; entryLocal->mirrorBufpoolResyncCkptBlockNum = entry->mirrorBufpoolResyncCkptBlockNum; - + entryLocal->mirrorAppendOnlyLossEof = entry->mirrorAppendOnlyLossEof; entryLocal->mirrorAppendOnlyNewEof = entry->mirrorAppendOnlyNewEof; - + entryLocal->mirrorBufpoolResyncChangedPageCount = entry->mirrorBufpoolResyncChangedPageCount; - + entryLocal->persistentTid = entry->persistentTid; entryLocal->persistentSerialNum = entry->persistentSerialNum; - + entryLocal->fileRepResyncState = FileRepResyncStateInitialized; - + fileRepResyncShmem->writeCount++; - + } - -exit: + +exit: FileRepResync_LockRelease(); - + return status; } -static FileRepResyncHashEntry_s* +static FileRepResyncHashEntry_s * FileRepResync_LookupEntry(FileName fileName) { - FileRepResyncHashEntry_s * entry; - char key[MAXPGPATH+1]; - bool foundPtr; - + FileRepResyncHashEntry_s 
*entry; + char key[MAXPGPATH + 1]; + bool foundPtr; + snprintf(key, sizeof(key), "%s", fileName); - + Assert(fileRepResyncShmem->fileRepResyncHash != NULL); - + entry = (FileRepResyncHashEntry_s *) hash_search( - fileRepResyncShmem->fileRepResyncHash, - (void *) &key, - HASH_ENTER_NULL, + fileRepResyncShmem->fileRepResyncHash, + (void *) &key, + HASH_ENTER_NULL, &foundPtr); - + if (Debug_filerep_print) { - if (entry == NULL) + if (entry == NULL) elog(LOG, "FileRepResync_LookupEntry() could not find resync entry identifier:'%s' ", fileName); } - - return entry; + + return entry; } static bool -FileRepResync_RemoveEntry(FileName fileName) +FileRepResync_RemoveEntry(FileName fileName) { - - FileRepResyncHashEntry_s *entry; - char key[MAXPGPATH+1]; - bool isRemoved = FALSE; - + + FileRepResyncHashEntry_s *entry; + char key[MAXPGPATH + 1]; + bool isRemoved = FALSE; + snprintf(key, sizeof(key), "%s", fileName); - + Assert(fileRepResyncShmem->fileRepResyncHash != NULL); - - entry = hash_search(fileRepResyncShmem->fileRepResyncHash, - (void *) &key, - HASH_REMOVE, + + entry = hash_search(fileRepResyncShmem->fileRepResyncHash, + (void *) &key, + HASH_REMOVE, NULL); - - if (entry) { + + if (entry) + { if (Debug_filerep_print) - elog(LOG, "FileRepResync_RemoveEntry() removed resync entry identifier:'%s' ", fileName); + elog(LOG, "FileRepResync_RemoveEntry() removed resync entry identifier:'%s' ", fileName); isRemoved = TRUE; - } - + } + return isRemoved; -} +} static int FileRepResync_CheckProgress(void) { - FileRepResyncHashEntry_s *entry = NULL; - HASH_SEQ_STATUS hash_status; - int countProgress = 0; - + FileRepResyncHashEntry_s *entry = NULL; + HASH_SEQ_STATUS hash_status; + int countProgress = 0; + FileRepResync_LockAcquire(); - + Assert(fileRepResyncShmem->fileRepResyncHash != NULL); - + if (fileRepResyncShmem->resyncCompletedCount > 0) { - + hash_seq_init(&hash_status, fileRepResyncShmem->fileRepResyncHash); - - while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) + + while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) { switch (entry->fileRepResyncState) { case FileRepResyncStateInitialized: case FileRepResyncStateInProgress: - + break; - + case FileRepResyncStateCompleted: - /* Release Resynchronize Lock on relation that was taken before fetching the relation */ + + /* + * Release Resynchronize Lock on relation that was taken + * before fetching the relation + */ UnlockRelationForResynchronize( - &entry->relFileNode, - AccessExclusiveLock); - - if (FileRepResync_RemoveEntry(entry->fileName) == TRUE) + &entry->relFileNode, + AccessExclusiveLock); + + if (FileRepResync_RemoveEntry(entry->fileName) == TRUE) { - fileRepResyncShmem->resyncCompletedCount--; + fileRepResyncShmem->resyncCompletedCount--; Assert(fileRepResyncShmem->resyncCompletedCount >= 0); - } - else + } + else { Assert(0); } - + fileRepResyncShmem->blocksSynchronized += entry->mirrorBufpoolResyncChangedPageCount; - + /* - * PersistentFileSysObj_ResynchronizeRelationComplete() issues IOs. - * That may be long operation especially if IO is issued while segment is - * transitioning to change tracking. + * PersistentFileSysObj_ResynchronizeRelationComplete() + * issues IOs. That may be long operation especially if IO + * is issued while segment is transitioning to change + * tracking. 
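
The insert/lookup/remove trio above is PostgreSQL dynahash with the MAXPGPATH-sized file name as key. For orientation, a self-contained sketch of how such a table is created (the table name and bucket estimate are illustrative, not taken from this patch):

    #include "postgres.h"
    #include "utils/hsearch.h"

    typedef struct ResyncSketchEntry
    {
        char        fileName[MAXPGPATH + 1];    /* hash key; must come first */
        int         fileRepResyncState;
    } ResyncSketchEntry;

    static HTAB *
    resync_hash_create_sketch(void)
    {
        HASHCTL     info;

        MemSet(&info, 0, sizeof(info));
        info.keysize = MAXPGPATH + 1;
        info.entrysize = sizeof(ResyncSketchEntry);
        info.hash = string_hash;

        return hash_create("FileRep resync sketch", 128, &info,
                           HASH_ELEM | HASH_FUNCTION);
    }

Note that FileRepResync_LookupEntry() above searches with HASH_ENTER_NULL rather than HASH_FIND, so in dynahash terms a miss allocates a fresh entry; NULL comes back only when shared memory is exhausted.
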
*/ FileRepResync_LockRelease(); if (Debug_filerep_print) { elog(LOG, "FileRepResync_CheckProgress() identifier:'%s' state:'%d' resyncCompletedCount:'%d' " - "blocks synchronized: " INT64_FORMAT " ", + "blocks synchronized: " INT64_FORMAT " ", entry->fileName, entry->fileRepResyncState, fileRepResyncShmem->resyncCompletedCount, @@ -1621,17 +1687,17 @@ FileRepResync_CheckProgress(void) } PersistentFileSysObj_ResynchronizeRelationComplete( - &entry->persistentTid, - entry->persistentSerialNum, - entry->mirrorAppendOnlyNewEof, - TRUE); - + &entry->persistentTid, + entry->persistentSerialNum, + entry->mirrorAppendOnlyNewEof, + TRUE); + FileRepResync_LockAcquire(); - + break; - - /* FileRepResyncStateFault is not in use */ - case FileRepResyncStateFault: + + /* FileRepResyncStateFault is not in use */ + case FileRepResyncStateFault: case FileRepResyncStateNotInitialized: Assert(0); break; @@ -1639,21 +1705,22 @@ FileRepResync_CheckProgress(void) } } -#ifdef FAULT_INJECTOR +#ifdef FAULT_INJECTOR if (fileRepResyncShmem->resyncInProgressCount > 10) FaultInjector_InjectFaultIfSet( - FileRepResyncInProgress, + FileRepResyncInProgress, DDLNotSpecified, - "", // databaseName - ""); // tableName -#endif - - countProgress = fileRepResyncShmem->resyncCompletedCount + - fileRepResyncShmem->writeCount + - fileRepResyncShmem->resyncInProgressCount; - + "", //databaseName + ""); + //tableName +#endif + + countProgress = fileRepResyncShmem->resyncCompletedCount + + fileRepResyncShmem->writeCount + + fileRepResyncShmem->resyncInProgressCount; + FileRepResync_LockRelease(); - + return countProgress; } @@ -1670,35 +1737,36 @@ FileRepResync_CheckProgress(void) * resynchronizing these tables. The CT hash entry is directly returned in * this case. */ -FileRepResyncHashEntry_s* +FileRepResyncHashEntry_s * FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) { - bool found = FALSE; - FileRepResyncHashEntry_s *entry = NULL; - HASH_SEQ_STATUS hash_status; - int NumberOfRelations = 0; - ChangeTrackingRequest *requestLocal = NULL; - int64 changedPageCount = 0; - + bool found = FALSE; + FileRepResyncHashEntry_s *entry = NULL; + HASH_SEQ_STATUS hash_status; + int NumberOfRelations = 0; + ChangeTrackingRequest *requestLocal = NULL; + int64 changedPageCount = 0; + FileRepResync_LockAcquire(); - + Assert(fileRepResyncShmem->fileRepResyncHash != NULL); - - if (fileRepResyncShmem->writeCount > 0) { - + + if (fileRepResyncShmem->writeCount > 0) + { + hash_seq_init(&hash_status, fileRepResyncShmem->fileRepResyncHash); - - while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) + + while ((entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) { if (entry->fileRepResyncState != FileRepResyncStateInitialized) { continue; } - + if (entry->mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_BufferPoolPageIncremental) { changedPageCount += entry->mirrorBufpoolResyncChangedPageCount; - + if (changedPageCount > gp_filerep_ct_batch_size) { if (NumberOfRelations == 0) @@ -1708,16 +1776,17 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) hash_seq_term(&hash_status); break; } - + NumberOfRelations++; } - else + else { if (NumberOfRelations > 0) - { - /* - * if first entry has BufferPoolPageIncremental state then group only - * relations with BufferPoolPageIncremental for next resync. + { + /* + * if first entry has BufferPoolPageIncremental state then + * group only relations with BufferPoolPageIncremental for + * next resync. 
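
The batching rule applied above groups consecutive page-incremental entries until their combined changed-page count exceeds gp_filerep_ct_batch_size, except that a single oversized relation is still taken alone. Restated over a plain array (this function and its inputs are assumptions for illustration, not patch API):

    /* Returns how many leading relations fit into one change-tracking batch. */
    static int
    ct_batch_count_sketch(const int64 *changedPages, int n, int64 batchLimit)
    {
        int64       total = 0;
        int         rels = 0;
        int         i;

        for (i = 0; i < n; i++)
        {
            total += changedPages[i];
            if (total > batchLimit)
            {
                if (rels == 0)
                    rels++;     /* an oversized relation still ships, alone */
                break;
            }
            rels++;
        }
        return rels;
    }

The oversized case is then finished incrementally through the ask_for_more protocol described in the GetChanges() comment further down.
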
*/ continue; } @@ -1726,29 +1795,29 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) entry->fileRepResyncState = FileRepResyncStateInProgress; fileRepResyncShmem->writeCount--; fileRepResyncShmem->resyncInProgressCount++; - + Assert(fileRepResyncShmem->writeCount >= 0); - - if (Debug_filerep_print) - elog(LOG, + + if (Debug_filerep_print) + elog(LOG, "FileRepPrimary_GetResyncEntry() identifier:'%s' " "mirrorDataSynchronizationState:'%s(%d)' ", entry->fileName, MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState), - entry->mirrorDataSynchronizationState); - + entry->mirrorDataSynchronizationState); + break; } } - + if (NumberOfRelations > 0) { int count = 0; - requestLocal = ChangeTracking_FormRequest(NumberOfRelations); - + requestLocal = ChangeTracking_FormRequest(NumberOfRelations); + hash_seq_init(&hash_status, fileRepResyncShmem->fileRepResyncHash); - + while (count < NumberOfRelations && (entry = (FileRepResyncHashEntry_s *) hash_seq_search(&hash_status)) != NULL) { @@ -1756,10 +1825,10 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) { continue; } - + if (entry->mirrorDataSynchronizationState == MirroredRelDataSynchronizationState_BufferPoolPageIncremental) { - + /* * When a single relation has more changed blocks than * gp_filerep_ct_batch_size, it must be called in @@ -1769,10 +1838,11 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) * GetChanges() did not finish with this relation * (ask_for_more flag). The caller must invoke * GetChanges() for this relation again (when ready) and - * pass the last block number from the previous call as the - * beginning block number for this call. GetChanges() will - * then return the next batch of changes and this will - * continue until ask_for_more flag is returned as false. + * pass the last block number from the previous call as + * the beginning block number for this call. GetChanges() + * will then return the next batch of changes and this + * will continue until ask_for_more flag is returned as + * false. 
* * If GetChanges() was called with more than 1 relation * (persistent serial number) at a time AND it sees more @@ -1790,18 +1860,18 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) fileRepResyncShmem->resyncInProgressCount++; entry->fileRepResyncState = FileRepResyncStateInProgress; count++; - + Assert(fileRepResyncShmem->writeCount >= 0); - if (Debug_filerep_print) - elog(LOG, + if (Debug_filerep_print) + elog(LOG, "FileRepPrimary_GetResyncEntry() identifier:'%s' NumberOfRelations:'%d' " "mirrorDataSynchronizationState:'%s(%d)' count:'%d' ", entry->fileName, NumberOfRelations, MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState), entry->mirrorDataSynchronizationState, - count); + count); } } if (entry != NULL) @@ -1811,58 +1881,61 @@ FileRepPrimary_GetResyncEntry(ChangeTrackingRequest **request) } Insist(requestLocal->count == NumberOfRelations); *request = requestLocal; - + } } - + FileRepResync_LockRelease(); - + if (found == FALSE) { entry = NULL; } - + return entry; } int FileRepResync_UpdateEntry( - FileRepResyncHashEntry_s* entry) + FileRepResyncHashEntry_s *entry) { - FileRepResyncHashEntry_s *entryLocal; - int status = STATUS_OK; - + FileRepResyncHashEntry_s *entryLocal; + int status = STATUS_OK; + FileRepResync_LockAcquire(); - + entryLocal = FileRepResync_LookupEntry(entry->fileName); - - if (entryLocal != NULL) { + + if (entryLocal != NULL) + { fileRepResyncShmem->resyncCompletedCount++; fileRepResyncShmem->resyncInProgressCount--; - + entryLocal->fileRepResyncState = FileRepResyncStateCompleted; - + if (entryLocal->mirrorBufpoolResyncChangedPageCount == 0) { entryLocal->mirrorBufpoolResyncChangedPageCount = entry->mirrorBufpoolResyncChangedPageCount; } - + Assert(fileRepResyncShmem->resyncInProgressCount >= 0); - } else { + } + else + { Assert(0); status = STATUS_ERROR; } if (Debug_filerep_print) - { + { elog(LOG, "FileRepResync_UpdateEntry() identifier:'%s' state:'%d' resyncCompletedCount:'%d' ", entry->fileName, entry->fileRepResyncState, fileRepResyncShmem->resyncCompletedCount); } - + FileRepResync_LockRelease(); - + return status; } diff --git a/src/backend/cdb/cdbfilerepresyncworker.c b/src/backend/cdb/cdbfilerepresyncworker.c index 33e93d5771..9a8c99389b 100644 --- a/src/backend/cdb/cdbfilerepresyncworker.c +++ b/src/backend/cdb/cdbfilerepresyncworker.c @@ -34,9 +34,9 @@ #define OIDCHARS 10 /* max chars printed by %u */ -static int FileRepPrimary_RunResyncWorker(void); -static int FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry); -static int FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request); +static int FileRepPrimary_RunResyncWorker(void); +static int FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry); +static int FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request); static bool readBufferRequest = FALSE; static void FileRepResync_ResetReadBufferRequest(void); @@ -57,36 +57,35 @@ FileRepResync_SetReadBufferRequest(void) bool FileRepResyncWorker_IsResyncRequest(void) { - return (readBufferRequest == FALSE && FileRepPrimary_IsResyncWorker()); + return (readBufferRequest == FALSE && FileRepPrimary_IsResyncWorker()); } /* * FileRepPrimary_StartResyncWorker() */ -void +void FileRepPrimary_StartResyncWorker(void) -{ - int status = STATUS_OK; - +{ + int status = STATUS_OK; + FileRep_InsertConfigLogEntry("start resync worker"); - + Insist(fileRepRole == FileRepPrimaryRole); - - while (1) { - - if (status != STATUS_OK) + + while (1) + { + + if (status != 
STATUS_OK) { FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); FileRepSubProcess_SetState(FileRepStateFault); } - + /* * We are waiting for following conditions to move forward: * - * Database is running - * And - * if dataState is InResync, we wait for FileRepSubProcess to Ready state - * else don't wait + * Database is running And if dataState is InResync, we wait for + * FileRepSubProcess to Ready state else don't wait */ while (!isDatabaseRunning() || !(dataState == DataStateInResync ? FileRepSubProcess_GetState() == FileRepStateReady : true)) @@ -99,24 +98,26 @@ FileRepPrimary_StartResyncWorker(void) break; } - pg_usleep(50000L); /* 50 ms */ + pg_usleep(50000L); /* 50 ms */ } - + if (FileRepSubProcess_GetState() == FileRepStateShutdown || - FileRepSubProcess_GetState() == FileRepStateShutdownBackends) { + FileRepSubProcess_GetState() == FileRepStateShutdownBackends) + { break; } - + FileRepSubProcess_InitHeapAccess(); status = FileRepPrimary_RunResyncWorker(); - - if (status != STATUS_OK) { + + if (status != STATUS_OK) + { continue; } - + break; - + } } @@ -127,60 +128,62 @@ FileRepPrimary_StartResyncWorker(void) static int FileRepPrimary_RunResyncWorker(void) { - int status = STATUS_OK; - FileRepResyncHashEntry_s *entry = NULL; - ChangeTrackingRequest *request = NULL; + int status = STATUS_OK; + FileRepResyncHashEntry_s *entry = NULL; + ChangeTrackingRequest *request = NULL; FileRep_InsertConfigLogEntry("run resync worker"); - - while (1) { + + while (1) + { FileRepSubProcess_ProcessSignals(); - - if (! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { break; } entry = FileRepPrimary_GetResyncEntry(&request); - - if (entry == NULL && request == NULL) { - + + if (entry == NULL && request == NULL) + { + pg_usleep(100000L); /* 100 ms */ continue; } - - Assert(! 
(entry != NULL && request != NULL)); + + Assert(!(entry != NULL && request != NULL)); if (entry != NULL) - { + { status = FileRepPrimary_ResyncWrite(entry); - + if (status == STATUS_OK) { if (entry->mirrorBufpoolResyncChangedPageCount == 0) { entry->mirrorBufpoolResyncChangedPageCount = (entry->mirrorAppendOnlyNewEof - entry->mirrorAppendOnlyLossEof) / BLCKSZ; - } - + } + status = FileRepResync_UpdateEntry(entry); } } - + if (request != NULL) { status = FileRepPrimary_ResyncBufferPoolIncrementalWrite(request); request = NULL; } - + if (status != STATUS_OK) { break; } - + } - + return status; } @@ -189,10 +192,10 @@ FileRepPrimary_RunResyncWorker(void) * RESYNC relation (Buffer Pool) * INPUT parameters * *) RelFileNode - * *) beginLSN (the earliest LSN to be re-synchronized) + * *) beginLSN (the earliest LSN to be re-synchronized) * *) endLSN (the latest LSN to be re-synchronized) * *) endBlockNumber (the latest block in relation to be re-synchronized) - * *) + * *) * * * RESYNC relation (Append Only) @@ -206,33 +209,33 @@ FileRepPrimary_RunResyncWorker(void) */ static int -FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) +FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) { - int status = STATUS_OK; - Page page; - Buffer buf; - BlockNumber numBlocks; - BlockNumber blkno; - SMgrRelation smgr_relation; - char relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1]; - XLogRecPtr loc; - int count = 0; - int thresholdCount = 0; - bool mirrorDataLossOccurred = FALSE; - + int status = STATUS_OK; + Page page; + Buffer buf; + BlockNumber numBlocks; + BlockNumber blkno; + SMgrRelation smgr_relation; + char relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1]; + XLogRecPtr loc; + int count = 0; + int thresholdCount = 0; + bool mirrorDataLossOccurred = FALSE; + switch (entry->relStorageMgr) { case PersistentFileSysRelStorageMgr_BufferPool: - + switch (entry->mirrorDataSynchronizationState) { case MirroredRelDataSynchronizationState_BufferPoolScanIncremental: case MirroredRelDataSynchronizationState_FullCopy: smgr_relation = smgropen(entry->relFileNode); - + numBlocks = smgrnblocks(smgr_relation); snprintf(relidstr, sizeof(relidstr), "%u/%u/%u", @@ -245,19 +248,20 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) relidstr, numBlocks); thresholdCount = Min(numBlocks, 1024); - - /* - * required in order to report how many blocks were synchronized - * if gp_persistent_relation_node does not return that information + + /* + * required in order to report how many blocks were + * synchronized if gp_persistent_relation_node does not + * return that information */ if (entry->mirrorBufpoolResyncChangedPageCount == 0) { entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum; } - - for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++) + + for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++) { - XLogRecPtr endResyncLSN = (isFullResync() ? + XLogRecPtr endResyncLSN = (isFullResync() ? 
FileRepResync_GetEndFullResyncLSN() : FileRepResync_GetEndIncrResyncLSN()); @@ -266,15 +270,15 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) FileRepResync_SetReadBufferRequest(); buf = ReadBuffer_Resync(smgr_relation, blkno); FileRepResync_ResetReadBufferRequest(); - + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); - + loc = PageGetLSN(page); - + if (Debug_filerep_print) { - elog(LOG, + elog(LOG, "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' " "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ", relidstr, @@ -292,21 +296,27 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) } if (XLByteLE(PageGetLSN(page), endResyncLSN) && - XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page))) + XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page))) { /* - * Because filerep sync is a special case, we don't do our write through the buffer - * pool. We need to recalculate the checksum for every page that we ship via resync. - * We only recalculate the checksum in a copy of the buffer, leaving the version in - * shared buffer alone. As a result, the version written to disk gets the correct - * checksum, but the buffer checksum is inconsistent with the buffer's data. + * Because filerep sync is a special case, we + * don't do our write through the buffer pool. We + * need to recalculate the checksum for every page + * that we ship via resync. We only recalculate + * the checksum in a copy of the buffer, leaving + * the version in shared buffer alone. As a + * result, the version written to disk gets the + * correct checksum, but the buffer checksum is + * inconsistent with the buffer's data. * - * If we don't first calculate the checksum, we are likely to be sending over a page - * that isn't dirty, but still has the old checksum from the original disk read not - * the one that has been written. + * If we don't first calculate the checksum, we + * are likely to be sending over a page that isn't + * dirty, but still has the old checksum from the + * original disk read not the one that has been + * written. */ - char *pageCopy = PageSetChecksumCopy(page, blkno); + char *pageCopy = PageSetChecksumCopy(page, blkno); smgrwrite(smgr_relation, blkno, @@ -317,14 +327,14 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) SIMPLE_FAULT_INJECTOR(FileRepResyncWorker); UnlockReleaseBuffer(buf); - + if (count > thresholdCount) { count = 0; FileRepSubProcess_ProcessSignals(); - - if (! 
(FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { mirrorDataLossOccurred = TRUE; break; @@ -333,38 +343,39 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) else count++; } - + if (mirrorDataLossOccurred) break; if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy) { LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock); - + numBlocks = smgrnblocks(smgr_relation); - + smgrtruncate(smgr_relation, - numBlocks, - TRUE /* isTemp, TRUE means to not record in XLOG */, - FALSE /* isLocalBuf */, - &entry->persistentTid, - entry->persistentSerialNum); - + numBlocks, + TRUE /* isTemp, TRUE means to not + * record in XLOG */ , + FALSE /* isLocalBuf */ , + &entry->persistentTid, + entry->persistentSerialNum); + UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock); } - + smgrimmedsync(smgr_relation); smgrclose(smgr_relation); - + smgr_relation = NULL; break; - - case MirroredRelDataSynchronizationState_None: + + case MirroredRelDataSynchronizationState_None: case MirroredRelDataSynchronizationState_DataSynchronized: break; - + default: - ereport(LOG, + ereport(LOG, (errmsg("could not resynchronize relation '%u/%u/%u' " "mirror synchronization state:'%s(%d)' ", entry->relFileNode.relNode, @@ -375,173 +386,177 @@ FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry) break; } break; - + case PersistentFileSysRelStorageMgr_AppendOnly: - { - MirroredAppendOnlyOpen mirroredOpen; - int primaryError; - bool mirrorDataLossOccurred; - char *buffer = NULL; - int64 endOffset = entry->mirrorAppendOnlyNewEof; - int64 startOffset = entry->mirrorAppendOnlyLossEof; - int32 bufferLen = 0; - int retval = 0; - - switch (entry->mirrorDataSynchronizationState) { - case MirroredRelDataSynchronizationState_AppendOnlyCatchup: - case MirroredRelDataSynchronizationState_FullCopy: - - /* - * required in order to report how many blocks were synchronized - * if gp_persistent_relation_node does not return that information - */ - if (entry->mirrorBufpoolResyncChangedPageCount == 0) - { - entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ; - } - - /* - * The MirroredAppendOnly_OpenResynchonize routine knows we are a resynch worker and - * will open BOTH, but write only the MIRROR!!! 
- */ - MirroredAppendOnly_OpenResynchonize( - &mirroredOpen, - &entry->relFileNode, - entry->segmentFileNum, - startOffset, - &primaryError, - &mirrorDataLossOccurred); - if (primaryError != 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file %u/%u/%u.%u : %s", - entry->relFileNode.dbNode, - entry->relFileNode.spcNode, - entry->relFileNode.relNode, - entry->segmentFileNum, - strerror(primaryError)))); - - break; - } + MirroredAppendOnlyOpen mirroredOpen; + int primaryError; + bool mirrorDataLossOccurred; + char *buffer = NULL; + int64 endOffset = entry->mirrorAppendOnlyNewEof; + int64 startOffset = entry->mirrorAppendOnlyLossEof; + int32 bufferLen = 0; + int retval = 0; + + switch (entry->mirrorDataSynchronizationState) + { + case MirroredRelDataSynchronizationState_AppendOnlyCatchup: + case MirroredRelDataSynchronizationState_FullCopy: - if (mirrorDataLossOccurred) - break; - - /* AO and CO Data Store writes 64k size by default */ - bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset); - buffer = (char*) palloc(bufferLen); - MemSet(buffer, 0, bufferLen); - - while (startOffset < endOffset) - { - retval = MirroredAppendOnly_Read( - &mirroredOpen, - buffer, - bufferLen); - - if (retval != bufferLen) + /* + * required in order to report how many blocks were + * synchronized if gp_persistent_relation_node does + * not return that information + */ + if (entry->mirrorBufpoolResyncChangedPageCount == 0) + { + entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ; + } + + /* + * The MirroredAppendOnly_OpenResynchonize routine + * knows we are a resynch worker and will open BOTH, + * but write only the MIRROR!!! + */ + MirroredAppendOnly_OpenResynchonize( + &mirroredOpen, + &entry->relFileNode, + entry->segmentFileNum, + startOffset, + &primaryError, + &mirrorDataLossOccurred); + if (primaryError != 0) { ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m", - startOffset, + errmsg("could not open file %u/%u/%u.%u : %s", entry->relFileNode.dbNode, entry->relFileNode.spcNode, entry->relFileNode.relNode, - entry->segmentFileNum))); - + entry->segmentFileNum, + strerror(primaryError)))); + break; - } - - MirroredAppendOnly_Append( - &mirroredOpen, - buffer, - bufferLen, - &primaryError, - &mirrorDataLossOccurred); - + } + if (mirrorDataLossOccurred) break; - Assert(primaryError == 0); // No primary writes as resync worker. - - startOffset += bufferLen; /* AO and CO Data Store writes 64k size by default */ - bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset); - } - - pfree(buffer); - buffer = NULL; - - if (mirrorDataLossOccurred) + bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset); + buffer = (char *) palloc(bufferLen); + MemSet(buffer, 0, bufferLen); + + while (startOffset < endOffset) + { + retval = MirroredAppendOnly_Read( + &mirroredOpen, + buffer, + bufferLen); + + if (retval != bufferLen) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m", + startOffset, + entry->relFileNode.dbNode, + entry->relFileNode.spcNode, + entry->relFileNode.relNode, + entry->segmentFileNum))); + + break; + } + + MirroredAppendOnly_Append( + &mirroredOpen, + buffer, + bufferLen, + &primaryError, + &mirrorDataLossOccurred); + + if (mirrorDataLossOccurred) + break; + + Assert(primaryError == 0); + //No primary writes as resync worker. 
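    /*
     * (illustrative note, not in the original patch) The 2 * BLCKSZ transfer
     * unit used in this loop matches the "64k size by default" comment:
     * Greenplum builds with a 32K BLCKSZ, so each read/append round trip
     * moves 64K until the final iteration, which Min() shrinks to the bytes
     * remaining before endOffset.
     */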
+ + startOffset += bufferLen; + /* AO and CO Data Store writes 64k size by default */ + bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset); + } + + pfree(buffer); + buffer = NULL; + + if (mirrorDataLossOccurred) + break; + + /* Flush written data on Mirror */ + MirroredAppendOnly_Flush( + &mirroredOpen, + &primaryError, + &mirrorDataLossOccurred); + if (mirrorDataLossOccurred) + break; + + Assert(primaryError == 0); + //Not flushed on primary as resync worker. + + /* Close Primary and Mirror */ + MirroredAppendOnly_Close( + &mirroredOpen, + &mirrorDataLossOccurred); + break; - - /* Flush written data on Mirror */ - MirroredAppendOnly_Flush( - &mirroredOpen, - &primaryError, - &mirrorDataLossOccurred); - if (mirrorDataLossOccurred) + + case MirroredRelDataSynchronizationState_None: + case MirroredRelDataSynchronizationState_DataSynchronized: break; - - Assert(primaryError == 0); // Not flushed on primary as resync worker. - - /* Close Primary and Mirror */ - MirroredAppendOnly_Close( - &mirroredOpen, - &mirrorDataLossOccurred); - - break; - - case MirroredRelDataSynchronizationState_None: - case MirroredRelDataSynchronizationState_DataSynchronized: - break; - - default: - ereport(LOG, - (errmsg("could not resynchronize relation '%u/%u/%u' " - "mirror synchronization state:'%s(%d)' ", - entry->relFileNode.relNode, - entry->relFileNode.spcNode, - entry->relFileNode.dbNode, - MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState), - entry->mirrorDataSynchronizationState))); - break; - } - - break; - } //case + + default: + ereport(LOG, + (errmsg("could not resynchronize relation '%u/%u/%u' " + "mirror synchronization state:'%s(%d)' ", + entry->relFileNode.relNode, + entry->relFileNode.spcNode, + entry->relFileNode.dbNode, + MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState), + entry->mirrorDataSynchronizationState))); + break; + } + + break; + } //case default: Assert(0); break; } //switch - - if (mirrorDataLossOccurred) + + if (mirrorDataLossOccurred) status = STATUS_ERROR; - + return status; } static int FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) { - int status = STATUS_OK; - Page page; - Buffer buf; - BlockNumber numBlocks = 0; - SMgrRelation smgr_relation = NULL; - char relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1]; - int ii; - XLogRecPtr loc; - int count = 0; - int thresholdCount = 0; - bool mirrorDataLossOccurred = FALSE; - int NumberOfRelations = request->count; - - FileRepResyncHashEntry_s entry; - ChangeTrackingResult *result = NULL; + int status = STATUS_OK; + Page page; + Buffer buf; + BlockNumber numBlocks = 0; + SMgrRelation smgr_relation = NULL; + char relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1]; + int ii; + XLogRecPtr loc; + int count = 0; + int thresholdCount = 0; + bool mirrorDataLossOccurred = FALSE; + int NumberOfRelations = request->count; + + FileRepResyncHashEntry_s entry; + ChangeTrackingResult *result = NULL; while (1) { @@ -554,26 +569,26 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) * them on primary. 
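
The flag described in the comment above is the file-local readBufferRequest defined near the top of cdbfilerepresyncworker.c; FileRepResyncWorker_IsResyncRequest() reports true only while it is unset. Every buffer-manager entry point that can evict dirty pages is bracketed the same way:

    /* Eviction-driven writes inside this bracket must also hit the primary. */
    FileRepResync_SetReadBufferRequest();
    buf = ReadBuffer_Resync(smgr_relation, blkno);
    FileRepResync_ResetReadBufferRequest();
    /* Outside the bracket, resync-originated writes go to the mirror only. */
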
*/ FileRepResync_SetReadBufferRequest(); - if ((result = ChangeTracking_GetChanges(request)) != NULL) + if ((result = ChangeTracking_GetChanges(request)) != NULL) { FileRepResync_ResetReadBufferRequest(); - + for (ii = 0; ii < result->count; ii++) { - + if (smgr_relation == NULL) { NumberOfRelations--; - + smgr_relation = smgropen(result->entries[ii].relFileNode); - + snprintf(relidstr, sizeof(relidstr), "%u/%u/%u", smgr_relation->smgr_rnode.spcNode, smgr_relation->smgr_rnode.dbNode, smgr_relation->smgr_rnode.relNode); numBlocks = smgrnblocks(smgr_relation); - + if (Debug_filerep_print) elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' " "number of blocks:'%u' ", @@ -581,17 +596,19 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) smgr_relation->smgr_rnode.dbNode, smgr_relation->smgr_rnode.relNode, numBlocks); - + thresholdCount = Min(numBlocks, 1024); } - + /* - * if relation was truncated then block_num from change tracking can be beyond numBlocks + * if relation was truncated then block_num from change + * tracking can be beyond numBlocks */ const BlockNumber blkno = result->entries[ii].block_num; + if (blkno >= numBlocks) { - ereport(LOG, + ereport(LOG, (errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), " "lsn change tracking '%X/%X' " "number of blocks '%d' ", @@ -601,33 +618,33 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) result->entries[ii].lsn_end.xrecoff, numBlocks), FileRep_errcontext())); - + goto flush_check; } - + /* * ReadBuffer() may need to write out a dirty buffer to make * room in buffer cache. Setting readBufferRequest indicates - * that resync worker process should perform writes on primary. - * When readBufferRequest flag is unset, resync workers send - * changed blocks only to mirror without writing them on - * primary. + * that resync worker process should perform writes on + * primary. When readBufferRequest flag is unset, resync + * workers send changed blocks only to mirror without writing + * them on primary. */ FileRepResync_SetReadBufferRequest(); buf = ReadBuffer_Resync(smgr_relation, blkno); FileRepResync_ResetReadBufferRequest(); - + Assert(blkno < numBlocks); - + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); - - loc = PageGetLSN(page); - + + loc = PageGetLSN(page); + if (Debug_filerep_config_print) { - elog(LOG, + elog(LOG, "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%X/%X' " "lsn end change tracking '%X/%X' ", relidstr, @@ -641,47 +658,49 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page))) { - if (! 
XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end)) + if (!XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end)) { ereport(LOG, - (errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn more than CT lsn, " - "lsn end change tracking '%X/%X' lsn page '%X/%X' " - "number of blocks '%d'", - relidstr, - blkno, - loc.xlogid, - loc.xrecoff, - result->entries[ii].lsn_end.xlogid, - result->entries[ii].lsn_end.xrecoff, - numBlocks), - FileRep_errcontext())); + (errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn more than CT lsn, " + "lsn end change tracking '%X/%X' lsn page '%X/%X' " + "number of blocks '%d'", + relidstr, + blkno, + loc.xlogid, + loc.xrecoff, + result->entries[ii].lsn_end.xlogid, + result->entries[ii].lsn_end.xrecoff, + numBlocks), + FileRep_errcontext())); } /* - * We checksum every page before replicating for the reasons described - * in FileRepPrimary_ResyncWrite above + * We checksum every page before replicating for the + * reasons described in FileRepPrimary_ResyncWrite above */ - char *pageCopy = PageSetChecksumCopy(page, blkno); + char *pageCopy = PageSetChecksumCopy(page, blkno); /* - * It's safe and better to perform write of the page to mirror, - * for this case, as primary and mirror data pages should always - * be same. So, we might do some extra work but definitely won't - * lose out blocks, or error out and need to perform full recovery. - * Need to cover for this case as there are some known scenarios where - * CT file can have extra records which should have been discarded, - * but as we loose out information of xlog LSN cannot be discarded. - * One such case is when CT_TRANSIENT being compacted to CT_COMPACT - * with specific xlog LSN (to discard extra records) in CT mode gets - * interrupted by resync. Compaction during Resync collects all the - * CT records and doesn't have xlog LSN information to discard any - * extra records from CT_TRANSIENT. + * It's safe and better to perform write of the page to + * mirror, for this case, as primary and mirror data pages + * should always be same. So, we might do some extra work + * but definitely won't lose out blocks, or error out and + * need to perform full recovery. Need to cover for this + * case as there are some known scenarios where CT file + * can have extra records which should have been + * discarded, but as we loose out information of xlog LSN + * cannot be discarded. One such case is when CT_TRANSIENT + * being compacted to CT_COMPACT with specific xlog LSN + * (to discard extra records) in CT mode gets interrupted + * by resync. Compaction during Resync collects all the CT + * records and doesn't have xlog LSN information to + * discard any extra records from CT_TRANSIENT. */ smgrwrite(smgr_relation, - blkno, - pageCopy, - FALSE); + blkno, + pageCopy, + FALSE); } SIMPLE_FAULT_INJECTOR(FileRepResyncWorker); @@ -690,25 +709,27 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) SIMPLE_FAULT_INJECTOR(FileRepResyncWorker); - flush_check: + flush_check: if (((ii + 1) == result->count) || - ! 
(result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode && - result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode && - result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode)) + !(result->entries[ii].relFileNode.spcNode == result->entries[ii + 1].relFileNode.spcNode && + result->entries[ii].relFileNode.dbNode == result->entries[ii + 1].relFileNode.dbNode && + result->entries[ii].relFileNode.relNode == result->entries[ii + 1].relFileNode.relNode)) { if (result->ask_for_more == false) { - + smgrimmedsync(smgr_relation); - + smgrclose(smgr_relation); - + smgr_relation = NULL; - + FileRep_GetRelationPath( - entry.fileName, - result->entries[ii].relFileNode, - 0 /* segment file number is always 0 for Buffer Pool */); + entry.fileName, + result->entries[ii].relFileNode, + 0 /* segment file number is + * always 0 for Buffer + * Pool */ ); /* * We only want to update the state with this call to @@ -721,13 +742,14 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) status = FileRepResync_UpdateEntry(&entry); if (status != STATUS_OK) { - break; + break; } } else { Assert(result->count == gp_filerep_ct_batch_size); Assert(request->count == 1); + /* * Update last_fetched block in request so that the * next call to GetChanges() knows where to start @@ -752,15 +774,15 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) result->entries[ii].relFileNode.relNode, blkno); } - } - + } + if (count > thresholdCount) { count = 0; FileRepSubProcess_ProcessSignals(); - - if (! (FileRepSubProcess_GetState() == FileRepStateReady && - dataState == DataStateInResync)) + + if (!(FileRepSubProcess_GetState() == FileRepStateReady && + dataState == DataStateInResync)) { mirrorDataLossOccurred = TRUE; break; @@ -774,14 +796,14 @@ FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request) if (result == NULL || result->ask_for_more == false) break; } - + ChangeTracking_FreeRequest(request); ChangeTracking_FreeResult(result); - + Insist(NumberOfRelations == 0); - + if (mirrorDataLossOccurred) status = STATUS_ERROR; - - return status; + + return status; } diff --git a/src/backend/cdb/cdbfilerepservice.c b/src/backend/cdb/cdbfilerepservice.c index 323522d229..718bffe011 100644 --- a/src/backend/cdb/cdbfilerepservice.c +++ b/src/backend/cdb/cdbfilerepservice.c @@ -50,15 +50,17 @@ FileRepProcessType_e fileRepProcessType = FileRepProcessTypeNotInitialized; static FileRepState_e fileRepState = FileRepStateNotInitialized; + /* state of FileRep process */ -/* - * Parameters set by signal handlers for later service n the main loop +/* + * Parameters set by signal handlers for later service n the main loop */ static volatile sig_atomic_t reloadConfigFile = false; static volatile sig_atomic_t shutdownRequested = false; + /* graceful shutdown informed by SIGUSR2 signal from postmaster */ /* state change informed by SIGUSR1 signal from postmaster...when state change request comes in @@ -89,69 +91,70 @@ static void FileRepSubProcess_ConfigureSignals(void); * SIGHUP signal from main file rep process * It re-loads configuration file at next convenient time. 
*/ -static void +static void FileRepSubProcess_SigHupHandler(SIGNAL_ARGS) { reloadConfigFile = true; } -/* +/* * SIGQUIT signal from main file rep process */ -static void +static void FileRepSubProcess_ImmediateShutdownHandler(SIGNAL_ARGS) { - quickdie(PASS_SIGNAL_ARGS); + quickdie(PASS_SIGNAL_ARGS); } /* * SIGUSR2 signal from main file rep process */ -static void +static void FileRepSubProcess_ShutdownHandler(SIGNAL_ARGS) { - bool isInTransition = FALSE; + bool isInTransition = FALSE; DataState_e dataStateTransition; shutdownRequested = true; - /* - * Exit the process if recv() call is hanging or - * compacting is running. Compacting can take many minutes. + /* + * Exit the process if recv() call is hanging or compacting is running. + * Compacting can take many minutes. */ if (fileRepProcessType == FileRepProcessTypePrimaryReceiverAck || - fileRepProcessType == FileRepProcessTypeMirrorReceiver || + fileRepProcessType == FileRepProcessTypeMirrorReceiver || fileRepProcessType == FileRepProcessTypePrimaryRecovery) { /* workaround for gcov testing */ if (Debug_filerep_gcov) { getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, &isInTransition, &dataStateTransition); - + if (isInTransition == TRUE && dataStateTransition == DataStateInChangeTracking) - { + { proc_exit(0); return; - } + } } - + die(PASS_SIGNAL_ARGS); } - if ( FileRepIsBackendSubProcess(fileRepProcessType)) + if (FileRepIsBackendSubProcess(fileRepProcessType)) { if (FileRepPrimary_IsResyncManagerOrWorker()) { getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, &isInTransition, &dataStateTransition); - + if (isInTransition == TRUE && dataStateTransition == DataStateInChangeTracking) { /* - * Resync workers and manager may be waiting on lock that is acquired by backend process that is - * suspended during transition to Change Tracking and so FileRep backend shutdown may - * never be completed. + * Resync workers and manager may be waiting on lock that is + * acquired by backend process that is suspended during + * transition to Change Tracking and so FileRep backend + * shutdown may never be completed. */ if (fileRepProcessType == FileRepProcessTypeResyncManager) { @@ -161,16 +164,16 @@ FileRepSubProcess_ShutdownHandler(SIGNAL_ARGS) { LockReleaseAll(DEFAULT_LOCKMETHOD, false); } - + /* * We remove ourself from LW waiter list (if applicable). * * If the current backend is waiting on a LWLock and exits w/o - * any cleanup (remove from waiters list) it can cause a breakage - * in the LWlock's waiters linked list after it dies. This can - * lead to unpleasant issues causing starvation for subsequent - * waiters because the current backend is already dead without - * assigning the LWLock to the next waiter. + * any cleanup (remove from waiters list) it can cause a + * breakage in the LWlock's waiters linked list after it dies. + * This can lead to unpleasant issues causing starvation for + * subsequent waiters because the current backend is already + * dead without assigning the LWLock to the next waiter. * * XXX Side note - Although implemented here, avoid exiting * inside an signal handler. 
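
Condensed, the backend branch of this shutdown handler makes one decision: during a transition to change tracking it cleans up lock state and exits in place, because the lock holder it might wait on can itself be suspended; otherwise it defers to the stock die() path. A sketch of that decision (branches collapsed, and the LWLock waiter-list removal call elided here as it is in the hunk):

    getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState,
                           &isInTransition, &dataStateTransition);

    if (isInTransition == TRUE &&
        dataStateTransition == DataStateInChangeTracking)
    {
        LockReleaseAll(DEFAULT_LOCKMETHOD, false);  /* free waiters behind us */
        /* ... remove self from any LWLock waiter list ... */
        proc_exit(0);
    }
    else
        die(PASS_SIGNAL_ARGS);  /* normal query cancel / procdie request */
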
@@ -182,8 +185,11 @@ FileRepSubProcess_ShutdownHandler(SIGNAL_ARGS) return; } } - - /* call the normal postgres die so that it requests query cancel/procdie */ + + /* + * call the normal postgres die so that it requests query + * cancel/procdie + */ die(PASS_SIGNAL_ARGS); } } @@ -192,134 +198,138 @@ FileRepSubProcess_ShutdownHandler(SIGNAL_ARGS) * SIGUSR1 signal from main file rep process * It signals about data and/or segment state change. */ -static void +static void FileRepSubProcess_FileRepStateHandler(SIGNAL_ARGS) { - ++stateChangeRequestCounter; + ++stateChangeRequestCounter; } /* * FileRepSubProcess_ProcessSignals() - * + * */ bool FileRepSubProcess_ProcessSignals() { - bool processExit = false; - - if (reloadConfigFile) + bool processExit = false; + + if (reloadConfigFile) { reloadConfigFile = false; ProcessConfigFile(PGC_SIGHUP); - - FileRep_SetFileRepRetry(); + + FileRep_SetFileRepRetry(); } - - if (shutdownRequested) + + if (shutdownRequested) { - SegmentState_e segmentState; + SegmentState_e segmentState; + getPrimaryMirrorStatusCodes(NULL, &segmentState, NULL, NULL); shutdownRequested = false; - if ( segmentState == SegmentStateShutdownFilerepBackends ) + if (segmentState == SegmentStateShutdownFilerepBackends) { - processExit = FileRepIsBackendSubProcess(fileRepProcessType); - FileRepSubProcess_SetState(FileRepStateShutdownBackends); + processExit = FileRepIsBackendSubProcess(fileRepProcessType); + FileRepSubProcess_SetState(FileRepStateShutdownBackends); } else { - processExit = true; - FileRepSubProcess_SetState(FileRepStateShutdown); + processExit = true; + FileRepSubProcess_SetState(FileRepStateShutdown); } } - + /* - * Immediate shutdown if postmaster or main filerep process - * (parent) is not alive to avoid manual cleanup. + * Immediate shutdown if postmaster or main filerep process (parent) is + * not alive to avoid manual cleanup. 
*/ - if (!PostmasterIsAlive(false /*amDirectChild*/) || !ParentProcIsAlive()) { + if (!PostmasterIsAlive(false /* amDirectChild */ ) || !ParentProcIsAlive()) + { quickdie_impl(); } - - for ( ;; ) + + for (;;) { - /* check to see if change required */ - sig_atomic_t curStateChangeRequestCounter = stateChangeRequestCounter; - if ( curStateChangeRequestCounter == lastChangeRequestProcessCounterValue ) - break; - lastChangeRequestProcessCounterValue = curStateChangeRequestCounter; - - /* do the change in local memory */ - getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); - switch (segmentState) { - - case SegmentStateNotInitialized: - FileRepSubProcess_SetState(FileRepStateNotInitialized); - break; - - case SegmentStateInitialization: - FileRepSubProcess_SetState(FileRepStateInitialization); - break; - - case SegmentStateInResyncTransition: - FileRepSubProcess_SetState(FileRepStateInitialization); - break; + /* check to see if change required */ + sig_atomic_t curStateChangeRequestCounter = stateChangeRequestCounter; + + if (curStateChangeRequestCounter == lastChangeRequestProcessCounterValue) + break; + lastChangeRequestProcessCounterValue = curStateChangeRequestCounter; + + /* do the change in local memory */ + getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); + switch (segmentState) + { + + case SegmentStateNotInitialized: + FileRepSubProcess_SetState(FileRepStateNotInitialized); + break; + + case SegmentStateInitialization: + FileRepSubProcess_SetState(FileRepStateInitialization); + break; + + case SegmentStateInResyncTransition: + FileRepSubProcess_SetState(FileRepStateInitialization); + break; case SegmentStateInChangeTrackingTransition: - case SegmentStateInSyncTransition: - // fileRepState remains Ready - break; - - case SegmentStateChangeTrackingDisabled: - case SegmentStateReady: - FileRepSubProcess_SetState(FileRepStateReady); - break; - - case SegmentStateFault: - FileRepSubProcess_SetState(FileRepStateFault); - break; - - case SegmentStateShutdownFilerepBackends: - if (fileRepRole == FileRepPrimaryRole) - { - FileRepSubProcess_SetState(FileRepStateShutdownBackends); - } - else - { - processExit = true; - FileRepSubProcess_SetState(FileRepStateShutdown); - } - break; + case SegmentStateInSyncTransition: + /* fileRepState remains Ready */ + break; + + case SegmentStateChangeTrackingDisabled: + case SegmentStateReady: + FileRepSubProcess_SetState(FileRepStateReady); + break; + + case SegmentStateFault: + FileRepSubProcess_SetState(FileRepStateFault); + break; + + case SegmentStateShutdownFilerepBackends: + if (fileRepRole == FileRepPrimaryRole) + { + FileRepSubProcess_SetState(FileRepStateShutdownBackends); + } + else + { + processExit = true; + FileRepSubProcess_SetState(FileRepStateShutdown); + } + break; case SegmentStateImmediateShutdown: - case SegmentStateShutdown: - processExit = true; - FileRepSubProcess_SetState(FileRepStateShutdown); - break; - - default: - Assert(0); - break; - } // switch() - - if (processExit == true) - { - FileRep_IpcSignalAll(); - } - } - - return(processExit); + case SegmentStateShutdown: + processExit = true; + FileRepSubProcess_SetState(FileRepStateShutdown); + break; + + default: + Assert(0); + break; + } //switch () + + if (processExit == true) + { + FileRep_IpcSignalAll(); + } + } + + return (processExit); } - + bool FileRepSubProcess_IsStateTransitionRequested(void) { - - bool isStateTransitionRequested = FALSE; - + + bool isStateTransitionRequested = FALSE; + 
getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); - + switch (fileRepProcessType) { case FileRepProcessTypeMain: @@ -328,243 +338,257 @@ FileRepSubProcess_IsStateTransitionRequested(void) { isStateTransitionRequested = TRUE; } - + break; - + case FileRepProcessTypeNotInitialized: - - if (segmentState == SegmentStateShutdownFilerepBackends && + + if (segmentState == SegmentStateShutdownFilerepBackends && fileRepShmemArray[0]->state == FileRepStateFault) { FileRep_InsertConfigLogEntry("failure is detected in segment mirroring during backend shutdown, abort requested"); } /* no break */ default: - + if (fileRepProcessType != FileRepProcessTypeNotInitialized) { FileRepSubProcess_ProcessSignals(); } - + if (dataState == DataStateInChangeTracking) { - isStateTransitionRequested = TRUE; - } - + isStateTransitionRequested = TRUE; + } + switch (segmentState) { case SegmentStateFault: case SegmentStateImmediateShutdown: case SegmentStateShutdown: - + isStateTransitionRequested = TRUE; break; - + default: break; } - + break; } - + if (isStateTransitionRequested) { FileRep_InsertConfigLogEntry("state transition requested "); } return isStateTransitionRequested; } - + /* * FileRepSubProcess_GetState() * Return state of FileRep sub-process */ -FileRepState_e +FileRepState_e FileRepSubProcess_GetState(void) { - Assert(fileRepState != FileRepStateShutdownBackends ); + Assert(fileRepState != FileRepStateShutdownBackends); return fileRepState; -} +} /* * Set state in FileRep process and sent signal to postmaster */ -void +void FileRepSubProcess_SetState(FileRepState_e fileRepStateLocal) { - bool doAssignment = true; - if ( fileRepStateLocal == FileRepStateShutdownBackends ) - { - if ( FileRepIsBackendSubProcess(fileRepProcessType)) - { - /* the current process must shutdown! */ - fileRepStateLocal = FileRepStateShutdown; - } - else - { - /* the current process doesn't care about shutdown backends -- leave it as shutdown */ - doAssignment = false; - } - } - - if ( ! doAssignment ) - { - return; - } - - switch (fileRepState) { - case FileRepStateNotInitialized: - + bool doAssignment = true; + + if (fileRepStateLocal == FileRepStateShutdownBackends) + { + if (FileRepIsBackendSubProcess(fileRepProcessType)) + { + /* the current process must shutdown! 
*/ + fileRepStateLocal = FileRepStateShutdown; + } + else + { + /* + * the current process doesn't care about shutdown backends -- + * leave it as shutdown + */ + doAssignment = false; + } + } + + if (!doAssignment) + { + return; + } + + switch (fileRepState) + { + case FileRepStateNotInitialized: + fileRepState = fileRepStateLocal; - break; + break; - case FileRepStateInitialization: + case FileRepStateInitialization: - switch (fileRepStateLocal) + switch (fileRepStateLocal) { - case FileRepStateNotInitialized: + case FileRepStateNotInitialized: ereport(WARNING, (errmsg("mirror failure, " "unexpected filerep state transition from '%s' to '%s' " "failover requested", - FileRepStateToString[fileRepState], + FileRepStateToString[fileRepState], FileRepStateToString[fileRepStateLocal]), errhint("run gprecoverseg to re-establish mirror connectivity"))); - fileRepState = FileRepStateFault; + fileRepState = FileRepStateFault; break; - + default: fileRepState = fileRepStateLocal; break; } break; - - case FileRepStateReady: - - switch (fileRepStateLocal) { - case FileRepStateFault: - case FileRepStateShutdown: - fileRepState = fileRepStateLocal; - break; - case FileRepStateNotInitialized: + + case FileRepStateReady: + + switch (fileRepStateLocal) + { + case FileRepStateFault: + case FileRepStateShutdown: + fileRepState = fileRepStateLocal; + break; + case FileRepStateNotInitialized: ereport(WARNING, (errmsg("mirror failure, " "unexpected filerep state transition from '%s' to '%s' " "failover requested", - FileRepStateToString[fileRepState], + FileRepStateToString[fileRepState], FileRepStateToString[fileRepStateLocal]), errhint("run gprecoverseg to re-establish mirror connectivity"))); - + fileRepState = FileRepStateFault; - break; - case FileRepStateInitialization: - /* don't do assignment -- this can happen when going from segmentState Ready to InSyncTransition */ - doAssignment = false; - break; - case FileRepStateReady: - break; - default: - Assert(0); - break; - } - break; - case FileRepStateFault: - - switch (fileRepStateLocal) { - case FileRepStateFault: - case FileRepStateShutdown: - fileRepState = fileRepStateLocal; - break; - case FileRepStateNotInitialized: - case FileRepStateInitialization: - case FileRepStateReady: + break; + case FileRepStateInitialization: + + /* + * don't do assignment -- this can happen when going from + * segmentState Ready to InSyncTransition + */ + doAssignment = false; + break; + case FileRepStateReady: + break; + default: + Assert(0); + break; + } + break; + case FileRepStateFault: + + switch (fileRepStateLocal) + { + case FileRepStateFault: + case FileRepStateShutdown: + fileRepState = fileRepStateLocal; + break; + case FileRepStateNotInitialized: + case FileRepStateInitialization: + case FileRepStateReady: ereport(WARNING, (errmsg("mirror failure, " "unexpected filerep state transition from '%s' to '%s' " "failover requested", - FileRepStateToString[fileRepState], + FileRepStateToString[fileRepState], FileRepStateToString[fileRepStateLocal]), errhint("run gprecoverseg to re-establish mirror connectivity"))); - + fileRepState = FileRepStateFault; - - break; - default: - Assert(0); - break; - } - - break; - case FileRepStateShutdownBackends: - Assert(!"process filerep state should never be in ShutdownBackends"); - break; - case FileRepStateShutdown: - - switch (fileRepStateLocal) { - case FileRepStateShutdown: - fileRepState = fileRepStateLocal; - break; - case FileRepStateNotInitialized: - case FileRepStateInitialization: - case FileRepStateReady: + + break; + 
default: + Assert(0); + break; + } + + break; + case FileRepStateShutdownBackends: + Assert(!"process filerep state should never be in ShutdownBackends"); + break; + case FileRepStateShutdown: + + switch (fileRepStateLocal) + { + case FileRepStateShutdown: + fileRepState = fileRepStateLocal; + break; + case FileRepStateNotInitialized: + case FileRepStateInitialization: + case FileRepStateReady: ereport(WARNING, (errmsg("mirror failure, " "unexpected filerep state transition from '%s' to '%s' " "failover requested", - FileRepStateToString[fileRepState], + FileRepStateToString[fileRepState], FileRepStateToString[fileRepStateLocal]), errhint("run gprecoverseg to re-establish mirror connectivity"))); fileRepState = FileRepStateFault; - - case FileRepStateFault: - break; - default: - Assert(0); - break; - } - - break; - default: - Assert(0); - break; - } - - /* check doAssignment again -- may have changed value in the switch above */ - if ( ! doAssignment ) - { - return; - } - - /* now update in shared memory if needed */ - switch (fileRepState) { + + case FileRepStateFault: + break; + default: + Assert(0); + break; + } + + break; + default: + Assert(0); + break; + } + + /* check doAssignment again -- may have changed value in the switch above */ + if (!doAssignment) + { + return; + } + + /* now update in shared memory if needed */ + switch (fileRepState) + { case FileRepStateReady: if (segmentState != SegmentStateChangeTrackingDisabled) { FileRep_SetSegmentState(SegmentStateReady, FaultTypeNotInitialized); } break; - + case FileRepStateFault: - /* update shared memory configuration - bool updateSegmentState(FAULT); - return TRUE if state was updated; - return FALSE if state was already set to FAULT - change signal to PMSIGNAL_FILEREP_SEGMENT_STATE_CHANGE - */ + + /* + * update shared memory configuration bool + * updateSegmentState(FAULT); return TRUE if state was updated; + * return FALSE if state was already set to FAULT change signal to + * PMSIGNAL_FILEREP_SEGMENT_STATE_CHANGE + */ FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); break; - + case FileRepStateInitialization: case FileRepStateShutdown: case FileRepStateNotInitialized: /* No operation */ break; case FileRepStateShutdownBackends: - Assert(0); - break; - default: - Assert(0); - break; + Assert(0); + break; + default: + Assert(0); + break; } /* report the change */ @@ -572,23 +596,23 @@ FileRepSubProcess_SetState(FileRepState_e fileRepStateLocal) { FileRep_InsertConfigLogEntry("set filerep state"); } - + } - + static void FileRepSubProcess_InitProcess(void) { SetProcessingMode(InitProcessing); - + /* - * Create a resource owner to keep track of our resources + * Create a resource owner to keep track of our resources */ - CurrentResourceOwner = ResourceOwnerCreate(NULL, - FileRepProcessTypeToString[fileRepProcessType]); - - + CurrentResourceOwner = ResourceOwnerCreate(NULL, + FileRepProcessTypeToString[fileRepProcessType]); + + InitXLOGAccess(); - + SetProcessingMode(NormalProcessing); InitBufferPoolAccess(); @@ -600,10 +624,10 @@ FileRepSubProcess_InitProcess(void) * a normal backend has acquired ProcArrayLock and is waiting for Filerep * transition to finish, the Filerep backend subprocesses will deadlock * forever as they can't acquire the ProcArray lock to remove themselves - * from the ProcArray. This directly causes the transition to stall and thus - * the whole system. + * from the ProcArray. This directly causes the transition to stall and + * thus the whole system. 
*/ - + /* * Initialize my entry in the shared-invalidation manager's array of * per-backend data. @@ -611,12 +635,12 @@ FileRepSubProcess_InitProcess(void) * Sets up MyBackendId, a unique backend identifier. */ MyBackendId = InvalidBackendId; - + SharedInvalBackendInit(false); - + if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend id: %d", MyBackendId); - + /* * bufmgr needs another initialization call too */ @@ -626,7 +650,7 @@ FileRepSubProcess_InitProcess(void) void FileRepSubProcess_InitHeapAccess(void) { - char *fullpath; + char *fullpath; static bool heapAccessInitialized = false; if (heapAccessInitialized) @@ -646,9 +670,9 @@ FileRepSubProcess_InitHeapAccess(void) RelationCacheInitializePhase2(); /* - * In order to access the catalog, we need a database, and a - * tablespace; our access to the heap is going to be slightly - * limited, so we'll just use some defaults. + * In order to access the catalog, we need a database, and a tablespace; + * our access to the heap is going to be slightly limited, so we'll just + * use some defaults. */ MyDatabaseId = TemplateDbOid; MyDatabaseTableSpace = DEFAULTTABLESPACE_OID; @@ -669,33 +693,33 @@ FileRepSubProcess_InitHeapAccess(void) static void FileRepSubProcess_HandleCrash(SIGNAL_ARGS) { - StandardHandlerForSigillSigsegvSigbus_OnMainThread("a file replication subprocess", PASS_SIGNAL_ARGS); + StandardHandlerForSigillSigsegvSigbus_OnMainThread("a file replication subprocess", PASS_SIGNAL_ARGS); } /* * */ static void -FileRepSubProcess_ConfigureSignals(void) +FileRepSubProcess_ConfigureSignals(void) { /* Accept Signals */ /* emergency shutdown */ pqsignal(SIGQUIT, FileRepSubProcess_ImmediateShutdownHandler); - + /* graceful shutdown */ pqsignal(SIGUSR2, FileRepSubProcess_ShutdownHandler); - + /* reload configuration file */ pqsignal(SIGHUP, FileRepSubProcess_SigHupHandler); - + /* data or segment state changed */ pqsignal(SIGUSR1, FileRepSubProcess_FileRepStateHandler); - + /* Ignore Signals */ pqsignal(SIGTERM, SIG_IGN); pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); - + /* Use default action */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGINT, SIG_DFL); @@ -707,15 +731,15 @@ FileRepSubProcess_ConfigureSignals(void) #ifdef SIGSEGV pqsignal(SIGSEGV, FileRepSubProcess_HandleCrash); #endif - + #ifdef SIGILL pqsignal(SIGILL, FileRepSubProcess_HandleCrash); -#endif - +#endif + #ifdef SIGBUS - pqsignal(SIGBUS, FileRepSubProcess_HandleCrash); + pqsignal(SIGBUS, FileRepSubProcess_HandleCrash); #endif - + } /* @@ -725,25 +749,25 @@ void FileRepSubProcess_Main() { const char *statmsg; - - MemoryContext fileRepSubProcessMemoryContext; - - sigjmp_buf local_sigjmp_buf; + + MemoryContext fileRepSubProcessMemoryContext; + + sigjmp_buf local_sigjmp_buf; MyProcPid = getpid(); MyStartTime = time(NULL); - + /* - * Create a PGPROC so we can use LWLocks in FileRep sub-processes. - * The routine also register clean up at process exit + * Create a PGPROC so we can use LWLocks in FileRep sub-processes. The + * routine also register clean up at process exit */ - InitAuxiliaryProcess(); + InitAuxiliaryProcess(); InitBufferPoolBackend(); - + FileRepSubProcess_ConfigureSignals(); - + /* * If an exception is encountered, processing resumes here. 
* @@ -753,10 +777,10 @@ FileRepSubProcess_Main() { /* Prevents interrupts while cleaning up */ HOLD_INTERRUPTS(); - + /* Report the error to the server log */ EmitErrorReport(); - + LWLockReleaseAll(); if (FileRepPrimary_IsResyncManagerOrWorker()) @@ -768,13 +792,13 @@ FileRepSubProcess_Main() { AbortBufferIO(); UnlockBuffers(); - + /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); } - + /* * We can now go away. Note that because we'll call InitProcess, a * callback will be registered to do ProcKill, which will clean up @@ -782,122 +806,120 @@ FileRepSubProcess_Main() */ proc_exit(0); } - + /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; - - PG_SETMASK(&UnBlockSig); - + + PG_SETMASK(&UnBlockSig); + /* * Identify myself via ps */ - + statmsg = FileRepProcessTypeToString[fileRepProcessType]; - + init_ps_display(statmsg, "", "", ""); - + /* Create the memory context where cross-transaction state is stored */ fileRepSubProcessMemoryContext = AllocSetContextCreate(TopMemoryContext, - "filerep subprocess memory context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - + "filerep subprocess memory context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextSwitchTo(fileRepSubProcessMemoryContext); - + stateChangeRequestCounter++; - + FileRepSubProcess_ProcessSignals(); - switch (fileRepProcessType) + switch (fileRepProcessType) { case FileRepProcessTypePrimarySender: FileRepPrimary_StartSender(); break; - + case FileRepProcessTypeMirrorReceiver: FileRepMirror_StartReceiver(); - break; + break; case FileRepProcessTypeMirrorConsumer: case FileRepProcessTypeMirrorConsumerWriter: case FileRepProcessTypeMirrorConsumerAppendOnly1: FileRepMirror_StartConsumer(); - break; + break; case FileRepProcessTypeMirrorSenderAck: FileRepAckMirror_StartSender(); break; - + case FileRepProcessTypePrimaryReceiverAck: FileRepAckPrimary_StartReceiver(); - break; + break; case FileRepProcessTypePrimaryConsumerAck: FileRepAckPrimary_StartConsumer(); break; - + case FileRepProcessTypePrimaryRecovery: FileRepSubProcess_InitProcess(); + /* - * At this point, database is starting up and xlog is not - * yet replayed. Initializing relcache now is dangerous, - * a sequential scan of catalog tables may end up with - * incorrect hint bits. E.g. a committed transaction's - * dirty heap pages made it to disk but pg_clog update was - * still in memory and we crashed. If a tuple inserted by - * this transaction is read during relcache - * initialization, status of the tuple's xmin will be - * incorrectly determined as "not commited" from pg_clog. - * And HEAP_XMIN_INVALID hint bit will be set, rendering - * the tuple perpetually invisible. Relcache - * initialization must be deferred to only after all of - * xlog has been replayed. + * At this point, database is starting up and xlog is not yet + * replayed. Initializing relcache now is dangerous, a sequential + * scan of catalog tables may end up with incorrect hint bits. + * E.g. a committed transaction's dirty heap pages made it to disk + * but pg_clog update was still in memory and we crashed. If a + * tuple inserted by this transaction is read during relcache + * initialization, status of the tuple's xmin will be incorrectly + * determined as "not commited" from pg_clog. And + * HEAP_XMIN_INVALID hint bit will be set, rendering the tuple + * perpetually invisible. 
Relcache initialization must be + * deferred to only after all of xlog has been replayed. */ FileRepPrimary_StartRecovery(); - + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, - false, true); + false, true); break; case FileRepProcessTypeResyncManager: FileRepSubProcess_InitProcess(); FileRepPrimary_StartResyncManager(); - + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, - false, true); + false, true); break; - + case FileRepProcessTypeResyncWorker1: case FileRepProcessTypeResyncWorker2: case FileRepProcessTypeResyncWorker3: case FileRepProcessTypeResyncWorker4: FileRepSubProcess_InitProcess(); FileRepPrimary_StartResyncWorker(); - + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, - false, true); + false, true); break; - + default: - elog(PANIC, "unrecognized process type: %s(%d)", + elog(PANIC, "unrecognized process type: %s(%d)", statmsg, fileRepProcessType); break; } - - switch (FileRepSubProcess_GetState()) + + switch (FileRepSubProcess_GetState()) { case FileRepStateShutdown: case FileRepStateReady: proc_exit(0); break; - + default: proc_exit(2); break; } } - diff --git a/src/backend/cdb/cdbfts.c b/src/backend/cdb/cdbfts.c index 2baddf9ff8..611448158f 100644 --- a/src/backend/cdb/cdbfts.c +++ b/src/backend/cdb/cdbfts.c @@ -42,16 +42,16 @@ /* segment id for the master */ #define MASTER_SEGMENT_ID -1 -FtsProbeInfo *ftsProbeInfo = NULL; /* Probe process updates this structure */ -volatile bool *ftsEnabled; -volatile bool *ftsShutdownMaster; -static LWLockId ftsControlLock; +FtsProbeInfo *ftsProbeInfo = NULL; /* Probe process updates this structure */ +volatile bool *ftsEnabled; +volatile bool *ftsShutdownMaster; +static LWLockId ftsControlLock; -static volatile bool *ftsReadOnlyFlag; -static volatile bool *ftsAdminRequestedRO; +static volatile bool *ftsReadOnlyFlag; +static volatile bool *ftsAdminRequestedRO; -static bool local_fts_status_initialized=false; -static uint64 local_fts_statusVersion; +static bool local_fts_status_initialized = false; +static uint64 local_fts_statusVersion; /* * get fts share memory size @@ -60,8 +60,7 @@ int FtsShmemSize(void) { /* - * this shared memory block doesn't even need to *exist* on the - * QEs! + * this shared memory block doesn't even need to *exist* on the QEs! */ if ((Gp_role != GP_ROLE_DISPATCH) && (Gp_role != GP_ROLE_UTILITY)) return 0; @@ -75,7 +74,7 @@ FtsShmemInit(void) bool found; FtsControlBlock *shared; - shared = (FtsControlBlock *)ShmemInitStruct("Fault Tolerance manager", FtsShmemSize(), &found); + shared = (FtsControlBlock *) ShmemInitStruct("Fault Tolerance manager", FtsShmemSize(), &found); if (!shared) elog(FATAL, "FTS: could not initialize fault tolerance manager share memory"); @@ -87,7 +86,9 @@ FtsShmemInit(void) ftsReadOnlyFlag = &shared->ftsReadOnlyFlag; /* global RO state */ - ftsAdminRequestedRO = &shared->ftsAdminRequestedRO; /* Admin request -- guc-controlled RO state */ + ftsAdminRequestedRO = &shared->ftsAdminRequestedRO; /* Admin request -- + * guc-controlled RO + * state */ ftsProbeInfo = &shared->fts_probe_info; @@ -105,7 +106,7 @@ FtsShmemInit(void) shared->fts_probe_info.fts_discardResults = false; shared->fts_probe_info.fts_statusVersion = 0; - shared->ftsEnabled = true; /* ??? */ + shared->ftsEnabled = true; /* ??? */ shared->ftsShutdownMaster = false; } } @@ -129,9 +130,11 @@ FtsNotifyProber(void) if (ftsProbeInfo->fts_probePid == 0) return; + /* - * This is a full-scan request. We set the request-flag == to the bitmap version flag. 
- * When the version has been bumped, we know that the request has been filled. + * This is a full-scan request. We set the request-flag == to the bitmap + * version flag. When the version has been bumped, we know that the + * request has been filled. */ ftsProbeInfo->fts_probeScanRequested = ftsProbeInfo->fts_statusVersion; @@ -145,7 +148,7 @@ FtsNotifyProber(void) tv.tv_usec = 50000; tv.tv_sec = 0; - select(0, NULL, NULL, NULL, &tv); /* don't care about return value. */ + select(0, NULL, NULL, NULL, &tv); /* don't care about return value. */ CHECK_FOR_INTERRUPTS(); } @@ -155,7 +158,8 @@ FtsNotifyProber(void) /* * Check if master needs to shut down */ -bool FtsMasterShutdownRequested() +bool +FtsMasterShutdownRequested() { return *ftsShutdownMaster; } @@ -164,15 +168,17 @@ bool FtsMasterShutdownRequested() /* * Set flag indicating that master needs to shut down */ -void FtsRequestMasterShutdown() +void +FtsRequestMasterShutdown() { #ifdef USE_ASSERT_CHECKING Assert(!*ftsShutdownMaster); PrimaryMirrorMode pm_mode; + getPrimaryMirrorStatusCodes(&pm_mode, NULL, NULL, NULL); Assert(pm_mode == PMModeMaster); -#endif /*USE_ASSERT_CHECKING*/ +#endif /* USE_ASSERT_CHECKING */ *ftsShutdownMaster = true; } @@ -217,7 +223,7 @@ FtsReConfigureMPP(bool create_new_gangs) local_fts_statusVersion = ftsProbeInfo->fts_statusVersion; ereport(LOG, (errmsg_internal("FTS: reconfiguration is in progress"), - errSendAlert(true))); + errSendAlert(true))); DisconnectAndDestroyAllGangs(true); /* Caller should throw an error. */ @@ -225,14 +231,14 @@ FtsReConfigureMPP(bool create_new_gangs) } void -FtsHandleNetFailure(SegmentDatabaseDescriptor ** segDB, int numOfFailed) +FtsHandleNetFailure(SegmentDatabaseDescriptor **segDB, int numOfFailed) { elog(LOG, "FtsHandleNetFailure: numOfFailed %d", numOfFailed); FtsReConfigureMPP(true); ereport(ERROR, (errmsg_internal("MPP detected %d segment failures, system is reconnected", numOfFailed), - errSendAlert(true))); + errSendAlert(true))); } /* @@ -241,10 +247,10 @@ FtsHandleNetFailure(SegmentDatabaseDescriptor ** segDB, int numOfFailed) * returns true if any segment DB is down. */ bool -FtsTestSegmentDBIsDown(SegmentDatabaseDescriptor * segdbDesc, int size) +FtsTestSegmentDBIsDown(SegmentDatabaseDescriptor *segdbDesc, int size) { - int i = 0; - bool forceRescan = true; + int i = 0; + bool forceRescan = true; Assert(isFTSEnabled()); @@ -257,7 +263,7 @@ FtsTestSegmentDBIsDown(SegmentDatabaseDescriptor * segdbDesc, int size) if (!FtsTestConnection(segInfo, forceRescan)) { ereport(LOG, (errmsg_internal("FTS: found fault with segment dbid %d. " - "Reconfiguration is in progress", segInfo->dbid))); + "Reconfiguration is in progress", segInfo->dbid))); return true; } @@ -302,7 +308,8 @@ isFtsReadOnlySet(void) return *ftsReadOnlyFlag; } -uint64 getFtsVersion(void) +uint64 +getFtsVersion(void) { return ftsProbeInfo->fts_statusVersion; } diff --git a/src/backend/cdb/cdbglobalsequence.c b/src/backend/cdb/cdbglobalsequence.c index 03b54fe0f8..5925e0d86c 100755 --- a/src/backend/cdb/cdbglobalsequence.c +++ b/src/backend/cdb/cdbglobalsequence.c @@ -24,11 +24,12 @@ #include "access/genam.h" #include "access/heapam.h" -static void GlobalSequence_MakeTid( - GpGlobalSequence gpGlobalSequence, +static void +GlobalSequence_MakeTid( + GpGlobalSequence gpGlobalSequence, - ItemPointer globalSequenceTid) - /* TID of the sequence counter tuple. */ + ItemPointer globalSequenceTid) + /* TID of the sequence counter tuple. */ { /* * For now, everything is in block 0. 
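Just above, FtsNotifyProber() waits for the probe process to bump fts_statusVersion by sleeping in 50ms slices, using a file-descriptor-free select() as a portable sub-second sleep and checking for interrupts between slices. Here is a minimal sketch of that interruptible-poll idiom; sketch_condition_met() is a hypothetical stand-in for the version check and is not GPDB code.

#include <stdbool.h>
#include <stdio.h>
#include <sys/select.h>

static bool
sketch_condition_met(int slices)
{
    return slices >= 20;        /* stand-in for "status version bumped" */
}

int
main(void)
{
    int         slices = 0;

    while (!sketch_condition_met(slices))
    {
        struct timeval tv;

        tv.tv_sec = 0;
        tv.tv_usec = 50000;     /* 50ms slice, as in FtsNotifyProber */
        (void) select(0, NULL, NULL, NULL, &tv);    /* return value ignored */
        /* a real backend would CHECK_FOR_INTERRUPTS() here */
        slices++;
    }
    printf("condition met after %d 50ms slices\n", slices);
    return 0;
}

Polling in short slices rather than blocking once keeps the wait responsive to cancellation, at the cost of up to one slice of extra latency after the condition becomes true.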
@@ -36,40 +37,41 @@ static void GlobalSequence_MakeTid( ItemPointerSet(globalSequenceTid, 0, gpGlobalSequence); } -static void GlobalSequence_UpdateTuple( - GpGlobalSequence gpGlobalSequence, +static void +GlobalSequence_UpdateTuple( + GpGlobalSequence gpGlobalSequence, - int64 newSequenceNum) + int64 newSequenceNum) { Relation gpGlobalSequenceRel; - bool nulls[Anum_gp_global_sequence_sequence_num]; - Datum values[Anum_gp_global_sequence_sequence_num]; + bool nulls[Anum_gp_global_sequence_sequence_num]; + Datum values[Anum_gp_global_sequence_sequence_num]; HeapTuple globalSequenceTuple = NULL; - MemSet(nulls, 0 , sizeof(nulls)); - + MemSet(nulls, 0, sizeof(nulls)); + GpGlobalSequence_SetDatumValues( - values, - newSequenceNum); - - gpGlobalSequenceRel = - DirectOpen_GpGlobalSequenceOpenShared(); - + values, + newSequenceNum); + + gpGlobalSequenceRel = + DirectOpen_GpGlobalSequenceOpenShared(); + /* * Form the tuple. */ globalSequenceTuple = heap_form_tuple( - gpGlobalSequenceRel->rd_att, - values, - nulls); + gpGlobalSequenceRel->rd_att, + values, + nulls); if (!HeapTupleIsValid(globalSequenceTuple)) elog(ERROR, "Failed to build global sequence tuple"); GlobalSequence_MakeTid( - gpGlobalSequence, - &globalSequenceTuple->t_self); - + gpGlobalSequence, + &globalSequenceTuple->t_self); + frozen_heap_inplace_update(gpGlobalSequenceRel, globalSequenceTuple); heap_freetuple(globalSequenceTuple); @@ -77,59 +79,62 @@ static void GlobalSequence_UpdateTuple( DirectOpen_GpGlobalSequenceClose(gpGlobalSequenceRel); } -static void GlobalSequence_ReadTuple( - GpGlobalSequence gpGlobalSequence, +static void +GlobalSequence_ReadTuple( + GpGlobalSequence gpGlobalSequence, - int64 *currentSequenceNum) + int64 *currentSequenceNum) { Relation gpGlobalSequenceRel; - bool nulls[Anum_gp_global_sequence_sequence_num]; - Datum values[Anum_gp_global_sequence_sequence_num]; + bool nulls[Anum_gp_global_sequence_sequence_num]; + Datum values[Anum_gp_global_sequence_sequence_num]; - HeapTupleData globalSequenceTuple; - Buffer buffer; + HeapTupleData globalSequenceTuple; + Buffer buffer; - gpGlobalSequenceRel = - DirectOpen_GpGlobalSequenceOpenShared(); + gpGlobalSequenceRel = + DirectOpen_GpGlobalSequenceOpenShared(); GlobalSequence_MakeTid( - gpGlobalSequence, - &globalSequenceTuple.t_self); - + gpGlobalSequence, + &globalSequenceTuple.t_self); + if (!heap_fetch(gpGlobalSequenceRel, SnapshotAny, &globalSequenceTuple, &buffer, false, NULL)) elog(ERROR, "Failed to fetch global sequence tuple at %s", ItemPointerToString(&globalSequenceTuple.t_self)); heap_deform_tuple( - &globalSequenceTuple, - gpGlobalSequenceRel->rd_att, - values, - nulls); + &globalSequenceTuple, + gpGlobalSequenceRel->rd_att, + values, + nulls); GpGlobalSequence_GetValues( - values, - currentSequenceNum); + values, + currentSequenceNum); ReleaseBuffer(buffer); - + DirectOpen_GpGlobalSequenceClose(gpGlobalSequenceRel); } -int64 GlobalSequence_Current( - GpGlobalSequence gpGlobalSequence) +int64 +GlobalSequence_Current( + GpGlobalSequence gpGlobalSequence) { - int64 sequenceNum; + int64 sequenceNum; GlobalSequence_ReadTuple(gpGlobalSequence, &sequenceNum); return sequenceNum; } -void GlobalSequence_Set( - GpGlobalSequence gpGlobalSequence, +void +GlobalSequence_Set( + GpGlobalSequence gpGlobalSequence, - int64 newSequenceNum) + int64 newSequenceNum) { GlobalSequence_UpdateTuple(gpGlobalSequence, newSequenceNum); } diff --git a/src/backend/cdb/cdbgroup.c b/src/backend/cdb/cdbgroup.c index 6e130131b7..14086e3d2c 100644 --- a/src/backend/cdb/cdbgroup.c +++ 
b/src/backend/cdb/cdbgroup.c @@ -54,11 +54,11 @@ #include "catalog/pg_aggregate.h" #include "cdb/cdbllize.h" -#include "cdb/cdbpathtoplan.h" /* cdbpathtoplan_create_flow() */ +#include "cdb/cdbpathtoplan.h" /* cdbpathtoplan_create_flow() */ #include "cdb/cdbpath.h" #include "cdb/cdbpullup.h" #include "cdb/cdbvars.h" -#include "cdb/cdbhash.h" /* isGreenplumDbHashable() */ +#include "cdb/cdbhash.h" /* isGreenplumDbHashable() */ #include "cdb/cdbsetop.h" #include "cdb/cdbgroup.h" @@ -150,39 +150,43 @@ typedef enum DqaCoplanType typedef enum DqaJoinStrategy { DqaJoinUndefined = 0, - DqaJoinNone, /* No join required for solitary DQA argument. */ - DqaJoinCross, /* Scalar aggregation uses cross product. */ - DqaJoinHash, /* Hash join (possibly with subsequent sort) */ - DqaJoinMerge, /* Merge join */ - /* These last are abstract and will be replaced - * by DqaJoinHash aor DqaJoinMerge once planning - * is complete. - */ - DqaJoinSorted, /* Sorted output required. */ - DqaJoinCheapest, /* No sort requirement. */ + DqaJoinNone, /* No join required for solitary DQA argument. */ + DqaJoinCross, /* Scalar aggregation uses cross product. */ + DqaJoinHash, /* Hash join (possibly with subsequent sort) */ + DqaJoinMerge, /* Merge join */ + + /* + * These last are abstract and will be replaced by DqaJoinHash aor + * DqaJoinMerge once planning is complete. + */ + DqaJoinSorted, /* Sorted output required. */ + DqaJoinCheapest, /* No sort requirement. */ } DqaJoinStrategy; /* DQA coplan information */ typedef struct DqaInfo { - Node *distinctExpr; /* By reference from agg_counts for convenience. */ - AttrNumber base_index; /* Index of attribute in base plan targetlist */ - bool can_hash; - double num_rows; /* Estimated cardinality of grouping key, dqa arg */ - Plan *coplan; /* Coplan for this (later this and all prior) coplan */ - Query *parse; /* Plausible root->parse for the coplan. */ - bool distinctkey_collocate; /* Whether the input plan collocates on this - * distinct key */ - - /* These fields are for costing and planning. Before constructing - * the coplan for this DQA argument, determine cheapest way to get - * the answer and cheapest way to get the answer in grouping key - * order. + Node *distinctExpr; /* By reference from agg_counts for + * convenience. */ + AttrNumber base_index; /* Index of attribute in base plan targetlist */ + bool can_hash; + double num_rows; /* Estimated cardinality of grouping key, dqa + * arg */ + Plan *coplan; /* Coplan for this (later this and all prior) + * coplan */ + Query *parse; /* Plausible root->parse for the coplan. */ + bool distinctkey_collocate; /* Whether the input plan collocates + * on this distinct key */ + + /* + * These fields are for costing and planning. Before constructing the + * coplan for this DQA argument, determine cheapest way to get the answer + * and cheapest way to get the answer in grouping key order. */ - bool use_hashed_preliminary; - Cost cost_sorted; + bool use_hashed_preliminary; + Cost cost_sorted; DqaCoplanType coplan_type_sorted; - Cost cost_cheapest; + Cost cost_cheapest; DqaCoplanType coplan_type_cheapest; } DqaInfo; @@ -192,93 +196,103 @@ typedef struct DqaInfo typedef struct AggPlanInfo { /* - * The input is either represented as a Path or a Plan and a Path. - * If input_plan is given, use this plan instead of creating one - * through input_path. - * */ - Path *input_path; - Plan *input_plan; - + * The input is either represented as a Path or a Plan and a Path. 
If + * input_plan is given, use this plan instead of creating one through + * input_path. + */ + Path *input_path; + Plan *input_plan; + /* These are the ordinary fields characterizing an aggregation */ CdbPathLocus input_locus; MppGroupPrep group_prep; MppGroupType group_type; CdbPathLocus output_locus; - bool distinctkey_collocate; /* Whether the input plan collocates on the - * distinct key */ - + bool distinctkey_collocate; /* Whether the input plan collocates + * on the distinct key */ + /* These are extra for 3-phase plans */ DqaJoinStrategy join_strategy; - bool use_sharing; + bool use_sharing; /* These summarize the status of the structure's cost value. */ - bool valid; - Cost plan_cost; + bool valid; + Cost plan_cost; } AggPlanInfo; typedef struct MppGroupContext { MppGroupPrep prep; MppGroupType type; - - List *tlist; /* The preprocessed targetlist of the original query. */ - Node *havingQual; /* The proprocessed having qual of the original query. */ - Path *best_path; - Path *cheapest_path; - Plan *subplan; + + List *tlist; /* The preprocessed targetlist of the original + * query. */ + Node *havingQual; /* The proprocessed having qual of the + * original query. */ + Path *best_path; + Path *cheapest_path; + Plan *subplan; AggClauseCounts *agg_counts; - double tuple_fraction; - double *p_dNumGroups; /* Group count estimate shared up the call tree. */ + double tuple_fraction; + double *p_dNumGroups; /* Group count estimate shared up the call + * tree. */ CanonicalGroupingSets *canonical_grpsets; - int64 grouping; /* the GROUPING value */ - bool is_grpext; /* identify if this is a grouping extension query */ + int64 grouping; /* the GROUPING value */ + bool is_grpext; /* identify if this is a grouping extension + * query */ - List *sub_tlist; /* Derived (in cdb_grouping_planner) input targetlist. */ - int numGroupCols; + List *sub_tlist; /* Derived (in cdb_grouping_planner) input + * targetlist. */ + int numGroupCols; AttrNumber *groupColIdx; Oid *groupOperators; - int numDistinctCols; + int numDistinctCols; AttrNumber *distinctColIdx; - DqaInfo *dqaArgs; - bool use_hashed_grouping; + DqaInfo *dqaArgs; + bool use_hashed_grouping; CdbPathLocus input_locus; CdbPathLocus output_locus; - /* Indicate whether the input plan collocates on the distinct key if any. - * It is used for one or two-phase aggregation. For three-phase aggregation, - * distinctkey_collocate inside DqaInfo is used. + + /* + * Indicate whether the input plan collocates on the distinct key if any. + * It is used for one or two-phase aggregation. For three-phase + * aggregation, distinctkey_collocate inside DqaInfo is used. */ - bool distinctkey_collocate; - List *current_pathkeys; + bool distinctkey_collocate; + List *current_pathkeys; - /* Indicate if root->parse has been changed during planning. Carry in pointer - * to root for miscellaneous globals. + /* + * Indicate if root->parse has been changed during planning. Carry in + * pointer to root for miscellaneous globals. 
*/ - bool querynode_changed; + bool querynode_changed; PlannerInfo *root; - + /* Work space for aggregate/tlist deconstruction and reconstruction */ - Index final_varno; /* input */ - bool use_irefs_tlist; /* input */ - bool use_dqa_pruning; /* input */ - List *prefs_tlist; /* Aggref attributes for prelim_tlist */ - List *irefs_tlist; /* Aggref attributes for optional inter_tlist */ - List *frefs_tlist; /* Aggref attributes for optional join tlists */ - List *dqa_tlist; /* DQA argument attributes for prelim_tlist */ - List **dref_tlists; /* Array of DQA Aggref tlists (dqa_tlist order) */ - List *grps_tlist; /* Grouping attributes for prelim_tlist */ - List *fin_tlist; /* Final tlist cache. */ - List *fin_hqual; /* Final having qual cache. */ - Index split_aggref_sortgroupref; /* for TargetEntrys made in split_aggref */ - Index outer_varno; /* work */ - Index inner_varno; /* work */ - int *dqa_offsets; /* work */ - List *top_tlist; /* work - the target list to finalize */ - + Index final_varno; /* input */ + bool use_irefs_tlist; /* input */ + bool use_dqa_pruning; /* input */ + List *prefs_tlist; /* Aggref attributes for prelim_tlist */ + List *irefs_tlist; /* Aggref attributes for optional inter_tlist */ + List *frefs_tlist; /* Aggref attributes for optional join tlists */ + List *dqa_tlist; /* DQA argument attributes for prelim_tlist */ + List **dref_tlists; /* Array of DQA Aggref tlists (dqa_tlist + * order) */ + List *grps_tlist; /* Grouping attributes for prelim_tlist */ + List *fin_tlist; /* Final tlist cache. */ + List *fin_hqual; /* Final having qual cache. */ + Index split_aggref_sortgroupref; /* for TargetEntrys made in + * split_aggref */ + Index outer_varno; /* work */ + Index inner_varno; /* work */ + int *dqa_offsets; /* work */ + List *top_tlist; /* work - the target list to finalize */ + /* 3-phase DQA decisions */ DqaJoinStrategy join_strategy; - bool use_sharing; + bool use_sharing; - List *wagSortClauses; /* List of List; within-agg multi sort level */ + List *wagSortClauses; /* List of List; within-agg multi sort level */ } MppGroupContext; /* Constants for aggregation approaches. @@ -295,79 +309,80 @@ typedef struct MppGroupContext #define AGG_ALL (AGG_SINGLEPHASE | AGG_MULTIPHASE) -/* Constants for DQA pruning: +/* Constants for DQA pruning: */ -static const Index grp_varno = 1; /* var refers to grps_tlist */ -static const Index ref_varno = 2; /* var refers to prefs_tlist or relatives */ -static const Index dqa_base_varno = 3; /* refers to one of the dref_tlists */ +static const Index grp_varno = 1; /* var refers to grps_tlist */ +static const Index ref_varno = 2; /* var refers to prefs_tlist or relatives */ +static const Index dqa_base_varno = 3; /* refers to one of the dref_tlists */ /* Coefficients for cost calculation adjustments: These are candidate GUCs - * or, perhaps, replacements for the gp_eager_... series. We wouldn't + * or, perhaps, replacements for the gp_eager_... series. We wouldn't * need these if our statistics and cost calculations were correct, but * as of 3.2, they not. * * Early testing suggested that (1.0, 0.45, 1.7) was about right, but the * risk of introducing skew in the initial redistribution of a 1-phase plan * is great (especially given the 3.2 tendency to way underestimate the - * cardinality of joins), so we penalize 1-phase and normalize to the + * cardinality of joins), so we penalize 1-phase and normalize to the * 2-phase cost (approximately). 
*/ -static const double gp_coefficient_1phase_agg = 20.0; /* penalty */ -static const double gp_coefficient_2phase_agg = 1.0; /* normalized */ -static const double gp_coefficient_3phase_agg = 3.3; /* increase systematic under estimate */ +static const double gp_coefficient_1phase_agg = 20.0; /* penalty */ +static const double gp_coefficient_2phase_agg = 1.0; /* normalized */ +static const double gp_coefficient_3phase_agg = 3.3; /* increase systematic + * under estimate */ /* Forward declarations */ -static Plan * make_one_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); -static Plan * make_two_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); -static Plan * make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); -static Plan * make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, - int dqa_index, Plan* result_plan, - Query** coquery_p); -static Plan * join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *plan, int dqa_index); -static int compareDqas(const void *larg, const void *rarg); -static void planDqaJoinOrder(PlannerInfo *root, MppGroupContext *ctx, - double input_rows); -static List *make_subplan_tlist(List *tlist, Node *havingQual, - List *grp_clauses, int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops, - List *dqa_args, int *pnum_dqas, AttrNumber **pcols_dqas); +static Plan *make_one_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); +static Plan *make_two_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); +static Plan *make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx); +static Plan *make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, + int dqa_index, Plan *result_plan, + Query **coquery_p); +static Plan *join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *plan, int dqa_index); +static int compareDqas(const void *larg, const void *rarg); +static void planDqaJoinOrder(PlannerInfo *root, MppGroupContext *ctx, + double input_rows); +static List *make_subplan_tlist(List *tlist, Node *havingQual, + List *grp_clauses, int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops, + List *dqa_args, int *pnum_dqas, AttrNumber **pcols_dqas); static List *describe_subplan_tlist(List *sub_tlist, - List *tlist, Node *havingQual, - List *grp_clauses, int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops, - List *dqa_args, int *pnum_dqas, AttrNumber **pcols_dqas); -static void generate_multi_stage_tlists(MppGroupContext* ctx, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual); + List *tlist, Node *havingQual, + List *grp_clauses, int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops, + List *dqa_args, int *pnum_dqas, AttrNumber **pcols_dqas); +static void generate_multi_stage_tlists(MppGroupContext *ctx, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual); static void prepare_dqa_pruning_tlists(MppGroupContext *ctx); static void generate_dqa_pruning_tlists(MppGroupContext *ctx, + int dqa_index, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual); +static void deconstruct_agg_info(MppGroupContext *ctx); +static void reconstruct_agg_info(MppGroupContext *ctx, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual); +static void reconstruct_coplan_info(MppGroupContext *ctx, int dqa_index, List **p_prelim_tlist, List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual); -static void 
deconstruct_agg_info(MppGroupContext *ctx); -static void reconstruct_agg_info(MppGroupContext *ctx, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual); -static void reconstruct_coplan_info(MppGroupContext *ctx, - int dqa_index, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist); + List **p_final_tlist); static Expr *deconstruct_expr(Expr *expr, MppGroupContext *ctx); -static Node* deconstruct_expr_mutator(Node *node, MppGroupContext *ctx); +static Node *deconstruct_expr_mutator(Node *node, MppGroupContext *ctx); static Node *split_aggref(Aggref *aggref, MppGroupContext *ctx); static List *make_vars_tlist(List *tlist, Index varno, AttrNumber offset); -static Plan* add_subqueryscan(PlannerInfo* root, List **p_pathkeys, - Index varno, Query *subquery, Plan *subplan); +static Plan *add_subqueryscan(PlannerInfo *root, List **p_pathkeys, + Index varno, Query *subquery, Plan *subplan); static List *seq_tlist_concat(List *tlist1, List *tlist2); static Node *finalize_split_expr(Node *expr, MppGroupContext *ctx); -static Node* finalize_split_expr_mutator(Node *node, MppGroupContext *ctx); -static Oid lookup_agg_transtype(Aggref *aggref); +static Node *finalize_split_expr_mutator(Node *node, MppGroupContext *ctx); +static Oid lookup_agg_transtype(Aggref *aggref); static bool hash_safe_type(Oid type); static bool sorting_prefixes_grouping(PlannerInfo *root); static bool gp_hash_safe_grouping(PlannerInfo *root); @@ -380,9 +395,9 @@ static void set_cost_of_join_strategies(MppGroupContext *ctx, Cost *hashjoin_cos static void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan); static void set_coplan_strategies(PlannerInfo *root, MppGroupContext *ctx, DqaInfo *dqaArg, Path *input); static Cost incremental_sort_cost(double rows, int width, int numKeyCols); -static Cost incremental_agg_cost(double rows, int width, AggStrategy strategy, - int numGroupCols, double numGroups, - int numAggs, int transSpace); +static Cost incremental_agg_cost(double rows, int width, AggStrategy strategy, + int numGroupCols, double numGroups, + int numAggs, int transSpace); static Cost incremental_motion_cost(double sendrows, double recvrows); static bool contain_aggfilters(Node *node); @@ -397,27 +412,27 @@ static bool contain_aggfilters(Node *node); typedef struct { bool use_deduplicate; /* true to choose deduplicate strategy */ - AttrNumber pc_pos; /* resno for peer count in outer tlist */ - AttrNumber tc_pos; /* resno for total count in inner tlist */ + AttrNumber pc_pos; /* resno for peer count in outer tlist */ + AttrNumber tc_pos; /* resno for total count in inner tlist */ List *current_pathkeys; /* pathkeys tracking */ - List *inner_pathkeys; /* pathkeys for inner plan */ - List *rtable; /* outer/inner RTE of the output */ + List *inner_pathkeys; /* pathkeys for inner plan */ + List *rtable; /* outer/inner RTE of the output */ } WithinAggContext; static bool choose_deduplicate(PlannerInfo *root, List *sortExprs, - Plan *input_plan, double *numGroups); + Plan *input_plan, double *numGroups); static Plan *wrap_plan_index(PlannerInfo *root, Plan *plan, Query *query, - List **p_pathkeys, Index varno, const char *alias_name, Query **query_p); + List **p_pathkeys, Index varno, const char *alias_name, Query **query_p); static void rebuild_simple_rel_and_rte(PlannerInfo *root); static Plan *make_parallel_or_sequential_agg(PlannerInfo *root, - AggClauseCounts *agg_count, GroupContext *group_context, - List **current_pathkeys_p); + AggClauseCounts 
*agg_count, GroupContext *group_context, + List **current_pathkeys_p); static Node *deconstruct_within_agg(Node *node, MppGroupContext *ctx); static Node *deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx); static List *fetch_percentiles(Query *parse, List *sortClause); static Plan *make_deduplicate_plan(PlannerInfo *root, GroupContext *group_context, - List *groupClause, List *sortClause, double numGroups, - AttrNumber *pc_pos_p, List **current_pathkeys_p, Plan *subplan); + List *groupClause, List *sortClause, double numGroups, + AttrNumber *pc_pos_p, List **current_pathkeys_p, Plan *subplan); static Plan *within_agg_make_baseplan(PlannerInfo *root, GroupContext *group_context, WithinAggContext *wag_context, @@ -442,11 +457,11 @@ static Plan *within_agg_final_agg(PlannerInfo *root, List *sortClause, Plan *result_plan); static Plan *plan_within_agg_persort(PlannerInfo *root, GroupContext *group_context, - List *sortClause, List *current_pathkeys, Plan *base_plan); + List *sortClause, List *current_pathkeys, Plan *base_plan); /* * add_motion_to_dqa_plan - * Add a Redistribute motion to a dqa child plan if the plan is not already + * Add a Redistribute motion to a dqa child plan if the plan is not already * distributed on the grouping columns */ static Plan *add_motion_to_dqa_child(Plan *plan, PlannerInfo *root, bool *motion_added); @@ -478,19 +493,19 @@ static Plan *add_motion_to_dqa_child(Plan *plan, PlannerInfo *root, bool *motion */ Plan * -cdb_grouping_planner(PlannerInfo* root, +cdb_grouping_planner(PlannerInfo *root, AggClauseCounts *agg_counts, GroupContext *group_context) { MppGroupContext ctx; - Plan * result_plan = NULL; - List * sub_tlist = NIL; - bool has_groups = root->parse->groupClause != NIL; - bool has_aggs = agg_counts->numAggs > 0; - bool has_ordered_aggs = list_length(agg_counts->aggOrder) > 0; - ListCell *lc; - - bool is_grpext = false; + Plan *result_plan = NULL; + List *sub_tlist = NIL; + bool has_groups = root->parse->groupClause != NIL; + bool has_aggs = agg_counts->numAggs > 0; + bool has_ordered_aggs = list_length(agg_counts->aggOrder) > 0; + ListCell *lc; + + bool is_grpext = false; unsigned char consider_agg = AGG_NONE; AggPlanInfo plan_1p; AggPlanInfo plan_2p; @@ -510,11 +525,12 @@ cdb_grouping_planner(PlannerInfo* root, *(group_context->querynode_changed) = false; - /* We always use sequential plans for distinct-qualified rollup queries, + /* + * We always use sequential plans for distinct-qualified rollup queries, * so don't waste time working on alternatives. - */ + */ is_grpext = is_grouping_extension(group_context->canonical_grpsets); - if ( is_grpext && agg_counts->numDistinctAggs > 0) + if (is_grpext && agg_counts->numDistinctAggs > 0) return NULL; /* @@ -523,18 +539,19 @@ cdb_grouping_planner(PlannerInfo* root, */ if (group_context->subplan == NULL) { - Path *input_path = group_context->cheapest_path; - - /* Should we prefer the "best" path? Only for vector aggregation - * of input already sorted and collocated on the grouping key. + Path *input_path = group_context->cheapest_path; + + /* + * Should we prefer the "best" path? Only for vector aggregation of + * input already sorted and collocated on the grouping key. 
*/ - if ( has_groups && - pathkeys_contained_in(root->group_pathkeys, group_context->best_path->pathkeys) && - cdbpathlocus_collocates(root, group_context->best_path->locus, root->group_pathkeys, false /*exact_match*/) ) + if (has_groups && + pathkeys_contained_in(root->group_pathkeys, group_context->best_path->pathkeys) && + cdbpathlocus_collocates(root, group_context->best_path->locus, root->group_pathkeys, false /* exact_match */ )) { input_path = group_context->best_path; } - + initAggPlanInfo(&plan_1p, input_path, group_context->subplan); } @@ -544,20 +561,22 @@ cdb_grouping_planner(PlannerInfo* root, plan_1p.input_locus = group_context->best_path->locus; } - if ( ! CdbPathLocus_IsPartitioned(plan_1p.input_locus) ) + if (!CdbPathLocus_IsPartitioned(plan_1p.input_locus)) { /* Can use base plan with no motion yielding same locus. */ plan_1p.group_prep = MPP_GRP_PREP_NONE; plan_1p.output_locus = plan_1p.input_locus; plan_1p.distinctkey_collocate = true; } - else if ( has_groups ) /* and not single or replicated */ + else if (has_groups) /* and not single or replicated */ { if (root->group_pathkeys != NULL && - cdbpathlocus_collocates(root, plan_1p.input_locus, root->group_pathkeys, false /*exact_match*/) ) + cdbpathlocus_collocates(root, plan_1p.input_locus, root->group_pathkeys, false /* exact_match */ )) { plan_1p.group_prep = MPP_GRP_PREP_NONE; - plan_1p.output_locus = plan_1p.input_locus; /* may be less discriminating that group locus */ + plan_1p.output_locus = plan_1p.input_locus; /* may be less + * discriminating that + * group locus */ plan_1p.distinctkey_collocate = true; } else @@ -566,8 +585,8 @@ cdb_grouping_planner(PlannerInfo* root, { /* * Grouping, but no grouping key. This arises in cases like - * SELECT DISTINCT , where we need to eliminate duplicates, - * but there is no key to hash on. + * SELECT DISTINCT , where we need to eliminate + * duplicates, but there is no key to hash on. */ plan_1p.group_prep = MPP_GRP_PREP_HASH_GROUPS; CdbPathLocus_MakeGeneral(&plan_1p.output_locus); @@ -584,7 +603,8 @@ cdb_grouping_planner(PlannerInfo* root, } } } - else if ( has_aggs ) /* and not grouped and not single or replicated */ + else if (has_aggs) /* and not grouped and not single or + * replicated */ { plan_1p.group_prep = MPP_GRP_PREP_FOCUS_QE; CdbPathLocus_MakeSingleQE(&plan_1p.output_locus); @@ -595,11 +615,11 @@ cdb_grouping_planner(PlannerInfo* root, * involved, let it do so. Don't bother to investigate the 2-stage * approach. * - * If the GUC enable_groupagg is set to off and this is a DQA - * query, we won't use the sequential plan. This is because - * the sequential plan for a DQA query always uses GroupAgg. + * If the GUC enable_groupagg is set to off and this is a DQA query, we + * won't use the sequential plan. This is because the sequential plan for + * a DQA query always uses GroupAgg. */ - if ( plan_1p.group_prep == MPP_GRP_PREP_NONE ) + if (plan_1p.group_prep == MPP_GRP_PREP_NONE) { if (enable_groupagg || agg_counts->numDistinctAggs == 0) { @@ -609,22 +629,21 @@ cdb_grouping_planner(PlannerInfo* root, } /* - * When an input plan is given, use it, including its target - * list. When an input target list (and no plan) is given, - * use it for the plan to be created. When neither is given, - * generate a phase 1 target list for the plan to be created. - * Also note the location of any grouping attributes in the - * target list (numGroupCols, groupColIdx). + * When an input plan is given, use it, including its target list. 
When an + * input target list (and no plan) is given, use it for the plan to be + * created. When neither is given, generate a phase 1 target list for the + * plan to be created. Also note the location of any grouping attributes + * in the target list (numGroupCols, groupColIdx). * - * Also make sure there's a target entry with a non-zero - * sortgroupref for each DQA argument and note the location - * of the attributes (numDistinctCols, distinctColIdx). + * Also make sure there's a target entry with a non-zero sortgroupref for + * each DQA argument and note the location of the attributes + * (numDistinctCols, distinctColIdx). */ - if ( group_context->subplan != NULL) + if (group_context->subplan != NULL) { sub_tlist = group_context->subplan->targetlist; } - else if ( group_context->sub_tlist != NULL ) + else if (group_context->sub_tlist != NULL) { sub_tlist = group_context->sub_tlist; sub_tlist = describe_subplan_tlist(sub_tlist, @@ -646,115 +665,117 @@ cdb_grouping_planner(PlannerInfo* root, &(group_context->numGroupCols), &(group_context->groupColIdx), &(group_context->groupOperators), - agg_counts->dqaArgs, + agg_counts->dqaArgs, &(group_context->numDistinctCols), &(group_context->distinctColIdx)); - /* Where we need to and we can, add column names to the sub_tlist - * entries to make EXPLAIN output look nice. Note that we could dig - * further than this (if we come up empty handed) by probing the range - * table (root->parse->rtable), but this covers the ordinary cases. + /* + * Where we need to and we can, add column names to the sub_tlist + * entries to make EXPLAIN output look nice. Note that we could dig + * further than this (if we come up empty handed) by probing the range + * table (root->parse->rtable), but this covers the ordinary cases. */ foreach(lc, sub_tlist) { - TargetEntry *tle = (TargetEntry*)lfirst(lc); - - if ( IsA(tle->expr, Var) && tle->resname == NULL ) + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (IsA(tle->expr, Var) &&tle->resname == NULL) { TargetEntry *vartle = - tlist_member((Node*)tle->expr, group_context->tlist); - - if ( vartle != NULL && vartle->resname != NULL ) + tlist_member((Node *) tle->expr, group_context->tlist); + + if (vartle != NULL && vartle->resname != NULL) tle->resname = pstrdup(vartle->resname); } } } - /* At this point, we're committed to producing a one- , two- or - * three-stage plan with motion. Determine what aggregation approaches to explore. - * Per MPP-2378, we don't insist on has_aggs for multi-phase + /* + * At this point, we're committed to producing a one- , two- or + * three-stage plan with motion. Determine what aggregation approaches to + * explore. Per MPP-2378, we don't insist on has_aggs for multi-phase * plans. */ { unsigned char allowed_agg; unsigned char possible_agg; - + allowed_agg = AGG_ALL; - - if ( ! root->config->gp_enable_multiphase_agg ) + + if (!root->config->gp_enable_multiphase_agg) allowed_agg &= AGG_SINGLEPHASE; - - /* This prohibition could be relaxed if we tracked missing - * preliminary functions per DQA and were willing to plan - * some DQAs as single and some as multiple phases. Not - * currently, however. + + /* + * This prohibition could be relaxed if we tracked missing preliminary + * functions per DQA and were willing to plan some DQAs as single and + * some as multiple phases. Not currently, however. 
*/ - if ( agg_counts->missing_prelimfunc ) - allowed_agg &= ~ AGG_MULTIPHASE; + if (agg_counts->missing_prelimfunc) + allowed_agg &= ~AGG_MULTIPHASE; /* * Ordered aggregates need to run the transition function on the - * values in sorted order, which in turn translates into single - * phase aggregation. + * values in sorted order, which in turn translates into single phase + * aggregation. */ - if ( has_ordered_aggs ) - allowed_agg &= ~ AGG_MULTIPHASE; + if (has_ordered_aggs) + allowed_agg &= ~AGG_MULTIPHASE; - /* We are currently unwilling to redistribute a gathered - * intermediate across the cluster. This might change - * one day. + /* + * We are currently unwilling to redistribute a gathered intermediate + * across the cluster. This might change one day. */ - if ( ! CdbPathLocus_IsPartitioned(plan_1p.input_locus ) ) + if (!CdbPathLocus_IsPartitioned(plan_1p.input_locus)) allowed_agg &= AGG_SINGLEPHASE; - - - if ( ! root->config->gp_enable_agg_distinct ) - allowed_agg &= ~ AGG_2PHASE_DQA; - - if ( ! root->config->gp_enable_dqa_pruning ) - allowed_agg &= ~ AGG_3PHASE; + + + if (!root->config->gp_enable_agg_distinct) + allowed_agg &= ~AGG_2PHASE_DQA; + + if (!root->config->gp_enable_dqa_pruning) + allowed_agg &= ~AGG_3PHASE; /* - * GPDB_84_MERGE_FIXME: Don't do three-phase aggregation if any of - * the aggregates use FILTERs. We used to do it, with the old, - * hacky, FILTER implementation, but it doesn't work with the new - * one without some extra work. + * GPDB_84_MERGE_FIXME: Don't do three-phase aggregation if any of the + * aggregates use FILTERs. We used to do it, with the old, hacky, + * FILTER implementation, but it doesn't work with the new one without + * some extra work. */ if (contain_aggfilters((Node *) group_context->tlist)) - allowed_agg &= ~ AGG_3PHASE; + allowed_agg &= ~AGG_3PHASE; possible_agg = AGG_SINGLEPHASE; - - if(gp_hash_safe_grouping(root)) + + if (gp_hash_safe_grouping(root)) { - switch ( list_length(agg_counts->dqaArgs) ) + switch (list_length(agg_counts->dqaArgs)) { - case 0: - possible_agg |= AGG_2PHASE; - break; - case 1: - possible_agg |= AGG_2PHASE_DQA | AGG_3PHASE; - break; - default: /* > 1 */ - possible_agg |= AGG_3PHASE; - break; + case 0: + possible_agg |= AGG_2PHASE; + break; + case 1: + possible_agg |= AGG_2PHASE_DQA | AGG_3PHASE; + break; + default: /* > 1 */ + possible_agg |= AGG_3PHASE; + break; } } - if ( is_grpext ) - possible_agg &= ~ (AGG_2PHASE_DQA | AGG_3PHASE); - + if (is_grpext) + possible_agg &= ~(AGG_2PHASE_DQA | AGG_3PHASE); + consider_agg = allowed_agg & possible_agg; } - Assert( consider_agg & AGG_1PHASE ); /* Always possible! */ - - if ( consider_agg & ( AGG_2PHASE | AGG_2PHASE_DQA ) ) + Assert(consider_agg & AGG_1PHASE); /* Always possible! */ + + if (consider_agg & (AGG_2PHASE | AGG_2PHASE_DQA)) { /* XXX initAggPlanInfo(&plan_2p, group_context->cheapest_path); */ initAggPlanInfo(&plan_2p, group_context->best_path, - group_context->subplan); /* but why? */ - + group_context->subplan); /* but why? */ + /* Common 2-phase setup. */ - if ( has_groups ) + if (has_groups) { plan_2p.group_type = MPP_GRP_TYPE_GROUPED_2STAGE; if (root->group_pathkeys == NIL) @@ -768,23 +789,23 @@ cdb_grouping_planner(PlannerInfo* root, CdbPathLocus_MakeSingleQE(&plan_2p.output_locus); } - if ( consider_agg & AGG_2PHASE_DQA ) + if (consider_agg & AGG_2PHASE_DQA) { PathKey *distinct_pathkey; List *l; /* Either have DQA or not! */ - Assert(! 
(consider_agg & AGG_2PHASE) ); - - Insist( IsA(agg_counts->dqaArgs, List) && - list_length((List*)agg_counts->dqaArgs) == 1 ); + Assert(!(consider_agg & AGG_2PHASE)); + + Insist(IsA(agg_counts->dqaArgs, List) && + list_length((List *) agg_counts->dqaArgs) == 1); distinct_pathkey = cdb_make_pathkey_for_expr(root, linitial(agg_counts->dqaArgs), list_make1(makeString("=")), true); l = list_make1(distinct_pathkey); - - if (!cdbpathlocus_collocates(root, plan_2p.input_locus, l, false /*exact_match*/)) + + if (!cdbpathlocus_collocates(root, plan_2p.input_locus, l, false /* exact_match */ )) { plan_2p.group_prep = MPP_GRP_PREP_HASH_DISTINCT; CdbPathLocus_MakeHashed(&plan_2p.input_locus, l); @@ -799,13 +820,13 @@ cdb_grouping_planner(PlannerInfo* root, list_free(l); } } - - if ( consider_agg & AGG_3PHASE ) + + if (consider_agg & AGG_3PHASE) { initAggPlanInfo(&plan_3p, group_context->cheapest_path, group_context->subplan); - if ( has_groups ) + if (has_groups) { plan_3p.group_type = MPP_GRP_TYPE_GROUPED_DQA_2STAGE; if (root->group_pathkeys == NIL) @@ -817,7 +838,7 @@ cdb_grouping_planner(PlannerInfo* root, { plan_3p.group_type = MPP_GRP_TYPE_PLAIN_DQA_2STAGE; CdbPathLocus_MakeSingleQE(&plan_3p.output_locus); - } + } } /* @@ -849,33 +870,37 @@ cdb_grouping_planner(PlannerInfo* root, ctx.current_pathkeys = NIL; /* Initialize to be tidy. */ ctx.querynode_changed = false; ctx.root = root; - - /* If we're to consider 3-phase plans, do some preparation. + + /* + * If we're to consider 3-phase plans, do some preparation. */ - if ( ctx.numDistinctCols > 0 && (consider_agg & AGG_3PHASE) ) - { - int i; - - /* Collect row count estimates and other info for the partial - * results of grouping over combined grouping and distinct (DQA) - * keys. Order the output array of DqaInfo structures (in the - * context) according to how they should be joined. + if (ctx.numDistinctCols > 0 && (consider_agg & AGG_3PHASE)) + { + int i; + + /* + * Collect row count estimates and other info for the partial results + * of grouping over combined grouping and distinct (DQA) keys. Order + * the output array of DqaInfo structures (in the context) according + * to how they should be joined. */ planDqaJoinOrder(root, &ctx, plan_3p.input_path->parent->rows); - - /* Plan the post-Motion portions of each coplan in two ways: one to - * produce the result in the cheapest way and one to produce the - * result ordered by the grouping key in the cheapest way. (For use - * by make_plan_for_one_dqa called by make_three_stage_agg_plan.) + + /* + * Plan the post-Motion portions of each coplan in two ways: one to + * produce the result in the cheapest way and one to produce the + * result ordered by the grouping key in the cheapest way. (For use by + * make_plan_for_one_dqa called by make_three_stage_agg_plan.) */ - for ( i = 0; i < ctx.numDistinctCols; i++ ) + for (i = 0; i < ctx.numDistinctCols; i++) { PathKey *distinct_pathkey; List *l; set_coplan_strategies(root, &ctx, &ctx.dqaArgs[i], plan_3p.input_path); - /* Determine if the input plan already collocates on the distinct + /* + * Determine if the input plan already collocates on the distinct * key. 
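The 2-phase DQA setup above adds a MPP_GRP_PREP_HASH_DISTINCT motion when the input is not already distributed on the DISTINCT argument. The point of hashing on the distinct key is that all duplicates of a value meet on one segment, so each segment can eliminate them locally. A toy illustration (the hash is a stand-in for cdbhash, not the real function):

    #include <stdio.h>

    #define NSEGS 3

    /* Stand-in for hashing the DISTINCT-qualified argument to a segment. */
    static int
    target_segment(int distinct_value)
    {
        return ((unsigned int) distinct_value) % NSEGS;
    }

    int
    main(void)
    {
        /* Equal values always map to the same segment, so local
         * duplicate elimination is globally correct. */
        int values[] = {7, 7, 4, 9, 4, 7};

        for (int i = 0; i < 6; i++)
            printf("value %d -> segment %d\n", values[i],
                   target_segment(values[i]));
        return 0;
    }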
*/ distinct_pathkey = cdb_make_pathkey_for_expr(root, @@ -884,7 +909,7 @@ cdb_grouping_planner(PlannerInfo* root, true); l = list_make1(distinct_pathkey); - if (cdbpathlocus_collocates(root, plan_3p.input_locus, l, false /*exact_match*/)) + if (cdbpathlocus_collocates(root, plan_3p.input_locus, l, false /* exact_match */ )) { ctx.dqaArgs[i].distinctkey_collocate = true; } @@ -892,39 +917,39 @@ cdb_grouping_planner(PlannerInfo* root, list_free(l); } } - - - plan_info = NULL; /* Most cost-effective, feasible plan. */ - - if ( consider_agg & AGG_1PHASE ) + + + plan_info = NULL; /* Most cost-effective, feasible plan. */ + + if (consider_agg & AGG_1PHASE) { cost_1phase_aggregation(root, &ctx, &plan_1p); - if ( gp_dev_notice_agg_cost ) + if (gp_dev_notice_agg_cost) elog(NOTICE, "1-phase cost: %.6f", plan_1p.plan_cost); - if ( plan_info == NULL || plan_info->plan_cost > plan_1p.plan_cost ) + if (plan_info == NULL || plan_info->plan_cost > plan_1p.plan_cost) plan_info = &plan_1p; } - if ( consider_agg & ( AGG_2PHASE | AGG_2PHASE_DQA ) ) + if (consider_agg & (AGG_2PHASE | AGG_2PHASE_DQA)) { cost_2phase_aggregation(root, &ctx, &plan_2p); - if ( gp_dev_notice_agg_cost ) + if (gp_dev_notice_agg_cost) elog(NOTICE, "2-phase cost: %.6f", plan_2p.plan_cost); - if ( plan_info == NULL || plan_info->plan_cost > plan_2p.plan_cost ) + if (plan_info == NULL || plan_info->plan_cost > plan_2p.plan_cost) plan_info = &plan_2p; } - if ( consider_agg & AGG_3PHASE ) + if (consider_agg & AGG_3PHASE) { cost_3phase_aggregation(root, &ctx, &plan_3p); - if ( gp_dev_notice_agg_cost ) + if (gp_dev_notice_agg_cost) elog(NOTICE, "3-phase cost: %.6f", plan_3p.plan_cost); - if ( plan_info == NULL || !enable_groupagg || plan_info->plan_cost > plan_3p.plan_cost ) + if (plan_info == NULL || !enable_groupagg || plan_info->plan_cost > plan_3p.plan_cost) plan_info = &plan_3p; } - - Insist( plan_info != NULL ); - + + Insist(plan_info != NULL); + ctx.prep = plan_info->group_prep; ctx.type = plan_info->group_type; ctx.input_locus = plan_info->input_locus; @@ -942,20 +967,20 @@ cdb_grouping_planner(PlannerInfo* root, else result_plan = NULL; /* allow sequential planner to do the work. 
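The costing block above ends with plan_info pointing at the cheapest feasible candidate among the strategies whose bits survived the allowed/possible pruning. A standalone restatement of that selection loop, with invented names (the real code compares three concrete AggPlanInfo structs rather than an array):

    #include <stdio.h>

    typedef struct { const char *name; double cost; } PlanCandidate;

    #define AGG_1PHASE 0x01
    #define AGG_2PHASE 0x02
    #define AGG_3PHASE 0x04

    /* Keep the cheapest candidate whose strategy bit is still set. */
    static const PlanCandidate *
    cheapest_considered(unsigned consider, const PlanCandidate c[3])
    {
        const PlanCandidate *best = NULL;

        for (int i = 0; i < 3; i++)
        {
            if (!(consider & (1u << i)))
                continue;
            if (best == NULL || c[i].cost < best->cost)
                best = &c[i];
        }
        return best;
    }

    int
    main(void)
    {
        PlanCandidate c[3] = {{"1-phase", 9.0}, {"2-phase", 5.5}, {"3-phase", 7.2}};
        /* 1-phase is always considered, mirroring the Assert above. */
        const PlanCandidate *best = cheapest_considered(AGG_1PHASE | AGG_2PHASE, c);

        if (best)
            printf("%s\n", best->name);
        return 0;
    }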
*/ } - else if (ctx.type == MPP_GRP_TYPE_PLAIN_2STAGE || + else if (ctx.type == MPP_GRP_TYPE_PLAIN_2STAGE || ctx.type == MPP_GRP_TYPE_GROUPED_2STAGE) result_plan = make_two_stage_agg_plan(root, &ctx); - else if (ctx.type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE || + else if (ctx.type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE || ctx.type == MPP_GRP_TYPE_GROUPED_DQA_2STAGE) result_plan = make_three_stage_agg_plan(root, &ctx); else ereport(ERROR, (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("no parallel plan for aggregation"))); + errmsg("no parallel plan for aggregation"))); if (!is_grpext && result_plan != NULL && IsA(result_plan, Agg)) - ((Agg *)result_plan)->lastAgg = true; + ((Agg *) result_plan)->lastAgg = true; *(group_context->querynode_changed) = ctx.querynode_changed; *(group_context->pcurrent_pathkeys) = ctx.current_pathkeys; @@ -970,7 +995,7 @@ cdb_grouping_planner(PlannerInfo* root, */ static Plan * make_one_stage_agg_plan(PlannerInfo *root, - MppGroupContext *ctx) + MppGroupContext *ctx) { Query *parse = root->parse; List *tlist = ctx->tlist; @@ -978,22 +1003,22 @@ make_one_stage_agg_plan(PlannerInfo *root, int numGroupCols = ctx->numGroupCols; AttrNumber *groupColIdx = ctx->groupColIdx; Oid *groupOperators = ctx->groupOperators; - Path *best_path = ctx->best_path; - Path *cheapest_path = ctx->cheapest_path; - Path *path = NULL; - bool use_hashed_grouping = ctx->use_hashed_grouping; + Path *best_path = ctx->best_path; + Path *cheapest_path = ctx->cheapest_path; + Path *path = NULL; + bool use_hashed_grouping = ctx->use_hashed_grouping; long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : - (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : - (long)*(ctx->p_dNumGroups); + (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : + (long) *(ctx->p_dNumGroups); - List *groupExprs = NIL; + List *groupExprs = NIL; List *current_pathkeys; QualCost tlist_cost; - int i; + int i; + + Plan *result_plan; + bool is_sorted; - Plan *result_plan; - bool is_sorted; - /* * The argument to the "lower" Agg node will use a "flattened" tlist * (having just the (levelsup==0) vars mentioned in the SELECT targetlist @@ -1001,29 +1026,31 @@ make_one_stage_agg_plan(PlannerInfo *root, * simple Vars. This is the same sub_tlist as that used for 1-stage * aggregation in grouping_planner. */ - - /* Create the base plan. If the best path is in grouping key order and - * we don't plan to move it around and this is a vector aggregation, we + + /* + * Create the base plan. If the best path is in grouping key order and we + * don't plan to move it around and this is a vector aggregation, we * should use best path. In other cases, however, use cheapest. */ if (ctx->subplan == NULL) { is_sorted = pathkeys_contained_in(root->group_pathkeys, best_path->pathkeys); path = cheapest_path; - if ( is_sorted && ctx->prep == MPP_GRP_PREP_NONE && numGroupCols > 0 ) + if (is_sorted && ctx->prep == MPP_GRP_PREP_NONE && numGroupCols > 0) path = best_path; result_plan = create_plan(root, path); current_pathkeys = path->pathkeys; - /* Instead of the flat target list produced above, use the sub_tlist - * constructed in cdb_grouping_planner. Add a Result node if the - * base plan can't project. (This may be unnecessary, but, if so, - * the Result node will be removed later.) + /* + * Instead of the flat target list produced above, use the sub_tlist + * constructed in cdb_grouping_planner. Add a Result node if the base + * plan can't project. (This may be unnecessary, but, if so, the + * Result node will be removed later.) 
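The numGroups initializer in the hunk above clamps a double row-count estimate before narrowing it to long: negative estimates become 0 and estimates beyond LONG_MAX saturate instead of invoking undefined conversion behavior. The same clamp, restated as a standalone helper for clarity:

    #include <limits.h>
    #include <stdio.h>

    /* Saturating conversion of a planner row estimate to long. */
    static long
    clamp_num_groups(double d_num_groups)
    {
        if (d_num_groups < 0)
            return 0;
        if (d_num_groups > (double) LONG_MAX)
            return LONG_MAX;
        return (long) d_num_groups;
    }

    int
    main(void)
    {
        printf("%ld %ld %ld\n",
               clamp_num_groups(-1.0),
               clamp_num_groups(42.7),
               clamp_num_groups(1e30));
        return 0;
    }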
*/ result_plan = plan_pushdown_tlist(root, result_plan, sub_tlist); - Assert(result_plan->flow); - + Assert(result_plan->flow); + /* Account for the cost of evaluation of the sub_tlist. */ cost_qual_eval(&tlist_cost, sub_tlist, root); result_plan->startup_cost += tlist_cost.startup; @@ -1037,56 +1064,58 @@ make_one_stage_agg_plan(PlannerInfo *root, current_pathkeys = ctx->current_pathkeys; } - /* Precondition the input by adjusting its locus prior to adding - * the Agg or Group node to the base plan, if needed. + /* + * Precondition the input by adjusting its locus prior to adding the Agg + * or Group node to the base plan, if needed. */ - switch ( ctx->prep ) + switch (ctx->prep) { - case MPP_GRP_PREP_NONE: + case MPP_GRP_PREP_NONE: break; - - case MPP_GRP_PREP_HASH_GROUPS: + + case MPP_GRP_PREP_HASH_GROUPS: Assert(numGroupCols > 0); - for ( i = 0; i < numGroupCols; i++) + for (i = 0; i < numGroupCols; i++) { TargetEntry *tle = get_tle_by_resno(sub_tlist, groupColIdx[i]); + groupExprs = lappend(groupExprs, copyObject(tle->expr)); } - result_plan = (Plan*)make_motion_hash(root, result_plan, groupExprs); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, + result_plan = (Plan *) make_motion_hash(root, result_plan, groupExprs); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, result_plan->plan_rows); current_pathkeys = NIL; /* No longer sorted. */ break; - - case MPP_GRP_PREP_FOCUS_QE: - result_plan = (Plan*)make_motion_gather_to_QE(root, result_plan, current_pathkeys); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, + + case MPP_GRP_PREP_FOCUS_QE: + result_plan = (Plan *) make_motion_gather_to_QE(root, result_plan, current_pathkeys); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, result_plan->plan_rows * root->config->cdbpath_segments); break; - - case MPP_GRP_PREP_FOCUS_QD: - result_plan = (Plan*)make_motion_gather_to_QD(root, result_plan, current_pathkeys); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, + + case MPP_GRP_PREP_FOCUS_QD: + result_plan = (Plan *) make_motion_gather_to_QD(root, result_plan, current_pathkeys); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, result_plan->plan_rows * root->config->cdbpath_segments); break; - - case MPP_GRP_PREP_HASH_DISTINCT: - case MPP_GRP_PREP_BROADCAST: + + case MPP_GRP_PREP_HASH_DISTINCT: + case MPP_GRP_PREP_BROADCAST: ereport(ERROR, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("no parallel plan for aggregation"))); - break; /* Never */ + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("no parallel plan for aggregation"))); + break; /* Never */ } Assert(result_plan->flow); /* - * Insert AGG or GROUP node if needed, plus an explicit sort step - * if necessary. + * Insert AGG or GROUP node if needed, plus an explicit sort step if + * necessary. * * HAVING clause, if any, becomes qual of the Agg or Group node. 
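Each Motion added in the switch above is charged via incremental_motion_cost(rows_out, rows_in); at the gather call sites the received-row argument is rows * cdbpath_segments because every sending segment contributes its per-segment estimate. A hypothetical sketch of the cost shape (the real function and its constants live elsewhere in the planner; the per-row figure here is an assumption for illustration):

    #include <stdio.h>

    /* Assumed shape only: cost grows with rows sent plus rows received. */
    static double
    toy_motion_cost(double rows_out, double rows_in)
    {
        const double per_row = 0.01;    /* illustrative per-row transfer cost */

        return per_row * (rows_out + rows_in);
    }

    int
    main(void)
    {
        double rows = 1000.0;           /* per-segment row estimate */
        int    segments = 4;

        printf("hash motion:   %.2f\n", toy_motion_cost(rows, rows));
        printf("gather motion: %.2f\n", toy_motion_cost(rows, rows * segments));
        return 0;
    }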
*/ @@ -1101,10 +1130,10 @@ make_one_stage_agg_plan(PlannerInfo *root, groupColIdx, groupOperators, numGroups, - 0, /* num_nullcols */ - 0, /* input_grouping */ + 0, /* num_nullcols */ + 0, /* input_grouping */ ctx->grouping, - 0, /* rollup_gs_times */ + 0, /* rollup_gs_times */ ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace, result_plan); @@ -1154,10 +1183,10 @@ make_one_stage_agg_plan(PlannerInfo *root, groupColIdx, groupOperators, numGroups, - 0, /* num_nullcols */ - 0, /* input_grouping */ + 0, /* num_nullcols */ + 0, /* input_grouping */ ctx->grouping, - 0, /* rollup_gs_times */ + 0, /* rollup_gs_times */ ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace, result_plan); @@ -1183,18 +1212,18 @@ make_one_stage_agg_plan(PlannerInfo *root, } else if (root->hasHavingQual) { - /* No aggregates, and no GROUP BY, but a HAVING qual is a - * degenerate case discussed in grouping_planner. We can - * just throw away the plan-so-far and let the caller handle - * the whole enchilada. + /* + * No aggregates, and no GROUP BY, but a HAVING qual is a degenerate + * case discussed in grouping_planner. We can just throw away the + * plan-so-far and let the caller handle the whole enchilada. */ return NULL; } /* - * Decorate the top node with a Flow node if it doesn't have one yet. - * (In such cases we require the next-to-top node to have a Flow node - * from which we can obtain the distribution info.) + * Decorate the top node with a Flow node if it doesn't have one yet. (In + * such cases we require the next-to-top node to have a Flow node from + * which we can obtain the distribution info.) */ if (!result_plan->flow) { @@ -1215,7 +1244,7 @@ make_one_stage_agg_plan(PlannerInfo *root, */ static Plan * make_two_stage_agg_plan(PlannerInfo *root, - MppGroupContext *ctx) + MppGroupContext *ctx) { Query *parse = root->parse; List *prelim_tlist = NIL; @@ -1224,7 +1253,7 @@ make_two_stage_agg_plan(PlannerInfo *root, List *distinctExpr = NIL; List *groupExprs = NIL; List *current_pathkeys; - Plan *result_plan; + Plan *result_plan; QualCost tlist_cost; AggStrategy aggstrategy; int i; @@ -1233,38 +1262,41 @@ make_two_stage_agg_plan(PlannerInfo *root, Oid *groupOperators; AttrNumber *prelimGroupColIdx; Oid *prelimGroupOperators; - Path *path = ctx->best_path; /* no use for ctx->cheapest_path */ + Path *path = ctx->best_path; /* no use for ctx->cheapest_path */ long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : - (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : - (long)*(ctx->p_dNumGroups); - - /* Copy these from context rather than using them directly because we may - * scribble on them in plan_grouping_extension(). It would be good to + (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : + (long) *(ctx->p_dNumGroups); + + /* + * Copy these from context rather than using them directly because we may + * scribble on them in plan_grouping_extension(). It would be good to * clean this up, but not today. */ numGroupCols = ctx->numGroupCols; groupColIdx = ctx->groupColIdx; groupOperators = ctx->groupOperators; - /* Create the base plan which will serve as the outer plan (argument) - * of the partial Agg node. + /* + * Create the base plan which will serve as the outer plan (argument) of + * the partial Agg node. 
*/ if (ctx->subplan == NULL) { result_plan = create_plan(root, path); current_pathkeys = path->pathkeys; - /* Instead of the flat target list produced by create_plan above, use + /* + * Instead of the flat target list produced by create_plan above, use * the sub_tlist constructed in cdb_grouping_planner. This consists * of just the (levelsup==0) vars mentioned in the SELECT and HAVING * clauses plus entries for any GROUP BY expressions that are not * simple Vars. (This is the same sub_tlist as used in 1-stage * aggregation and in normal aggregation in grouping_planner). * - * If the base plan is of a type that can't project, add a Result - * node to carry the new target list, else install it directly. - * (Though the result node may not always be necessary, it is safe, - * and superfluous Result nodes are removed later.) + * If the base plan is of a type that can't project, add a Result node + * to carry the new target list, else install it directly. (Though the + * result node may not always be necessary, it is safe, and + * superfluous Result nodes are removed later.) */ result_plan = plan_pushdown_tlist(root, result_plan, ctx->sub_tlist); @@ -1280,81 +1312,83 @@ make_two_stage_agg_plan(PlannerInfo *root, result_plan = ctx->subplan; current_pathkeys = ctx->current_pathkeys; } - - /* At this point result_plan produces the input relation for two-stage + + /* + * At this point result_plan produces the input relation for two-stage * aggregation. * * Begin by preconditioning the input, if necessary, to collocate on * non-distinct values of a single DISTINCT argument. */ - switch ( ctx->prep ) - { - case MPP_GRP_PREP_NONE: - break; - - case MPP_GRP_PREP_HASH_DISTINCT: - Assert(list_length( ctx->agg_counts->dqaArgs) == 1 ); - Assert( ctx->agg_counts->dqaArgs != NIL); - if (!ctx->distinctkey_collocate) - { - distinctExpr = list_make1(linitial(ctx->agg_counts->dqaArgs)); - distinctExpr = copyObject(distinctExpr); - result_plan = (Plan*)make_motion_hash(root, result_plan, distinctExpr); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows); - current_pathkeys = NIL; /* No longer sorted. */ - } - - break; - - case MPP_GRP_PREP_FOCUS_QD: - case MPP_GRP_PREP_FOCUS_QE: - case MPP_GRP_PREP_HASH_GROUPS: - case MPP_GRP_PREP_BROADCAST: + switch (ctx->prep) + { + case MPP_GRP_PREP_NONE: + break; + + case MPP_GRP_PREP_HASH_DISTINCT: + Assert(list_length(ctx->agg_counts->dqaArgs) == 1); + Assert(ctx->agg_counts->dqaArgs != NIL); + if (!ctx->distinctkey_collocate) + { + distinctExpr = list_make1(linitial(ctx->agg_counts->dqaArgs)); + distinctExpr = copyObject(distinctExpr); + result_plan = (Plan *) make_motion_hash(root, result_plan, distinctExpr); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, + result_plan->plan_rows); + current_pathkeys = NIL; /* No longer sorted. */ + } + + break; + + case MPP_GRP_PREP_FOCUS_QD: + case MPP_GRP_PREP_FOCUS_QE: + case MPP_GRP_PREP_HASH_GROUPS: + case MPP_GRP_PREP_BROADCAST: ereport(ERROR, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected call for two-stage aggregation"))); - break; /* Never */ + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected call for two-stage aggregation"))); + break; /* Never */ } - + /* - * Get the target lists for the preliminary and final aggregations and - * the qual (HAVING clause) for the final aggregation based on the target - * list of the base plan. Grouping attributes go on front of preliminary - * target list. 
+ * Get the target lists for the preliminary and final aggregations and the + * qual (HAVING clause) for the final aggregation based on the target list + * of the base plan. Grouping attributes go on front of preliminary target + * list. */ generate_multi_stage_tlists(ctx, - &prelim_tlist, - NULL, - &final_tlist, - &final_qual); - + &prelim_tlist, + NULL, + &final_tlist, + &final_qual); + /* - * Since the grouping attributes, if any, are on the front and in order - * on the preliminary targetlist, we need a different vector of grouping + * Since the grouping attributes, if any, are on the front and in order on + * the preliminary targetlist, we need a different vector of grouping * attribute numbers: (1, 2, 3, ...). Later, we'll need */ prelimGroupColIdx = NULL; prelimGroupOperators = NULL; - if ( numGroupCols > 0 ) + if (numGroupCols > 0) { - prelimGroupColIdx = (AttrNumber*)palloc(numGroupCols * sizeof(AttrNumber)); + prelimGroupColIdx = (AttrNumber *) palloc(numGroupCols * sizeof(AttrNumber)); prelimGroupOperators = (Oid *) palloc(numGroupCols * sizeof(Oid)); - for ( i = 0; i < numGroupCols; i++ ) + for (i = 0; i < numGroupCols; i++) { - prelimGroupColIdx[i] = i+1; + prelimGroupColIdx[i] = i + 1; prelimGroupOperators[i] = groupOperators[i]; } } - + /* * Add the Preliminary Agg Node. * - * When this aggregate is a ROLLUP, we add a sequence of preliminary Agg node. + * When this aggregate is a ROLLUP, we add a sequence of preliminary Agg + * node. */ /* Determine the aggregation strategy to use. */ - if ( ctx->use_hashed_grouping ) + if (ctx->use_hashed_grouping) { aggstrategy = AGG_HASHED; current_pathkeys = NIL; @@ -1366,15 +1400,15 @@ make_two_stage_agg_plan(PlannerInfo *root, if (!ctx->is_grpext && !pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { - /* TODO -- Investigate WHY we might sort here! + /* + * TODO -- Investigate WHY we might sort here! * - * Good reasons would be that one of the grouping - * expressions isn't "hashable" or that too may groups - * are anticipated. + * Good reasons would be that one of the grouping expressions + * isn't "hashable" or that too may groups are anticipated. * * A bad reason would be that the final result will be in - * order of the grouping key. (Redistribution will remove - * the ordering.) + * order of the grouping key. (Redistribution will remove the + * ordering.) */ result_plan = (Plan *) make_sort_from_groupcols(root, @@ -1386,8 +1420,10 @@ make_two_stage_agg_plan(PlannerInfo *root, mark_sort_locus(result_plan); } aggstrategy = AGG_SORTED; - /* The AGG node will not change the sort ordering of its - * groups, so current_pathkeys describes the result too. + + /* + * The AGG node will not change the sort ordering of its groups, + * so current_pathkeys describes the result too. 
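The prelimGroupColIdx loop reindented above exploits the fact that the grouping attributes sit at the front of the preliminary target list, in order, so their attribute numbers in the upper Agg are simply 1..n regardless of where they sat in the subplan's target list. A self-contained restatement of that renumbering:

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        int  numGroupCols = 3;
        int  groupColIdx[] = {5, 2, 9};  /* positions in the subplan tlist */
        int *prelimGroupColIdx = malloc(numGroupCols * sizeof(int));

        /* Grouping keys lead the preliminary tlist, so renumber 1..n. */
        for (int i = 0; i < numGroupCols; i++)
            prelimGroupColIdx[i] = i + 1;

        for (int i = 0; i < numGroupCols; i++)
            printf("subplan attno %d -> prelim attno %d\n",
                   groupColIdx[i], prelimGroupColIdx[i]);
        free(prelimGroupColIdx);
        return 0;
    }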
*/ } else @@ -1396,21 +1432,21 @@ make_two_stage_agg_plan(PlannerInfo *root, current_pathkeys = NIL; /* One row, no sort order */ } } - + if (!ctx->is_grpext) { result_plan = (Plan *) make_agg(root, prelim_tlist, - NIL, /* no havingQual */ + NIL, /* no havingQual */ aggstrategy, root->config->gp_hashagg_streambottom, numGroupCols, groupColIdx, groupOperators, numGroups, - 0, /* num_nullcols */ - 0, /* input_grouping */ - 0, /* grouping */ - 0, /* rollup_gs_times */ + 0, /* num_nullcols */ + 0, /* input_grouping */ + 0, /* grouping */ + 0, /* rollup_gs_times */ ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace, result_plan); @@ -1425,7 +1461,7 @@ make_two_stage_agg_plan(PlannerInfo *root, ctx->use_hashed_grouping, &prelim_tlist, ctx->sub_tlist, true, true, - NIL, /* no havingQual */ + NIL, /* no havingQual */ &numGroupCols, &groupColIdx, &groupOperators, @@ -1435,15 +1471,18 @@ make_two_stage_agg_plan(PlannerInfo *root, &(ctx->querynode_changed), ¤t_pathkeys, result_plan); - /* Since we add Grouping as an additional grouping column, - * we need to add it into prelimGroupColIdx. */ + + /* + * Since we add Grouping as an additional grouping column, we need to + * add it into prelimGroupColIdx. + */ if (prelimGroupColIdx != NULL) { prelimGroupColIdx = (AttrNumber *) - repalloc(prelimGroupColIdx, + repalloc(prelimGroupColIdx, numGroupCols * sizeof(AttrNumber)); prelimGroupOperators = (Oid *) repalloc(prelimGroupOperators, - numGroupCols * sizeof(Oid)); + numGroupCols * sizeof(Oid)); } else { @@ -1454,52 +1493,52 @@ make_two_stage_agg_plan(PlannerInfo *root, } Assert(numGroupCols >= 2); - prelimGroupColIdx[numGroupCols-1] = groupColIdx[numGroupCols-1]; - prelimGroupOperators[numGroupCols-1] = groupOperators[numGroupCols-1]; - prelimGroupColIdx[numGroupCols-2] = groupColIdx[numGroupCols-2]; - prelimGroupOperators[numGroupCols-2] = groupOperators[numGroupCols-2]; + prelimGroupColIdx[numGroupCols - 1] = groupColIdx[numGroupCols - 1]; + prelimGroupOperators[numGroupCols - 1] = groupOperators[numGroupCols - 1]; + prelimGroupColIdx[numGroupCols - 2] = groupColIdx[numGroupCols - 2]; + prelimGroupOperators[numGroupCols - 2] = groupOperators[numGroupCols - 2]; } - + /* * Add Intermediate Motion to Gather or Hash on Groups - */ - switch ( ctx->type ) + */ + switch (ctx->type) { - case MPP_GRP_TYPE_GROUPED_2STAGE: - groupExprs = NIL; - Assert(numGroupCols > 0); - for ( i = 0; i < numGroupCols; i++) - { - TargetEntry *tle; + case MPP_GRP_TYPE_GROUPED_2STAGE: + groupExprs = NIL; + Assert(numGroupCols > 0); + for (i = 0; i < numGroupCols; i++) + { + TargetEntry *tle; - /* skip Grouping/GroupId columns */ - if (ctx->is_grpext && (i == numGroupCols-1 || i == numGroupCols-2)) - continue; + /* skip Grouping/GroupId columns */ + if (ctx->is_grpext && (i == numGroupCols - 1 || i == numGroupCols - 2)) + continue; - tle = get_tle_by_resno(prelim_tlist, prelimGroupColIdx[i]); - groupExprs = lappend(groupExprs, copyObject(tle->expr)); - } - result_plan = (Plan*)make_motion_hash(root, result_plan, groupExprs); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows); - break; - - case MPP_GRP_TYPE_PLAIN_2STAGE: - result_plan = (Plan*)make_motion_gather_to_QE(root, result_plan, NULL); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows * root->config->cdbpath_segments); - break; - - case MPP_GRP_TYPE_NONE: - case MPP_GRP_TYPE_BASEPLAN: - case MPP_GRP_TYPE_GROUPED_DQA_2STAGE: - case MPP_GRP_TYPE_PLAIN_DQA_2STAGE: - 
ereport(ERROR, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected use of 2-stage aggregation"))); - break; /* Never */ + tle = get_tle_by_resno(prelim_tlist, prelimGroupColIdx[i]); + groupExprs = lappend(groupExprs, copyObject(tle->expr)); + } + result_plan = (Plan *) make_motion_hash(root, result_plan, groupExprs); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, + result_plan->plan_rows); + break; + + case MPP_GRP_TYPE_PLAIN_2STAGE: + result_plan = (Plan *) make_motion_gather_to_QE(root, result_plan, NULL); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, + result_plan->plan_rows * root->config->cdbpath_segments); + break; + + case MPP_GRP_TYPE_NONE: + case MPP_GRP_TYPE_BASEPLAN: + case MPP_GRP_TYPE_GROUPED_DQA_2STAGE: + case MPP_GRP_TYPE_PLAIN_DQA_2STAGE: + ereport(ERROR, + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected use of 2-stage aggregation"))); + break; /* Never */ } /* @@ -1526,10 +1565,10 @@ make_two_stage_agg_plan(PlannerInfo *root, numGroupCols, prelimGroupColIdx, prelimGroupOperators, - 0, /* num_nullcols */ - 0, /* input_grouping */ + 0, /* num_nullcols */ + 0, /* input_grouping */ ctx->grouping, - 0, /* rollup_gs_times */ + 0, /* rollup_gs_times */ *ctx->p_dNumGroups, ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace, @@ -1538,34 +1577,35 @@ make_two_stage_agg_plan(PlannerInfo *root, result_plan, !ctx->is_grpext, true); - + if (ctx->is_grpext) { - ListCell *lc; - bool found = false; + ListCell *lc; + bool found = false; - ((Agg *)result_plan)->inputHasGrouping = true; + ((Agg *) result_plan)->inputHasGrouping = true; /* * We want to make sure that the targetlist of result plan contains - * either GROUP_ID or a targetentry to represent the value of - * GROUP_ID from the subplans. This is because we may need this - * entry to determine if a tuple will be outputted repeatly, by - * the later Repeat node. In the current grouping extension - * planner, if there is no GROUP_ID entry, then it must be the last - * entry in the targetlist of the subplan. + * either GROUP_ID or a targetentry to represent the value of GROUP_ID + * from the subplans. This is because we may need this entry to + * determine if a tuple will be outputted repeatly, by the later + * Repeat node. In the current grouping extension planner, if there is + * no GROUP_ID entry, then it must be the last entry in the targetlist + * of the subplan. */ - foreach (lc, result_plan->targetlist) + foreach(lc, result_plan->targetlist) { - TargetEntry *te = (TargetEntry *)lfirst(lc); + TargetEntry *te = (TargetEntry *) lfirst(lc); /* - * Find out if GROUP_ID in the final targetlist. It should - * point to the last attribute in the subplan targetlist. + * Find out if GROUP_ID in the final targetlist. It should point + * to the last attribute in the subplan targetlist. */ if (IsA(te->expr, Var)) { - Var *var = (Var *)te->expr; + Var *var = (Var *) te->expr; + if (var->varattno == list_length(prelim_tlist)) { found = true; @@ -1573,23 +1613,23 @@ make_two_stage_agg_plan(PlannerInfo *root, } } } - + if (!found) { - /* Add a new target entry in the targetlist which point to - * GROUP_ID attribute in the subplan. Mark this entry - * as Junk. + /* + * Add a new target entry in the targetlist which point to + * GROUP_ID attribute in the subplan. Mark this entry as Junk. 
*/ TargetEntry *te = get_tle_by_resno(prelim_tlist, list_length(prelim_tlist)); - Expr *expr; + Expr *expr; TargetEntry *new_te; - - expr = (Expr *)makeVar(1, - te->resno, - exprType((Node *)te->expr), - exprTypmod((Node *)te->expr), - 0); + + expr = (Expr *) makeVar(1, + te->resno, + exprType((Node *) te->expr), + exprTypmod((Node *) te->expr), + 0); new_te = makeTargetEntry(expr, list_length(result_plan->targetlist) + 1, "group_id", @@ -1610,15 +1650,15 @@ make_two_stage_agg_plan(PlannerInfo *root, * Function make_three_stage_agg_plan * * Construct a three-stage aggregation plan involving DQAs (DISTINCT-qualified - * aggregate functions. - * - * Such a plan will always involve the following three aggregation phases: + * aggregate functions. * - * - preliminary -- remove duplicate (grouping key, DQA argument) values - * from an arbitrarily partitioned input; pre-aggregate plain aggregate + * Such a plan will always involve the following three aggregation phases: + * + * - preliminary -- remove duplicate (grouping key, DQA argument) values + * from an arbitrarily partitioned input; pre-aggregate plain aggregate * functions. * - * - intermediate -- remove duplicate (grouping key, DQA argument) values + * - intermediate -- remove duplicate (grouping key, DQA argument) values * from an input partitioned on the grouping key; pre-aggregate the * pre-aggregated results of preliminary plain aggregate functions. * @@ -1632,7 +1672,7 @@ make_two_stage_agg_plan(PlannerInfo *root, * * The preliminary aggregation phase occurs prior to the collocating * motion and is planned independently on the theory that any ordering - * will be disrupted by the motion. There are cases where this isn't + * will be disrupted by the motion. There are cases where this isn't * necessarily the case, but they are unexploited for now. * * The intermediate and final aggregation phases... @@ -1640,36 +1680,38 @@ make_two_stage_agg_plan(PlannerInfo *root, static Plan * make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx) { - Plan *result_plan; + Plan *result_plan; QualCost tlist_cost; - Path *path = ctx->best_path; /* no use for ctx->cheapest_path */ + Path *path = ctx->best_path; /* no use for ctx->cheapest_path */ - /* We assume that we are called only when - * - there are no grouping extensions (like ROLLUP), - * - the input is partitioned and needs no preparatory Motion, - * - the required transformation involves DQAs. + /* + * We assume that we are called only when - there are no grouping + * extensions (like ROLLUP), - the input is partitioned and needs no + * preparatory Motion, - the required transformation involves DQAs. */ - Assert ( !is_grouping_extension(ctx->canonical_grpsets) ); - Assert ( ctx->prep == MPP_GRP_PREP_NONE ); - Assert ( ctx->type == MPP_GRP_TYPE_GROUPED_DQA_2STAGE - || ctx->type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE ); + Assert(!is_grouping_extension(ctx->canonical_grpsets)); + Assert(ctx->prep == MPP_GRP_PREP_NONE); + Assert(ctx->type == MPP_GRP_TYPE_GROUPED_DQA_2STAGE + || ctx->type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE); - /* Create the base plan which will serve as the outer plan (argument) - * of the partial Agg node(s). + /* + * Create the base plan which will serve as the outer plan (argument) of + * the partial Agg node(s). */ if (ctx->subplan == NULL) { result_plan = create_plan(root, path); - /* Instead of the flat target list produced above, use the sub_tlist - * constructed in cdb_grouping_planner. Add a Result node if the - * base plan can't project. 
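The function header above describes the three DQA phases: per-segment dedup of (grouping key, DQA argument), redistribution on the grouping key with a second dedup, then ordinary aggregation of the now-distinct values. A toy end-to-end trace for SELECT g, count(DISTINCT x) FROM t GROUP BY g on two "segments" (a self-contained illustration of the idea, not planner or executor code):

    #include <stdio.h>

    typedef struct { int g; int x; } Row;

    /* Append row unless an equal (g, x) pair is already present. */
    static int
    add_unique(Row *out, int n, Row r)
    {
        for (int i = 0; i < n; i++)
            if (out[i].g == r.g && out[i].x == r.x)
                return n;
        out[n] = r;
        return n + 1;
    }

    int
    main(void)
    {
        /* Arbitrarily partitioned input on two segments. */
        Row seg0[] = {{1, 10}, {1, 10}, {2, 30}};
        Row seg1[] = {{1, 20}, {2, 30}, {1, 10}};
        Row d0[8], d1[8], t0[8], t1[8];
        int n0 = 0, n1 = 0, m0 = 0, m1 = 0;

        /* Phase 1: per-segment dedup of (g, x). */
        for (int i = 0; i < 3; i++) n0 = add_unique(d0, n0, seg0[i]);
        for (int i = 0; i < 3; i++) n1 = add_unique(d1, n1, seg1[i]);

        /* Phase 2: redistribute on g (here: g % 2) and dedup again. */
        for (int i = 0; i < n0; i++)
            if (d0[i].g % 2 == 0) m0 = add_unique(t0, m0, d0[i]);
            else                  m1 = add_unique(t1, m1, d0[i]);
        for (int i = 0; i < n1; i++)
            if (d1[i].g % 2 == 0) m0 = add_unique(t0, m0, d1[i]);
            else                  m1 = add_unique(t1, m1, d1[i]);

        /* Phase 3: surviving x values are globally distinct; count them. */
        for (int g = 1; g <= 2; g++)
        {
            int count = 0;

            for (int i = 0; i < m0; i++) if (t0[i].g == g) count++;
            for (int i = 0; i < m1; i++) if (t1[i].g == g) count++;
            printf("g=%d count(distinct x)=%d\n", g, count);
        }
        return 0;
    }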
(This may be unnecessary, but, if so, - * the Result node will be removed later.) + /* + * Instead of the flat target list produced above, use the sub_tlist + * constructed in cdb_grouping_planner. Add a Result node if the base + * plan can't project. (This may be unnecessary, but, if so, the + * Result node will be removed later.) */ result_plan = plan_pushdown_tlist(root, result_plan, ctx->sub_tlist); - Assert(result_plan->flow); - + Assert(result_plan->flow); + /* Account for the cost of evaluation of the sub_tlist. */ cost_qual_eval(&tlist_cost, ctx->sub_tlist, root); result_plan->startup_cost += tlist_cost.startup; @@ -1681,35 +1723,37 @@ make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx) { result_plan = ctx->subplan; } - + /* Use caller specified join_strategy: None, Cross, Hash, or Merge. */ - + prepare_dqa_pruning_tlists(ctx); - - if ( list_length(ctx->agg_counts->dqaArgs) == 1 ) + + if (list_length(ctx->agg_counts->dqaArgs) == 1) { - /* Note: single-DQA plans don't require a join and are handled - * specially by make_plan_for_one_dqa so we can return the result + /* + * Note: single-DQA plans don't require a join and are handled + * specially by make_plan_for_one_dqa so we can return the result * directly. */ - Query *query; - - result_plan = make_plan_for_one_dqa(root, ctx, 0, + Query *query; + + result_plan = make_plan_for_one_dqa(root, ctx, 0, result_plan, &query); memcpy(root->parse, query, sizeof(Query)); - + pfree(query); } else { - /* Multi-DQA plans are trickier because of the need to consider input - * sharing and the need to join the coplans back together. + /* + * Multi-DQA plans are trickier because of the need to consider input + * sharing and the need to join the coplans back together. */ - List *share_partners; - int i; - List *rtable = NIL; + List *share_partners; + int i; + List *rtable = NIL; - if ( ctx->use_sharing ) + if (ctx->use_sharing) { share_partners = share_plan(root, result_plan, ctx->numDistinctCols); } @@ -1717,65 +1761,67 @@ make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx) { share_partners = NIL; share_partners = lappend(share_partners, result_plan); - for ( i = 1; i < ctx->numDistinctCols; i++ ) + for (i = 1; i < ctx->numDistinctCols; i++) { share_partners = lappend(share_partners, copyObject(result_plan)); } } - + /* Construct a coplan for each distinct DQA argument. 
*/ - for ( i = 0; i < ctx->numDistinctCols; i++ ) + for (i = 0; i < ctx->numDistinctCols; i++) { - char buffer[50]; - int j; - ListCell *l; - Alias *eref; - Plan *coplan; - Query *coquery; - - coplan = (Plan*)list_nth(share_partners,i); - coplan = make_plan_for_one_dqa(root, ctx, i, + char buffer[50]; + int j; + ListCell *l; + Alias *eref; + Plan *coplan; + Query *coquery; + + coplan = (Plan *) list_nth(share_partners, i); + coplan = make_plan_for_one_dqa(root, ctx, i, coplan, &coquery); - + eref = makeNode(Alias); - sprintf(buffer, "dqa_coplan_%d", i+1); + sprintf(buffer, "dqa_coplan_%d", i + 1); eref->aliasname = pstrdup(buffer); eref->colnames = NIL; j = 1; - foreach (l, coplan->targetlist) + foreach(l, coplan->targetlist) { - TargetEntry *tle = (TargetEntry*)lfirst(l); - Value *colname = get_tle_name(tle, coquery->rtable, buffer); + TargetEntry *tle = (TargetEntry *) lfirst(l); + Value *colname = get_tle_name(tle, coquery->rtable, buffer); + eref->colnames = lappend(eref->colnames, colname); j++; } - + rtable = lappend(rtable, package_plan_as_rte(coquery, coplan, eref, NIL)); - ctx->dqaArgs[i].coplan = add_subqueryscan(root, NULL, i+1, coquery, coplan); + ctx->dqaArgs[i].coplan = add_subqueryscan(root, NULL, i + 1, coquery, coplan); } /* Begin with the first coplan, then join in each suceeding coplan. */ result_plan = ctx->dqaArgs[0].coplan; - for ( i = 1; i < ctx->numDistinctCols; i++ ) + for (i = 1; i < ctx->numDistinctCols; i++) { result_plan = join_dqa_coplan(root, ctx, result_plan, i); } - - /* Finalize the last join plan so it has the correct target list - * and having qual. - */ + + /* + * Finalize the last join plan so it has the correct target list and + * having qual. + */ ctx->top_tlist = result_plan->targetlist; - - result_plan->targetlist = (List*) finalize_split_expr((Node*) ctx->fin_tlist, ctx); - result_plan->qual = (List*) finalize_split_expr((Node*) ctx->fin_hqual, ctx); + + result_plan->targetlist = (List *) finalize_split_expr((Node *) ctx->fin_tlist, ctx); + result_plan->qual = (List *) finalize_split_expr((Node *) ctx->fin_hqual, ctx); /* - * Reconstruct the flow since the targetlist for the result_plan may have - * changed. + * Reconstruct the flow since the targetlist for the result_plan may + * have changed. */ result_plan->flow = pull_up_Flow(result_plan, result_plan->lefttree); - + /* Need to adjust root. Is this enuf? I think so. */ root->parse->rtable = rtable; root->parse->targetList = copyObject(result_plan->targetlist); @@ -1783,8 +1829,8 @@ make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx) /* We modified the parse tree, signal that to the caller */ ctx->querynode_changed = true; } - // Rebuild arrays for RelOptInfo and RangeTblEntry for the PlannerInfo - // since the underlying range tables have been transformed + /* Rebuild arrays for RelOptInfo and RangeTblEntry for the PlannerInfo */ + /* since the underlying range tables have been transformed */ rebuild_simple_rel_and_rte(root); return result_plan; @@ -1792,81 +1838,85 @@ make_three_stage_agg_plan(PlannerInfo *root, MppGroupContext *ctx) /* Helper for qsort in planDqaJoinOrder. */ -int compareDqas(const void *larg, const void *rarg) +int +compareDqas(const void *larg, const void *rarg) { - double lft = ((DqaInfo*)larg)->num_rows; - double rgt = ((DqaInfo*)rarg)->num_rows; - return (lft < rgt)? -1 : (lft == rgt)? 0 : 1; + double lft = ((DqaInfo *) larg)->num_rows; + double rgt = ((DqaInfo *) rarg)->num_rows; + + return (lft < rgt) ? -1 : (lft == rgt) ? 
0 : 1; } /* Collect per distinct DQA argument information for use in single- and * multiple-DQA planning and cache it in the context as a new array of * DqaInfo structures anchored at ctx->dqaArgs. The order of elements * in the array determines join order for a multi-DQA plan. - * - * Note: The original list of distinct DQA arguments was collected by - * the count_agg_clauses earlier in planning. Later, make_subplan_tlist - * used it to guarantee that the DQA arguments have target entries with - * non-zero sortgroupref values and to generate vector ctx->distinctColIdx + * + * Note: The original list of distinct DQA arguments was collected by + * the count_agg_clauses earlier in planning. Later, make_subplan_tlist + * used it to guarantee that the DQA arguments have target entries with + * non-zero sortgroupref values and to generate vector ctx->distinctColIdx * to locate those entries. Here, however, we use that vector to locate * the DQA arguments and reorder the vector to agree with join order. */ -void planDqaJoinOrder(PlannerInfo *root, MppGroupContext *ctx, - double input_rows) +void +planDqaJoinOrder(PlannerInfo *root, MppGroupContext *ctx, + double input_rows) { - int i; - DqaInfo *args; - Node *distinctExpr; - - Assert( ctx->numDistinctCols == list_length(ctx->agg_counts->dqaArgs) ); - + int i; + DqaInfo *args; + Node *distinctExpr; + + Assert(ctx->numDistinctCols == list_length(ctx->agg_counts->dqaArgs)); + /* Collect row count estimates for the partial results. */ - if ( ctx->numDistinctCols == 0 ) + if (ctx->numDistinctCols == 0) { ctx->dqaArgs = NULL; return; } - - args = (DqaInfo*)palloc( ctx->numDistinctCols * sizeof(DqaInfo)); - for ( i = 0; i < ctx->numDistinctCols; i++) + args = (DqaInfo *) palloc(ctx->numDistinctCols * sizeof(DqaInfo)); + + for (i = 0; i < ctx->numDistinctCols; i++) { TargetEntry *dtle; - List *x; - int j; - - /* Like PG and the SQL standard, we assume that a DQA may take only - * a single argument -- no REGR_SXY(DISTINCT X,Y). This is what allows - * distinctExpr to be an expression rather than a list of expressions. - */ + List *x; + int j; + + /* + * Like PG and the SQL standard, we assume that a DQA may take only a + * single argument -- no REGR_SXY(DISTINCT X,Y). This is what allows + * distinctExpr to be an expression rather than a list of expressions. + */ dtle = get_tle_by_resno(ctx->sub_tlist, ctx->distinctColIdx[i]); - distinctExpr = (Node*) dtle->expr; - + distinctExpr = (Node *) dtle->expr; + x = NIL; - for ( j = 0; j < ctx->numGroupCols ; j++ ) + for (j = 0; j < ctx->numGroupCols; j++) { TargetEntry *tle; - - tle = get_tle_by_resno(ctx->sub_tlist,ctx->groupColIdx[j]); + + tle = get_tle_by_resno(ctx->sub_tlist, ctx->groupColIdx[j]); x = lappend(x, tle->expr); - } + } x = lappend(x, distinctExpr); - - args[i].distinctExpr = distinctExpr; /* no copy */ + + args[i].distinctExpr = distinctExpr; /* no copy */ args[i].base_index = dtle->resno; args[i].num_rows = estimate_num_groups(root, x, input_rows); args[i].can_hash = hash_safe_type(exprType(distinctExpr)); - + list_free(x); } qsort(args, ctx->numDistinctCols, sizeof(DqaInfo), compareDqas); - + /* Reorder ctx->distinctColIdx to agree with join order. 
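planDqaJoinOrder, reflowed above, qsorts the DqaInfo array by estimated partial-result rows so the smallest coplan leads the join order. Note that compareDqas compares the doubles with explicit <, ==, > and returns -1/0/1; subtracting and truncating to int would collapse small differences to 0 and could overflow on large ones. A standalone version of the same comparator pattern (the struct is a stub):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { const char *arg; double num_rows; } DqaStub;

    /* Compare doubles without narrowing a difference to int. */
    static int
    compare_stub(const void *larg, const void *rarg)
    {
        double lft = ((const DqaStub *) larg)->num_rows;
        double rgt = ((const DqaStub *) rarg)->num_rows;

        return (lft < rgt) ? -1 : (lft == rgt) ? 0 : 1;
    }

    int
    main(void)
    {
        DqaStub args[] = {{"x", 5000.0}, {"y", 120.0}, {"z", 770.0}};

        qsort(args, 3, sizeof(DqaStub), compare_stub);
        /* Smallest partial result first fixes the coplan join order. */
        for (int i = 0; i < 3; i++)
            printf("%s (%.0f rows)\n", args[i].arg, args[i].num_rows);
        return 0;
    }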
*/ - for ( i = 0; i < ctx->numDistinctCols; i++ ) + for (i = 0; i < ctx->numDistinctCols; i++) { ctx->distinctColIdx[i] = args[i].base_index; - } - + } + ctx->dqaArgs = args; } @@ -1879,17 +1929,17 @@ void planDqaJoinOrder(PlannerInfo *root, MppGroupContext *ctx, * * In multi-DQA plans, coplans have minimal targetlists (just grouping * keys, DQA arguments, and results of single aggregate functions). In - * case this is a single-DQA (join-less) plan, the coplan target list is - * "finalized" to produce the result requested by the user (which may + * case this is a single-DQA (join-less) plan, the coplan target list is + * "finalized" to produce the result requested by the user (which may * include expressions over the minimal list in the targetlist and/or * having qual). * - * A Query (including range table) which approximates a query for the + * A Query (including range table) which approximates a query for the * returned plan is stored back into *coquery_p, if coquery_p is not NULL. */ static Plan * -make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, - Plan* result_plan, Query **coquery_p) +make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, + Plan *result_plan, Query **coquery_p) { DqaCoplanType coplan_type; List *prelim_tlist = NIL; @@ -1907,99 +1957,101 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, Query *original_parse; bool groups_sorted = false; long numGroups; - int i, n; - DqaInfo *dqaArg = &ctx->dqaArgs[dqa_index]; - bool sort_coplans = ( ctx->join_strategy == DqaJoinMerge ); - bool groupkeys_collocate = cdbpathlocus_collocates(root, ctx->input_locus, root->group_pathkeys, false /*exact_match*/); - bool need_inter_agg = false; - bool dqaduphazard = false; - bool stream_bottom_agg = root->config->gp_hashagg_streambottom; /* Take hint */ - - /* Planning will perturb root->parse, so we copy it's content aside - * so we can restore it later. We flat copy instead of resetting - * because code in the stack may have a local variable set to the - * value of root->parse. + int i, + n; + DqaInfo *dqaArg = &ctx->dqaArgs[dqa_index]; + bool sort_coplans = (ctx->join_strategy == DqaJoinMerge); + bool groupkeys_collocate = cdbpathlocus_collocates(root, ctx->input_locus, root->group_pathkeys, false /* exact_match */ ); + bool need_inter_agg = false; + bool dqaduphazard = false; + bool stream_bottom_agg = root->config->gp_hashagg_streambottom; /* Take hint */ + + /* + * Planning will perturb root->parse, so we copy it's content aside so we + * can restore it later. We flat copy instead of resetting because code + * in the stack may have a local variable set to the value of root->parse. */ original_parse = makeNode(Query); memcpy(original_parse, root->parse, sizeof(Query)); - /* Our caller, make_three_stage_agg_plan, pushed ctx->sub_tlist onto - * result_plan. This contains all the keys and arguments for the - * whole query. While it would be possible to generate a smaller - * targetlist to use for this single DQA it is probably not worth - * the complexity. Just use sub_tlist as given. + /* + * Our caller, make_three_stage_agg_plan, pushed ctx->sub_tlist onto + * result_plan. This contains all the keys and arguments for the whole + * query. While it would be possible to generate a smaller targetlist to + * use for this single DQA it is probably not worth the complexity. Just + * use sub_tlist as given. * * The DQA argument of interest is attribute dqaArg->baseIndex. 
* - * Get the target lists for the preliminary, intermediate and final - * aggregations and the qual (HAVING clause) for the final aggregation - * based on the target list of the base plan. Grouping attributes go on + * Get the target lists for the preliminary, intermediate and final + * aggregations and the qual (HAVING clause) for the final aggregation + * based on the target list of the base plan. Grouping attributes go on * front of preliminary and intermediate target lists. */ - generate_dqa_pruning_tlists(ctx, - dqa_index, - &prelim_tlist, - &inter_tlist, - &final_tlist, - &final_qual); - + generate_dqa_pruning_tlists(ctx, + dqa_index, + &prelim_tlist, + &inter_tlist, + &final_tlist, + &final_qual); + /* - * For the first aggregation phases the original grouping attributes + * For the first aggregation phases the original grouping attributes * (maybe zero of them) must be extended to include the DQA argument * attribute (exactly one of them) to be pruned. * * The grouping attributes and a single DQA argument are on the front and - * in order on the preliminary and intermediate targetlists so we need a + * in order on the preliminary and intermediate targetlists so we need a * new vector of grouping attributes, prelimGroupColIdx = (1, 2, 3, ...), * for use in these aggregations. The vector inputGroupColIdx plays a * similar role for sub_tlist. * - * The initial-phase group clause, extendedGroupClause, is the one in - * the query (assumed to have no grouping extensions) augmented by a - * GroupClause node for the DQA argument. This is where the sort - * operator for the DQA argument is selected. + * The initial-phase group clause, extendedGroupClause, is the one in the + * query (assumed to have no grouping extensions) augmented by a + * GroupClause node for the DQA argument. This is where the sort operator + * for the DQA argument is selected. 
*/ - { - GroupClause* gc; + { + GroupClause *gc; TargetEntry *tle; Oid dqaArg_orderingop; Oid dqaArg_eqop; dqaArg_orderingop = ordering_oper_opid(exprType((Node *) dqaArg->distinctExpr)); dqaArg_eqop = get_equality_op_for_ordering_op(dqaArg_orderingop); - if (!OidIsValid(dqaArg_eqop)) /* shouldn't happen */ + if (!OidIsValid(dqaArg_eqop)) /* shouldn't happen */ elog(ERROR, "could not find equality operator for ordering operator %u", dqaArg_orderingop); - n = ctx->numGroupCols + 1; /* add the DQA argument as a grouping key */ - Assert( n > 0 ); + n = ctx->numGroupCols + 1; /* add the DQA argument as a grouping key */ + Assert(n > 0); - prelimGroupColIdx = (AttrNumber*)palloc(n * sizeof(AttrNumber)); + prelimGroupColIdx = (AttrNumber *) palloc(n * sizeof(AttrNumber)); prelimGroupOperators = (Oid *) palloc(n * sizeof(Oid)); gc = makeNode(GroupClause); - tle = get_tle_by_resno(ctx->sub_tlist, dqaArg->base_index); + tle = get_tle_by_resno(ctx->sub_tlist, dqaArg->base_index); gc->tleSortGroupRef = tle->ressortgroupref; gc->sortop = dqaArg_orderingop; extendedGroupClause = list_copy(root->parse->groupClause); extendedGroupClause = lappend(extendedGroupClause, gc); - for ( i = 0; i < ctx->numGroupCols; i++ ) + for (i = 0; i < ctx->numGroupCols; i++) { - prelimGroupColIdx[i] = i+1; + prelimGroupColIdx[i] = i + 1; prelimGroupOperators[i] = ctx->groupOperators[i]; } - prelimGroupColIdx[i] = i+1; + prelimGroupColIdx[i] = i + 1; prelimGroupOperators[i] = dqaArg_eqop; - if (!OidIsValid(prelimGroupOperators[i])) /* shouldn't happen */ + if (!OidIsValid(prelimGroupOperators[i])) /* shouldn't happen */ elog(ERROR, "could not find equality operator for ordering operator %u", prelimGroupOperators[i]); - inputGroupColIdx = (AttrNumber*)palloc(n * sizeof(AttrNumber)); + inputGroupColIdx = (AttrNumber *) palloc(n * sizeof(AttrNumber)); inputGroupOperators = (Oid *) palloc(n * sizeof(Oid)); - for ( i = 0; i < ctx->numGroupCols; i++ ) + for (i = 0; i < ctx->numGroupCols; i++) { inputGroupColIdx[i] = ctx->groupColIdx[i]; inputGroupOperators[i] = ctx->groupOperators[i]; @@ -2007,22 +2059,23 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, inputGroupColIdx[ctx->numGroupCols] = dqaArg->base_index; inputGroupOperators[ctx->numGroupCols] = dqaArg_eqop; } - - /* + + /* * Determine the first-phase aggregation strategy to use. Prefer hashing * to sorting because the benefit of the sort will be lost by the Motion * to follow. */ - if ( dqaArg->use_hashed_preliminary ) + if (dqaArg->use_hashed_preliminary) { aggstrategy = AGG_HASHED; current_pathkeys = NIL; } else { - /* Here we need to sort! The input pathkeys won't contain the - * DQA argument, so just do it. - */ + /* + * Here we need to sort! The input pathkeys won't contain the DQA + * argument, so just do it. + */ result_plan = (Plan *) make_sort_from_groupcols(root, extendedGroupClause, @@ -2032,113 +2085,121 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, current_pathkeys = root->group_pathkeys; mark_sort_locus(result_plan); aggstrategy = AGG_SORTED; - /* The AGG node will not change the sort ordering of its - * groups, so current_pathkeys describes the result too. + + /* + * The AGG node will not change the sort ordering of its groups, so + * current_pathkeys describes the result too. */ } - - /* - * Preliminary Aggregation: With the pre-existing distribution, group - * by the combined grouping key and DQA argument. In the case of the - * first coplan, this phase also pre-aggregates any non-DQAs. 
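The block above extends the ordinary grouping keys with one extra key, the DQA argument, so that the pruning phases collapse duplicate (grouping key, DQA argument) pairs; inputGroupColIdx gets length numGroupCols + 1 with the DQA attribute last. A short sketch of that key extension (attribute numbers are invented for illustration):

    #include <stdio.h>

    int
    main(void)
    {
        int numGroupCols = 2;
        int groupColIdx[] = {3, 7};   /* ordinary GROUP BY attributes */
        int dqaArgIdx = 5;            /* attribute of the DISTINCT argument */
        int n = numGroupCols + 1;
        int inputGroupColIdx[3];

        for (int i = 0; i < numGroupCols; i++)
            inputGroupColIdx[i] = groupColIdx[i];
        inputGroupColIdx[numGroupCols] = dqaArgIdx;   /* DQA arg as last key */

        for (int i = 0; i < n; i++)
            printf("grouping key %d: attno %d\n", i + 1, inputGroupColIdx[i]);
        return 0;
    }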
This - * eliminates duplicate values of the DQA argument on each QE. + + /* + * Preliminary Aggregation: With the pre-existing distribution, group by + * the combined grouping key and DQA argument. In the case of the first + * coplan, this phase also pre-aggregates any non-DQAs. This eliminates + * duplicate values of the DQA argument on each QE. */ numGroups = (dqaArg->num_rows < 0) ? 0 : - (dqaArg->num_rows > LONG_MAX) ? LONG_MAX : - (long)dqaArg->num_rows; + (dqaArg->num_rows > LONG_MAX) ? LONG_MAX : + (long) dqaArg->num_rows; - /* - * If the data is distributed on the distinct qualified aggregate's key - * and there is no grouping key, then we prefer to not stream the bottom agg + /* + * If the data is distributed on the distinct qualified aggregate's key + * and there is no grouping key, then we prefer to not stream the bottom + * agg */ if (dqaArg->distinctkey_collocate && ctx->numGroupCols == 0) { stream_bottom_agg = false; } - + result_plan = (Plan *) make_agg(root, - prelim_tlist, - NIL, /* no havingQual */ - aggstrategy, stream_bottom_agg, - ctx->numGroupCols + 1, - inputGroupColIdx, - inputGroupOperators, - numGroups, - 0, /* num_nullcols */ - 0, /* input_grouping */ - 0, /* grouping */ - 0, /* rollup_gs_times */ - ctx->agg_counts->numAggs - ctx->agg_counts->numDistinctAggs + 1, - ctx->agg_counts->transitionSpace, /* worst case */ - result_plan); - + prelim_tlist, + NIL, /* no havingQual */ + aggstrategy, stream_bottom_agg, + ctx->numGroupCols + 1, + inputGroupColIdx, + inputGroupOperators, + numGroups, + 0, /* num_nullcols */ + 0, /* input_grouping */ + 0, /* grouping */ + 0, /* rollup_gs_times */ + ctx->agg_counts->numAggs - ctx->agg_counts->numDistinctAggs + 1, + ctx->agg_counts->transitionSpace, /* worst case */ + result_plan); + dqaduphazard = (aggstrategy == AGG_HASHED && stream_bottom_agg); result_plan->flow = pull_up_Flow(result_plan, result_plan->lefttree); - + current_pathkeys = NIL; - + /* - * Intermediate Motion: Gather or Hash on Groups to get colocation - * on the grouping key. Note that this may bring duplicate values - * of the DQA argument together on the QEs. + * Intermediate Motion: Gather or Hash on Groups to get colocation on the + * grouping key. Note that this may bring duplicate values of the DQA + * argument together on the QEs. 
*/ - switch ( ctx->type ) + switch (ctx->type) { - case MPP_GRP_TYPE_GROUPED_DQA_2STAGE: - if (!groupkeys_collocate) - { - groupExprs = NIL; - Assert(ctx->numGroupCols > 0); - for ( i = 0; i < ctx->numGroupCols; i++) + case MPP_GRP_TYPE_GROUPED_DQA_2STAGE: + if (!groupkeys_collocate) { - TargetEntry *tle; - - tle = get_tle_by_resno(prelim_tlist, prelimGroupColIdx[i]); - groupExprs = lappend(groupExprs, copyObject(tle->expr)); + groupExprs = NIL; + Assert(ctx->numGroupCols > 0); + for (i = 0; i < ctx->numGroupCols; i++) + { + TargetEntry *tle; + + tle = get_tle_by_resno(prelim_tlist, prelimGroupColIdx[i]); + groupExprs = lappend(groupExprs, copyObject(tle->expr)); + } + result_plan = (Plan *) make_motion_hash(root, result_plan, groupExprs); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, + result_plan->plan_rows); } - result_plan = (Plan*)make_motion_hash(root, result_plan, groupExprs); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows); - } - - break; - - case MPP_GRP_TYPE_PLAIN_DQA_2STAGE: - /* Assert that this is only called for a plain DQA like select count(distinct x) from foo */ - - Assert(ctx->numGroupCols == 0); /* No group-by */ - Assert(n == 1); - - /* If already collocated on DQA arg, don't redistribute */ - if (!dqaArg->distinctkey_collocate) - { - TargetEntry *tle = get_tle_by_resno(ctx->sub_tlist, dqaArg->base_index); - Assert(tle); - groupExprs = lappend(NIL, copyObject(tle->expr)); - result_plan = (Plan*)make_motion_hash(root, result_plan, groupExprs); - result_plan->total_cost += + break; + + case MPP_GRP_TYPE_PLAIN_DQA_2STAGE: + + /* + * Assert that this is only called for a plain DQA like select + * count(distinct x) from foo + */ + + Assert(ctx->numGroupCols == 0); /* No group-by */ + Assert(n == 1); + + /* If already collocated on DQA arg, don't redistribute */ + if (!dqaArg->distinctkey_collocate) + { + TargetEntry *tle = get_tle_by_resno(ctx->sub_tlist, dqaArg->base_index); + + Assert(tle); + groupExprs = lappend(NIL, copyObject(tle->expr)); + + result_plan = (Plan *) make_motion_hash(root, result_plan, groupExprs); + result_plan->total_cost += incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows); - } - break; - - case MPP_GRP_TYPE_NONE: - case MPP_GRP_TYPE_BASEPLAN: - case MPP_GRP_TYPE_GROUPED_2STAGE: - case MPP_GRP_TYPE_PLAIN_2STAGE: - ereport(ERROR, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected use of DQA pruned 2-phase aggregation"))); - break; /* Never */ + result_plan->plan_rows); + } + break; + + case MPP_GRP_TYPE_NONE: + case MPP_GRP_TYPE_BASEPLAN: + case MPP_GRP_TYPE_GROUPED_2STAGE: + case MPP_GRP_TYPE_PLAIN_2STAGE: + ereport(ERROR, + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected use of DQA pruned 2-phase aggregation"))); + break; /* Never */ } current_pathkeys = NIL; - + groups_sorted = false; - - if ( sort_coplans ) + + if (sort_coplans) { coplan_type = dqaArg->coplan_type_sorted; } @@ -2146,23 +2207,25 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, { coplan_type = dqaArg->coplan_type_cheapest; } - - if ( dqaduphazard || - (!dqaArg->distinctkey_collocate && !groupkeys_collocate) ) + + if (dqaduphazard || + (!dqaArg->distinctkey_collocate && !groupkeys_collocate)) { - /* Intermediate Aggregation: Grouping key values are colocated so group - * by the combined grouping key and DQA argument while intermediate- - * aggregating any non-DQAs. 
This once again (and finally) eliminates
- duplicate values of the DQA argument on each QE.
+ /*
+ * Intermediate Aggregation: Grouping key values are colocated so
+ * group by the combined grouping key and DQA argument while
+ * intermediate-aggregating any non-DQAs. This once again (and
+ * finally) eliminates duplicate values of the DQA argument on each
+ * QE.
 */
 need_inter_agg = true;
-
+
 switch (coplan_type)
 {
 case DQACOPLAN_GGS:
 case DQACOPLAN_PGS:
 aggstrategy = AGG_SORTED;
-
+
 /* pre-sort required on combined grouping key and DQA argument */
 result_plan = (Plan *)
 make_sort_from_groupcols(root,
@@ -2174,7 +2237,7 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index,
 current_pathkeys = root->group_pathkeys;
 mark_sort_locus(result_plan);
 break;
-
+
 case DQACOPLAN_GSH:
 case DQACOPLAN_SHH:
 case DQACOPLAN_HH:
@@ -2183,7 +2246,7 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index,
 groups_sorted = false;
 break;
 }
-
+
 result_plan = add_second_stage_agg(root,
 true,
 prelim_tlist,
@@ -2193,72 +2256,76 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index,
 ctx->numGroupCols + 1,
 prelimGroupColIdx,
 prelimGroupOperators,
- 0, /* num_nullcols */
- 0, /* input_grouping */
- 0, /* grouping */
- 0, /* rollup_gs_times */
+ 0, /* num_nullcols */
+ 0, /* input_grouping */
+ 0, /* grouping */
+ 0, /* rollup_gs_times */
 dqaArg->num_rows,
 ctx->agg_counts->numAggs,
 ctx->agg_counts->transitionSpace,
 "partial_aggregation",
 &current_pathkeys,
- result_plan,
+ result_plan,
 true,
 false);
 }
-
- /* Final Aggregation: Group by the grouping key, aggregate the now
+
+ /*
+ * Final Aggregation: Group by the grouping key, aggregate the now
 * distinct values of the DQA argument using non-distinct-qualified
 * aggregation, final aggregate the intermediate values of any non-DQAs.
 */
-
+
 switch (coplan_type)
 {
- case DQACOPLAN_GSH:
- /* pre-sort required on grouping key */
- result_plan = (Plan *)
- make_sort_from_groupcols(root,
- root->parse->groupClause,
- prelimGroupColIdx,
- false,
- result_plan);
- groups_sorted = true;
- current_pathkeys = root->group_pathkeys;
- mark_sort_locus(result_plan);
- /* Fall though. */
-
- case DQACOPLAN_GGS:
- aggstrategy = AGG_SORTED;
- break;
-
- case DQACOPLAN_SHH:
- case DQACOPLAN_HH:
- aggstrategy = AGG_HASHED;
- groups_sorted = false;
- break;
-
- case DQACOPLAN_PGS:
- case DQACOPLAN_PH:
- /* plainagg */
- aggstrategy = AGG_PLAIN;
- groups_sorted = false;
- break;
+ case DQACOPLAN_GSH:
+ /* pre-sort required on grouping key */
+ result_plan = (Plan *)
+ make_sort_from_groupcols(root,
+ root->parse->groupClause,
+ prelimGroupColIdx,
+ false,
+ result_plan);
+ groups_sorted = true;
+ current_pathkeys = root->group_pathkeys;
+ mark_sort_locus(result_plan);
+ /* Fall through. */
+
+ case DQACOPLAN_GGS:
+ aggstrategy = AGG_SORTED;
+ break;
+
+ case DQACOPLAN_SHH:
+ case DQACOPLAN_HH:
+ aggstrategy = AGG_HASHED;
+ groups_sorted = false;
+ break;
+
+ case DQACOPLAN_PGS:
+ case DQACOPLAN_PH:
+ /* plainagg */
+ aggstrategy = AGG_PLAIN;
+ groups_sorted = false;
+ break;
 }
 /**
 * In the case where there is no grouping key, we need to gather up all the rows in a single segment to compute the final aggregate.
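
To make the no-grouping-key case concrete: for a plain DQA such as SELECT count(DISTINCT x) FROM foo, each segment first deduplicates its local values, the survivors are gathered onto one worker, and a final deduplication and count runs there. A toy single-process sketch of that flow, with arrays of ints standing in for tuples (all names are hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    contains(const int *vals, int n, int v)
    {
        for (int i = 0; i < n; i++)
            if (vals[i] == v)
                return true;
        return false;
    }

    /* copy in[0..n-1] into out, dropping duplicates; return new length */
    static int
    dedup(const int *in, int n, int *out)
    {
        int m = 0;

        for (int i = 0; i < n; i++)
            if (!contains(out, m, in[i]))
                out[m++] = in[i];
        return m;
    }

    int
    main(void)
    {
        int seg1[] = {1, 2, 2, 3};
        int seg2[] = {3, 3, 4};
        int local[8], gathered[8], n = 0, m;

        /* preliminary phase on each "segment", then the gather motion */
        m = dedup(seg1, 4, local);
        for (int i = 0; i < m; i++) gathered[n++] = local[i];
        m = dedup(seg2, 3, local);
        for (int i = 0; i < m; i++) gathered[n++] = local[i];

        /* final phase: duplicates can meet again after the motion */
        m = dedup(gathered, n, local);
        printf("count(DISTINCT x) = %d\n", m);   /* prints 4 */
        return 0;
    }
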
*/ - if ( ctx->type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE) + if (ctx->type == MPP_GRP_TYPE_PLAIN_DQA_2STAGE) { - /* Assert that this is only called for a plain DQA like select count(distinct x) from foo */ + /* + * Assert that this is only called for a plain DQA like select + * count(distinct x) from foo + */ Assert(ctx->numGroupCols == 0); /* No grouping columns */ Assert(n == 1); - result_plan = (Plan*)make_motion_gather_to_QE(root, result_plan, NULL); - result_plan->total_cost += - incremental_motion_cost(result_plan->plan_rows, - result_plan->plan_rows * root->config->cdbpath_segments); + result_plan = (Plan *) make_motion_gather_to_QE(root, result_plan, NULL); + result_plan->total_cost += + incremental_motion_cost(result_plan->plan_rows, + result_plan->plan_rows * root->config->cdbpath_segments); } - + result_plan = add_second_stage_agg(root, true, need_inter_agg ? inter_tlist : prelim_tlist, @@ -2268,10 +2335,10 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, ctx->numGroupCols, prelimGroupColIdx, prelimGroupOperators, - 0, /* num_nullcols */ - 0, /* input_grouping */ + 0, /* num_nullcols */ + 0, /* input_grouping */ ctx->grouping, - 0, /* rollup_gs_times */ + 0, /* rollup_gs_times */ *ctx->p_dNumGroups, ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace, @@ -2280,42 +2347,44 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, result_plan, true, false); - + /* Final sort */ switch (coplan_type) { - case DQACOPLAN_SHH: - /* post-sort required */ - result_plan = (Plan *) - make_sort_from_groupcols(root, - root->parse->groupClause, - prelimGroupColIdx, - false, - result_plan); - groups_sorted = true; - current_pathkeys = root->group_pathkeys; - mark_sort_locus(result_plan); - break; - - case DQACOPLAN_GGS: - case DQACOPLAN_GSH: - case DQACOPLAN_HH: - case DQACOPLAN_PGS: - case DQACOPLAN_PH: - break; - } - + case DQACOPLAN_SHH: + /* post-sort required */ + result_plan = (Plan *) + make_sort_from_groupcols(root, + root->parse->groupClause, + prelimGroupColIdx, + false, + result_plan); + groups_sorted = true; + current_pathkeys = root->group_pathkeys; + mark_sort_locus(result_plan); + break; + + case DQACOPLAN_GGS: + case DQACOPLAN_GSH: + case DQACOPLAN_HH: + case DQACOPLAN_PGS: + case DQACOPLAN_PH: + break; + } + /* Marshal implicit results. Return explicit result. */ - if ( groups_sorted ) + if (groups_sorted) { - /* The following settings work correctly though they seem wrong. + /* + * The following settings work correctly though they seem wrong. * Though we changed the query tree, we say that we did not so that - * planner.c will notice the useful sort order we have produced. - * We also reset the current pathkeys to the original group keys. - * (Though our target list may differ, its attribute-wise ordering - * is on the group keys.) + * planner.c will notice the useful sort order we have produced. We + * also reset the current pathkeys to the original group keys. (Though + * our target list may differ, its attribute-wise ordering is on the + * group keys.) */ - ctx->current_pathkeys = root->group_pathkeys; /* current_pathkeys are wrong! */ + ctx->current_pathkeys = root->group_pathkeys; /* current_pathkeys are + * wrong! */ ctx->querynode_changed = false; } else @@ -2323,9 +2392,9 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, ctx->current_pathkeys = NIL; ctx->querynode_changed = true; } - + /* If requested, copy our modified Query (at root->parse) for caller. 
*/ - if ( coquery_p != NULL ) + if (coquery_p != NULL) { *coquery_p = makeNode(Query); memcpy(*coquery_p, root->parse, sizeof(Query)); @@ -2334,23 +2403,25 @@ make_plan_for_one_dqa(PlannerInfo *root, MppGroupContext *ctx, int dqa_index, /* Restore the original referent of root->parse. */ memcpy(root->parse, original_parse, sizeof(Query)); pfree(original_parse); - + return result_plan; } -static Plan * +static Plan * join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_index) { - Plan *join_plan = NULL; - Plan *inner = ctx->dqaArgs[dqa_index].coplan; - List *join_tlist = NIL; - List *tlist = NIL; - Index outer_varno = 1; - Index inner_varno = dqa_index + 1; - Index varno = 1; - int i, ng, nd; - + Plan *join_plan = NULL; + Plan *inner = ctx->dqaArgs[dqa_index].coplan; + List *join_tlist = NIL; + List *tlist = NIL; + Index outer_varno = 1; + Index inner_varno = dqa_index + 1; + Index varno = 1; + int i, + ng, + nd; + /*--------------------------------------------------------------------- * Make the target list for this join. The outer and inner target lists * will look like @@ -2364,51 +2435,53 @@ join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_in /* Use varno 1 for grouping key. */ join_tlist = make_vars_tlist(ctx->grps_tlist, varno, 0); - ng = list_length(join_tlist); /* () */ - nd = ng + list_length(ctx->dref_tlists[0]);/* ( ) */ - - for ( i = 0; i <= dqa_index; i++ ) + ng = list_length(join_tlist); /* () */ + nd = ng + list_length(ctx->dref_tlists[0]); /* ( ) */ + + for (i = 0; i <= dqa_index; i++) { - tlist = make_vars_tlist(ctx->dref_tlists[i], varno+i, ng); - join_tlist = seq_tlist_concat(join_tlist, tlist); /* (... ) */ + tlist = make_vars_tlist(ctx->dref_tlists[i], varno + i, ng); + join_tlist = seq_tlist_concat(join_tlist, tlist); /* (... ) */ } - + tlist = make_vars_tlist(ctx->frefs_tlist, varno, nd); - join_tlist = seq_tlist_concat(join_tlist, tlist); /* (... ) */ - - /* Make the join which will be either a cartesian product (in case of - * scalar aggregation) or a merge or hash join (in case of grouped + join_tlist = seq_tlist_concat(join_tlist, tlist); /* (... ) */ + + /* + * Make the join which will be either a cartesian product (in case of + * scalar aggregation) or a merge or hash join (in case of grouped * aggregation.) */ - if ( ctx->numGroupCols > 0 ) /* MergeJoin: 1x1 */ - { - List *joinclause = NIL; - List *hashclause = NIL; - AttrNumber attrno; - - Insist( ctx->join_strategy == DqaJoinMerge || ctx->join_strategy == DqaJoinHash ); - - /* Make the join clause -- a conjunction of IS NOT DISTINCT FROM + if (ctx->numGroupCols > 0) /* MergeJoin: 1x1 */ + { + List *joinclause = NIL; + List *hashclause = NIL; + AttrNumber attrno; + + Insist(ctx->join_strategy == DqaJoinMerge || ctx->join_strategy == DqaJoinHash); + + /* + * Make the join clause -- a conjunction of IS NOT DISTINCT FROM * predicates on the attributes of the grouping key. 
*/ - for ( attrno = 1; attrno <= ctx->numGroupCols; attrno++ ) + for (attrno = 1; attrno <= ctx->numGroupCols; attrno++) { Expr *qual; - Var *outer_var; + Var *outer_var; Var *inner_var; RestrictInfo *rinfo; TargetEntry *tle = get_tle_by_resno(outer->targetlist, attrno); - - Assert( tle && IsA(tle->expr, Var) ); - - outer_var = (Var*)copyObject(tle->expr); + + Assert(tle && IsA(tle->expr, Var)); + + outer_var = (Var *) copyObject(tle->expr); outer_var->varno = outer_varno; outer_var->varnoold = outer_varno; - - inner_var = (Var*)copyObject(tle->expr); + + inner_var = (Var *) copyObject(tle->expr); inner_var->varno = inner_varno; inner_var->varnoold = inner_varno; - + /* outer should always be on the left */ if (ctx->join_strategy == DqaJoinHash) { @@ -2421,37 +2494,39 @@ join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_in joinclause = lappend(joinclause, rinfo); } - - if ( ctx->join_strategy == DqaJoinHash ) + + if (ctx->join_strategy == DqaJoinHash) { /* Make the hash join. */ - bool motion_added_outer = false; - bool motion_added_inner = false; + bool motion_added_outer = false; + bool motion_added_inner = false; outer = add_motion_to_dqa_child(outer, root, &motion_added_outer); inner = add_motion_to_dqa_child(inner, root, &motion_added_inner); - - bool prefetch_inner = motion_added_outer || motion_added_inner; + + bool prefetch_inner = motion_added_outer || motion_added_inner; + if (motion_added_outer || motion_added_inner) { ctx->current_pathkeys = NULL; } - - Hash *hash_plan = make_hash(inner); + + Hash *hash_plan = make_hash(inner); joinclause = get_actual_clauses(joinclause); - join_plan = (Plan*)make_hashjoin(join_tlist, - NIL, /* joinclauses */ - NIL, /* otherclauses */ - hashclause, /* hashclauses */ - joinclause, /* hashqualclauses */ - outer, (Plan*)hash_plan, - JOIN_INNER); + join_plan = (Plan *) make_hashjoin(join_tlist, + NIL, /* joinclauses */ + NIL, /* otherclauses */ + hashclause, /* hashclauses */ + joinclause, /* hashqualclauses */ + outer, (Plan *) hash_plan, + JOIN_INNER); ((Join *) join_plan)->prefetch_inner = prefetch_inner; } else { - /* Make the merge join noting that the outer plan produces rows + /* + * Make the merge join noting that the outer plan produces rows * distinct in the join key. (So does the inner, for that matter, * but the MJ algorithm is only sensitive to the outer.) 
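
The join clauses built in this hunk are IS NOT DISTINCT FROM predicates: a null-safe equality under which two NULL grouping keys match each other, so NULL groups join up correctly. A small sketch of that comparison rule, using an explicit null flag in place of a real Datum (the struct and names are invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct NullableInt
    {
        bool isnull;
        int value;
    } NullableInt;

    /* a IS NOT DISTINCT FROM b: NULL matches only NULL */
    static bool
    not_distinct(NullableInt a, NullableInt b)
    {
        if (a.isnull || b.isnull)
            return a.isnull && b.isnull;
        return a.value == b.value;
    }

    int
    main(void)
    {
        NullableInt nul = {true, 0};
        NullableInt one = {false, 1};

        printf("%d %d %d\n",
               not_distinct(nul, nul),   /* 1 */
               not_distinct(nul, one),   /* 0 */
               not_distinct(one, one));  /* 1 */
        return 0;
    }
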
*/ @@ -2461,7 +2536,7 @@ join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_in ListCell *l; int i = 0; - foreach (l, joinclause) + foreach(l, joinclause) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); @@ -2472,36 +2547,36 @@ join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_in } joinclause = get_actual_clauses(joinclause); - join_plan = (Plan*)make_mergejoin(join_tlist, - NIL, NIL, - joinclause, - mergefamilies, - mergestrategies, - mergenullsfirst, - outer, inner, - JOIN_INNER); - ((MergeJoin*)join_plan)->unique_outer = true; + join_plan = (Plan *) make_mergejoin(join_tlist, + NIL, NIL, + joinclause, + mergefamilies, + mergestrategies, + mergenullsfirst, + outer, inner, + JOIN_INNER); + ((MergeJoin *) join_plan)->unique_outer = true; } } - else /* NestLoop: Cartesian product: 1x1 */ + else /* NestLoop: Cartesian product: 1x1 */ { Insist(ctx->join_strategy == DqaJoinCross); - - join_plan = (Plan*)make_nestloop(join_tlist, - NIL, NIL, - outer, inner, - JOIN_INNER); - ((NestLoop*)join_plan)->singleton_outer = true; - } - + + join_plan = (Plan *) make_nestloop(join_tlist, + NIL, NIL, + outer, inner, + JOIN_INNER); + ((NestLoop *) join_plan)->singleton_outer = true; + } + join_plan->startup_cost = outer->startup_cost + inner->startup_cost; join_plan->plan_rows = outer->plan_rows; - join_plan->plan_width = outer->plan_width + inner->plan_width; /* too high for MJ */ + join_plan->plan_width = outer->plan_width + inner->plan_width; /* too high for MJ */ join_plan->total_cost = outer->total_cost + inner->total_cost; join_plan->total_cost += cpu_tuple_cost * join_plan->plan_rows; - + join_plan->flow = pull_up_Flow(join_plan, join_plan->lefttree); - + return join_plan; } @@ -2518,62 +2593,63 @@ join_dqa_coplan(PlannerInfo *root, MppGroupContext *ctx, Plan *outer, int dqa_in * of the original SELECT and HAVING clauses, plus entries for any GROUP BY * expressions and DQA arguments that are not simple Vars. * - * The implicit results are + * The implicit results are * - * - the number of grouping attributes and a vector of their positions - * (which are equal to their resno's) in the target list delivered through + * - the number of grouping attributes and a vector of their positions + * (which are equal to their resno's) in the target list delivered through * pointers pnum_gkeys and pcols_gkeys, and * * - the number of distinct arguments to DISTINCT-qualified aggregate - * function and a vector of their positions (which are equal to their - * resno's) in the target list delivered through pointers pnum_dqas and + * function and a vector of their positions (which are equal to their + * resno's) in the target list delivered through pointers pnum_dqas and * pcols_dqas. These arguments are guaranteed (by the call to function * augment_subplan_tlist) to appear as attributes of the subplan target * list. * - * There are no similar results for sort and distinct attributes since + * There are no similar results for sort and distinct attributes since * they don't necessarily appear in the subplan target list. 
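
The contract just described (return grouping-key positions, i.e. resnos, in the flat subplan target list, appending any expression not already present) can be pictured with ints standing in for expressions. A toy find-or-append sketch (names hypothetical):

    #include <stdio.h>

    /* find expr in tlist[0..*n-1]; append it if missing; return its
     * 1-based position, the moral equivalent of a resno */
    static int
    find_or_append(int *tlist, int *n, int expr)
    {
        for (int i = 0; i < *n; i++)
            if (tlist[i] == expr)
                return i + 1;
        tlist[(*n)++] = expr;
        return *n;
    }

    int
    main(void)
    {
        int tlist[16] = {10, 20, 30};
        int n = 3;
        int keys[] = {20, 40};      /* grouping expressions */
        int cols[2];

        for (int k = 0; k < 2; k++)
            cols[k] = find_or_append(tlist, &n, keys[k]);
        printf("resnos: %d %d (tlist length %d)\n", cols[0], cols[1], n);
        /* resnos: 2 4 (tlist length 4) */
        return 0;
    }
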
*/
-List *make_subplan_tlist(List *tlist, Node *havingQual,
- List *grp_clauses,
- int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops,
- List *dqa_args,
- int *pnum_dqas, AttrNumber **pcols_dqas)
+List *
+make_subplan_tlist(List *tlist, Node *havingQual,
+ List *grp_clauses,
+ int *pnum_gkeys, AttrNumber **pcols_gkeys, Oid **pcols_gops,
+ List *dqa_args,
+ int *pnum_dqas, AttrNumber **pcols_dqas)
 {
 List *sub_tlist;
 List *extravars;
- int num_gkeys;
+ int num_gkeys;
 AttrNumber *cols_gkeys;
 Oid *cols_gops;
 
- Assert( dqa_args != NIL? pnum_dqas != NULL && pcols_dqas != NULL: true );
-
+ Assert(dqa_args != NIL ? pnum_dqas != NULL && pcols_dqas != NULL : true);
+
 sub_tlist = flatten_tlist(tlist);
- // GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true
- // in pull_var_clause ?
+ /* GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true */
+ /* in pull_var_clause? */
 extravars = pull_var_clause(havingQual, false);
- sub_tlist = add_to_flat_tlist(sub_tlist, extravars, false /* resjunk */);
+ sub_tlist = add_to_flat_tlist(sub_tlist, extravars, false /* resjunk */ );
 list_free(extravars);
-
+
 num_gkeys = num_distcols_in_grouplist(grp_clauses);
 if (num_gkeys > 0)
 {
 int keyno = 0;
- List *tles;
+ List *tles;
 List *sortops;
 ListCell *lc_tle;
 ListCell *lc_sortop;
 
- cols_gkeys = (AttrNumber*) palloc(sizeof(AttrNumber) * num_gkeys);
+ cols_gkeys = (AttrNumber *) palloc(sizeof(AttrNumber) * num_gkeys);
 cols_gops = (Oid *) palloc(sizeof(Oid) * num_gkeys);
 
 get_sortgroupclauses_tles(grp_clauses, tlist, &tles, &sortops);
- forboth (lc_tle, tles, lc_sortop, sortops)
+ forboth(lc_tle, tles, lc_sortop, sortops)
 {
- TargetEntry *tle = (TargetEntry*) lfirst(lc_tle);
- Node *expr = (Node*) tle->expr;;
+ TargetEntry *tle = (TargetEntry *) lfirst(lc_tle);
+ Node *expr = (Node *) tle->expr;
 TargetEntry *sub_tle = NULL;
 ListCell *sl;
@@ -2586,7 +2662,7 @@ List *make_subplan_tlist(List *tlist, Node *havingQual,
 }
 if (!sl)
 {
- sub_tle = makeTargetEntry((Expr*) expr,
+ sub_tle = makeTargetEntry((Expr *) expr,
 list_length(sub_tlist) + 1,
 NULL,
 false);
@@ -2598,7 +2674,7 @@ List *make_subplan_tlist(List *tlist, Node *havingQual,
 cols_gkeys[keyno] = sub_tle->resno;
 
 cols_gops[keyno] = get_equality_op_for_ordering_op(lfirst_oid(lc_sortop));
- if (!OidIsValid(cols_gops[keyno])) /* shouldn't happen */
+ if (!OidIsValid(cols_gops[keyno])) /* shouldn't happen */
 elog(ERROR, "could not find equality operator for ordering operator %u",
 cols_gops[keyno]);
 keyno++;
@@ -2613,11 +2689,12 @@ List *make_subplan_tlist(List *tlist, Node *havingQual,
 *pcols_gkeys = NULL;
 *pcols_gops = NULL;
 }
-
- if ( dqa_args != NIL )
+
+ if (dqa_args != NIL)
 sub_tlist = augment_subplan_tlist(sub_tlist, dqa_args, pnum_dqas, pcols_dqas, true);
 
- return sub_tlist; /* Possibly modified by appending expression entries. */
+ return sub_tlist; /* Possibly modified by appending expression
+ * entries. */
 }
 
 
@@ -2629,67 +2706,71 @@ List *make_subplan_tlist(List *tlist, Node *havingQual,
 * for the expressions in exprs. Note that the entries in the input expression
 * list must be distinct.
 *
- * New entries corresponding to the expressions in the input exprs list
- * (if any) are added to the argument list. Existing entries are modified
+ * New entries corresponding to the expressions in the input exprs list
+ * (if any) are added to the argument list. Existing entries are modified
 * (if necessary) in place.
 *
 * Return the (modified) input targetlist.
- * + * * Implicitly return an array of resno values for exprs in (pnum, *pcols), if * return_resno is true. */ -List *augment_subplan_tlist(List *tlist, List *exprs, int *pnum, AttrNumber **pcols, - bool return_resno) +List * +augment_subplan_tlist(List *tlist, List *exprs, int *pnum, AttrNumber **pcols, + bool return_resno) { - int num; + int num; AttrNumber *cols = NULL; - - num = list_length(exprs); /* Known to be distinct. */ + + num = list_length(exprs); /* Known to be distinct. */ if (num > 0) { int keyno = 0; - ListCell *lx, *lt; - TargetEntry *tle, *matched_tle; - Index max_sortgroupref = 0; - - foreach (lt, tlist) + ListCell *lx, + *lt; + TargetEntry *tle, + *matched_tle; + Index max_sortgroupref = 0; + + foreach(lt, tlist) { - tle = (TargetEntry*)lfirst(lt); - if ( tle->ressortgroupref > max_sortgroupref ) + tle = (TargetEntry *) lfirst(lt); + if (tle->ressortgroupref > max_sortgroupref) max_sortgroupref = tle->ressortgroupref; } if (return_resno) - cols = (AttrNumber*) palloc(sizeof(AttrNumber) * num); + cols = (AttrNumber *) palloc(sizeof(AttrNumber) * num); - foreach (lx, exprs) + foreach(lx, exprs) { - Node *expr = (Node*)lfirst(lx); + Node *expr = (Node *) lfirst(lx); + matched_tle = NULL; - - foreach (lt, tlist) + + foreach(lt, tlist) { - tle = (TargetEntry*)lfirst(lt); - - if ( equal(expr, tle->expr) ) + tle = (TargetEntry *) lfirst(lt); + + if (equal(expr, tle->expr)) { matched_tle = tle; break; } } - if ( matched_tle == NULL ) + if (matched_tle == NULL) { - matched_tle = makeTargetEntry((Expr*) expr, + matched_tle = makeTargetEntry((Expr *) expr, list_length(tlist) + 1, NULL, false); tlist = lappend(tlist, matched_tle); } - - if ( matched_tle->ressortgroupref == 0 ) + + if (matched_tle->ressortgroupref == 0) matched_tle->ressortgroupref = ++max_sortgroupref; - + if (return_resno) cols[keyno++] = matched_tle->resno; } @@ -2708,11 +2789,12 @@ List *augment_subplan_tlist(List *tlist, List *exprs, int *pnum, AttrNumber **pc *pcols = NULL; } } - - /* Note that result is a copy, possibly modified by appending expression - * targetlist entries and/or updating sortgroupref values. + + /* + * Note that result is a copy, possibly modified by appending expression + * targetlist entries and/or updating sortgroupref values. 
*/ - return tlist; + return tlist; } /* @@ -2742,7 +2824,7 @@ describe_subplan_tlist(List *sub_tlist, Oid *grpops; nkeys = num_distcols_in_grouplist(grp_clauses); - if ( nkeys > 0 ) + if (nkeys > 0) { List *tles; List *sortops; @@ -2755,7 +2837,7 @@ describe_subplan_tlist(List *sub_tlist, get_sortgroupclauses_tles(grp_clauses, tlist, &tles, &sortops); - forboth (lc_tle, tles, lc_sortop, sortops) + forboth(lc_tle, tles, lc_sortop, sortops) { TargetEntry *tle = (TargetEntry *) lfirst(lc_tle); TargetEntry *sub_tle; @@ -2768,7 +2850,7 @@ describe_subplan_tlist(List *sub_tlist, cols[keyno] = sub_tle->resno; grpops[keyno] = get_equality_op_for_ordering_op(lfirst_oid(lc_sortop)); - if (!OidIsValid(grpops[keyno])) /* shouldn't happen */ + if (!OidIsValid(grpops[keyno])) /* shouldn't happen */ elog(ERROR, "could not find equality operator for ordering operator %u", grpops[keyno]); keyno++; @@ -2785,7 +2867,7 @@ describe_subplan_tlist(List *sub_tlist, *pcols_gops = NULL; } - if ( dqa_args != NIL ) + if (dqa_args != NIL) sub_tlist = augment_subplan_tlist(sub_tlist, dqa_args, pnum_dqas, pcols_dqas, true); return sub_tlist; @@ -2817,7 +2899,7 @@ generate_subquery_tlist(Index varno, List *input_tlist, TargetEntry *tle; Node *expr; - *p_resno_map = (int *)palloc0(list_length(input_tlist) * sizeof(int)); + *p_resno_map = (int *) palloc0(list_length(input_tlist) * sizeof(int)); foreach(j, input_tlist) { @@ -2837,8 +2919,8 @@ generate_subquery_tlist(Index varno, List *input_tlist, tle = makeTargetEntry((Expr *) expr, (AttrNumber) resno++, (inputtle->resname == NULL) ? - NULL : - pstrdup(inputtle->resname), + NULL : + pstrdup(inputtle->resname), keep_resjunk ? inputtle->resjunk : false); tle->ressortgroupref = inputtle->ressortgroupref; tlist = lappend(tlist, tle); @@ -2872,7 +2954,7 @@ cdbpathlocus_collocates(PlannerInfo *root, CdbPathLocus locus, List *pathkeys, return true; if (!CdbPathLocus_IsHashed(locus)) - return false; /* Or would HashedOJ ok, too? */ + return false; /* Or would HashedOJ ok, too? */ if (exact_match && list_length(pathkeys) != list_length(locus.partkey_h)) return false; @@ -2883,7 +2965,7 @@ cdbpathlocus_collocates(PlannerInfo *root, CdbPathLocus locus, List *pathkeys, pk_eclasses = NIL; foreach(i, pathkeys) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); EquivalenceClass *ec; ec = pathkey->pk_eclass; @@ -2910,40 +2992,41 @@ cdbpathlocus_collocates(PlannerInfo *root, CdbPathLocus locus, List *pathkeys, * canonical path keys is unavailable, this function will never return a * hashed locus. 
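
The Flow-to-locus mapping implemented below is small enough to restate with stand-in enums; the real FlowType and CdbPathLocus types live in the cdb headers, so everything in this sketch is a simplified assumption:

    #include <stdio.h>

    typedef enum {FLOW_SINGLETON, FLOW_REPLICATED, FLOW_PARTITIONED} FlowType;
    typedef enum {LOCUS_NULL, LOCUS_ENTRY, LOCUS_SINGLEQE,
                  LOCUS_REPLICATED, LOCUS_STREWN} LocusKind;

    static LocusKind
    locus_from_flow(FlowType t, int segindex)
    {
        switch (t)
        {
            case FLOW_SINGLETON:
                /* segindex -1 denotes the entry db, else a single QE */
                return segindex == -1 ? LOCUS_ENTRY : LOCUS_SINGLEQE;
            case FLOW_REPLICATED:
                return LOCUS_REPLICATED;
            case FLOW_PARTITIONED:
                /* without canonical pathkeys: Strewn, never Hashed */
                return LOCUS_STREWN;
        }
        return LOCUS_NULL;
    }

    int
    main(void)
    {
        printf("%d %d %d\n",
               locus_from_flow(FLOW_SINGLETON, -1),   /* LOCUS_ENTRY */
               locus_from_flow(FLOW_SINGLETON, 0),    /* LOCUS_SINGLEQE */
               locus_from_flow(FLOW_PARTITIONED, 0)); /* LOCUS_STREWN */
        return 0;
    }
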
*/ -CdbPathLocus cdbpathlocus_from_flow(Flow *flow) +CdbPathLocus +cdbpathlocus_from_flow(Flow *flow) { - CdbPathLocus locus; - - CdbPathLocus_MakeNull(&locus); - - if (!flow) - return locus; - - switch (flow->flotype) - { - case FLOW_SINGLETON: - if (flow->segindex == -1) - CdbPathLocus_MakeEntry(&locus); - else - CdbPathLocus_MakeSingleQE(&locus); - break; - case FLOW_REPLICATED: - CdbPathLocus_MakeReplicated(&locus); - break; - case FLOW_PARTITIONED: + CdbPathLocus locus; + + CdbPathLocus_MakeNull(&locus); + + if (!flow) + return locus; + + switch (flow->flotype) + { + case FLOW_SINGLETON: + if (flow->segindex == -1) + CdbPathLocus_MakeEntry(&locus); + else + CdbPathLocus_MakeSingleQE(&locus); + break; + case FLOW_REPLICATED: + CdbPathLocus_MakeReplicated(&locus); + break; + case FLOW_PARTITIONED: CdbPathLocus_MakeStrewn(&locus); - break; - case FLOW_UNDEFINED: - default: - Insist(0); - } - return locus; + break; + case FLOW_UNDEFINED: + default: + Insist(0); + } + return locus; } /* * Generate 3 target lists for a sequence of consecutive Agg nodes. * - * This is intended for a sequence of consecutive Agg nodes used in + * This is intended for a sequence of consecutive Agg nodes used in * a ROLLUP. '*p_tlist3' is for the upper Agg node, and '*p_tlist2' is * for any Agg node in the middle, and '*p_tlist1' is for the * bottom Agg node. @@ -2955,30 +3038,33 @@ CdbPathLocus cdbpathlocus_from_flow(Flow *flow) * NB This function is called externally (from plangroupext.c) and not * used in this file! Beware: the API is now legacy here! */ -void generate_three_tlists(List *tlist, - bool twostage, - List *sub_tlist, - Node *havingQual, - int numGroupCols, - AttrNumber *groupColIdx, - Oid *groupOperators, - List **p_tlist1, - List **p_tlist2, - List **p_tlist3, - List **p_final_qual) +void +generate_three_tlists(List *tlist, + bool twostage, + List *sub_tlist, + Node *havingQual, + int numGroupCols, + AttrNumber *groupColIdx, + Oid *groupOperators, + List **p_tlist1, + List **p_tlist2, + List **p_tlist3, + List **p_final_qual) { - ListCell *lc; - int resno = 1; - - MppGroupContext ctx; /* Just for API matching! */ + ListCell *lc; + int resno = 1; - /* Similar to the final tlist entries in two-stage aggregation, - * we use consistent varno in the middle tlist entries. + MppGroupContext ctx; /* Just for API matching! */ + + /* + * Similar to the final tlist entries in two-stage aggregation, we use + * consistent varno in the middle tlist entries. */ - int middle_varno = 1; + int middle_varno = 1; - /* Generate the top and bottom tlists by calling the multi-phase - * aggregation code in cdbgroup.c. + /* + * Generate the top and bottom tlists by calling the multi-phase + * aggregation code in cdbgroup.c. */ ctx.tlist = tlist; ctx.sub_tlist = sub_tlist; @@ -2988,51 +3074,52 @@ void generate_three_tlists(List *tlist, ctx.groupOperators = groupOperators; ctx.numDistinctCols = 0; ctx.distinctColIdx = NULL; - + generate_multi_stage_tlists(&ctx, p_tlist1, - NULL, - p_tlist3, - p_final_qual); + NULL, + p_tlist3, + p_final_qual); /* - * Read target entries in '*p_tlist1' one by one, and construct - * the entries for '*p_tlist2'. + * Read target entries in '*p_tlist1' one by one, and construct the + * entries for '*p_tlist2'. 
*/
- foreach (lc, *p_tlist1)
+ foreach(lc, *p_tlist1)
 {
- TargetEntry *tle = (TargetEntry *)lfirst(lc);
- Expr *new_expr;
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ Expr *new_expr;
 TargetEntry *new_tle;
 
 if (IsA(tle->expr, Aggref))
 {
- Aggref *aggref = (Aggref *)tle->expr;
- Aggref *new_aggref = makeNode(Aggref);
+ Aggref *aggref = (Aggref *) tle->expr;
+ Aggref *new_aggref = makeNode(Aggref);
 
 new_aggref->aggfnoid = aggref->aggfnoid;
 new_aggref->aggtype = aggref->aggtype;
 new_aggref->args =
- list_make1((Expr*)makeVar(middle_varno, tle->resno, aggref->aggtype, -1, 0));
+ list_make1((Expr *) makeVar(middle_varno, tle->resno, aggref->aggtype, -1, 0));
 /* FILTER is evaluated at the PARTIAL stage. */
 new_aggref->agglevelsup = 0;
 new_aggref->aggstar = false;
- new_aggref->aggdistinct = false; /* handled in preliminary aggregation */
+ new_aggref->aggdistinct = false; /* handled in preliminary
+ * aggregation */
 new_aggref->aggstage = AGGSTAGE_INTERMEDIATE;
 new_aggref->location = -1;
 
- new_expr = (Expr *)new_aggref;
+ new_expr = (Expr *) new_aggref;
 }
 else
 {
 /* Just make a new Var. */
- new_expr = (Expr *)makeVar(middle_varno,
- tle->resno,
- exprType((Node *)tle->expr),
- exprTypmod((Node *)tle->expr),
- 0);
-
+ new_expr = (Expr *) makeVar(middle_varno,
+ tle->resno,
+ exprType((Node *) tle->expr),
+ exprTypmod((Node *) tle->expr),
+ 0);
+
 }
 
 new_tle = makeTargetEntry(new_expr,
 resno,
@@ -3047,23 +3134,25 @@ void generate_three_tlists(List *tlist,
 }
 
 /*
- * This may be called inside a two-stage aggregation. In this case,
- * We want to make sure all entries in the '*p_tlist3' are visible.
+ * This may be called inside a two-stage aggregation. In this case, we
+ * want to make sure all entries in the '*p_tlist3' are visible.
 */
- foreach (lc, *p_tlist3)
+ foreach(lc, *p_tlist3)
 {
- TargetEntry *tle = (TargetEntry *)lfirst(lc);
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
 
 if (twostage)
 tle->resjunk = false;
 
- /* We also set aggstage to AGGSTAGE_INTERMEDIATE if this is in
- * a two-stage aggregation, because the agg node in
- * the second stage aggregation will do the finalize.
+ /*
+ * We also set aggstage to AGGSTAGE_INTERMEDIATE if this is in a
+ * two-stage aggregation, because the agg node in the second stage
+ * aggregation will do the finalize.
 */
 if (twostage && IsA(tle->expr, Aggref))
 {
- Aggref *aggref = (Aggref *)tle->expr;
+ Aggref *aggref = (Aggref *) tle->expr;
+
 aggref->aggstage = AGGSTAGE_INTERMEDIATE;
 }
 }
@@ -3082,7 +3171,7 @@ void generate_three_tlists(List *tlist,
 * sub_tlist - the reduced target list to use as input to the aggregation
 * (If use_dqa_pruning, the all DQA arguments must appear in
 * this list and must have non-zero sortgrouprefs.)
- * havingQual - the preprocesses having qual of the originaly query
+ * havingQual - the preprocessed having qual of the original query
 * (in list-of-conjunct-Exprs form)
 * numGroupCols - number of grouping attributes (no grouping extensions)
 * groupColIdx - resnos (= attr numbers) of the grouping attributes
@@ -3100,25 +3189,28 @@ void generate_three_tlists(List *tlist,
 * final_tlist - the target list of the final Agg node.
 * final_qual - the qual of the final Agg node.
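
One way to picture the preliminary/intermediate/final target lists documented above is as a chain of Agg nodes: the bottom Agg emits transition values, any middle Agg combines them, and the top Agg finalizes the user-visible result. A toy count() pipeline under that reading (the stage functions are invented stand-ins for the real transition-function machinery):

    #include <stdio.h>

    typedef long TransValue;    /* stand-in for a transition value */

    static TransValue partial_count(int nrows)            { return nrows; }
    static TransValue combine(TransValue a, TransValue b) { return a + b; }
    static long       final_count(TransValue t)           { return t; }

    int
    main(void)
    {
        /* two segments produce partials (AGGSTAGE_PARTIAL) ... */
        TransValue p1 = partial_count(3);
        TransValue p2 = partial_count(5);

        /* ... a middle stage combines them (AGGSTAGE_INTERMEDIATE) ... */
        TransValue mid = combine(p1, p2);

        /* ... and the top stage finalizes (AGGSTAGE_FINAL) */
        printf("count = %ld\n", final_count(mid));  /* count = 8 */
        return 0;
    }
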
*/ -void generate_multi_stage_tlists(MppGroupContext *ctx, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual) +void +generate_multi_stage_tlists(MppGroupContext *ctx, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual) { - /* Use consistent varno in final and intermediate tlist entries. It will - * refer to the sole RTE (a Subquery RTE) of a SubqueryScan. */ + /* + * Use consistent varno in final and intermediate tlist entries. It will + * refer to the sole RTE (a Subquery RTE) of a SubqueryScan. + */ ctx->final_varno = 1; - + /* Do we need to build an intermediate tlist in irefs_tlist? */ - ctx->use_irefs_tlist = ( p_inter_tlist != NULL ); - + ctx->use_irefs_tlist = (p_inter_tlist != NULL); + /* Don't do DQA pruning. Use prepare/generate_dqa_pruning_tlists! */ ctx->use_dqa_pruning = false; - - deconstruct_agg_info(ctx); - reconstruct_agg_info(ctx, - p_prelim_tlist, p_inter_tlist, + + deconstruct_agg_info(ctx); + reconstruct_agg_info(ctx, + p_prelim_tlist, p_inter_tlist, p_final_tlist, p_final_qual); } @@ -3132,51 +3224,57 @@ void generate_multi_stage_tlists(MppGroupContext *ctx, * several "coplans" each with its own target list requirements. This * function lays the groundwork for all such target lists. */ -void prepare_dqa_pruning_tlists(MppGroupContext *ctx) +void +prepare_dqa_pruning_tlists(MppGroupContext *ctx) { - /* Use consistent varno in final and intermediate tlist entries. It will - * refer to the sole RTE (a Subquery RTE) of a SubqueryScan. */ + /* + * Use consistent varno in final and intermediate tlist entries. It will + * refer to the sole RTE (a Subquery RTE) of a SubqueryScan. + */ ctx->final_varno = 1; - + /* Do we need to build an intermediate tlist in irefs_tlist? */ ctx->use_irefs_tlist = true; - - /* Do we want to do DQA pruning (in case there are any DISTINCT-qualified - * aggregate functions)? */ + + /* + * Do we want to do DQA pruning (in case there are any DISTINCT-qualified + * aggregate functions)? + */ ctx->use_dqa_pruning = true; - - deconstruct_agg_info(ctx); + + deconstruct_agg_info(ctx); } /* * Function: generate_dqa_pruning_tlists * - * Performs the last phase of generate_multi_phase_tlist in the context of + * Performs the last phase of generate_multi_phase_tlist in the context of * DQA pruning. */ -void generate_dqa_pruning_tlists(MppGroupContext *ctx, - int dqa_index, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual) +void +generate_dqa_pruning_tlists(MppGroupContext *ctx, + int dqa_index, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual) { - Assert( p_inter_tlist != NULL ); /* optional elsewhere, required here. */ - Assert( ctx->use_dqa_pruning ); - - if ( ctx->numDistinctCols == 1 ) + Assert(p_inter_tlist != NULL); /* optional elsewhere, required here. */ + Assert(ctx->use_dqa_pruning); + + if (ctx->numDistinctCols == 1) { /* Finalized results for single-DQA (join-less) plan. */ - reconstruct_agg_info(ctx, - p_prelim_tlist, + reconstruct_agg_info(ctx, + p_prelim_tlist, p_inter_tlist, - p_final_tlist, + p_final_tlist, p_final_qual); } else { /* Minimal results for multi-DQA (join) plan. */ - reconstruct_coplan_info(ctx, + reconstruct_coplan_info(ctx, dqa_index, p_prelim_tlist, p_inter_tlist, @@ -3193,13 +3291,16 @@ void generate_dqa_pruning_tlists(MppGroupContext *ctx, * in a multi-phase aggregation plan, possibly with DISTINCT-qualified * aggregate functions (DQAs). 
*/ -void deconstruct_agg_info(MppGroupContext *ctx) +void +deconstruct_agg_info(MppGroupContext *ctx) { - int i; - ListCell *lc; - - /* Initialize temporaries to hold the parts of the preliminary target - * list under construction. */ + int i; + ListCell *lc; + + /* + * Initialize temporaries to hold the parts of the preliminary target list + * under construction. + */ ctx->grps_tlist = NIL; ctx->dqa_tlist = NIL; ctx->prefs_tlist = NIL; @@ -3208,95 +3309,102 @@ void deconstruct_agg_info(MppGroupContext *ctx) ctx->dref_tlists = NULL; ctx->fin_tlist = NIL; ctx->fin_hqual = NIL; - + /* - * Begin constructing the target list for the preliminary Agg node - * by placing targets for the grouping attributes on the grps_tlist - * temporary. Make sure ressortgroupref matches the original. Copying - * the expression may be overkill, but it is safe. + * Begin constructing the target list for the preliminary Agg node by + * placing targets for the grouping attributes on the grps_tlist + * temporary. Make sure ressortgroupref matches the original. Copying the + * expression may be overkill, but it is safe. */ - for ( i = 0; i < ctx->numGroupCols; i++ ) + for (i = 0; i < ctx->numGroupCols; i++) { - TargetEntry *sub_tle, *prelim_tle; - + TargetEntry *sub_tle, + *prelim_tle; + sub_tle = get_tle_by_resno(ctx->sub_tlist, ctx->groupColIdx[i]); prelim_tle = makeTargetEntry(copyObject(sub_tle->expr), - list_length(ctx->grps_tlist) + 1, - (sub_tle->resname == NULL) ? - NULL : - pstrdup(sub_tle->resname), - false); + list_length(ctx->grps_tlist) + 1, + (sub_tle->resname == NULL) ? + NULL : + pstrdup(sub_tle->resname), + false); prelim_tle->ressortgroupref = sub_tle->ressortgroupref; prelim_tle->resjunk = false; ctx->grps_tlist = lappend(ctx->grps_tlist, prelim_tle); } /* - * Continue to construct the target list for the preliminary Agg node - * by placing targets for the argument attribute of each DQA on the - * dqa_tlist temporary. Make sure ressortgroupref matches the original. + * Continue to construct the target list for the preliminary Agg node by + * placing targets for the argument attribute of each DQA on the dqa_tlist + * temporary. Make sure ressortgroupref matches the original. */ - for ( i = 0; i < ctx->numDistinctCols; i++ ) + for (i = 0; i < ctx->numDistinctCols; i++) { - TargetEntry *sub_tle, *prelim_tle; - + TargetEntry *sub_tle, + *prelim_tle; + sub_tle = get_tle_by_resno(ctx->sub_tlist, ctx->distinctColIdx[i]); prelim_tle = makeTargetEntry(copyObject(sub_tle->expr), - list_length(ctx->dqa_tlist) + 1, - (sub_tle->resname == NULL) ? - NULL : - pstrdup(sub_tle->resname), - false); + list_length(ctx->dqa_tlist) + 1, + (sub_tle->resname == NULL) ? + NULL : + pstrdup(sub_tle->resname), + false); prelim_tle->ressortgroupref = sub_tle->ressortgroupref; prelim_tle->resjunk = false; ctx->dqa_tlist = lappend(ctx->dqa_tlist, prelim_tle); } - - /* Initialize the array of Aggref target lists corresponding to the - * DQA argument target list just constructed. + + /* + * Initialize the array of Aggref target lists corresponding to the DQA + * argument target list just constructed. */ - ctx->dref_tlists = (List **)palloc0(ctx->numDistinctCols * sizeof(List*)); - + ctx->dref_tlists = (List **) palloc0(ctx->numDistinctCols * sizeof(List *)); + /* * Derive the final target list with entries corresponding to the input * target list, but referring to the attributes of the preliminary target * list rather than to the input attributes. 
Note that this involves
 * augmenting the prefs_tlist temporary as we encounter new Aggref nodes.
 */
- foreach (lc, ctx->tlist)
+ foreach(lc, ctx->tlist)
 {
- TargetEntry *tle, *final_tle;
- Expr *expr;
-
- tle = (TargetEntry*)lfirst(lc);
- ctx->split_aggref_sortgroupref = tle->ressortgroupref; /* for deconstruction subroutines */
+ TargetEntry *tle,
+ *final_tle;
+ Expr *expr;
+
+ tle = (TargetEntry *) lfirst(lc);
+ ctx->split_aggref_sortgroupref = tle->ressortgroupref; /* for deconstruction
+ * subroutines */
 expr = deconstruct_expr(tle->expr, ctx);
 ctx->split_aggref_sortgroupref = 0;
 final_tle = makeTargetEntry(expr,
 tle->resno,
 (tle->resname == NULL) ?
- NULL :
- pstrdup(tle->resname),
+ NULL :
+ pstrdup(tle->resname),
 tle->resjunk);
 final_tle->ressortgroupref = tle->ressortgroupref;
 ctx->fin_tlist = lappend(ctx->fin_tlist, final_tle);
 }
-
+
 /*
- * Derive the final qual while augmenting the preliminary target list. */
- ctx->fin_hqual = (List*)deconstruct_expr((Expr*)ctx->havingQual, ctx);
-
-
+ * Derive the final qual while augmenting the preliminary target list.
+ */
+ ctx->fin_hqual = (List *) deconstruct_expr((Expr *) ctx->havingQual, ctx);
+
+
 /* Now cache some values to avoid repeated recalculation by subroutines. */
-
- /* Use consistent varno in final, intermediate an join tlist entries.
- * final refers to the sole RTE (a Subquery RTE) of a SubqueryScan.
- * outer and inner to the respective inputs to a join.
+
+ /*
+ * Use consistent varno in final, intermediate and join tlist entries.
+ * final refers to the sole RTE (a Subquery RTE) of a SubqueryScan. outer
+ * and inner to the respective inputs to a join.
 */
 ctx->final_varno = 1;
 ctx->outer_varno = OUTER;
 ctx->inner_varno = INNER;
-
+
 /*---------------------------------------------------------------------
 * Target lists used in multi-phase planning at or above the level of
 * individual DQA coplans have one of the forms
@@ -3315,10 +3423,10 @@ void deconstruct_agg_info(MppGroupContext *ctx)
 */
 ctx->dqa_offsets = palloc(sizeof(int) * (1 + ctx->numDistinctCols));
 ctx->dqa_offsets[0] = ctx->numGroupCols;
- for ( i = 0; i < ctx->numDistinctCols; i++ )
+ for (i = 0; i < ctx->numDistinctCols; i++)
 {
- ctx->dqa_offsets[i+1] = ctx->dqa_offsets[i]
- + list_length(ctx->dref_tlists[i]);
+ ctx->dqa_offsets[i + 1] = ctx->dqa_offsets[i]
+ + list_length(ctx->dref_tlists[i]);
 }
 }
 
@@ -3329,73 +3437,75 @@ void deconstruct_agg_info(MppGroupContext *ctx)
 * DQA pruning, this function is appropriate only for the cases of 0 or 1
 * DQA.
 *
- * During processing we set ctx->top_tlist to be the flat target list
+ * During processing we set ctx->top_tlist to be the flat target list
 * containing only the grouping key and the results of individual aggregate
 * functions. This list is transient -- it drives the production of the
- * final target list and having qual through finalize_split_expression.
+ * final target list and having qual through finalize_split_expression.
*/ -void reconstruct_agg_info(MppGroupContext *ctx, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist, - List **p_final_qual) -{ - List *prelim_tlist = NIL; - List *inter_tlist = NIL; - List *final_tlist = NIL; - +void +reconstruct_agg_info(MppGroupContext *ctx, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist, + List **p_final_qual) +{ + List *prelim_tlist = NIL; + List *inter_tlist = NIL; + List *final_tlist = NIL; + /* Grouping keys */ - + prelim_tlist = ctx->grps_tlist; - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) inter_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); - final_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); - + final_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); + /* If applicable, single DQA argument, corresponding DQAs */ - - if ( ctx->use_dqa_pruning ) + + if (ctx->use_dqa_pruning) { - if ( list_length(ctx->dqa_tlist) == 1 ) + if (list_length(ctx->dqa_tlist) == 1) { - int n = list_length(prelim_tlist); - TargetEntry *tle = (TargetEntry*)linitial(ctx->dqa_tlist); - tle->resno = n+1; - + int n = list_length(prelim_tlist); + TargetEntry *tle = (TargetEntry *) linitial(ctx->dqa_tlist); + + tle->resno = n + 1; + prelim_tlist = lappend(prelim_tlist, tle); - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) { inter_tlist = list_concat(inter_tlist, - make_vars_tlist(ctx->dqa_tlist, + make_vars_tlist(ctx->dqa_tlist, ctx->final_varno, n)); } final_tlist = seq_tlist_concat(final_tlist, ctx->dref_tlists[0]); } - else if ( list_length(ctx->dqa_tlist) != 0 ) + else if (list_length(ctx->dqa_tlist) != 0) { /* Shouldn't use this function for multi-DQA pruning. */ - elog(ERROR,"Unexpected use of DISTINCT-qualified aggregate pruning"); + elog(ERROR, "Unexpected use of DISTINCT-qualified aggregate pruning"); } } /* Aggrefs */ - + prelim_tlist = seq_tlist_concat(prelim_tlist, ctx->prefs_tlist); - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) { inter_tlist = seq_tlist_concat(inter_tlist, ctx->irefs_tlist); } final_tlist = seq_tlist_concat(final_tlist, ctx->frefs_tlist); /* Set implicit results */ - + *p_prelim_tlist = prelim_tlist; - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) *p_inter_tlist = inter_tlist; ctx->top_tlist = final_tlist; - *p_final_tlist = (List*) finalize_split_expr((Node*) ctx->fin_tlist, ctx); - *p_final_qual = (List*) finalize_split_expr((Node*) ctx->fin_hqual, ctx); + *p_final_tlist = (List *) finalize_split_expr((Node *) ctx->fin_tlist, ctx); + *p_final_qual = (List *) finalize_split_expr((Node *) ctx->fin_hqual, ctx); } /* Function: reconstruct_coplan_info @@ -3407,38 +3517,40 @@ void reconstruct_agg_info(MppGroupContext *ctx, * and is sensitive to dqa_index. Ordinarily this function would * be used only for multiple-DQA planning. 
*/ -void reconstruct_coplan_info(MppGroupContext *ctx, - int dqa_index, - List **p_prelim_tlist, - List **p_inter_tlist, - List **p_final_tlist) -{ - List *prelim_tlist = NIL; - List *inter_tlist = NIL; - List *final_tlist = NIL; - - int n; +void +reconstruct_coplan_info(MppGroupContext *ctx, + int dqa_index, + List **p_prelim_tlist, + List **p_inter_tlist, + List **p_final_tlist) +{ + List *prelim_tlist = NIL; + List *inter_tlist = NIL; + List *final_tlist = NIL; + + int n; TargetEntry *tle; - + /* Grouping keys */ - + prelim_tlist = copyObject(ctx->grps_tlist); - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) inter_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); - final_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); - + final_tlist = make_vars_tlist(ctx->grps_tlist, ctx->final_varno, 0); + /* Single DQA argument, corresponding DQAs */ - - Assert ( ctx->use_dqa_pruning ); + + Assert(ctx->use_dqa_pruning); n = list_length(prelim_tlist); - tle = (TargetEntry*)list_nth(ctx->dqa_tlist, dqa_index); - tle->resno = n+1; - + tle = (TargetEntry *) list_nth(ctx->dqa_tlist, dqa_index); + tle->resno = n + 1; + prelim_tlist = lappend(prelim_tlist, tle); - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) { - List *x = list_make1(tle); + List *x = list_make1(tle); + inter_tlist = list_concat(inter_tlist, make_vars_tlist(x, ctx->final_varno, n)); list_free(x); @@ -3447,10 +3559,10 @@ void reconstruct_coplan_info(MppGroupContext *ctx, /* Plain Aggrefs go only on the first coplan! */ - if ( dqa_index == 0 ) + if (dqa_index == 0) { prelim_tlist = seq_tlist_concat(prelim_tlist, ctx->prefs_tlist); - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) { inter_tlist = seq_tlist_concat(inter_tlist, ctx->irefs_tlist); } @@ -3458,9 +3570,9 @@ void reconstruct_coplan_info(MppGroupContext *ctx, } /* Set implicit results */ - + *p_prelim_tlist = prelim_tlist; - if ( p_inter_tlist != NULL ) + if (p_inter_tlist != NULL) { *p_inter_tlist = inter_tlist; } @@ -3476,9 +3588,10 @@ void reconstruct_coplan_info(MppGroupContext *ctx, * first (partial) aggregation and referring to this target list from * the modified expression for use in the second (final) aggregation. */ -Expr *deconstruct_expr(Expr *expr, MppGroupContext *ctx) +Expr * +deconstruct_expr(Expr *expr, MppGroupContext *ctx) { - return (Expr*)deconstruct_expr_mutator((Node*)expr, ctx); + return (Expr *) deconstruct_expr_mutator((Node *) expr, ctx); } /* @@ -3486,30 +3599,32 @@ Expr *deconstruct_expr(Expr *expr, MppGroupContext *ctx) * * Work for deconstruct_expr. */ -Node* deconstruct_expr_mutator(Node *node, MppGroupContext *ctx) +Node * +deconstruct_expr_mutator(Node *node, MppGroupContext *ctx) { TargetEntry *tle; - + if (node == NULL) return NULL; - + if (IsA(node, Aggref)) { - Aggref *aggref = (Aggref*)node; + Aggref *aggref = (Aggref *) node; + return split_aggref(aggref, ctx); } - - /* If the given expression is a grouping expression, replace it with - * a Var node referring to the (lower) preliminary aggregation's - * target list. + + /* + * If the given expression is a grouping expression, replace it with a Var + * node referring to the (lower) preliminary aggregation's target list. * * While building subplan targetlist we flatten (deduplicate) the - * targetlist ignoring RelabelType node. - * Including RelabelType will cause inconsistent top level target list - * and final target list for aggregation plans. + * targetlist ignoring RelabelType node. 
Including RelabelType will cause + * inconsistent top level target list and final target list for + * aggregation plans. */ tle = tlist_member_ignore_relabel(node, ctx->grps_tlist); - if( tle != NULL ) + if (tle != NULL) { Var *var = makeVar(grp_varno, tle->resno, exprType((Node*)tle->expr), @@ -3532,7 +3647,7 @@ Node* deconstruct_expr_mutator(Node *node, MppGroupContext *ctx) return (Node*) var; } - return expression_tree_mutator(node, deconstruct_expr_mutator, (void*)ctx); + return expression_tree_mutator(node, deconstruct_expr_mutator, (void *) ctx); } @@ -3542,12 +3657,12 @@ Node* deconstruct_expr_mutator(Node *node, MppGroupContext *ctx) * Find or add a partial-stage Aggref expression for the argument in the * preliminary target list under construction. Return the final-stage * Aggref with a single Var node argument referring to the partial-stage - * Aggref. In case of a DQA argument reduction, however, there is no + * Aggref. In case of a DQA argument reduction, however, there is no * partial-stage Aggref and the final-stage Aggref differs from the original - * in that (1) it does not specify DISTINCT and (2) it refers to its arguments + * in that (1) it does not specify DISTINCT and (2) it refers to its arguments * via Vars on the lower range. * - * For the normal 2-phase case: + * For the normal 2-phase case: * * Note that he result type of the partial-stage Aggref will be the * transition type of the aggregate function. @@ -3565,117 +3680,124 @@ Node* deconstruct_expr_mutator(Node *node, MppGroupContext *ctx) * be used to convert the ending transition value to the result type. * aggregation */ -Node *split_aggref(Aggref *aggref, MppGroupContext *ctx) +Node * +split_aggref(Aggref *aggref, MppGroupContext *ctx) { - ListCell *cell; - Node *final_node; - Oid transtype = InvalidOid; - AttrNumber attrno = OUTER; + ListCell *cell; + Node *final_node; + Oid transtype = InvalidOid; + AttrNumber attrno = OUTER; TargetEntry *prelim_tle = NULL; - Assert(aggref != NULL && aggref->agglevelsup == 0); - - if ( aggref->aggdistinct && ctx->use_dqa_pruning ) + Assert(aggref != NULL && aggref->agglevelsup == 0); + + if (aggref->aggdistinct && ctx->use_dqa_pruning) { - Index arg_attno; - Index dqa_attno; + Index arg_attno; + Index dqa_attno; TargetEntry *dqa_tle = NULL; TargetEntry *arg_tle; - List *dref_tlist = NIL; + List *dref_tlist = NIL; - /* First find the DQA argument. Since this is a DQA, its argument + /* + * First find the DQA argument. Since this is a DQA, its argument * list must contain a single expression that matches one of the * target expressions in ctx->dqa_tlist. */ arg_tle = NULL; - if ( list_length(aggref->args) == 1 ) /* safer than Assert */ + if (list_length(aggref->args) == 1) /* safer than Assert */ { arg_tle = tlist_member(linitial(aggref->args), ctx->dqa_tlist); } if (arg_tle == NULL) - elog(ERROR,"Unexpected use of DISTINCT-qualified aggregation"); + elog(ERROR, "Unexpected use of DISTINCT-qualified aggregation"); arg_attno = arg_tle->resno; /* [1..numDistinctCols] */ - /* We may have seen a DQA just like this one already. Look for - * one in the distinct Aggref target list to date. + /* + * We may have seen a DQA just like this one already. Look for one in + * the distinct Aggref target list to date. 
*/ dref_tlist = ctx->dref_tlists[arg_attno - 1]; dqa_attno = 1; - foreach( cell, dref_tlist ) + foreach(cell, dref_tlist) { - TargetEntry *tle = (TargetEntry*)lfirst(cell); - Aggref *ref = (Aggref*)tle->expr; - - /* Check similarity, avoid aggtype and aggstage - * (which we control) and don't bother with agglevelsup - * (which is always 0 here) or aggdistinct. + TargetEntry *tle = (TargetEntry *) lfirst(cell); + Aggref *ref = (Aggref *) tle->expr; + + /* + * Check similarity, avoid aggtype and aggstage (which we control) + * and don't bother with agglevelsup (which is always 0 here) or + * aggdistinct. */ - if ( aggref->aggfnoid == ref->aggfnoid + if (aggref->aggfnoid == ref->aggfnoid && aggref->aggstar == ref->aggstar - && equal(aggref->args, ref->args) ) + && equal(aggref->args, ref->args)) { dqa_tle = tle; break; } dqa_attno++; } - - if ( dqa_tle == NULL ) + + if (dqa_tle == NULL) { - /* Didn't find a target for the DQA Aggref so make a new one. + /* + * Didn't find a target for the DQA Aggref so make a new one. */ - Var *arg_var; - Aggref *dqa_aggref; - + Var *arg_var; + Aggref *dqa_aggref; + arg_var = makeVar(ctx->final_varno, ctx->numGroupCols + 1, exprType(linitial(aggref->args)), exprTypmod(linitial(aggref->args)), 0); - + dqa_aggref = makeNode(Aggref); memcpy(dqa_aggref, aggref, sizeof(Aggref)); /* flat copy */ dqa_aggref->args = list_make1(arg_var); dqa_aggref->aggdistinct = false; - - dqa_tle = makeTargetEntry((Expr*)dqa_aggref, dqa_attno, NULL, false); + + dqa_tle = makeTargetEntry((Expr *) dqa_aggref, dqa_attno, NULL, false); dref_tlist = lappend(dref_tlist, dqa_tle); } - ctx->dref_tlists[arg_attno-1] = dref_tlist; + ctx->dref_tlists[arg_attno - 1] = dref_tlist; - /* Make the "final" target for the DQA case, a reference to the - * DQA Aggref we just found or constructed. + /* + * Make the "final" target for the DQA case, a reference to the DQA + * Aggref we just found or constructed. */ - final_node = (Node*) makeVar(dqa_base_varno + arg_attno - 1, - dqa_attno, - exprType((Node*)arg_tle->expr), - exprTypmod((Node*)arg_tle->expr), - 0); - } - else /* Ordinary Aggref -or- DQA but ctx->use_dqa_pruning is off. */ - { - Aggref *pref; - Aggref *iref; - Aggref *fref; - + final_node = (Node *) makeVar(dqa_base_varno + arg_attno - 1, + dqa_attno, + exprType((Node *) arg_tle->expr), + exprTypmod((Node *) arg_tle->expr), + 0); + } + else /* Ordinary Aggref -or- DQA but + * ctx->use_dqa_pruning is off. */ + { + Aggref *pref; + Aggref *iref; + Aggref *fref; + /* * We may have seen an Aggref just like this one already. Look for - * the preliminary form of such in the preliminary Aggref target - * list to date. + * the preliminary form of such in the preliminary Aggref target list + * to date. */ - foreach( cell, ctx->prefs_tlist ) + foreach(cell, ctx->prefs_tlist) { - TargetEntry *tle = (TargetEntry*)lfirst(cell); - Aggref *ref = (Aggref*)tle->expr; - - /* Check similarity, avoid aggtype and aggstage - * (which we control) and don't bother with agglevelsup - * (which is always 0 here). + TargetEntry *tle = (TargetEntry *) lfirst(cell); + Aggref *ref = (Aggref *) tle->expr; + + /* + * Check similarity, avoid aggtype and aggstage (which we control) + * and don't bother with agglevelsup (which is always 0 here). 
*/ - if ( aggref->aggfnoid == ref->aggfnoid + if (aggref->aggfnoid == ref->aggfnoid && aggref->aggstar == ref->aggstar && aggref->aggdistinct == ref->aggdistinct && equal(aggref->args, ref->args) - && equal(aggref->aggfilter, ref->aggfilter) ) + && equal(aggref->aggfilter, ref->aggfilter)) { prelim_tle = tle; transtype = ref->aggtype; @@ -3685,42 +3807,45 @@ Node *split_aggref(Aggref *aggref, MppGroupContext *ctx) } /* - * If no existing preliminary Aggref target matched, add one that does. + * If no existing preliminary Aggref target matched, add one that + * does. */ - if ( prelim_tle == NULL ) + if (prelim_tle == NULL) { TargetEntry *final_tle; - Var *args; - + Var *args; + /* Get type information for the Aggref */ transtype = lookup_agg_transtype(aggref); - - /* Make a new preliminary Aggref wrapped as a new target entry. + + /* + * Make a new preliminary Aggref wrapped as a new target entry. * Like the input Aggref, the preliminary refers to the lower - * range. */ - pref = (Aggref*)copyObject(aggref); + * range. + */ + pref = (Aggref *) copyObject(aggref); pref->aggtype = transtype; pref->aggstage = AGGSTAGE_PARTIAL; attrno = 1 + list_length(ctx->prefs_tlist); - prelim_tle = makeTargetEntry((Expr*)pref, attrno, NULL, false); + prelim_tle = makeTargetEntry((Expr *) pref, attrno, NULL, false); prelim_tle->ressortgroupref = ctx->split_aggref_sortgroupref; ctx->prefs_tlist = lappend(ctx->prefs_tlist, prelim_tle); - - args = makeVar(ctx->final_varno, - ctx->numGroupCols - + (ctx->use_dqa_pruning ? 1 : 0) - + attrno, - transtype, -1, 0); - - if ( ctx->use_irefs_tlist ) + + args = makeVar(ctx->final_varno, + ctx->numGroupCols + + (ctx->use_dqa_pruning ? 1 : 0) + + attrno, + transtype, -1, 0); + + if (ctx->use_irefs_tlist) { TargetEntry *inter_tle; iref = makeNode(Aggref); iref->aggfnoid = pref->aggfnoid; iref->aggtype = transtype; - iref->args = list_make1((Expr*)copyObject(args)); + iref->args = list_make1((Expr *) copyObject(args)); /* FILTER is evaluated at the PARTIAL stage. */ iref->agglevelsup = 0; iref->aggstar = false; @@ -3728,31 +3853,31 @@ Node *split_aggref(Aggref *aggref, MppGroupContext *ctx) iref->aggstage = AGGSTAGE_INTERMEDIATE; iref->location = -1; - inter_tle = makeTargetEntry((Expr*)iref, attrno, NULL, false); + inter_tle = makeTargetEntry((Expr *) iref, attrno, NULL, false); inter_tle->ressortgroupref = ctx->split_aggref_sortgroupref; ctx->irefs_tlist = lappend(ctx->irefs_tlist, inter_tle); } /* Make a new final Aggref. */ fref = makeNode(Aggref); - + fref->aggfnoid = aggref->aggfnoid; fref->aggtype = aggref->aggtype; - fref->args = list_make1((Expr*)args); + fref->args = list_make1((Expr *) args); /* FILTER is evaluated at the PARTIAL stage. 
*/ fref->agglevelsup = 0; fref->aggstar = false; - fref->aggdistinct = false; /* handled in preliminary aggregation */ + fref->aggdistinct = false; /* handled in preliminary aggregation */ fref->aggstage = AGGSTAGE_FINAL; fref->location = -1; - final_tle = makeTargetEntry((Expr*)fref, attrno, NULL, false); + final_tle = makeTargetEntry((Expr *) fref, attrno, NULL, false); final_tle->ressortgroupref = ctx->split_aggref_sortgroupref; ctx->frefs_tlist = lappend(ctx->frefs_tlist, final_tle); } - final_node = (Node*)makeVar(ref_varno, attrno, aggref->aggtype, -1, 0); + final_node = (Node *) makeVar(ref_varno, attrno, aggref->aggtype, -1, 0); } - + return final_node; } @@ -3762,31 +3887,32 @@ Node *split_aggref(Aggref *aggref, MppGroupContext *ctx) * Make a targetlist similar to the given length n tlist but consisting of * simple Var nodes with the given varno and varattno in offset + [1..N]. */ -List *make_vars_tlist(List *tlist, Index varno, AttrNumber offset) +List * +make_vars_tlist(List *tlist, Index varno, AttrNumber offset) { - List *new_tlist = NIL; - AttrNumber attno = offset; - ListCell *lc; - - foreach (lc, tlist) + List *new_tlist = NIL; + AttrNumber attno = offset; + ListCell *lc; + + foreach(lc, tlist) { - Var *new_var; + Var *new_var; TargetEntry *new_tle; - TargetEntry *tle = (TargetEntry*)lfirst(lc); + TargetEntry *tle = (TargetEntry *) lfirst(lc); attno++; - + new_var = makeVar(varno, attno, - exprType((Node*)tle->expr), - exprTypmod((Node*)tle->expr), 0); - - new_tle = makeTargetEntry((Expr*)new_var, - attno, /* resno always matches attnr */ - (tle->resname == NULL) ? NULL : pstrdup(tle->resname), - false); + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), 0); + + new_tle = makeTargetEntry((Expr *) new_var, + attno, /* resno always matches attnr */ + (tle->resname == NULL) ? NULL : pstrdup(tle->resname), + false); new_tle->ressortgroupref = tle->ressortgroupref; - + new_tlist = lappend(new_tlist, new_tle); } return new_tlist; @@ -3796,23 +3922,25 @@ List *make_vars_tlist(List *tlist, Index varno, AttrNumber offset) * * Concatenates tlist2 to the end of tlist1 adjusting the resno values * of tlist2 so that the resulting entries have resno = position+1. - * The resno values of tlist1 must be dense from 1 to the length of + * The resno values of tlist1 must be dense from 1 to the length of * the list. (They are sequential by position, though this is not * strictly required. - * + * * May modify tlist1 in place (to adjust last link and length). Does not * modify tlist2, but the result shares structure below the TargetEntry * nodes. */ -List *seq_tlist_concat(List *tlist1, List *tlist2) +List * +seq_tlist_concat(List *tlist1, List *tlist2) { - ListCell *lc; - AttrNumber high_attno = list_length(tlist1); - - foreach (lc, tlist2) + ListCell *lc; + AttrNumber high_attno = list_length(tlist1); + + foreach(lc, tlist2) { - TargetEntry *tle= (TargetEntry*)lfirst(lc); - TargetEntry *new_tle = (TargetEntry*)makeNode(TargetEntry); + TargetEntry *tle = (TargetEntry *) lfirst(lc); + TargetEntry *new_tle = (TargetEntry *) makeNode(TargetEntry); + memcpy(new_tle, tle, sizeof(TargetEntry)); new_tle->resno = ++high_attno; tlist1 = lappend(tlist1, new_tle); @@ -3820,70 +3948,73 @@ List *seq_tlist_concat(List *tlist1, List *tlist2) return tlist1; } -/* Function finalize_split_expr +/* Function finalize_split_expr * * Note: Only called on the top of the "join" tree, so all D_i are * included in attribute offset calculations. 
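
finalize_split_expr resolves pseudo Vars against the flat top-level target list laid out as (G, D0, ..., Dn-1, F), where dqa_offsets[] holds running totals of the section lengths as computed in deconstruct_agg_info. A standalone sketch of that arithmetic (the section lengths are made up):

    #include <stdio.h>

    int
    main(void)
    {
        int numGroupCols = 2;
        int dref_lens[] = {3, 1};   /* lengths of the D0 and D1 sections */
        int numDistinctCols = 2;
        int dqa_offsets[3];

        dqa_offsets[0] = numGroupCols;
        for (int i = 0; i < numDistinctCols; i++)
            dqa_offsets[i + 1] = dqa_offsets[i] + dref_lens[i];

        /* a pseudo Var in DQA section i lands at dqa_offsets[i] + varattno */
        int i = 1, varattno = 1;
        printf("flat attno = %d\n", dqa_offsets[i] + varattno); /* 5 + 1 = 6 */
        return 0;
    }
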
*/ -Node *finalize_split_expr(Node *expr, MppGroupContext *ctx) +Node * +finalize_split_expr(Node *expr, MppGroupContext *ctx) { return finalize_split_expr_mutator(expr, ctx); } /* Mutator subroutine for finalize_split_expr() replaces pseudo Var nodes * produced by split_aggref() with the similarly typed expression found in - * the top-level targetlist, ctx->top_tlist, being finalized. + * the top-level targetlist, ctx->top_tlist, being finalized. * - * For example, a pseudo Var node that represents the 3rd DQA for the + * For example, a pseudo Var node that represents the 3rd DQA for the * 2nd DQA argument will be replaced by the targetlist expression that - * corresponds to that DQA. + * corresponds to that DQA. */ -Node* finalize_split_expr_mutator(Node *node, MppGroupContext *ctx) +Node * +finalize_split_expr_mutator(Node *node, MppGroupContext *ctx) { if (node == NULL) return NULL; - + if (IsA(node, Var)) { - AttrNumber attrno=(AttrNumber)0; + AttrNumber attrno = (AttrNumber) 0; TargetEntry *tle; - - Var *pseudoVar = (Var*)node; - - if ( pseudoVar->varno == grp_varno ) - { + + Var *pseudoVar = (Var *) node; + + if (pseudoVar->varno == grp_varno) + { attrno = pseudoVar->varattno; } - else if ( pseudoVar->varno == ref_varno ) + else if (pseudoVar->varno == ref_varno) { - if ( ctx->use_dqa_pruning ) + if (ctx->use_dqa_pruning) { attrno = ctx->dqa_offsets[ctx->numDistinctCols] - + pseudoVar->varattno; + + pseudoVar->varattno; } else { attrno = ctx->numGroupCols + pseudoVar->varattno; } } - else if ( pseudoVar->varno >= dqa_base_varno && ctx->use_dqa_pruning ) + else if (pseudoVar->varno >= dqa_base_varno && ctx->use_dqa_pruning) { - int i = pseudoVar->varno - dqa_base_varno; + int i = pseudoVar->varno - dqa_base_varno; + attrno = ctx->dqa_offsets[i] + pseudoVar->varattno; } else { - elog(ERROR,"Unexpected failure of multi-phase aggregation planning"); + elog(ERROR, "Unexpected failure of multi-phase aggregation planning"); } - - tle = (TargetEntry*) list_nth(ctx->top_tlist, attrno - 1); - - return (Node*) tle->expr; + + tle = (TargetEntry *) list_nth(ctx->top_tlist, attrno - 1); + + return (Node *) tle->expr; } - - return expression_tree_mutator(node, + + return expression_tree_mutator(node, finalize_split_expr_mutator, - (void*)ctx); + (void *) ctx); } @@ -3918,26 +4049,31 @@ lookup_agg_transtype(Aggref *aggref) * is to be modified, we must call this method to ensure that the scatter clause * is kept in sync with the new targetlist. 
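 * For example (a hypothetical query, for illustration only): if the input
 * uses GPDB's SCATTER BY (a) and the targetlist is rebuilt, the scatter
 * expression for "a" must be swapped for the matching new entry; the loop
 * below finds the old entry with tlist_member() and the new one with a
 * resno-based list_nth() on the new targetlist.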
*/ -void UpdateScatterClause(Query *query, List *newtlist) +void +UpdateScatterClause(Query *query, List *newtlist) { Assert(query); Assert(query->targetList); Assert(newtlist); if (query->scatterClause - && list_nth(query->scatterClause, 0) != NULL /* scattered randomly */ - ) + && list_nth(query->scatterClause, 0) != NULL /* scattered randomly */ + ) { Assert(list_length(query->targetList) == list_length(newtlist)); - List *scatterClause = NIL; - ListCell *lc = NULL; - foreach (lc, query->scatterClause) + List *scatterClause = NIL; + ListCell *lc = NULL; + + foreach(lc, query->scatterClause) { - Expr *o = (Expr *) lfirst(lc); + Expr *o = (Expr *) lfirst(lc); + Assert(o); TargetEntry *tle = tlist_member((Node *) o, query->targetList); + Assert(tle); TargetEntry *ntle = list_nth(newtlist, tle->resno - 1); + scatterClause = lappend(scatterClause, copyObject(ntle->expr)); } query->scatterClause = scatterClause; @@ -3984,59 +4120,65 @@ add_second_stage_agg(PlannerInfo *root, List *newrtable; RangeTblEntry *newrte; RangeTblRef *newrtref; - Plan *agg_node; + Plan *agg_node; /* * Add a SubqueryScan node to renumber the range of the query. * * The result of the preliminary aggregation (represented by lower_tlist) - * may contain targets with no representatives in the range of its outer - * relation. We resolve this by treating the preliminary aggregation as - * a subquery. + * may contain targets with no representatives in the range of its outer + * relation. We resolve this by treating the preliminary aggregation as a + * subquery. + * + * However, this breaks the correspondence between the Plan tree and the + * Query tree that is assumed by the later call to set_plan_references as + * well as by the deparse processing used (e.g.) in EXPLAIN. * - * However, this breaks the correspondence between the Plan tree and - * the Query tree that is assumed by the later call to set_plan_references - * as well as by the deparse processing used (e.g.) in EXPLAIN. - * * So we also push the Query node from the root structure down into a new * subquery RTE and scribble over the original Query node to make it into * a simple SELECT * FROM a Subquery RTE. * - * Note that the Agg phase we add below will refer to the attributes of - * the result of this new SubqueryScan plan node. It is up to the caller + * Note that the Agg phase we add below will refer to the attributes of + * the result of this new SubqueryScan plan node. It is up to the caller * to set up upper_tlist and upper_qual accordingly. */ - - /* Flat-copy the root query into a newly allocated Query node and adjust - * its target list and having qual to match the lower (existing) Agg - * plan we're about to make into a SubqueryScan. + + /* + * Flat-copy the root query into a newly allocated Query node and adjust + * its target list and having qual to match the lower (existing) Agg plan + * we're about to make into a SubqueryScan. */ subquery = copyObject(parse); - + subquery->targetList = copyObject(lower_tlist); subquery->havingQual = NULL; - - /* Subquery attributes shouldn't be marked as junk, else they'll be - * skipped by addRangeTableEntryForSubquery. */ + + /* + * Subquery attributes shouldn't be marked as junk, else they'll be + * skipped by addRangeTableEntryForSubquery. 
+ */ { - ListCell *cell; - - foreach ( cell, subquery->targetList ) + ListCell *cell; + + foreach(cell, subquery->targetList) { - TargetEntry *tle = (TargetEntry *)lfirst(cell); + TargetEntry *tle = (TargetEntry *) lfirst(cell); + tle->resjunk = false; - if ( tle->resname == NULL ) + if (tle->resname == NULL) { - if ( use_root && IsA(tle->expr, Var) ) + if (use_root && IsA(tle->expr, Var)) { - Var *var = (Var*)tle->expr; + Var *var = (Var *) tle->expr; RangeTblEntry *rte = rt_fetch(var->varno, root->parse->rtable); + tle->resname = pstrdup(get_rte_attribute_name(rte, var->varattno)); } else { const char *fmt = "unnamed_attr_%d"; - char buf[32]; /* big enough for fmt */ + char buf[32]; /* big enough for fmt */ + sprintf(buf, fmt, tle->resno); tle->resname = pstrdup(buf); } @@ -4045,8 +4187,8 @@ add_second_stage_agg(PlannerInfo *root, } /* - * Ensure that the plan we're going to attach to the subquery scan has - * all the parameter fields figured out. + * Ensure that the plan we're going to attach to the subquery scan has all + * the parameter fields figured out. */ SS_finalize_plan(root, result_plan, false); @@ -4057,9 +4199,11 @@ add_second_stage_agg(PlannerInfo *root, TRUE); newrtable = list_make1(newrte); - /* Modify the root query in place to look like its range table is - * a simple Subquery. */ - parse->querySource = QSRC_PLANNER; /* but remember it's really ours */ + /* + * Modify the root query in place to look like its range table is a simple + * Subquery. + */ + parse->querySource = QSRC_PLANNER; /* but remember it's really ours */ parse->rtable = newrtable; parse->jointree = makeNode(FromExpr); newrtref = makeNode(RangeTblRef); @@ -4068,51 +4212,52 @@ add_second_stage_agg(PlannerInfo *root, parse->jointree->quals = NULL; parse->rowMarks = NIL; - /* uses parse->targetList to derive the portal's tupDesc, - * so when use_root is true, the caller owns the responsibility to make - * sure it ends up in an appropriate form at the end of planning. + /* + * uses parse->targetList to derive the portal's tupDesc, so + * when use_root is true, the caller owns the responsibility to make sure + * it ends up in an appropriate form at the end of planning. */ - if ( use_root ) + if (use_root) { if (adjust_scatter) { UpdateScatterClause(parse, upper_tlist); } - parse->targetList = copyObject(upper_tlist); /* Match range. */ + parse->targetList = copyObject(upper_tlist); /* Match range. */ } - result_plan = add_subqueryscan(root, p_current_pathkeys, + result_plan = add_subqueryscan(root, p_current_pathkeys, 1, subquery, result_plan); /* Add an Agg node */ /* convert current_numGroups to long int */ - long lNumGroups = (long) Min(numGroups, (double) LONG_MAX); - - agg_node = (Plan *)make_agg(root, - upper_tlist, - upper_qual, - aggstrategy, false, - numGroupCols, - prelimGroupColIdx, - prelimGroupOperators, - lNumGroups, - num_nullcols, - input_grouping, - grouping, - rollup_gs_times, - numAggs, - transSpace, - result_plan); + long lNumGroups = (long) Min(numGroups, (double) LONG_MAX); + + agg_node = (Plan *) make_agg(root, + upper_tlist, + upper_qual, + aggstrategy, false, + numGroupCols, + prelimGroupColIdx, + prelimGroupOperators, + lNumGroups, + num_nullcols, + input_grouping, + grouping, + rollup_gs_times, + numAggs, + transSpace, + result_plan); /* * Agg will not change the sort order unless it is hashed. */ agg_node->flow = pull_up_Flow(agg_node, agg_node->lefttree); - /* + /* * Since the rtable has changed, we had better recreate a RelOptInfo entry - * for it. 
Make a copy of the groupClause since freeing the arrays can pull - * out references still in use from underneath it. + * for it. Make a copy of the groupClause since freeing the arrays can + * pull out references still in use from underneath it. */ root->parse->groupClause = copyObject(root->parse->groupClause); @@ -4127,30 +4272,31 @@ add_second_stage_agg(PlannerInfo *root, } -/* - * Add a SubqueryScan node to the input plan and maintain the given +/* + * Add a SubqueryScan node to the input plan and maintain the given * pathkeys by making adjustments to them and to the equivalence class * information in root. * * Note that submerging a plan into a subquery scan will require changes - * to the range table and to any expressions above the new scan node. + * to the range table and to any expressions above the new scan node. * This is the caller's responsibility since the nature of the changes * depends on the context in which the subquery is used. */ -Plan* add_subqueryscan(PlannerInfo* root, List **p_pathkeys, - Index varno, Query *subquery, Plan *subplan) +Plan * +add_subqueryscan(PlannerInfo *root, List **p_pathkeys, + Index varno, Query *subquery, Plan *subplan) { - List *subplan_tlist; - int *resno_map; + List *subplan_tlist; + int *resno_map; subplan_tlist = generate_subquery_tlist(varno, subquery->targetList, - false, &resno_map); - - subplan = (Plan*)make_subqueryscan(root, subplan_tlist, - NIL, - varno, /* scanrelid (= varno) */ - subplan, - subquery->rtable); + false, &resno_map); + + subplan = (Plan *) make_subqueryscan(root, subplan_tlist, + NIL, + varno, /* scanrelid (= varno) */ + subplan, + subquery->rtable); mark_passthru_locus(subplan, true, true); @@ -4161,7 +4307,7 @@ Plan* add_subqueryscan(PlannerInfo* root, List **p_pathkeys, } pfree(resno_map); - + return subplan; } @@ -4193,13 +4339,13 @@ hash_safe_type(Oid type) * sorting_prefixes_grouping - is the result ordered on a grouping key prefix? * * If so, then we might prefer a pre-ordered grouping result to one that would - * need sorting after the fact. + * need sorting after the fact. */ -static bool +static bool sorting_prefixes_grouping(PlannerInfo *root) { return root->sort_pathkeys != NIL - && pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys); + && pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys); } /* @@ -4218,9 +4364,10 @@ gp_hash_safe_grouping(PlannerInfo *root) &grouptles, &groupops); foreach(glc, grouptles) { - TargetEntry *tle = (TargetEntry *)lfirst(glc); - bool canhash; - canhash = isGreenplumDbHashable(exprType((Node *)tle->expr)); + TargetEntry *tle = (TargetEntry *) lfirst(glc); + bool canhash; + + canhash = isGreenplumDbHashable(exprType((Node *) tle->expr)); if (!canhash) return false; } @@ -4256,7 +4403,7 @@ reconstruct_pathkeys(PlannerInfo *root, List *pathkeys, int *resno_map, { TargetEntry *new_tle; EquivalenceClass *new_eclass; - PathKey *new_pathkey; + PathKey *new_pathkey; new_tle = get_tle_by_resno(new_tlist, resno_map[tle->resno - 1]); if (!new_tle) @@ -4274,7 +4421,7 @@ reconstruct_pathkeys(PlannerInfo *root, List *pathkeys, int *resno_map, } } if (!found) - { + { new_pathkeys = lappend(new_pathkeys, copyObject(pathkey)); } } @@ -4286,108 +4433,111 @@ reconstruct_pathkeys(PlannerInfo *root, List *pathkeys, int *resno_map, /* cost_common_agg -- Estimate the cost of executing the common subquery - * for an aggregation plan. Assumes that the AggPlanInfo contains the + * for an aggregation plan. Assumes that the AggPlanInfo contains the * correct Path as input_path. 
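 * (Note the guess below: the input path's width does not describe
 * ctx->sub_tlist, so the width estimate is capped at 32 bytes per
 * sub_tlist entry.)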
* * Returns the total cost and, more importantly, populates the given * dummy Plan node with cost information */ -Cost cost_common_agg(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info, Plan *dummy) +Cost +cost_common_agg(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info, Plan *dummy) { - QualCost tlist_cost; - Cost startup_cost; - Cost total_cost; - double input_rows; - int input_width; - int n; - + QualCost tlist_cost; + Cost startup_cost; + Cost total_cost; + double input_rows; + int input_width; + int n; + Assert(dummy != NULL); - + input_rows = info->input_path->parent->rows; input_width = info->input_path->parent->width; /* Path input width isn't correct for ctx->sub_tlist so we guess. */ n = 32 * list_length(ctx->sub_tlist); - input_width = ( input_width < n )? input_width: n; + input_width = (input_width < n) ? input_width : n; /* Estimate cost of evaluation of the sub_tlist. */ cost_qual_eval(&tlist_cost, ctx->sub_tlist, root); startup_cost = info->input_path->startup_cost + tlist_cost.startup; total_cost = info->input_path->total_cost + tlist_cost.startup + tlist_cost.per_tuple * input_rows; - + memset(dummy, 0, sizeof(Plan)); dummy->type = info->input_path->type; dummy->startup_cost = startup_cost; dummy->total_cost = total_cost; dummy->plan_rows = input_rows; dummy->plan_width = input_width; - + return dummy->total_cost; } -/* Function cost_1phase_aggregation +/* Function cost_1phase_aggregation * * May be used for 1 phase aggregation costing with or without DQAs. * Corresponds to make_one_stage_agg_plan and must be maintained in sync * with it. */ -Cost cost_1phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) +Cost +cost_1phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) { - Plan input_dummy; - bool is_sorted; - long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : - (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : - (long)*(ctx->p_dNumGroups); - + Plan input_dummy; + bool is_sorted; + long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : + (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : + (long) *(ctx->p_dNumGroups); + cost_common_agg(root, ctx, info, &input_dummy); - + is_sorted = pathkeys_contained_in(root->group_pathkeys, info->input_path->pathkeys); - + /* Collocation cost (Motion). */ - switch ( info->group_prep ) - { - case MPP_GRP_PREP_HASH_GROUPS: - is_sorted = false; - input_dummy.total_cost += - incremental_motion_cost(input_dummy.plan_rows, - input_dummy.plan_rows); - break; - case MPP_GRP_PREP_FOCUS_QE: - case MPP_GRP_PREP_FOCUS_QD: - input_dummy.total_cost += - incremental_motion_cost(input_dummy.plan_rows, - input_dummy.plan_rows * root->config->cdbpath_segments); - input_dummy.plan_rows = input_dummy.plan_rows * root->config->cdbpath_segments; - break; - default: - break; - } - - /* NB: We don't need to calculate grouping extension costs here because - * grouping extensions are planned elsewhere. 
+	switch (info->group_prep)
+	{
+		case MPP_GRP_PREP_HASH_GROUPS:
+			is_sorted = false;
+			input_dummy.total_cost +=
+				incremental_motion_cost(input_dummy.plan_rows,
+										input_dummy.plan_rows);
+			break;
+		case MPP_GRP_PREP_FOCUS_QE:
+		case MPP_GRP_PREP_FOCUS_QD:
+			input_dummy.total_cost +=
+				incremental_motion_cost(input_dummy.plan_rows,
+										input_dummy.plan_rows * root->config->cdbpath_segments);
+			input_dummy.plan_rows = input_dummy.plan_rows * root->config->cdbpath_segments;
+			break;
+		default:
+			break;
+	}
+
+	/*
+	 * NB: We don't need to calculate grouping extension costs here because
+	 * grouping extensions are planned elsewhere.
 	 */
-	if ( ctx->use_hashed_grouping )
+	if (ctx->use_hashed_grouping)
 	{
 		/* HashAgg */
-		Assert( ctx->numDistinctCols == 0 );
-		
-		add_agg_cost(NULL, &input_dummy, 
-					 ctx->sub_tlist, (List*)root->parse->havingQual,
-					 AGG_HASHED, false, 
+		Assert(ctx->numDistinctCols == 0);
+
+		add_agg_cost(NULL, &input_dummy,
+					 ctx->sub_tlist, (List *) root->parse->havingQual,
+					 AGG_HASHED, false,
 					 ctx->numGroupCols, ctx->groupColIdx,
 					 numGroups, 0, ctx->agg_counts->numAggs,
 					 ctx->agg_counts->transitionSpace);
 	}
-	else 
+	else
 	{
-		if ( ctx->numGroupCols == 0 )
+		if (ctx->numGroupCols == 0)
 		{
 			/* PlainAgg */
-			add_agg_cost(NULL, &input_dummy, 
-						 ctx->sub_tlist, (List*)root->parse->havingQual,
-						 AGG_PLAIN, false, 
+			add_agg_cost(NULL, &input_dummy,
+						 ctx->sub_tlist, (List *) root->parse->havingQual,
+						 AGG_PLAIN, false,
 						 0, NULL, 1, 0,
 						 ctx->agg_counts->numAggs,
 						 ctx->agg_counts->transitionSpace);
@@ -4395,107 +4545,111 @@ Cost cost_1phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf
 		else
 		{
 			/* GroupAgg */
-			if ( ! is_sorted )
+			if (!is_sorted)
 			{
 				add_sort_cost(NULL, &input_dummy, ctx->numGroupCols, NULL, NULL, -1.0);
 			}
-			add_agg_cost(NULL, &input_dummy, 
-						 ctx->sub_tlist, (List*)root->parse->havingQual,
-						 AGG_SORTED, false, 
+			add_agg_cost(NULL, &input_dummy,
+						 ctx->sub_tlist, (List *) root->parse->havingQual,
+						 AGG_SORTED, false,
 						 ctx->numGroupCols, ctx->groupColIdx,
 						 numGroups, 0, ctx->agg_counts->numAggs,
 						 ctx->agg_counts->transitionSpace);
 		}
-		
-		/* Beware: AGG_PLAIN and AGG_GROUPED may be used with DQAs, however,
-		 * the function cost_agg doesn't distinguish DQAs so it
-		 * consistently under estimates the cost in these cases.
+
+		/*
+		 * Beware: AGG_PLAIN and AGG_GROUPED may be used with DQAs, however,
+		 * the function cost_agg doesn't distinguish DQAs so it consistently
+		 * underestimates the cost in these cases.
 		 */
-		if ( ctx->numDistinctCols > 0 )
+		if (ctx->numDistinctCols > 0)
 		{
-			Path path_dummy;
-			double ngrps = *(ctx->p_dNumGroups);
-			double nsorts = ngrps * ctx->numDistinctCols;
-			double avgsize = input_dummy.plan_rows / ngrps;
+			Path		path_dummy;
+			double		ngrps = *(ctx->p_dNumGroups);
+			double		nsorts = ngrps * ctx->numDistinctCols;
+			double		avgsize = input_dummy.plan_rows / ngrps;
+
 			cost_sort(&path_dummy, NULL, NIL, 0.0, avgsize, 32, -1);
 			input_dummy.total_cost += nsorts * path_dummy.total_cost;
 		}
 	}
-	info->plan_cost = root->config->gp_eager_one_phase_agg ? (Cost)0.0 : input_dummy.total_cost;
+	info->plan_cost = root->config->gp_eager_one_phase_agg ? (Cost) 0.0 : input_dummy.total_cost;
 	info->valid = true;
 	info->join_strategy = DqaJoinNone;
 	info->use_sharing = false;
-	
+
 	info->plan_cost *= gp_coefficient_1phase_agg;
 	return info->plan_cost;
 }
 
-/* Function cost_2phase_aggregation 
+/* Function cost_2phase_aggregation
  *
  * May be used for 2 phase costing with 0 or 1 DQAs.
 * Corresponds to make_two_stage_agg_plan and must be maintained in sync
* with it. 
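 *
 * The shape being costed is, in sketch,
 *
 *     FinalAgg <- Motion (redistribute or gather) <- PrelimAgg <- input
 *
 * with Sort steps charged around either Agg where hashed grouping is
 * unavailable.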
*/ -Cost cost_2phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) +Cost +cost_2phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) { - Plan input_dummy; - bool is_sorted; - long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : - (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : - (long)*(ctx->p_dNumGroups); - double input_rows; - double streaming_fudge = 1.3; - + Plan input_dummy; + bool is_sorted; + long numGroups = (*(ctx->p_dNumGroups) < 0) ? 0 : + (*(ctx->p_dNumGroups) > LONG_MAX) ? LONG_MAX : + (long) *(ctx->p_dNumGroups); + double input_rows; + double streaming_fudge = 1.3; + cost_common_agg(root, ctx, info, &input_dummy); input_rows = input_dummy.plan_rows; - + is_sorted = pathkeys_contained_in(root->group_pathkeys, info->input_path->pathkeys); - + /* Precondition Input */ - - switch ( info->group_prep ) - { - case MPP_GRP_PREP_HASH_DISTINCT: - input_dummy.total_cost += - incremental_motion_cost(input_dummy.plan_rows, - input_dummy.plan_rows); - is_sorted = false; - break; - case MPP_GRP_PREP_NONE: - break; - default: - ereport(ERROR, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected call for two-stage aggregation"))); - break; /* Never */ + + switch (info->group_prep) + { + case MPP_GRP_PREP_HASH_DISTINCT: + input_dummy.total_cost += + incremental_motion_cost(input_dummy.plan_rows, + input_dummy.plan_rows); + is_sorted = false; + break; + case MPP_GRP_PREP_NONE: + break; + default: + ereport(ERROR, + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected call for two-stage aggregation"))); + break; /* Never */ } - + /* Preliminary Aggregation */ - - if ( ctx->use_hashed_grouping ) + + if (ctx->use_hashed_grouping) { - /* Preliminary HashAgg*/ - add_agg_cost(NULL, &input_dummy, - NIL, NIL, /* Don't know preliminary tlist, qual IS NIL */ + /* Preliminary HashAgg */ + add_agg_cost(NULL, &input_dummy, + NIL, NIL, /* Don't know preliminary tlist, qual IS NIL */ AGG_HASHED, root->config->gp_hashagg_streambottom, ctx->numGroupCols, ctx->groupColIdx, numGroups, 0, ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace); - - if ( gp_hashagg_streambottom ) + + if (gp_hashagg_streambottom) { input_dummy.plan_rows *= streaming_fudge; } } else { - if ( ctx->numGroupCols == 0 ) + if (ctx->numGroupCols == 0) { - /* Preliminary PlainAgg*/ - add_agg_cost(NULL, &input_dummy, - NIL, NIL, /* Don't know preliminary tlist, qual IS NIL */ - AGG_PLAIN, false, + /* Preliminary PlainAgg */ + add_agg_cost(NULL, &input_dummy, + NIL, NIL, /* Don't know preliminary tlist, qual IS + * NIL */ + AGG_PLAIN, false, 0, NULL, 1, 0, ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace); @@ -4503,77 +4657,80 @@ Cost cost_2phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf else { /* Preliminary GroupAgg */ - if ( ! is_sorted ) + if (!is_sorted) { add_sort_cost(NULL, &input_dummy, ctx->numGroupCols, NULL, NULL, -1.0); } - add_agg_cost(NULL, &input_dummy, - NIL, NIL, /* Don't know preliminary tlist, qual IS NIL */ - AGG_SORTED, false, + add_agg_cost(NULL, &input_dummy, + NIL, NIL, /* Don't know preliminary tlist, qual IS + * NIL */ + AGG_SORTED, false, ctx->numGroupCols, ctx->groupColIdx, numGroups, 0, ctx->agg_counts->numAggs, ctx->agg_counts->transitionSpace); } - /* Beware: AGG_PLAIN and AGG_GROUPED may be used with DQAs, however, - * the function cost_agg doesn't distinguish DQAs so it - * consistently under estimates the cost in these cases. 
+
+	/*
+	 * Beware: AGG_PLAIN and AGG_GROUPED may be used with DQAs, however,
+	 * the function cost_agg doesn't distinguish DQAs so it consistently
+	 * underestimates the cost in these cases.
 	 */
-	if ( ctx->numDistinctCols > 0 )
+	if (ctx->numDistinctCols > 0)
 	{
-		Path path_dummy;
-		Cost run_cost;
-		double ngrps = *(ctx->p_dNumGroups);
-		double avgsize = input_rows / ngrps;
-		
+		Path		path_dummy;
+		Cost		run_cost;
+		double		ngrps = *(ctx->p_dNumGroups);
+		double		avgsize = input_rows / ngrps;
+
 		Assert(ctx->numDistinctCols == 1);
-		
+
 		cost_sort(&path_dummy, NULL, NIL, input_dummy.total_cost, avgsize, 32, -1.0);
 		run_cost = path_dummy.total_cost - path_dummy.startup_cost;
 		input_dummy.total_cost += path_dummy.startup_cost + ngrps * run_cost;
 	}
-	
+
 	}
-	
+
 	/* Collocate groups */
-	switch ( info->group_type )
+	switch (info->group_type)
 	{
-	case MPP_GRP_TYPE_GROUPED_2STAGE: /* Redistribute */
-		input_dummy.total_cost += 
+		case MPP_GRP_TYPE_GROUPED_2STAGE:	/* Redistribute */
+			input_dummy.total_cost +=
 			incremental_motion_cost(input_dummy.plan_rows,
 									input_dummy.plan_rows);
 			break;
 		case MPP_GRP_TYPE_PLAIN_2STAGE: /* Gather */
-		input_dummy.total_cost += 
+			input_dummy.total_cost +=
 			incremental_motion_cost(input_dummy.plan_rows,
-									input_dummy.plan_rows *root->config->cdbpath_segments);
+									input_dummy.plan_rows * root->config->cdbpath_segments);
 			break;
 		default:
-			ereport(ERROR,
-				(errcode(ERRCODE_CDB_INTERNAL_ERROR),
-				errmsg("unexpected call for two-stage aggregation")));
-			break; /* Never */
+			ereport(ERROR,
+					(errcode(ERRCODE_CDB_INTERNAL_ERROR),
+					 errmsg("unexpected call for two-stage aggregation")));
+			break;				/* Never */
 	}
 
 	/* Final Aggregation */
-	if ( ctx->use_hashed_grouping )
+	if (ctx->use_hashed_grouping)
 	{
-		/* HashAgg*/
-		add_agg_cost(NULL, &input_dummy, 
-					 NIL, NIL, /* Don't know tlist or qual */
-					 AGG_HASHED, false, 
+		/* HashAgg */
+		add_agg_cost(NULL, &input_dummy,
+					 NIL, NIL,	/* Don't know tlist or qual */
+					 AGG_HASHED, false,
 					 ctx->numGroupCols, ctx->groupColIdx,
 					 numGroups, 0, ctx->agg_counts->numAggs,
 					 ctx->agg_counts->transitionSpace);
 	}
 	else
 	{
-		if ( ctx->numGroupCols == 0 )
+		if (ctx->numGroupCols == 0)
 		{
-			/* PlainAgg*/
-			add_agg_cost(NULL, &input_dummy, 
-						 NIL, NIL, /* Don't know tlist or qual */
-						 AGG_PLAIN, false, 
+			/* PlainAgg */
+			add_agg_cost(NULL, &input_dummy,
+						 NIL, NIL,	/* Don't know tlist or qual */
+						 AGG_PLAIN, false,
 						 0, NULL, 1, 0,
 						 ctx->agg_counts->numAggs,
 						 ctx->agg_counts->transitionSpace);
@@ -4582,16 +4739,16 @@ Cost cost_2phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf
 		{
 			/* GroupAgg */
 			add_sort_cost(NULL, &input_dummy, ctx->numGroupCols, NULL, NULL, -1.0);
-			add_agg_cost(NULL, &input_dummy, 
-						 NIL, NIL, /* Don't know tlist or qual */
-						 AGG_SORTED, false, 
+			add_agg_cost(NULL, &input_dummy,
+						 NIL, NIL,	/* Don't know tlist or qual */
+						 AGG_SORTED, false,
 						 ctx->numGroupCols, ctx->groupColIdx,
 						 numGroups, 0, ctx->agg_counts->numAggs,
 						 ctx->agg_counts->transitionSpace);
 		}
 	}
 
-	info->plan_cost = root->config->gp_eager_two_phase_agg ? (Cost)0.0 : input_dummy.total_cost;
+	info->plan_cost = root->config->gp_eager_two_phase_agg ? (Cost) 0.0 : input_dummy.total_cost;
 	info->valid = true;
 	info->join_strategy = DqaJoinNone;
 	info->use_sharing = false;
@@ -4610,41 +4767,43 @@ Cost cost_2phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf
 * This function assumes the enviroment established by planDqaJoinOrder()
 * and set_coplan_strategies(). 
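 *
 * Roughly, the total assembled below is
 *
 *     input cost (shared, or re-executed once per DQA)
 *         + the per-DQA coplan costs
 *         + the cost of joining the coplan results (none/cross/hash/merge)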
*/ -Cost cost_3phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) +Cost +cost_3phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInfo *info) { - Plan dummy; - Cost total_cost; - Cost share_cost; - Cost reexec_cost; - int i; - bool use_sharing = false; + Plan dummy; + Cost total_cost; + Cost share_cost; + Cost reexec_cost; + int i; + bool use_sharing = false; DqaJoinStrategy join_strategy = DqaJoinUndefined; - Cost input_cost = 0.0; - Cost cost_coplan_cheapest = 0.0; - Cost cost_coplan_sorted = 0.0; - Cost cost_hashjoin = 0.0; - Cost cost_mergejoin = 0.0; - Cost cost_crossjoin = 0.0; - + Cost input_cost = 0.0; + Cost cost_coplan_cheapest = 0.0; + Cost cost_coplan_sorted = 0.0; + Cost cost_hashjoin = 0.0; + Cost cost_mergejoin = 0.0; + Cost cost_crossjoin = 0.0; + cost_common_agg(root, ctx, info, &dummy); - + total_cost = dummy.total_cost; - - Assert( ctx->numDistinctCols == list_length(ctx->agg_counts->dqaArgs) ); - /* Note that join order has already been established by an earlier - * call to planDqaJoinOrder. Here we just use that order, but we - * need to decide on the join type. + Assert(ctx->numDistinctCols == list_length(ctx->agg_counts->dqaArgs)); + + /* + * Note that join order has already been established by an earlier call to + * planDqaJoinOrder. Here we just use that order, but we need to decide + * on the join type. */ - if ( list_length(ctx->agg_counts->dqaArgs) < 2 ) + if (list_length(ctx->agg_counts->dqaArgs) < 2) { join_strategy = DqaJoinNone; } - else if ( ctx->numGroupCols == 0 ) + else if (ctx->numGroupCols == 0) { join_strategy = DqaJoinCross; } - else if ( sorting_prefixes_grouping(root) ) + else if (sorting_prefixes_grouping(root)) { /* Cheapest of merge join of sorted input or sorted hash join */ join_strategy = DqaJoinSorted; @@ -4654,93 +4813,94 @@ Cost cost_3phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf /* Cheapest of merge join of sorted input or hash join */ join_strategy = DqaJoinCheapest; } - + /* Determine whether to use input sharing. */ - if ( ctx->numDistinctCols < 2 ) + if (ctx->numDistinctCols < 2) { reexec_cost = total_cost; use_sharing = false; } else - { - /* Decide based on apparent costs. - * XXX Do we need to override this if there are volatile functions - * in the common plan? Is this known, or do we need to search? + { + /* + * Decide based on apparent costs. XXX Do we need to override this if + * there are volatile functions in the common plan? Is this known, + * or do we need to search? */ share_cost = cost_share_plan(&dummy, root, ctx->numDistinctCols); reexec_cost = ctx->numDistinctCols * total_cost; use_sharing = share_cost < reexec_cost; } input_cost = use_sharing ? share_cost : reexec_cost; - + /* Get costs for the ctx->numDistinctCols coplans. */ cost_coplan_cheapest = cost_coplan_sorted = 0; - for ( i = 0; i < ctx->numDistinctCols; i++ ) + for (i = 0; i < ctx->numDistinctCols; i++) { - DqaInfo *arg = ctx->dqaArgs + i; - + DqaInfo *arg = ctx->dqaArgs + i; + cost_coplan_cheapest += arg->cost_cheapest; cost_coplan_sorted += arg->cost_sorted; } /* Get costs to join the coplan results. 
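	 * (When sorted output is required, an extra incremental_sort_cost() is
	 * charged to the hash-join alternative below before the two candidates
	 * are compared.)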
*/ - switch ( join_strategy ) - { - case DqaJoinNone: - break; - - case DqaJoinCross: - cost_crossjoin = (ctx->numDistinctCols - 1) * 2 * cpu_tuple_cost; - break; - - case DqaJoinSorted: - case DqaJoinCheapest: - set_cost_of_join_strategies(ctx, &cost_hashjoin, &cost_mergejoin); - - if ( join_strategy == DqaJoinSorted ) - cost_hashjoin += incremental_sort_cost(*ctx->p_dNumGroups, 100, ctx->numDistinctCols); - - cost_hashjoin += cost_coplan_cheapest; - cost_mergejoin += cost_coplan_sorted; - - if ( cost_hashjoin > 0.0 && cost_hashjoin <= cost_mergejoin ) - { - join_strategy = DqaJoinHash; - } - else - { - join_strategy = DqaJoinMerge; - } - break; - - default: - elog(ERROR, "unexpected join strategy during DQA planning"); + switch (join_strategy) + { + case DqaJoinNone: + break; + + case DqaJoinCross: + cost_crossjoin = (ctx->numDistinctCols - 1) * 2 * cpu_tuple_cost; + break; + + case DqaJoinSorted: + case DqaJoinCheapest: + set_cost_of_join_strategies(ctx, &cost_hashjoin, &cost_mergejoin); + + if (join_strategy == DqaJoinSorted) + cost_hashjoin += incremental_sort_cost(*ctx->p_dNumGroups, 100, ctx->numDistinctCols); + + cost_hashjoin += cost_coplan_cheapest; + cost_mergejoin += cost_coplan_sorted; + + if (cost_hashjoin > 0.0 && cost_hashjoin <= cost_mergejoin) + { + join_strategy = DqaJoinHash; + } + else + { + join_strategy = DqaJoinMerge; + } + break; + + default: + elog(ERROR, "unexpected join strategy during DQA planning"); } - + /* Compare costs choose cheapest. */ - switch ( join_strategy ) - { - case DqaJoinNone: - total_cost = input_cost + cost_coplan_cheapest; - break; - - case DqaJoinCross: - total_cost = input_cost + cost_coplan_cheapest + cost_crossjoin; - break; - - case DqaJoinHash: - total_cost = input_cost + cost_coplan_cheapest + cost_hashjoin; - break; - - case DqaJoinMerge: - total_cost = input_cost + cost_coplan_cheapest + cost_mergejoin; - break; - - default: - elog(ERROR, "unexpected join strategy during DQA planning"); - } - - info->plan_cost = root->config->gp_eager_dqa_pruning ? (Cost)0.0 : total_cost; + switch (join_strategy) + { + case DqaJoinNone: + total_cost = input_cost + cost_coplan_cheapest; + break; + + case DqaJoinCross: + total_cost = input_cost + cost_coplan_cheapest + cost_crossjoin; + break; + + case DqaJoinHash: + total_cost = input_cost + cost_coplan_cheapest + cost_hashjoin; + break; + + case DqaJoinMerge: + total_cost = input_cost + cost_coplan_cheapest + cost_mergejoin; + break; + + default: + elog(ERROR, "unexpected join strategy during DQA planning"); + } + + info->plan_cost = root->config->gp_eager_dqa_pruning ? (Cost) 0.0 : total_cost; info->valid = true; info->join_strategy = join_strategy; info->use_sharing = use_sharing; @@ -4750,69 +4910,70 @@ Cost cost_3phase_aggregation(PlannerInfo *root, MppGroupContext *ctx, AggPlanInf } -/* Estimate the costs of +/* Estimate the costs of * 1. HashJoin of cheapest inputs, and * 2. MergeJoin of sorted input. - * + * * If result should be ordered, compare a Sort of 1 with 2. * Else compare 1 with 2. 
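 *
 * The join clauses costed here are equality conditions on the grouping
 * key columns between coplan results, built below as Var = Var
 * expressions over guessed widths (32 bytes per key column).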
*/ -void set_cost_of_join_strategies(MppGroupContext *ctx, Cost *hashjoin_cost, Cost *mergejoin_cost) +void +set_cost_of_join_strategies(MppGroupContext *ctx, Cost *hashjoin_cost, Cost *mergejoin_cost) { - Cost hj_cost; - Cost mj_cost; - List *mergeclauses = NIL; - List *hashclauses = NIL; - - double rows; - int gk_width; - int outer_width; - bool try_hashed = true; - AttrNumber attrno; - Index outer_varno = 1; - int i; - + Cost hj_cost; + Cost mj_cost; + List *mergeclauses = NIL; + List *hashclauses = NIL; + + double rows; + int gk_width; + int outer_width; + bool try_hashed = true; + AttrNumber attrno; + Index outer_varno = 1; + int i; + rows = *ctx->p_dNumGroups; - + /* Widths are wild speculation, but good enough, we hope. */ gk_width = 32 * ctx->numGroupCols; - outer_width = 32; /* DQA transition values for first DQA arg. */ - outer_width += 64; /* Ordinary aggregate transition values. */ - + outer_width = 32; /* DQA transition values for first DQA arg. */ + outer_width += 64; /* Ordinary aggregate transition values. */ + /* We need join clauses for costing. */ - for( i = 0; i < ctx->numGroupCols; i++ ) + for (i = 0; i < ctx->numGroupCols; i++) { - Expr *qual; - Var *outer_var; - Var *inner_var; - AttrNumber resno = ctx->groupColIdx[i]; - Index inner_varno = 1 + (i + 1); + Expr *qual; + Var *outer_var; + Var *inner_var; + AttrNumber resno = ctx->groupColIdx[i]; + Index inner_varno = 1 + (i + 1); TargetEntry *tle = get_tle_by_resno(ctx->sub_tlist, resno); - - Assert( tle != NULL ); - + + Assert(tle != NULL); + outer_var = makeVar(outer_varno, resno, - exprType((Node *)tle->expr), - exprTypmod((Node *)tle->expr), 0); - + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), 0); + inner_var = makeVar(inner_varno, resno, - exprType((Node *)tle->expr), - exprTypmod((Node *)tle->expr), 0); - + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), 0); + /* outer should always be on the left */ - qual = make_op(NULL, list_make1(makeString("=")), - (Node*) outer_var, - (Node*) inner_var, -1); + qual = make_op(NULL, list_make1(makeString("=")), + (Node *) outer_var, + (Node *) inner_var, -1); /* If the grouping column is not hashable, do not try hashing. */ - if (!hash_safe_type(exprType((Node *)tle->expr))) + if (!hash_safe_type(exprType((Node *) tle->expr))) try_hashed = false; - - if ( try_hashed ) + + if (try_hashed) { hashclauses = lappend(hashclauses, copyObject(qual)); } - + qual->type = T_DistinctExpr; qual = make_notclause(qual); mergeclauses = lappend(mergeclauses, qual); @@ -4820,18 +4981,18 @@ void set_cost_of_join_strategies(MppGroupContext *ctx, Cost *hashjoin_cost, Cost /* Estimate the incremental join costs. */ hj_cost = mj_cost = 0; - for ( attrno = 1; attrno < ctx->numDistinctCols; attrno++ ) + for (attrno = 1; attrno < ctx->numDistinctCols; attrno++) { - int dqa_width = 32; - int inner_width = gk_width + dqa_width; - + int dqa_width = 32; + int inner_width = gk_width + dqa_width; + mj_cost += incremental_mergejoin_cost(rows, mergeclauses, ctx->root); - if ( try_hashed ) + if (try_hashed) hj_cost += incremental_hashjoin_cost(rows, inner_width, outer_width, hashclauses, ctx->root); - + outer_width += dqa_width; } - + *mergejoin_cost = mj_cost; *hashjoin_cost = try_hashed ? hj_cost : 0.0; } @@ -4839,7 +5000,8 @@ void set_cost_of_join_strategies(MppGroupContext *ctx, Cost *hashjoin_cost, Cost /* Set up basic structure content. Caller to fill in. 
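 * (In particular, the valid flag starts out false here; the
 * cost_*phase_aggregation functions above set it once they have filled
 * in a cost.)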
*/ static -void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) +void +initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) { info->input_path = input_path; info->input_plan = input_plan; @@ -4848,12 +5010,12 @@ void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) info->input_locus = input_path->locus; else CdbPathLocus_MakeNull(&info->input_locus); - + info->group_type = MPP_GRP_TYPE_BASEPLAN; info->group_prep = MPP_GRP_PREP_NONE; CdbPathLocus_MakeNull(&info->output_locus); info->distinctkey_collocate = false; - + info->valid = false; info->plan_cost = 0; info->join_strategy = DqaJoinUndefined; @@ -4864,12 +5026,12 @@ void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) /* set_coplan_strategies * * Determine and cache in the given DqaInfo structure the cheapest - * strategy that computes the answer and the cheapest strategy that + * strategy that computes the answer and the cheapest strategy that * computes the answer in grouping key order. * * Below, the result cardinalities are shown as <-n- where * - * x (input_rows) is the input cardinality which is usually about + * x (input_rows) is the input cardinality which is usually about * equal to #segments * #distinct(grouping key, DQA arg) * * d (darg_rows) is #distinct(grouping key, DQA arg) @@ -4884,7 +5046,7 @@ void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) * * which is encoded in DqaInfo by the flag use_hashed_preliminary. * - * The possible post-Motion strategies are encoded as enum values of + * The possible post-Motion strategies are encoded as enum values of * type DqaCoplanType and indicate all the required plan nodes. * * Vector aggregation strategies that produce a result ordered on the @@ -4893,7 +5055,7 @@ void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) * DQACOPLAN_GGS: <-g- GroupAgg <-d- GroupAgg <- Sort <-x- * DQACOPLAN_GSH: <-g- GroupAgg <- Sort <-d- HashAgg <-x- * DQACOPLAN_SHH: <- Sort <-g- HashAgg <-d- HashAgg <-x- - * + * * In addition, the vector aggreagation strategy * * DQACOPLAN_HH: <-g- HashAgg <-d- HashAgg <-x- R @@ -4906,52 +5068,53 @@ void initAggPlanInfo(AggPlanInfo *info, Path *input_path, Plan *input_plan) * DQACOPLAN_PH: <-1- PlainAgg <-d- HashedAgg <-x- R * */ -void set_coplan_strategies(PlannerInfo *root, MppGroupContext *ctx, DqaInfo *dqaArg, Path *input) +void +set_coplan_strategies(PlannerInfo *root, MppGroupContext *ctx, DqaInfo *dqaArg, Path *input) { - double input_rows = input->parent->rows; - int input_width = input->parent->width; - double darg_rows = dqaArg->num_rows; - double group_rows = *ctx->p_dNumGroups; - long numGroups = (group_rows < 0) ? 0 : - (group_rows > LONG_MAX) ? 
LONG_MAX : - (long)group_rows; - bool can_hash_group_key = ctx->agg_counts->canHashAgg; - bool can_hash_dqa_arg = dqaArg->can_hash; - bool use_hashed_preliminary = false; - - Cost sort_input = incremental_sort_cost(input_rows, input_width, - ctx->numGroupCols+1); - Cost sort_dargs = incremental_sort_cost(darg_rows, input_width, - ctx->numGroupCols); - Cost sort_groups = incremental_sort_cost(group_rows, input_width, - ctx->numGroupCols); - Cost gagg_input = incremental_agg_cost(input_rows, input_width, - AGG_SORTED, ctx->numGroupCols+1, - numGroups, ctx->agg_counts->numAggs, - ctx->agg_counts->transitionSpace); - Cost gagg_dargs = incremental_agg_cost(darg_rows, input_width, - AGG_SORTED, ctx->numGroupCols, - numGroups, ctx->agg_counts->numAggs, - ctx->agg_counts->transitionSpace); - Cost hagg_input = incremental_agg_cost(input_rows, input_width, - AGG_HASHED, ctx->numGroupCols+1, - numGroups, ctx->agg_counts->numAggs, - ctx->agg_counts->transitionSpace); - Cost hagg_dargs = incremental_agg_cost(darg_rows, input_width, - AGG_HASHED, ctx->numGroupCols, - numGroups, ctx->agg_counts->numAggs, - ctx->agg_counts->transitionSpace); - Cost cost_base; - Cost cost_sorted; - Cost cost_cheapest; + double input_rows = input->parent->rows; + int input_width = input->parent->width; + double darg_rows = dqaArg->num_rows; + double group_rows = *ctx->p_dNumGroups; + long numGroups = (group_rows < 0) ? 0 : + (group_rows > LONG_MAX) ? LONG_MAX : + (long) group_rows; + bool can_hash_group_key = ctx->agg_counts->canHashAgg; + bool can_hash_dqa_arg = dqaArg->can_hash; + bool use_hashed_preliminary = false; + + Cost sort_input = incremental_sort_cost(input_rows, input_width, + ctx->numGroupCols + 1); + Cost sort_dargs = incremental_sort_cost(darg_rows, input_width, + ctx->numGroupCols); + Cost sort_groups = incremental_sort_cost(group_rows, input_width, + ctx->numGroupCols); + Cost gagg_input = incremental_agg_cost(input_rows, input_width, + AGG_SORTED, ctx->numGroupCols + 1, + numGroups, ctx->agg_counts->numAggs, + ctx->agg_counts->transitionSpace); + Cost gagg_dargs = incremental_agg_cost(darg_rows, input_width, + AGG_SORTED, ctx->numGroupCols, + numGroups, ctx->agg_counts->numAggs, + ctx->agg_counts->transitionSpace); + Cost hagg_input = incremental_agg_cost(input_rows, input_width, + AGG_HASHED, ctx->numGroupCols + 1, + numGroups, ctx->agg_counts->numAggs, + ctx->agg_counts->transitionSpace); + Cost hagg_dargs = incremental_agg_cost(darg_rows, input_width, + AGG_HASHED, ctx->numGroupCols, + numGroups, ctx->agg_counts->numAggs, + ctx->agg_counts->transitionSpace); + Cost cost_base; + Cost cost_sorted; + Cost cost_cheapest; DqaCoplanType type_sorted; DqaCoplanType type_cheapest; - Cost trial; - + Cost trial; + /* Preliminary aggregation */ - use_hashed_preliminary = ( can_hash_group_key || ctx->numGroupCols == 0 ) - && can_hash_dqa_arg; - if ( use_hashed_preliminary ) + use_hashed_preliminary = (can_hash_group_key || ctx->numGroupCols == 0) + && can_hash_dqa_arg; + if (use_hashed_preliminary) { cost_base = hagg_input; } @@ -4962,68 +5125,68 @@ void set_coplan_strategies(PlannerInfo *root, MppGroupContext *ctx, DqaInfo *dqa /* Collocating motion */ cost_base += incremental_motion_cost(darg_rows, darg_rows); - + /* Post-motion processing is more complex. 
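	 * The alternatives weighed below are the DQACOPLAN_* shapes diagrammed
	 * above; for instance, in the scalar case the sorted strategy costs
	 * sort_input + gagg_input + pagg_dargs (DQACOPLAN_PGS), while the
	 * hashed alternative costs hagg_input + pagg_dargs (DQACOPLAN_PH).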
*/ - if ( ctx->numGroupCols == 0 ) /* scalar agg */ + if (ctx->numGroupCols == 0) /* scalar agg */ { - Cost pagg_dargs = incremental_agg_cost(darg_rows, input_width, - AGG_PLAIN, 0, - 1, ctx->agg_counts->numAggs, - ctx->agg_counts->transitionSpace); - + Cost pagg_dargs = incremental_agg_cost(darg_rows, input_width, + AGG_PLAIN, 0, + 1, ctx->agg_counts->numAggs, + ctx->agg_counts->transitionSpace); + type_sorted = type_cheapest = DQACOPLAN_PGS; cost_sorted = cost_cheapest = sort_input + gagg_input + pagg_dargs; - + trial = hagg_input + pagg_dargs; - if (trial < cost_cheapest ) + if (trial < cost_cheapest) { cost_cheapest = trial; type_cheapest = DQACOPLAN_PH; } } - else /* vector agg */ + else /* vector agg */ { type_sorted = type_cheapest = DQACOPLAN_GGS; cost_sorted = cost_cheapest = sort_input + gagg_input + gagg_dargs; - - if ( can_hash_dqa_arg ) + + if (can_hash_dqa_arg) { trial = hagg_input + sort_dargs + gagg_input; - - if ( trial < cost_cheapest ) + + if (trial < cost_cheapest) { cost_cheapest = trial; type_cheapest = DQACOPLAN_GSH; } - - if ( trial < cost_sorted ) + + if (trial < cost_sorted) { cost_sorted = trial; type_sorted = DQACOPLAN_GSH; } } - - if ( can_hash_group_key && can_hash_dqa_arg ) + + if (can_hash_group_key && can_hash_dqa_arg) { trial = hagg_input + hagg_dargs; - - if ( trial < cost_cheapest ) + + if (trial < cost_cheapest) { cost_cheapest = trial; type_cheapest = DQACOPLAN_HH; } - + trial += sort_groups; - - if ( trial < cost_sorted ) + + if (trial < cost_sorted) { cost_sorted = trial; type_sorted = DQACOPLAN_SHH; } } } - + dqaArg->use_hashed_preliminary = use_hashed_preliminary; dqaArg->cost_sorted = cost_base + cost_sorted; dqaArg->coplan_type_sorted = type_sorted; @@ -5035,48 +5198,52 @@ void set_coplan_strategies(PlannerInfo *root, MppGroupContext *ctx, DqaInfo *dqa /* incremental_sort_cost -- helper for set_coplan_strategies */ -Cost incremental_sort_cost(double rows, int width, int numKeyCols) +Cost +incremental_sort_cost(double rows, int width, int numKeyCols) { - Plan dummy; - + Plan dummy; + memset(&dummy, 0, sizeof(dummy)); dummy.plan_rows = rows; dummy.plan_width = width; - + add_sort_cost(NULL, &dummy, numKeyCols, NULL, NULL, -1.0); - + return dummy.total_cost; -} +} /* incremental_agg_cost -- helper for set_coplan_strategies */ -Cost incremental_agg_cost(double rows, int width, AggStrategy strategy, - int numGroupCols, double numGroups, - int numAggs, int transSpace) +Cost +incremental_agg_cost(double rows, int width, AggStrategy strategy, + int numGroupCols, double numGroups, + int numAggs, int transSpace) { - Plan dummy; - + Plan dummy; + memset(&dummy, 0, sizeof(dummy)); dummy.plan_rows = rows; dummy.plan_width = width; - - add_agg_cost(NULL, &dummy, - NULL, NULL, - strategy, false, - numGroupCols, NULL, + + add_agg_cost(NULL, &dummy, + NULL, NULL, + strategy, false, + numGroupCols, NULL, numGroups, 0, numAggs, transSpace); - + return dummy.total_cost; -} +} -/* incremental_motion_cost -- helper for set_coplan_strategies +/* incremental_motion_cost -- helper for set_coplan_strategies */ -Cost incremental_motion_cost(double sendrows, double recvrows) +Cost +incremental_motion_cost(double sendrows, double recvrows) { - Cost cost_per_row = (gp_motion_cost_per_row > 0.0) - ? gp_motion_cost_per_row - : 2.0 * cpu_tuple_cost; + Cost cost_per_row = (gp_motion_cost_per_row > 0.0) + ? 
gp_motion_cost_per_row + : 2.0 * cpu_tuple_cost; + return cost_per_row * 0.5 * (sendrows + recvrows); } @@ -5100,9 +5267,10 @@ choose_deduplicate(PlannerInfo *root, List *sortExprs, double num_distinct; double input_rows = input_plan->plan_rows; Path dummy_path; - Cost naive_cost, dedup_cost; + Cost naive_cost, + dedup_cost; int32 width; - AggStrategy aggstrategy; + AggStrategy aggstrategy; int numGroupCols; naive_cost = 0; @@ -5122,16 +5290,17 @@ choose_deduplicate(PlannerInfo *root, List *sortExprs, naive_cost = dummy_path.total_cost; /* - * Make a flattened version of the rangetable. estimate_num_groups() - * needs it. It is normally created later in the planning process, - * in query_planner(), but since we want to call estimate_num_groups() - * before query_planner(), we have to build it here. + * Make a flattened version of the rangetable. estimate_num_groups() needs + * it. It is normally created later in the planning process, in + * query_planner(), but since we want to call estimate_num_groups() before + * query_planner(), we have to build it here. */ root->simple_rel_array_size = list_length(root->parse->rtable) + 1; root->simple_rte_array = (RangeTblEntry **) palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *)); - int rti = 1; - ListCell *lc; + int rti = 1; + ListCell *lc; + foreach(lc, root->parse->rtable) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); @@ -5140,10 +5309,9 @@ choose_deduplicate(PlannerInfo *root, List *sortExprs, } /* - * Next, calculate cost of deduplicate. - * The first aggregate calculates number of duplicate for - * each unique sort key, then we add cost of sort after - * the aggregate. + * Next, calculate cost of deduplicate. The first aggregate calculates + * number of duplicate for each unique sort key, then we add cost of sort + * after the aggregate. */ num_distinct = estimate_num_groups(root, sortExprs, input_rows); aggstrategy = AGG_HASHED; @@ -5153,7 +5321,7 @@ choose_deduplicate(PlannerInfo *root, List *sortExprs, 0, 0, false); dummy_path.total_cost += incremental_motion_cost(num_distinct, - num_distinct * root->config->cdbpath_segments); + num_distinct * root->config->cdbpath_segments); cost_sort(&dummy_path, root, NIL, dummy_path.total_cost, num_distinct, @@ -5213,34 +5381,37 @@ wrap_plan_index(PlannerInfo *root, Plan *plan, Query *query, Assert(varno > 0); if (varno != 1) { - foreach (l, plan->flow->hashExpr) + foreach(l, plan->flow->hashExpr) { - Var *var = lfirst(l); - - if (IsA(var, RelabelType) && IsA(((RelabelType *) var)->arg, Var)) + Var *var = lfirst(l); + + if (IsA(var, RelabelType) &&IsA(((RelabelType *) var)->arg, Var)) { - /* handle RelabelType expressions in the flow: MPP-21026, MPP-22219 */ - var = (Var*) ((RelabelType *) var)->arg; + /* + * handle RelabelType expressions in the flow: MPP-21026, + * MPP-22219 + */ + var = (Var *) ((RelabelType *) var)->arg; } - + if (IsA(var, Var)) { /* fix varno, which is set to 1 in wrap_plan */ Assert(var->varno == 1); var->varno = var->varnoold = varno; } - + } /* - * Currently, plan and new parse tree shares target list. - * If this breaks, we'll need to update parse's target list as well. + * Currently, plan and new parse tree shares target list. If this + * breaks, we'll need to update parse's target list as well. 
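+		 * (The Assert just below checks this sharing, so a change that
+		 * breaks it will fail loudly rather than desynchronize silently.)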
*/ Assert(plan->targetlist == (*query_p)->targetList); - foreach (l, plan->targetlist) + foreach(l, plan->targetlist) { - TargetEntry *tle = lfirst(l); - Var *var = (Var *) tle->expr; + TargetEntry *tle = lfirst(l); + Var *var = (Var *) tle->expr; if (IsA(var, Var)) { @@ -5274,9 +5445,9 @@ wrap_plan_index(PlannerInfo *root, Plan *plan, Query *query, static void rebuild_simple_rel_and_rte(PlannerInfo *root) { - int i; - int array_size; - ListCell *l; + int i; + int array_size; + ListCell *l; array_size = list_length(root->parse->rtable) + 1; root->simple_rel_array_size = array_size; @@ -5285,13 +5456,13 @@ rebuild_simple_rel_and_rte(PlannerInfo *root) root->simple_rte_array = (RangeTblEntry **) palloc0(sizeof(RangeTblEntry *) * array_size); i = 1; - foreach (l, root->parse->rtable) + foreach(l, root->parse->rtable) { root->simple_rte_array[i] = lfirst(l); i++; } i = 1; - foreach (l, root->parse->rtable) + foreach(l, root->parse->rtable) { (void) build_simple_rel(root, i, RELOPT_BASEREL); i++; @@ -5318,8 +5489,8 @@ make_parallel_or_sequential_agg(PlannerInfo *root, AggClauseCounts *agg_counts, /* - * current_pathkeys_p can be NULL, which means the caller isn't interested in - * the pathkeys. Still, we are. + * current_pathkeys_p can be NULL, which means the caller isn't interested + * in the pathkeys. Still, we are. */ if (current_pathkeys_p) current_pathkeys = *current_pathkeys_p; @@ -5330,14 +5501,13 @@ make_parallel_or_sequential_agg(PlannerInfo *root, AggClauseCounts *agg_counts, if (!result_plan) { /* - * If cdb_grouping_planner doesn't return a plan, - * it means the plan should fall back to sequential. - * In that case, multi-phase aggregate plan is not used. - * Here it's much simpler than grouping_planner, - * since we are sure we have at least one aggregate function - * and no GROUPING SETS. + * If cdb_grouping_planner doesn't return a plan, it means the plan + * should fall back to sequential. In that case, multi-phase aggregate + * plan is not used. Here it's much simpler than grouping_planner, + * since we are sure we have at least one aggregate function and no + * GROUPING SETS. */ - AggStrategy aggstrategy; + AggStrategy aggstrategy; result_plan = group_context->subplan; if (group_context->use_hashed_grouping) @@ -5380,6 +5550,7 @@ make_parallel_or_sequential_agg(PlannerInfo *root, AggClauseCounts *agg_counts, current_pathkeys = NIL; } } + /* * Now make a single Agg node. */ @@ -5392,10 +5563,10 @@ make_parallel_or_sequential_agg(PlannerInfo *root, AggClauseCounts *agg_counts, group_context->groupColIdx, group_context->groupOperators, *group_context->p_dNumGroups, - 0, /* num_nullcols */ - 0, /* input_grouping */ - 0, /* grouping */ - 0, /* rollup_gs_times */ + 0, /* num_nullcols */ + 0, /* input_grouping */ + 0, /* grouping */ + 0, /* rollup_gs_times */ agg_counts->numAggs, agg_counts->transitionSpace, result_plan); @@ -5437,23 +5608,24 @@ deconstruct_within_agg(Node *node, MppGroupContext *ctx) static Node * deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) { - TargetEntry *tle; + TargetEntry *tle; if (node == NULL) return NULL; - if (IsA(node, Aggref) || IsA(node, PercentileExpr)) + if (IsA(node, Aggref) ||IsA(node, PercentileExpr)) { Index tlistno; AttrNumber attno; - List *sortclauses, *dref_tlist; + List *sortclauses, + *dref_tlist; ListCell *l; Node *final_node; /* - * Here we may see normal aggregates, not only percentiles. - * If normal aggs are involved, ctx->wagSortClauses should have - * NIL elements for it. 
+ * Here we may see normal aggregates, not only percentiles. If normal + * aggs are involved, ctx->wagSortClauses should have NIL elements for + * it. */ if (IsA(node, PercentileExpr)) sortclauses = ((PercentileExpr *) node)->sortClause; @@ -5464,7 +5636,7 @@ deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) * Find the right sub-plan which this expression should go. */ tlistno = 0; - foreach (l, ctx->wagSortClauses) + foreach(l, ctx->wagSortClauses) { /* Note NIL can be equal to NIL, too. */ if (equal(sortclauses, lfirst(l))) @@ -5475,6 +5647,7 @@ deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) if (!l) elog(ERROR, "unexpected use of aggregate"); dref_tlist = ctx->dref_tlists[tlistno]; + /* * If the same expression exists at the same level, recycle it. * Otherwise, create a new expression. @@ -5484,8 +5657,8 @@ deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) if (!tle) { /* - * Don't copy node, share it with tlist, for later operation - * can modify the var reference in tlist. + * Don't copy node, share it with tlist, for later operation can + * modify the var reference in tlist. */ tle = makeTargetEntry((Expr *) node, attno, NULL, false); @@ -5502,9 +5675,8 @@ deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) } /* - * If the given expression is a grouping expression, replace it with - * a Var node referring to the (lower) preliminary aggregation's - * target list. + * If the given expression is a grouping expression, replace it with a Var + * node referring to the (lower) preliminary aggregation's target list. */ tle = tlist_member(node, ctx->grps_tlist); if (tle != NULL) @@ -5526,7 +5698,8 @@ deconstruct_within_agg_mutator(Node *node, MppGroupContext *ctx) static List * fetch_percentiles(Query *parse, List *sortClause) { - List *nodes, *result; + List *nodes, + *result; ListCell *l; nodes = list_concat(extract_nodes(NULL, (Node *) parse->targetList, T_PercentileExpr), @@ -5534,9 +5707,9 @@ fetch_percentiles(Query *parse, List *sortClause) nodes = list_concat(nodes, extract_nodes(NULL, (Node *) parse->scatterClause, T_PercentileExpr)); result = NIL; - foreach (l, nodes) + foreach(l, nodes) { - PercentileExpr *p = lfirst(l); + PercentileExpr *p = lfirst(l); if (equal(sortClause, p->sortClause)) result = lappend(result, p); @@ -5563,39 +5736,41 @@ make_deduplicate_plan(PlannerInfo *root, List **current_pathkeys_p, Plan *subplan) { - Plan *result_plan; - Aggref *aggref; - GroupContext ctx; - ListCell *l1, *l2; - TargetEntry *pc_tle; - List *tlist; - int numGroupCols; - AttrNumber *groupColIdx; - Oid *groupOperators; - List *pathkeys = NIL; - bool querynode_changed = false; - AggClauseCounts agg_counts; - bool use_hashed_grouping; - - Query *original_parse, *parse; - List *original_group_pathkeys; - - List *sub_tlist = group_context->sub_tlist; - Expr *tvexpr; /* ORDER BY expression */ - const Index Outer = 1; + Plan *result_plan; + Aggref *aggref; + GroupContext ctx; + ListCell *l1, + *l2; + TargetEntry *pc_tle; + List *tlist; + int numGroupCols; + AttrNumber *groupColIdx; + Oid *groupOperators; + List *pathkeys = NIL; + bool querynode_changed = false; + AggClauseCounts agg_counts; + bool use_hashed_grouping; + + Query *original_parse, + *parse; + List *original_group_pathkeys; + + List *sub_tlist = group_context->sub_tlist; + Expr *tvexpr; /* ORDER BY expression */ + const Index Outer = 1; /* - * It is doable to just concatenate groupClause and sortClause, - * but it is more semantic to convert sortClause to groupClause. 
- Especially we want
-	 * to use make_pathkeys_from_groupclause later where
-	 * sortClause is not handled.
+	 * It is doable to just concatenate groupClause and sortClause, but it is
+	 * more semantic to convert sortClause to groupClause. Especially we want
+	 * to use make_pathkeys_from_groupclause later where sortClause is not
+	 * handled.
 	 *
 	 * Copy input groupClause, since we change it.
 	 */
 	groupClause = copyObject(groupClause);
-	foreach (l1, sortClause)
+	foreach(l1, sortClause)
 	{
-		SortClause *sc = copyObject(lfirst(l1));
+		SortClause *sc = copyObject(lfirst(l1));
 
 		sc->type = T_GroupClause;
 		groupClause = lappend(groupClause, sc);
@@ -5609,17 +5784,17 @@ make_deduplicate_plan(PlannerInfo *root,
 	 * ungrouped columns which will be bogus after the aggregate.
 	 */
 	tlist = NIL;
-	foreach (l1, sub_tlist)
+	foreach(l1, sub_tlist)
 	{
-		TargetEntry *tle = lfirst(l1);
-		TargetEntry *newtle;
+		TargetEntry *tle = lfirst(l1);
+		TargetEntry *newtle;
 
 		/*
 		 * Check if this target is a part of grouping columns.
 		 */
-		foreach (l2, groupClause)
+		foreach(l2, groupClause)
 		{
-			GroupClause *gc = lfirst(l2);
+			GroupClause *gc = lfirst(l2);
 
 			if (gc->tleSortGroupRef == tle->ressortgroupref)
 				break;
@@ -5634,20 +5809,20 @@ make_deduplicate_plan(PlannerInfo *root,
 	}
 
 	/*
-	 * Count ORDER BY expression so that since NULL input should
-	 * be ignored. We still need not to eliminate NULL input since
-	 * the result should be returned per group even if the group
-	 * has nothing but NULL.
+	 * Count the ORDER BY expression, since NULL input should be ignored.
+	 * We still need not eliminate NULL input, because a result should be
+	 * returned per group even if the group has nothing but NULL.
 	 */
 	tvexpr =
 		(Expr *) get_sortgroupclause_expr(linitial(sortClause), sub_tlist);
+
 	/*
 	 * Append peer count expression to target list.
 	 */
 	*pc_pos_p = list_length(tlist) + 1;
 	aggref = makeAggrefByOid(AGGFNOID_COUNT_ANY, list_make1(tvexpr));
 	pc_tle = makeTargetEntry((Expr *) aggref,
-							*pc_pos_p, "peer_count", false);
+							 *pc_pos_p, "peer_count", false);
 	tlist = lappend(tlist, pc_tle);
 
 	MemSet(&agg_counts, 0, sizeof(AggClauseCounts));
@@ -5702,8 +5877,8 @@ make_deduplicate_plan(PlannerInfo *root,
 	root->group_pathkeys = original_group_pathkeys;
 
 	/*
-	 * Add SubqueryScan to wrap this anyway, so that
-	 * the outcome of deduplicate can be treated as a simple subquery relation.
+	 * Add SubqueryScan to wrap this anyway, so that the outcome of
+	 * deduplicate can be treated as a simple subquery relation.
 	 */
 	result_plan = wrap_plan_index(root,
 								  result_plan,
@@ -5725,8 +5900,8 @@ make_deduplicate_plan(PlannerInfo *root,
 		cdbpullup_expr((Expr *) root->parse->scatterClause, tlist, NIL, Outer);
 	root->parse->rtable = parse->rtable;
 
-	// Rebuild arrays for RelOptInfo and RangeTblEntry for the PlannerInfo
-	// since the underlying range tables have been transformed
+	/*
+	 * Rebuild arrays for RelOptInfo and RangeTblEntry for the PlannerInfo,
+	 * since the underlying range tables have been transformed.
+	 */
 	rebuild_simple_rel_and_rte(root);
 
 	return result_plan;
@@ -5747,16 +5922,16 @@ within_agg_make_baseplan(PlannerInfo *root,
 						 List *sortClause,
 						 Plan *result_plan)
 {
-	List *sub_tlist = group_context->sub_tlist;
-	double dedup_numGroups;
-	List *dedup_key_exprs;
+	List	   *sub_tlist = group_context->sub_tlist;
+	double		dedup_numGroups;
+	List	   *dedup_key_exprs;
 
 	/*
	 * The GROUP BY keys are the normal grouping keys + sort key. 
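	 * For instance, with a hypothetical "SELECT median(x) FROM t GROUP BY g",
	 * the deduplication keys built below are (g, x).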
*/ dedup_key_exprs = list_concat( - get_sortgrouplist_exprs(root->parse->groupClause, sub_tlist), - get_sortgrouplist_exprs(sortClause, sub_tlist)); + get_sortgrouplist_exprs(root->parse->groupClause, sub_tlist), + get_sortgrouplist_exprs(sortClause, sub_tlist)); /* * Decide whether deduplicate is useful or not. @@ -5767,17 +5942,16 @@ within_agg_make_baseplan(PlannerInfo *root, &dedup_numGroups); /* - * Create the base subplan for the upper join. We may take - * decuplicate way or not, but anyway the target list of result_plan has - * an extra target entry for the peer count. + * Create the base subplan for the upper join. We may take decuplicate way + * or not, but anyway the target list of result_plan has an extra target + * entry for the peer count. */ if (wag_context->use_deduplicate) { /* - * The deduplicate optimization. We reduce identical rows - * and record the number of reduced rows, so that - * percentile function can see the original rows. - * It's similar to the run-length encoding. + * The deduplicate optimization. We reduce identical rows and record + * the number of reduced rows, so that percentile function can see the + * original rows. It's similar to the run-length encoding. * * root->parse is updated inside to represent this subquery. */ @@ -5792,18 +5966,18 @@ within_agg_make_baseplan(PlannerInfo *root, } else { - Query *subquery; - Expr *tv_expr; - NullTest *nt; - CaseWhen *casearg; - CaseExpr *pc_expr; - TargetEntry *pc_tle; + Query *subquery; + Expr *tv_expr; + NullTest *nt; + CaseWhen *casearg; + CaseExpr *pc_expr; + TargetEntry *pc_tle; /* - * The naive case. Wrapping this plan with SubqueryScan anyway - * is demanded as the underlying plan might be SharedInputScan where - * the target list should not be modified, and in order to align - * the semantics with de-duplicate case. + * The naive case. Wrapping this plan with SubqueryScan anyway is + * demanded as the underlying plan might be SharedInputScan where the + * target list should not be modified, and in order to align the + * semantics with de-duplicate case. */ result_plan = wrap_plan_index(root, result_plan, @@ -5821,8 +5995,8 @@ within_agg_make_baseplan(PlannerInfo *root, root->parse->rtable = subquery->rtable; /* - * We make zero as the peer count if tv is NULL. The inner - * should count up how many non-NULL there is. + * We make zero as the peer count if tv is NULL. The inner should + * count up how many non-NULL there is. * * pc = CASE WHEN tv IS NOT NULL THEN 1 ELSE 0 END */ @@ -5850,8 +6024,8 @@ within_agg_make_baseplan(PlannerInfo *root, } /* - * result_plan is SubqueryScan here whichever we took. Update locus - * in order to represent this subqeury. + * result_plan is SubqueryScan here whichever we took. Update locus in + * order to represent this subqeury. 
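The de-duplication described above is, in effect, run-length encoding of the sorted ORDER BY column, with a NULL sort value contributing a peer count of zero (the CASE WHEN tv IS NOT NULL THEN 1 ELSE 0 END expression covers the naive, non-deduplicated branch). A self-contained C sketch of the idea over a sorted array, using a negative value as a stand-in for NULL (illustration only):

#include <stdio.h>

/*
 * Collapse a sorted column into (value, peer_count) pairs; a negative
 * value plays the role of NULL and contributes a peer count of zero.
 */
static int
run_length(const int *vals, int n, int out_vals[], long out_pc[])
{
	int			groups = 0;

	for (int i = 0; i < n;)
	{
		int			j = i;

		while (j < n && vals[j] == vals[i])
			j++;
		out_vals[groups] = vals[i];
		out_pc[groups++] = (vals[i] < 0) ? 0 : (long) (j - i);
		i = j;
	}
	return groups;
}

int
main(void)
{
	int			vals[] = {-1, 3, 3, 3, 7};	/* sorted; -1 plays NULL */
	int			v[5];
	long		pc[5];
	int			g = run_length(vals, 5, v, pc);

	for (int i = 0; i < g; i++)
		printf("value %d: peer count %ld\n", v[i], pc[i]);
	return 0;
}

The percentile functions only need the original row multiplicities, which is exactly what the recorded peer counts reconstruct.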
@@ -5850,8 +6024,8 @@ within_agg_make_baseplan(PlannerInfo *root,
 	}

 	/*
-	 * result_plan is SubqueryScan here whichever we took. Update locus
-	 * in order to represent this subqeury.
+	 * result_plan is SubqueryScan here whichever path we took. Update locus
+	 * in order to represent this subquery.
 	 */
 	Assert(IsA(result_plan, SubqueryScan));

@@ -5871,9 +6045,9 @@ within_agg_add_outer_sort(PlannerInfo *root,
 						  List *sortClause,
 						  Plan *outer_plan)
 {
-	List	   *sort_pathkeys;
-	Query	   *outer_parse;
-	const Index Outer = 1;
+	List	   *sort_pathkeys;
+	Query	   *outer_parse;
+	const Index Outer = 1;

 	if (!root->parse->groupClause)
@@ -5899,13 +6073,13 @@ within_agg_add_outer_sort(PlannerInfo *root,
 			outer_plan = (Plan *) make_motion_gather_to_QE(root, outer_plan, wag_context->current_pathkeys);
 			outer_plan->total_cost += incremental_motion_cost(outer_plan->plan_rows,
-						outer_plan->plan_rows * root->config->cdbpath_segments);
+															  outer_plan->plan_rows * root->config->cdbpath_segments);
 		}
 	}
 	else
 	{
-		CdbPathLocus current_locus;
-		List	   *groupSortClauses;
+		CdbPathLocus current_locus;
+		List	   *groupSortClauses;

 		Assert(root->group_pathkeys);

@@ -5919,10 +6093,11 @@ within_agg_add_outer_sort(PlannerInfo *root,
 		/*
 		 * Add a redistribute motion if the group key doesn't collocate.
-		 * group_pathkeys should have been fixed to reflect the latest targetlist.
-		 * best_path->locus is wrong here since we put SubqueryScan already.
+		 * group_pathkeys should have been fixed to reflect the latest
+		 * targetlist. best_path->locus is wrong here since we put
+		 * SubqueryScan already.
 		 */
-		if (!cdbpathlocus_collocates(root, current_locus, root->group_pathkeys, false /*exact_match*/))
+		if (!cdbpathlocus_collocates(root, current_locus, root->group_pathkeys, false /* exact_match */ ))
 		{
 			List	   *groupExprs;

@@ -5933,11 +6108,13 @@ within_agg_add_outer_sort(PlannerInfo *root,
 			outer_plan->total_cost += incremental_motion_cost(outer_plan->plan_rows,
 															  outer_plan->plan_rows);
+
 			/*
 			 * Invalidate pathkeys; the result is not sorted any more.
 			 */
 			wag_context->current_pathkeys = NULL;
 		}
+
 		/*
 		 * Now we can add sort node.
 		 */
@@ -5952,6 +6129,7 @@ within_agg_add_outer_sort(PlannerInfo *root,
 										  groupSortClauses,
 										  outer_plan);
 		mark_sort_locus(outer_plan);
+
 		/*
 		 * Update current pathkeys.
 		 */
@@ -5970,6 +6148,7 @@ within_agg_add_outer_sort(PlannerInfo *root,
 		cdbpullup_expr((Expr *) root->parse->havingQual, outer_plan->targetlist, NIL, 1);
 	root->parse->scatterClause = (List *)
 		cdbpullup_expr((Expr *) root->parse->scatterClause, outer_plan->targetlist, NIL, 1);
+
 	/*
 	 * Wrap plan by subquery as the outer of upcoming join.
 	 */
@@ -6001,41 +6180,43 @@ within_agg_construct_inner(PlannerInfo *root,
 						   WithinAggContext *wag_context,
 						   Plan *inner_plan)
 {
-	ListCell   *l;
-	int			idx;
-	int			numGroupCols;
-	Path		input_path;
-	TargetEntry *pc_tle;
-	Expr	   *tc_expr;
-	GroupContext ctx;
-	List	   *tlist;
-	double		numGroups = *group_context->p_dNumGroups;
-	bool		use_hashed_grouping;
-	bool		querynode_changed = false;
-	List	   *pathkeys = NIL;
-	AggClauseCounts agg_counts;
-	AttrNumber *grpColIdx;
-	Oid		   *grpOperators;
-	Query	   *original_parse;
-	List	   *original_group_pathkeys;
-	Query	   *parse;
-	const Index Inner = 2;
+	ListCell   *l;
+	int			idx;
+	int			numGroupCols;
+	Path		input_path;
+	TargetEntry *pc_tle;
+	Expr	   *tc_expr;
+	GroupContext ctx;
+	List	   *tlist;
+	double		numGroups = *group_context->p_dNumGroups;
+	bool		use_hashed_grouping;
+	bool		querynode_changed = false;
+	List	   *pathkeys = NIL;
+	AggClauseCounts agg_counts;
+	AttrNumber *grpColIdx;
+	Oid		   *grpOperators;
+	Query	   *original_parse;
+	List	   *original_group_pathkeys;
+	Query	   *parse;
+	const Index Inner = 2;

 	get_grouplist_colidx(root->parse->groupClause, inner_plan->targetlist, &numGroupCols,
 						 &grpColIdx, &grpOperators);

 	/* build grouping key columns */
 	tlist = NIL;
-	foreach_with_count (l, root->parse->groupClause, idx)
+	foreach_with_count(l, root->parse->groupClause, idx)
 	{
-		GroupClause *gc = (GroupClause *) lfirst(l);
-		TargetEntry *tle, *newtle;
+		GroupClause *gc = (GroupClause *) lfirst(l);
+		TargetEntry *tle,
+				   *newtle;

 		tle = get_sortgroupclause_tle(gc, inner_plan->targetlist);
 		newtle = flatCopyTargetEntry(tle);
 		newtle->resno = (AttrNumber) idx + 1;
 		tlist = lappend(tlist, newtle);
 	}
+
 	/*
 	 * Sum up the peer count to count the total number of rows per group.
 	 */
@@ -6051,15 +6232,15 @@ within_agg_construct_inner(PlannerInfo *root,

 	/*
 	 * best_path is not appropriate here after building some SubqueryScan.
-	 * Build up a dummy Path to reflect the underlying plan, but
-	 * needed information is only locus in cdb_grouping_planner.
+	 * Build up a dummy Path to reflect the underlying plan; the only
+	 * information cdb_grouping_planner needs from it is the locus.
 	 */
 	memcpy(&input_path, group_context->best_path, sizeof(Path));

 	/*
 	 * Create locus back from flow. Unfortunately cdbpathlocus_from_flow()
-	 * doesn't return hashed locus in repartitioned case, so we need to
-	 * call from_exprs() again if it's available.
+	 * doesn't return hashed locus in repartitioned case, so we need to call
+	 * from_exprs() again if it's available.
 	 */
 	input_path.locus = cdbpathlocus_from_flow(inner_plan->flow);
 	if (CdbPathLocus_IsPartitioned(input_path.locus) &&
@@ -6072,8 +6253,8 @@ within_agg_construct_inner(PlannerInfo *root,

 	/*
 	 * Evaluate possibility for hash/sort strategy. Things have been changed
-	 * since the last decision in grouping_planner(), as the base plan
-	 * may now be sorted.
+	 * since the last decision in grouping_planner(), as the base plan may now
+	 * be sorted.
 	 */
 	use_hashed_grouping = choose_hashed_grouping(root,
 												 group_context->tuple_fraction,
@@ -6136,8 +6317,8 @@ within_agg_construct_inner(PlannerInfo *root,
 	Assert(list_length(wag_context->rtable) == 2);

 	/*
-	 * Restore the original info. Note that group_pathkeys is updated
-	 * in wrap_plan_index(), so don't move this before it.
+	 * Restore the original info. Note that group_pathkeys is updated in
+	 * wrap_plan_index(), so don't move this before it.
 	 */
 	root->parse = original_parse;
 	root->group_pathkeys = original_group_pathkeys;
@@ -6162,16 +6343,18 @@ within_agg_join_plans(PlannerInfo *root,
 					  Plan *outer_plan,
 					  Plan *inner_plan)
 {
-	Plan	   *result_plan;
-	ListCell   *l;
-	List	   *join_tlist;
-	List	   *join_clause;
-	Oid		   *mergefamilies;
-	int		   *mergestrategies;
-	bool	   *mergenullsfirst;
-	const Index Outer = 1, Inner = 2;
-	List	   *extravars;
-	Var		   *pc_var, *tc_var;
+	Plan	   *result_plan;
+	ListCell   *l;
+	List	   *join_tlist;
+	List	   *join_clause;
+	Oid		   *mergefamilies;
+	int		   *mergestrategies;
+	bool	   *mergenullsfirst;
+	const Index Outer = 1,
+				Inner = 2;
+	List	   *extravars;
+	Var		   *pc_var,
+			   *tc_var;

 	/*
 	 * Up to now, these should've been prepared.
@@ -6182,26 +6365,28 @@ within_agg_join_plans(PlannerInfo *root,
 	/*
 	 * Build target list for grouping columns.
 	 *
-	 * This is similar to make_subplanTargetList(), but things are much simpler.
-	 * Note that this makes sure that expressions like SRF are going to be
-	 * in the upper aggregate target list rather than in this join target list.
+	 * This is similar to make_subplanTargetList(), but things are much
+	 * simpler. Note that this makes sure that expressions like SRF are going
+	 * to be in the upper aggregate target list rather than in this join
+	 * target list.
 	 */
 	join_tlist = flatten_tlist(root->parse->targetList);
-	// GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true
-	// in pull_var_clause ?
+	/* GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true */
+	/* in pull_var_clause ? */
 	extravars = pull_var_clause(root->parse->havingQual, false);
 	join_tlist = add_to_flat_tlist(join_tlist, extravars, false);

-	foreach (l, root->parse->groupClause)
+	foreach(l, root->parse->groupClause)
 	{
-		GroupClause *gc = lfirst(l);
-		TargetEntry *gc_tle, *join_tle;
+		GroupClause *gc = lfirst(l);
+		TargetEntry *gc_tle,
+				   *join_tle;

 		/*
 		 * We need the grouping expressions in the target list. If they are
-		 * in the taget list already, we remember the grouping reference
-		 * since exracting vars drop those information. Otherwise, we
-		 * simply append the entry to the target list.
+		 * in the target list already, we remember the grouping reference
+		 * since extracting vars drops that information. Otherwise, we simply
+		 * append the entry to the target list.
 		 */
 		gc_tle = get_sortgroupclause_tle(gc, root->parse->targetList);
 		join_tle = tlist_member((Node *) gc_tle->expr, join_tlist);
@@ -6219,9 +6404,9 @@ within_agg_join_plans(PlannerInfo *root,
 	}

 	/*
-	 * Make sure that the peer count and the total count is in the
-	 * target list of the join. They will be needed in the upper
-	 * final aggregate by the percentile functions.
+	 * Make sure that the peer count and the total count are in the target
+	 * list of the join. They will be needed in the upper final aggregate by
+	 * the percentile functions.
 	 */
 	pc_var = makeVar(Outer, wag_context->pc_pos, INT8OID, -1, 0);
 	tc_var = makeVar(Inner, wag_context->tc_pos, INT8OID, -1, 0);
@@ -6230,17 +6415,17 @@ within_agg_join_plans(PlannerInfo *root,
 								   false);

 	/* add vars from flow expression: MPP-20076 */
-	// GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true
-	// in pull_var_clause ?
+	/* GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true */
+	/* in pull_var_clause ? */
 	extravars = pull_var_clause((Node *) outer_plan->flow->hashExpr, false);
-	join_tlist = add_to_flat_tlist(join_tlist, extravars, false /*resjunk*/);
+	join_tlist = add_to_flat_tlist(join_tlist, extravars, false /* resjunk */ );

 	/*
 	 * It is ideal to tell if the inner plan is fine to merge-join by
-	 * examining it as re-scannable plan, but it seems we don't have
-	 * such infrastructure, so here we assume the inner plan is not
-	 * re-scannable and not sorted. If it is a grouping query,
-	 * we add sort node, otherwise just put a materialize node.
+	 * examining whether it is re-scannable, but it seems we don't have such
+	 * infrastructure, so here we assume the inner plan is neither
+	 * re-scannable nor sorted. If it is a grouping query, we add a sort
+	 * node, otherwise we just put a materialize node.
 	 */
 	if (root->parse->groupClause && !wag_context->inner_pathkeys)
 	{
@@ -6264,15 +6449,14 @@ within_agg_join_plans(PlannerInfo *root,
 	}

 	/*
-	 * All set. Join two plans.
-	 * We choose cartesian product if there is no join clauses, meaning
-	 * no grouping happens.
+	 * All set. Join the two plans. We choose a cartesian product if there
+	 * are no join clauses, meaning no grouping happens.
 	 */
 	if (root->parse->groupClause != NIL)
 	{
-		int			idx;
-		ListCell   *lg;
-		ListCell   *lpk;
+		int			idx;
+		ListCell   *lg;
+		ListCell   *lpk;
 		int			ngroups = list_length(root->parse->groupClause);

 		/* Build merge join clauses for grouping columns */
@@ -6283,12 +6467,12 @@ within_agg_join_plans(PlannerInfo *root,
 		idx = 0;
 		forboth(lg, root->parse->groupClause, lpk, root->group_pathkeys)
 		{
-			GroupClause *gc = (GroupClause *) lfirst(lg);
-			PathKey    *pk = (PathKey *) lfirst(lpk);
-			TargetEntry *tle;
-			Var		   *outer_var,
-					   *inner_var;
-			RestrictInfo *rinfo;
+			GroupClause *gc = (GroupClause *) lfirst(lg);
+			PathKey    *pk = (PathKey *) lfirst(lpk);
+			TargetEntry *tle;
+			Var		   *outer_var,
+					   *inner_var;
+			RestrictInfo *rinfo;

 			/* Construct outer group key. */
 			tle = get_sortgroupclause_tle(gc, outer_plan->targetlist);
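The grouping-key join assembled above pairs one outer Var with one inner Var per grouping column; since the inner side is an aggregate producing one row per group, the merge reduces to the classic two-pointer walk over inputs sorted on the group key. A standalone C sketch of that walk (pc and tc here are simply the two counts each side carries; illustration only, under those assumptions):

#include <stdio.h>

typedef struct
{
	int			key;			/* grouping key */
	long		val;			/* pc on the outer side, tc on the inner */
} Row;

/* Merge-join two inputs sorted on key; the inner has one row per group. */
static void
merge_join(const Row *outer, int no, const Row *inner, int ni)
{
	int			i = 0,
				j = 0;

	while (i < no && j < ni)
	{
		if (outer[i].key < inner[j].key)
			i++;
		else if (outer[i].key > inner[j].key)
			j++;
		else
		{
			printf("key %d: pc=%ld tc=%ld\n",
				   outer[i].key, outer[i].val, inner[j].val);
			i++;				/* duplicate outer keys rematch inner[j] */
		}
	}
}

int
main(void)
{
	Row			outer[] = {{1, 2}, {1, 1}, {3, 4}};	/* sorted on key */
	Row			inner[] = {{1, 5}, {3, 9}};			/* one row per group */

	merge_join(outer, 3, inner, 2);
	return 0;
}

A real MergeJoin also needs mark/restore for duplicate inner keys, which is exactly why the code above materializes or re-sorts the inner side when it cannot prove it re-scannable.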
@@ -6362,15 +6546,17 @@ within_agg_final_agg(PlannerInfo *root,
 					 List *sortClause,
 					 Plan *result_plan)
 {
-	ListCell   *l;
-	List	   *percentiles;
-	Var		   *pc_var, *tc_var;
-	AttrNumber *grpColIdx;
-	Oid		   *grpOperators;
-	int			numGroupCols;
-	AggClauseCounts agg_counts;
-	AggStrategy aggstrategy;
-	const Index Outer = 1, Inner = 2;
+	ListCell   *l;
+	List	   *percentiles;
+	Var		   *pc_var,
+			   *tc_var;
+	AttrNumber *grpColIdx;
+	Oid		   *grpOperators;
+	int			numGroupCols;
+	AggClauseCounts agg_counts;
+	AggStrategy aggstrategy;
+	const Index Outer = 1,
+				Inner = 2;

 	/*
 	 * Sanity check. These should've been prepared up to now.
@@ -6385,9 +6571,9 @@ within_agg_final_agg(PlannerInfo *root,
 	percentiles = fetch_percentiles(root->parse, sortClause);
 	pc_var = makeVar(Outer, wag_context->pc_pos, INT8OID, -1, 0);
 	tc_var = makeVar(Inner, wag_context->tc_pos, INT8OID, -1, 0);
-	foreach (l, percentiles)
+	foreach(l, percentiles)
 	{
-		PercentileExpr *perc = lfirst(l);
+		PercentileExpr *perc = lfirst(l);

 		perc->pcExpr = (Expr *) pc_var;
 		perc->tcExpr = (Expr *) tc_var;
@@ -6397,8 +6583,8 @@ within_agg_final_agg(PlannerInfo *root,
 	count_agg_clauses((Node *) root->parse->targetList, &agg_counts);

 	/*
-	 * Prepare GROUP BY clause for the final aggregate.
-	 * Make sure the column indices point to the topmost target list.
+	 * Prepare GROUP BY clause for the final aggregate. Make sure the column
+	 * indices point to the topmost target list.
 	 */
 	get_grouplist_colidx(root->parse->groupClause,
 						 result_plan->targetlist, &numGroupCols,
@@ -6406,11 +6592,13 @@ within_agg_final_agg(PlannerInfo *root,
 	aggstrategy = root->parse->groupClause ? AGG_SORTED : AGG_PLAIN;

 	/* add vars from flow expression: MPP-20076 */
-	List *targetList = root->parse->targetList;
-	// GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true
-	// in pull_var_clause ?
-	List *extravars = pull_var_clause((Node *) result_plan->flow->hashExpr, false);
-	targetList = add_to_flat_tlist(targetList, extravars, true /*resjunk*/);
+	List	   *targetList = root->parse->targetList;
+
+	/* GPDB_84_MERGE_FIXME: Should we pass includePlaceHolderVars as true */
+	/* in pull_var_clause ? */
+	List	   *extravars = pull_var_clause((Node *) result_plan->flow->hashExpr, false);
+
+	targetList = add_to_flat_tlist(targetList, extravars, true /* resjunk */ );
 	list_free(extravars);

 	result_plan = (Plan *) make_agg(root,
@@ -6422,16 +6610,17 @@ within_agg_final_agg(PlannerInfo *root,
 									grpColIdx,
 									grpOperators,
 									*group_context->p_dNumGroups,
-									0, /* num_nullcols */
-									0, /* input_grouping */
-									0, /* grouping */
-									0, /* rollup_gs_times */
-									1, /* numAggs */
+									0,	/* num_nullcols */
+									0,	/* input_grouping */
+									0,	/* grouping */
+									0,	/* rollup_gs_times */
+									1,	/* numAggs */
 									agg_counts.transitionSpace,
 									result_plan);

 	/*
-	 * Stop copying sorts in flow, for the targetlist doesn't have them anymore.
+	 * Stop copying sorts in flow, for the targetlist doesn't have them
+	 * anymore.
 	 */
 	mark_passthru_locus(result_plan, true, false);

@@ -6499,25 +6688,25 @@ plan_within_agg_persort(PlannerInfo *root,
 						List *current_pathkeys,
 						Plan *result_plan)
 {
-	WithinAggContext wag_context;
-	Plan	   *outer_plan, *inner_plan;
-	List	   *partners;
-	ListCell   *l;
+	WithinAggContext wag_context;
+	Plan	   *outer_plan,
+			   *inner_plan;
+	List	   *partners;
+	ListCell   *l;

 	memset(&wag_context, 0, sizeof(WithinAggContext));
 	wag_context.current_pathkeys = current_pathkeys;

 	/*
-	 * Group clause expressions should be in ascending order,
-	 * because our MergeJoin is not able to handle descending-ordered
-	 * child plans. It is desirable to improve MergeJoin, but it requires
-	 * amount of work.
+	 * Group clause expressions should be in ascending order, because our
+	 * MergeJoin is not able to handle descending-ordered child plans. It is
+	 * desirable to improve MergeJoin, but it requires a fair amount of work.
 	 */
-	foreach (l, root->parse->groupClause)
+	foreach(l, root->parse->groupClause)
 	{
-		GroupClause *gc = lfirst(l);
-		Node	   *gcexpr;
-		Oid			gctype;
+		GroupClause *gc = lfirst(l);
+		Node	   *gcexpr;
+		Oid			gctype;

 		/*
 		 * We assume only flattened grouping expressions here.
@@ -6529,8 +6718,8 @@ plan_within_agg_persort(PlannerInfo *root,
 	}

 	/*
-	 * Make a common plan shared by outer and inner plan. It may become
-	 * a de-duplicate plan.
+	 * Make a common plan shared by outer and inner plan. It may become a
+	 * de-duplicate plan.
 	 */
 	result_plan = within_agg_make_baseplan(root,
 										   group_context,
@@ -6541,10 +6730,10 @@ plan_within_agg_persort(PlannerInfo *root,
 	Assert(IsA(result_plan, SubqueryScan));

 	/*
-	 * Split the tree into outer and inner which will be joined later.
-	 * It comes before Sort, so that the both of outer and inner run
-	 * in parallel. We observed in most cases splitting it here requires
-	 * RedistributeMotion in both sides, which allows more parallel way.
+	 * Split the tree into outer and inner which will be joined later. It
+	 * comes before Sort, so that both the outer and the inner run in
+	 * parallel. We observed that in most cases splitting it here requires a
+	 * RedistributeMotion on both sides, which allows a more parallel plan.
 	 */
 	partners = share_plan(root, result_plan, 2);
 	outer_plan = list_nth(partners, 0);
@@ -6563,9 +6752,9 @@ plan_within_agg_persort(PlannerInfo *root,
 	Assert(list_length(wag_context.rtable) == 1);

 	/*
-	 * Construct inner plan of join that returns only number of rows.
-	 * The inner side always create target list looking like
-	 * G1, G2, ..., count(*) TP
+	 * Construct the inner plan of the join that returns only the number of
+	 * rows. The inner side always creates a target list looking like G1,
+	 * G2, ..., count(*) TP
 	 */
 	inner_plan = within_agg_construct_inner(root,
 											group_context,
@@ -6586,16 +6775,16 @@ plan_within_agg_persort(PlannerInfo *root,
 											inner_plan);

 #ifdef NOT_USED
-if (true)
-{
-	/*
-	 * For debug purpose.
-	 * This helps to see what's the intermediate result.
-	 */
-	root->parse->targetList = copyObject(result_plan->targetlist);
-	root->parse->rtable = wag_context.rtable;
-	return result_plan;
-}
+	if (true)
+	{
+		/*
+		 * For debugging purposes. This helps to see the intermediate
+		 * result.
+		 */
+		root->parse->targetList = copyObject(result_plan->targetlist);
+		root->parse->rtable = wag_context.rtable;
+		return result_plan;
+	}
 #endif

 	/*
@@ -6630,12 +6819,14 @@ within_agg_planner(PlannerInfo *root,
 				   AggClauseCounts *agg_counts,
 				   GroupContext *group_context)
 {
-	List	   *aggnodes, *percnodes;
+	List	   *aggnodes,
+			   *percnodes;
 	ListCell   *l;
 	List	  **aggreflist;
 	List	  **sortlist;
 	int			numsortlist;
-	int			numGroupCols, numDistinctCols;
+	int			numGroupCols,
+				numDistinctCols;
 	AttrNumber *grpColIdx;
 	Oid		   *grpOperators;
 	AttrNumber *distinctColIdx;
@@ -6660,16 +6851,16 @@ within_agg_planner(PlannerInfo *root,
 	/* initialize each element with NIL */
 	aggreflist = (List **) palloc0(sizeof(List *) * numsortlist);
 	sortlist = (List **) palloc0(sizeof(List *) * numsortlist);
-	numsortlist = 0; /* Use this as a counter */
+	numsortlist = 0;			/* Use this as a counter */

 	sub_tlist = group_context->sub_tlist;
 	next_resno = list_length(sub_tlist) + 1;

 	/*
 	 * WITHIN aggregates are not supported in the grouping extensions.
-	 * However, parse->groupClause may have non-flattened GroupClause list.
-	 * We simply flatten it by reconstruct_group_clause under the assumption
-	 * that we have denied grouping extension cases.
+	 * However, parse->groupClause may have a non-flattened GroupClause list.
+	 * We simply flatten it with reconstruct_group_clause under the
+	 * assumption that we have denied grouping extension cases.
 	 */
 	Assert(!is_grouping_extension(group_context->canonical_grpsets));
 	get_grouplist_colidx(root->parse->groupClause, sub_tlist,
@@ -6681,23 +6872,23 @@ within_agg_planner(PlannerInfo *root,
 						 numGroupCols);
 	numDistinctCols = agg_counts->numDistinctAggs;
 	distinctColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numDistinctCols);
-	numDistinctCols = 0; /* Use this as a counter */
+	numDistinctCols = 0;		/* Use this as a counter */

 	/*
-	 * Collect aggref nodes to process them separately from percentiles.
-	 * Note we represent this special case by NIL for sortClause (sortlist.)
+	 * Collect aggref nodes to process them separately from percentiles. Note
+	 * we represent this special case by NIL for sortClause (sortlist).
 	 */
 	if (aggnodes)
 	{
 		sortlist[numsortlist] = NIL;
-		foreach (l, aggnodes)
+		foreach(l, aggnodes)
 		{
 			Aggref	   *aggref = lfirst(l);

 			if (aggref->aggdistinct)
 			{
-				Node	   *arg;
-				TargetEntry *sub_tle;
+				Node	   *arg;
+				TargetEntry *sub_tle;

 				Assert(list_length(aggref->args) == 1);
 				arg = linitial(aggref->args);
@@ -6705,9 +6896,9 @@ within_agg_planner(PlannerInfo *root,
 				if (!sub_tle)
 				{
 					sub_tle = makeTargetEntry((Expr *) arg,
-									next_resno++,
-									"",
-									true);
+											  next_resno++,
+											  "",
+											  true);
 					sub_tlist = lappend(sub_tlist, sub_tle);
 				}
 				distinctColIdx[numDistinctCols++] = sub_tle->resno;
@@ -6718,29 +6909,30 @@ within_agg_planner(PlannerInfo *root,
 	}

 	/*
-	 * Collect percentile nodes and classify them into some groups by sortClause.
-	 * During this process, if sub_tlist lacks target entry for the
-	 * sortClause, it is added.
+	 * Collect percentile nodes and classify them into groups by sortClause.
+	 * During this process, if sub_tlist lacks a target entry for the
+	 * sortClause, one is added.
 	 */
-	foreach (l, percnodes)
+	foreach(l, percnodes)
 	{
 		PercentileExpr *perc = lfirst(l);
-		List	   *sortClause;
-		ListCell   *sl;
+		List	   *sortClause;
+		ListCell   *sl;

 		sortClause = perc->sortClause;
 		Assert(sortClause);

 		/*
-		 * We need to add tlist to the parse's tlist. This is
-		 * basically parser's task, but the list is separated
-		 * to keep away from ungroup'ed columns check and
-		 * this is the right place to add back to the parser's tlist.
+		 * We need to add tlist to the parse's tlist. This is basically the
+		 * parser's task, but the list is kept separate to avoid the
+		 * ungrouped-columns check, and this is the right place to add it
+		 * back to the parser's tlist.
 		 */
-		foreach (sl, sortClause)
+		foreach(sl, sortClause)
 		{
-			SortClause *sc = lfirst(sl);
-			TargetEntry *tle, *sub_tle;
+			SortClause *sc = lfirst(sl);
+			TargetEntry *tle,
+					   *sub_tle;

 			tle = get_sortgroupclause_tle(sc, perc->sortTargets);
 			sub_tle = tlist_member((Node *) tle->expr, sub_tlist);
@@ -6774,8 +6966,8 @@ within_agg_planner(PlannerInfo *root,
 	group_context->sub_tlist = sub_tlist;

 	/*
-	 * Make the scaffold. We always take best_path here because
-	 * it is not clear which to use for upcoming complex plans.
+	 * Make the scaffold. We always take best_path here because it is not
+	 * clear which to use for upcoming complex plans.
 	 */
 	Assert(sub_tlist != NIL);
 	result_plan = create_plan(root, group_context->best_path);
@@ -6791,8 +6983,8 @@ within_agg_planner(PlannerInfo *root,
 	/*
 	 * The approach is very close to the one for DQA. If the plan consists
 	 * only one sort group, then construct straightforward plan without
-	 * mangling target list. If more than one, including normal aggregate,
-	 * we split each sort group (normal aggregate goes to NIL sort group) into
+	 * mangling target list. If more than one, including normal aggregate, we
+	 * split each sort group (normal aggregate goes to NIL sort group) into
 	 * sub pieces and construct separate plans, and join them to get the final
 	 * plan to match the desired target list.
 	 */
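The collection loops above bucket every aggregate by its sort clause: plain Aggrefs go into the NIL bucket, and each distinct sortClause (compared with equal()) opens a new bucket whose index later selects a sub-plan tree. A standalone C sketch of that classification, with strings standing in for sort clauses and "" for NIL (illustration only):

#include <stdio.h>
#include <string.h>

#define MAX_GROUPS 8

/* Assign each expression to a sort group, opening groups on demand. */
static int
classify(const char *sortkeys[], int n, const char *groups[], int counts[])
{
	int			ngroups = 0;

	for (int i = 0; i < n; i++)
	{
		int			g;

		for (g = 0; g < ngroups; g++)
			if (strcmp(groups[g], sortkeys[i]) == 0)
				break;			/* an existing sort group matches */
		if (g == ngroups)
		{
			groups[ngroups] = sortkeys[i];	/* open a new sort group */
			counts[ngroups++] = 0;
		}
		counts[g]++;
	}
	return ngroups;
}

int
main(void)
{
	const char *keys[] = {"", "ORDER BY a", "ORDER BY a", "ORDER BY b"};
	const char *groups[MAX_GROUPS];
	int			counts[MAX_GROUPS];
	int			n = classify(keys, 4, groups, counts);

	for (int i = 0; i < n; i++)
		printf("group %d (%s): %d node(s)\n", i, groups[i], counts[i]);
	return 0;
}

With a single resulting group the planner builds one straightforward plan; with several, each group becomes its own co-plan, joined back together at the end, as described above.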
@@ -6810,11 +7002,11 @@ within_agg_planner(PlannerInfo *root,
 	else
 	{
 		/*
-		 * The pattern on multi-level sort is similar to multi dqa.
-		 * We use its infrastructure much to avoid reinventing wheel.
+		 * The pattern on multi-level sort is similar to multi dqa. We use
+		 * its infrastructure heavily to avoid reinventing the wheel.
 		 */
 		List	   *base_plans;
-		MppGroupContext mgctx;
+		MppGroupContext mgctx;
 		List	   *rtable;

 		base_plans = share_plan(root, result_plan, numsortlist);
@@ -6830,15 +7022,16 @@ within_agg_planner(PlannerInfo *root,
 		mgctx.numGroupCols = numGroupCols;

 		/*
-		 * This code is from deconstruct_agg_info. What it does is to
-		 * collect grouping keys and make a simple list which contain
-		 * only those key expressions, which will be used in each individual
-		 * plan tree as leading columns (and later JOIN clause).
+		 * This code is from deconstruct_agg_info. What it does is to collect
+		 * grouping keys and make a simple list which contains only those key
+		 * expressions, which will be used in each individual plan tree as
+		 * leading columns (and later the JOIN clause).
 		 */
 		mgctx.grps_tlist = NIL;
 		for (i = 0; i < numGroupCols; i++)
 		{
-			TargetEntry *sub_tle, *prelim_tle;
+			TargetEntry *sub_tle,
+					   *prelim_tle;

 			sub_tle = get_tle_by_resno(sub_tlist, grpColIdx[i]);
 			prelim_tle = flatCopyTargetEntry(sub_tle);
@@ -6848,21 +7041,23 @@ within_agg_planner(PlannerInfo *root,
 		mgctx.dref_tlists = (List **) palloc0(numsortlist * sizeof(List *));

 		/*
-		 * Within-aggregate special. Used in within_aggregate_expr().
-		 * Each sub-plan tree is identified by the sort clause.
+		 * Within-aggregate special. Used in within_aggregate_expr(). Each
+		 * sub-plan tree is identified by the sort clause.
 		 */
 		mgctx.wagSortClauses = NIL;
 		for (i = 0; i < numsortlist; i++)
 			mgctx.wagSortClauses = lappend(mgctx.wagSortClauses, sortlist[i]);

 		/*
-		 * Prepare the final tlist to restore the original list. The main work
-		 * goes into deconstruct_within_agg(), which determins which sub-plan tree
-		 * this expression is actually coming from, and store that information in mgctx.
+		 * Prepare the final tlist to restore the original list. The main
+		 * work goes into deconstruct_within_agg(), which determines which
+		 * sub-plan tree this expression is actually coming from, and stores
+		 * that information in mgctx.
 		 */
-		foreach (l, root->parse->targetList)
+		foreach(l, root->parse->targetList)
 		{
-			TargetEntry *tle, *final_tle;
+			TargetEntry *tle,
+					   *final_tle;

 			tle = (TargetEntry *) lfirst(l);
 			final_tle = flatCopyTargetEntry(tle);
@@ -6873,12 +7068,13 @@ within_agg_planner(PlannerInfo *root,
 		/*
 		 * HAVING clause, same as target list. We wish we could optimize this
-		 * as pushing each expression down to the individual plan tree, but
-		 * we don't do it and just follow the same notion of DQA for now.
+		 * as pushing each expression down to the individual plan tree, but we
+		 * don't do it and just follow the same notion of DQA for now.
 		 */
-		foreach (l, (List *) root->parse->havingQual)
+		foreach(l, (List *) root->parse->havingQual)
 		{
-			Expr	   *qual, *fin_hqual;
+			Expr	   *qual,
+					   *fin_hqual;

 			qual = lfirst(l);
 			fin_hqual = (Expr *) deconstruct_within_agg((Node *) qual, &mgctx);
@@ -6897,8 +7093,8 @@ within_agg_planner(PlannerInfo *root,
 		}

 		/*
-		 * Now plan each tree. Store them to array and later join them.
-		 * Don't forget to save rtable representing each subquery.
+		 * Now plan each tree. Store them in an array and later join them.
+		 * Don't forget to save the rtable representing each subquery.
 		 */
 		rtable = NIL;
 		mgctx.dqaArgs = (DqaInfo *) palloc(numsortlist * sizeof(DqaInfo));
@@ -6915,10 +7111,11 @@ within_agg_planner(PlannerInfo *root,
 			 * The base plan is created by best_path.
 			 */
 			current_pathkeys = group_context->best_path->pathkeys;
+
 			/*
 			 * We use different instance of PlannerInfo for each cycle
-			 * especially cdb_grouping_planner frees simple_rel_array.
-			 * See also plan_append_aggs_with_rewrite.
+			 * especially since cdb_grouping_planner frees simple_rel_array.
+			 * See also plan_append_aggs_with_rewrite.
 			 */
 			memcpy(&root_copy, root, sizeof(PlannerInfo));
 			sz = root->simple_rel_array_size * sizeof(RelOptInfo *);
@@ -6931,9 +7128,10 @@ within_agg_planner(PlannerInfo *root,
 			 */
 			coquery = copyObject(root->parse);
 			coquery->targetList = seq_tlist_concat(copyObject(mgctx.grps_tlist), mgctx.dref_tlists[i]);
+
 			/*
-			 * Clear havingQual and scatterClause, since they will be handled only
-			 * the top of joins, and never in individual aggregate.
+			 * Clear havingQual and scatterClause, since they will be handled
+			 * only at the top of the joins, and never in an individual
+			 * aggregate.
 			 */
 			coquery->havingQual = NULL;
 			coquery->scatterClause = NIL;
@@ -6956,7 +7154,7 @@ within_agg_planner(PlannerInfo *root,
 				/*
 				 * Run normal grouping planner for normal aggs.
 				 */
-				GroupContext local_group_context;
+				GroupContext local_group_context;

 				memcpy(&local_group_context, group_context, sizeof(GroupContext));
 				local_group_context.subplan = list_nth(base_plans, i);
@@ -6983,9 +7181,9 @@ within_agg_planner(PlannerInfo *root,
 			snprintf(queryname, sizeof(queryname), "wag_coplan_%d", i + 1);

 			mgctx.dqaArgs[i].coplan =
-				wrap_plan_index(&root_copy, coplan, coquery,
-								NULL, i + 1, queryname,
-								&coquery);
+				wrap_plan_index(&root_copy, coplan, coquery,
+								NULL, i + 1, queryname,
+								&coquery);

 			rtable = list_concat(rtable, coquery->rtable);
 		}
@@ -7002,9 +7200,10 @@ within_agg_planner(PlannerInfo *root,
 		result_plan->qual = (List *)
 			finalize_split_expr((Node *) mgctx.fin_hqual, &mgctx);
 		UpdateScatterClause(root->parse, result_plan->targetlist);
+
 		/*
-		 * Reconstruct the flow since the targetlist for the result_plan may have
-		 * changed.
+		 * Reconstruct the flow since the targetlist for the result_plan may
+		 * have changed.
 		 */
 		result_plan->flow = pull_up_Flow(result_plan, result_plan->lefttree);

 		/* Need to adjust root->parse for upper plan.
 		 */
@@ -7020,31 +7219,37 @@ within_agg_planner(PlannerInfo *root,
 	return result_plan;
 }

-Plan *add_motion_to_dqa_child(Plan *plan, PlannerInfo *root, bool *motion_added)
+Plan *
+add_motion_to_dqa_child(Plan *plan, PlannerInfo *root, bool *motion_added)
 {
-	Plan *result = plan;
+	Plan	   *result = plan;
+
 	*motion_added = false;
-
-	List *pathkeys = make_pathkeys_for_groupclause(root, root->parse->groupClause, plan->targetlist);
+
+	List	   *pathkeys = make_pathkeys_for_groupclause(root, root->parse->groupClause, plan->targetlist);
 	CdbPathLocus locus = cdbpathlocus_from_flow(plan->flow);
+
 	if (CdbPathLocus_IsPartitioned(locus) && NIL != plan->flow->hashExpr)
 	{
 		locus = cdbpathlocus_from_exprs(root, plan->flow->hashExpr);
 	}
-
-	if (!cdbpathlocus_collocates(root, locus, pathkeys, true /*exact_match*/))
+
+	if (!cdbpathlocus_collocates(root, locus, pathkeys, true /* exact_match */ ))
 	{
-		/* MPP-22413: join requires exact distribution match for collocation purposes,
-		 * which may not be provided by the underlying group by, as computing the
-		 * group by only requires relaxed distribution collocation
+		/*
+		 * MPP-22413: join requires exact distribution match for collocation
+		 * purposes, which may not be provided by the underlying group by, as
+		 * computing the group by only requires relaxed distribution
+		 * collocation
 		 */
 		List	   *groupExprs = get_sortgrouplist_exprs(root->parse->groupClause,
-									plan->targetlist);
+														 plan->targetlist);
+
 		result = (Plan *) make_motion_hash(root, plan, groupExprs);
 		result->total_cost += incremental_motion_cost(plan->plan_rows, plan->plan_rows);
 		*motion_added = true;
 	}
-
+
 	return result;
 }
diff --git a/src/backend/cdb/cdbpartition.c b/src/backend/cdb/cdbpartition.c
index 22947672f4..7d93a0764b 100644
--- a/src/backend/cdb/cdbpartition.c
+++ b/src/backend/cdb/cdbpartition.c
@@ -63,16 +63,16 @@
 #define MAX_XCHG_BLOCK_SIZE 4096

 typedef struct
-	{
-		char	   *key;
-		List	   *table_cons;
-		List	   *part_cons;
-		List	   *cand_cons;
-	} ConstraintEntry;
+{
+	char	   *key;
+	List	   *table_cons;
+	List	   *part_cons;
+	List	   *cand_cons;
+} ConstraintEntry;

 typedef struct
 {
-	Node *entry;
+	Node	   *entry;
 } ConNodeEntry;


@@ -82,30 +82,26 @@ typedef enum
 	PART_PART,
 	PART_CAND
 } PartExchangeRole;
-static void
-record_constraints(Relation pgcon, MemoryContext context,
+static void record_constraints(Relation pgcon, MemoryContext context,
 				   HTAB *hash_tbl, Relation rel, PartExchangeRole xrole);

-static char *
-constraint_names(List *cons);
+static char *constraint_names(List *cons);

 static void
-constraint_diffs(List *cons_a, List *cons_b, bool match_names, List **missing, List **extra);
+			constraint_diffs(List *cons_a, List *cons_b, bool match_names, List **missing, List **extra);

 static void add_template_encoding_clauses(Oid relid, Oid paroid, List *stenc);

-static PartitionNode *
-findPartitionNodeEntry(PartitionNode *partitionNode, Oid partOid);
+static PartitionNode *findPartitionNodeEntry(PartitionNode *partitionNode, Oid partOid);

 static uint32
-constrNodeHash(const void *keyPtr, Size keysize);
+			constrNodeHash(const void *keyPtr, Size keysize);

 static int
-constrNodeMatch(const void *keyPtr1, const void *keyPtr2, Size keysize);
+			constrNodeMatch(const void *keyPtr1, const void *keyPtr2, Size keysize);

-static void
-parruleord_open_gap(Oid partid, int2 level, Oid parent,
+static void parruleord_open_gap(Oid partid, int2 level, Oid parent,
 					int2 ruleord, int stopkey, bool closegap);

 /*
@@ -115,106 +111,98 @@ parruleord_open_gap(Oid partid, int2 level, Oid parent,
  */

 /* Hash entire string.
  */
-static uint32 key_string_hash(const void *key, Size keysize)
+static uint32
+key_string_hash(const void *key, Size keysize)
 {
 	Size		s_len = strlen((const char *) key);

-	Assert(keysize == sizeof(char*));
+	Assert(keysize == sizeof(char *));
 	return DatumGetUInt32(hash_any((const unsigned char *) key, (int) s_len));
 }

 /* Compare entire string. */
-static int key_string_compare(const void *key1, const void *key2, Size keysize)
+static int
+key_string_compare(const void *key1, const void *key2, Size keysize)
 {
-	Assert(keysize == sizeof(char*));
-	return strcmp(((ConstraintEntry*)key1)->key, key2);
+	Assert(keysize == sizeof(char *));
+	return strcmp(((ConstraintEntry *) key1)->key, key2);
 }

 /* Copy string by copying pointer. */
-static void *key_string_copy(void *dest, const void *src, Size keysize)
+static void *
+key_string_copy(void *dest, const void *src, Size keysize)
 {
-	Assert(keysize == sizeof(char*));
+	Assert(keysize == sizeof(char *));

-	*((char**)dest) = (char*)src; /* trust caller re allocation */
-	return NULL; /* not used */
+	*((char **) dest) = (char *) src;	/* trust caller re allocation */
+	return NULL;				/* not used */
 }

 static char parttype_to_char(PartitionByType type);
 static void add_partition(Partition *part);
 static void add_partition_rule(PartitionRule *rule);
-static Oid get_part_oid(Oid rootrelid, int16 parlevel, bool istemplate);
+static Oid	get_part_oid(Oid rootrelid, int16 parlevel, bool istemplate);
 static Datum *magic_expr_to_datum(Relation rel, PartitionNode *partnode,
-				   Node *expr, bool **ppisnull);
-static Oid selectPartitionByRank(PartitionNode *partnode, int rnk);
+					Node *expr, bool **ppisnull);
+static Oid	selectPartitionByRank(PartitionNode *partnode, int rnk);
 static bool compare_partn_opfuncid(PartitionNode *partnode,
-					char *pub, char *compare_op,
-					List *colvals,
-					Datum *values, bool *isnull,
-					TupleDesc tupdesc);
-static PartitionNode *
-selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull,
+						char *pub, char *compare_op,
+						List *colvals,
+						Datum *values, bool *isnull,
+						TupleDesc tupdesc);
+static PartitionNode *selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull,
 					TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *foundOid, PartitionRule **prule,
 					Oid exprTypid);
-static Oid get_less_than_oper(Oid lhstypid, Oid rhstypid, bool strictlyless);
+static Oid	get_less_than_oper(Oid lhstypid, Oid rhstypid, bool strictlyless);
 static FmgrInfo *get_less_than_comparator(int keyno, PartitionRangeState *rs, Oid ruleTypeOid, Oid exprTypeOid, bool strictlyless, bool is_direct);
 static int	range_test(Datum tupval, Oid ruleTypeOid, Oid exprTypeOid, PartitionRangeState *rs, int keyno,
 		   PartitionRule *rule);
-static PartitionNode *
-selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull,
+static PartitionNode *selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull,
 					 TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *foundOid, int *pSearch, PartitionRule **prule, Oid exprTypid);
-static PartitionNode *
-selectHashPartition(PartitionNode *partnode, Datum *values, bool *isnull,
+static PartitionNode *selectHashPartition(PartitionNode *partnode, Datum *values, bool *isnull,
 					TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *found, PartitionRule **prule);
-static Oid
-selectPartition1(PartitionNode *partnode, Datum *values, bool *isnull,
+static Oid selectPartition1(PartitionNode *partnode, Datum *values, bool *isnull,
 				 TupleDesc tupdesc,
 				 PartitionAccessMethods *accessMethods,
 				 int *pSearch,
 				 PartitionNode **ppn_out);
-static int
-atpxPart_validate_spec(
-	PartitionBy *pBy,
-	CreateStmtContext *pcxt,
-	Relation rel,
-	CreateStmt *ct,
-	PartitionElem *pelem,
-	PartitionNode *pNode,
-	char *partName,
-	bool isDefault,
-	PartitionByType part_type,
-	char *partDesc);
+static int atpxPart_validate_spec(
+					   PartitionBy *pBy,
+					   CreateStmtContext *pcxt,
+					   Relation rel,
+					   CreateStmt *ct,
+					   PartitionElem *pelem,
+					   PartitionNode *pNode,
+					   char *partName,
+					   bool isDefault,
+					   PartitionByType part_type,
+					   char *partDesc);

 static void atpxSkipper(PartitionNode *pNode, int *skipped);

-static List *
-build_rename_part_recurse(PartitionRule *rule, const char *old_parentname,
+static List *build_rename_part_recurse(PartitionRule *rule, const char *old_parentname,
 						  const char *new_parentname,
 						  int *skipped);
 static Oid
-get_opfuncid_by_opname(List *opname, Oid lhsid, Oid rhsid);
+			get_opfuncid_by_opname(List *opname, Oid lhsid, Oid rhsid);

-static PgPartRule *
-get_pprule_from_ATC(Relation rel, AlterTableCmd *cmd);
+static PgPartRule *get_pprule_from_ATC(Relation rel, AlterTableCmd *cmd);

-static List*
-get_partition_rules(PartitionNode *pn);
+static List *get_partition_rules(PartitionNode *pn);

 static bool
-relation_has_supers(Oid relid);
+			relation_has_supers(Oid relid);

-static NewConstraint *
-constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand,
+static NewConstraint *constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand,
 						bool validate, bool is_split, Relation pgcon);

-static char *
-ChooseConstraintNameForPartitionCreate(const char *rname,
+static char *ChooseConstraintNameForPartitionCreate(const char *rname,
 									   const char *cname,
 									   const char *label,
 									   List *used_names);

-static Bitmapset *
-get_partition_key_bitmapset(Oid relid);
+static Bitmapset *get_partition_key_bitmapset(Oid relid);

 static List *get_deparsed_partition_encodings(Oid relid, Oid paroid);
 static List *rel_get_leaf_relids_from_rule(Oid ruleOid);
@@ -231,9 +219,10 @@ rel_is_default_partition(Oid relid)
 	ScanKeyData scankey;
 	SysScanDesc sscan;

-	/* Though pg_partition and pg_partition_rule are only populated
-	 * on the entry database, we accept calls from QEs running a
-	 * segment, but return false.
+	/*
+	 * Though pg_partition and pg_partition_rule are only populated on the
+	 * entry database, we accept calls from QEs running a segment, but return
+	 * false.
 	 */
 	if (Gp_segment != -1)
 		return false;
@@ -250,7 +239,7 @@ rel_is_default_partition(Oid relid)

 	Insist(HeapTupleIsValid(tuple));

-	parisdefault = ((Form_pg_partition_rule)GETSTRUCT(tuple))->parisdefault;
+	parisdefault = ((Form_pg_partition_rule) GETSTRUCT(tuple))->parisdefault;

 	systable_endscan(sscan);
 	heap_close(partrulerel, AccessShareLock);
@@ -271,15 +260,15 @@ rel_is_default_partition(Oid relid)
 bool
 rel_is_partitioned(Oid relid)
 {
-	ScanKeyData scankey;
+	ScanKeyData scankey;
 	Relation	rel;
 	SysScanDesc sscan;
 	bool		result;

 	/*
-	 * Though pg_partition and pg_partition_rule are only populated
-	 * on the entry database, we accept calls from QEs running a
-	 * segment, but return false.
+	 * Though pg_partition and pg_partition_rule are only populated on the
+	 * entry database, we accept calls from QEs running a segment, but return
+	 * false.
 	 */
 	if (Gp_segment != -1)
 		return false;
@@ -316,17 +305,18 @@ rel_is_partitioned(Oid relid)
 List *
 rel_partition_key_attrs(Oid relid)
 {
-	Relation	rel;
+	Relation	rel;
 	ScanKeyData key;
 	SysScanDesc scan;
-	HeapTuple	tuple;
-	List	   *pkeys = NIL;
+	HeapTuple	tuple;
+	List	   *pkeys = NIL;

-	/* Table pg_partition is only populated on the entry database,
-	 * however, we disable calls from outside dispatch to foil use
-	 * of utility mode. (Full UCS may may this test obsolete.)
+	/*
+	 * Table pg_partition is only populated on the entry database; however, we
+	 * disable calls from outside dispatch to foil use of utility mode. (Full
+	 * UCS may make this test obsolete.)
 	 */
-	if (Gp_session_role != GP_ROLE_DISPATCH )
+	if (Gp_session_role != GP_ROLE_DISPATCH)
 		elog(ERROR, "mode not dispatch");

 	rel = heap_open(PartitionRelationId, AccessShareLock);
@@ -342,9 +332,9 @@ rel_partition_key_attrs(Oid relid)

 	tuple = systable_getnext(scan);

-	while ( HeapTupleIsValid(tuple) )
+	while (HeapTupleIsValid(tuple))
 	{
-		Index i;
+		Index		i;
 		Form_pg_partition p = (Form_pg_partition) GETSTRUCT(tuple);

 		if (p->paristemplate)
@@ -353,9 +343,9 @@ rel_partition_key_attrs(Oid relid)
 			continue;
 		}

-		for ( i = 0; i < p->parnatts; i++ )
+		for (i = 0; i < p->parnatts; i++)
 		{
-			pkeys = lappend_int(pkeys, (Oid)p->paratts.values[i]);
+			pkeys = lappend_int(pkeys, (Oid) p->paratts.values[i]);
 		}

 		tuple = systable_getnext(scan);
@@ -382,7 +372,8 @@ rel_partition_key_attrs(Oid relid)
 List *
 rel_partition_keys_ordered(Oid relid)
 {
-	List *pkeys = NIL;
+	List	   *pkeys = NIL;
+
 	rel_partition_keys_kinds_ordered(relid, &pkeys, NULL);
 	return pkeys;
 }
@@ -398,11 +389,11 @@ rel_partition_keys_kinds_ordered(Oid relid, List **pkeys, List **pkinds)
 	Relation	partrel;
 	ScanKeyData scankey;
 	SysScanDesc sscan;
-	List *levels = NIL;
-	List *keysUnordered = NIL;
-	List *kindsUnordered = NIL;
-	int nlevels = 0;
-	HeapTuple tuple = NULL;
+	List	   *levels = NIL;
+	List	   *keysUnordered = NIL;
+	List	   *kindsUnordered = NIL;
+	int			nlevels = 0;
+	HeapTuple	tuple = NULL;

 	partrel = heap_open(PartitionRelationId, AccessShareLock);

@@ -422,10 +413,11 @@ rel_partition_keys_kinds_ordered(Oid relid, List **pkeys, List **pkinds)
 			continue;
 		}

-		List *levelkeys = NIL;
-		for (int i = 0; i < p->parnatts; i++ )
+		List	   *levelkeys = NIL;
+
+		for (int i = 0; i < p->parnatts; i++)
 		{
-			levelkeys = lappend_int(levelkeys, (Oid)p->paratts.values[i]);
+			levelkeys = lappend_int(levelkeys, (Oid) p->paratts.values[i]);
 		}

 		nlevels++;
@@ -453,8 +445,8 @@ rel_partition_keys_kinds_ordered(Oid relid, List **pkeys, List **pkinds)
 		return;
 	}

-	// now order the keys and kinds by level
-	for (int i = 0; i< nlevels; i++)
+	/* now order the keys and kinds by level */
+	for (int i = 0; i < nlevels; i++)
 	{
 		ListCell   *cell;
 		int			pos = 0;
@@ -466,7 +458,7 @@ rel_partition_keys_kinds_ordered(Oid relid, List **pkeys, List **pkinds)
 				break;
 			++pos;
 		}
-		Assert (cell != NULL);
+		Assert(cell != NULL);

 		if (pkeys != NULL)
 			*pkeys = lappend(*pkeys, list_nth(keysUnordered, pos));
@@ -480,16 +472,16 @@ rel_partition_keys_kinds_ordered(Oid relid, List **pkeys, List **pkinds)
 }

 /*
- * Does relation have a external partition?
- * Returns true only when the input is the root partition
- * of a partitioned table and it has external partitions.
- */
+ * Does relation have an external partition? Returns true only when the input
+ * is the root partition of a partitioned table and it has external
+ * partitions.
+ */
 bool
 rel_has_external_partition(Oid relid)
 {
-	ListCell *lc = NULL;
-	PartitionNode *n = get_parts(relid, 0 /*level*/ ,
-							0 /*parent*/, false /* inctemplate */, false /*includesubparts*/);
+	ListCell   *lc = NULL;
+	PartitionNode *n = get_parts(relid, 0 /* level */ ,
+			 0 /* parent */ , false /* inctemplate */ , false /* includesubparts */ );

 	if (n == NULL || n->rules == NULL)
 		return false;
@@ -497,7 +489,7 @@ rel_has_external_partition(Oid relid)
 	foreach(lc, n->rules)
 	{
 		PartitionRule *rule = lfirst(lc);
-		Relation rel = heap_open(rule->parchildrelid, NoLock);
+		Relation	rel = heap_open(rule->parchildrelid, NoLock);

 		if (RelationIsExternal(rel))
 		{
@@ -519,19 +511,20 @@ rel_has_external_partition(Oid relid)
 bool
 rel_has_appendonly_partition(Oid relid)
 {
-	ListCell *lc = NULL;
+	ListCell   *lc = NULL;
 	List	   *leaf_oid_list = NIL;
-	PartitionNode *n = get_parts(relid, 0 /*level*/ ,
-							0 /*parent*/, false /* inctemplate */, true /*includesubparts*/);
+	PartitionNode *n = get_parts(relid, 0 /* level */ ,
+			  0 /* parent */ , false /* inctemplate */ , true /* includesubparts */ );

 	if (n == NULL || n->rules == NULL)
 		return false;

-	leaf_oid_list = all_leaf_partition_relids(n); /* all leaves */
+	leaf_oid_list = all_leaf_partition_relids(n);		/* all leaves */

 	foreach(lc, leaf_oid_list)
 	{
-		Relation rel = heap_open(lfirst_oid(lc), NoLock);
+		Relation	rel = heap_open(lfirst_oid(lc), NoLock);
+
 		heap_close(rel, NoLock);

 		if (RelationIsAoRows(rel) || RelationIsAoCols(rel))
@@ -557,14 +550,16 @@ rel_has_appendonly_partition(Oid relid)
 bool
 rel_is_child_partition(Oid relid)
 {
-	ScanKeyData scankey;
+	ScanKeyData scankey;
 	Relation	rel;
 	SysScanDesc sscan;
 	bool		result;

-	/* Though pg_partition and pg_partition_rule are populated only on the
+	/*
+	 * Though pg_partition and pg_partition_rule are populated only on the
 	 * entry database, are some unguarded calles that may come from segments,
-	 * so we return false, even though we don't actually know. */
+	 * so we return false, even though we don't actually know.
+	 */
 	if (Gp_segment != -1)
 		return false;

@@ -598,20 +593,20 @@ rel_is_child_partition(Oid relid)
 bool
 rel_is_leaf_partition(Oid relid)
 {
-	HeapTuple	tuple;
-	Oid			paroid = InvalidOid;
-	int			maxdepth = 0;
-	int			mylevel = 0;
+	HeapTuple	tuple;
+	Oid			paroid = InvalidOid;
+	int			maxdepth = 0;
+	int			mylevel = 0;
 	Relation	partrulerel;
 	Relation	partrel;
 	ScanKeyData scankey;
 	SysScanDesc sscan;
-	Oid			partitioned_rel = InvalidOid; /* OID of the root table of the
-											   * partition set
-											   */
+	Oid			partitioned_rel = InvalidOid;	/* OID of the root table of
+												 * the partition set */
+
 	/*
-	 * Find the pg_partition_rule entry to see if this is a child at
-	 * all and, if so, to locate the OID for the pg_partition entry.
+	 * Find the pg_partition_rule entry to see if this is a child at all and,
+	 * if so, to locate the OID for the pg_partition entry.
 	 *
 	 * SELECT paroid FROM pg_partition_rule WHERE parchildrelid = :1
 	 */
@@ -639,8 +634,8 @@ rel_is_leaf_partition(Oid relid)

 	Insist(HeapTupleIsValid(tuple));

-	mylevel = ((Form_pg_partition)GETSTRUCT(tuple))->parlevel;
-	partitioned_rel = ((Form_pg_partition)GETSTRUCT(tuple))->parrelid;
+	mylevel = ((Form_pg_partition) GETSTRUCT(tuple))->parlevel;
+	partitioned_rel = ((Form_pg_partition) GETSTRUCT(tuple))->parrelid;

 	ReleaseSysCache(tuple);

@@ -654,16 +649,16 @@ rel_is_leaf_partition(Oid relid)
 							  SnapshotNow, 1, &scankey);

 	/*
-	 * Of course, we could just maxdepth++ but this seems safer -- we
-	 * don't have to worry about the starting depth being 0, 1 or
-	 * something else.
+	 * Of course, we could just maxdepth++ but this seems safer -- we don't
+	 * have to worry about the starting depth being 0, 1 or something else.
 	 */
 	while (HeapTupleIsValid(tuple = systable_getnext(sscan)))
 	{
 		/* not interested in templates */
-		if (((Form_pg_partition)GETSTRUCT(tuple))->paristemplate == false)
+		if (((Form_pg_partition) GETSTRUCT(tuple))->paristemplate == false)
 		{
-			int depth = ((Form_pg_partition)GETSTRUCT(tuple))->parlevel;
+			int			depth = ((Form_pg_partition) GETSTRUCT(tuple))->parlevel;
+
 			maxdepth = Max(maxdepth, depth);
 		}
 	}
@@ -680,7 +675,8 @@ rel_is_leaf_partition(Oid relid)
 * table or when called other then on the entry database, i.e., only meaningful
 * on the entry database.
 */
-PartStatus rel_part_status(Oid relid)
+PartStatus
+rel_part_status(Oid relid)
 {
 	if (Gp_role != GP_ROLE_DISPATCH)
 	{
@@ -689,17 +685,17 @@ PartStatus rel_part_status(Oid relid)
 		return PART_STATUS_NONE;
 	}

-	if ( rel_is_partitioned(relid) )
+	if (rel_is_partitioned(relid))
 	{
-		Assert( !rel_is_child_partition(relid) && !rel_is_leaf_partition(relid) );
+		Assert(!rel_is_child_partition(relid) && !rel_is_leaf_partition(relid));
 		return PART_STATUS_ROOT;
 	}
-	else /* not an actual partitioned table root */
+	else						/* not an actual partitioned table root */
 	{
-		if ( rel_is_child_partition(relid) )
+		if (rel_is_child_partition(relid))
 			return rel_is_leaf_partition(relid) ? PART_STATUS_LEAF : PART_STATUS_INTERIOR;
-		else /* not a part of a partitioned table */
-			Assert( !rel_is_child_partition(relid) );
+		else					/* not a part of a partitioned table */
+			Assert(!rel_is_child_partition(relid));
 	}
 	return PART_STATUS_NONE;
 }
@@ -711,17 +707,18 @@ PartStatus rel_part_status(Oid relid)
 static List *
 sorted_insert_list(List *list, HeapTuple tuple)
 {
-	ListCell *lc;
-	ListCell *lc_prev = NULL;
-	HeapTuple list_tup;
-	List *ret_list = list;
+	ListCell   *lc;
+	ListCell   *lc_prev = NULL;
+	HeapTuple	list_tup;
+	List	   *ret_list = list;
+
 	foreach(lc, ret_list)
 	{
 		list_tup = lfirst(lc);

 		if (HeapTupleGetOid(list_tup) > HeapTupleGetOid(tuple))
 		{
-		    break;
+			break;
 		}
 		lc_prev = lc;
 	}
@@ -767,10 +764,10 @@ record_constraints(Relation pgcon,
 {
 	HeapTuple	tuple;
 	Relation	conRel;
-	Oid conid;
-	char *condef;
+	Oid			conid;
+	char	   *condef;
 	ConstraintEntry *entry;
-	bool found;
+	bool		found;
 	MemoryContext oldcontext;
 	ScanKeyData scankey;
 	SysScanDesc sscan;
@@ -791,15 +788,16 @@ record_constraints(Relation pgcon,
 		conid = HeapTupleGetOid(tuple);
 		condef = pg_get_constraintexpr_string(conid);

-		entry = (ConstraintEntry*)hash_search(hash_tbl,
-											  (void*) condef,
-											  HASH_ENTER,
-											  &found);
+		entry = (ConstraintEntry *) hash_search(hash_tbl,
+												(void *) condef,
+												HASH_ENTER,
+												&found);

-		/* A tuple isn't a Node, but we'll stick it in a List
-		 * anyway, and just be careful.
+		/*
+		 * A tuple isn't a Node, but we'll stick it in a List anyway, and just
+		 * be careful.
 		 */
-		if ( !found )
+		if (!found)
 		{
 			entry->key = condef;
 			entry->table_cons = NIL;
@@ -807,19 +805,19 @@ record_constraints(Relation pgcon,
 			entry->cand_cons = NIL;
 		}
 		tuple = heap_copytuple(tuple);
-		switch(xrole)
+		switch (xrole)
 		{
 			case PART_TABLE:
 				entry->table_cons = sorted_insert_list(
-					entry->table_cons, tuple);
+													entry->table_cons, tuple);
 				break;
 			case PART_PART:
 				entry->part_cons = sorted_insert_list(
-					entry->part_cons, tuple);
+													  entry->part_cons, tuple);
 				break;
 			case PART_CAND:
 				entry->cand_cons = sorted_insert_list(
-					entry->cand_cons, tuple);
+													  entry->cand_cons, tuple);
 				break;
 			default:
 				Assert(FALSE);
@@ -843,35 +841,35 @@ record_constraints(Relation pgcon,
 */
 List *
 cdb_exchange_part_constraints(Relation table,
-							  Relation part,
-							  Relation cand,
-							  bool validate,
-							  bool is_split,
-							  AlterPartitionCmd *pc)
+							  Relation part,
+							  Relation cand,
+							  bool validate,
+							  bool is_split,
+							  AlterPartitionCmd *pc)
 {
-	HTAB *hash_tbl;
-	HASHCTL hash_ctl;
+	HTAB	   *hash_tbl;
+	HASHCTL		hash_ctl;
 	HASH_SEQ_STATUS hash_seq;
-	Relation pgcon;
+	Relation	pgcon;
 	MemoryContext context;
 	MemoryContext oldcontext;
 	ConstraintEntry *entry;
-	AttrMap *p2t = NULL;
-	AttrMap *c2t = NULL;
+	AttrMap    *p2t = NULL;
+	AttrMap    *c2t = NULL;

-	HeapTuple tuple;
+	HeapTuple	tuple;
 	Form_pg_constraint con;

-	List *excess_constraints = NIL;
-	List *missing_constraints = NIL;
-	List *missing_part_constraints = NIL;
-	List *validation_list = NIL;
-	int delta_checks = 0;
+	List	   *excess_constraints = NIL;
+	List	   *missing_constraints = NIL;
+	List	   *missing_part_constraints = NIL;
+	List	   *validation_list = NIL;
+	int			delta_checks = 0;

 	/*
-	 * Setup an empty hash table mapping constraint definition
-	 * strings to ConstraintEntry structures.
+	 * Set up an empty hash table mapping constraint definition strings to
+	 * ConstraintEntry structures.
 	 */
 	context = AllocSetContextCreate(CurrentMemoryContext,
 									"Constraint Exchange Context",
@@ -880,7 +878,7 @@ cdb_exchange_part_constraints(Relation table,
 									MAX_XCHG_BLOCK_SIZE);

 	memset(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(char*);
+	hash_ctl.keysize = sizeof(char *);
 	hash_ctl.entrysize = sizeof(ConstraintEntry);
 	hash_ctl.hash = key_string_hash;
 	hash_ctl.match = key_string_compare;
@@ -896,10 +894,11 @@ cdb_exchange_part_constraints(Relation table,
 	/* Open pg_constraint here for use in the subroutine and below. */
 	pgcon = heap_open(ConstraintRelationId, AccessShareLock);

-	/* We need attribute numbers normalized to the partitioned table.
-	 * Note that these maps are inverse to the usual table-to-part maps.
+	/*
+	 * We need attribute numbers normalized to the partitioned table. Note
+	 * that these maps are inverse to the usual table-to-part maps.
 	 */
-	oldcontext = MemoryContextSwitchTo(context);
+	oldcontext = MemoryContextSwitchTo(context);
 	map_part_attrs(part, table, &p2t, TRUE);
 	map_part_attrs(cand, table, &c2t, TRUE);
 	MemoryContextSwitchTo(oldcontext);
@@ -910,7 +909,8 @@ cdb_exchange_part_constraints(Relation table,
 	record_constraints(pgcon, context, hash_tbl, cand, PART_CAND);
 	hash_freeze(hash_tbl);

-	/* Each entry in the hash table represents a single logically equivalent
+	/*
+	 * Each entry in the hash table represents a single logically equivalent
 	 * constraint which may appear zero or more times (under different names)
 	 * on each of the three involved relations. By construction, it will
 	 * appear on at least one list.
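For reference, the dynahash setup used here keys the table by the contents of the constraint-definition string while storing only a char pointer in the entry, which is exactly why key_string_hash/key_string_compare/key_string_copy are installed. A sketch of the same setup under the usual backend headers (DemoEntry is a made-up entry type standing in for ConstraintEntry; this assumes a backend memory context and the static key_string_* helpers from cdbpartition.c above):

#include "postgres.h"
#include "utils/hsearch.h"

typedef struct DemoEntry
{
	char	   *key;			/* must be first, per dynahash convention */
	int			payload;
} DemoEntry;

static HTAB *
make_string_keyed_table(MemoryContext cxt)
{
	HASHCTL		ctl;

	MemSet(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(char *);
	ctl.entrysize = sizeof(DemoEntry);
	ctl.hash = key_string_hash;		/* hash the pointed-to text */
	ctl.match = key_string_compare;	/* compare the pointed-to text */
	ctl.keycopy = key_string_copy;	/* store the pointer, not a copy */
	ctl.hcxt = cxt;

	return hash_create("demo string table", 16, &ctl,
					   HASH_ELEM | HASH_FUNCTION | HASH_COMPARE |
					   HASH_KEYCOPY | HASH_CONTEXT);
}

Lookups then pass the string pointer itself as the key, as record_constraints() does with hash_search(hash_tbl, (void *) condef, HASH_ENTER, &found); the custom keycopy means the entry merely aliases the caller-allocated string.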
@@ -920,24 +920,26 @@ cdb_exchange_part_constraints(Relation table,
 	hash_seq_init(&hash_seq, hash_tbl);
 	while ((entry = hash_seq_search(&hash_seq)))
 	{
-		if ( list_length(entry->table_cons) > 0 )
+		if (list_length(entry->table_cons) > 0)
 		{
-			/* REGULAR CONSTRAINT
+			/*
+			 * REGULAR CONSTRAINT
 			 *
-			 * Constraints on the whole partitioned table are regular (in
-			 * the sense that they do not enforce partitioning rules and
+			 * Constraints on the whole partitioned table are regular (in the
+			 * sense that they do not enforce partitioning rules and
 			 * corresponding constraints must occur on every part).
 			 */
-			List *missing = NIL;
-			List *extra = NIL;
+			List	   *missing = NIL;
+			List	   *extra = NIL;

-			if ( list_length(entry->part_cons) == 0 )
+			if (list_length(entry->part_cons) == 0)
 			{
-				/* The regular constraint is missing from the existing part,
+				/*
+				 * The regular constraint is missing from the existing part,
 				 * so there is a database anomaly. Warn rather than issuing
-				 * an error, because this may be an attempt to use EXCHANGE
-				 * to correct the problem. There may be multiple constraints
+				 * an error, because this may be an attempt to use EXCHANGE to
+				 * correct the problem. There may be multiple constraints
 				 * with different names, but report only the first name since
 				 * the constraint expression itself is all that matters.
 				 */
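constraint_diffs(), called in the next hunk, is in essence a two-way set difference by constraint name: occurrences on the table but not the candidate come back as missing (must be added to the candidate), and occurrences on the candidate but not the table come back as extra (must be dropped). A standalone C sketch of that computation over plain string arrays (the real code walks pg_constraint tuples held in Lists, and can also match ignoring names):

#include <stdio.h>
#include <string.h>

/* Report names present in a[] but not b[] (missing) and vice versa (extra). */
static void
diff_names(const char *a[], int na, const char *b[], int nb)
{
	for (int i = 0; i < na; i++)
	{
		int			j;

		for (j = 0; j < nb; j++)
			if (strcmp(a[i], b[j]) == 0)
				break;
		if (j == nb)
			printf("missing on candidate: %s\n", a[i]);
	}
	for (int j = 0; j < nb; j++)
	{
		int			i;

		for (i = 0; i < na; i++)
			if (strcmp(a[i], b[j]) == 0)
				break;
		if (i == na)
			printf("extra on candidate: %s\n", b[j]);
	}
}

int
main(void)
{
	const char *table_cons[] = {"chk_a", "chk_b"};
	const char *cand_cons[] = {"chk_b", "chk_c"};

	diff_names(table_cons, 2, cand_cons, 2);
	return 0;
}

The quadratic probe is presumably acceptable at catalog scale; the point is only the missing/extra split that drives the lists accumulated below.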
* * Since we use the partition constraint of the part to check or
- * construct the partition constraint of the candidate, we insist it
- * is in good working order, and issue an error, if not.
+ * construct the partition constraint of the candidate, we insist
+ * it is in good working order, and issue an error, if not.
*/
- int n = list_length(entry->part_cons);
+ int n = list_length(entry->part_cons);
- if ( n > 1 )
+ if (n > 1)
{
elog(ERROR, "multiple partition constraints (same key) on \"%s\"",
RelationGetRelationName(part));
}
- /* Get the model partition constraint.
+ /*
+ * Get the model partition constraint.
*/
tuple = linitial(entry->part_cons);
con = (Form_pg_constraint) GETSTRUCT(tuple);
- /* Check it, though this is cursory in that we don't check that
+ /*
+ * Check it, though this is cursory in that we don't check that
* the right attributes are involved and that the semantics are
* right.
*/
@@ -1010,44 +1018,49 @@ cdb_exchange_part_constraints(Relation table,
n = list_length(entry->cand_cons);
- if ( n == 0 )
+ if (n == 0)
{
- /* The partition constraint is missing from the candidate and
+ /*
+ * The partition constraint is missing from the candidate and
* must be added.
*/
missing_part_constraints = lappend(missing_part_constraints,
- (HeapTuple)linitial(entry->part_cons));
+ (HeapTuple) linitial(entry->part_cons));
}
- else if ( n == 1 )
+ else if (n == 1)
{
- /* One instance of the partition constraint exists on the
+ /*
+ * One instance of the partition constraint exists on the
* candidate, so let's not worry about name drift. All is
- * well. */
+ * well.
+ */
}
else
{
- /* Several instances of the partition constraint exist on
- * the candidate. If one has a matching name, prefer it.
- * Else, just chose the first (arbitrary).
+ /*
+ * Several instances of the partition constraint exist on the
+ * candidate. If one has a matching name, prefer it. Else,
+ * just choose the first (arbitrary).
*/
- List *missing = NIL;
- List *extra = NIL;
+ List *missing = NIL;
+ List *extra = NIL;
constraint_diffs(entry->part_cons, entry->cand_cons, false,
&missing, &extra);
- if ( list_length(missing) == 0 )
+ if (list_length(missing) == 0)
{
excess_constraints = list_concat(excess_constraints, extra);
}
- else /* missing */
+ else /* missing */
{
- ListCell *lc;
- bool skip = TRUE;
+ ListCell *lc;
+ bool skip = TRUE;
foreach(lc, entry->cand_cons)
{
- HeapTuple tuple = (HeapTuple)lfirst(lc);
- if ( skip )
+ HeapTuple tuple = (HeapTuple) lfirst(lc);
+
+ if (skip)
{
skip = FALSE;
}
@@ -1059,28 +1072,33 @@ cdb_exchange_part_constraints(Relation table,
}
}
}
- else if ( list_length(entry->cand_cons) > 0 ) /* and none on whole or part */
+ else if (list_length(entry->cand_cons) > 0) /* and none on whole or
+ * part */
{
- /* MAVERICK CONSTRAINT
+ /*
+ * MAVERICK CONSTRAINT
*
- * Constraints on only the candidate are extra and must be
- * dropped before the candidate can replace the part.
+ * Constraints on only the candidate are extra and must be dropped
+ * before the candidate can replace the part.
*/
excess_constraints = list_concat(excess_constraints,
entry->cand_cons);
}
- else /* Defensive: Can't happen that no constraints are set. */
+ else /* Defensive: Can't happen that no constraints
+ * are set. */
{
elog(ERROR, "constraint hash table inconsistent");
}
}
- if ( excess_constraints )
+ if (excess_constraints)
{
- /* Disallow excess constraints.
We could drop them automatically, but they
- * may carry semantic information about the candidate that is important to
- * the user, so make the user decide whether to drop them.
+ /*
+ * Disallow excess constraints. We could drop them automatically, but
+ * they may carry semantic information about the candidate that is
+ * important to the user, so make the user decide whether to drop
+ * them.
*/
ereport(ERROR,
(errcode(ERRCODE_INTEGRITY_CONSTRAINT_VIOLATION),
@@ -1090,59 +1108,62 @@ cdb_exchange_part_constraints(Relation table,
errhint("drop the invalid constraints and retry")));
}
- if ( missing_part_constraints )
+ if (missing_part_constraints)
{
- ListCell *lc;
+ ListCell *lc;
foreach(lc, missing_part_constraints)
{
- HeapTuple missing_part_constraint = (HeapTuple)lfirst(lc);
- /* We need a constraint like the missing one for the part, but translated
- * for the candidate.
+ HeapTuple missing_part_constraint = (HeapTuple) lfirst(lc);
+
+ /*
+ * We need a constraint like the missing one for the part, but
+ * translated for the candidate.
*/
- AttrMap *map;
+ AttrMap *map;
struct NewConstraint *nc;
- Form_pg_constraint mcon = (Form_pg_constraint)GETSTRUCT(missing_part_constraint);
+ Form_pg_constraint mcon = (Form_pg_constraint) GETSTRUCT(missing_part_constraint);
- if ( mcon->contype != CONSTRAINT_CHECK )
- elog(ERROR,"Invalid partition constration, not CHECK type");
+ if (mcon->contype != CONSTRAINT_CHECK)
+ elog(ERROR, "Invalid partition constraint, not CHECK type");
map_part_attrs(part, cand, &map, TRUE);
nc = constraint_apply_mapped(missing_part_constraint, map, cand,
validate, is_split, pgcon);
- if ( nc )
+ if (nc)
validation_list = lappend(validation_list, nc);
delta_checks++;
}
}
- if ( missing_constraints )
+ if (missing_constraints)
{
- /* We need constraints like the missing ones for the whole, but
+ /*
+ * We need constraints like the missing ones for the whole, but
* translated for the candidate.
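+ * (Translated = attribute numbers remapped through the AttrMap: a
+ * column might be attnum 2 on the whole table but attnum 4 on the
+ * candidate, say after dropped columns, so the CHECK expression
+ * must be rewritten before it is applied.)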
*/ - AttrMap *map; + AttrMap *map; struct NewConstraint *nc; - ListCell *lc; + ListCell *lc; map_part_attrs(table, cand, &map, TRUE); foreach(lc, missing_constraints) { - HeapTuple tuple = (HeapTuple)lfirst(lc); - Form_pg_constraint mcon = (Form_pg_constraint)GETSTRUCT(tuple); + HeapTuple tuple = (HeapTuple) lfirst(lc); + Form_pg_constraint mcon = (Form_pg_constraint) GETSTRUCT(tuple); nc = constraint_apply_mapped(tuple, map, cand, validate, is_split, pgcon); - if ( nc ) + if (nc) validation_list = lappend(validation_list, nc); - if ( mcon->contype == CONSTRAINT_CHECK ) + if (mcon->contype == CONSTRAINT_CHECK) delta_checks++; } } - if ( delta_checks ) + if (delta_checks) { SetRelationNumChecks(cand, cand->rd_rel->relchecks + delta_checks); } @@ -1169,9 +1190,9 @@ constraint_names(List *cons) initStringInfo(&str); p = ""; - foreach (lc, cons) + foreach(lc, cons) { - HeapTuple tuple = lfirst(lc); + HeapTuple tuple = lfirst(lc); Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); appendStringInfo(&str, "%s\"%s\"", p, NameStr(con->conname)); @@ -1206,53 +1227,56 @@ constraint_names(List *cons) static void constraint_diffs(List *cons_a, List *cons_b, bool match_names, List **missing, List **extra) { - ListCell *cell_a, *cell_b; - Index pos_a, pos_b; - int *match_a, *match_b; - int n; + ListCell *cell_a, + *cell_b; + Index pos_a, + pos_b; + int *match_a, + *match_b; + int n; - int len_a = list_length(cons_a); - int len_b = list_length(cons_b); + int len_a = list_length(cons_a); + int len_b = list_length(cons_b); Assert(missing != NULL); Assert(extra != NULL); - if ( len_a == 0 ) + if (len_a == 0) { *extra = list_copy(cons_b); *missing = NIL; return; } - if ( len_b == 0 ) + if (len_b == 0) { *extra = NIL; *missing = list_copy(cons_a); return; } - match_a = (int*)palloc(len_a * sizeof(int)); - for ( pos_a = 0; pos_a < len_a; pos_a++ ) + match_a = (int *) palloc(len_a * sizeof(int)); + for (pos_a = 0; pos_a < len_a; pos_a++) match_a[pos_a] = -1; - match_b = (int*)palloc(len_b * sizeof(int)); - for ( pos_b = 0; pos_b < len_b; pos_b++ ) + match_b = (int *) palloc(len_b * sizeof(int)); + for (pos_b = 0; pos_b < len_b; pos_b++) match_b[pos_b] = -1; pos_b = 0; - foreach (cell_b, cons_b) + foreach(cell_b, cons_b) { - HeapTuple tuple_b = (HeapTuple)lfirst(cell_b); + HeapTuple tuple_b = (HeapTuple) lfirst(cell_b); Form_pg_constraint b = (Form_pg_constraint) GETSTRUCT(tuple_b); pos_a = 0; - foreach (cell_a, cons_a) + foreach(cell_a, cons_a) { - HeapTuple tuple_a = lfirst(cell_a); + HeapTuple tuple_a = lfirst(cell_a); Form_pg_constraint a = (Form_pg_constraint) GETSTRUCT(tuple_a); - if ( strncmp(NameStr(a->conname), NameStr(b->conname), NAMEDATALEN) == 0 ) + if (strncmp(NameStr(a->conname), NameStr(b->conname), NAMEDATALEN) == 0) { /* No duplicate names on either list. 
*/ Assert(match_a[pos_a] == -1 && match_b[pos_b] == -1); @@ -1270,31 +1294,31 @@ constraint_diffs(List *cons_a, List *cons_b, bool match_names, List **missing, L *extra = NIL; n = len_a - len_b; - if ( n > 0 || match_names ) + if (n > 0 || match_names) { pos_a = 0; - foreach (cell_a, cons_a) + foreach(cell_a, cons_a) { - if ( match_a[pos_a] == -1 ) + if (match_a[pos_a] == -1) *missing = lappend(*missing, lfirst(cell_a)); pos_a++; n--; - if ( n <= 0 && !match_names) + if (n <= 0 && !match_names) break; } } n = len_b - len_a; - if ( n > 0 || match_names ) + if (n > 0 || match_names) { pos_b = 0; - foreach (cell_b, cons_b) + foreach(cell_b, cons_b) { - if ( match_b[pos_b] == -1 ) + if (match_b[pos_b] == -1) *extra = lappend(*extra, lfirst(cell_b)); pos_b++; n--; - if ( n <= 0 && !match_names ) + if (n <= 0 && !match_names) break; } } @@ -1310,15 +1334,21 @@ constraint_diffs(List *cons_a, List *cons_b, bool match_names, List **missing, L static char parttype_to_char(PartitionByType type) { - char c; + char c; switch (type) { - case PARTTYP_HASH: c = 'h'; break; - case PARTTYP_RANGE: c = 'r'; break; - case PARTTYP_LIST: c = 'l'; break; + case PARTTYP_HASH: + c = 'h'; + break; + case PARTTYP_RANGE: + c = 'r'; + break; + case PARTTYP_LIST: + c = 'l'; + break; default: - c = 0; /* quieten compiler */ + c = 0; /* quieten compiler */ elog(ERROR, "unknown partitioning type %i", type); } @@ -1334,15 +1364,15 @@ char_to_parttype(char c) { PartitionByType pt = PARTTYP_RANGE; /* just to shut GCC up */ - switch(c) + switch (c) { - case 'h': /* hash */ + case 'h': /* hash */ pt = PARTTYP_HASH; break; - case 'r': /* range */ + case 'r': /* range */ pt = PARTTYP_RANGE; break; - case 'l': /* list */ + case 'l': /* list */ pt = PARTTYP_LIST; break; default: @@ -1350,7 +1380,7 @@ char_to_parttype(char c) c); Assert(false); break; - } /* end switch */ + } /* end switch */ return pt; } @@ -1361,12 +1391,12 @@ char_to_parttype(char c) static void add_partition(Partition *part) { - Datum values[Natts_pg_partition]; - bool isnull[Natts_pg_partition]; - Relation partrel; - HeapTuple tup; - oidvector *opclass; - int2vector *attnums; + Datum values[Natts_pg_partition]; + bool isnull[Natts_pg_partition]; + Relation partrel; + HeapTuple tup; + oidvector *opclass; + int2vector *attnums; MemSet(isnull, 0, sizeof(bool) * Natts_pg_partition); @@ -1374,7 +1404,7 @@ add_partition(Partition *part) values[Anum_pg_partition_parkind - 1] = CharGetDatum(part->parkind); values[Anum_pg_partition_parlevel - 1] = Int16GetDatum(part->parlevel); values[Anum_pg_partition_paristemplate - 1] = - BoolGetDatum(part->paristemplate); + BoolGetDatum(part->paristemplate); values[Anum_pg_partition_parnatts - 1] = Int16GetDatum(part->parnatts); attnums = buildint2vector(part->paratts, part->parnatts); @@ -1401,32 +1431,32 @@ add_partition(Partition *part) static void add_partition_rule(PartitionRule *rule) { - Datum values[Natts_pg_partition_rule]; - bool isnull[Natts_pg_partition_rule]; - Relation rulerel; - HeapTuple tup; - NameData name; + Datum values[Natts_pg_partition_rule]; + bool isnull[Natts_pg_partition_rule]; + Relation rulerel; + HeapTuple tup; + NameData name; MemSet(isnull, 0, sizeof(bool) * Natts_pg_partition_rule); values[Anum_pg_partition_rule_paroid - 1] = ObjectIdGetDatum(rule->paroid); values[Anum_pg_partition_rule_parchildrelid - 1] = - ObjectIdGetDatum(rule->parchildrelid); + ObjectIdGetDatum(rule->parchildrelid); values[Anum_pg_partition_rule_parparentrule - 1] = - ObjectIdGetDatum(rule->parparentoid); + 
ObjectIdGetDatum(rule->parparentoid); name.data[0] = '\0'; namestrcpy(&name, rule->parname); values[Anum_pg_partition_rule_parname - 1] = NameGetDatum(&name); values[Anum_pg_partition_rule_parisdefault - 1] = - BoolGetDatum(rule->parisdefault); + BoolGetDatum(rule->parisdefault); values[Anum_pg_partition_rule_parruleord - 1] = - Int16GetDatum(rule->parruleord); + Int16GetDatum(rule->parruleord); values[Anum_pg_partition_rule_parrangestartincl - 1] = - BoolGetDatum(rule->parrangestartincl); + BoolGetDatum(rule->parrangestartincl); values[Anum_pg_partition_rule_parrangeendincl - 1] = - BoolGetDatum(rule->parrangeendincl); + BoolGetDatum(rule->parrangeendincl); values[Anum_pg_partition_rule_parrangestart - 1] = CStringGetTextDatum(nodeToString(rule->parrangestart)); @@ -1436,14 +1466,15 @@ add_partition_rule(PartitionRule *rule) CStringGetTextDatum(nodeToString(rule->parrangeevery)); values[Anum_pg_partition_rule_parlistvalues - 1] = CStringGetTextDatum(nodeToString(rule->parlistvalues)); + if (rule->parreloptions) values[Anum_pg_partition_rule_parreloptions - 1] = - transformRelOptions((Datum) 0, rule->parreloptions, true, false); + transformRelOptions((Datum) 0, rule->parreloptions, true, false); else isnull[Anum_pg_partition_rule_parreloptions - 1] = true; - values[Anum_pg_partition_rule_partemplatespace -1] = - ObjectIdGetDatum(rule->partemplatespaceId); + values[Anum_pg_partition_rule_partemplatespace - 1] = + ObjectIdGetDatum(rule->partemplatespaceId); rulerel = heap_open(PartitionRuleRelationId, RowExclusiveLock); @@ -1468,16 +1499,15 @@ get_part_oid(Oid rootrelid, int16 parlevel, bool istemplate) HeapTuple tuple; Oid paroid; - /* select oid - * from pg_partition - * where - * parrelid = :rootrelid and - * parlevel = :parlevel and - * paristemplate = :istemplate; + /* + * select oid from pg_partition where parrelid = :rootrelid and parlevel = + * :parlevel and paristemplate = :istemplate; */ - /* pg_partition and pg_partition_rule are populated only on the - * entry database, so our result is only meaningful there. */ + /* + * pg_partition and pg_partition_rule are populated only on the entry + * database, so our result is only meaningful there. 
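+ * (On a Greenplum cluster, Gp_segment == -1 identifies the entry
+ * database, i.e. the master; the Insist() below rejects callers
+ * running on segments.)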
+ */ Insist(Gp_segment == -1); partrel = heap_open(PartitionRelationId, AccessShareLock); @@ -1517,7 +1547,7 @@ del_part_template(Oid rootrelid, int16 parlevel, Oid parent) bool istemplate = true; Oid paroid = InvalidOid; ItemPointerData tid; - ScanKeyData scankey[3]; + ScanKeyData scankey[3]; Relation part_rel; Relation part_rule_rel; SysScanDesc sscan; @@ -1589,7 +1619,7 @@ del_part_template(Oid rootrelid, int16 parlevel, Oid parent) CommandCounterIncrement(); return 1; -} /* end del_part_template */ +} /* end del_part_template */ /* @@ -1601,17 +1631,18 @@ del_part_template(Oid rootrelid, int16 parlevel, Oid parent) */ void add_part_to_catalog(Oid relid, PartitionBy *pby, - bool bTemplate_Only /* = false */) + bool bTemplate_Only /* = false */ ) { - char pt = parttype_to_char(pby->partType); - ListCell *lc; + char pt = parttype_to_char(pby->partType); + ListCell *lc; PartitionSpec *spec; - Oid paroid = InvalidOid; - Oid rootrelid = InvalidOid; - Relation rel; - Oid parttemplid = InvalidOid; - bool add_temp = bTemplate_Only; /* normally false */ - spec = (PartitionSpec *)pby->partSpec; + Oid paroid = InvalidOid; + Oid rootrelid = InvalidOid; + Relation rel; + Oid parttemplid = InvalidOid; + bool add_temp = bTemplate_Only; /* normally false */ + + spec = (PartitionSpec *) pby->partSpec; /* only create partition catalog entries on the master */ if (Gp_role == GP_ROLE_EXECUTE) @@ -1622,22 +1653,22 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, */ rootrelid = RangeVarGetRelid(pby->parentRel, false); paroid = get_part_oid(rootrelid, pby->partDepth, - bTemplate_Only /* = false */); + bTemplate_Only /* = false */ ); /* create a partition for this level, if one doesn't exist */ if (!OidIsValid(paroid)) { AttrNumber *attnums; - Oid *parclass; - Partition *part = makeNode(Partition); - int i = 0; + Oid *parclass; + Partition *part = makeNode(Partition); + int i = 0; part->parrelid = rootrelid; part->parkind = pt; part->parlevel = pby->partDepth; if (pby->partSpec) - part->paristemplate = ((PartitionSpec *)pby->partSpec)->istemplate; + part->paristemplate = ((PartitionSpec *) pby->partSpec)->istemplate; else part->paristemplate = false; @@ -1647,7 +1678,7 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, foreach(lc, pby->keys) { - int colnum = lfirst_int(lc); + int colnum = lfirst_int(lc); attnums[i++] = colnum; } @@ -1659,7 +1690,7 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, i = 0; foreach(lc, pby->keyopclass) { - Oid opclass = lfirst_oid(lc); + Oid opclass = lfirst_oid(lc); parclass[i++] = opclass; } @@ -1693,23 +1724,23 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, paroid = part->partid; } else - /* oid of the template accompanying the real partition */ + /* oid of the template accompanying the real partition */ parttemplid = get_part_oid(rootrelid, pby->partDepth, true); /* create partition rule */ if (spec) { - Node *listvalues = NULL; - Node *rangestart = NULL; - Node *rangeend = NULL; - Node *rangeevery = NULL; - bool rangestartinc = false; - bool rangeendinc = false; - int2 parruleord = 0; + Node *listvalues = NULL; + Node *rangestart = NULL; + Node *rangeend = NULL; + Node *rangeevery = NULL; + bool rangestartinc = false; + bool rangeendinc = false; + int2 parruleord = 0; PartitionRule *rule = makeNode(PartitionRule); PartitionElem *el; - char *parname = NULL; - Oid parentoid = InvalidOid; + char *parname = NULL; + Oid parentoid = InvalidOid; Assert(list_length(spec->partElem) == 1); @@ -1722,56 +1753,57 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, switch 
(pby->partType) { - case PARTTYP_HASH: break; + case PARTTYP_HASH: + break; case PARTTYP_LIST: - { - PartitionValuesSpec *vspec = - (PartitionValuesSpec *)el->boundSpec; + { + PartitionValuesSpec *vspec = + (PartitionValuesSpec *) el->boundSpec; - /* might be NULL if this is a default spec */ - if (vspec) - listvalues = (Node *)vspec->partValues; - } + /* might be NULL if this is a default spec */ + if (vspec) + listvalues = (Node *) vspec->partValues; + } break; case PARTTYP_RANGE: - { - PartitionBoundSpec *bspec = - (PartitionBoundSpec *)el->boundSpec; - PartitionRangeItem *ri; - - /* remember, could be a default clause */ - if (bspec) { - Assert(IsA(bspec, PartitionBoundSpec)); - ri = (PartitionRangeItem *)bspec->partStart; - if (ri) + PartitionBoundSpec *bspec = + (PartitionBoundSpec *) el->boundSpec; + PartitionRangeItem *ri; + + /* remember, could be a default clause */ + if (bspec) { - Assert(ri->partedge == PART_EDGE_INCLUSIVE || - ri->partedge == PART_EDGE_EXCLUSIVE); + Assert(IsA(bspec, PartitionBoundSpec)); + ri = (PartitionRangeItem *) bspec->partStart; + if (ri) + { + Assert(ri->partedge == PART_EDGE_INCLUSIVE || + ri->partedge == PART_EDGE_EXCLUSIVE); - rangestartinc = ri->partedge == PART_EDGE_INCLUSIVE; - rangestart = (Node *)ri->partRangeVal; - } + rangestartinc = ri->partedge == PART_EDGE_INCLUSIVE; + rangestart = (Node *) ri->partRangeVal; + } - ri = (PartitionRangeItem *)bspec->partEnd; - if (ri) - { - Assert(ri->partedge == PART_EDGE_INCLUSIVE || - ri->partedge == PART_EDGE_EXCLUSIVE); + ri = (PartitionRangeItem *) bspec->partEnd; + if (ri) + { + Assert(ri->partedge == PART_EDGE_INCLUSIVE || + ri->partedge == PART_EDGE_EXCLUSIVE); - rangeendinc = ri->partedge == PART_EDGE_INCLUSIVE; - rangeend = (Node *)ri->partRangeVal; - } + rangeendinc = ri->partedge == PART_EDGE_INCLUSIVE; + rangeend = (Node *) ri->partRangeVal; + } - if (bspec->partEvery) - { - ri = (PartitionRangeItem *)bspec->partEvery; - rangeevery = (Node *)ri->partRangeVal; + if (bspec->partEvery) + { + ri = (PartitionRangeItem *) bspec->partEvery; + rangeevery = (Node *) ri->partRangeVal; + } + else + rangeevery = NULL; } - else - rangeevery = NULL; } - } break; default: elog(ERROR, "unknown partitioning type %i", pby->partType); @@ -1782,9 +1814,9 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, if (!bTemplate_Only && (pby->partDepth > 0)) { Oid inhoid; - ScanKeyData scankey; + ScanKeyData scankey; SysScanDesc sscan; - HeapTuple tuple; + HeapTuple tuple; rel = heap_open(InheritsRelationId, AccessShareLock); @@ -1827,7 +1859,7 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, /* we still might have to add template rules */ if (!add_temp && OidIsValid(parttemplid)) { - ScanKeyData scankey[3]; + ScanKeyData scankey[3]; SysScanDesc sscan; Relation partrulerel; HeapTuple tuple; @@ -1835,10 +1867,8 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, partrulerel = heap_open(PartitionRuleRelationId, AccessShareLock); /* - * SELECT parchildrelid FROM pg_partition_rule - * WHERE paroid = :1 - * AND parparentrule = :2 - * AND parruleord = :3 + * SELECT parchildrelid FROM pg_partition_rule WHERE paroid = :1 + * AND parparentrule = :2 AND parruleord = :3 */ ScanKeyInit(&scankey[0], Anum_pg_partition_rule_paroid, BTEqualStrategyNumber, F_OIDEQ, @@ -1876,8 +1906,8 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, rule->parrangeendincl = rangeendinc; rule->parrangeend = rangeend; rule->parrangeevery = rangeevery; - rule->parlistvalues = (List *)listvalues; - rule->partemplatespaceId = InvalidOid; /* only valid for 
template */ + rule->parlistvalues = (List *) listvalues; + rule->partemplatespaceId = InvalidOid; /* only valid for template */ if (!bTemplate_Only) add_partition_rule(rule); @@ -1890,16 +1920,16 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, if (el->storeAttr) { - if (((AlterPartitionCmd *)el->storeAttr)->arg1) + if (((AlterPartitionCmd *) el->storeAttr)->arg1) rule->parreloptions = - (List *)((AlterPartitionCmd *)el->storeAttr)->arg1; - if (((AlterPartitionCmd *)el->storeAttr)->arg2) + (List *) ((AlterPartitionCmd *) el->storeAttr)->arg1; + if (((AlterPartitionCmd *) el->storeAttr)->arg2) { Oid tablespaceId; tablespaceId = - get_settable_tablespace_oid( - strVal(((AlterPartitionCmd *)el->storeAttr)->arg2)); + get_settable_tablespace_oid( + strVal(((AlterPartitionCmd *) el->storeAttr)->arg2)); /* get_settable_tablespace_oid will error out for us */ Assert(OidIsValid(tablespaceId)); @@ -1914,7 +1944,7 @@ add_part_to_catalog(Oid relid, PartitionBy *pby, /* allow subsequent callers to see our work */ CommandCounterIncrement(); -} /* end add_part_to_catalog */ +} /* end add_part_to_catalog */ /* * parruleord_open_gap @@ -1935,7 +1965,7 @@ parruleord_open_gap(Oid partid, int2 level, Oid parent, int2 ruleord, { Relation rel; Relation irel; - HeapTuple tuple; + HeapTuple tuple; ScanKeyData scankey[3]; IndexScanDesc sd; @@ -1967,7 +1997,7 @@ parruleord_open_gap(Oid partid, int2 level, Oid parent, int2 ruleord, sd = index_beginscan(rel, irel, SnapshotNow, 3, scankey); while (HeapTupleIsValid(tuple = index_getnext(sd, BackwardScanDirection))) { - int old_ruleord; + int old_ruleord; Form_pg_partition_rule rule_desc; Insist(HeapTupleIsValid(tuple)); @@ -1975,7 +2005,7 @@ parruleord_open_gap(Oid partid, int2 level, Oid parent, int2 ruleord, tuple = heap_copytuple(tuple); rule_desc = - (Form_pg_partition_rule)GETSTRUCT(tuple); + (Form_pg_partition_rule) GETSTRUCT(tuple); old_ruleord = rule_desc->parruleord; closegap ? rule_desc->parruleord-- : rule_desc->parruleord++; @@ -1992,7 +2022,7 @@ parruleord_open_gap(Oid partid, int2 level, Oid parent, int2 ruleord, heap_close(irel, RowExclusiveLock); heap_close(rel, RowExclusiveLock); -} /* end parruleord_open_gap */ +} /* end parruleord_open_gap */ /* * Build up a PartitionRule based on a tuple from pg_partition_rule @@ -2002,10 +2032,11 @@ PartitionRule * ruleMakePartitionRule(HeapTuple tuple) { Form_pg_partition_rule rule_desc = - (Form_pg_partition_rule)GETSTRUCT(tuple); + (Form_pg_partition_rule)GETSTRUCT(tuple); char *rule_str; Datum rule_datum; bool isnull; + PartitionRule *rule; rule = makeNode(PartitionRule); @@ -2079,7 +2110,7 @@ ruleMakePartitionRule(HeapTuple tuple) Datum *options; int noptions; List *opts = NIL; - int i; + int i; /* XXX XXX: why not use untransformRelOptions ? 
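* (untransformRelOptions() in access/common/reloptions.c performs
* essentially this same text[]-to-DefElem-list conversion, so the
* hand-rolled loop below is likely redundant.)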
*/ @@ -2105,7 +2136,7 @@ ruleMakePartitionRule(HeapTuple tuple) v = makeString(s); } - opts = lappend(opts, makeDefElem(n, (Node *)v)); + opts = lappend(opts, makeDefElem(n, (Node *) v)); } rule->parreloptions = opts; @@ -2129,11 +2160,11 @@ ruleMakePartitionRule(HeapTuple tuple) Partition * partMakePartition(HeapTuple tuple) { - oidvector *oids; + oidvector *oids; int2vector *atts; - bool isnull; - Form_pg_partition partrow = (Form_pg_partition)GETSTRUCT(tuple); - Partition *p; + bool isnull; + Form_pg_partition partrow = (Form_pg_partition) GETSTRUCT(tuple); + Partition *p; p = makeNode(Partition); @@ -2145,12 +2176,12 @@ partMakePartition(HeapTuple tuple) p->parnatts = partrow->parnatts; atts = (int2vector *) DatumGetPointer(SysCacheGetAttr(PARTOID, tuple, - Anum_pg_partition_paratts, - &isnull)); + Anum_pg_partition_paratts, + &isnull)); Assert(!isnull); oids = (oidvector *) DatumGetPointer(SysCacheGetAttr(PARTOID, tuple, - Anum_pg_partition_parclass, - &isnull)); + Anum_pg_partition_parclass, + &isnull)); Assert(!isnull); p->paratts = palloc(sizeof(int2) * p->parnatts); @@ -2180,25 +2211,24 @@ get_parts(Oid relid, int2 level, Oid parent, bool inctemplate, bool includesubparts) { PartitionNode *pnode = NULL; - HeapTuple tuple; - Relation rel; - List *rules = NIL; + HeapTuple tuple; + Relation rel; + List *rules = NIL; ScanKeyData scankey[3]; SysScanDesc sscan; - /* Though pg_partition and pg_partition_rule are populated only - * on the entry database, we accept calls from QEs running on a - * segment database, but always return NULL; so our result is - * only meaningful on the entry database. */ + /* + * Though pg_partition and pg_partition_rule are populated only on the + * entry database, we accept calls from QEs running on a segment database, + * but always return NULL; so our result is only meaningful on the entry + * database. + */ if (Gp_segment != -1) return pnode; - /* select oid as partid, * - * from pg_partition - * where - * parrelid = :relid and - * parlevel = :level and - * paristemplate = :inctemplate; + /* + * select oid as partid, * from pg_partition where parrelid = :relid and + * parlevel = :level and paristemplate = :inctemplate; */ rel = heap_open(PartitionRelationId, AccessShareLock); @@ -2226,14 +2256,12 @@ get_parts(Oid relid, int2 level, Oid parent, bool inctemplate, systable_endscan(sscan); heap_close(rel, AccessShareLock); - if ( ! pnode ) + if (!pnode) return pnode; - /* select * - * from pg_partition_rule - * where - * paroid = :pnode->part->partid and -- pg_partition.oid - * parparentrule = :parent; + /* + * select * from pg_partition_rule where paroid = :pnode->part->partid and + * -- pg_partition.oid parparentrule = :parent; */ rel = heap_open(PartitionRuleRelationId, AccessShareLock); @@ -2268,7 +2296,7 @@ get_parts(Oid relid, int2 level, Oid parent, bool inctemplate, if (includesubparts) { rule->children = get_parts(relid, level + 1, rule->parruleid, - inctemplate, true /*includesubparts*/); + inctemplate, true /* includesubparts */ ); } if (rule->parisdefault) @@ -2278,12 +2306,14 @@ get_parts(Oid relid, int2 level, Oid parent, bool inctemplate, rules = lappend(rules, rule); } } - /* NOTE: this assert is valid, except for the case of splitting - * the very last partition of a table. For that case, we must - * drop the last partition before re-adding the new pieces, which - * violates this invariant + + /* + * NOTE: this assert is valid, except for the case of splitting the very + * last partition of a table. 
For that case, we must drop the last + * partition before re-adding the new pieces, which violates this + * invariant */ - /* Assert(inctemplate || list_length(rules) || pnode->default_part); */ + /* Assert(inctemplate || list_length(rules) || pnode->default_part); */ pnode->rules = rules; systable_endscan(sscan); @@ -2302,7 +2332,7 @@ RelationBuildPartitionDescByOid(Oid relid, bool inctemplate) { PartitionNode *n; - n = get_parts(relid, 0, 0, inctemplate, true /*includesubparts*/); + n = get_parts(relid, 0, 0, inctemplate, true /* includesubparts */ ); return n; } @@ -2320,24 +2350,22 @@ RelationBuildPartitionDescByOid(Oid relid, bool inctemplate) Bitmapset * get_partition_key_bitmapset(Oid relid) { - Relation rel; - HeapTuple tuple; - TupleDesc tupledesc; + Relation rel; + HeapTuple tuple; + TupleDesc tupledesc; ScanKeyData scankey; SysScanDesc sscan; - Bitmapset *partition_key = NULL; + Bitmapset *partition_key = NULL; - /* Reject calls from QEs running on a segment database, since - * pg_partition and pg_partition_rule are populated only - * on the entry database. + /* + * Reject calls from QEs running on a segment database, since pg_partition + * and pg_partition_rule are populated only on the entry database. */ Insist(Gp_segment == -1); - /* select paratts - * from pg_partition - * where - * parrelid = :relid and - * not paristemplate; + /* + * select paratts from pg_partition where parrelid = :relid and not + * paristemplate; */ rel = heap_open(PartitionRelationId, AccessShareLock); tupledesc = RelationGetDescr(rel); @@ -2351,22 +2379,22 @@ get_partition_key_bitmapset(Oid relid) while (HeapTupleIsValid(tuple = systable_getnext(sscan))) { - int i; - int2 natts; + int i; + int2 natts; int2vector *atts; - bool isnull; - Form_pg_partition partrow = (Form_pg_partition)GETSTRUCT(tuple); + bool isnull; + Form_pg_partition partrow = (Form_pg_partition) GETSTRUCT(tuple); if (partrow->paristemplate) - continue; /* no interest in template parts */ + continue; /* no interest in template parts */ natts = partrow->parnatts; atts = (int2vector *) DatumGetPointer( - heap_getattr(tuple, Anum_pg_partition_paratts, - tupledesc, &isnull)); + heap_getattr(tuple, Anum_pg_partition_paratts, + tupledesc, &isnull)); Insist(!isnull); - for ( i = 0; i < natts; i++ ) + for (i = 0; i < natts; i++) partition_key = bms_add_member(partition_key, atts->values[i]); } @@ -2384,8 +2412,8 @@ get_partition_key_bitmapset(Oid relid) List * get_partition_attrs(PartitionNode *pn) { - List *attrs = NIL; - int i; + List *attrs = NIL; + int i; if (!pn) return NIL; @@ -2412,17 +2440,17 @@ partition_get_policies_attrs(PartitionNode *pn, GpPolicy *master_policy, return; else { - ListCell *lc; + ListCell *lc; /* * We use master_policy as a fast path. The assumption is that most - * child partitions look like the master so we don't want to enter - * the O(N^2) loop below if we can avoid it. Firstly, though, we must - * copy the master policy into the list. + * child partitions look like the master so we don't want to enter the + * O(N^2) loop below if we can avoid it. Firstly, though, we must copy + * the master policy into the list. 
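+ * (The fast path: a child whose stored policy is byte-identical to
+ * the master's can contribute no new columns, so it skips the
+ * per-attribute merge loop entirely.)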
*/ if (*cols == NIL && master_policy->nattrs) { - int attno; + int attno; for (attno = 0; attno < master_policy->nattrs; attno++) *cols = lappend_int(*cols, master_policy->attrs[attno]); @@ -2431,13 +2459,13 @@ partition_get_policies_attrs(PartitionNode *pn, GpPolicy *master_policy, foreach(lc, pn->rules) { PartitionRule *rule = lfirst(lc); - Relation rel = heap_open(rule->parchildrelid, NoLock); + Relation rel = heap_open(rule->parchildrelid, NoLock); if (master_policy->nattrs != rel->rd_cdbpolicy->nattrs || memcmp(master_policy->attrs, rel->rd_cdbpolicy->attrs, (master_policy->nattrs * sizeof(AttrNumber)))) { - int attno; + int attno; for (attno = 0; attno < rel->rd_cdbpolicy->nattrs; attno++) { @@ -2463,12 +2491,12 @@ partition_policies_equal(GpPolicy *p, PartitionNode *pn) if (pn->rules) { - ListCell *lc; + ListCell *lc; foreach(lc, pn->rules) { PartitionRule *rule = lfirst(lc); - Relation rel = heap_open(rule->parchildrelid, NoLock); + Relation rel = heap_open(rule->parchildrelid, NoLock); if (p->nattrs != rel->rd_cdbpolicy->nattrs) { @@ -2478,7 +2506,7 @@ partition_policies_equal(GpPolicy *p, PartitionNode *pn) else { if (p->attrs == 0) - /* random policy, skip */ + /* random policy, skip */ ; if (memcmp(p->attrs, rel->rd_cdbpolicy->attrs, (sizeof(AttrNumber) * p->nattrs))) @@ -2501,16 +2529,16 @@ partition_policies_equal(GpPolicy *p, PartitionNode *pn) AttrNumber max_partition_attr(PartitionNode *pn) { - AttrNumber n = 0; - List *l = get_partition_attrs(pn); + AttrNumber n = 0; + List *l = get_partition_attrs(pn); if (l) { - ListCell *lc; + ListCell *lc; foreach(lc, l) { - AttrNumber att = lfirst_int(lc); + AttrNumber att = lfirst_int(lc); n = Max(att, n); } @@ -2523,7 +2551,7 @@ int num_partition_levels(PartitionNode *pn) { PartitionNode *tmp; - int level = 0; + int level = 0; tmp = pn; @@ -2534,11 +2562,13 @@ num_partition_levels(PartitionNode *pn) if (tmp->rules) { PartitionRule *rule = linitial(tmp->rules); + tmp = rule->children; } else if (tmp->default_part) { PartitionRule *rule = tmp->default_part; + tmp = rule->children; } else @@ -2558,8 +2588,8 @@ all_partition_relids(PartitionNode *pn) return NIL; else { - ListCell *lc; - List *out = NIL; + ListCell *lc; + List *out = NIL; foreach(lc, pn->rules) { @@ -2589,22 +2619,22 @@ all_partition_relids(PartitionNode *pn) static Node * getPartConstraintsContainsKeys(Oid partOid, Oid rootOid, List *partKey) { - ScanKeyData scankey; + ScanKeyData scankey; SysScanDesc sscan; - Relation conRel; - HeapTuple conTup; - Node *conExpr; - Node *result = NULL; - Datum conBinDatum; - Datum conKeyDatum; - char *conBin; - bool conbinIsNull = false; - bool conKeyIsNull = false; - AttrMap *map; + Relation conRel; + HeapTuple conTup; + Node *conExpr; + Node *result = NULL; + Datum conBinDatum; + Datum conKeyDatum; + char *conBin; + bool conbinIsNull = false; + bool conKeyIsNull = false; + AttrMap *map; /* create the map needed for mapping attnums */ - Relation rootRel = heap_open(rootOid, AccessShareLock); - Relation partRel = heap_open(partOid, AccessShareLock); + Relation rootRel = heap_open(rootOid, AccessShareLock); + Relation partRel = heap_open(partOid, AccessShareLock); map_part_attrs(partRel, rootRel, &map, false); @@ -2622,35 +2652,40 @@ getPartConstraintsContainsKeys(Oid partOid, Oid rootOid, List *partKey) while (HeapTupleIsValid(conTup = systable_getnext(sscan))) { - /* we defer the filter on contype to here in order to take advantage of - * the index on conrelid in the scan */ + /* + * we defer the filter on contype to here in order to 
take advantage + * of the index on conrelid in the scan + */ Form_pg_constraint conEntry = (Form_pg_constraint) GETSTRUCT(conTup); + if (conEntry->contype != 'c') { continue; } /* Fetch the constraint expression in parsetree form */ conBinDatum = heap_getattr(conTup, Anum_pg_constraint_conbin, - RelationGetDescr(conRel), &conbinIsNull); + RelationGetDescr(conRel), &conbinIsNull); - Assert (!conbinIsNull); + Assert(!conbinIsNull); /* map the attnums in constraint expression to root attnums */ conBin = TextDatumGetCString(conBinDatum); conExpr = stringToNode(conBin); conExpr = attrMapExpr(map, conExpr); - // fetch the key associated with this constraint + /* fetch the key associated with this constraint */ conKeyDatum = heap_getattr(conTup, Anum_pg_constraint_conkey, - RelationGetDescr(conRel), &conKeyIsNull); - Datum *dats = NULL; - int numKeys = 0; + RelationGetDescr(conRel), &conKeyIsNull); + Datum *dats = NULL; + int numKeys = 0; + + bool found = false; - bool found = false; - // extract key elements + /* extract key elements */ deconstruct_array(DatumGetArrayTypeP(conKeyDatum), INT2OID, 2, true, 's', &dats, NULL, &numKeys); for (int i = 0; i < numKeys; i++) { - int16 key_elem = DatumGetInt16(dats[i]); + int16 key_elem = DatumGetInt16(dats[i]); + if (list_member_int(partKey, key_elem)) { found = true; @@ -2661,7 +2696,7 @@ getPartConstraintsContainsKeys(Oid partOid, Oid rootOid, List *partKey) if (found) { if (result) - result = (Node *)make_andclause(list_make2(result, conExpr)); + result = (Node *) make_andclause(list_make2(result, conExpr)); else result = conExpr; } @@ -2684,13 +2719,14 @@ getPartConstraintsContainsKeys(Oid partOid, Oid rootOid, List *partKey) * Outout: * a pointer to the created hash table */ -static HTAB* +static HTAB * createConstraintHashTable(unsigned int nEntries) { - HASHCTL hash_ctl; + HASHCTL hash_ctl; + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(Node**); + hash_ctl.keysize = sizeof(Node **); hash_ctl.entrysize = sizeof(ConNodeEntry); hash_ctl.hash = constrNodeHash; hash_ctl.match = constrNodeMatch; @@ -2709,12 +2745,14 @@ createConstraintHashTable(unsigned int nEntries) static uint32 constrNodeHash(const void *keyPtr, Size keysize) { - uint32 result = 0; - Node *constr = *((Node **) keyPtr); - int con_len = 0; + uint32 result = 0; + Node *constr = *((Node **) keyPtr); + int con_len = 0; + if (constr) { - char* constr_bin = nodeToBinaryStringFast(constr, &con_len); + char *constr_bin = nodeToBinaryStringFast(constr, &con_len); + Assert(con_len > 0); result = tag_hash(constr_bin, con_len); pfree(constr_bin); @@ -2734,8 +2772,9 @@ constrNodeHash(const void *keyPtr, Size keysize) static int constrNodeMatch(const void *keyPtr1, const void *keyPtr2, Size keysize) { - Node *left = *((Node **) keyPtr1); - Node *right = *((Node **) keyPtr2); + Node *left = *((Node **) keyPtr1); + Node *right = *((Node **) keyPtr2); + return equal(left, right) ? 
0 : 1; } @@ -2753,49 +2792,57 @@ rel_partitioning_is_uniform(Oid rootOid) Assert(OidIsValid(rootOid)); Assert(rel_is_partitioned(rootOid)); - bool result = true; + bool result = true; MemoryContext uniformityMemoryContext = AllocSetContextCreate(CurrentMemoryContext, - "PartitioningIsUniform", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); + "PartitioningIsUniform", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); MemoryContext callerMemoryContext = MemoryContextSwitchTo(uniformityMemoryContext); - PartitionNode *pnRoot = RelationBuildPartitionDescByOid(rootOid, false /*inctemplate*/); - List *queue = list_make1(pnRoot); + PartitionNode *pnRoot = RelationBuildPartitionDescByOid(rootOid, false /* inctemplate */ ); + List *queue = list_make1(pnRoot); while (result) { - /* we process the partitioning tree level by level, each outer loop corresponds to one level */ - int size = list_length(queue); + /* + * we process the partitioning tree level by level, each outer loop + * corresponds to one level + */ + int size = list_length(queue); + if (0 == size) { break; } - /* Look ahead to get the number of children of the first partition node in this level. - * This allows us to initialize a hash table on the constraints which each partition node - * in this level will be compared to. + /* + * Look ahead to get the number of children of the first partition + * node in this level. This allows us to initialize a hash table on + * the constraints which each partition node in this level will be + * compared to. */ - PartitionNode *pn_ahead = (PartitionNode*) linitial(queue); - int nChildren = list_length(pn_ahead->rules) + (pn_ahead->default_part ? 1 : 0); - HTAB* conHash = createConstraintHashTable(nChildren); + PartitionNode *pn_ahead = (PartitionNode *) linitial(queue); + int nChildren = list_length(pn_ahead->rules) + (pn_ahead->default_part ? 
1 : 0); + HTAB *conHash = createConstraintHashTable(nChildren); /* get the list of part keys for this level */ - List *lpartkey = NIL; + List *lpartkey = NIL; + for (int i = 0; i < pn_ahead->part->parnatts; i++) { lpartkey = lappend_int(lpartkey, pn_ahead->part->paratts[i]); } /* now iterate over all partition nodes on this level */ - bool fFirstNode = true; + bool fFirstNode = true; + while (size > 0 && result) { - PartitionNode *pn = (PartitionNode*) linitial(queue); - List *lrules = get_partition_rules(pn); - int curr_nChildren = list_length(lrules); + PartitionNode *pn = (PartitionNode *) linitial(queue); + List *lrules = get_partition_rules(pn); + int curr_nChildren = list_length(lrules); if (curr_nChildren != nChildren) { @@ -2804,27 +2851,34 @@ rel_partitioning_is_uniform(Oid rootOid) } /* loop over the children's constraints of this node */ - ListCell *lc = NULL; + ListCell *lc = NULL; + foreach(lc, lrules) { - PartitionRule *pr = (PartitionRule*) lfirst(lc); - Node *curr_con = getPartConstraintsContainsKeys(pr->parchildrelid, rootOid, lpartkey); - bool found = false; + PartitionRule *pr = (PartitionRule *) lfirst(lc); + Node *curr_con = getPartConstraintsContainsKeys(pr->parchildrelid, rootOid, lpartkey); + bool found = false; - /* we populate the hash table with the constraints of the children of the - * first node in this level */ + /* + * we populate the hash table with the constraints of the + * children of the first node in this level + */ if (fFirstNode) { /* add current constraint to hash table */ - void *con_entry = hash_search(conHash, &curr_con, HASH_ENTER, &found); + void *con_entry = hash_search(conHash, &curr_con, HASH_ENTER, &found); + if (con_entry == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } - ((ConNodeEntry*) con_entry)->entry = curr_con; + ((ConNodeEntry *) con_entry)->entry = curr_con; } - /* starting from the second node in this level, we probe the children's constraints */ + /* + * starting from the second node in this level, we probe the + * children's constraints + */ else { hash_search(conHash, &curr_con, HASH_FIND, &found); @@ -2871,12 +2925,13 @@ all_leaf_partition_relids(PartitionNode *pn) return NIL; } - ListCell *lc; - List *leaf_relids = NIL; + ListCell *lc; + List *leaf_relids = NIL; foreach(lc, pn->rules) { PartitionRule *rule = lfirst(lc); + if (NULL != rule->children) { leaf_relids = list_concat(leaf_relids, all_leaf_partition_relids(rule->children)); @@ -2892,7 +2947,7 @@ all_leaf_partition_relids(PartitionNode *pn) if (NULL != pn->default_part->children) { leaf_relids = list_concat(leaf_relids, - all_leaf_partition_relids(pn->default_part->children)); + all_leaf_partition_relids(pn->default_part->children)); } else { @@ -2914,7 +2969,7 @@ all_leaf_partition_relids(PartitionNode *pn) static List * rel_get_leaf_relids_from_rule(Oid ruleOid) { - ScanKeyData scankey; + ScanKeyData scankey; Relation part_rule_rel; SysScanDesc sscan; bool hasChildren = false; @@ -2937,8 +2992,8 @@ rel_get_leaf_relids_from_rule(Oid ruleOid) SnapshotNow, 1, &scankey); /* - * If we are still in mid-level, recursively call this function on children rules of - * the given rule. + * If we are still in mid-level, recursively call this function on + * children rules of the given rule. 
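+ * (The recursion bottoms out at rules with no children of their
+ * own; those rules' parchildrelid values are exactly the leaf
+ * relids collected into the result.)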
*/ while ((tuple = systable_getnext(sscan)) != NULL) { @@ -2946,12 +3001,13 @@ rel_get_leaf_relids_from_rule(Oid ruleOid) lChildrenOid = list_concat(lChildrenOid, rel_get_leaf_relids_from_rule(HeapTupleGetOid(tuple))); } - /* if ruleOid is not parent of any rule, we have reached the leaf level and - * we need to append parchildrelid of this entry to the output + /* + * if ruleOid is not parent of any rule, we have reached the leaf level + * and we need to append parchildrelid of this entry to the output */ if (!hasChildren) { - HeapTuple tuple; + HeapTuple tuple; Form_pg_partition_rule rule_desc; tuple = SearchSysCache1(PARTRULEOID, ObjectIdGetDatum(ruleOid)); @@ -2977,16 +3033,17 @@ rel_get_leaf_relids_from_rule(Oid ruleOid) List * rel_get_leaf_children_relids(Oid relid) { - PartStatus ps = rel_part_status(relid); - List *leaf_relids = NIL; + PartStatus ps = rel_part_status(relid); + List *leaf_relids = NIL; + Assert(PART_STATUS_INTERIOR == ps || PART_STATUS_ROOT == ps); if (PART_STATUS_ROOT == ps) { PartitionNode *pn; - pn = get_parts(relid, 0 /*level*/, 0 /*parent*/, false /*inctemplate*/, - true /*includesubparts*/); + pn = get_parts(relid, 0 /* level */ , 0 /* parent */ , false /* inctemplate */ , + true /* includesubparts */ ); leaf_relids = all_leaf_partition_relids(pn); pfree(pn); } @@ -2995,7 +3052,7 @@ rel_get_leaf_children_relids(Oid relid) Relation partrulerel; ScanKeyData scankey; SysScanDesc sscan; - HeapTuple tuple; + HeapTuple tuple; /* SELECT * FROM pg_partition_rule WHERE parchildrelid = :1 */ partrulerel = heap_open(PartitionRuleRelationId, AccessShareLock); @@ -3035,12 +3092,13 @@ all_interior_partition_relids(PartitionNode *pn) return NIL; } - ListCell *lc; - List *interior_relids = NIL; + ListCell *lc; + List *interior_relids = NIL; foreach(lc, pn->rules) { PartitionRule *rule = lfirst(lc); + if (rule->children) { interior_relids = lappend_oid(interior_relids, rule->parchildrelid); @@ -3054,7 +3112,7 @@ all_interior_partition_relids(PartitionNode *pn) { interior_relids = lappend_oid(interior_relids, pn->default_part->parchildrelid); interior_relids = list_concat(interior_relids, - all_interior_partition_relids(pn->default_part->children)); + all_interior_partition_relids(pn->default_part->children)); } } @@ -3067,14 +3125,16 @@ all_interior_partition_relids(PartitionNode *pn) int countLeafPartTables(Oid rootOid) { - Assert (rel_is_partitioned(rootOid)); + Assert(rel_is_partitioned(rootOid)); + + PartitionNode *pn = get_parts(rootOid, 0 /* level */ , 0 /* parent */ , false /* inctemplate */ , + true /* include subparts */ ); - PartitionNode *pn = get_parts(rootOid, 0 /* level */, 0 /* parent */, false /* inctemplate */, - true /* include subparts */); + List *lRelOids = all_leaf_partition_relids(pn); + + Assert(list_length(lRelOids) > 0); + int count = list_length(lRelOids); - List *lRelOids = all_leaf_partition_relids(pn); - Assert (list_length(lRelOids) > 0); - int count = list_length(lRelOids); list_free(lRelOids); pfree(pn); return count; @@ -3090,22 +3150,23 @@ countLeafPartTables(Oid rootOid) List * all_prule_relids(PartitionRule *prule) { - ListCell *lcr; + ListCell *lcr; PartitionNode *pnode = NULL; - List *oids = NIL; /* of pg_class Oid */ + List *oids = NIL; /* of pg_class Oid */ - if ( prule ) + if (prule) { oids = lappend_oid(oids, prule->parchildrelid); pnode = prule->children; - if ( pnode ) + if (pnode) { oids = list_concat(oids, all_prule_relids(pnode->default_part)); - foreach (lcr, pnode->rules) + foreach(lcr, pnode->rules) { - PartitionRule *child = 
(PartitionRule*)lfirst(lcr); + PartitionRule *child = (PartitionRule *) lfirst(lcr); + oids = list_concat(oids, all_prule_relids(child)); } } @@ -3165,8 +3226,10 @@ rel_partition_get_master(Oid relid) Oid paroid; Oid masteroid; - /* pg_partition and pg_partition_rule are populated only on the - * entry database, so our result is only meaningful there. */ + /* + * pg_partition and pg_partition_rule are populated only on the entry + * database, so our result is only meaningful there. + */ Insist(Gp_segment == -1); partrulerel = heap_open(PartitionRuleRelationId, AccessShareLock); @@ -3200,7 +3263,7 @@ rel_partition_get_master(Oid relid) return masteroid; -} /* end rel_partition_get_master */ +} /* end rel_partition_get_master */ /* given a relid, build a path list from the master tablename down to * the partition for that relation, using partition names if possible, @@ -3217,8 +3280,10 @@ rel_get_part_path1(Oid relid) Oid parparentrule = InvalidOid; List *lrelid = NIL; - /* pg_partition and pg_partition_rule are populated only on the - * entry database, so our result is only meaningful there. */ + /* + * pg_partition and pg_partition_rule are populated only on the entry + * database, so our result is only meaningful there. + */ Insist(Gp_segment == -1); partrulerel = heap_open(PartitionRuleRelationId, AccessShareLock); @@ -3238,7 +3303,7 @@ rel_get_part_path1(Oid relid) if (HeapTupleIsValid(tuple)) { Form_pg_partition_rule rule_desc = - (Form_pg_partition_rule) GETSTRUCT(tuple); + (Form_pg_partition_rule) GETSTRUCT(tuple); paroid = rule_desc->paroid; parparentrule = rule_desc->parparentrule; @@ -3261,7 +3326,7 @@ rel_get_part_path1(Oid relid) if (HeapTupleIsValid(tuple)) { Form_pg_partition_rule rule_desc = - (Form_pg_partition_rule) GETSTRUCT(tuple); + (Form_pg_partition_rule) GETSTRUCT(tuple); paroid = rule_desc->paroid; parparentrule = rule_desc->parparentrule; @@ -3277,17 +3342,19 @@ rel_get_part_path1(Oid relid) return lrelid; -} /* end rel_get_part_path1 */ +} /* end rel_get_part_path1 */ -static List *rel_get_part_path(Oid relid) +static List * +rel_get_part_path(Oid relid) { - PartitionNode *pNode = NULL; - Partition *part = NULL; - List *lrelid = NIL; - List *lnamerank = NIL; - List *lnrv = NIL; - ListCell *lc, *lc2; - Oid masteroid = InvalidOid; + PartitionNode *pNode = NULL; + Partition *part = NULL; + List *lrelid = NIL; + List *lnamerank = NIL; + List *lnrv = NIL; + ListCell *lc, + *lc2; + Oid masteroid = InvalidOid; masteroid = rel_partition_get_master(relid); @@ -3295,7 +3362,7 @@ static List *rel_get_part_path(Oid relid) return NIL; /* call the guts of RelationBuildPartitionDesc */ - pNode = get_parts(masteroid, 0, 0, false, true /*includesubparts*/); + pNode = get_parts(masteroid, 0, 0, false, true /* includesubparts */ ); if (!pNode) { @@ -3304,19 +3371,21 @@ static List *rel_get_part_path(Oid relid) part = pNode->part; - /* get the relids for each table that corresponds to the partition + /* + * get the relids for each table that corresponds to the partition * heirarchy from the master to the specified partition */ lrelid = rel_get_part_path1(relid); - /* walk the partition tree, finding the partition for each relid, - * and extract useful information (name, rank, value) + /* + * walk the partition tree, finding the partition for each relid, and + * extract useful information (name, rank, value) */ foreach(lc, lrelid) { - Oid parrelid = lfirst_oid(lc); + Oid parrelid = lfirst_oid(lc); PartitionRule *prule; - int rulerank = 1; + int rulerank = 1; Assert(pNode); @@ -3324,9 
+3393,9 @@ static List *rel_get_part_path(Oid relid) rulerank = 1; - foreach (lc2, pNode->rules) + foreach(lc2, pNode->rules) { - prule = (PartitionRule *)lfirst(lc2); + prule = (PartitionRule *) lfirst(lc2); if (parrelid == prule->parchildrelid) { @@ -3342,69 +3411,68 @@ static List *rel_get_part_path(Oid relid) Assert(parrelid == prule->parchildrelid); /* default partition must have a name (and no rank) */ - Assert (prule->parname && strlen(prule->parname)); + Assert(prule->parname && strlen(prule->parname)); pNode = prule->children; rulerank = 0; - L_rel_get_part_path_match: +L_rel_get_part_path_match: - if (!rulerank) /* must be default, so it has a name, but no - * rank or value */ + if (!rulerank) /* must be default, so it has a name, but no + * rank or value */ { lnrv = list_make3(prule->parname, NULL, NULL); } - else if (part->parkind == 'l') /* list partition by value */ + else if (part->parkind == 'l') /* list partition by value */ { - char *idval = NULL; - ListCell *lc3; - List *l1 = (List *)prule->parlistvalues; - StringInfoData sid1; - int2 nkeys = part->parnatts; - int2 parcol = 0; + char *idval = NULL; + ListCell *lc3; + List *l1 = (List *) prule->parlistvalues; + StringInfoData sid1; + int2 nkeys = part->parnatts; + int2 parcol = 0; initStringInfo(&sid1); - /* foreach(lc3, l1) */ + /* foreach(lc3, l1) */ /* don't loop -- just need first set of values */ lc3 = list_head(l1); if (lc3) { - List *vals = lfirst(lc3); - ListCell *lcv = list_head(vals); + List *vals = lfirst(lc3); + ListCell *lcv = list_head(vals); - /* Note: similar code in - * ruleutils.c:partition_rule_def_worker + /* + * Note: similar code in ruleutils.c:partition_rule_def_worker */ for (parcol = 0; parcol < nkeys; parcol++) { - Const *con = lfirst(lcv); + Const *con = lfirst(lcv); if (lcv != list_head(vals)) appendStringInfoString(&sid1, ", "); idval = - deparse_expression((Node*)con, - deparse_context_for(get_rel_name(relid), - relid), - false, false); + deparse_expression((Node *) con, + deparse_context_for(get_rel_name(relid), + relid), + false, false); appendStringInfo(&sid1, "%s", idval); lcv = lnext(lcv); - } /* end for parcol */ + } /* end for parcol */ } /* list - no rank */ lnrv = list_make3(prule->parname, NULL, sid1.data); } - else /* range (or hash) - use rank (though rank is not really - * appropriate for hash) - */ + else /* range (or hash) - use rank (though rank is + * not really appropriate for hash) */ { - char *rtxt = palloc(NAMEDATALEN); + char *rtxt = palloc(NAMEDATALEN); sprintf(rtxt, "%d", rulerank); @@ -3415,21 +3483,23 @@ static List *rel_get_part_path(Oid relid) /* build the list of (lists of name, rank, value) for each level */ lnamerank = lappend(lnamerank, lnrv); - } /* end foreach lc (walking list of relids) */ + } /* end foreach lc (walking list of relids) */ return lnamerank; -} /* end rel_get_part_path */ +} /* end rel_get_part_path */ char * rel_get_part_path_pretty(Oid relid, - char *separator, - char *lastsep) + char *separator, + char *lastsep) { - List *lnamerank = NIL; - List *lnrv = NIL; - ListCell *lc, *lc2; - int maxlen; - StringInfoData sid1, sid2; + List *lnamerank = NIL; + List *lnrv = NIL; + ListCell *lc, + *lc2; + int maxlen; + StringInfoData sid1, + sid2; lnamerank = rel_get_part_path(relid); @@ -3446,9 +3516,9 @@ rel_get_part_path_pretty(Oid relid, foreach(lc, lnamerank) { - int lcnt = 0; + int lcnt = 0; - lnrv = (List *)lfirst(lc); + lnrv = (List *) lfirst(lc); maxlen--; @@ -3456,13 +3526,13 @@ rel_get_part_path_pretty(Oid relid, lcnt = 0; - foreach (lc2, 
lnrv) + foreach(lc2, lnrv) { - char *str = (char *)lfirst(lc2); + char *str = (char *) lfirst(lc2); truncateStringInfo(&sid2, 0); - switch(lcnt) + switch (lcnt) { case 0: if (str && strlen(str)) @@ -3493,13 +3563,13 @@ rel_get_part_path_pretty(Oid relid, lcnt++; } - l_pretty: +l_pretty: appendStringInfo(&sid1, "%s", sid2.data); } return sid1.data; -} /* end rel_get_part_path_pretty */ +} /* end rel_get_part_path_pretty */ /* @@ -3516,16 +3586,16 @@ char * ChoosePartitionName(const char *tablename, int partDepth, const char *partname, Oid namespaceId) { - char *relname; - char depthstr[NAMEDATALEN]; - char prtstr[NAMEDATALEN]; + char *relname; + char depthstr[NAMEDATALEN]; + char prtstr[NAMEDATALEN]; /* build a relation name (see transformPartitionBy */ - snprintf(depthstr, sizeof(depthstr), "%d", partDepth+1); + snprintf(depthstr, sizeof(depthstr), "%d", partDepth + 1); snprintf(prtstr, sizeof(prtstr), "prt_%s", partname); relname = ChooseRelationName(tablename, - depthstr, /* depth */ + depthstr, /* depth */ prtstr, /* part spec */ namespaceId); CommandCounterIncrement(); @@ -3547,7 +3617,8 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, TupleDesc tupleDesc; Datum *values; bool *isnull; - int ii, jj; + int ii, + jj; Assert(rel); @@ -3562,9 +3633,9 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, *ppisnull = isnull; - Assert (IsA(expr, List)); + Assert(IsA(expr, List)); - jj = list_length((List *)expr); + jj = list_length((List *) expr); if (jj > ii) ereport(ERROR, @@ -3580,27 +3651,27 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, { ListCell *lc; - int i = 0; + int i = 0; - foreach(lc, (List *)expr) + foreach(lc, (List *) expr) { - Node *n1 = (Node *) lfirst(lc); - Const *c1; - AttrNumber attno = part->paratts[i++]; + Node *n1 = (Node *) lfirst(lc); + Const *c1; + AttrNumber attno = part->paratts[i++]; Form_pg_attribute attribute = tupleDesc->attrs[attno - 1]; - Oid lhsid = attribute->atttypid; + Oid lhsid = attribute->atttypid; if (!IsA(n1, Const)) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("Not a constant expression"))); - c1 = (Const *)n1; + c1 = (Const *) n1; if (lhsid != c1->consttype) { /* see coerce_partition_value */ - Node *out; + Node *out; out = coerce_partition_value(n1, lhsid, attribute->atttypmod, char_to_parttype(partnode->part->parkind)); @@ -3613,7 +3684,7 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, Assert(IsA(out, Const)); - c1 = (Const *)out; + c1 = (Const *) out; } /* XXX: cache */ @@ -3623,7 +3694,7 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, } return values; -} /* end magic_expr_to_datum */ +} /* end magic_expr_to_datum */ /* * Assume the partition rules are in the "correct" order and return @@ -3633,8 +3704,8 @@ magic_expr_to_datum(Relation rel, PartitionNode *partnode, static Oid selectPartitionByRank(PartitionNode *partnode, int rnk) { - Oid relid = InvalidOid; - List *rules = partnode->rules; + Oid relid = InvalidOid; + List *rules = partnode->rules; PartitionRule *rule; Assert(partnode->part->parkind == 'r'); @@ -3646,36 +3717,37 @@ selectPartitionByRank(PartitionNode *partnode, int rnk) return relid; if (rnk > 0) - rnk--; /* list_nth is zero-based, not one-based */ + rnk--; /* list_nth is zero-based, not one-based */ else if (rnk < 0) { rnk = list_length(rules) + rnk; /* if negative go from end */ /* mpp-3265 */ - if (rnk < 0) /* oops -- too negative */ + if (rnk < 0) /* oops -- too negative */ return relid; } - rule = (PartitionRule *)list_nth(rules, rnk); + rule = 
(PartitionRule *) list_nth(rules, rnk); return rule->parchildrelid; -} /* end selectPartitionByRank */ +} /* end selectPartitionByRank */ -static bool compare_partn_opfuncid(PartitionNode *partnode, - char *pub, char *compare_op, - List *colvals, - Datum *values, bool *isnull, - TupleDesc tupdesc) +static bool +compare_partn_opfuncid(PartitionNode *partnode, + char *pub, char *compare_op, + List *colvals, + Datum *values, bool *isnull, + TupleDesc tupdesc) { - Partition *part = partnode->part; - List *last_opname = list_make2(makeString(pub), - makeString(compare_op)); - List *opname = NIL; - ListCell *lc; - int numCols = 0; - int colCnt = 0; - int ii = 0; + Partition *part = partnode->part; + List *last_opname = list_make2(makeString(pub), + makeString(compare_op)); + List *opname = NIL; + ListCell *lc; + int numCols = 0; + int colCnt = 0; + int ii = 0; if (1 == strlen(compare_op)) { @@ -3685,13 +3757,13 @@ static bool compare_partn_opfuncid(PartitionNode *partnode, if (0 == strcmp(">", compare_op)) compare_op = ">="; - /* for a list of values, when performing less than or greater - * than comparison, only the final value is compared using - * less than or greater. All prior values must be compared - * with LTE/GTE. For example, comparing the list (1,2,3) to - * see if it is less than (1,2,4), we see that 1 <= 1, 2 <= 2, - * and 3 < 4. So the last_opname is the specified compare_op, - * and the prior opnames are LTE or GTE. + /* + * for a list of values, when performing less than or greater than + * comparison, only the final value is compared using less than or + * greater. All prior values must be compared with LTE/GTE. For + * example, comparing the list (1,2,3) to see if it is less than + * (1,2,4), we see that 1 <= 1, 2 <= 2, and 3 < 4. So the last_opname + * is the specified compare_op, and the prior opnames are LTE or GTE. 
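+ * (In other words, the comparison is column-wise rather than truly
+ * lexicographic: every leading column must pass the <= / >= test,
+ * and only the final column is compared with the strict operator.)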
*/ } @@ -3702,8 +3774,8 @@ static bool compare_partn_opfuncid(PartitionNode *partnode, foreach(lc, colvals) { - Const *c = lfirst(lc); - AttrNumber attno = part->paratts[ii]; + Const *c = lfirst(lc); + AttrNumber attno = part->paratts[ii]; if (isnull && isnull[attno - 1]) { @@ -3712,11 +3784,11 @@ static bool compare_partn_opfuncid(PartitionNode *partnode, } else { - Oid lhsid = tupdesc->attrs[attno - 1]->atttypid; - Oid rhsid = lhsid; - Oid opfuncid; - Datum res; - Datum d = values[attno - 1]; + Oid lhsid = tupdesc->attrs[attno - 1]->atttypid; + Oid rhsid = lhsid; + Oid opfuncid; + Datum res; + Datum d = values[attno - 1]; if (1 == colCnt) { @@ -3732,10 +3804,10 @@ static bool compare_partn_opfuncid(PartitionNode *partnode, ii++; colCnt--; - } /* end foreach */ + } /* end foreach */ return true; -} /* end compare_partn_opfuncid */ +} /* end compare_partn_opfuncid */ /* * Given a partition-by-list PartitionNode, search for @@ -3755,16 +3827,16 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *foundOid, PartitionRule **prule, Oid exprTypeOid) { - ListCell *lc; - Partition *part = partnode->part; + ListCell *lc; + Partition *part = partnode->part; MemoryContext oldcxt = NULL; PartitionListState *ls; if (accessMethods && accessMethods->amstate[partnode->part->parlevel]) - ls = (PartitionListState *)accessMethods->amstate[partnode->part->parlevel]; + ls = (PartitionListState *) accessMethods->amstate[partnode->part->parlevel]; else { - int natts = partnode->part->parnatts; + int natts = partnode->part->parnatts; ls = palloc(sizeof(PartitionListState)); @@ -3772,7 +3844,7 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, ls->eqinit = palloc0(sizeof(bool) * natts); if (accessMethods) - accessMethods->amstate[partnode->part->parlevel] = (void *)ls; + accessMethods->amstate[partnode->part->parlevel] = (void *) ls; } if (accessMethods && accessMethods->part_cxt) @@ -3784,18 +3856,17 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, foreach(lc, partnode->rules) { PartitionRule *rule = lfirst(lc); - List *vals = rule->parlistvalues; - ListCell *lc2; - bool matched = false; + List *vals = rule->parlistvalues; + ListCell *lc2; + bool matched = false; /* * list values are stored in a list of lists to support multi column * partitions. * * At this level, we're processing the list of possible values for the - * given rule, for example: - * values(1, 2, 3) - * values((1, '2005-01-01'), (2, '2006-01-01')) + * given rule, for example: values(1, 2, 3) values((1, '2005-01-01'), + * (2, '2006-01-01')) * * Each iteration is one element of the values list. In the first * example, we iterate '1', '2' then '3'.
For the second, we iterate @@ -3804,16 +3875,16 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, */ foreach(lc2, vals) { - ListCell *lc3; - List *colvals = (List *)lfirst(lc2); - int i = 0; + ListCell *lc3; + List *colvals = (List *) lfirst(lc2); + int i = 0; - matched = true; /* prove untrue */ + matched = true; /* prove untrue */ foreach(lc3, colvals) { - Const *c = lfirst(lc3); - AttrNumber attno = part->paratts[i]; + Const *c = lfirst(lc3); + AttrNumber attno = part->paratts[i]; if (isnull[attno - 1]) { @@ -3831,38 +3902,45 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, } else { - Datum res; - Datum d = values[attno - 1]; - FmgrInfo *finfo; + Datum res; + Datum d = values[attno - 1]; + FmgrInfo *finfo; if (!ls->eqinit[i]) { /* - * Compute the type of the LHS and RHS for the equality comparator. - * The way we call the comparator is comp(expr, rule) - * So lhstypid = type(expr) and rhstypeid = type(rule) + * Compute the type of the LHS and RHS for the + * equality comparator. The way we call the comparator + * is comp(expr, rule) So lhstypid = type(expr) and + * rhstypeid = type(rule) */ - /* The tupdesc tuple descriptor matches the table schema, so it has the rule type */ - Oid rhstypid = tupdesc->attrs[attno - 1]->atttypid; + /* + * The tupdesc tuple descriptor matches the table + * schema, so it has the rule type + */ + Oid rhstypid = tupdesc->attrs[attno - 1]->atttypid; /* - * exprTypeOid is passed to us from our caller which evaluated the expression. - * In some cases (e.g. legacy optimizer doing explicit casting), we don't - * specify exprTypeOid. - * Assume lhstypid = rhstypid in those cases + * exprTypeOid is passed to us from our caller which + * evaluated the expression. In some cases (e.g. legacy + * optimizer doing explicit casting), we don't + * specify exprTypeOid. Assume lhstypid = rhstypid in + * those cases */ - Oid lhstypid = exprTypeOid; + Oid lhstypid = exprTypeOid; + if (!OidIsValid(lhstypid)) { lhstypid = rhstypid; } - List *opname = list_make2(makeString("pg_catalog"), - makeString("=")); + List *opname = list_make2(makeString("pg_catalog"), + makeString("=")); + + Oid opfuncid = get_opfuncid_by_opname(opname, lhstypid, rhstypid); - Oid opfuncid = get_opfuncid_by_opname(opname, lhstypid, rhstypid); fmgr_info(opfuncid, &(ls->eqfuncs[i])); ls->eqinit[i] = true; } @@ -3919,11 +3997,12 @@ selectListPartition(PartitionNode *partnode, Datum *values, bool *isnull, static Oid get_less_than_oper(Oid lhstypid, Oid rhstypid, bool strictlyless) { - Value *str = strictlyless ? makeString("<") : makeString("<="); - Value *pub = makeString("pg_catalog"); - List *opname = list_make2(pub, str); + Value *str = strictlyless ?
makeString("<") : makeString("<="); + Value *pub = makeString("pg_catalog"); + List *opname = list_make2(pub, str); + + Oid funcid = get_opfuncid_by_opname(opname, lhstypid, rhstypid); - Oid funcid = get_opfuncid_by_opname(opname, lhstypid, rhstypid); list_free_deep(opname); return funcid; @@ -3950,41 +4029,52 @@ get_less_than_comparator(int keyno, PartitionRangeState *rs, Oid ruleTypeOid, Oi Assert(NULL != rs); - Oid lhsOid = InvalidOid; - Oid rhsOid = InvalidOid; - FmgrInfo *funcInfo = NULL; + Oid lhsOid = InvalidOid; + Oid rhsOid = InvalidOid; + FmgrInfo *funcInfo = NULL; - if (is_direct && strictlyless) { + if (is_direct && strictlyless) + { /* Looking for expr < partRule comparator */ funcInfo = &rs->ltfuncs_direct[keyno]; - } else if (is_direct && !strictlyless) { + } + else if (is_direct && !strictlyless) + { /* Looking for expr <= partRule comparator */ funcInfo = &rs->lefuncs_direct[keyno]; - } else if (!is_direct && strictlyless) { + } + else if (!is_direct && strictlyless) + { /* Looking for partRule < expr comparator */ funcInfo = &rs->ltfuncs_inverse[keyno]; - } else if (!is_direct && !strictlyless) { + } + else if (!is_direct && !strictlyless) + { /* Looking for partRule <= expr comparator */ funcInfo = &rs->lefuncs_inverse[keyno]; } Assert(NULL != funcInfo); - if (!OidIsValid(funcInfo->fn_oid)) { + if (!OidIsValid(funcInfo->fn_oid)) + { /* We haven't looked up this comparator before, let's do it now */ - if (is_direct) { + if (is_direct) + { /* Looking for "direct" comparators (expr OP partRule ) */ lhsOid = exprTypeOid; rhsOid = ruleTypeOid; } - else { + else + { /* Looking for "inverse" comparators (partRule OP expr ) */ lhsOid = ruleTypeOid; rhsOid = exprTypeOid; } - Oid funcid = get_less_than_oper(lhsOid, rhsOid, strictlyless); + Oid funcid = get_less_than_oper(lhsOid, rhsOid, strictlyless); + fmgr_info(funcid, funcInfo); } @@ -4010,9 +4100,9 @@ static int range_test(Datum tupval, Oid ruleTypeOid, Oid exprTypeOid, PartitionRangeState *rs, int keyno, PartitionRule *rule) { - Const *c = NULL; - FmgrInfo *finfo; - Datum res; + Const *c = NULL; + FmgrInfo *finfo; + Datum res; Assert(PointerIsValid(rule->parrangestart) || PointerIsValid(rule->parrangeend)); @@ -4022,17 +4112,18 @@ range_test(Datum tupval, Oid ruleTypeOid, Oid exprTypeOid, PartitionRangeState * { Assert(IsA(rule->parrangestart, List)); #if NOT_YET - c = (Const *)list_nth((List *)rule->parrangestart, keyno); + c = (Const *) list_nth((List *) rule->parrangestart, keyno); #else - c = (Const *)linitial((List *)rule->parrangestart); + c = (Const *) linitial((List *) rule->parrangestart); #endif /* - * Is the value in the range? - * If rule->parrangestartincl, we request for comparator ruleVal <= exprVal ( ==> strictly_less = false) - * Otherwise, we request comparator ruleVal < exprVal ( ==> strictly_less = true) + * Is the value in the range? 
If rule->parrangestartincl, we request + * for comparator ruleVal <= exprVal ( ==> strictly_less = false) + * Otherwise, we request comparator ruleVal < exprVal ( ==> + * strictly_less = true) */ - finfo = get_less_than_comparator(keyno, rs, ruleTypeOid, exprTypeOid, !rule->parrangestartincl /* strictly_less */, false /* is_direct */); + finfo = get_less_than_comparator(keyno, rs, ruleTypeOid, exprTypeOid, !rule->parrangestartincl /* strictly_less */ , false /* is_direct */ ); res = FunctionCall2(finfo, c->constvalue, tupval); if (!DatumGetBool(res)) @@ -4043,17 +4134,18 @@ range_test(Datum tupval, Oid ruleTypeOid, Oid exprTypeOid, PartitionRangeState * if (PointerIsValid(rule->parrangeend)) { #if NOT_YET - c = (Const *)list_nth((List *)rule->parrangeend, keyno); + c = (Const *) list_nth((List *) rule->parrangeend, keyno); #else - c = (Const *)linitial((List *)rule->parrangeend); + c = (Const *) linitial((List *) rule->parrangeend); #endif /* - * Is the value in the range? - * If rule->parrangeendincl, we request for comparator exprVal <= ruleVal ( ==> strictly_less = false) - * Otherwise, we request comparator exprVal < ruleVal ( ==> strictly_less = true) + * Is the value in the range? If rule->parrangeendincl, we request for + * comparator exprVal <= ruleVal ( ==> strictly_less = false) + * Otherwise, we request comparator exprVal < ruleVal ( ==> + * strictly_less = true) */ - finfo = get_less_than_comparator(keyno, rs, ruleTypeOid, exprTypeOid, !rule->parrangeendincl /* strictly_less */, true /* is_direct */); + finfo = get_less_than_comparator(keyno, rs, ruleTypeOid, exprTypeOid, !rule->parrangeendincl /* strictly_less */ , true /* is_direct */ ); res = FunctionCall2(finfo, tupval, c->constvalue); if (!DatumGetBool(res)) @@ -4073,30 +4165,34 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *foundOid, int *pSearch, PartitionRule **prule, Oid exprTypeOid) { - List *rules = partnode->rules; - int high = list_length(rules) - 1; - int low = 0; - int searchpoint = 0; - int mid = 0; - bool matched = false; + List *rules = partnode->rules; + int high = list_length(rules) - 1; + int low = 0; + int searchpoint = 0; + int mid = 0; + bool matched = false; PartitionRule *rule = NULL; PartitionNode *pNode = NULL; PartitionRangeState *rs = NULL; MemoryContext oldcxt = NULL; Assert(partnode->part->parkind == 'r'); - /* For composite partitioning keys, exprTypeOid should always be InvalidOid */ + + /* + * For composite partitioning keys, exprTypeOid should always be + * InvalidOid + */ AssertImply(partnode->part->parnatts > 1, !OidIsValid(exprTypeOid)); if (accessMethods && accessMethods->amstate[partnode->part->parlevel]) - rs = (PartitionRangeState *)accessMethods->amstate[partnode->part->parlevel]; + rs = (PartitionRangeState *) accessMethods->amstate[partnode->part->parlevel]; else { - int natts = partnode->part->parnatts; + int natts = partnode->part->parnatts; /* - * We're still in our caller's memory context so - * the memory will persist long enough for us. + * We're still in our caller's memory context so the memory will + * persist long enough for us. 
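Both bound checks in range_test() reduce to the same shape. A self-contained sketch over plain integers (IntRange and int_range_test are illustrative names; the backend instead calls the cached fmgr comparators selected by the strictly_less and is_direct flags):

#include <stdbool.h>

/*
 * Schematic range_test(): a value lies in a rule's range iff the start
 * bound (if any) admits it and the end bound (if any) admits it.
 * Inclusive bounds compare with <=, exclusive bounds with <, which is
 * exactly what the strictly_less flag encodes above.  The three-way
 * result lets a caller steer a binary search over the ordered rule list.
 */
typedef struct IntRange
{
	bool		has_start;
	bool		start_incl;
	int			start;
	bool		has_end;
	bool		end_incl;
	int			end;
} IntRange;

static int
int_range_test(int v, const IntRange *r)
{
	if (r->has_start && !(r->start_incl ? r->start <= v : r->start < v))
		return -1;				/* value falls before this rule */
	if (r->has_end && !(r->end_incl ? v <= r->end : v < r->end))
		return 1;				/* value falls after this rule */
	return 0;					/* value is inside the range */
}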
*/ rs = palloc(sizeof(PartitionRangeState)); rs->lefuncs_direct = palloc0(sizeof(FmgrInfo) * natts); @@ -4105,8 +4201,8 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, rs->ltfuncs_inverse = palloc0(sizeof(FmgrInfo) * natts); /* - * Set the function Oid to InvalidOid to signal that we - * haven't looked up this function yet + * Set the function Oid to InvalidOid to signal that we haven't looked + * up this function yet */ for (int keyno = 0; keyno < natts; keyno++) { @@ -4117,18 +4213,18 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, } /* - * Unrolling the rules into an array currently works for the - * top level partition only + * Unrolling the rules into an array currently works for the top level + * partition only */ if (partnode->part->parlevel == 0) { - int i = 0; - ListCell *lc; + int i = 0; + ListCell *lc; rs->rules = palloc(sizeof(PartitionRule *) * list_length(rules)); foreach(lc, rules) - rs->rules[i++] = (PartitionRule *)lfirst(lc); + rs->rules[i++] = (PartitionRule *) lfirst(lc); } else rs->rules = NULL; @@ -4150,28 +4246,26 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, * * Consider the following intervals: * - * 1. start( 1, 8) end( 10, 9) - * 2. start( 1, 9) end( 15, 10) - * 3. start( 1, 11) end(100, 12) - * 4. start(15, 10) end( 30, 11) + * 1. start( 1, 8) end( 10, 9) 2. start( 1, 9) end( 15, 10) 3. start( + * 1, 11) end(100, 12) 4. start(15, 10) end( 30, 11) * - * If we were to try and find the partition for a tuple (25, 10), using the - * binary search for the first element, we'd select partition 3 but + * If we were to try and find the partition for a tuple (25, 10), using + * the binary search for the first element, we'd select partition 3 but * partition 4 is also a candidate. It is only when we look at the second * element that we find the single definitive rule. */ while (low <= high) { - AttrNumber attno = partnode->part->paratts[0]; - Datum exprValue = values[attno - 1]; - int ret; + AttrNumber attno = partnode->part->paratts[0]; + Datum exprValue = values[attno - 1]; + int ret; - mid = low + (high - low)/2; + mid = low + (high - low) / 2; if (rs->rules) rule = rs->rules[mid]; else - rule = (PartitionRule *)list_nth(rules, mid); + rule = (PartitionRule *) list_nth(rules, mid); if (isnull[attno - 1]) { @@ -4179,7 +4273,8 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, goto l_fin_range; } - Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; + Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; + if (OidIsValid(exprTypeOid)) { ret = range_test(exprValue, ruleTypeOid, exprTypeOid, rs, 0, rule); @@ -4187,8 +4282,8 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, else { /* - * In some cases, we don't have an expression type oid. In those cases, the expression and - * partition rules have the same type. + * In some cases, we don't have an expression type oid. In those + * cases, the expression and partition rules have the same type. */ ret = range_test(exprValue, ruleTypeOid, ruleTypeOid, rs, 0, rule); } @@ -4213,7 +4308,7 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, if (matched) { - int j; + int j; /* Non-composite partition key, we matched so we're done */ if (partnode->part->parnatts == 1) @@ -4225,22 +4320,25 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, goto l_fin_range; } - /* We have more than one partition key.. 
Must match on the other keys as well */ + /* + * We have more than one partition key.. Must match on the other keys + * as well + */ j = mid; do { - int i; - bool matched = true; - bool first_fail = false; + int i; + bool matched = true; + bool first_fail = false; for (i = 0; i < partnode->part->parnatts; i++) { - AttrNumber attno = partnode->part->paratts[i]; - Datum d = values[attno - 1]; - int ret; + AttrNumber attno = partnode->part->paratts[i]; + Datum d = values[attno - 1]; + int ret; if (j != mid) - rule = (PartitionRule *)list_nth(rules, j); + rule = (PartitionRule *) list_nth(rules, j); if (isnull[attno - 1]) { @@ -4248,8 +4346,12 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, goto l_fin_range; } - Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; - /* For composite partition keys, we don't support casting comparators, so both sides must be of identical types */ + Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; + + /* + * For composite partition keys, we don't support casting + * comparators, so both sides must be of identical types + */ Assert(!OidIsValid(exprTypeOid)); ret = range_test(d, ruleTypeOid, ruleTypeOid, rs, i, rule); @@ -4286,17 +4388,17 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, j = mid; do { - int i; - bool matched = true; - bool first_fail = false; + int i; + bool matched = true; + bool first_fail = false; for (i = 0; i < partnode->part->parnatts; i++) { - AttrNumber attno = partnode->part->paratts[i]; - Datum d = values[attno - 1]; - int ret; + AttrNumber attno = partnode->part->paratts[i]; + Datum d = values[attno - 1]; + int ret; - rule = (PartitionRule *)list_nth(rules, j); + rule = (PartitionRule *) list_nth(rules, j); if (isnull[attno - 1]) { @@ -4304,8 +4406,12 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, goto l_fin_range; } - Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; - /* For composite partition keys, we don't support casting comparators, so both sides must be of identical types */ + Oid ruleTypeOid = tupdesc->attrs[attno - 1]->atttypid; + + /* + * For composite partition keys, we don't support casting + * comparators, so both sides must be of identical types + */ Assert(!OidIsValid(exprTypeOid)); ret = range_test(d, ruleTypeOid, ruleTypeOid, rs, i, rule); if (ret != 0) @@ -4336,7 +4442,7 @@ selectRangePartition(PartitionNode *partnode, Datum *values, bool *isnull, } while (++j < list_length(rules)); - } /* end if matched */ + } /* end if matched */ pNode = NULL; @@ -4348,10 +4454,10 @@ l_fin_range: MemoryContextSwitchTo(oldcxt); if (accessMethods) - accessMethods->amstate[partnode->part->parlevel] = (void *)rs; + accessMethods->amstate[partnode->part->parlevel] = (void *) rs; return pNode; -} /* end selectrangepartition */ +} /* end selectrangepartition */ /* select partition via hash */ @@ -4359,9 +4465,9 @@ static PartitionNode * selectHashPartition(PartitionNode *partnode, Datum *values, bool *isnull, TupleDesc tupdesc, PartitionAccessMethods *accessMethods, Oid *found, PartitionRule **prule) { - uint32 hash = 0; - int i; - int part; + uint32 hash = 0; + int i; + int part; PartitionRule *rule; MemoryContext oldcxt = NULL; @@ -4373,7 +4479,7 @@ selectHashPartition(PartitionNode *partnode, Datum *values, bool *isnull, for (i = 0; i < partnode->part->parnatts; i++) { - AttrNumber attnum = partnode->part->paratts[i]; + AttrNumber attnum = partnode->part->paratts[i]; /* rotate hash left 1 bit at each step */ hash = (hash << 1) | ((hash 
& 0x80000000) ? 1 : 0); @@ -4386,19 +4492,20 @@ selectHashPartition(PartitionNode *partnode, Datum *values, bool *isnull, continue; else { - Oid opclass = partnode->part->parclass[i]; - Oid inctype = get_opclass_input_type(opclass); - Oid opfamily = get_opclass_family(opclass); + Oid opclass = partnode->part->parclass[i]; + Oid inctype = get_opclass_input_type(opclass); + Oid opfamily = get_opclass_family(opclass); + + Oid hashfunc = get_opfamily_proc(opfamily, inctype, inctype, HASHPROC); + Datum d = values[attnum - 1]; - Oid hashfunc = get_opfamily_proc(opfamily, inctype, inctype, HASHPROC); - Datum d = values[attnum - 1]; hash ^= DatumGetUInt32(OidFunctionCall1(hashfunc, d)); } } part = hash % list_length(partnode->rules); - rule = (PartitionRule *)list_nth(partnode->rules, part); + rule = (PartitionRule *) list_nth(partnode->rules, part); *found = rule->parchildrelid; *prule = rule; @@ -4423,8 +4530,8 @@ selectPartition1(PartitionNode *partnode, Datum *values, bool *isnull, int *pSearch, PartitionNode **ppn_out) { - Oid relid = InvalidOid; - Partition *part = partnode->part; + Oid relid = InvalidOid; + Partition *part = partnode->part; PartitionNode *pn = NULL; PartitionRule *prule = NULL; @@ -4434,15 +4541,15 @@ selectPartition1(PartitionNode *partnode, Datum *values, bool *isnull, /* what kind of partition? */ switch (part->parkind) { - case 'r': /* range */ + case 'r': /* range */ pn = selectRangePartition(partnode, values, isnull, tupdesc, accessMethods, &relid, pSearch, &prule, InvalidOid); break; - case 'h': /* hash */ + case 'h': /* hash */ pn = selectHashPartition(partnode, values, isnull, tupdesc, accessMethods, &relid, &prule); break; - case 'l': /* list */ + case 'l': /* list */ pn = selectListPartition(partnode, values, isnull, tupdesc, accessMethods, &relid, &prule, InvalidOid); break; @@ -4473,8 +4580,9 @@ selectPartition1(PartitionNode *partnode, Datum *values, bool *isnull, { *ppn_out = partnode->default_part->children; - /* don't return the relid, it is invalid -- return - * the relid of the default partition instead + /* + * don't return the relid, it is invalid -- return the + * relid of the default partition instead */ return partnode->default_part->parchildrelid; } @@ -4514,23 +4622,23 @@ selectPartition(PartitionNode *partnode, Datum *values, bool *isnull, * * return: PartitionRule of which constraints match the input key */ -PartitionRule* +PartitionRule * get_next_level_matched_partition(PartitionNode *partnode, Datum *values, bool *isnull, - TupleDesc tupdesc, PartitionAccessMethods *accessMethods, - Oid exprTypid) + TupleDesc tupdesc, PartitionAccessMethods *accessMethods, + Oid exprTypid) { - Oid relid = InvalidOid; - Partition *part = partnode->part; + Oid relid = InvalidOid; + Partition *part = partnode->part; PartitionRule *prule = NULL; /* what kind of partition? */ switch (part->parkind) { - case 'r': /* range */ + case 'r': /* range */ selectRangePartition(partnode, values, isnull, tupdesc, - accessMethods, &relid, NULL, &prule, exprTypid); + accessMethods, &relid, NULL, &prule, exprTypid); break; - case 'l': /* list */ + case 'l': /* list */ selectListPartition(partnode, values, isnull, tupdesc, accessMethods, &relid, &prule, exprTypid); break; @@ -4567,7 +4675,7 @@ get_next_level_matched_partition(PartitionNode *partnode, Datum *values, bool *i * 'relation "foo"' or 'partition "baz" of relation "foo"'. 
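The hash-combining step in selectHashPartition() above is compact enough to restate on its own. A sketch assuming the per-column hashes are already computed (col_hash stands in for the opclass HASHPROC the backend resolves via get_opfamily_proc()):

#include <stdbool.h>
#include <stdint.h>

/*
 * Rotate the running hash left one bit for every partitioning column,
 * XOR in that column's hash (NULL columns contribute only the rotation),
 * then take the result modulo the rule count, as in the loop above.
 * Assumes nrules > 0.
 */
static int
pick_hash_bucket(const uint32_t *col_hash, const bool *is_null,
				 int natts, int nrules)
{
	uint32_t	hash = 0;

	for (int i = 0; i < natts; i++)
	{
		/* rotate hash left 1 bit at each step */
		hash = (hash << 1) | ((hash & 0x80000000) ? 1 : 0);

		if (!is_null[i])
			hash ^= col_hash[i];
	}
	return (int) (hash % (uint32_t) nrules);
}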
* */ -PgPartRule* +PgPartRule * get_part_rule1(Relation rel, AlterPartitionId *pid, bool bExistError, @@ -4576,17 +4684,17 @@ get_part_rule1(Relation rel, PartitionNode *pNode, char *relname, PartitionNode **ppNode - ) +) { - char namBuf[NAMEDATALEN]; /* the real partition name */ + char namBuf[NAMEDATALEN]; /* the real partition name */ /* a textual representation of the partition id (for error msgs) */ - char partIdStr[(NAMEDATALEN * 2)]; + char partIdStr[(NAMEDATALEN * 2)]; - PgPartRule *prule = NULL; + PgPartRule *prule = NULL; - Oid partrelid = InvalidOid; - int idrank = 0; /* only set for range partns by rank */ + Oid partrelid = InvalidOid; + int idrank = 0; /* only set for range partns by rank */ if (!pid) ereport(ERROR, @@ -4596,65 +4704,65 @@ get_part_rule1(Relation rel, namBuf[0] = 0; - /* build the partition "id string" for error messages as a - * partition name, value, or rank. + /* + * build the partition "id string" for error messages as a partition name, + * value, or rank. * - * Later on, if we discover - * (the partition exists) and - * (it has a name) + * Later on, if we discover (the partition exists) and (it has a name) * then we update the partIdStr to the name */ switch (pid->idtype) { - case AT_AP_IDNone: /* no ID */ + case AT_AP_IDNone: /* no ID */ /* should never happen */ partIdStr[0] = 0; break; - case AT_AP_IDName: /* IDentify by Name */ + case AT_AP_IDName: /* IDentify by Name */ snprintf(partIdStr, sizeof(partIdStr), " \"%s\"", strVal(pid->partiddef)); snprintf(namBuf, sizeof(namBuf), "%s", strVal(pid->partiddef)); break; - case AT_AP_IDValue: /* IDentifier FOR Value */ + case AT_AP_IDValue: /* IDentifier FOR Value */ snprintf(partIdStr, sizeof(partIdStr), " for specified value"); break; - case AT_AP_IDRank: /* IDentifier FOR Rank */ - { - snprintf(partIdStr, sizeof(partIdStr), " for specified rank"); + case AT_AP_IDRank: /* IDentifier FOR Rank */ + { + snprintf(partIdStr, sizeof(partIdStr), " for specified rank"); #ifdef WIN32 #define round(x) (x+0.5) #endif - if (IsA(pid->partiddef, Integer)) - idrank = intVal(pid->partiddef); - else if (IsA(pid->partiddef, Float)) - idrank = floor(floatVal(pid->partiddef)); - else - Assert(false); + if (IsA(pid->partiddef, Integer)) + idrank = intVal(pid->partiddef); + else if (IsA(pid->partiddef, Float)) + idrank = floor(floatVal(pid->partiddef)); + else + Assert(false); - snprintf(partIdStr, sizeof(partIdStr), - " for rank %d", - idrank); - } + snprintf(partIdStr, sizeof(partIdStr), + " for rank %d", + idrank); + } break; - case AT_AP_ID_oid: /* IDentifier by oid */ + case AT_AP_ID_oid: /* IDentifier by oid */ snprintf(partIdStr, sizeof(partIdStr), " for oid %u", - *((Oid *)(pid->partiddef))); + *((Oid *) (pid->partiddef))); break; - case AT_AP_IDDefault: /* IDentify DEFAULT partition */ + case AT_AP_IDDefault: /* IDentify DEFAULT partition */ snprintf(partIdStr, sizeof(partIdStr), " for DEFAULT"); break; case AT_AP_IDRule: - { - PgPartRule *p = linitial((List *)pid->partiddef); - snprintf(partIdStr, sizeof(partIdStr), "%s", - p->partIdStr); - return p; - break; - } - default: /* XXX XXX */ + { + PgPartRule *p = linitial((List *) pid->partiddef); + + snprintf(partIdStr, sizeof(partIdStr), "%s", + p->partIdStr); + return p; + break; + } + default: /* XXX XXX */ Assert(false); } @@ -4665,22 +4773,23 @@ get_part_rule1(Relation rel, errmsg("%s is not partitioned", relname))); - /* if id is a value or rank, get the relid of the partition if - * it exists */ + /* + * if id is a value or rank, get the relid of the partition if 
it exists + */ if (pNode) { if (pid->idtype == AT_AP_IDValue) { - TupleDesc tupledesc = RelationGetDescr(rel); - bool *isnull; - PartitionNode *pNode2 = NULL; - Datum *d = magic_expr_to_datum(rel, pNode, - pid->partiddef, &isnull); + TupleDesc tupledesc = RelationGetDescr(rel); + bool *isnull; + PartitionNode *pNode2 = NULL; + Datum *d = magic_expr_to_datum(rel, pNode, + pid->partiddef, &isnull); - /* MPP-4011: get right pid for FOR(value). pass a pNode - * ptr down to prevent recursion in selectPartition -- we - * only want the top-most partition for the value in this - * case + /* + * MPP-4011: get right pid for FOR(value). pass a pNode ptr down + * to prevent recursion in selectPartition -- we only want the + * top-most partition for the value in this case */ if (ppNode) partrelid = selectPartition1(pNode, d, isnull, tupledesc, NULL, @@ -4691,20 +4800,20 @@ get_part_rule1(Relation rel, /* build a string rep for the value */ { - ParseState *pstate = make_parsestate(NULL); - Node *pval = (Node *)pid->partiddef; - char *idval = NULL; + ParseState *pstate = make_parsestate(NULL); + Node *pval = (Node *) pid->partiddef; + char *idval = NULL; - pval = (Node *)transformExpressionList(pstate, - (List *)pval); + pval = (Node *) transformExpressionList(pstate, + (List *) pval); free_parsestate(pstate); idval = - deparse_expression(pval, - deparse_context_for(RelationGetRelationName(rel), - RelationGetRelid(rel)), - false, false); + deparse_expression(pval, + deparse_context_for(RelationGetRelationName(rel), + RelationGetRelid(rel)), + false, false); if (idval) snprintf(partIdStr, sizeof(partIdStr), @@ -4715,19 +4824,19 @@ get_part_rule1(Relation rel, } else if (pid->idtype == AT_AP_IDRank) { - char *parTypName = "UNKNOWN"; + char *parTypName = "UNKNOWN"; if (pNode->part->parkind != 'r') { switch (pNode->part->parkind) { - case 'h': /* hash */ + case 'h': /* hash */ parTypName = "HASH"; break; - case 'l': /* list */ + case 'l': /* list */ parTypName = "LIST"; break; - } /* end switch */ + } /* end switch */ ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -4738,15 +4847,15 @@ get_part_rule1(Relation rel, } - partrelid = selectPartitionByRank(pNode, idrank); + partrelid = selectPartitionByRank(pNode, idrank); } } /* check thru the list of partition rules to match by relid or name */ if (pNode) { - ListCell *lc; - int rulerank = 1; + ListCell *lc; + int rulerank = 1; /* set up the relid for the default partition if necessary */ if ((pid->idtype == AT_AP_IDDefault) @@ -4756,7 +4865,7 @@ get_part_rule1(Relation rel, foreach(lc, pNode->rules) { PartitionRule *rule = lfirst(lc); - bool foundit = false; + bool foundit = false; if ((pid->idtype == AT_AP_IDValue) || (pid->idtype == AT_AP_IDRank)) @@ -4787,18 +4896,18 @@ get_part_rule1(Relation rel, prule->pNode = pNode; prule->topRule = rule; - prule->topRuleRank = rulerank; /* 1-based */ + prule->topRuleRank = rulerank; /* 1-based */ prule->relname = relname; break; } rulerank++; - } /* end foreach */ + } /* end foreach */ /* if cannot find, check default partition */ if (!prule && pNode->default_part) { PartitionRule *rule = pNode->default_part; - bool foundit = false; + bool foundit = false; if ((pid->idtype == AT_AP_IDValue) || (pid->idtype == AT_AP_IDRank) @@ -4834,12 +4943,13 @@ get_part_rule1(Relation rel, prule->relname = relname; } } - } /* end if pnode */ + } /* end if pnode */ - /* if the partition exists, set the "id string" in prule and - * indicate whether it is the partition name. 
The ATPExec - * commands will notify users of the "real" name if the original - * specification was by value or rank + /* + * if the partition exists, set the "id string" in prule and indicate + * whether it is the partition name. The ATPExec commands will notify + * users of the "real" name if the original specification was by value or + * rank */ if (prule) { @@ -4872,27 +4982,27 @@ get_part_rule1(Relation rel, { switch (pid->idtype) { - case AT_AP_IDNone: /* no ID */ + case AT_AP_IDNone: /* no ID */ /* should never happen */ Assert(false); break; - case AT_AP_IDName: /* IDentify by Name */ - case AT_AP_IDValue: /* IDentifier FOR Value */ - case AT_AP_IDRank: /* IDentifier FOR Rank */ - case AT_AP_ID_oid: /* IDentifier by oid */ + case AT_AP_IDName: /* IDentify by Name */ + case AT_AP_IDValue: /* IDentifier FOR Value */ + case AT_AP_IDRank: /* IDentifier FOR Rank */ + case AT_AP_ID_oid: /* IDentifier by oid */ ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("partition%s of %s does not exist", partIdStr, relname))); break; - case AT_AP_IDDefault: /* IDentify DEFAULT partition */ + case AT_AP_IDDefault: /* IDentify DEFAULT partition */ ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("DEFAULT partition of %s does not exist", relname))); break; - default: /* XXX XXX */ + default: /* XXX XXX */ Assert(false); } @@ -4901,21 +5011,21 @@ get_part_rule1(Relation rel, { switch (pid->idtype) { - case AT_AP_IDNone: /* no ID */ + case AT_AP_IDNone: /* no ID */ /* should never happen */ Assert(false); break; - case AT_AP_IDName: /* IDentify by Name */ - case AT_AP_IDValue: /* IDentifier FOR Value */ - case AT_AP_IDRank: /* IDentifier FOR Rank */ - case AT_AP_ID_oid: /* IDentifier by oid */ + case AT_AP_IDName: /* IDentify by Name */ + case AT_AP_IDValue: /* IDentifier FOR Value */ + case AT_AP_IDRank: /* IDentifier FOR Rank */ + case AT_AP_ID_oid: /* IDentifier by oid */ ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("partition%s of %s already exists", partIdStr, relname))); break; - case AT_AP_IDDefault: /* IDentify DEFAULT partition */ + case AT_AP_IDDefault: /* IDentify DEFAULT partition */ ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("DEFAULT partition%s of %s already exists", @@ -4923,7 +5033,7 @@ get_part_rule1(Relation rel, partIdStr : "", relname))); break; - default: /* XXX XXX */ + default: /* XXX XXX */ Assert(false); } @@ -4933,7 +5043,7 @@ get_part_rule1(Relation rel, L_fin_partrule: return prule; -} /* end get_part_rule1 */ +} /* end get_part_rule1 */ PgPartRule * get_part_rule(Relation rel, @@ -4943,10 +5053,10 @@ get_part_rule(Relation rel, int *pSearch, bool inctemplate) { - PartitionNode *pNode = NULL; - PartitionNode *pNode2 = NULL; - char relnamBuf[(NAMEDATALEN * 2)]; - char *relname; + PartitionNode *pNode = NULL; + PartitionNode *pNode2 = NULL; + char relnamBuf[(NAMEDATALEN * 2)]; + char *relname; snprintf(relnamBuf, sizeof(relnamBuf), "relation \"%s\"", RelationGetRelationName(rel)); @@ -4967,19 +5077,19 @@ get_part_rule(Relation rel, if (pid->idtype == AT_AP_IDRule) { - List *l1 = (List *)pid->partiddef; - ListCell *lc; - AlterPartitionId *pid2 = NULL; - PgPartRule* prule2 = NULL; + List *l1 = (List *) pid->partiddef; + ListCell *lc; + AlterPartitionId *pid2 = NULL; + PgPartRule *prule2 = NULL; lc = list_head(l1); - prule2 = (PgPartRule*) lfirst(lc); + prule2 = (PgPartRule *) lfirst(lc); if (prule2 && prule2->topRule && prule2->topRule->children) pNode = prule2->topRule->children; lc = lnext(lc); - pid2 = (AlterPartitionId 
*)lfirst(lc); + pid2 = (AlterPartitionId *) lfirst(lc); prule2 = get_part_rule1(rel, pid2, @@ -4999,11 +5109,12 @@ get_part_rule(Relation rel, if (pid->idtype == AT_AP_IDList) { - List *l1 = (List *)pid->partiddef; - ListCell *lc; - AlterPartitionId *pid2 = NULL; - PgPartRule* prule2 = NULL; - StringInfoData sid1, sid2; + List *l1 = (List *) pid->partiddef; + ListCell *lc; + AlterPartitionId *pid2 = NULL; + PgPartRule *prule2 = NULL; + StringInfoData sid1, + sid2; initStringInfo(&sid1); initStringInfo(&sid2); @@ -5012,7 +5123,7 @@ get_part_rule(Relation rel, foreach(lc, l1) { - pid2 = (AlterPartitionId *)lfirst(lc); + pid2 = (AlterPartitionId *) lfirst(lc); prule2 = get_part_rule1(rel, pid2, @@ -5032,12 +5143,12 @@ get_part_rule(Relation rel, truncateStringInfo(&sid1, 0); appendStringInfo(&sid1, "%s", sid2.data); truncateStringInfo(&sid2, 0); - } /* end foreach */ + } /* end foreach */ return prule2; } return NULL; -} /* end get_part_rule */ +} /* end get_part_rule */ static void fixup_table_storage_options(CreateStmt *ct) @@ -5045,9 +5156,9 @@ fixup_table_storage_options(CreateStmt *ct) if (!ct->options) { ct->options = list_make2(makeDefElem("appendonly", - (Node *)makeString("true")), + (Node *) makeString("true")), makeDefElem("orientation", - (Node *)makeString("column"))); + (Node *) makeString("column"))); } } @@ -5062,13 +5173,13 @@ static void apply_template_storage_encodings(CreateStmt *ct, Oid relid, Oid paroid, PartitionSpec *tmpl) { - List *encs = get_deparsed_partition_encodings(relid, paroid); + List *encs = get_deparsed_partition_encodings(relid, paroid); if (encs) { /* - * If the user didn't specify WITH (...) at create time, - * we need to force the new partitions to be AO/CO. + * If the user didn't specify WITH (...) at create time, we need to + * force the new partitions to be AO/CO. 
*/ fixup_table_storage_options(ct); tmpl->partElem = list_concat(tmpl->partElem, @@ -5084,36 +5195,36 @@ apply_template_storage_encodings(CreateStmt *ct, Oid relid, Oid paroid, static int atpxPart_validate_spec( - PartitionBy *pBy, - CreateStmtContext *pcxt, - Relation rel, - CreateStmt *ct, - PartitionElem *pelem, - PartitionNode *pNode, - char *partName, - bool isDefault, - PartitionByType part_type, - char *partDesc) + PartitionBy *pBy, + CreateStmtContext *pcxt, + Relation rel, + CreateStmt *ct, + PartitionElem *pelem, + PartitionNode *pNode, + char *partName, + bool isDefault, + PartitionByType part_type, + char *partDesc) { - PartitionSpec *spec = makeNode(PartitionSpec); - ParseState *pstate = NULL; - List *schema = NIL; - List *inheritOids; - List *old_constraints; - int parentOidCount; - int result; - PartitionNode *pNode_tmpl = NULL; + PartitionSpec *spec = makeNode(PartitionSpec); + ParseState *pstate = NULL; + List *schema = NIL; + List *inheritOids; + List *old_constraints; + int parentOidCount; + int result; + PartitionNode *pNode_tmpl = NULL; /* get the table column defs */ schema = - MergeAttributes(schema, - list_make1( - makeRangeVar( - get_namespace_name( - RelationGetNamespace(rel)), - pstrdup(RelationGetRelationName(rel)), -1)), - false, true /* isPartitioned */, - &inheritOids, &old_constraints, &parentOidCount, NULL); + MergeAttributes(schema, + list_make1( + makeRangeVar( + get_namespace_name( + RelationGetNamespace(rel)), + pstrdup(RelationGetRelationName(rel)), -1)), + false, true /* isPartitioned */ , + &inheritOids, &old_constraints, &parentOidCount, NULL); pcxt->columns = schema; @@ -5130,64 +5241,63 @@ atpxPart_validate_spec( pelem->rrand = random(); pBy->partType = part_type; - pBy->keys = NULL; - pBy->partNum = 0; - pBy->subPart = NULL; - pBy->partSpec = (Node *)spec; + pBy->keys = NULL; + pBy->partNum = 0; + pBy->subPart = NULL; + pBy->partSpec = (Node *) spec; pBy->partDepth = pNode->part->parlevel; /* Note: pBy->partQuiet already set by caller */ pBy->parentRel = - makeRangeVar(get_namespace_name(RelationGetNamespace(rel)), - pstrdup(RelationGetRelationName(rel)), -1); - pBy->location = -1; + makeRangeVar(get_namespace_name(RelationGetNamespace(rel)), + pstrdup(RelationGetRelationName(rel)), -1); + pBy->location = -1; pBy->partDefault = NULL; - pBy->bKeepMe = true; /* nefarious: we need to keep the "top" - * partition by statement because - * analyze.c:do_parse_analyze needs to find - * it to re-order the ALTER statements - */ + pBy->bKeepMe = true; /* nefarious: we need to keep the "top" + * partition by statement because + * analyze.c:do_parse_analyze needs to find it + * to re-order the ALTER statements */ /* fixup the pnode_tmpl to get the right parlevel */ if (pNode && (pNode->rules || pNode->default_part)) { pNode_tmpl = get_parts(pNode->part->parrelid, pNode->part->parlevel + 1, - InvalidOid, /* no parent for template */ + InvalidOid, /* no parent for template */ true, - true /*includesubparts*/ - ); + true /* includesubparts */ + ); } - { /* find the partitioning keys (recursively) */ + { /* find the partitioning keys (recursively) */ - PartitionBy *pBy2 = pBy; - PartitionBy *parent_pBy2 = NULL; - PartitionNode *pNode2 = pNode; + PartitionBy *pBy2 = pBy; + PartitionBy *parent_pBy2 = NULL; + PartitionNode *pNode2 = pNode; - int ii; - TupleDesc tupleDesc = RelationGetDescr(rel); - List *pbykeys = NIL; - List *pbyopclass = NIL; - Oid accessMethodId = BTREE_AM_OID; + int ii; + TupleDesc tupleDesc = RelationGetDescr(rel); + List *pbykeys = NIL; + List 
*pbyopclass = NIL; + Oid accessMethodId = BTREE_AM_OID; while (pNode2) { - pbykeys = NIL; + pbykeys = NIL; pbyopclass = NIL; for (ii = 0; ii < pNode2->part->parnatts; ii++) { - AttrNumber attno = + AttrNumber attno = pNode2->part->paratts[ii]; - Form_pg_attribute attribute = + Form_pg_attribute attribute = tupleDesc->attrs[attno - 1]; - char *attributeName = + char *attributeName = NameStr(attribute->attname); - Oid opclass = + Oid opclass = InvalidOid; opclass = - GetDefaultOpClass(attribute->atttypid, accessMethodId); + GetDefaultOpClass(attribute->atttypid, accessMethodId); if (pbykeys) { @@ -5199,20 +5309,20 @@ atpxPart_validate_spec( pbykeys = list_make1(makeString(attributeName)); pbyopclass = list_make1_oid(opclass); } - } /* end for */ + } /* end for */ pBy2->keys = pbykeys; pBy2->keyopclass = pbyopclass; if (parent_pBy2) - parent_pBy2->subPart = (Node *)pBy2; + parent_pBy2->subPart = (Node *) pBy2; parent_pBy2 = pBy2; if (pNode2 && (pNode2->rules || pNode2->default_part)) { PartitionRule *prule; - PartitionElem *el = NULL; /* for the subpartn template */ + PartitionElem *el = NULL; /* for the subpartn template */ if (pNode2->default_part) prule = pNode2->default_part; @@ -5230,22 +5340,22 @@ atpxPart_validate_spec( ('r' == pNode2->part->parkind)); pBy2 = makeNode(PartitionBy); - pBy2->partType = - ('r' == pNode2->part->parkind) ? - PARTTYP_RANGE : - PARTTYP_LIST; - pBy2->keys = NULL; - pBy2->partNum = 0; - pBy2->subPart = NULL; - pBy2->partSpec = NULL; + pBy2->partType = + ('r' == pNode2->part->parkind) ? + PARTTYP_RANGE : + PARTTYP_LIST; + pBy2->keys = NULL; + pBy2->partNum = 0; + pBy2->subPart = NULL; + pBy2->partSpec = NULL; pBy2->partDepth = pNode2->part->parlevel; pBy2->partQuiet = pBy->partQuiet; pBy2->parentRel = - makeRangeVar( - get_namespace_name( - RelationGetNamespace(rel)), - pstrdup(RelationGetRelationName(rel)), -1); - pBy2->location = -1; + makeRangeVar( + get_namespace_name( + RelationGetNamespace(rel)), + pstrdup(RelationGetRelationName(rel)), -1); + pBy2->location = -1; pBy2->partDefault = NULL; el = NULL; @@ -5254,7 +5364,7 @@ atpxPart_validate_spec( if (pNode_tmpl) { PartitionSpec *spec_tmpl = makeNode(PartitionSpec); - ListCell *lc; + ListCell *lc; spec_tmpl->istemplate = true; @@ -5274,22 +5384,22 @@ atpxPart_validate_spec( if (rule_tmpl->parreloptions || rule_tmpl->partemplatespaceId) { - Node *tspaceName = NULL; + Node *tspaceName = NULL; AlterPartitionCmd *apc = makeNode(AlterPartitionCmd); - el->storeAttr = (Node *)apc; + el->storeAttr = (Node *) apc; if (rule_tmpl->partemplatespaceId) tspaceName = - (Node*)makeString( - get_tablespace_name( - rule_tmpl->partemplatespaceId - )); + (Node *) makeString( + get_tablespace_name( + rule_tmpl->partemplatespaceId + )); apc->partid = NULL; - apc->arg2 = tspaceName; - apc->arg1 = (Node *)rule_tmpl->parreloptions; + apc->arg2 = tspaceName; + apc->arg1 = (Node *) rule_tmpl->parreloptions; } @@ -5299,7 +5409,7 @@ atpxPart_validate_spec( PartitionValuesSpec *vspec = makeNode(PartitionValuesSpec); - el->boundSpec = (Node*)vspec; + el->boundSpec = (Node *) vspec; vspec->partValues = rule_tmpl->parlistvalues; } @@ -5315,54 +5425,54 @@ atpxPart_validate_spec( if (rule_tmpl->parrangestart) { ri = - makeNode(PartitionRangeItem); + makeNode(PartitionRangeItem); ri->partedge = - rule_tmpl->parrangestartincl ? - PART_EDGE_INCLUSIVE : - PART_EDGE_EXCLUSIVE ; + rule_tmpl->parrangestartincl ? 
+ PART_EDGE_INCLUSIVE : + PART_EDGE_EXCLUSIVE; ri->partRangeVal = - (List *)rule_tmpl->parrangestart; + (List *) rule_tmpl->parrangestart; - bspec->partStart = (Node*)ri; + bspec->partStart = (Node *) ri; } if (rule_tmpl->parrangeend) { ri = - makeNode(PartitionRangeItem); + makeNode(PartitionRangeItem); ri->partedge = - rule_tmpl->parrangeendincl ? - PART_EDGE_INCLUSIVE : - PART_EDGE_EXCLUSIVE ; + rule_tmpl->parrangeendincl ? + PART_EDGE_INCLUSIVE : + PART_EDGE_EXCLUSIVE; ri->partRangeVal = - (List *)rule_tmpl->parrangeend; + (List *) rule_tmpl->parrangeend; - bspec->partEnd = (Node*)ri; + bspec->partEnd = (Node *) ri; } if (rule_tmpl->parrangeevery) { ri = - makeNode(PartitionRangeItem); + makeNode(PartitionRangeItem); ri->partRangeVal = - (List *)rule_tmpl->parrangeevery; + (List *) rule_tmpl->parrangeevery; - bspec->partEvery = (Node*)ri; + bspec->partEvery = (Node *) ri; } - el->boundSpec = (Node*)bspec; + el->boundSpec = (Node *) bspec; - } /* end if RANGE */ + } /* end if RANGE */ spec_tmpl->partElem = lappend(spec_tmpl->partElem, el); - } /* end foreach */ + } /* end foreach */ /* MPP-4725 */ /* and the default partition */ @@ -5385,29 +5495,30 @@ atpxPart_validate_spec( /* apply storage encoding for this template */ apply_template_storage_encodings(ct, - RelationGetRelid(rel), - pNode_tmpl->part->partid, - spec_tmpl); - - /* the PartitionElem should hang off the pby - * partspec, and subsequent templates should - * hang off the subspec for the prior - * PartitionElem. + RelationGetRelid(rel), + pNode_tmpl->part->partid, + spec_tmpl); + + /* + * the PartitionElem should hang off the pby partspec, + * and subsequent templates should hang off the + * subspec for the prior PartitionElem. */ - pBy2->partSpec = (Node *)spec_tmpl; + pBy2->partSpec = (Node *) spec_tmpl; - } /* end if pNode_tmpl */ + } /* end if pNode_tmpl */ /* fixup the pnode_tmpl to get the right parlevel */ if (pNode2 && (pNode2->rules || pNode2->default_part)) { pNode_tmpl = get_parts(pNode2->part->parrelid, pNode2->part->parlevel + 1, - InvalidOid, /* no parent for template */ + InvalidOid, /* no parent for + * template */ true, - true /*includesubparts*/ - ); + true /* includesubparts */ + ); } } @@ -5417,7 +5528,7 @@ atpxPart_validate_spec( else pNode2 = NULL; - } /* end while */ + } /* end while */ } pstate = make_parsestate(NULL); @@ -5425,47 +5536,48 @@ atpxPart_validate_spec( free_parsestate(pstate); return result; -} /* end atpxPart_validate_spec */ +} /* end atpxPart_validate_spec */ Node * atpxPartAddList(Relation rel, bool is_split, List *colencs, - PartitionNode *pNode, + PartitionNode *pNode, char *partName, /* pid->partiddef (or NULL) */ bool isDefault, PartitionElem *pelem, PartitionByType part_type, - PgPartRule* par_prule, + PgPartRule *par_prule, char *lrelname, bool bSetTemplate, Oid ownerid) { - DestReceiver *dest = None_Receiver; - int maxpartno = 0; - typedef enum { - FIRST = 0, /* New partition lies before first. */ - MIDDLE, /* New partition lies in the middle. */ - LAST /* New partition lies after last. */ + DestReceiver *dest = None_Receiver; + int maxpartno = 0; + typedef enum + { + FIRST = 0, /* New partition lies before first. */ + MIDDLE, /* New partition lies in the middle. */ + LAST /* New partition lies after last. 
*/ } NewPosition; NewPosition newPos = MIDDLE; - bool bOpenGap = false; - PartitionBy *pBy; - CreateStmtContext cxt; - Node *pSubSpec = NULL; /* return the subpartition spec */ - Relation par_rel = rel; - PartitionNode pNodebuf; - PartitionNode *pNode2 = &pNodebuf; + bool bOpenGap = false; + PartitionBy *pBy; + CreateStmtContext cxt; + Node *pSubSpec = NULL; /* return the subpartition spec */ + Relation par_rel = rel; + PartitionNode pNodebuf; + PartitionNode *pNode2 = &pNodebuf; CreateStmt *ct; /* get the relation for the parent of the new partition */ if (par_prule && par_prule->topRule) par_rel = - heap_open(par_prule->topRule->parchildrelid, AccessShareLock); + heap_open(par_prule->topRule->parchildrelid, AccessShareLock); MemSet(&cxt, 0, sizeof(cxt)); - Assert( (PARTTYP_LIST == part_type) || (PARTTYP_RANGE == part_type) ); + Assert((PARTTYP_LIST == part_type) || (PARTTYP_RANGE == part_type)); /* XXX XXX: handle case of missing boundary spec for range with EVERY */ @@ -5473,21 +5585,21 @@ atpxPartAddList(Relation rel, { if (PARTTYP_RANGE == part_type) { - PartitionBoundSpec *pbs = NULL; - PgPartRule *prule = NULL; - AlterPartitionId pid; - ParseState *pstate = make_parsestate(NULL); - TupleDesc tupledesc = RelationGetDescr(rel); + PartitionBoundSpec *pbs = NULL; + PgPartRule *prule = NULL; + AlterPartitionId pid; + ParseState *pstate = make_parsestate(NULL); + TupleDesc tupledesc = RelationGetDescr(rel); MemSet(&pid, 0, sizeof(AlterPartitionId)); pid.idtype = AT_AP_IDRank; - pid.location = -1; + pid.location = -1; - Assert (IsA(pelem->boundSpec, PartitionBoundSpec)); + Assert(IsA(pelem->boundSpec, PartitionBoundSpec)); - pbs = (PartitionBoundSpec *)pelem->boundSpec; - pSubSpec = pelem->subSpec; /* look for subpartition spec */ + pbs = (PartitionBoundSpec *) pelem->boundSpec; + pSubSpec = pelem->subSpec; /* look for subpartition spec */ /* no EVERY */ if (pbs->partEvery) @@ -5497,7 +5609,7 @@ atpxPartAddList(Relation rel, "RANGE partition to %s", lrelname))); - if (!(pbs->partStart || pbs->partEnd )) + if (!(pbs->partStart || pbs->partEnd)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("Need START or END when adding " @@ -5507,11 +5619,11 @@ atpxPartAddList(Relation rel, /* if no START, then START after last partition */ if (!(pbs->partStart)) { - Datum *d_end = NULL; - bool *isnull; - bool bstat; + Datum *d_end = NULL; + bool *isnull; + bool bstat; - pid.partiddef = (Node *)makeInteger(-1); + pid.partiddef = (Node *) makeInteger(-1); prule = get_part_rule1(rel, &pid, false, false, NULL, @@ -5519,9 +5631,9 @@ atpxPartAddList(Relation rel, lrelname, &pNode2); - /* ok if no prior -- just means this is first - * partition (XXX XXX though should always have 1 - * partition in the table...) + /* + * ok if no prior -- just means this is first partition (XXX + * XXX though should always have 1 partition in the table...) */ if (!(prule && prule->topRule)) @@ -5532,13 +5644,14 @@ atpxPartAddList(Relation rel, } { - Node *n1; + Node *n1; - if ( !IsA(pbs->partEnd, PartitionRangeItem) ) + if (!IsA(pbs->partEnd, PartitionRangeItem)) { - /* pbs->partEnd isn't a PartitionRangeItem! This probably means - * an invalid split of a default part, but we aren't really sure. - * See MPP-14613. + /* + * pbs->partEnd isn't a PartitionRangeItem! This + * probably means an invalid split of a default part, + * but we aren't really sure. See MPP-14613. 
*/ ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), @@ -5546,31 +5659,32 @@ atpxPartAddList(Relation rel, } PartitionRangeItem *ri = - (PartitionRangeItem *)pbs->partEnd; + (PartitionRangeItem *) pbs->partEnd; + PartitionRangeItemIsValid(NULL, ri); - n1 = (Node *)copyObject(ri->partRangeVal); - n1 = (Node *)transformExpressionList(pstate, - (List *)n1); + n1 = (Node *) copyObject(ri->partRangeVal); + n1 = (Node *) transformExpressionList(pstate, + (List *) n1); d_end = - magic_expr_to_datum(rel, pNode, - n1, &isnull); + magic_expr_to_datum(rel, pNode, + n1, &isnull); } if (prule && prule->topRule && prule->topRule->parrangeend - && list_length((List *)prule->topRule->parrangeend)) + && list_length((List *) prule->topRule->parrangeend)) { bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)prule->topRule->parrangeend, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) prule->topRule->parrangeend, + d_end, isnull, tupledesc); - /* if the current end is less than the new end - * then use it as the start of the new - * partition + /* + * if the current end is less than the new end then use it + * as the start of the new partition */ if (bstat) @@ -5580,27 +5694,28 @@ atpxPartAddList(Relation rel, ri->location = -1; ri->partRangeVal = - copyObject(prule->topRule->parrangeend); + copyObject(prule->topRule->parrangeend); /* invert the inclusive/exclusive */ ri->partedge = prule->topRule->parrangeendincl ? - PART_EDGE_EXCLUSIVE : - PART_EDGE_INCLUSIVE; + PART_EDGE_EXCLUSIVE : + PART_EDGE_INCLUSIVE; /* should be final partition */ maxpartno = prule->topRule->parruleord + 1; newPos = LAST; - pbs->partStart = (Node *)ri; + pbs->partStart = (Node *) ri; goto L_fin_no_start; } } - /* if the last partition doesn't have an end, or the - * end isn't less than the new end, check if new end - * is less than current start + /* + * if the last partition doesn't have an end, or the end isn't + * less than the new end, check if new end is less than + * current start */ - pid.partiddef = (Node *)makeInteger(1); + pid.partiddef = (Node *) makeInteger(1); prule = get_part_rule1(rel, &pid, false, false, NULL, @@ -5609,18 +5724,18 @@ atpxPartAddList(Relation rel, &pNode2); if (!(prule && prule->topRule && prule->topRule->parrangestart - && list_length((List *)prule->topRule->parrangestart))) + && list_length((List *) prule->topRule->parrangestart))) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("new partition overlaps existing " "partition"))); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)prule->topRule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) prule->topRule->parrangestart, + d_end, isnull, tupledesc); if (!bstat) @@ -5631,15 +5746,15 @@ atpxPartAddList(Relation rel, * * We can proceed if we have the following: * - * END (R) EXCLUSIVE ; START (R) INCLUSIVE - * END (R) INCLUSIVE ; START (R) EXCLUSIVE + * END (R) EXCLUSIVE ; START (R) INCLUSIVE END (R) + * INCLUSIVE ; START (R) EXCLUSIVE * * XXX: this should be refactored into a single generic * function that can be used here and in the unbounded end - * case, checked further down. That said, a lot of this code - * should be refactored. + * case, checked further down. That said, a lot of this + * code should be refactored. 
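The abutment rule referenced here and in the earlier no-START/no-END cases is symmetric, so it can be stated once. A sketch (function names are illustrative only):

#include <stdbool.h>

/*
 * Two range partitions may share a boundary value only when the two
 * edges have opposite inclusivity: END (x) EXCLUSIVE may abut
 * START (x) INCLUSIVE and vice versa.  The checks above reject equal
 * boundary values with matching inclusivity as an overlap.
 */
static bool
shared_boundary_ok(bool end_inclusive, bool start_inclusive)
{
	return end_inclusive != start_inclusive;
}

/*
 * This is also why a bound derived from a neighbor's edge (e.g. the new
 * partition's START taken from the previous partition's END) inverts the
 * edge kind.
 */
static bool
derived_edge_inclusive(bool neighbor_edge_inclusive)
{
	return !neighbor_edge_inclusive;
}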
*/ - PartitionRangeItem *ri = (PartitionRangeItem *)pbs->partEnd; + PartitionRangeItem *ri = (PartitionRangeItem *) pbs->partEnd; if ((ri->partedge == PART_EDGE_EXCLUSIVE && prule->topRule->parrangestartincl) || @@ -5647,7 +5762,7 @@ atpxPartAddList(Relation rel, !prule->topRule->parrangestartincl)) { bstat = compare_partn_opfuncid(pNode, "pg_catalog", "=", - (List *)prule->topRule->parrangestart, + (List *) prule->topRule->parrangestart, d_end, isnull, tupledesc); } @@ -5670,20 +5785,20 @@ atpxPartAddList(Relation rel, errmsg("new partition overlaps existing " "partition"))); - L_fin_no_start: - bstat = false; /* fix warning */ + L_fin_no_start: + bstat = false; /* fix warning */ } else if (!(pbs->partEnd)) - { /* if no END, then END before first partition - **ONLY IF** - * START of this partition is before first partition ... */ + { /* if no END, then END before first partition + * *ONLY IF** START of this partition is + * before first partition ... */ - Datum *d_start = NULL; - bool *isnull; - bool bstat; + Datum *d_start = NULL; + bool *isnull; + bool bstat; - pid.partiddef = (Node *)makeInteger(1); + pid.partiddef = (Node *) makeInteger(1); prule = get_part_rule1(rel, &pid, false, false, NULL, @@ -5693,10 +5808,11 @@ atpxPartAddList(Relation rel, /* NOTE: invert all the logic of case of missing partStart */ - /* ok if no successor [?] -- just means this is first - * partition (XXX XXX though should always have 1 - * partition in the table... [XXX XXX unless did a - * SPLIT of a single partition !! ]) + /* + * ok if no successor [?] -- just means this is first + * partition (XXX XXX though should always have 1 partition in + * the table... [XXX XXX unless did a SPLIT of a single + * partition !! ]) */ if (!(prule && prule->topRule)) @@ -5707,34 +5823,34 @@ atpxPartAddList(Relation rel, } { - Node *n1; + Node *n1; PartitionRangeItem *ri = - (PartitionRangeItem *)pbs->partStart; + (PartitionRangeItem *) pbs->partStart; PartitionRangeItemIsValid(NULL, ri); - n1 = (Node *)copyObject(ri->partRangeVal); - n1 = (Node *)transformExpressionList(pstate, - (List *)n1); + n1 = (Node *) copyObject(ri->partRangeVal); + n1 = (Node *) transformExpressionList(pstate, + (List *) n1); d_start = - magic_expr_to_datum(rel, pNode, - n1, &isnull); + magic_expr_to_datum(rel, pNode, + n1, &isnull); } if (prule && prule->topRule && prule->topRule->parrangestart - && list_length((List *)prule->topRule->parrangestart)) + && list_length((List *) prule->topRule->parrangestart)) { bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)prule->topRule->parrangestart, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) prule->topRule->parrangestart, + d_start, isnull, tupledesc); - /* if the current start is greater than the new start - * then use the current start as the end of the new - * partition + /* + * if the current start is greater than the new start then + * use the current start as the end of the new partition */ if (bstat) @@ -5744,12 +5860,12 @@ atpxPartAddList(Relation rel, ri->location = -1; ri->partRangeVal = - copyObject(prule->topRule->parrangestart); + copyObject(prule->topRule->parrangestart); /* invert the inclusive/exclusive */ ri->partedge = prule->topRule->parrangestartincl ? 
- PART_EDGE_EXCLUSIVE : - PART_EDGE_INCLUSIVE; + PART_EDGE_EXCLUSIVE : + PART_EDGE_INCLUSIVE; /* should be first partition */ maxpartno = prule->topRule->parruleord - 1; @@ -5759,17 +5875,18 @@ atpxPartAddList(Relation rel, bOpenGap = true; } newPos = FIRST; - pbs->partEnd = (Node *)ri; + pbs->partEnd = (Node *) ri; goto L_fin_no_end; } } - /* if the first partition doesn't have an start, or the - * start isn't greater than the new start, check if new start - * is greater than current end + /* + * if the first partition doesn't have an start, or the start + * isn't greater than the new start, check if new start is + * greater than current end */ - pid.partiddef = (Node *)makeInteger(-1); + pid.partiddef = (Node *) makeInteger(-1); prule = get_part_rule1(rel, &pid, false, false, NULL, @@ -5778,18 +5895,18 @@ atpxPartAddList(Relation rel, &pNode2); if (!(prule && prule->topRule && prule->topRule->parrangeend - && list_length((List *)prule->topRule->parrangeend))) + && list_length((List *) prule->topRule->parrangeend))) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("new partition overlaps existing " "partition"))); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)prule->topRule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) prule->topRule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { /* should be final partition */ @@ -5799,21 +5916,21 @@ atpxPartAddList(Relation rel, else { PartitionRangeItem *ri = - (PartitionRangeItem *)pbs->partStart; + (PartitionRangeItem *) pbs->partStart; /* check for equality */ bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)prule->topRule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) prule->topRule->parrangeend, + d_start, isnull, tupledesc); - /* if new start not >= to current end, then - * new start < current end, so it overlaps. Or if - * new start == current end, but the - * inclusivity is not opposite for the boundaries - * (eg inclusive end abuts inclusive start for + /* + * if new start not >= to current end, then new start < + * current end, so it overlaps. 
Or if new start == current + * end, but the inclusivity is not opposite for the + * boundaries (eg inclusive end abuts inclusive start for * same start/end value) then it overlaps */ if (!bstat || @@ -5830,32 +5947,32 @@ atpxPartAddList(Relation rel, maxpartno = prule->topRule->parruleord + 1; } - L_fin_no_end: - bstat = false; /* fix warning */ + L_fin_no_end: + bstat = false; /* fix warning */ } else { /* both start and end are specified */ - PartitionRangeItem *ri; - bool bOverlap = false; - bool *isnull; - int startSearchpoint; - int endSearchpoint; - Datum *d_start = NULL; - Datum *d_end = NULL; + PartitionRangeItem *ri; + bool bOverlap = false; + bool *isnull; + int startSearchpoint; + int endSearchpoint; + Datum *d_start = NULL; + Datum *d_end = NULL; /* see if start or end overlaps */ pid.idtype = AT_AP_IDValue; /* check the start */ - ri = (PartitionRangeItem *)pbs->partStart; + ri = (PartitionRangeItem *) pbs->partStart; PartitionRangeItemIsValid(NULL, ri); - pid.partiddef = (Node *)copyObject(ri->partRangeVal); + pid.partiddef = (Node *) copyObject(ri->partRangeVal); pid.partiddef = - (Node *)transformExpressionList(pstate, - (List *)pid.partiddef); + (Node *) transformExpressionList(pstate, + (List *) pid.partiddef); prule = get_part_rule1(rel, &pid, false, false, &startSearchpoint, @@ -5866,13 +5983,15 @@ atpxPartAddList(Relation rel, /* found match for start value in rules */ if (prule && !(prule->topRule->parisdefault && is_split)) { - bool bstat; + bool bstat; PartitionRule *a_rule = prule->topRule; + d_start = - magic_expr_to_datum(rel, pNode, - pid.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid.partiddef, &isnull); - /* if start value was inclusive then it definitely + /* + * if start value was inclusive then it definitely * overlaps */ if (ri->partedge == PART_EDGE_INCLUSIVE) @@ -5881,12 +6000,12 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - /* not inclusive -- check harder if START really - * overlaps + /* + * not inclusive -- check harder if START really overlaps */ if (0 == - list_length((List *)a_rule->parrangeend)) + list_length((List *) a_rule->parrangeend)) { /* infinite end > new start - overlap */ bOverlap = true; @@ -5894,11 +6013,11 @@ atpxPartAddList(Relation rel, } bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { /* end > new start - overlap */ @@ -5906,67 +6025,70 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - /* Must be the case that new start == end of - * a_rule (because if the end < new start then how - * could we find it in the interval for prule ?) - * This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE -> New partition does not + /* + * Must be the case that new start == end of a_rule + * (because if the end < new start then how could we find + * it in the interval for prule ?) This is ok if they have + * opposite INCLUSIVE/EXCLUSIVE -> New partition does not * overlap. 
*/ - Assert (compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc)); + Assert(compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc)); if (a_rule->parrangeendincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap + /* + * start and end must be of opposite types, else they + * overlap */ bOverlap = true; goto L_end_overlap; } - /* opposite inclusive/exclusive, so in middle of - * range of existing partitions + /* + * opposite inclusive/exclusive, so in middle of range of + * existing partitions */ newPos = MIDDLE; goto L_check_end; - } /* end if prule */ + } /* end if prule */ /* check for basic case of START > last partition */ if (pNode && pNode->rules && list_length(pNode->rules)) { - bool bstat; + bool bstat; PartitionRule *a_rule = /* get last rule */ - (PartitionRule *)list_nth(pNode->rules, - list_length(pNode->rules) - 1); + (PartitionRule *) list_nth(pNode->rules, + list_length(pNode->rules) - 1); + d_start = - magic_expr_to_datum(rel, pNode, - pid.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid.partiddef, &isnull); if (0 == - list_length((List *)a_rule->parrangeend)) + list_length((List *) a_rule->parrangeend)) { /* infinite end > new start */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); - /* if the new partition start > end of the last - * partition then it is the new final partition. - * Don't bother checking the new end for overlap - * (just check if end > start in validation - * phase + /* + * if the new partition start > end of the last partition + * then it is the new final partition. Don't bother + * checking the new end for overlap (just check if end > + * start in validation phase */ if (bstat) { @@ -5977,32 +6099,34 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - /* could be the case that new start == end of - * last. This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE. New partition is still - * final partition for this case + /* + * could be the case that new start == end of last. This + * is ok if they have opposite INCLUSIVE/EXCLUSIVE. New + * partition is still final partition for this case */ if (0 == - list_length((List *)a_rule->parrangeend)) + list_length((List *) a_rule->parrangeend)) { /* infinite end > new start */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { if (a_rule->parrangeendincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap */ + /* + * start and end must be of opposite types, else + * they overlap + */ bOverlap = true; goto L_end_overlap; } @@ -6015,37 +6139,37 @@ atpxPartAddList(Relation rel, } else { - /* tricky case: the new start is less than the - * end of the final partition, but it does not - * intersect any existing partitions. So we - * are trying to add a partition in the middle - * of the existing partitions or before the - * first partition. 
+ /* + * tricky case: the new start is less than the end of + * the final partition, but it does not intersect any + * existing partitions. So we are trying to add a + * partition in the middle of the existing partitions + * or before the first partition. */ - a_rule = /* get first rule */ - (PartitionRule *)list_nth(pNode->rules, 0); + a_rule = /* get first rule */ + (PartitionRule *) list_nth(pNode->rules, 0); if (0 == - list_length((List *)a_rule->parrangestart)) + list_length((List *) a_rule->parrangestart)) { /* new start > negative infinite start */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)a_rule->parrangestart, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) a_rule->parrangestart, + d_start, isnull, tupledesc); - /* if the new partition start < start of the - * first partition then it is the new first - * partition. Check the new end for overlap. + /* + * if the new partition start < start of the first + * partition then it is the new first partition. + * Check the new end for overlap. * - * NOTE: ignore the case where - * new start == 1st start and - * inclusive vs exclusive because that is just + * NOTE: ignore the case where new start == 1st start + * and inclusive vs exclusive because that is just * stupid. * */ @@ -6078,18 +6202,21 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - L_check_end: + L_check_end: /* check the end */ - /* check for basic case of END < first partition (the - opposite of START > last partition) */ - ri = (PartitionRangeItem *)pbs->partEnd; + /* + * check for basic case of END < first partition (the opposite + * of START > last partition) + */ + + ri = (PartitionRangeItem *) pbs->partEnd; PartitionRangeItemIsValid(NULL, ri); - pid.partiddef = (Node *)copyObject(ri->partRangeVal); + pid.partiddef = (Node *) copyObject(ri->partRangeVal); pid.partiddef = - (Node *)transformExpressionList(pstate, - (List *)pid.partiddef); + (Node *) transformExpressionList(pstate, + (List *) pid.partiddef); prule = get_part_rule1(rel, &pid, false, false, &endSearchpoint, @@ -6101,14 +6228,15 @@ atpxPartAddList(Relation rel, if (prule && !(prule->topRule->parisdefault && is_split)) { - bool bstat; + bool bstat; PartitionRule *a_rule = prule->topRule; + d_end = - magic_expr_to_datum(rel, pNode, - pid.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid.partiddef, &isnull); - /* if end value was inclusive then it definitely - * overlaps + /* + * if end value was inclusive then it definitely overlaps */ if (ri->partedge == PART_EDGE_INCLUSIVE) { @@ -6116,11 +6244,11 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - /* not inclusive -- check harder if END really - * overlaps + /* + * not inclusive -- check harder if END really overlaps */ if (0 == - list_length((List *)a_rule->parrangestart)) + list_length((List *) a_rule->parrangestart)) { /* -infinite start < new end - overlap */ bOverlap = true; @@ -6128,11 +6256,11 @@ atpxPartAddList(Relation rel, } bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); if (bstat) { /* start < new end - overlap */ @@ -6140,57 +6268,60 @@ atpxPartAddList(Relation rel, goto L_end_overlap; } - /* Must be the case that new end = start of - * a_rule (because if the start > new end then how - * could we find it in the interval for prule ?) 
- * This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE -> New partition does not + /* + * Must be the case that new end = start of a_rule + * (because if the start > new end then how could we find + * it in the interval for prule ?) This is ok if they have + * opposite INCLUSIVE/EXCLUSIVE -> New partition does not * overlap. */ - Assert (compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc)); + Assert(compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc)); if (a_rule->parrangestartincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap + /* + * start and end must be of opposite types, else they + * overlap */ bOverlap = true; goto L_end_overlap; } - } /* end if prule */ + } /* end if prule */ /* check for case of END < first partition */ if (pNode && pNode->rules && list_length(pNode->rules)) { - bool bstat; + bool bstat; PartitionRule *a_rule = /* get first rule */ - (PartitionRule *)list_nth(pNode->rules, 0); + (PartitionRule *) list_nth(pNode->rules, 0); + d_end = - magic_expr_to_datum(rel, pNode, - pid.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid.partiddef, &isnull); if (0 == - list_length((List *)a_rule->parrangestart)) + list_length((List *) a_rule->parrangestart)) { /* new end > negative infinite start */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); - /* if the new partition end < start of the first - * partition then it is the new first partition. + /* + * if the new partition end < start of the first partition + * then it is the new first partition. */ if (bstat) { @@ -6198,9 +6329,10 @@ atpxPartAddList(Relation rel, switch (newPos) { case FIRST: - /* since new start < first start and - * new end < first start should be - * first. + + /* + * since new start < first start and new end < + * first start should be first. */ /* should be first partition */ @@ -6215,41 +6347,44 @@ atpxPartAddList(Relation rel, case MIDDLE: case LAST: default: - /* new end is less than first - * partition start but new start isn't - * -- must be end < start + + /* + * new end is less than first partition start + * but new start isn't -- must be end < start */ break; } goto L_end_overlap; } - /* could be the case that new end == start of - * first. This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE. New partition is still - * first partition for this case + /* + * could be the case that new end == start of first. This + * is ok if they have opposite INCLUSIVE/EXCLUSIVE. 
New + * partition is still first partition for this case */ if (0 == - list_length((List *)a_rule->parrangestart)) + list_length((List *) a_rule->parrangestart)) { /* new end > negative infinite start */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); if (bstat) { if (a_rule->parrangestartincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap */ + /* + * start and end must be of opposite types, else + * they overlap + */ bOverlap = true; goto L_end_overlap; } @@ -6257,9 +6392,10 @@ atpxPartAddList(Relation rel, switch (newPos) { case FIRST: - /* since new start < first start and - * new end < first start should be - * first. + + /* + * since new start < first start and new end < + * first start should be first. */ /* should be first partition */ @@ -6274,9 +6410,10 @@ atpxPartAddList(Relation rel, case MIDDLE: case LAST: default: - /* new end is less than first - * partition start but new start isn't - * -- must be end < start + + /* + * new end is less than first partition start + * but new start isn't -- must be end < start */ break; } @@ -6284,18 +6421,18 @@ atpxPartAddList(Relation rel, } else { - /* tricky case: the new end is greater than the - * start of the first partition, but it does not - * intersect any existing partitions. So we - * are trying to add a partition in the middle - * of the existing partitions or after the - * last partition. + /* + * tricky case: the new end is greater than the start + * of the first partition, but it does not intersect + * any existing partitions. So we are trying to add a + * partition in the middle of the existing partitions + * or after the last partition. */ - a_rule = /* get last rule */ - (PartitionRule *)list_nth(pNode->rules, - list_length(pNode->rules) - 1); + a_rule = /* get last rule */ + (PartitionRule *) list_nth(pNode->rules, + list_length(pNode->rules) - 1); if (0 == - list_length((List *)a_rule->parrangeend)) + list_length((List *) a_rule->parrangeend)) { /* new end < infinite end */ bstat = false; @@ -6303,20 +6440,18 @@ atpxPartAddList(Relation rel, else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)a_rule->parrangeend, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) a_rule->parrangeend, + d_end, isnull, tupledesc); - /* if the new partition end > end of the - * last partition then it is the new last - * partition (maybe) + /* + * if the new partition end > end of the last + * partition then it is the new last partition (maybe) * - * NOTE: ignore the case where - * new end == last end and - * inclusive vs exclusive because that is just - * stupid. + * NOTE: ignore the case where new end == last end and + * inclusive vs exclusive because that is just stupid. * */ if (bstat) @@ -6324,9 +6459,10 @@ atpxPartAddList(Relation rel, switch (newPos) { case LAST: - /* since new start > last end and - * new end > last end should be - * last. + + /* + * since new start > last end and new end + * > last end should be last. */ /* should be last partition */ @@ -6334,22 +6470,28 @@ atpxPartAddList(Relation rel, break; case FIRST: - /* since new start < first start - * and new end > last end we would - * overlap all partitions!!! 
+ + /* + * since new start < first start and new + * end > last end we would overlap all + * partitions!!! */ case MIDDLE: - /* since new start < last end - * and new end > last end we would - * overlap last partition + + /* + * since new start < last end and new end + * > last end we would overlap last + * partition */ bOverlap = true; goto L_end_overlap; break; default: - /* new end is less than last - * partition end but new start isn't - * -- must be end < start + + /* + * new end is less than last partition end + * but new start isn't -- must be end < + * start */ break; } @@ -6359,8 +6501,10 @@ atpxPartAddList(Relation rel, switch (newPos) { case FIRST: - /* since new start < first start - * and new end in middle we overlap + + /* + * since new start < first start and new + * end in middle we overlap */ bOverlap = true; goto L_end_overlap; @@ -6371,9 +6515,10 @@ atpxPartAddList(Relation rel, break; case LAST: default: - /* since new start > last end and - * new end in middle - * -- must be end < start + + /* + * since new start > last end and new end + * in middle -- must be end < start */ break; } @@ -6392,50 +6537,47 @@ atpxPartAddList(Relation rel, } - /* if the individual start and end values don't - * intersect an existing partition, make sure they - * don't define a range which contains an existing - * partition, ie new start < existing start and new - * end > existing end + /* + * if the individual start and end values don't intersect an + * existing partition, make sure they don't define a range + * which contains an existing partition, ie new start < + * existing start and new end > existing end */ if (!bOverlap && (newPos == MIDDLE)) { bOpenGap = true; - int prev_partno = 0; + int prev_partno = 0; /* - hmm, not always true. see MPP-3667, MPP-3636, MPP-3593 - - if (startSearchpoint != endSearchpoint) - { - bOverlap = true; - goto L_end_overlap; - } - + * hmm, not always true. see MPP-3667, MPP-3636, MPP-3593 + * + * if (startSearchpoint != endSearchpoint) { bOverlap = + * true; goto L_end_overlap; } + * */ while (1) { - bool bstat; + bool bstat; PartitionRule *a_rule = /* get the rule */ - (PartitionRule *)list_nth(pNode->rules, - startSearchpoint); + (PartitionRule *) list_nth(pNode->rules, + startSearchpoint); /* MPP-3621: fix ADD for open intervals */ if (0 == - list_length((List *)a_rule->parrangeend)) + list_length((List *) a_rule->parrangeend)) { /* new end < infinite end */ bstat = false; } else bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<=", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<=", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { @@ -6446,21 +6588,22 @@ atpxPartAddList(Relation rel, continue; } - /* if previous partition was less than - current, then this one should be larger. - if not, then it overlaps... + /* + * if previous partition was less than current, then + * this one should be larger. if not, then it + * overlaps... */ if ( (0 == - list_length((List *)a_rule->parrangestart)) + list_length((List *) a_rule->parrangestart)) || !compare_partn_opfuncid(pNode, "pg_catalog", ">=", - (List *)a_rule->parrangestart, + (List *) a_rule->parrangestart, d_end, isnull, tupledesc)) { - prule = NULL; /* could get the right prule... */ + prule = NULL; /* could get the right prule... 
*/ bOverlap = true; goto L_end_overlap; } @@ -6476,11 +6619,11 @@ atpxPartAddList(Relation rel, maxpartno = a_rule->parruleord; break; - } /* end while */ + } /* end while */ - } /* end 0 == middle */ + } /* end 0 == middle */ - L_end_overlap: + L_end_overlap: if (bOverlap) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), @@ -6492,35 +6635,35 @@ atpxPartAddList(Relation rel, } free_parsestate(pstate); - } /* end if parttype_range */ - } /* end if pelem && pelem->boundspec */ + } /* end if parttype_range */ + } /* end if pelem && pelem->boundspec */ /* - * Create a phony CREATE TABLE statement for the parent table. - * The parse_analyze call later expands it, and we extract just the constituent - * commands we need to create the new partition, and ignore the commands for - * the already-existing parent table + * Create a phony CREATE TABLE statement for the parent table. The + * parse_analyze call later expands it, and we extract just the + * constituent commands we need to create the new partition, and ignore + * the commands for the already-existing parent table */ ct = makeNode(CreateStmt); ct->relation = makeRangeVar(get_namespace_name(RelationGetNamespace(par_rel)), RelationGetRelationName(par_rel), -1); /* - * in analyze.c, fill in tableelts with a list of inhrelation of - * the partition parent table, and fill in inhrelations with copy - * of rangevar for parent table + * in analyze.c, fill in tableelts with a list of inhrelation of the + * partition parent table, and fill in inhrelations with copy of rangevar + * for parent table */ - InhRelation *inh = makeNode(InhRelation); + InhRelation *inh = makeNode(InhRelation); + inh->relation = copyObject(ct->relation); inh->options = list_make3_int( - CREATE_TABLE_LIKE_INCLUDING_DEFAULTS, - CREATE_TABLE_LIKE_INCLUDING_CONSTRAINTS, - CREATE_TABLE_LIKE_INCLUDING_INDEXES); + CREATE_TABLE_LIKE_INCLUDING_DEFAULTS, + CREATE_TABLE_LIKE_INCLUDING_CONSTRAINTS, + CREATE_TABLE_LIKE_INCLUDING_INDEXES); /* - * fill in remaining fields from parse time (gram.y): - * the new partition is LIKE the parent and it - * inherits from it + * fill in remaining fields from parse time (gram.y): the new partition is + * LIKE the parent and it inherits from it */ ct->tableElts = lappend(ct->tableElts, inh); ct->constraints = NIL; @@ -6537,12 +6680,12 @@ atpxPartAddList(Relation rel, ct->tablespacename = NULL; pBy = makeNode(PartitionBy); - if (pelem->subSpec) /* treat subspec as partition by... */ + if (pelem->subSpec) /* treat subspec as partition by... 
*/ { pBy->partSpec = pelem->subSpec; pBy->partDepth = 0; pBy->partQuiet = PART_VERBO_NODISTRO; - pBy->location = -1; + pBy->location = -1; pBy->partDefault = NULL; pBy->parentRel = copyObject(ct->relation); } @@ -6558,12 +6701,12 @@ atpxPartAddList(Relation rel, } ct->distributedBy = NULL; - ct->partitionBy = (Node *)pBy; + ct->partitionBy = (Node *) pBy; ct->relKind = RELKIND_RELATION; ct->policy = 0; ct->postCreate = NULL; - ct->is_add_part = true; /* subroutines need to know this */ + ct->is_add_part = true; /* subroutines need to know this */ ct->ownerid = ownerid; if (!ct->distributedBy) @@ -6577,27 +6720,27 @@ atpxPartAddList(Relation rel, { if (PARTTYP_LIST == part_type) { - ListCell *lc; + ListCell *lc; PartitionValuesSpec *spec; - AlterPartitionId pid; + AlterPartitionId pid; - Assert (IsA(pelem->boundSpec, PartitionValuesSpec)); + Assert(IsA(pelem->boundSpec, PartitionValuesSpec)); MemSet(&pid, 0, sizeof(AlterPartitionId)); - spec = (PartitionValuesSpec *)pelem->boundSpec; + spec = (PartitionValuesSpec *) pelem->boundSpec; /* only check this if we aren't doing split */ if (1) { foreach(lc, spec->partValues) { - List *vals = lfirst(lc); - PgPartRule *prule = NULL; + List *vals = lfirst(lc); + PgPartRule *prule = NULL; pid.idtype = AT_AP_IDValue; - pid.partiddef = (Node *)vals; - pid.location = -1; + pid.partiddef = (Node *) vals; + pid.location = -1; prule = get_part_rule1(rel, &pid, false, false, NULL, @@ -6613,14 +6756,15 @@ atpxPartAddList(Relation rel, prule->partIdStr, lrelname))); - } /* end foreach */ + } /* end foreach */ } /* give a new maxpartno for the list partition */ if (pNode && pNode->rules && list_length(pNode->rules)) { - ListCell *lc; + ListCell *lc; PartitionRule *rule = NULL; + maxpartno = 1; foreach(lc, pNode->rules) @@ -6642,14 +6786,14 @@ atpxPartAddList(Relation rel, if (newPos == FIRST && pNode && list_length(pNode->rules) > 0) { /* - * Adding new partition at the beginning. Find a hole in - * existing parruleord sequence by scanning rules list. Open - * gap only until the hole to accommodate the new rule at - * parruleord = 1. + * Adding new partition at the beginning. Find a hole in existing + * parruleord sequence by scanning rules list. Open gap only until + * the hole to accommodate the new rule at parruleord = 1. */ - ListCell *lc; + ListCell *lc; PartitionRule *rule = NULL; - int hole = 1; + int hole = 1; + foreach(lc, pNode->rules) { rule = lfirst(lc); @@ -6659,30 +6803,33 @@ atpxPartAddList(Relation rel, } ++hole; } + /* - * Open gap only if hole found in the middle. If hole exists - * right at the beginning (first partition's parruleord > 1), - * the gap is already open for us. + * Open gap only if hole found in the middle. If hole exists right at + * the beginning (first partition's parruleord > 1), the gap is + * already open for us. */ if (hole > 1) { parruleord_open_gap( - pNode->part->partid, pNode->part->parlevel, - rule->parparentoid, --hole, 1, - false /* closegap */); + pNode->part->partid, pNode->part->parlevel, + rule->parparentoid, --hole, 1, + false /* closegap */ ); } } else if (newPos == LAST && pNode && list_length(pNode->rules) > 0) { /* - * Adding the new partition at the end. Find the hole closest - * to the end of the rule list. Close gap from the last rule - * only until this hole. The new partition then gets the last - * partition's parruleord. + * Adding the new partition at the end. Find the hole closest to the + * end of the rule list. Close gap from the last rule only until + * this hole. 
The new partition then gets the last partition's + * parruleord. */ - ListCell *lc; + ListCell *lc; PartitionRule *rule = NULL; - int hole = 1, stopkey = -1; + int hole = 1, + stopkey = -1; + foreach(lc, pNode->rules) { rule = lfirst(lc); @@ -6695,10 +6842,11 @@ atpxPartAddList(Relation rel, if (stopkey != -1) { PartitionRule *last_rule = (PartitionRule *) llast(pNode->rules); + parruleord_open_gap( - pNode->part->partid, pNode->part->parlevel, - last_rule->parparentoid, last_rule->parruleord, stopkey, - true /* closegap */); + pNode->part->partid, pNode->part->parlevel, + last_rule->parparentoid, last_rule->parruleord, stopkey, + true /* closegap */ ); /* Let the new rule reuse last rule's parruleord. */ --maxpartno; } @@ -6706,14 +6854,14 @@ atpxPartAddList(Relation rel, else if (bOpenGap) { /* - * Adding new partition in between first and the last one. - * Check if a hole exists by scanning rule list. If one - * exists, either open or close gap based on location of the - * hole relative to maxpartno. + * Adding new partition in between first and the last one. Check if a + * hole exists by scanning rule list. If one exists, either open or + * close gap based on location of the hole relative to maxpartno. */ - ListCell *lc; + ListCell *lc; PartitionRule *rule = NULL; - int hole = 1; + int hole = 1; + foreach(lc, pNode->rules) { rule = lfirst(lc); @@ -6724,60 +6872,60 @@ atpxPartAddList(Relation rel, if (maxpartno > hole) { /* - * Found a hole before maxpartno. Make room for new - * partition in the slot previous to maxpartno. Decrement - * parruleord values from this slot until the hole. + * Found a hole before maxpartno. Make room for new partition in + * the slot previous to maxpartno. Decrement parruleord values + * from this slot until the hole. */ parruleord_open_gap( - pNode->part->partid, - pNode->part->parlevel, - rule->parparentoid, - --maxpartno, - ++hole, - true /* closegap */); + pNode->part->partid, + pNode->part->parlevel, + rule->parparentoid, + --maxpartno, + ++hole, + true /* closegap */ ); } else if (maxpartno < hole) { /* - * Found a hole after maxpartno. Open gap for maxpartno - * by incrementing parruleord values from the hole until - * maxpartno. + * Found a hole after maxpartno. Open gap for maxpartno by + * incrementing parruleord values from the hole until maxpartno. */ parruleord_open_gap( - pNode->part->partid, - pNode->part->parlevel, - rule->parparentoid, - hole, - maxpartno, - false /* closegap */); + pNode->part->partid, + pNode->part->parlevel, + rule->parparentoid, + hole, + maxpartno, + false /* closegap */ ); } /* if (hole == maxpartno) we don't need to open a gap. */ } { - List *l1; - ListCell *lc; - int ii = 0; - bool bFixFirstATS = true; - bool bFirst_TemplateOnly = true; /* ignore dummy entry */ - int pby_templ_depth = 0; /* template partdepth */ - Oid skipTableRelid = InvalidOid; + List *l1; + ListCell *lc; + int ii = 0; + bool bFixFirstATS = true; + bool bFirst_TemplateOnly = true; /* ignore dummy entry */ + int pby_templ_depth = 0; /* template partdepth */ + Oid skipTableRelid = InvalidOid; /* - * This transformCreateStmt() expands the phony create of a partitioned - * table that we just build into the constituent commands we need to create - * the new part. (This will include some commands for the parent that we - * don't need, since the parent already exists.) + * This transformCreateStmt() expands the phony create of a + * partitioned table that we just build into the constituent commands + * we need to create the new part. 
(This will include some commands + * for the parent that we don't need, since the parent already + * exists.) */ l1 = transformCreateStmt(ct, "ADD PARTITION", true); /* - * Look for the first CreateStmt and generate a GrantStmt - * based on the RangeVar in it. + * Look for the first CreateStmt and generate a GrantStmt based on the + * RangeVar in it. */ foreach(lc, l1) { - Node *s = lfirst(lc); + Node *s = lfirst(lc); /* skip the first one, it's the fake create table for the parent */ if (lc == list_head(l1)) @@ -6785,9 +6933,9 @@ atpxPartAddList(Relation rel, if (IsA(s, CreateStmt)) { - HeapTuple tuple; - Datum aclDatum; - bool isNull; + HeapTuple tuple; + Datum aclDatum; + bool isNull; CreateStmt *t = (CreateStmt *) s; tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(RelationGetRelid(rel))); @@ -6800,10 +6948,11 @@ atpxPartAddList(Relation rel, &isNull); if (!isNull) { - List *cp = NIL; - int i, num; - Acl *acl; - AclItem *aidat; + List *cp = NIL; + int i, + num; + Acl *acl; + AclItem *aidat; acl = DatumGetAclP(aclDatum); @@ -6812,9 +6961,9 @@ atpxPartAddList(Relation rel, for (i = 0; i < num; i++) { - AclItem *aidata = &aidat[i]; - Datum d; - char *str; + AclItem *aidata = &aidat[i]; + Datum d; + char *str; d = DirectFunctionCall1(aclitemout, PointerGetDatum(aidata)); @@ -6826,7 +6975,7 @@ atpxPartAddList(Relation rel, if (list_length(cp)) { - GrantStmt *gs = makeNode(GrantStmt); + GrantStmt *gs = makeNode(GrantStmt); gs->is_grant = true; gs->objtype = ACL_OBJECT_RELATION; @@ -6843,17 +6992,19 @@ atpxPartAddList(Relation rel, } } - /* skip the first cell because the table already exists -- - * don't recreate it + /* + * skip the first cell because the table already exists -- don't + * recreate it */ lc = list_head(l1); if (lc) { - Node *s = lfirst(lc); + Node *s = lfirst(lc); - /* MPP-10421: but save the relid of the skipped table, - * because we skip indexes associated with it... + /* + * MPP-10421: but save the relid of the skipped table, because we + * skip indexes associated with it... */ if (IsA(s, CreateStmt)) { @@ -6866,19 +7017,18 @@ atpxPartAddList(Relation rel, for_each_cell(lc, lnext(lc)) { - Node *q = lfirst(lc); + Node *q = lfirst(lc); /* - * MPP-6379, MPP-10421: If the statement is an expanded - * index creation statement on the parent (or the "skipped - * table"), ignore it. We get into this situation when the - * parent has one or more indexes on it that our new - * partition is inheriting. + * MPP-6379, MPP-10421: If the statement is an expanded index + * creation statement on the parent (or the "skipped table"), + * ignore it. We get into this situation when the parent has one + * or more indexes on it that our new partition is inheriting. */ if (IsA(q, IndexStmt)) { - IndexStmt *istmt = (IndexStmt *)q; - Oid idxRelid = RangeVarGetRelid(istmt->relation, true); + IndexStmt *istmt = (IndexStmt *) q; + Oid idxRelid = RangeVarGetRelid(istmt->relation, true); if (idxRelid == RelationGetRelid(rel)) continue; @@ -6888,15 +7038,16 @@ atpxPartAddList(Relation rel, continue; } - /* XXX XXX: fix the first Alter Table Statement to have - * the correct maxpartno. Whoohoo!! + /* + * XXX XXX: fix the first Alter Table Statement to have the + * correct maxpartno. Whoohoo!! 
*/ if (bFixFirstATS && q && IsA(q, AlterTableStmt)) { - PartitionSpec *spec = NULL; - AlterTableStmt *ats; - AlterTableCmd *atc; - List *cmds; + PartitionSpec *spec = NULL; + AlterTableStmt *ats; + AlterTableCmd *atc; + List *cmds; bFixFirstATS = false; @@ -6907,60 +7058,62 @@ atpxPartAddList(Relation rel, Assert(cmds && (list_length(cmds) > 1)); - atc = (AlterTableCmd *)lsecond(cmds); + atc = (AlterTableCmd *) lsecond(cmds); Assert(atc->def); - pBy = (PartitionBy *)atc->def; + pBy = (PartitionBy *) atc->def; Assert(IsA(pBy, PartitionBy)); - spec = (PartitionSpec *)pBy->partSpec; + spec = (PartitionSpec *) pBy->partSpec; if (spec) { - List *l2 = spec->partElem; - PartitionElem *pel; + List *l2 = spec->partElem; + PartitionElem *pel; if (l2 && list_length(l2)) { - pel = (PartitionElem *)linitial(l2); + pel = (PartitionElem *) linitial(l2); pel->partno = maxpartno; } } - } /* end first alter table fixup */ + } /* end first alter table fixup */ else if (IsA(q, CreateStmt)) { /* propagate owner */ ((CreateStmt *) q)->ownerid = ownerid; } - /* normal case - add partitions using CREATE statements - * that get dispatched to the segments + /* + * normal case - add partitions using CREATE statements that get + * dispatched to the segments */ if (!bSetTemplate) ProcessUtility(q, synthetic_sql, NULL, - false, /* not top level */ + false, /* not top level */ dest, NULL); else - { /* setting subpartition template only */ + { /* setting subpartition template only */ - /* find all the alter table statements that contain - * partaddinternal, and extract the definitions. Only - * build the catalog entries for subpartition - * templates, not "real" table entries. + /* + * find all the alter table statements that contain + * partaddinternal, and extract the definitions. Only build + * the catalog entries for subpartition templates, not "real" + * table entries. */ if (IsA(q, AlterTableStmt)) { AlterTableStmt *at2 = (AlterTableStmt *) q; - List *l2 = at2->cmds; - ListCell *lc2; + List *l2 = at2->cmds; + ListCell *lc2; foreach(lc2, l2) { @@ -6969,27 +7122,27 @@ atpxPartAddList(Relation rel, if (ac2->subtype == AT_PartAddInternal) { PartitionBy *templ_pby = - (PartitionBy *)ac2->def; + (PartitionBy *) ac2->def; Assert(IsA(templ_pby, PartitionBy)); - /* skip the first one because it's the - * fake parent partition definition for - * the subpartition template entries + /* + * skip the first one because it's the fake parent + * partition definition for the subpartition + * template entries */ if (bFirst_TemplateOnly) { bFirst_TemplateOnly = false; - /* MPP-5992: only set one level of - * templates -- we might have - * templates for subpartitions of the - * subpartitions, which would add - * duplicate templates into the table. - * Only add templates of the specified - * depth and skip deeper template - * definitions. + /* + * MPP-5992: only set one level of templates + * -- we might have templates for + * subpartitions of the subpartitions, which + * would add duplicate templates into the + * table. Only add templates of the specified + * depth and skip deeper template definitions. 
*/ pby_templ_depth = templ_pby->partDepth + 1; @@ -6999,17 +7152,18 @@ atpxPartAddList(Relation rel, if (templ_pby->partDepth == pby_templ_depth) add_part_to_catalog( RelationGetRelid(rel), - (PartitionBy *)ac2->def, + (PartitionBy *) ac2->def, true); } } - } /* end foreach lc2 l2 */ + } /* end foreach lc2 l2 */ } - } /* end else setting subpartition templates only */ + } /* end else setting subpartition templates + * only */ ii++; - } /* end for each cell */ + } /* end for each cell */ } @@ -7017,14 +7171,14 @@ atpxPartAddList(Relation rel, heap_close(par_rel, NoLock); return pSubSpec; -} /* end atpxPartAddList */ +} /* end atpxPartAddList */ List * atpxDropList(Relation rel, PartitionNode *pNode) { - List *l1 = NIL; - ListCell *lc; + List *l1 = NIL; + ListCell *lc; if (!pNode) return l1; @@ -7033,7 +7187,7 @@ atpxDropList(Relation rel, PartitionNode *pNode) foreach(lc, pNode->rules) { PartitionRule *rule = lfirst(lc); - List *l2 = NIL; + List *l2 = NIL; if (rule->children) l2 = atpxDropList(rel, rule->children); @@ -7053,7 +7207,7 @@ atpxDropList(Relation rel, PartitionNode *pNode) if (pNode->default_part) { PartitionRule *rule = pNode->default_part; - List *l2 = NIL; + List *l2 = NIL; if (rule->children) l2 = atpxDropList(rel, rule->children); @@ -7073,8 +7227,8 @@ atpxDropList(Relation rel, PartitionNode *pNode) foreach(lc, pNode->rules) { PartitionRule *rule = lfirst(lc); - char *prelname; - char *nspname; + char *prelname; + char *nspname; Relation rel; rel = heap_open(rule->parchildrelid, AccessShareLock); @@ -7094,8 +7248,8 @@ atpxDropList(Relation rel, PartitionNode *pNode) if (pNode->default_part) { PartitionRule *rule = pNode->default_part; - char *prelname; - char *nspname; + char *prelname; + char *nspname; Relation rel; rel = heap_open(rule->parchildrelid, AccessShareLock); @@ -7112,20 +7266,21 @@ atpxDropList(Relation rel, PartitionNode *pNode) } return l1; -} /* end atpxDropList */ +} /* end atpxDropList */ void exchange_part_rule(Oid oldrelid, Oid newrelid) { - HeapTuple tuple; - Relation catalogRelation; - ScanKeyData scankey; + HeapTuple tuple; + Relation catalogRelation; + ScanKeyData scankey; SysScanDesc sscan; - /* pg_partition and pg_partition_rule are populated only on the - * entry database, so a call to this function is only meaningful - * there. */ + /* + * pg_partition and pg_partition_rule are populated only on the entry + * database, so a call to this function is only meaningful there. 
+ */ Insist(Gp_segment == -1); catalogRelation = heap_open(PartitionRuleRelationId, RowExclusiveLock); @@ -7157,16 +7312,16 @@ exchange_part_rule(Oid oldrelid, Oid newrelid) void exchange_permissions(Oid oldrelid, Oid newrelid) { - HeapTuple oldtuple; - HeapTuple newtuple; - Datum save; - bool saveisnull; - Datum values[Natts_pg_class]; - bool nulls[Natts_pg_class]; - bool replaces[Natts_pg_class]; - HeapTuple replace_tuple; - bool isnull; - Relation rel = heap_open(RelationRelationId, RowExclusiveLock); + HeapTuple oldtuple; + HeapTuple newtuple; + Datum save; + bool saveisnull; + Datum values[Natts_pg_class]; + bool nulls[Natts_pg_class]; + bool replaces[Natts_pg_class]; + HeapTuple replace_tuple; + bool isnull; + Relation rel = heap_open(RelationRelationId, RowExclusiveLock); oldtuple = SearchSysCache1(RELOID, ObjectIdGetDatum(oldrelid)); if (!HeapTupleIsValid(oldtuple)) @@ -7226,11 +7381,11 @@ exchange_permissions(Oid oldrelid, Oid newrelid) bool -atpxModifyListOverlap (Relation rel, - AlterPartitionId *pid, - PgPartRule *prule, - PartitionElem *pelem, - bool bAdd) +atpxModifyListOverlap(Relation rel, + AlterPartitionId *pid, + PgPartRule *prule, + PartitionElem *pelem, + bool bAdd) { if (prule->pNode->default_part && bAdd) ereport(ERROR, @@ -7246,15 +7401,15 @@ atpxModifyListOverlap (Relation rel, prule->pNode->default_part->parname))); { - ListCell *lc; - PartitionValuesSpec *pVSpec; - AlterPartitionId pid2; - PartitionNode *pNode = prule->pNode; - CreateStmtContext cxt; + ListCell *lc; + PartitionValuesSpec *pVSpec; + AlterPartitionId pid2; + PartitionNode *pNode = prule->pNode; + CreateStmtContext cxt; MemSet(&cxt, 0, sizeof(cxt)); - Assert (IsA(pelem->boundSpec, PartitionValuesSpec)); + Assert(IsA(pelem->boundSpec, PartitionValuesSpec)); MemSet(&pid2, 0, sizeof(AlterPartitionId)); @@ -7262,25 +7417,25 @@ atpxModifyListOverlap (Relation rel, (void) atpxPart_validate_spec(makeNode(PartitionBy), &cxt, rel, - NULL, /* CreateStmt */ + NULL, /* CreateStmt */ pelem, pNode, (pid->idtype == AT_AP_IDName) ? 
strVal(pid->partiddef) : NULL, - false, /* isDefault */ + false, /* isDefault */ PARTTYP_LIST, /* part_type */ prule->partIdStr); - pVSpec = (PartitionValuesSpec *)pelem->boundSpec; + pVSpec = (PartitionValuesSpec *) pelem->boundSpec; foreach(lc, pVSpec->partValues) { - List *vals = lfirst(lc); - PgPartRule *prule2 = NULL; + List *vals = lfirst(lc); + PgPartRule *prule2 = NULL; pid2.idtype = AT_AP_IDValue; - pid2.partiddef = (Node *)vals; - pid2.location = -1; + pid2.partiddef = (Node *) vals; + pid2.location = -1; prule2 = get_part_rule(rel, &pid2, false, false, NULL, false); @@ -7313,10 +7468,11 @@ atpxModifyListOverlap (Relation rel, RelationGetRelationName(rel)))); } } - else /* DROP values */ + else /* DROP values */ { - /* if DROPping a value, it should only be in the - * specified partition + /* + * if DROPping a value, it should only be in the specified + * partition */ if (!prule2) @@ -7343,41 +7499,41 @@ atpxModifyListOverlap (Relation rel, } } - } /* end foreach */ + } /* end foreach */ } return false; -} /* end atpxModifyListOverlap */ +} /* end atpxModifyListOverlap */ bool -atpxModifyRangeOverlap (Relation rel, - AlterPartitionId *pid, - PgPartRule *prule, - PartitionElem *pelem) +atpxModifyRangeOverlap(Relation rel, + AlterPartitionId *pid, + PgPartRule *prule, + PartitionElem *pelem) { - PgPartRule *prule2 = NULL; - AlterPartitionId pid2; - PartitionNode *pNode = prule->pNode; - bool bCheckStart = true; - PartitionBoundSpec *pbs = NULL; - ParseState *pstate; - bool bOverlap = false; - bool *isnull; - TupleDesc tupledesc = RelationGetDescr(rel); - Datum *d_start = NULL; - Datum *d_end = NULL; - Node *pRangeValList = NULL; - int ii; + PgPartRule *prule2 = NULL; + AlterPartitionId pid2; + PartitionNode *pNode = prule->pNode; + bool bCheckStart = true; + PartitionBoundSpec *pbs = NULL; + ParseState *pstate; + bool bOverlap = false; + bool *isnull; + TupleDesc tupledesc = RelationGetDescr(rel); + Datum *d_start = NULL; + Datum *d_end = NULL; + Node *pRangeValList = NULL; + int ii; - Assert (IsA(pelem->boundSpec, PartitionBoundSpec)); + Assert(IsA(pelem->boundSpec, PartitionBoundSpec)); - pbs = (PartitionBoundSpec *)pelem->boundSpec; + pbs = (PartitionBoundSpec *) pelem->boundSpec; - for (ii = 0; ii < 2 ; ii++) + for (ii = 0; ii < 2; ii++) { - PartitionRangeItem *ri; + PartitionRangeItem *ri; - if (bCheckStart) /* check START first, then END */ + if (bCheckStart) /* check START first, then END */ { if (!(pbs->partStart)) { @@ -7390,19 +7546,19 @@ atpxModifyRangeOverlap (Relation rel, ri->location = -1; ri->partRangeVal = - copyObject(prule->topRule->parrangestart); + copyObject(prule->topRule->parrangestart); ri->partedge = prule->topRule->parrangestartincl ? - PART_EDGE_INCLUSIVE : - PART_EDGE_EXCLUSIVE; + PART_EDGE_INCLUSIVE : + PART_EDGE_EXCLUSIVE; /* no start, so use current start */ - pbs->partStart = (Node *)ri; + pbs->partStart = (Node *) ri; } continue; } - ri = (PartitionRangeItem *)pbs->partStart; + ri = (PartitionRangeItem *) pbs->partStart; } else { @@ -7419,16 +7575,16 @@ atpxModifyRangeOverlap (Relation rel, ri->partRangeVal = copyObject(prule->topRule->parrangeend); ri->partedge = prule->topRule->parrangeendincl ? 
- PART_EDGE_INCLUSIVE : - PART_EDGE_EXCLUSIVE; + PART_EDGE_INCLUSIVE : + PART_EDGE_EXCLUSIVE; /* no end, so use current end */ - pbs->partEnd = (Node *)ri; + pbs->partEnd = (Node *) ri; } break; } - ri = (PartitionRangeItem *)pbs->partEnd; + ri = (PartitionRangeItem *) pbs->partEnd; } MemSet(&pid2, 0, sizeof(AlterPartitionId)); @@ -7437,18 +7593,18 @@ atpxModifyRangeOverlap (Relation rel, pstate = make_parsestate(NULL); pRangeValList = (Node *) copyObject(ri->partRangeVal); pRangeValList = (Node *) - transformExpressionList(pstate, (List *)pRangeValList); + transformExpressionList(pstate, (List *) pRangeValList); free_parsestate(pstate); pid2.partiddef = pRangeValList; - pid2.location = -1; + pid2.location = -1; prule2 = get_part_rule(rel, &pid2, false, false, NULL, false); if (!prule2) { - /* no rules matched -- this is ok as long as no - * default partition + /* + * no rules matched -- this is ok as long as no default partition */ if (prule->pNode->default_part) ereport(ERROR, @@ -7469,18 +7625,18 @@ atpxModifyRangeOverlap (Relation rel, if (bCheckStart) { - bool bstat; + bool bstat; PartitionRule *a_rule; /* check for adjacent partition */ if (1 == prule->topRuleRank) - continue; /* no previous, so changing start is ok */ + continue; /* no previous, so changing start is ok */ MemSet(&pid2, 0, sizeof(AlterPartitionId)); pid2.idtype = AT_AP_IDRank; - pid2.partiddef = (Node *)makeInteger(prule->topRuleRank - 1); - pid2.location = -1; + pid2.partiddef = (Node *) makeInteger(prule->topRuleRank - 1); + pid2.location = -1; prule2 = get_part_rule(rel, &pid2, false, false, NULL, false); @@ -7491,15 +7647,15 @@ atpxModifyRangeOverlap (Relation rel, /* just check against end of adjacent partition */ d_start = - magic_expr_to_datum(rel, pNode, - pRangeValList, &isnull); + magic_expr_to_datum(rel, pNode, + pRangeValList, &isnull); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { /* end > new start - overlap */ @@ -7507,44 +7663,46 @@ atpxModifyRangeOverlap (Relation rel, break; } - /* could be the case that new start == end of - * previous. This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE. + /* + * could be the case that new start == end of previous. This + * is ok if they have opposite INCLUSIVE/EXCLUSIVE. 
*/ bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { if (a_rule->parrangeendincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap */ + /* + * start and end must be of opposite types, else they + * overlap + */ bOverlap = true; break; } } } - else /* check the end */ + else /* check the end */ { - bool bstat; + bool bstat; PartitionRule *a_rule; /* check for adjacent partition */ if (list_length(pNode->rules) == prule->topRuleRank) - continue; /* no next, so changing end is ok */ + continue; /* no next, so changing end is ok */ MemSet(&pid2, 0, sizeof(AlterPartitionId)); pid2.idtype = AT_AP_IDRank; - pid2.partiddef = (Node *)makeInteger(prule->topRuleRank + 1); - pid2.location = -1; + pid2.partiddef = (Node *) makeInteger(prule->topRuleRank + 1); + pid2.location = -1; prule2 = get_part_rule(rel, &pid2, false, false, NULL, false); @@ -7555,15 +7713,15 @@ atpxModifyRangeOverlap (Relation rel, /* just check against start of adjacent partition */ d_end = - magic_expr_to_datum(rel, pNode, - pRangeValList, &isnull); + magic_expr_to_datum(rel, pNode, + pRangeValList, &isnull); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); if (bstat) { /* start < new end - overlap */ @@ -7571,35 +7729,38 @@ atpxModifyRangeOverlap (Relation rel, break; } - /* could be the case that new end == start of - * next. This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE. + /* + * could be the case that new end == start of next. This is + * ok if they have opposite INCLUSIVE/EXCLUSIVE. 
*/ bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); if (bstat) { if (a_rule->parrangeendincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap */ + /* + * start and end must be of opposite types, else they + * overlap + */ bOverlap = true; break; } } - } /* end else check the end */ + } /* end else check the end */ } else { - /* matched a rule - definitely a problem if the range was + /* + * matched a rule - definitely a problem if the range was * inclusive */ if (prule2->topRuleRank != prule->topRuleRank) @@ -7627,7 +7788,7 @@ atpxModifyRangeOverlap (Relation rel, /* range was exclusive -- need to do some checking */ if (bCheckStart) { - bool bstat; + bool bstat; PartitionRule *a_rule = prule2->topRule; /* check for adjacent partition */ @@ -7639,15 +7800,15 @@ atpxModifyRangeOverlap (Relation rel, /* just check against end of adjacent partition */ d_start = - magic_expr_to_datum(rel, pNode, - pid2.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid2.partiddef, &isnull); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - ">", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + ">", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc); if (bstat) { /* end > new start - overlap */ @@ -7655,33 +7816,34 @@ atpxModifyRangeOverlap (Relation rel, break; } - /* Must be the case that new start == end of - * a_rule (because if the end < new start then how - * could we find it in the interval for prule ?) - * This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE -> New partition does not + /* + * Must be the case that new start == end of a_rule + * (because if the end < new start then how could we find + * it in the interval for prule ?) This is ok if they have + * opposite INCLUSIVE/EXCLUSIVE -> New partition does not * overlap. */ - Assert (compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangeend, - d_start, isnull, tupledesc)); + Assert(compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangeend, + d_start, isnull, tupledesc)); if (a_rule->parrangeendincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap + /* + * start and end must be of opposite types, else they + * overlap */ bOverlap = true; break; } } - else /* check the end */ + else /* check the end */ { - bool bstat; + bool bstat; PartitionRule *a_rule = prule2->topRule; /* check for adjacent partition */ @@ -7693,15 +7855,15 @@ atpxModifyRangeOverlap (Relation rel, /* just check against start of adjacent partition */ d_end = - magic_expr_to_datum(rel, pNode, - pid2.partiddef, &isnull); + magic_expr_to_datum(rel, pNode, + pid2.partiddef, &isnull); bstat = - compare_partn_opfuncid(pNode, - "pg_catalog", - "<", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc); + compare_partn_opfuncid(pNode, + "pg_catalog", + "<", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc); if (bstat) { /* start < new end - overlap */ @@ -7709,31 +7871,33 @@ atpxModifyRangeOverlap (Relation rel, break; } - /* Must be the case that new end = start of - * a_rule (because if the start > new end then how - * could we find it in the interval for prule ?) 
- * This is ok if they have opposite - * INCLUSIVE/EXCLUSIVE -> New partition does not + /* + * Must be the case that new end = start of a_rule + * (because if the start > new end then how could we find + * it in the interval for prule ?) This is ok if they have + * opposite INCLUSIVE/EXCLUSIVE -> New partition does not * overlap. */ - Assert (compare_partn_opfuncid(pNode, - "pg_catalog", - "=", - (List *)a_rule->parrangestart, - d_end, isnull, tupledesc)); + Assert(compare_partn_opfuncid(pNode, + "pg_catalog", + "=", + (List *) a_rule->parrangestart, + d_end, isnull, tupledesc)); if (a_rule->parrangestartincl == (ri->partedge == PART_EDGE_INCLUSIVE)) { - /* start and end must be of opposite - * types, else they overlap + /* + * start and end must be of opposite types, else they + * overlap */ bOverlap = true; break; } - } /* end else check the end */ - } /* end if (prule2->topRuleRank != prule->topRuleRank) */ + } /* end else check the end */ + } /* end if (prule2->topRuleRank != + * prule->topRuleRank) */ } /* if checked START, then check END. If checked END, then done */ @@ -7741,7 +7905,7 @@ atpxModifyRangeOverlap (Relation rel, break; if (bCheckStart) bCheckStart = false; - } /* end for */ + } /* end for */ if (bOverlap) ereport(ERROR, @@ -7756,7 +7920,7 @@ atpxModifyRangeOverlap (Relation rel, prule2->partIdStr : ""))); { - CreateStmtContext cxt; + CreateStmtContext cxt; MemSet(&cxt, 0, sizeof(cxt)); @@ -7764,48 +7928,52 @@ atpxModifyRangeOverlap (Relation rel, (void) atpxPart_validate_spec(makeNode(PartitionBy), &cxt, rel, - NULL, /* CreateStmt */ + NULL, /* CreateStmt */ pelem, pNode, (pid->idtype == AT_AP_IDName) ? strVal(pid->partiddef) : NULL, - false, /* isDefault */ - PARTTYP_RANGE, /* part_type */ + false, /* isDefault */ + PARTTYP_RANGE, /* part_type */ prule->partIdStr); } return false; -} /* end atpxModifyRangeOverlap */ +} /* end atpxModifyRangeOverlap */ -static void atpxSkipper(PartitionNode *pNode, int *skipped) +static void +atpxSkipper(PartitionNode *pNode, int *skipped) { - ListCell *lc; + ListCell *lc; - if (!pNode) return; + if (!pNode) + return; /* add entries for rules at current level */ foreach(lc, pNode->rules) { - PartitionRule *rule = lfirst(lc); + PartitionRule *rule = lfirst(lc); - if (skipped) *skipped += 1; + if (skipped) + *skipped += 1; if (rule->children) atpxSkipper(rule->children, skipped); - } /* end foreach */ + } /* end foreach */ /* and the default partition */ if (pNode->default_part) { - PartitionRule *rule = pNode->default_part; + PartitionRule *rule = pNode->default_part; - if (skipped) *skipped += 1; + if (skipped) + *skipped += 1; if (rule->children) atpxSkipper(rule->children, skipped); } -} /* end atpxSkipper */ +} /* end atpxSkipper */ static List * build_rename_part_recurse(PartitionRule *rule, const char *old_parentname, @@ -7813,11 +7981,11 @@ build_rename_part_recurse(PartitionRule *rule, const char *old_parentname, int *skipped) { - RangeVar *rv; - Relation rel; - char *relname = NULL; - char newRelNameBuf[(NAMEDATALEN*2)]; - List *l1 = NIL; + RangeVar *rv; + Relation rel; + char *relname = NULL; + char newRelNameBuf[(NAMEDATALEN * 2)]; + List *l1 = NIL; rel = heap_open(rule->parchildrelid, AccessShareLock); @@ -7830,12 +7998,12 @@ build_rename_part_recurse(PartitionRule *rule, const char *old_parentname, heap_close(rel, AccessShareLock); /* - * The child name should contain the old parent name as a - * prefix - check the length and compare to make sure. 
+ * The child name should contain the old parent name as a prefix - check + * the length and compare to make sure. * - * To build the new child name, just use the new name as a - * prefix, and use the remainder of the child name (the part - * after the old parent name prefix) as the suffix. + * To build the new child name, just use the new name as a prefix, and use + * the remainder of the child name (the part after the old parent name + * prefix) as the suffix. */ if (strlen(old_parentname) > strlen(relname)) { @@ -7868,7 +8036,7 @@ build_rename_part_recurse(PartitionRule *rule, const char *old_parentname, /* add the child lists next (not first) */ { - List *l2 = NIL; + List *l2 = NIL; if (rule->children) l2 = atpxRenameList(rule->children, @@ -7886,8 +8054,8 @@ List * atpxRenameList(PartitionNode *pNode, char *old_parentname, const char *new_parentname, int *skipped) { - List *l1 = NIL; - ListCell *lc; + List *l1 = NIL; + ListCell *lc; if (!pNode) return l1; @@ -7895,19 +8063,19 @@ atpxRenameList(PartitionNode *pNode, /* add entries for rules at current level */ foreach(lc, pNode->rules) { - PartitionRule *rule = lfirst(lc); + PartitionRule *rule = lfirst(lc); l1 = list_concat(l1, build_rename_part_recurse(rule, old_parentname, new_parentname, skipped)); - } /* end foreach */ + } /* end foreach */ /* and the default partition */ if (pNode->default_part) { - PartitionRule *rule = pNode->default_part; + PartitionRule *rule = pNode->default_part; l1 = list_concat(l1, build_rename_part_recurse(rule, @@ -7917,21 +8085,21 @@ atpxRenameList(PartitionNode *pNode, } return l1; -} /* end atpxRenameList */ +} /* end atpxRenameList */ static Oid get_opfuncid_by_opname(List *opname, Oid lhsid, Oid rhsid) { - Oid opfuncid; - Operator op; + Oid opfuncid; + Operator op; op = oper(NULL, opname, lhsid, rhsid, false, -1); - if (op == NULL) /* should not fail */ + if (op == NULL) /* should not fail */ elog(ERROR, "could not find operator"); - opfuncid = ((Form_pg_operator)GETSTRUCT(op))->oprcode; + opfuncid = ((Form_pg_operator) GETSTRUCT(op))->oprcode; ReleaseSysCache(op); return opfuncid; @@ -7947,7 +8115,7 @@ get_opfuncid_by_opname(List *opname, Oid lhsid, Oid rhsid) static PgPartRule * get_pprule_from_ATC(Relation rel, AlterTableCmd *cmd) { - List *pids = NIL; /* of AlterPartitionId */ + List *pids = NIL; /* of AlterPartitionId */ AlterPartitionId *pid = NULL; PgPartRule *pprule = NULL; AlterPartitionId *work_partid = NULL; @@ -7956,42 +8124,42 @@ get_pprule_from_ATC(Relation rel, AlterTableCmd *cmd) /* Get list of enclosing ALTER PARTITION ids. */ - while ( atc->subtype == AT_PartAlter ) + while (atc->subtype == AT_PartAlter) { - AlterPartitionCmd *apc = (AlterPartitionCmd*)atc->def; + AlterPartitionCmd *apc = (AlterPartitionCmd *) atc->def; - pid = (AlterPartitionId*)apc->partid; + pid = (AlterPartitionId *) apc->partid; Insist(IsA(pid, AlterPartitionId)); - atc = (AlterTableCmd*)apc->arg1; + atc = (AlterTableCmd *) apc->arg1; Insist(IsA(atc, AlterTableCmd)); pids = lappend(pids, pid); } - /* The effective ALTER TABLE command is in atc. - * The pids list (of AlterPartitionId nodes) represents the path to - * top partitioning branch of rel. Since we are only called for - * branches and leaves (never the root) of the partition, the pid - * list should not empty. + /* + * The effective ALTER TABLE command is in atc. The pids list (of + * AlterPartitionId nodes) represents the path to top partitioning branch + * of rel. 
Since we are only called for branches and leaves (never the + * root) of the partition, the pid list should not empty. * - * Use the AlterPartitionId interpretter, get_part_rule, to do - * the interpretation. + * Use the AlterPartitionId interpretter, get_part_rule, to do the + * interpretation. */ - Insist( list_length(pids) > 0 ); + Insist(list_length(pids) > 0); work_partid = makeNode(AlterPartitionId); work_partid->idtype = AT_AP_IDList; - work_partid->partiddef = (Node*)pids; + work_partid->partiddef = (Node *) pids; work_partid->location = -1; pprule = get_part_rule(rel, work_partid, - true, true, /* parts must exist */ - NULL, /* no implicit results */ - false /* no template rules */ - ); + true, true, /* parts must exist */ + NULL, /* no implicit results */ + false /* no template rules */ + ); return pprule; } @@ -8008,7 +8176,7 @@ basic_AT_oids(Relation rel, AlterTableCmd *cmd) { PgPartRule *pprule = get_pprule_from_ATC(rel, cmd); - if ( ! pprule ) + if (!pprule) return NIL; return all_prule_relids(pprule->topRule); @@ -8018,13 +8186,15 @@ basic_AT_oids(Relation rel, AlterTableCmd *cmd) * Return the basic AlterTableCmd found by peeling off intervening layers of * ALTER PARTITION from the given AlterTableCmd. */ -AlterTableCmd *basic_AT_cmd(AlterTableCmd *cmd) +AlterTableCmd * +basic_AT_cmd(AlterTableCmd *cmd) { - while ( cmd->subtype == AT_PartAlter ) + while (cmd->subtype == AT_PartAlter) { - AlterPartitionCmd *apc = (AlterPartitionCmd*)cmd->def; + AlterPartitionCmd *apc = (AlterPartitionCmd *) cmd->def; + Insist(IsA(apc, AlterPartitionCmd)); - cmd = (AlterTableCmd*)apc->arg1; + cmd = (AlterTableCmd *) apc->arg1; Insist(IsA(cmd, AlterTableCmd)); } return cmd; @@ -8041,10 +8211,11 @@ AlterTableCmd *basic_AT_cmd(AlterTableCmd *cmd) * rel Pointer to cache entry for the whole partitioned table * dist_cnames List of column names proposed for distribution some part */ -bool can_implement_dist_on_part(Relation rel, List *dist_cnames) +bool +can_implement_dist_on_part(Relation rel, List *dist_cnames) { - ListCell *lc; - int i; + ListCell *lc; + int i; if (Gp_role != GP_ROLE_DISPATCH) { @@ -8054,26 +8225,26 @@ bool can_implement_dist_on_part(Relation rel, List *dist_cnames) } /* Random is okay. It is represented by a list of one empty list. */ - if ( list_length(dist_cnames) == 1 && linitial(dist_cnames) == NIL ) + if (list_length(dist_cnames) == 1 && linitial(dist_cnames) == NIL) return true; /* Require an exact match to the policy of the parent. */ - if ( list_length(dist_cnames) != rel->rd_cdbpolicy->nattrs ) + if (list_length(dist_cnames) != rel->rd_cdbpolicy->nattrs) return false; i = 0; foreach(lc, dist_cnames) { - AttrNumber attnum; - char *cname; - HeapTuple tuple; - Node *item = lfirst(lc); - bool ok = false; + AttrNumber attnum; + char *cname; + HeapTuple tuple; + Node *item = lfirst(lc); + bool ok = false; - if ( !(item && IsA(item, String)) ) + if (!(item && IsA(item, String))) return false; - cname = strVal((Value *)item); + cname = strVal((Value *) item); tuple = SearchSysCacheAttName(RelationGetRelid(rel), cname); if (!HeapTupleIsValid(tuple)) ereport(ERROR, @@ -8087,7 +8258,7 @@ bool can_implement_dist_on_part(Relation rel, List *dist_cnames) ReleaseSysCache(tuple); - if ( ! 
ok ) + if (!ok) return false; } return true; @@ -8106,16 +8277,16 @@ bool can_implement_dist_on_part(Relation rel, List *dist_cnames) bool is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) { - AttrMap *map_new = NULL; - AttrMap *map_old = NULL; - bool congruent = TRUE; + AttrMap *map_new = NULL; + AttrMap *map_old = NULL; + bool congruent = TRUE; /* Both parts must be relations. */ if (!(oldrel->rd_rel->relkind == RELKIND_RELATION || newrel->rd_rel->relkind == RELKIND_RELATION)) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot exchange relation " @@ -8127,7 +8298,7 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) if (rel_is_default_partition(oldrel->rd_id)) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot exchange DEFAULT partition " @@ -8135,26 +8306,28 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) } ExtTableEntry *extEntry = GetExtTableEntry(newrel->rd_id); + if (extEntry && extEntry->iswritable) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot exchange relation " "which is a WRITABLE external table"))); } } - - /* Attributes of the existing part (oldrel) must be compatible with the + + /* + * Attributes of the existing part (oldrel) must be compatible with the * partitioned table as a whole. This might be an assertion, but we don't * want this case to pass in a production build, so we use an internal * error. */ - if (congruent && ! map_part_attrs(rel, oldrel, &map_old, FALSE) ) + if (congruent && !map_part_attrs(rel, oldrel, &map_old, FALSE)) { congruent = FALSE; - if ( throw ) + if (throw) elog(ERROR, "existing part \"%s\" not congruent with" "partitioned table \"%s\"", RelationGetRelationName(oldrel), @@ -8163,17 +8336,18 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) /* From here on we need to be careful to free the maps. */ - /* Attributes of new part must be compatible with the partitioned table. + /* + * Attributes of new part must be compatible with the partitioned table. * (We assume that the attributes of the old part are compatible.) */ - if ( congruent && ! map_part_attrs(rel, newrel, &map_new, throw) ) + if (congruent && !map_part_attrs(rel, newrel, &map_new, throw)) congruent = FALSE; /* Both parts must have the same owner. */ - if ( congruent && oldrel->rd_rel->relowner != newrel->rd_rel->relowner) + if (congruent && oldrel->rd_rel->relowner != newrel->rd_rel->relowner) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("owner of \"%s\" must be the same as that " @@ -8186,7 +8360,7 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) if (congruent && oldrel->rd_rel->relhasoids != newrel->rd_rel->relhasoids) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("\"%s\" and \"%s\" must have same OIDs setting", @@ -8195,10 +8369,10 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) } /* The new part table must not be involved in inheritance. 
*/ - if ( congruent && has_subclass_fast(RelationGetRelid(newrel))) + if (congruent && has_subclass_fast(RelationGetRelid(newrel))) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot EXCHANGE table \"%s\" as it has " @@ -8209,7 +8383,7 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) if (congruent && relation_has_supers(RelationGetRelid(newrel))) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot exchange table \"%s\" as it " @@ -8218,44 +8392,45 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) } /* The new part table must not have rules on it. */ - if ( congruent && ( newrel->rd_rules || oldrel->rd_rules ) ) + if (congruent && (newrel->rd_rules || oldrel->rd_rules)) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("cannot exchange table which has rules " "defined on it"))); } - /* The distribution policies of the existing part (oldpart) and the + /* + * The distribution policies of the existing part (oldpart) and the * candidate part (newpart) must match that of the whole partitioned * table. However, we can only check this where the policy table is - * populated, i.e., on the entry database. Note checking the policy - * of the existing part is defensive. It SHOULD match. - * Skip the check when either the oldpart or the newpart is external. + * populated, i.e., on the entry database. Note checking the policy of + * the existing part is defensive. It SHOULD match. Skip the check when + * either the oldpart or the newpart is external. */ if (congruent && Gp_role == GP_ROLE_DISPATCH && - !RelationIsExternal(newrel) && !RelationIsExternal(oldrel)) + !RelationIsExternal(newrel) && !RelationIsExternal(oldrel)) { - GpPolicy *parpol = rel->rd_cdbpolicy; - GpPolicy *oldpol = oldrel->rd_cdbpolicy; - GpPolicy *newpol = newrel->rd_cdbpolicy; - GpPolicy *adjpol = NULL; + GpPolicy *parpol = rel->rd_cdbpolicy; + GpPolicy *oldpol = oldrel->rd_cdbpolicy; + GpPolicy *newpol = newrel->rd_cdbpolicy; + GpPolicy *adjpol = NULL; - if ( map_old != NULL ) + if (map_old != NULL) { - int i; - AttrNumber remapped_parent_attr = 0; + int i; + AttrNumber remapped_parent_attr = 0; - for ( i = 0; i < parpol->nattrs; i++ ) + for (i = 0; i < parpol->nattrs; i++) { remapped_parent_attr = attrMap(map_old, parpol->attrs[i]); - if ( ! (parpol->attrs[i] > 0 /* assert parent live */ - && oldpol->attrs[i] > 0 /* assert old part live */ - && remapped_parent_attr == oldpol->attrs[i] /* assert match */ - )) + if (!(parpol->attrs[i] > 0 /* assert parent live */ + && oldpol->attrs[i] > 0 /* assert old part live */ + && remapped_parent_attr == oldpol->attrs[i] /* assert match */ + )) elog(ERROR, "discrepancy in partitioning policy of \"%s\"", RelationGetRelationName(rel)); @@ -8263,21 +8438,22 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) } else { - if (! 
GpPolicyEqual(parpol, oldpol) ) + if (!GpPolicyEqual(parpol, oldpol)) elog(ERROR, "discrepancy in partitioning policy of \"%s\"", RelationGetRelationName(rel)); } - if ( map_new != NULL ) + if (map_new != NULL) { - int i; + int i; + adjpol = GpPolicyCopy(CurrentMemoryContext, parpol); - for ( i = 0; i < adjpol->nattrs; i++ ) + for (i = 0; i < adjpol->nattrs; i++) { adjpol->attrs[i] = attrMap(map_new, parpol->attrs[i]); - Assert(newpol->attrs[i] > 0); /* check new part */ + Assert(newpol->attrs[i] > 0); /* check new part */ } } else @@ -8285,10 +8461,10 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) adjpol = parpol; } - if (! GpPolicyEqual(adjpol, newpol) ) + if (!GpPolicyEqual(adjpol, newpol)) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("distribution policy for \"%s\" " @@ -8300,15 +8476,17 @@ is_exchangeable(Relation rel, Relation oldrel, Relation newrel, bool throw) sizeof(AttrNumber) * adjpol->nattrs)) { congruent = FALSE; - if ( throw ) + if (throw) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("distribution policy matches but implementation lags"))); } } - if ( map_old != NULL ) pfree(map_old); - if ( map_new != NULL ) pfree(map_new); + if (map_old != NULL) + pfree(map_old); + if (map_new != NULL) + pfree(map_new); return congruent; } @@ -8331,16 +8509,16 @@ static NewConstraint * constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand, bool validate, bool is_split, Relation pgcon) { - Datum val; - bool isnull; - Datum *dats; - int16 *keys; - int nkeys; - int i; - Node *conexpr; - char *consrc; - char *conbin; - Form_pg_constraint con = (Form_pg_constraint)GETSTRUCT(tuple); + Datum val; + bool isnull; + Datum *dats; + int16 *keys; + int nkeys; + int i; + Node *conexpr; + char *consrc; + char *conbin; + Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); NewConstraint *newcon = NULL; /* Translate pg_constraint.conkey */ @@ -8355,14 +8533,15 @@ constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand, keys = palloc(sizeof(int16) * nkeys); for (i = 0; i < nkeys; i++) { - int16 key = DatumGetInt16(dats[i]); - keys[i] = (int16)attrMap(map, key); + int16 key = DatumGetInt16(dats[i]); + + keys[i] = (int16) attrMap(map, key); } /* Translate pg_constraint.conbin */ val = heap_getattr(tuple, Anum_pg_constraint_conbin, RelationGetDescr(pgcon), &isnull); - if ( !isnull ) + if (!isnull) { conbin = TextDatumGetCString(val); conexpr = stringToNode(conbin); @@ -8389,161 +8568,164 @@ constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand, } /* Apply translated constraint to candidate. */ - switch ( con->contype ) + switch (con->contype) { case CONSTRAINT_CHECK: - { - Assert( conexpr && conbin && consrc ); - - CreateConstraintEntry(NameStr(con->conname), - con->connamespace, // XXX should this be RelationGetNamespace(cand)? - con->contype, - con->condeferrable, - con->condeferred, - RelationGetRelid(cand), - keys, - nkeys, - InvalidOid, - InvalidOid, - NULL, - NULL, - NULL, - NULL, - 0, - ' ', - ' ', - ' ', - InvalidOid, - conexpr, - conbin, - consrc); - break; - } + { + Assert(conexpr && conbin && consrc); + + CreateConstraintEntry(NameStr(con->conname), + con->connamespace, //XXX should this be RelationGetNamespace(cand) ? 
+ con->contype, + con->condeferrable, + con->condeferred, + RelationGetRelid(cand), + keys, + nkeys, + InvalidOid, + InvalidOid, + NULL, + NULL, + NULL, + NULL, + 0, + ' ', + ' ', + ' ', + InvalidOid, + conexpr, + conbin, + consrc); + break; + } case CONSTRAINT_FOREIGN: - { - int16 *fkeys; - int nfkeys; - Oid indexoid = InvalidOid; - Oid *opclasses = NULL; - Relation frel; - - val = heap_getattr(tuple, Anum_pg_constraint_confkey, - RelationGetDescr(pgcon), &isnull); - Assert(!isnull); - - deconstruct_array(DatumGetArrayTypeP(val), - INT2OID, 2, true, 's', - &dats, NULL, &nfkeys); - fkeys = palloc(sizeof(int16) * nfkeys); - for (i = 0; i < nfkeys; i++) { - fkeys[i] = DatumGetInt16(dats[i]); - } + int16 *fkeys; + int nfkeys; + Oid indexoid = InvalidOid; + Oid *opclasses = NULL; + Relation frel; + + val = heap_getattr(tuple, Anum_pg_constraint_confkey, + RelationGetDescr(pgcon), &isnull); + Assert(!isnull); + + deconstruct_array(DatumGetArrayTypeP(val), + INT2OID, 2, true, 's', + &dats, NULL, &nfkeys); + fkeys = palloc(sizeof(int16) * nfkeys); + for (i = 0; i < nfkeys; i++) + { + fkeys[i] = DatumGetInt16(dats[i]); + } - frel = heap_open(con->confrelid, AccessExclusiveLock); - indexoid = transformFkeyCheckAttrs(frel, nfkeys, fkeys, opclasses); - - CreateConstraintEntry(NameStr(con->conname), - RelationGetNamespace(cand), - con->contype, - con->condeferrable, - con->condeferred, - RelationGetRelid(cand), - keys, - nkeys, - InvalidOid, - con->confrelid, - fkeys, - NULL, - NULL, - NULL, - nfkeys, - con->confupdtype, - con->confdeltype, - con->confmatchtype, - indexoid, - NULL, /* no check constraint */ - NULL, - NULL); - - heap_close(frel, AccessExclusiveLock); - break; - } + frel = heap_open(con->confrelid, AccessExclusiveLock); + indexoid = transformFkeyCheckAttrs(frel, nfkeys, fkeys, opclasses); + + CreateConstraintEntry(NameStr(con->conname), + RelationGetNamespace(cand), + con->contype, + con->condeferrable, + con->condeferred, + RelationGetRelid(cand), + keys, + nkeys, + InvalidOid, + con->confrelid, + fkeys, + NULL, + NULL, + NULL, + nfkeys, + con->confupdtype, + con->confdeltype, + con->confmatchtype, + indexoid, + NULL, /* no check constraint */ + NULL, + NULL); + + heap_close(frel, AccessExclusiveLock); + break; + } case CONSTRAINT_PRIMARY: case CONSTRAINT_UNIQUE: - { - /* Index-backed constraints are handled as indexes. No action here. */ - char *what = (con->contype == CONSTRAINT_PRIMARY)? "PRIMARY KEY" :"UNIQUE"; - char *who = NameStr(con->conname); - - if (is_split) - { - ; /* nothing */ - } - else if (validate) - { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("%s constraint \"%s\" missing", what, who), - errhint("Add %s constraint \"%s\" to the candidate table" - " or drop it from the partitioned table." - , what, who))); - } - else { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("WITHOUT VALIDATION incompatible with missing %s constraint \"%s\"", - what, who), - errhint("Add %s constraint %s to the candidate table" - " or drop it from the partitioned table." - , what, who))); + /* + * Index-backed constraints are handled as indexes. No action + * here. + */ + char *what = (con->contype == CONSTRAINT_PRIMARY) ? 
"PRIMARY KEY" : "UNIQUE"; + char *who = NameStr(con->conname); + if (is_split) + { + ; /* nothing */ + } + else if (validate) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s constraint \"%s\" missing", what, who), + errhint("Add %s constraint \"%s\" to the candidate table" + " or drop it from the partitioned table." + ,what, who))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("WITHOUT VALIDATION incompatible with missing %s constraint \"%s\"", + what, who), + errhint("Add %s constraint %s to the candidate table" + " or drop it from the partitioned table." + ,what, who))); + + } + break; } - break; - } default: /* Defensive, can't occur. */ - elog(ERROR,"invalid constraint type: %c", con->contype); + elog(ERROR, "invalid constraint type: %c", con->contype); break; } newcon = NULL; - if ( validate ) + if (validate) { - switch ( con->contype ) + switch (con->contype) { case CONSTRAINT_CHECK: - { - newcon = (NewConstraint*) palloc0(sizeof(NewConstraint)); - newcon->name = pstrdup(NameStr(con->conname)); - /* ExecQual wants implicit-AND format */ - newcon->qual = (Node *)make_ands_implicit((Expr *)conexpr); - newcon->contype = CONSTR_CHECK; - break; - } + { + newcon = (NewConstraint *) palloc0(sizeof(NewConstraint)); + newcon->name = pstrdup(NameStr(con->conname)); + /* ExecQual wants implicit-AND format */ + newcon->qual = (Node *) make_ands_implicit((Expr *) conexpr); + newcon->contype = CONSTR_CHECK; + break; + } case CONSTRAINT_FOREIGN: - { - elog(WARNING, "Won't enforce FK constraint."); - break; - } + { + elog(WARNING, "Won't enforce FK constraint."); + break; + } case CONSTRAINT_PRIMARY: - { - elog(WARNING, "Won't enforce PK constraint."); - break; - } + { + elog(WARNING, "Won't enforce PK constraint."); + break; + } case CONSTRAINT_UNIQUE: - { - elog(WARNING, "Won't enforce ND constraint."); - break; - } + { + elog(WARNING, "Won't enforce ND constraint."); + break; + } default: - { - elog(WARNING, "!! NOT READY FOR TYPE %c CONSTRAINT !!", con->contype); - break; - } + { + elog(WARNING, "!! NOT READY FOR TYPE %c CONSTRAINT !!", con->contype); + break; + } } } return newcon; @@ -8553,7 +8735,7 @@ constraint_apply_mapped(HeapTuple tuple, AttrMap *map, Relation cand, static bool relation_has_supers(Oid relid) { - ScanKeyData scankey; + ScanKeyData scankey; Relation rel; SysScanDesc sscan; bool result; @@ -8588,178 +8770,178 @@ relation_has_supers(Oid relid) void fixCreateStmtForPartitionedTable(CreateStmt *stmt) { - ListCell *lc_elt; + ListCell *lc_elt; Constraint *con; - List *unnamed_cons = NIL; - List *unnamed_cons_col = NIL; - List *unnamed_cons_lbl = NIL; - List *used_names = NIL; - char *no_name = ""; - int i; + List *unnamed_cons = NIL; + List *unnamed_cons_col = NIL; + List *unnamed_cons_lbl = NIL; + List *used_names = NIL; + char *no_name = ""; + int i; /* Caller should check this! 
*/ Assert(stmt->partitionBy && !stmt->is_part_child); - foreach( lc_elt, stmt->tableElts ) + foreach(lc_elt, stmt->tableElts) { - Node * elt = lfirst(lc_elt); + Node *elt = lfirst(lc_elt); switch (nodeTag(elt)) { case T_ColumnDef: - { - ListCell *lc_con; - - ColumnDef *cdef = (ColumnDef*)elt; - - foreach( lc_con, cdef->constraints ) { - Node *conelt = lfirst(lc_con); + ListCell *lc_con; + + ColumnDef *cdef = (ColumnDef *) elt; - if ( IsA(conelt, Constraint) ) + foreach(lc_con, cdef->constraints) { - con = (Constraint*)conelt; + Node *conelt = lfirst(lc_con); - if ( con->name ) + if (IsA(conelt, Constraint)) { - used_names = lappend(used_names, con->name); - continue; + con = (Constraint *) conelt; + + if (con->name) + { + used_names = lappend(used_names, con->name); + continue; + } + switch (con->contype) + { + case CONSTR_CHECK: + unnamed_cons = lappend(unnamed_cons, con); + unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_lbl = lappend(unnamed_cons_lbl, "check"); + break; + case CONSTR_PRIMARY: + unnamed_cons = lappend(unnamed_cons, con); + unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_lbl = lappend(unnamed_cons_lbl, "pkey"); + break; + case CONSTR_UNIQUE: + unnamed_cons = lappend(unnamed_cons, con); + unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_lbl = lappend(unnamed_cons_lbl, "key"); + break; + default: + break; + } + } + else + { + FkConstraint *fkcon = (FkConstraint *) conelt; + + Insist(IsA(fkcon, FkConstraint)); + + if (fkcon->constr_name) + { + used_names = lappend(used_names, fkcon->constr_name); + continue; + } + + unnamed_cons = lappend(unnamed_cons, fkcon); + unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_lbl = lappend(unnamed_cons_lbl, "fkey"); } + } + break; + } + case T_Constraint: + { + con = (Constraint *) elt; + + if (con->name) + { + used_names = lappend(used_names, con->name); + } + else + { switch (con->contype) { case CONSTR_CHECK: unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_col = lappend(unnamed_cons_col, no_name); unnamed_cons_lbl = lappend(unnamed_cons_lbl, "check"); break; case CONSTR_PRIMARY: unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_col = lappend(unnamed_cons_col, no_name); unnamed_cons_lbl = lappend(unnamed_cons_lbl, "pkey"); break; case CONSTR_UNIQUE: unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); + unnamed_cons_col = lappend(unnamed_cons_col, no_name); unnamed_cons_lbl = lappend(unnamed_cons_lbl, "key"); break; default: break; } } - else - { - FkConstraint *fkcon = (FkConstraint*)conelt; - - Insist( IsA(fkcon, FkConstraint) ); - - if ( fkcon->constr_name ) - { - used_names = lappend(used_names, fkcon->constr_name); - continue; - } - - unnamed_cons = lappend(unnamed_cons, fkcon); - unnamed_cons_col = lappend(unnamed_cons_col, cdef->colname); - unnamed_cons_lbl = lappend(unnamed_cons_lbl, "fkey"); - } - } - break; - } - case T_Constraint: - { - con = (Constraint*)elt; - - if ( con->name ) - { - used_names = lappend(used_names, con->name); + break; } - else + case T_FkConstraint: { - switch (con->contype) + FkConstraint *fkcon = (FkConstraint *) elt; + + unnamed_cons = lappend(unnamed_cons, fkcon); + unnamed_cons_col = lappend(unnamed_cons_col, no_name); + unnamed_cons_lbl = lappend(unnamed_cons_lbl, "fkey"); + + if 
(fkcon->constr_name) { - case CONSTR_CHECK: - unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, no_name); - unnamed_cons_lbl = lappend(unnamed_cons_lbl, "check"); - break; - case CONSTR_PRIMARY: - unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, no_name); - unnamed_cons_lbl = lappend(unnamed_cons_lbl, "pkey"); - break; - case CONSTR_UNIQUE: - unnamed_cons = lappend(unnamed_cons, con); - unnamed_cons_col = lappend(unnamed_cons_col, no_name); - unnamed_cons_lbl = lappend(unnamed_cons_lbl, "key"); - break; - default: - break; + used_names = lappend(used_names, fkcon->constr_name); } + break; } - break; - } - case T_FkConstraint: - { - FkConstraint *fkcon = (FkConstraint*)elt; - - unnamed_cons = lappend(unnamed_cons, fkcon); - unnamed_cons_col = lappend(unnamed_cons_col, no_name); - unnamed_cons_lbl = lappend(unnamed_cons_lbl, "fkey"); - - if ( fkcon->constr_name ) + case T_InhRelation: { - used_names = lappend(used_names, fkcon->constr_name); + break; } - break; - } - case T_InhRelation: - { - break; - } default: break; } } - used_names = list_union(used_names, NIL); /* eliminate dups */ + used_names = list_union(used_names, NIL); /* eliminate dups */ - for ( i = 0; i < list_length(unnamed_cons); i++ ) + for (i = 0; i < list_length(unnamed_cons); i++) { - char *label = list_nth(unnamed_cons_lbl, i); - char *colname = NULL; - Node *elt = list_nth(unnamed_cons, i); + char *label = list_nth(unnamed_cons_lbl, i); + char *colname = NULL; + Node *elt = list_nth(unnamed_cons, i); - switch ( nodeTag(elt) ) + switch (nodeTag(elt)) { case T_FkConstraint: - { - FkConstraint *fcon = list_nth(unnamed_cons, i); - - fcon->constr_name = - ChooseConstraintNameForPartitionCreate(stmt->relation->relname, - colname, - label, - used_names); - used_names = lappend(used_names, fcon->constr_name); - break; - } + { + FkConstraint *fcon = list_nth(unnamed_cons, i); + + fcon->constr_name = + ChooseConstraintNameForPartitionCreate(stmt->relation->relname, + colname, + label, + used_names); + used_names = lappend(used_names, fcon->constr_name); + break; + } case T_Constraint: - { - Constraint *con = list_nth(unnamed_cons, i); + { + Constraint *con = list_nth(unnamed_cons, i); - /* Conventionally, no column name for PK. */ - if ( 0 != strcmp(label, "pkey") ) - colname = list_nth(unnamed_cons_col, i); + /* Conventionally, no column name for PK. 
*/ + if (0 != strcmp(label, "pkey")) + colname = list_nth(unnamed_cons_col, i); - con->name = ChooseConstraintNameForPartitionCreate(stmt->relation->relname, - colname, - label, - used_names); - used_names = lappend(used_names,con->name); + con->name = ChooseConstraintNameForPartitionCreate(stmt->relation->relname, + colname, + label, + used_names); + used_names = lappend(used_names, con->name); - break; - } + break; + } default: break; } @@ -8789,16 +8971,16 @@ ChooseConstraintNameForPartitionCreate(const char *rname, const char *label, List *used_names) { - int pass = 0; - char *conname = NULL; - char modlabel[NAMEDATALEN]; - bool found = false; - ListCell *lc; + int pass = 0; + char *conname = NULL; + char modlabel[NAMEDATALEN]; + bool found = false; + ListCell *lc; Assert(rname && *rname); /* Allow caller to pass "" instead of NULL for non-singular cname */ - if ( cname && *cname == '\0' ) + if (cname && *cname == '\0') cname = NULL; /* try the unmodified label first */ @@ -8811,14 +8993,14 @@ ChooseConstraintNameForPartitionCreate(const char *rname, foreach(lc, used_names) { - if (strcmp((char*)lfirst(lc), conname) == 0) + if (strcmp((char *) lfirst(lc), conname) == 0) { found = true; break; } } - if ( ! found ) - break; /* we have a winner */ + if (!found) + break; /* we have a winner */ pfree(conname); snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass); @@ -8834,10 +9016,10 @@ ChooseConstraintNameForPartitionCreate(const char *rname, void checkUniqueConstraintVsPartitioning(Relation rel, AttrNumber *indattr, int nidxatts, bool primary) { - int i; - bool contains; - Bitmapset *ikey = NULL; - Bitmapset *pkey = get_partition_key_bitmapset(RelationGetRelid(rel)); + int i; + bool contains; + Bitmapset *ikey = NULL; + Bitmapset *pkey = get_partition_key_bitmapset(RelationGetRelid(rel)); for (i = 0; i < nidxatts; i++) ikey = bms_add_member(ikey, indattr[i]); @@ -8849,11 +9031,11 @@ checkUniqueConstraintVsPartitioning(Relation rel, AttrNumber *indattr, int nidxa if (ikey) bms_free(ikey); - if (! contains ) + if (!contains) { - char *what = "UNIQUE"; + char *what = "UNIQUE"; - if ( primary ) + if (primary) what = "PRIMARY KEY"; ereport(ERROR, @@ -8868,17 +9050,20 @@ checkUniqueConstraintVsPartitioning(Relation rel, AttrNumber *indattr, int nidxa /** * Does a partition node correspond to a leaf partition? */ -static bool IsLeafPartitionNode(PartitionNode *p) +static bool +IsLeafPartitionNode(PartitionNode *p) { Assert(p); /** * If all of the rules have no children, this is a leaf partition. */ - ListCell *lc = NULL; - foreach (lc, p->rules) + ListCell *lc = NULL; + + foreach(lc, p->rules) { PartitionRule *rule = (PartitionRule *) lfirst(lc); + if (rule->children) { return false; @@ -8889,7 +9074,7 @@ static bool IsLeafPartitionNode(PartitionNode *p) * If default partition has children then, this is not a leaf */ if (p->default_part - && p->default_part->children) + && p->default_part->children) { return false; } @@ -8900,12 +9085,13 @@ static bool IsLeafPartitionNode(PartitionNode *p) /* * Given a partition node, return all the associated rules, including the default partition rule if present */ -static List* +static List * get_partition_rules(PartitionNode *pn) { Assert(pn); - List *result = NIL; + List *result = NIL; + if (pn->default_part) { result = lappend(result, pn->default_part); @@ -8924,15 +9110,17 @@ get_partition_rules(PartitionNode *pn) * Output: * List of partition nodes corresponding to its children across all rules. 
*/ -static List *PartitionChildren(PartitionNode *p) +static List * +PartitionChildren(PartitionNode *p) { Assert(p); Assert(!IsLeafPartitionNode(p)); - List *result = NIL; + List *result = NIL; + + ListCell *lc = NULL; - ListCell *lc = NULL; - foreach (lc, p->rules) + foreach(lc, p->rules) { PartitionRule *rule = (PartitionRule *) lfirst(lc); @@ -8946,7 +9134,7 @@ static List *PartitionChildren(PartitionNode *p) * Also add default child */ if (p->default_part - && p->default_part->children) + && p->default_part->children) { result = lappend(result, p->default_part->children); } @@ -8970,27 +9158,28 @@ static List *PartitionChildren(PartitionNode *p) */ List * selectPartitionMulti(PartitionNode *partnode, Datum *values, bool *isnull, - TupleDesc tupdesc, PartitionAccessMethods *accessMethods) + TupleDesc tupdesc, PartitionAccessMethods *accessMethods) { Assert(partnode); - List *leafPartitionOids = NIL; + List *leafPartitionOids = NIL; - List *inputList = list_make1(partnode); + List *inputList = list_make1(partnode); while (list_length(inputList) > 0) { - List *levelOutput = NIL; + List *levelOutput = NIL; + + ListCell *lc = NULL; - ListCell *lc = NULL; - foreach (lc, inputList) + foreach(lc, inputList) { PartitionNode *candidatePartNode = (PartitionNode *) lfirst(lc); - bool foundNull = false; + bool foundNull = false; for (int i = 0; i < candidatePartNode->part->parnatts; i++) { - AttrNumber attno = candidatePartNode->part->paratts[i]; + AttrNumber attno = candidatePartNode->part->paratts[i]; /** * If corresponding value is null, then we should pick all of its @@ -9022,7 +9211,8 @@ selectPartitionMulti(PartitionNode *partnode, Datum *values, bool *isnull, { if (IsLeafPartitionNode(candidatePartNode)) { - Oid matchOid = selectPartition1(candidatePartNode, values, isnull, tupdesc, accessMethods, NULL, NULL); + Oid matchOid = selectPartition1(candidatePartNode, values, isnull, tupdesc, accessMethods, NULL, NULL); + if (matchOid != InvalidOid) { leafPartitionOids = lappend_oid(leafPartitionOids, matchOid); @@ -9031,6 +9221,7 @@ selectPartitionMulti(PartitionNode *partnode, Datum *values, bool *isnull, else { PartitionNode *childPartitionNode = NULL; + selectPartition1(candidatePartNode, values, isnull, tupdesc, accessMethods, NULL, &childPartitionNode); if (childPartitionNode) { @@ -9060,10 +9251,10 @@ static void add_partition_encoding(Oid relid, Oid paroid, AttrNumber attnum, List *encoding) { Relation rel; - Datum partoptions; - Datum values[Natts_pg_partition_encoding]; - bool nulls[Natts_pg_partition_encoding]; - HeapTuple tuple; + Datum partoptions; + Datum values[Natts_pg_partition_encoding]; + bool nulls[Natts_pg_partition_encoding]; + HeapTuple tuple; rel = heap_open(PartitionEncodingRelationId, RowExclusiveLock); @@ -9094,9 +9285,9 @@ add_partition_encoding(Oid relid, Oid paroid, AttrNumber attnum, List *encoding) static void remove_partition_encoding_entry(Oid paroid, AttrNumber attnum) { - Relation rel; + Relation rel; HeapTuple tup; - ScanKeyData scankey; + ScanKeyData scankey; SysScanDesc sscan; rel = heap_open(PartitionEncodingRelationId, RowExclusiveLock); @@ -9113,7 +9304,7 @@ remove_partition_encoding_entry(Oid paroid, AttrNumber attnum) if (attnum != InvalidAttrNumber) { Form_pg_partition_encoding ppe = - (Form_pg_partition_encoding)GETSTRUCT(tup); + (Form_pg_partition_encoding) GETSTRUCT(tup); if (ppe->parencattnum != attnum) continue; @@ -9149,7 +9340,7 @@ remove_partition_encoding_by_key(Oid relid, AttrNumber attnum) SnapshotNow, 1, &scankey); while 
(HeapTupleIsValid(tup = systable_getnext(sscan))) { - Form_pg_partition part = (Form_pg_partition)GETSTRUCT(tup); + Form_pg_partition part = (Form_pg_partition) GETSTRUCT(tup); if (part->paristemplate) remove_partition_encoding_entry(HeapTupleGetOid(tup), attnum); @@ -9181,16 +9372,16 @@ RemovePartitionEncodingByRelidAttribute(Oid relid, AttrNumber attnum) static void add_template_encoding_clauses(Oid relid, Oid paroid, List *stenc) { - ListCell *lc; + ListCell *lc; foreach(lc, stenc) { ColumnReferenceStorageDirective *c = lfirst(lc); - AttrNumber attnum; + AttrNumber attnum; /* - * Don't store default clauses since we have no need of them - * when we add partitions later. + * Don't store default clauses since we have no need of them when we + * add partitions later. */ if (c->deflt) continue; @@ -9199,7 +9390,7 @@ add_template_encoding_clauses(Oid relid, Oid paroid, List *stenc) Insist(attnum > 0); - add_partition_encoding(relid, paroid, attnum, c->encoding); + add_partition_encoding(relid, paroid, attnum, c->encoding); } } @@ -9213,10 +9404,9 @@ get_partition_encoding_attoptions(Relation rel, Oid paroid) Datum *opts; /* - * XXX XXX: should be FOR UPDATE ? why ? probably should be an - * AccessShare + * XXX XXX: should be FOR UPDATE ? why ? probably should be an AccessShare */ - pgpeenc = heap_open(PartitionEncodingRelationId, RowExclusiveLock); + pgpeenc = heap_open(PartitionEncodingRelationId, RowExclusiveLock); opts = palloc0(sizeof(Datum) * RelationGetNumberOfAttributes(rel)); @@ -9228,11 +9418,11 @@ get_partition_encoding_attoptions(Relation rel, Oid paroid) SnapshotNow, 1, &scankey); while (HeapTupleIsValid(tup = systable_getnext(sscan))) { - Datum paroptions; - AttrNumber attnum; - bool isnull; + Datum paroptions; + AttrNumber attnum; + bool isnull; - attnum = ((Form_pg_partition_encoding)GETSTRUCT(tup))->parencattnum; + attnum = ((Form_pg_partition_encoding) GETSTRUCT(tup))->parencattnum; paroptions = heap_getattr(tup, Anum_pg_partition_encoding_parencattoptions, RelationGetDescr(pgpeenc), @@ -9253,17 +9443,17 @@ get_partition_encoding_attoptions(Relation rel, Oid paroid) static List * get_deparsed_partition_encodings(Oid relid, Oid paroid) { - int i; - List *out = NIL; - Relation rel = heap_open(relid, AccessShareLock); - Datum *opts = get_partition_encoding_attoptions(rel, paroid); + int i; + List *out = NIL; + Relation rel = heap_open(relid, AccessShareLock); + Datum *opts = get_partition_encoding_attoptions(rel, paroid); for (i = 0; i < RelationGetNumberOfAttributes(rel); i++) { if (opts[i] && !rel->rd_att->attrs[i]->attisdropped) { ColumnReferenceStorageDirective *c = - makeNode(ColumnReferenceStorageDirective); + makeNode(ColumnReferenceStorageDirective); c->encoding = untransformRelOptions(opts[i]); c->column = get_attname(relid, i + 1); @@ -9284,16 +9474,18 @@ get_deparsed_partition_encodings(Oid relid, Oid paroid) * * Result is allocated in the current memory context. 
*/ -char* +char * DebugPartitionOid(Datum *elements, int n) { StringInfoData str; + initStringInfo(&str); appendStringInfo(&str, "{"); - for (int i=0; ipartsAndRules, partOid); if (NULL != *partsAndRules) { - // accessMethods define the lookup access methods for partitions, one for each level + /* + * accessMethods define the lookup access methods for partitions, + * one for each level + */ *accessMethods = metadata->accessMethods; return; } @@ -9357,10 +9554,12 @@ findPartitionNodeEntry(PartitionNode *partitionNode, Oid partOid) * intermediate node */ PartitionNode *childNode = NULL; - ListCell *lcChild = NULL; - foreach (lcChild, partitionNode->rules) + ListCell *lcChild = NULL; + + foreach(lcChild, partitionNode->rules) { PartitionRule *childRule = (PartitionRule *) lfirst(lcChild); + childNode = findPartitionNodeEntry(childRule->children, partOid); if (NULL != childNode) { diff --git a/src/backend/cdb/cdbpath.c b/src/backend/cdb/cdbpath.c index ec20f2c588..f932359633 100644 --- a/src/backend/cdb/cdbpath.c +++ b/src/backend/cdb/cdbpath.c @@ -14,11 +14,11 @@ #include "postgres.h" #include "catalog/pg_operator.h" -#include "catalog/pg_proc.h" /* CDB_PROC_TIDTOI8 */ -#include "catalog/pg_type.h" /* INT8OID */ -#include "nodes/makefuncs.h" /* makeFuncExpr() */ -#include "nodes/relation.h" /* PlannerInfo, RelOptInfo, CdbRelDedupInfo */ -#include "optimizer/cost.h" /* cpu_tuple_cost */ +#include "catalog/pg_proc.h" /* CDB_PROC_TIDTOI8 */ +#include "catalog/pg_type.h" /* INT8OID */ +#include "nodes/makefuncs.h" /* makeFuncExpr() */ +#include "nodes/relation.h" /* PlannerInfo, RelOptInfo, CdbRelDedupInfo */ +#include "optimizer/cost.h" /* cpu_tuple_cost */ #include "optimizer/pathnode.h" /* Path, pathnode_walker() */ #include "optimizer/paths.h" #include "optimizer/planmain.h" @@ -28,14 +28,14 @@ #include "utils/syscache.h" -#include "cdb/cdbdef.h" /* CdbSwap() */ -#include "cdb/cdbhash.h" /* isGreenplumDbHashable() */ +#include "cdb/cdbdef.h" /* CdbSwap() */ +#include "cdb/cdbhash.h" /* isGreenplumDbHashable() */ -#include "cdb/cdbpath.h" /* me */ +#include "cdb/cdbpath.h" /* me */ #include "cdb/cdbvars.h" -#ifdef small /* might #define small */ -#undef small /* but I want it for a variable name */ +#ifdef small /* might #define small */ +#undef small /* but I want it for a variable name */ #endif @@ -46,23 +46,23 @@ void cdbpath_cost_motion(PlannerInfo *root, CdbMotionPath *motionpath) { - Path *subpath = motionpath->subpath; - Cost cost_per_row; - Cost motioncost; - double recvrows; - double sendrows; + Path *subpath = motionpath->subpath; + Cost cost_per_row; + Cost motioncost; + double recvrows; + double sendrows; - cost_per_row = (gp_motion_cost_per_row > 0.0) - ? gp_motion_cost_per_row - : 2.0 * cpu_tuple_cost; - sendrows = cdbpath_rows(root, subpath); - recvrows = cdbpath_rows(root, (Path *)motionpath); - motioncost = cost_per_row * 0.5 * (sendrows + recvrows); + cost_per_row = (gp_motion_cost_per_row > 0.0) + ? 
gp_motion_cost_per_row + : 2.0 * cpu_tuple_cost; + sendrows = cdbpath_rows(root, subpath); + recvrows = cdbpath_rows(root, (Path *) motionpath); + motioncost = cost_per_row * 0.5 * (sendrows + recvrows); - motionpath->path.total_cost = motioncost + subpath->total_cost; - motionpath->path.startup_cost = subpath->startup_cost; - motionpath->path.memory = subpath->memory; -} /* cdbpath_cost_motion */ + motionpath->path.total_cost = motioncost + subpath->total_cost; + motionpath->path.startup_cost = subpath->startup_cost; + motionpath->path.memory = subpath->memory; +} /* cdbpath_cost_motion */ /* @@ -84,185 +84,188 @@ cdbpath_cost_motion(PlannerInfo *root, CdbMotionPath *motionpath) * pathkeys otherwise (the usual case). */ Path * -cdbpath_create_motion_path(PlannerInfo *root, - Path *subpath, - List *pathkeys, - bool require_existing_order, - CdbPathLocus locus) +cdbpath_create_motion_path(PlannerInfo *root, + Path *subpath, + List *pathkeys, + bool require_existing_order, + CdbPathLocus locus) { - CdbMotionPath *pathnode; - - UnusedArg(root); - Assert(cdbpathlocus_is_valid(locus) && - cdbpathlocus_is_valid(subpath->locus)); - - /* Moving subpath output to a single executor process (qDisp or qExec)? */ - if (CdbPathLocus_IsBottleneck(locus)) - { - /* entry-->entry or singleQE-->singleQE? No motion needed. */ - if (CdbPathLocus_IsEqual(subpath->locus, locus)) - return subpath; - - /* entry-->singleQE? Don't move. Slice's QE will run on entry db. */ - if (CdbPathLocus_IsEntry(subpath->locus)) - return subpath; - - /* singleQE-->entry? Don't move. Slice's QE will run on entry db. */ - if (CdbPathLocus_IsSingleQE(subpath->locus)) - { - /* Create CdbMotionPath node to indicate that the slice must be - * dispatched to a singleton gang running on the entry db. We - * merely use this node to note that the path has 'Entry' locus; - * no corresponding Motion node will be created in the Plan tree. - */ - Assert(CdbPathLocus_IsEntry(locus)); - - pathnode = makeNode(CdbMotionPath); - pathnode->path.pathtype = T_Motion; - pathnode->path.parent = subpath->parent; - pathnode->path.locus = locus; - pathnode->path.pathkeys = pathkeys; - pathnode->subpath = subpath; - - /* Costs, etc, are same as subpath. */ - pathnode->path.startup_cost = subpath->total_cost; - pathnode->path.total_cost = subpath->total_cost; - pathnode->path.memory = subpath->memory; - pathnode->path.motionHazard = subpath->motionHazard; - pathnode->path.rescannable = subpath->rescannable; - return (Path *)pathnode; - } - - /* No motion needed if subpath can run anywhere giving same output. */ - if (CdbPathLocus_IsGeneral(subpath->locus)) - return subpath; - - /* Fail if caller refuses motion. */ - if (require_existing_order && - !pathkeys) - return NULL; - - /* replicated-->singleton would give redundant copies of the rows. */ - if (CdbPathLocus_IsReplicated(subpath->locus)) - goto invalid_motion_request; - - /* Must be partitioned-->singleton. - * If caller gave pathkeys, they'll be used for Merge Receive. - * If no pathkeys, Union Receive will arbitrarily interleave - * the rows from the subpath partitions in no special order. - */ - if (!CdbPathLocus_IsPartitioned(subpath->locus)) - goto invalid_motion_request; - } - - /* Output from a single process to be distributed over a gang? 
*/ - else if (CdbPathLocus_IsBottleneck(subpath->locus)) - { - /* Must be bottleneck-->partitioned or bottleneck-->replicated */ - if (!CdbPathLocus_IsPartitioned(locus) && - !CdbPathLocus_IsReplicated(locus)) - goto invalid_motion_request; - - /* Fail if caller disallows motion. */ - if (require_existing_order && - !pathkeys) - return NULL; - - /* Each qExec receives a subset of the rows, with ordering preserved. */ - pathkeys = subpath->pathkeys; - } - - /* Redistributing partitioned subpath output from one gang to another? */ - else if (CdbPathLocus_IsPartitioned(subpath->locus)) - { - /* partitioned-->partitioned? */ - if (CdbPathLocus_IsPartitioned(locus)) - { - /* No motion if subpath partitioning matches caller's request. */ - if (cdbpathlocus_compare(CdbPathLocus_Comparison_Equal, subpath->locus, locus)) - return subpath; - } - - /* Must be partitioned-->replicated */ - else if (!CdbPathLocus_IsReplicated(locus)) - goto invalid_motion_request; - - /* Fail if caller insists on ordered result or no motion. */ - if (require_existing_order) - return NULL; - - /* Output streams lose any ordering they had. - * Only a qDisp or singleton qExec can merge sorted streams (for now). - */ - pathkeys = NIL; - } - - /* If subplan uses no tables, it can run on qDisp or a singleton qExec. */ - else if (CdbPathLocus_IsGeneral(subpath->locus)) - { - /* No motion needed if general-->general or general-->replicated. */ - if (CdbPathLocus_IsGeneral(locus) || - CdbPathLocus_IsReplicated(locus)) - return subpath; - - /* Must be general-->partitioned. */ - if (!CdbPathLocus_IsPartitioned(locus)) - goto invalid_motion_request; - - /* Fail if caller wants no motion. */ - if (require_existing_order && - !pathkeys) - return NULL; - - /* Since the motion is 1-to-many, the rows remain in the same order. */ - pathkeys = subpath->pathkeys; - } - - /* Does subpath produce same multiset of rows on every qExec of its gang? */ - else if (CdbPathLocus_IsReplicated(subpath->locus)) - { - /* No-op if replicated-->replicated. */ - if (CdbPathLocus_IsReplicated(locus)) - return subpath; - - /* Other destinations aren't used or supported at present. */ - goto invalid_motion_request; - } - else - goto invalid_motion_request; - - /* Don't materialize before motion. */ - if (IsA(subpath, MaterialPath)) - subpath = ((MaterialPath *)subpath)->subpath; + CdbMotionPath *pathnode; + + UnusedArg(root); + Assert(cdbpathlocus_is_valid(locus) && + cdbpathlocus_is_valid(subpath->locus)); + + /* Moving subpath output to a single executor process (qDisp or qExec)? */ + if (CdbPathLocus_IsBottleneck(locus)) + { + /* entry-->entry or singleQE-->singleQE? No motion needed. */ + if (CdbPathLocus_IsEqual(subpath->locus, locus)) + return subpath; + + /* entry-->singleQE? Don't move. Slice's QE will run on entry db. */ + if (CdbPathLocus_IsEntry(subpath->locus)) + return subpath; + + /* singleQE-->entry? Don't move. Slice's QE will run on entry db. */ + if (CdbPathLocus_IsSingleQE(subpath->locus)) + { + /* + * Create CdbMotionPath node to indicate that the slice must be + * dispatched to a singleton gang running on the entry db. We + * merely use this node to note that the path has 'Entry' locus; + * no corresponding Motion node will be created in the Plan tree. 
+ */ + Assert(CdbPathLocus_IsEntry(locus)); + + pathnode = makeNode(CdbMotionPath); + pathnode->path.pathtype = T_Motion; + pathnode->path.parent = subpath->parent; + pathnode->path.locus = locus; + pathnode->path.pathkeys = pathkeys; + pathnode->subpath = subpath; + + /* Costs, etc, are same as subpath. */ + pathnode->path.startup_cost = subpath->total_cost; + pathnode->path.total_cost = subpath->total_cost; + pathnode->path.memory = subpath->memory; + pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.rescannable = subpath->rescannable; + return (Path *) pathnode; + } + + /* No motion needed if subpath can run anywhere giving same output. */ + if (CdbPathLocus_IsGeneral(subpath->locus)) + return subpath; + + /* Fail if caller refuses motion. */ + if (require_existing_order && + !pathkeys) + return NULL; + + /* replicated-->singleton would give redundant copies of the rows. */ + if (CdbPathLocus_IsReplicated(subpath->locus)) + goto invalid_motion_request; + + /* + * Must be partitioned-->singleton. If caller gave pathkeys, they'll + * be used for Merge Receive. If no pathkeys, Union Receive will + * arbitrarily interleave the rows from the subpath partitions in no + * special order. + */ + if (!CdbPathLocus_IsPartitioned(subpath->locus)) + goto invalid_motion_request; + } + + /* Output from a single process to be distributed over a gang? */ + else if (CdbPathLocus_IsBottleneck(subpath->locus)) + { + /* Must be bottleneck-->partitioned or bottleneck-->replicated */ + if (!CdbPathLocus_IsPartitioned(locus) && + !CdbPathLocus_IsReplicated(locus)) + goto invalid_motion_request; + + /* Fail if caller disallows motion. */ + if (require_existing_order && + !pathkeys) + return NULL; + + /* Each qExec receives a subset of the rows, with ordering preserved. */ + pathkeys = subpath->pathkeys; + } + + /* Redistributing partitioned subpath output from one gang to another? */ + else if (CdbPathLocus_IsPartitioned(subpath->locus)) + { + /* partitioned-->partitioned? */ + if (CdbPathLocus_IsPartitioned(locus)) + { + /* No motion if subpath partitioning matches caller's request. */ + if (cdbpathlocus_compare(CdbPathLocus_Comparison_Equal, subpath->locus, locus)) + return subpath; + } + + /* Must be partitioned-->replicated */ + else if (!CdbPathLocus_IsReplicated(locus)) + goto invalid_motion_request; + + /* Fail if caller insists on ordered result or no motion. */ + if (require_existing_order) + return NULL; + + /* + * Output streams lose any ordering they had. Only a qDisp or + * singleton qExec can merge sorted streams (for now). + */ + pathkeys = NIL; + } + + /* If subplan uses no tables, it can run on qDisp or a singleton qExec. */ + else if (CdbPathLocus_IsGeneral(subpath->locus)) + { + /* No motion needed if general-->general or general-->replicated. */ + if (CdbPathLocus_IsGeneral(locus) || + CdbPathLocus_IsReplicated(locus)) + return subpath; + + /* Must be general-->partitioned. */ + if (!CdbPathLocus_IsPartitioned(locus)) + goto invalid_motion_request; + + /* Fail if caller wants no motion. */ + if (require_existing_order && + !pathkeys) + return NULL; + + /* Since the motion is 1-to-many, the rows remain in the same order. */ + pathkeys = subpath->pathkeys; + } + + /* Does subpath produce same multiset of rows on every qExec of its gang? */ + else if (CdbPathLocus_IsReplicated(subpath->locus)) + { + /* No-op if replicated-->replicated. */ + if (CdbPathLocus_IsReplicated(locus)) + return subpath; + + /* Other destinations aren't used or supported at present. 
*/ + goto invalid_motion_request; + } + else + goto invalid_motion_request; + + /* Don't materialize before motion. */ + if (IsA(subpath, MaterialPath)) + subpath = ((MaterialPath *) subpath)->subpath; /* - * MPP-3300: materialize *before* motion can never help us, motion - * pushes data. other nodes pull. We relieve motion deadlocks by - * adding materialize nodes on top of motion nodes + * MPP-3300: materialize *before* motion can never help us, motion pushes + * data. other nodes pull. We relieve motion deadlocks by adding + * materialize nodes on top of motion nodes */ - /* Create CdbMotionPath node. */ - pathnode = makeNode(CdbMotionPath); - pathnode->path.pathtype = T_Motion; - pathnode->path.parent = subpath->parent; - pathnode->path.locus = locus; - pathnode->path.pathkeys = pathkeys; - pathnode->subpath = subpath; + /* Create CdbMotionPath node. */ + pathnode = makeNode(CdbMotionPath); + pathnode->path.pathtype = T_Motion; + pathnode->path.parent = subpath->parent; + pathnode->path.locus = locus; + pathnode->path.pathkeys = pathkeys; + pathnode->subpath = subpath; - /* Cost of motion */ - cdbpath_cost_motion(root, pathnode); + /* Cost of motion */ + cdbpath_cost_motion(root, pathnode); - /* Tell operators above us that slack may be needed for deadlock safety. */ - pathnode->path.motionHazard = true; - pathnode->path.rescannable = false; + /* Tell operators above us that slack may be needed for deadlock safety. */ + pathnode->path.motionHazard = true; + pathnode->path.rescannable = false; - return (Path *)pathnode; + return (Path *) pathnode; - /* Unexpected source or destination locus. */ + /* Unexpected source or destination locus. */ invalid_motion_request: - Assert(0); - return NULL; -} /* cdbpath_create_motion_path */ + Assert(0); + return NULL; +} /* cdbpath_create_motion_path */ /* * cdbpath_match_preds_to_partkey_tail @@ -277,11 +280,11 @@ invalid_motion_request: typedef struct { - PlannerInfo *root; - List *mergeclause_list; - CdbPathLocus locus; - CdbPathLocus *colocus; - bool colocus_eq_locus; + PlannerInfo *root; + List *mergeclause_list; + CdbPathLocus locus; + CdbPathLocus *colocus; + bool colocus_eq_locus; } CdbpathMatchPredsContext; @@ -426,7 +429,7 @@ cdbpath_match_preds_to_partkey_tail(CdbpathMatchPredsContext *ctx, } } return true; -} /* cdbpath_match_preds_to_partkey_tail */ +} /* cdbpath_match_preds_to_partkey_tail */ @@ -443,30 +446,30 @@ cdbpath_match_preds_to_partkey_tail(CdbpathMatchPredsContext *ctx, * find_mergeclauses_for_pathkeys() in pathkeys.c */ static bool -cdbpath_match_preds_to_partkey(PlannerInfo *root, - List *mergeclause_list, - CdbPathLocus locus, - CdbPathLocus *colocus) /* OUT */ +cdbpath_match_preds_to_partkey(PlannerInfo *root, + List *mergeclause_list, + CdbPathLocus locus, + CdbPathLocus *colocus) /* OUT */ { - CdbpathMatchPredsContext ctx; + CdbpathMatchPredsContext ctx; - if (!CdbPathLocus_IsHashed(locus) && - !CdbPathLocus_IsHashedOJ(locus)) - return false; + if (!CdbPathLocus_IsHashed(locus) && + !CdbPathLocus_IsHashedOJ(locus)) + return false; - Assert(cdbpathlocus_is_valid(locus)); + Assert(cdbpathlocus_is_valid(locus)); - ctx.root = root; - ctx.mergeclause_list = mergeclause_list; - ctx.locus = locus; - ctx.colocus = colocus; - ctx.colocus_eq_locus = true; + ctx.root = root; + ctx.mergeclause_list = mergeclause_list; + ctx.locus = locus; + ctx.colocus = colocus; + ctx.colocus_eq_locus = true; if (CdbPathLocus_IsHashed(locus)) return cdbpath_match_preds_to_partkey_tail(&ctx, list_head(locus.partkey_h)); else return 
cdbpath_match_preds_to_partkey_tail(&ctx, list_head(locus.partkey_oj)); -} /* cdbpath_match_preds_to_partkey */ +} /* cdbpath_match_preds_to_partkey */ /* @@ -481,25 +484,25 @@ cdbpath_match_preds_to_partkey(PlannerInfo *root, * find_mergeclauses_for_pathkeys() in pathkeys.c */ static bool -cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, - List *mergeclause_list, - CdbPathLocus outer_locus, - CdbPathLocus inner_locus) +cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, + List *mergeclause_list, + CdbPathLocus outer_locus, + CdbPathLocus inner_locus) { - ListCell *outercell; - ListCell *innercell; + ListCell *outercell; + ListCell *innercell; List *outer_partkey; List *inner_partkey; - if (!mergeclause_list || + if (!mergeclause_list || CdbPathLocus_Degree(outer_locus) == 0 || CdbPathLocus_Degree(inner_locus) == 0 || - CdbPathLocus_Degree(outer_locus) != CdbPathLocus_Degree(inner_locus)) - return false; + CdbPathLocus_Degree(outer_locus) != CdbPathLocus_Degree(inner_locus)) + return false; - Assert(CdbPathLocus_IsHashed(outer_locus) || - CdbPathLocus_IsHashedOJ(outer_locus)); - Assert(CdbPathLocus_IsHashed(inner_locus) || - CdbPathLocus_IsHashedOJ(inner_locus)); + Assert(CdbPathLocus_IsHashed(outer_locus) || + CdbPathLocus_IsHashedOJ(outer_locus)); + Assert(CdbPathLocus_IsHashed(inner_locus) || + CdbPathLocus_IsHashedOJ(inner_locus)); if (CdbPathLocus_IsHashed(outer_locus)) outer_partkey = outer_locus.partkey_h; @@ -511,33 +514,37 @@ cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, else inner_partkey = inner_locus.partkey_oj; - forboth(outercell, outer_partkey, innercell, inner_partkey) - { - List *outersublist = (List *)lfirst(outercell); - List *innersublist = (List *)lfirst(innercell); - ListCell *rcell; - foreach(rcell, mergeclause_list) - { - bool not_found = false; - RestrictInfo *rinfo = (RestrictInfo *)lfirst(rcell); - - if (!rinfo->left_ec) - cache_mergeclause_eclasses(root, rinfo); - - /* Skip predicate if neither side matches outer partkey item. */ - if (CdbPathLocus_IsHashed(outer_locus)) - { - PathKey *pathkey = (PathKey *) outersublist; + forboth(outercell, outer_partkey, innercell, inner_partkey) + { + List *outersublist = (List *) lfirst(outercell); + List *innersublist = (List *) lfirst(innercell); + ListCell *rcell; + + foreach(rcell, mergeclause_list) + { + bool not_found = false; + RestrictInfo *rinfo = (RestrictInfo *) lfirst(rcell); + + if (!rinfo->left_ec) + cache_mergeclause_eclasses(root, rinfo); + + /* Skip predicate if neither side matches outer partkey item. */ + if (CdbPathLocus_IsHashed(outer_locus)) + { + PathKey *pathkey = (PathKey *) outersublist; + if (pathkey->pk_eclass != rinfo->left_ec && pathkey->pk_eclass != rinfo->right_ec) continue; - } - else - { + } + else + { Assert(CdbPathLocus_IsHashedOJ(outer_locus)); - ListCell *i; + ListCell *i; + foreach(i, outersublist) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); + if (pathkey->pk_eclass != rinfo->left_ec && pathkey->pk_eclass != rinfo->right_ec) { not_found = true; @@ -546,24 +553,28 @@ cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, } if (not_found) continue; - } - - /* Skip predicate if neither side matches inner partkey item. */ - if (innersublist == outersublist) - {} /* do nothing */ - else if (CdbPathLocus_IsHashed(inner_locus)) - { - PathKey *pathkey = (PathKey *) innersublist; + } + + /* Skip predicate if neither side matches inner partkey item. 
*/ + if (innersublist == outersublist) + { + } /* do nothing */ + else if (CdbPathLocus_IsHashed(inner_locus)) + { + PathKey *pathkey = (PathKey *) innersublist; + if (pathkey->pk_eclass != rinfo->left_ec && pathkey->pk_eclass != rinfo->right_ec) continue; - } - else - { - Assert(CdbPathLocus_IsHashedOJ(inner_locus)); - ListCell *i; + } + else + { + Assert(CdbPathLocus_IsHashedOJ(inner_locus)); + ListCell *i; + foreach(i, innersublist) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); + if (pathkey->pk_eclass != rinfo->left_ec && pathkey->pk_eclass != rinfo->right_ec) { not_found = true; @@ -572,18 +583,18 @@ cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, } if (not_found) continue; - } + } - /* Found equijoin between outer partkey item & inner partkey item */ - break; - } + /* Found equijoin between outer partkey item & inner partkey item */ + break; + } - /* Fail if didn't find equijoin between this pair of partkey items. */ - if (!rcell) - return false; - } - return true; -} /* cdbpath_match_preds_to_both_partkeys */ + /* Fail if didn't find equijoin between this pair of partkey items. */ + if (!rcell) + return false; + } + return true; +} /* cdbpath_match_preds_to_both_partkeys */ @@ -596,7 +607,7 @@ cdbpath_match_preds_to_both_partkeys(PlannerInfo *root, static bool cdbpath_eclass_isGreenplumDbHashable(EquivalenceClass *ec) { - ListCell *j; + ListCell *j; foreach(j, ec->ec_members) { @@ -604,7 +615,7 @@ cdbpath_eclass_isGreenplumDbHashable(EquivalenceClass *ec) /* Fail on non-hashable expression types */ if (!isGreenplumDbHashable(exprType((Node *) em->em_expr))) - return false; + return false; } return true; @@ -623,79 +634,82 @@ cdbpath_eclass_isGreenplumDbHashable(EquivalenceClass *ec) * make_pathkeys_for_mergeclauses() in pathkeys.c */ static bool -cdbpath_partkeys_from_preds(PlannerInfo *root, - List *mergeclause_list, - Path *a_path, - CdbPathLocus *a_locus, /* OUT */ - CdbPathLocus *b_locus) /* OUT */ +cdbpath_partkeys_from_preds(PlannerInfo *root, + List *mergeclause_list, + Path *a_path, + CdbPathLocus *a_locus, /* OUT */ + CdbPathLocus *b_locus) /* OUT */ { - List *a_partkey = NIL; - List *b_partkey = NIL; - ListCell *rcell; - - foreach(rcell, mergeclause_list) - { - RestrictInfo *rinfo = (RestrictInfo *) lfirst(rcell); - - if (!rinfo->left_ec) - { - cache_mergeclause_eclasses(root, rinfo); - Assert(rinfo->left_ec); - } - - /* - * skip non-hashable keys - */ - if (!cdbpath_eclass_isGreenplumDbHashable(rinfo->left_ec) || - !cdbpath_eclass_isGreenplumDbHashable(rinfo->right_ec)) - { - continue; - } - - /* Left & right pathkeys are usually the same... */ - if (!b_partkey && rinfo->left_ec == rinfo->right_ec) - { - ListCell *i; + List *a_partkey = NIL; + List *b_partkey = NIL; + ListCell *rcell; + + foreach(rcell, mergeclause_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(rcell); + + if (!rinfo->left_ec) + { + cache_mergeclause_eclasses(root, rinfo); + Assert(rinfo->left_ec); + } + + /* + * skip non-hashable keys + */ + if (!cdbpath_eclass_isGreenplumDbHashable(rinfo->left_ec) || + !cdbpath_eclass_isGreenplumDbHashable(rinfo->right_ec)) + { + continue; + } + + /* Left & right pathkeys are usually the same... */ + if (!b_partkey && rinfo->left_ec == rinfo->right_ec) + { + ListCell *i; foreach(i, a_partkey) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); + if (pathkey->pk_eclass == rinfo->left_ec) a_partkey = lappend(a_partkey, rinfo->left_ec); } - } + } - /* ... 
except in outer join ON-clause. */ - else - { + /* ... except in outer join ON-clause. */ + else + { EquivalenceClass *a_ec; EquivalenceClass *b_ec; ListCell *i; bool found = false; - if (bms_is_subset(rinfo->right_relids, a_path->parent->relids)) - { - a_ec = rinfo->right_ec; - b_ec = rinfo->left_ec; - } - else - { - a_ec = rinfo->left_ec; - b_ec = rinfo->right_ec; - Assert(bms_is_subset(rinfo->left_relids, a_path->parent->relids)); - } + if (bms_is_subset(rinfo->right_relids, a_path->parent->relids)) + { + a_ec = rinfo->right_ec; + b_ec = rinfo->left_ec; + } + else + { + a_ec = rinfo->left_ec; + b_ec = rinfo->right_ec; + Assert(bms_is_subset(rinfo->left_relids, a_path->parent->relids)); + } if (!b_ec) b_ec = a_ec; /* - * Convoluted logic to ensure that (a_ec not in a_partkey) AND (b_ec not in b_partkey) + * Convoluted logic to ensure that (a_ec not in a_partkey) AND + * (b_ec not in b_partkey) */ found = false; foreach(i, a_partkey) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); + if (pathkey->pk_eclass == a_ec) { found = true; @@ -706,7 +720,8 @@ cdbpath_partkeys_from_preds(PlannerInfo *root, { foreach(i, b_partkey) { - PathKey *pathkey = (PathKey *) lfirst(i); + PathKey *pathkey = (PathKey *) lfirst(i); + if (pathkey->pk_eclass == b_ec) { found = true; @@ -716,29 +731,29 @@ cdbpath_partkeys_from_preds(PlannerInfo *root, } if (!found) - { - PathKey *a_pk = makePathKeyForEC(a_ec); - PathKey *b_pk = makePathKeyForEC(b_ec); + { + PathKey *a_pk = makePathKeyForEC(a_ec); + PathKey *b_pk = makePathKeyForEC(b_ec); - a_partkey = lappend(a_partkey, a_pk); - b_partkey = lappend(b_partkey, b_pk); - } - } + a_partkey = lappend(a_partkey, a_pk); + b_partkey = lappend(b_partkey, b_pk); + } + } - if (list_length(a_partkey) >= 20) - break; - } + if (list_length(a_partkey) >= 20) + break; + } - if (!a_partkey) - return false; + if (!a_partkey) + return false; - CdbPathLocus_MakeHashed(a_locus, a_partkey); - if (b_partkey) - CdbPathLocus_MakeHashed(b_locus, b_partkey); - else - *b_locus = *a_locus; - return true; -} /* cdbpath_partkeys_from_preds */ + CdbPathLocus_MakeHashed(a_locus, a_partkey); + if (b_partkey) + CdbPathLocus_MakeHashed(b_locus, b_partkey); + else + *b_locus = *a_locus; + return true; +} /* cdbpath_partkeys_from_preds */ /* @@ -755,342 +770,350 @@ cdbpath_partkeys_from_preds(PlannerInfo *root, typedef struct { - CdbPathLocus locus; - CdbPathLocus move_to; - double bytes; - Path *path; - bool ok_to_replicate; - bool require_existing_order; - bool has_wts; /* Does the rel have WorkTableScan? */ + CdbPathLocus locus; + CdbPathLocus move_to; + double bytes; + Path *path; + bool ok_to_replicate; + bool require_existing_order; + bool has_wts; /* Does the rel have WorkTableScan? 
*/ } CdbpathMfjRel; CdbPathLocus -cdbpath_motion_for_join(PlannerInfo *root, - JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ - Path **p_outer_path, /* INOUT */ - Path **p_inner_path, /* INOUT */ - List *mergeclause_list, /* equijoin RestrictInfo list */ - List *outer_pathkeys, - List *inner_pathkeys, - bool outer_require_existing_order, - bool inner_require_existing_order) +cdbpath_motion_for_join(PlannerInfo *root, + JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ + Path **p_outer_path, /* INOUT */ + Path **p_inner_path, /* INOUT */ + List *mergeclause_list, /* equijoin RestrictInfo list */ + List *outer_pathkeys, + List *inner_pathkeys, + bool outer_require_existing_order, + bool inner_require_existing_order) { - CdbpathMfjRel outer; - CdbpathMfjRel inner; - - outer.path = *p_outer_path; - inner.path = *p_inner_path; - outer.locus = outer.path->locus; - inner.locus = inner.path->locus; - CdbPathLocus_MakeNull(&outer.move_to); - CdbPathLocus_MakeNull(&inner.move_to); - - Assert(cdbpathlocus_is_valid(outer.locus) && - cdbpathlocus_is_valid(inner.locus)); - - outer.has_wts = cdbpath_contains_wts(outer.path); - inner.has_wts = cdbpath_contains_wts(inner.path); - - /* For now, inner path should not contain WorkTableScan */ - Assert(!inner.has_wts); - - /* - * If outer rel contains WorkTableScan and inner rel is hash - * distributed, unfortunately we have to pretend that inner - * is randomly distributed, otherwise we may end up with - * redistributing outer rel. - */ - if (outer.has_wts && CdbPathLocus_Degree(inner.locus) != 0) + CdbpathMfjRel outer; + CdbpathMfjRel inner; + + outer.path = *p_outer_path; + inner.path = *p_inner_path; + outer.locus = outer.path->locus; + inner.locus = inner.path->locus; + CdbPathLocus_MakeNull(&outer.move_to); + CdbPathLocus_MakeNull(&inner.move_to); + + Assert(cdbpathlocus_is_valid(outer.locus) && + cdbpathlocus_is_valid(inner.locus)); + + outer.has_wts = cdbpath_contains_wts(outer.path); + inner.has_wts = cdbpath_contains_wts(inner.path); + + /* For now, inner path should not contain WorkTableScan */ + Assert(!inner.has_wts); + + /* + * If outer rel contains WorkTableScan and inner rel is hash distributed, + * unfortunately we have to pretend that inner is randomly distributed, + * otherwise we may end up with redistributing outer rel. + */ + if (outer.has_wts && CdbPathLocus_Degree(inner.locus) != 0) CdbPathLocus_MakeStrewn(&inner.locus); - /* Caller can specify an ordering for each source path that is - * the same as or weaker than the path's existing ordering. - * Caller may insist that we do not add motion that would - * lose the specified ordering property; otherwise the given - * ordering is preferred but not required. - * A required NIL ordering means no motion is allowed for that path. - */ - outer.require_existing_order = outer_require_existing_order; - inner.require_existing_order = inner_require_existing_order; - - /* Don't consider replicating the preserved rel of an outer join, or - * the current-query rel of a join between current query and subquery. - * - * Path that contains WorkTableScan cannot be replicated. 
- */ - outer.ok_to_replicate = !outer.has_wts; - inner.ok_to_replicate = true; - switch (jointype) - { - case JOIN_INNER: - break; - case JOIN_SEMI: - case JOIN_ANTI: - case JOIN_LEFT: - case JOIN_LASJ_NOTIN: - outer.ok_to_replicate = false; - break; - case JOIN_RIGHT: - inner.ok_to_replicate = false; - break; - case JOIN_FULL: - outer.ok_to_replicate = false; - inner.ok_to_replicate = false; - break; - default: - Assert(0); - } - - /* Get rel sizes. */ - outer.bytes = cdbpath_rows(root, outer.path) * outer.path->parent->width; - inner.bytes = cdbpath_rows(root, inner.path) * inner.path->parent->width; - - /* - * Motion not needed if either source is everywhere (e.g. a constant). - * - * But if a row is everywhere and is preserved in an outer join, we - * don't want to preserve it in every qExec process where it is - * unmatched, because that would produce duplicate null-augmented rows. - * So in that case, bring all the partitions to a single qExec to be joined. - * CDB TODO: Can this case be handled without introducing a bottleneck? - */ - if (CdbPathLocus_IsGeneral(outer.locus)) - { - if (!outer.ok_to_replicate && - CdbPathLocus_IsPartitioned(inner.locus)) - CdbPathLocus_MakeSingleQE(&inner.move_to); - else - return inner.locus; - } - else if (CdbPathLocus_IsGeneral(inner.locus)) - { - if (!inner.ok_to_replicate && - CdbPathLocus_IsPartitioned(outer.locus)) - CdbPathLocus_MakeSingleQE(&outer.move_to); - else - return outer.locus; - } - - /* - * Is either source confined to a single process? - * NB: Motion to a single process (qDisp or qExec) is the only motion - * in which we may use Merge Receive to preserve an existing ordering. - */ - else if (CdbPathLocus_IsBottleneck(outer.locus) || - CdbPathLocus_IsBottleneck(inner.locus)) - { /* singleQE or entry db */ - CdbpathMfjRel *single = &outer; - CdbpathMfjRel *other = &inner; - bool single_immovable = (outer.require_existing_order && - !outer_pathkeys) || outer.has_wts; - bool other_immovable = inner.require_existing_order && - !inner_pathkeys; - - /* - * If each of the sources has a single-process locus, then assign both - * sources and the join to run in the same process, without motion. - * The slice will be run on the entry db if either source requires it. - */ - if (CdbPathLocus_IsEntry(single->locus)) - { - if (CdbPathLocus_IsBottleneck(other->locus)) - return single->locus; - } - else if (CdbPathLocus_IsSingleQE(single->locus)) - { - if (CdbPathLocus_IsBottleneck(other->locus)) - return other->locus; - } - - /* Let 'single' be the source whose locus is singleQE or entry. */ - else - { - CdbSwap(CdbpathMfjRel*, single, other); - CdbSwap(bool, single_immovable, other_immovable); - } - Assert(CdbPathLocus_IsBottleneck(single->locus)); - Assert(CdbPathLocus_IsPartitioned(other->locus)); - - /* If the bottlenecked rel can't be moved, bring the other rel to it. */ - if (single_immovable) - other->move_to = single->locus; - - /* Redistribute single rel if joining on other rel's partitioning key */ - else if (cdbpath_match_preds_to_partkey(root, - mergeclause_list, - other->locus, - &single->move_to)) /* OUT */ - {} - - /* Replicate single rel if cheaper than redistributing both rels. */ - else if (single->ok_to_replicate && - single->bytes * root->config->cdbpath_segments < single->bytes + other->bytes) - CdbPathLocus_MakeReplicated(&single->move_to); - - /* Redistribute both rels on equijoin cols. 
*/ - else if (!other->require_existing_order && - cdbpath_partkeys_from_preds(root, - mergeclause_list, - single->path, - &single->move_to, /* OUT */ - &other->move_to)) /* OUT */ - {} - - /* No usable equijoin preds, or caller imposed restrictions on motion. - * Replicate single rel if cheaper than bottlenecking other rel. - */ - else if (single->ok_to_replicate && - single->bytes < other->bytes) - CdbPathLocus_MakeReplicated(&single->move_to); - - /* Broadcast single rel if other rel has WorkTableScan */ - else if (single->ok_to_replicate && other->has_wts) - CdbPathLocus_MakeReplicated(&single->move_to); - - /* Last resort: Move all partitions of other rel to single QE. */ - else - other->move_to = single->locus; - } /* singleQE or entry */ - - /* - * Replicated paths shouldn't occur loose, for now. - */ - else if (CdbPathLocus_IsReplicated(outer.locus) || - CdbPathLocus_IsReplicated(inner.locus)) - { - Assert(false); - goto fail; - } - - /* - * No motion if partitioned alike and joining on the partitioning keys. - */ - else if (cdbpath_match_preds_to_both_partkeys(root, mergeclause_list, - outer.locus, inner.locus)) - return cdbpathlocus_join(outer.locus, inner.locus); - - /* - * Kludge used internally for querying catalogs on segment dbs. - * Each QE will join the catalogs that are local to its own segment. - * The catalogs don't have partitioning keys. No motion needed. - */ - else if (CdbPathLocus_IsStrewn(outer.locus) && - CdbPathLocus_IsStrewn(inner.locus) && - cdbpathlocus_querysegmentcatalogs) - return outer.locus; - - /* - * Both sources are partitioned. Redistribute or replicate one or both. - */ - else - { /* partitioned */ - CdbpathMfjRel *large = &outer; - CdbpathMfjRel *small = &inner; - - /* Which rel is bigger? */ - if (large->bytes < small->bytes) - CdbSwap(CdbpathMfjRel*, large, small); - - /* If joining on larger rel's partitioning key, redistribute smaller. */ - if (!small->require_existing_order && - cdbpath_match_preds_to_partkey(root, - mergeclause_list, - large->locus, - &small->move_to)) /* OUT */ - {} - - /* Replicate smaller rel if cheaper than redistributing larger rel. - * But don't replicate a rel that is to be preserved in outer join. - */ - else if (!small->require_existing_order && - small->ok_to_replicate && - small->bytes * root->config->cdbpath_segments < large->bytes) - CdbPathLocus_MakeReplicated(&small->move_to); - - /* If joining on smaller rel's partitioning key, redistribute larger. */ - else if (!large->require_existing_order && - cdbpath_match_preds_to_partkey(root, - mergeclause_list, - small->locus, - &large->move_to)) /* OUT */ - {} - - /* Replicate smaller rel if cheaper than redistributing both rels. */ - else if (!small->require_existing_order && - small->ok_to_replicate && - small->bytes * root->config->cdbpath_segments < large->bytes + small->bytes) - CdbPathLocus_MakeReplicated(&small->move_to); - - /* Redistribute both rels on equijoin cols. */ - else if (!small->require_existing_order && - !large->require_existing_order && - cdbpath_partkeys_from_preds(root, - mergeclause_list, - large->path, - &large->move_to, - &small->move_to)) - {} - - /* No usable equijoin preds, or couldn't consider the preferred motion. - * Replicate one rel if possible. - * MPP TODO: Consider number of seg dbs per host. 
- */ - else if (!small->require_existing_order && - small->ok_to_replicate) - CdbPathLocus_MakeReplicated(&small->move_to); - else if (!large->require_existing_order && - large->ok_to_replicate) - CdbPathLocus_MakeReplicated(&large->move_to); - - /* Last resort: Move both rels to a single qExec. */ - else - { - CdbPathLocus_MakeSingleQE(&outer.move_to); - CdbPathLocus_MakeSingleQE(&inner.move_to); - } - } /* partitioned */ - - /* - * Move outer. - */ - if (!CdbPathLocus_IsNull(outer.move_to)) - { - outer.path = cdbpath_create_motion_path(root, - outer.path, - outer_pathkeys, - outer.require_existing_order, - outer.move_to); - if (!outer.path) /* fail if outer motion not feasible */ - goto fail; - } - - /* - * Move inner. - */ - if (!CdbPathLocus_IsNull(inner.move_to)) - { - inner.path = cdbpath_create_motion_path(root, - inner.path, - inner_pathkeys, - inner.require_existing_order, - inner.move_to); - if (!inner.path) /* fail if inner motion not feasible */ - goto fail; - } - - /* - * Ok to join. Give modified subpaths to caller. - */ - *p_outer_path = outer.path; - *p_inner_path = inner.path; - - /* Tell caller where the join will be done. */ - return cdbpathlocus_join(outer.path->locus, inner.path->locus); - -fail: /* can't do this join */ - CdbPathLocus_MakeNull(&outer.move_to); - return outer.move_to; -} /* cdbpath_motion_for_join */ + /* + * Caller can specify an ordering for each source path that is the same as + * or weaker than the path's existing ordering. Caller may insist that we + * do not add motion that would lose the specified ordering property; + * otherwise the given ordering is preferred but not required. A required + * NIL ordering means no motion is allowed for that path. + */ + outer.require_existing_order = outer_require_existing_order; + inner.require_existing_order = inner_require_existing_order; + + /* + * Don't consider replicating the preserved rel of an outer join, or the + * current-query rel of a join between current query and subquery. + * + * Path that contains WorkTableScan cannot be replicated. + */ + outer.ok_to_replicate = !outer.has_wts; + inner.ok_to_replicate = true; + switch (jointype) + { + case JOIN_INNER: + break; + case JOIN_SEMI: + case JOIN_ANTI: + case JOIN_LEFT: + case JOIN_LASJ_NOTIN: + outer.ok_to_replicate = false; + break; + case JOIN_RIGHT: + inner.ok_to_replicate = false; + break; + case JOIN_FULL: + outer.ok_to_replicate = false; + inner.ok_to_replicate = false; + break; + default: + Assert(0); + } + + /* Get rel sizes. */ + outer.bytes = cdbpath_rows(root, outer.path) * outer.path->parent->width; + inner.bytes = cdbpath_rows(root, inner.path) * inner.path->parent->width; + + /* + * Motion not needed if either source is everywhere (e.g. a constant). + * + * But if a row is everywhere and is preserved in an outer join, we don't + * want to preserve it in every qExec process where it is unmatched, + * because that would produce duplicate null-augmented rows. So in that + * case, bring all the partitions to a single qExec to be joined. CDB + * TODO: Can this case be handled without introducing a bottleneck? 
+ */ + if (CdbPathLocus_IsGeneral(outer.locus)) + { + if (!outer.ok_to_replicate && + CdbPathLocus_IsPartitioned(inner.locus)) + CdbPathLocus_MakeSingleQE(&inner.move_to); + else + return inner.locus; + } + else if (CdbPathLocus_IsGeneral(inner.locus)) + { + if (!inner.ok_to_replicate && + CdbPathLocus_IsPartitioned(outer.locus)) + CdbPathLocus_MakeSingleQE(&outer.move_to); + else + return outer.locus; + } + + /* + * Is either source confined to a single process? NB: Motion to a single + * process (qDisp or qExec) is the only motion in which we may use Merge + * Receive to preserve an existing ordering. + */ + else if (CdbPathLocus_IsBottleneck(outer.locus) || + CdbPathLocus_IsBottleneck(inner.locus)) + { /* singleQE or entry db */ + CdbpathMfjRel *single = &outer; + CdbpathMfjRel *other = &inner; + bool single_immovable = (outer.require_existing_order && + !outer_pathkeys) || outer.has_wts; + bool other_immovable = inner.require_existing_order && + !inner_pathkeys; + + /* + * If each of the sources has a single-process locus, then assign both + * sources and the join to run in the same process, without motion. + * The slice will be run on the entry db if either source requires it. + */ + if (CdbPathLocus_IsEntry(single->locus)) + { + if (CdbPathLocus_IsBottleneck(other->locus)) + return single->locus; + } + else if (CdbPathLocus_IsSingleQE(single->locus)) + { + if (CdbPathLocus_IsBottleneck(other->locus)) + return other->locus; + } + + /* Let 'single' be the source whose locus is singleQE or entry. */ + else + { + CdbSwap(CdbpathMfjRel *, single, other); + CdbSwap(bool, single_immovable, other_immovable); + } + Assert(CdbPathLocus_IsBottleneck(single->locus)); + Assert(CdbPathLocus_IsPartitioned(other->locus)); + + /* If the bottlenecked rel can't be moved, bring the other rel to it. */ + if (single_immovable) + other->move_to = single->locus; + + /* Redistribute single rel if joining on other rel's partitioning key */ + else if (cdbpath_match_preds_to_partkey(root, + mergeclause_list, + other->locus, + &single->move_to)) /* OUT */ + { + } + + /* Replicate single rel if cheaper than redistributing both rels. */ + else if (single->ok_to_replicate && + single->bytes * root->config->cdbpath_segments < single->bytes + other->bytes) + CdbPathLocus_MakeReplicated(&single->move_to); + + /* Redistribute both rels on equijoin cols. */ + else if (!other->require_existing_order && + cdbpath_partkeys_from_preds(root, + mergeclause_list, + single->path, + &single->move_to, /* OUT */ + &other->move_to)) /* OUT */ + { + } + + /* + * No usable equijoin preds, or caller imposed restrictions on motion. + * Replicate single rel if cheaper than bottlenecking other rel. + */ + else if (single->ok_to_replicate && + single->bytes < other->bytes) + CdbPathLocus_MakeReplicated(&single->move_to); + + /* Broadcast single rel if other rel has WorkTableScan */ + else if (single->ok_to_replicate && other->has_wts) + CdbPathLocus_MakeReplicated(&single->move_to); + + /* Last resort: Move all partitions of other rel to single QE. */ + else + other->move_to = single->locus; + } /* singleQE or entry */ + + /* + * Replicated paths shouldn't occur loose, for now. + */ + else if (CdbPathLocus_IsReplicated(outer.locus) || + CdbPathLocus_IsReplicated(inner.locus)) + { + Assert(false); + goto fail; + } + + /* + * No motion if partitioned alike and joining on the partitioning keys. 
+ */ + else if (cdbpath_match_preds_to_both_partkeys(root, mergeclause_list, + outer.locus, inner.locus)) + return cdbpathlocus_join(outer.locus, inner.locus); + + /* + * Kludge used internally for querying catalogs on segment dbs. Each QE + * will join the catalogs that are local to its own segment. The catalogs + * don't have partitioning keys. No motion needed. + */ + else if (CdbPathLocus_IsStrewn(outer.locus) && + CdbPathLocus_IsStrewn(inner.locus) && + cdbpathlocus_querysegmentcatalogs) + return outer.locus; + + /* + * Both sources are partitioned. Redistribute or replicate one or both. + */ + else + { /* partitioned */ + CdbpathMfjRel *large = &outer; + CdbpathMfjRel *small = &inner; + + /* Which rel is bigger? */ + if (large->bytes < small->bytes) + CdbSwap(CdbpathMfjRel *, large, small); + + /* If joining on larger rel's partitioning key, redistribute smaller. */ + if (!small->require_existing_order && + cdbpath_match_preds_to_partkey(root, + mergeclause_list, + large->locus, + &small->move_to)) /* OUT */ + { + } + + /* + * Replicate smaller rel if cheaper than redistributing larger rel. + * But don't replicate a rel that is to be preserved in outer join. + */ + else if (!small->require_existing_order && + small->ok_to_replicate && + small->bytes * root->config->cdbpath_segments < large->bytes) + CdbPathLocus_MakeReplicated(&small->move_to); + + /* If joining on smaller rel's partitioning key, redistribute larger. */ + else if (!large->require_existing_order && + cdbpath_match_preds_to_partkey(root, + mergeclause_list, + small->locus, + &large->move_to)) /* OUT */ + { + } + + /* Replicate smaller rel if cheaper than redistributing both rels. */ + else if (!small->require_existing_order && + small->ok_to_replicate && + small->bytes * root->config->cdbpath_segments < large->bytes + small->bytes) + CdbPathLocus_MakeReplicated(&small->move_to); + + /* Redistribute both rels on equijoin cols. */ + else if (!small->require_existing_order && + !large->require_existing_order && + cdbpath_partkeys_from_preds(root, + mergeclause_list, + large->path, + &large->move_to, + &small->move_to)) + { + } + + /* + * No usable equijoin preds, or couldn't consider the preferred + * motion. Replicate one rel if possible. MPP TODO: Consider number of + * seg dbs per host. + */ + else if (!small->require_existing_order && + small->ok_to_replicate) + CdbPathLocus_MakeReplicated(&small->move_to); + else if (!large->require_existing_order && + large->ok_to_replicate) + CdbPathLocus_MakeReplicated(&large->move_to); + + /* Last resort: Move both rels to a single qExec. */ + else + { + CdbPathLocus_MakeSingleQE(&outer.move_to); + CdbPathLocus_MakeSingleQE(&inner.move_to); + } + } /* partitioned */ + + /* + * Move outer. + */ + if (!CdbPathLocus_IsNull(outer.move_to)) + { + outer.path = cdbpath_create_motion_path(root, + outer.path, + outer_pathkeys, + outer.require_existing_order, + outer.move_to); + if (!outer.path) /* fail if outer motion not feasible */ + goto fail; + } + + /* + * Move inner. + */ + if (!CdbPathLocus_IsNull(inner.move_to)) + { + inner.path = cdbpath_create_motion_path(root, + inner.path, + inner_pathkeys, + inner.require_existing_order, + inner.move_to); + if (!inner.path) /* fail if inner motion not feasible */ + goto fail; + } + + /* + * Ok to join. Give modified subpaths to caller. + */ + *p_outer_path = outer.path; + *p_inner_path = inner.path; + + /* Tell caller where the join will be done. 
*/ + return cdbpathlocus_join(outer.path->locus, inner.path->locus); + +fail: /* can't do this join */ + CdbPathLocus_MakeNull(&outer.move_to); + return outer.move_to; +} /* cdbpath_motion_for_join */ /* @@ -1100,130 +1123,131 @@ fail: /* can't do this join */ typedef struct CdbpathDedupFixupContext { - PlannerInfo *root; - Relids distinct_on_rowid_relids; - List *rowid_vars; - int32 subplan_id; - bool need_subplan_id; - bool need_segment_id; + PlannerInfo *root; + Relids distinct_on_rowid_relids; + List *rowid_vars; + int32 subplan_id; + bool need_subplan_id; + bool need_segment_id; } CdbpathDedupFixupContext; static CdbVisitOpt -cdbpath_dedup_fixup_walker(Path *path, void *context); + cdbpath_dedup_fixup_walker(Path *path, void *context); /* Drop Var nodes from a List unless they belong to a given set of relids. */ static List * cdbpath_dedup_pickvars(List *vars, Relids relids_to_keep) { - ListCell *cell; - ListCell *nextcell; - ListCell *prevcell = NULL; - Var *var; - - for (cell = list_head(vars); cell; cell = nextcell) - { - nextcell = lnext(cell); - var = (Var *)lfirst(cell); - Assert(IsA(var, Var)); - if (!bms_is_member(var->varno, relids_to_keep)) - vars = list_delete_cell(vars, cell, prevcell); - else - prevcell = cell; - } - return vars; -} /* cdbpath_dedup_pickvars */ + ListCell *cell; + ListCell *nextcell; + ListCell *prevcell = NULL; + Var *var; + + for (cell = list_head(vars); cell; cell = nextcell) + { + nextcell = lnext(cell); + var = (Var *) lfirst(cell); + Assert(IsA(var, Var)); + if (!bms_is_member(var->varno, relids_to_keep)) + vars = list_delete_cell(vars, cell, prevcell); + else + prevcell = cell; + } + return vars; +} /* cdbpath_dedup_pickvars */ static CdbVisitOpt cdbpath_dedup_fixup_unique(UniquePath *uniquePath, CdbpathDedupFixupContext *ctx) { - Relids downstream_relids = ctx->distinct_on_rowid_relids; - List *ctid_exprs; + Relids downstream_relids = ctx->distinct_on_rowid_relids; + List *ctid_exprs; List *ctid_operators; - List *other_vars = NIL; - List *other_operators = NIL; - List *partkey = NIL; - List *eq = NIL; - ListCell *cell; - bool save_need_segment_id = ctx->need_segment_id; - - Assert(!ctx->rowid_vars); - - /* - * Leave this node unchanged unless it removes duplicates by row id. - * - * NB. If ctx->distinct_on_rowid_relids is nonempty, row id vars - * could be added to our rel's targetlist while visiting the child - * subtree. Any such added columns should pass on safely through this - * Unique op because they aren't added to the distinct_on_exprs list. - */ - if (bms_is_empty(uniquePath->distinct_on_rowid_relids)) - return CdbVisit_Walk; /* onward to visit the kids */ - - /* No action needed if data is trivially unique. */ - if (uniquePath->umethod == UNIQUE_PATH_NOOP || - uniquePath->umethod == UNIQUE_PATH_LIMIT1) - return CdbVisit_Walk; /* onward to visit the kids */ - - /* Find set of relids for which subpath must produce row ids. */ - ctx->distinct_on_rowid_relids = bms_union(ctx->distinct_on_rowid_relids, - uniquePath->distinct_on_rowid_relids); - - /* Tell join ops below that row ids mustn't be left out of targetlists. */ - ctx->distinct_on_rowid_relids = bms_add_member(ctx->distinct_on_rowid_relids, 0); - - /* Notify descendants if we're going to insert a MotionPath below. */ - if (uniquePath->must_repartition) - ctx->need_segment_id = true; - - /* Visit descendants to get list of row id vars and add to targetlists. */ - pathnode_walk_node(uniquePath->subpath, cdbpath_dedup_fixup_walker, ctx); - - /* Restore saved flag. 
*/ - ctx->need_segment_id = save_need_segment_id; - - /* CDB TODO: we share kid's targetlist at present, so our tlist could - * contain rowid vars which are no longer needed downstream. - */ - - /* - * Build DISTINCT ON key for UniquePath, putting the ctid columns first - * because those are usually more distinctive than the segment ids. - * Also build repartitioning key if needed, using only the ctid columns. - */ + List *other_vars = NIL; + List *other_operators = NIL; + List *partkey = NIL; + List *eq = NIL; + ListCell *cell; + bool save_need_segment_id = ctx->need_segment_id; + + Assert(!ctx->rowid_vars); + + /* + * Leave this node unchanged unless it removes duplicates by row id. + * + * NB. If ctx->distinct_on_rowid_relids is nonempty, row id vars could be + * added to our rel's targetlist while visiting the child subtree. Any + * such added columns should pass on safely through this Unique op because + * they aren't added to the distinct_on_exprs list. + */ + if (bms_is_empty(uniquePath->distinct_on_rowid_relids)) + return CdbVisit_Walk; /* onward to visit the kids */ + + /* No action needed if data is trivially unique. */ + if (uniquePath->umethod == UNIQUE_PATH_NOOP || + uniquePath->umethod == UNIQUE_PATH_LIMIT1) + return CdbVisit_Walk; /* onward to visit the kids */ + + /* Find set of relids for which subpath must produce row ids. */ + ctx->distinct_on_rowid_relids = bms_union(ctx->distinct_on_rowid_relids, + uniquePath->distinct_on_rowid_relids); + + /* Tell join ops below that row ids mustn't be left out of targetlists. */ + ctx->distinct_on_rowid_relids = bms_add_member(ctx->distinct_on_rowid_relids, 0); + + /* Notify descendants if we're going to insert a MotionPath below. */ + if (uniquePath->must_repartition) + ctx->need_segment_id = true; + + /* Visit descendants to get list of row id vars and add to targetlists. */ + pathnode_walk_node(uniquePath->subpath, cdbpath_dedup_fixup_walker, ctx); + + /* Restore saved flag. */ + ctx->need_segment_id = save_need_segment_id; + + /* + * CDB TODO: we share kid's targetlist at present, so our tlist could + * contain rowid vars which are no longer needed downstream. + */ + + /* + * Build DISTINCT ON key for UniquePath, putting the ctid columns first + * because those are usually more distinctive than the segment ids. Also + * build repartitioning key if needed, using only the ctid columns. + */ ctid_exprs = NIL; ctid_operators = NIL; foreach(cell, ctx->rowid_vars) - { - Var *var = (Var *)lfirst(cell); - - Assert(IsA(var, Var) && - bms_is_member(var->varno, ctx->distinct_on_rowid_relids)); - - /* Skip vars which aren't part of the row id for this Unique op. */ - if (!bms_is_member(var->varno, uniquePath->distinct_on_rowid_relids)) - continue; - - /* ctid? */ - if (var->varattno == SelfItemPointerAttributeNumber) - { - /* - * The tid type has a full set of comparison operators, but - * oddly its "=" operator is not marked hashable. So 'ctid' - * is directly usable for sorted duplicate removal; but we - * cast it to 64-bit integer for hashed duplicate removal. - */ - if (uniquePath->umethod == UNIQUE_PATH_HASH) + { + Var *var = (Var *) lfirst(cell); + + Assert(IsA(var, Var) && + bms_is_member(var->varno, ctx->distinct_on_rowid_relids)); + + /* Skip vars which aren't part of the row id for this Unique op. */ + if (!bms_is_member(var->varno, uniquePath->distinct_on_rowid_relids)) + continue; + + /* ctid? 
*/ + if (var->varattno == SelfItemPointerAttributeNumber) + { + /* + * The tid type has a full set of comparison operators, but oddly + * its "=" operator is not marked hashable. So 'ctid' is directly + * usable for sorted duplicate removal; but we cast it to 64-bit + * integer for hashed duplicate removal. + */ + if (uniquePath->umethod == UNIQUE_PATH_HASH) { - ctid_exprs = lappend(ctid_exprs, - makeFuncExpr(CDB_PROC_TIDTOI8, INT8OID, - list_make1(var), - COERCE_EXPLICIT_CAST)); + ctid_exprs = lappend(ctid_exprs, + makeFuncExpr(CDB_PROC_TIDTOI8, INT8OID, + list_make1(var), + COERCE_EXPLICIT_CAST)); ctid_operators = lappend_oid(ctid_operators, Int8EqualOperator); } - else + else { - ctid_exprs = lappend(ctid_exprs, var); + ctid_exprs = lappend(ctid_exprs, var); ctid_operators = lappend_oid(ctid_operators, TIDEqualOperator); } @@ -1237,10 +1261,10 @@ cdbpath_dedup_fixup_unique(UniquePath *uniquePath, CdbpathDedupFixupContext *ctx cpathkey = cdb_make_pathkey_for_expr(ctx->root, (Node *) var, eq, false); partkey = lappend(partkey, cpathkey); } - } + } - /* other uniqueifiers such as gp_segment_id */ - else + /* other uniqueifiers such as gp_segment_id */ + else { Operator optup; Oid eqop; @@ -1253,168 +1277,171 @@ cdbpath_dedup_fixup_unique(UniquePath *uniquePath, CdbpathDedupFixupContext *ctx other_operators = lappend_oid(other_operators, eqop); } - } + } - uniquePath->distinct_on_exprs = list_concat(ctid_exprs, other_vars); + uniquePath->distinct_on_exprs = list_concat(ctid_exprs, other_vars); uniquePath->distinct_on_eq_operators = list_concat(ctid_operators, other_operators); - /* To repartition, add a MotionPath below this UniquePath. */ - if (uniquePath->must_repartition) - { - CdbPathLocus locus; - - Assert(partkey); - CdbPathLocus_MakeHashed(&locus, partkey); - - uniquePath->subpath = cdbpath_create_motion_path(ctx->root, - uniquePath->subpath, - NIL, - false, - locus); - Insist(uniquePath->subpath); - uniquePath->path.locus = uniquePath->subpath->locus; - uniquePath->path.motionHazard = uniquePath->subpath->motionHazard; - uniquePath->path.rescannable = uniquePath->subpath->rescannable; - list_free_deep(eq); - } - - /* Prune row id var list to remove items not needed downstream. */ - ctx->rowid_vars = cdbpath_dedup_pickvars(ctx->rowid_vars, downstream_relids); - - bms_free(ctx->distinct_on_rowid_relids); - ctx->distinct_on_rowid_relids = downstream_relids; - return CdbVisit_Skip; /* we visited kids already; done with subtree */ -} /* cdbpath_dedup_fixup_unique */ + /* To repartition, add a MotionPath below this UniquePath. */ + if (uniquePath->must_repartition) + { + CdbPathLocus locus; + + Assert(partkey); + CdbPathLocus_MakeHashed(&locus, partkey); + + uniquePath->subpath = cdbpath_create_motion_path(ctx->root, + uniquePath->subpath, + NIL, + false, + locus); + Insist(uniquePath->subpath); + uniquePath->path.locus = uniquePath->subpath->locus; + uniquePath->path.motionHazard = uniquePath->subpath->motionHazard; + uniquePath->path.rescannable = uniquePath->subpath->rescannable; + list_free_deep(eq); + } + + /* Prune row id var list to remove items not needed downstream. 
*/ + ctx->rowid_vars = cdbpath_dedup_pickvars(ctx->rowid_vars, downstream_relids); + + bms_free(ctx->distinct_on_rowid_relids); + ctx->distinct_on_rowid_relids = downstream_relids; + return CdbVisit_Skip; /* we visited kids already; done with subtree */ +} /* cdbpath_dedup_fixup_unique */ static void cdbpath_dedup_fixup_baserel(Path *path, CdbpathDedupFixupContext *ctx) { - RelOptInfo *rel = path->parent; - List *rowid_vars = NIL; - Const *con; - Var *var; - - Assert(!ctx->rowid_vars); - - /* Find or make a Var node referencing our 'ctid' system attribute. */ - var = find_indexkey_var(ctx->root, rel, SelfItemPointerAttributeNumber); - rowid_vars = lappend(rowid_vars, var); - - /* - * If below a Motion operator, make a Var node for our 'gp_segment_id' attr. - * - * Omit if the data is known to come from just one segment, or consists - * only of constants (e.g. values scan) or immutable function results. - */ - if (ctx->need_segment_id) - { - if (!CdbPathLocus_IsBottleneck(path->locus) && - !CdbPathLocus_IsGeneral(path->locus)) - { - var = find_indexkey_var(ctx->root, rel, GpSegmentIdAttributeNumber); - rowid_vars = lappend(rowid_vars, var); - } - } - - /* - * If below an Append, add 'gp_subplan_id' pseudo column to the targetlist. - * - * set_plan_references() will later replace the pseudo column Var node - * in our rel's targetlist with a copy of its defining expression, i.e. - * the Const node built here. - */ - if (ctx->need_subplan_id) - { - /* Make a Const node containing the current subplan id. */ - con = makeConst(INT4OID, -1, sizeof(int32), Int32GetDatum(ctx->subplan_id), - false, true); - - /* Set up a pseudo column whose value will be the constant. */ - var = cdb_define_pseudo_column(ctx->root, rel, "gp_subplan_id", - (Expr *)con, sizeof(int32)); - - /* Give downstream operators a Var referencing the pseudo column. */ - rowid_vars = lappend(rowid_vars, var); - } - - /* Add these vars to the rel's list of result columns. */ - add_vars_to_targetlist(ctx->root, rowid_vars, ctx->distinct_on_rowid_relids); - - /* Recalculate width of the rel's result rows. */ - set_rel_width(ctx->root, rel); - - /* - * Tell caller to add our vars to the DISTINCT ON key of the ancestral - * UniquePath, and to the targetlists of any intervening ancestors. - */ - ctx->rowid_vars = rowid_vars; -} /* cdbpath_dedup_fixup_baserel */ + RelOptInfo *rel = path->parent; + List *rowid_vars = NIL; + Const *con; + Var *var; + + Assert(!ctx->rowid_vars); + + /* Find or make a Var node referencing our 'ctid' system attribute. */ + var = find_indexkey_var(ctx->root, rel, SelfItemPointerAttributeNumber); + rowid_vars = lappend(rowid_vars, var); + + /* + * If below a Motion operator, make a Var node for our 'gp_segment_id' + * attr. + * + * Omit if the data is known to come from just one segment, or consists + * only of constants (e.g. values scan) or immutable function results. + */ + if (ctx->need_segment_id) + { + if (!CdbPathLocus_IsBottleneck(path->locus) && + !CdbPathLocus_IsGeneral(path->locus)) + { + var = find_indexkey_var(ctx->root, rel, GpSegmentIdAttributeNumber); + rowid_vars = lappend(rowid_vars, var); + } + } + + /* + * If below an Append, add 'gp_subplan_id' pseudo column to the + * targetlist. + * + * set_plan_references() will later replace the pseudo column Var node in + * our rel's targetlist with a copy of its defining expression, i.e. the + * Const node built here. + */ + if (ctx->need_subplan_id) + { + /* Make a Const node containing the current subplan id. 
*/ + con = makeConst(INT4OID, -1, sizeof(int32), Int32GetDatum(ctx->subplan_id), + false, true); + + /* Set up a pseudo column whose value will be the constant. */ + var = cdb_define_pseudo_column(ctx->root, rel, "gp_subplan_id", + (Expr *) con, sizeof(int32)); + + /* Give downstream operators a Var referencing the pseudo column. */ + rowid_vars = lappend(rowid_vars, var); + } + + /* Add these vars to the rel's list of result columns. */ + add_vars_to_targetlist(ctx->root, rowid_vars, ctx->distinct_on_rowid_relids); + + /* Recalculate width of the rel's result rows. */ + set_rel_width(ctx->root, rel); + + /* + * Tell caller to add our vars to the DISTINCT ON key of the ancestral + * UniquePath, and to the targetlists of any intervening ancestors. + */ + ctx->rowid_vars = rowid_vars; +} /* cdbpath_dedup_fixup_baserel */ static void cdbpath_dedup_fixup_joinrel(JoinPath *joinpath, CdbpathDedupFixupContext *ctx) { - RelOptInfo *rel = joinpath->path.parent; + RelOptInfo *rel = joinpath->path.parent; - Assert(!ctx->rowid_vars); + Assert(!ctx->rowid_vars); - /* CDB TODO: Subpath id isn't needed from both outer and inner. - * Don't request row id vars from rhs of EXISTS join. - */ + /* + * CDB TODO: Subpath id isn't needed from both outer and inner. Don't + * request row id vars from rhs of EXISTS join. + */ - /* Get row id vars from outer subpath. */ - if (joinpath->outerjoinpath) - pathnode_walk_node(joinpath->outerjoinpath, cdbpath_dedup_fixup_walker, ctx); + /* Get row id vars from outer subpath. */ + if (joinpath->outerjoinpath) + pathnode_walk_node(joinpath->outerjoinpath, cdbpath_dedup_fixup_walker, ctx); - /* Get row id vars from inner subpath. */ - if (joinpath->innerjoinpath) - { - List *outer_rowid_vars = ctx->rowid_vars; + /* Get row id vars from inner subpath. */ + if (joinpath->innerjoinpath) + { + List *outer_rowid_vars = ctx->rowid_vars; - ctx->rowid_vars = NIL; - pathnode_walk_node(joinpath->innerjoinpath, cdbpath_dedup_fixup_walker, ctx); + ctx->rowid_vars = NIL; + pathnode_walk_node(joinpath->innerjoinpath, cdbpath_dedup_fixup_walker, ctx); - /* Which rel has more rows? Put its row id vars in front. */ - if (outer_rowid_vars && - ctx->rowid_vars && - cdbpath_rows(ctx->root, joinpath->outerjoinpath) >= cdbpath_rows(ctx->root, joinpath->innerjoinpath)) - ctx->rowid_vars = list_concat(outer_rowid_vars, ctx->rowid_vars); - else - ctx->rowid_vars = list_concat(ctx->rowid_vars, outer_rowid_vars); - } + /* Which rel has more rows? Put its row id vars in front. */ + if (outer_rowid_vars && + ctx->rowid_vars && + cdbpath_rows(ctx->root, joinpath->outerjoinpath) >= cdbpath_rows(ctx->root, joinpath->innerjoinpath)) + ctx->rowid_vars = list_concat(outer_rowid_vars, ctx->rowid_vars); + else + ctx->rowid_vars = list_concat(ctx->rowid_vars, outer_rowid_vars); + } - /* Update joinrel's targetlist and adjust row width. */ - if (ctx->rowid_vars) - build_joinrel_tlist(ctx->root, rel, ctx->rowid_vars); -} /* cdbpath_dedup_fixup_joinrel */ + /* Update joinrel's targetlist and adjust row width. */ + if (ctx->rowid_vars) + build_joinrel_tlist(ctx->root, rel, ctx->rowid_vars); +} /* cdbpath_dedup_fixup_joinrel */ static void cdbpath_dedup_fixup_motion(CdbMotionPath *motionpath, CdbpathDedupFixupContext *ctx) { - bool save_need_segment_id = ctx->need_segment_id; + bool save_need_segment_id = ctx->need_segment_id; - /* - * Motion could bring together rows which happen to have the same ctid - * but are actually from different segments. They must not be treated - * as duplicates. 
To distinguish them, let each row be labeled with - * its originating segment id. - */ - ctx->need_segment_id = true; + /* + * Motion could bring together rows which happen to have the same ctid but + * are actually from different segments. They must not be treated as + * duplicates. To distinguish them, let each row be labeled with its + * originating segment id. + */ + ctx->need_segment_id = true; - /* Visit the upstream nodes. */ - pathnode_walk_node(motionpath->subpath, cdbpath_dedup_fixup_walker, ctx); + /* Visit the upstream nodes. */ + pathnode_walk_node(motionpath->subpath, cdbpath_dedup_fixup_walker, ctx); - /* Restore saved flag. */ - ctx->need_segment_id = save_need_segment_id; -} /* cdbpath_dedup_fixup_motion */ + /* Restore saved flag. */ + ctx->need_segment_id = save_need_segment_id; +} /* cdbpath_dedup_fixup_motion */ static void cdbpath_dedup_fixup_append(AppendPath *appendPath, CdbpathDedupFixupContext *ctx) { - Relids save_distinct_on_rowid_relids = ctx->distinct_on_rowid_relids; - List *appendrel_rowid_vars; - ListCell *cell; - int ncol; - bool save_need_subplan_id = ctx->need_subplan_id; + Relids save_distinct_on_rowid_relids = ctx->distinct_on_rowid_relids; + List *appendrel_rowid_vars; + ListCell *cell; + int ncol; + bool save_need_subplan_id = ctx->need_subplan_id; /* * The planner creates dummy AppendPaths with no subplans, if it can @@ -1424,97 +1451,97 @@ cdbpath_dedup_fixup_append(AppendPath *appendPath, CdbpathDedupFixupContext *ctx if (appendPath->subpaths == NIL) return; - Assert(!ctx->rowid_vars); - - /* Make a working copy of the set of relids for which row ids are needed. */ - ctx->distinct_on_rowid_relids = bms_copy(ctx->distinct_on_rowid_relids); - - /* - * Append could bring together rows which happen to have the same ctid - * but are actually from different tables or different branches of a - * UNION ALL. They must not be treated as duplicates. To distinguish - * them, let each row be labeled with an integer which will be different - * for each branch of the Append. - */ - ctx->need_subplan_id = true; - - /* Assign a dummy subplan id (not actually used) for the appendrel. */ - ctx->subplan_id++; - - /* Add placeholder columns to the appendrel's targetlist. */ - cdbpath_dedup_fixup_baserel((Path *)appendPath, ctx); - ncol = list_length(appendPath->path.parent->reltargetlist); - - appendrel_rowid_vars = ctx->rowid_vars; - ctx->rowid_vars = NIL; - - /* Update the parent and child rels. */ - foreach(cell, appendPath->subpaths) - { - Path *subpath = (Path *)lfirst(cell); - - if (!subpath) - continue; - - /* Assign a subplan id to this branch of the Append. */ - ctx->subplan_id++; - - /* Tell subpath to produce row ids. */ - ctx->distinct_on_rowid_relids = - bms_add_members(ctx->distinct_on_rowid_relids, - subpath->parent->relids); - - /* Process one subpath. */ - pathnode_walk_node(subpath, cdbpath_dedup_fixup_walker, ctx); - - /* - * Subpath and appendrel should have same number of result columns. - * CDB TODO: Add dummy columns to other subpaths to keep their - * targetlists in sync. - */ - if (list_length(subpath->parent->reltargetlist) != ncol) - ereport(ERROR, (errcode(ERRCODE_CDB_FEATURE_NOT_YET), - errmsg("The query is not yet supported in " - "this version of " PACKAGE_NAME "."), - errdetail("Unsupported combination of " - "UNION ALL of joined tables " - "with subquery.") - )); - - /* Don't need subpath's rowid_vars. */ - list_free(ctx->rowid_vars); - ctx->rowid_vars = NIL; - } - - /* Provide appendrel's row id vars to downstream operators. 
*/ - ctx->rowid_vars = appendrel_rowid_vars; - - /* Restore saved values. */ - bms_free(ctx->distinct_on_rowid_relids); - ctx->distinct_on_rowid_relids = save_distinct_on_rowid_relids; - ctx->need_subplan_id = save_need_subplan_id; -} /* cdbpath_dedup_fixup_append */ - - static CdbVisitOpt + Assert(!ctx->rowid_vars); + + /* Make a working copy of the set of relids for which row ids are needed. */ + ctx->distinct_on_rowid_relids = bms_copy(ctx->distinct_on_rowid_relids); + + /* + * Append could bring together rows which happen to have the same ctid but + * are actually from different tables or different branches of a UNION + * ALL. They must not be treated as duplicates. To distinguish them, let + * each row be labeled with an integer which will be different for each + * branch of the Append. + */ + ctx->need_subplan_id = true; + + /* Assign a dummy subplan id (not actually used) for the appendrel. */ + ctx->subplan_id++; + + /* Add placeholder columns to the appendrel's targetlist. */ + cdbpath_dedup_fixup_baserel((Path *) appendPath, ctx); + ncol = list_length(appendPath->path.parent->reltargetlist); + + appendrel_rowid_vars = ctx->rowid_vars; + ctx->rowid_vars = NIL; + + /* Update the parent and child rels. */ + foreach(cell, appendPath->subpaths) + { + Path *subpath = (Path *) lfirst(cell); + + if (!subpath) + continue; + + /* Assign a subplan id to this branch of the Append. */ + ctx->subplan_id++; + + /* Tell subpath to produce row ids. */ + ctx->distinct_on_rowid_relids = + bms_add_members(ctx->distinct_on_rowid_relids, + subpath->parent->relids); + + /* Process one subpath. */ + pathnode_walk_node(subpath, cdbpath_dedup_fixup_walker, ctx); + + /* + * Subpath and appendrel should have same number of result columns. + * CDB TODO: Add dummy columns to other subpaths to keep their + * targetlists in sync. + */ + if (list_length(subpath->parent->reltargetlist) != ncol) + ereport(ERROR, (errcode(ERRCODE_CDB_FEATURE_NOT_YET), + errmsg("The query is not yet supported in " + "this version of " PACKAGE_NAME "."), + errdetail("Unsupported combination of " + "UNION ALL of joined tables " + "with subquery.") + )); + + /* Don't need subpath's rowid_vars. */ + list_free(ctx->rowid_vars); + ctx->rowid_vars = NIL; + } + + /* Provide appendrel's row id vars to downstream operators. */ + ctx->rowid_vars = appendrel_rowid_vars; + + /* Restore saved values. */ + bms_free(ctx->distinct_on_rowid_relids); + ctx->distinct_on_rowid_relids = save_distinct_on_rowid_relids; + ctx->need_subplan_id = save_need_subplan_id; +} /* cdbpath_dedup_fixup_append */ + +static CdbVisitOpt cdbpath_dedup_fixup_walker(Path *path, void *context) { - CdbpathDedupFixupContext *ctx = (CdbpathDedupFixupContext *)context; + CdbpathDedupFixupContext *ctx = (CdbpathDedupFixupContext *) context; Assert(!ctx->rowid_vars); /* Watch for a UniquePath node calling for removal of dups by row id. */ if (path->pathtype == T_Unique) - return cdbpath_dedup_fixup_unique((UniquePath *)path, ctx); + return cdbpath_dedup_fixup_unique((UniquePath *) path, ctx); /* Leave node unchanged unless a downstream Unique op needs row ids. */ if (!bms_overlap(path->parent->relids, ctx->distinct_on_rowid_relids)) - return CdbVisit_Walk; /* visit descendants */ + return CdbVisit_Walk; /* visit descendants */ /* Alter this node to produce row ids for an ancestral Unique operator. 
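	 * (Summary of the cases below: base scans add ctid and gp_segment_id
	 * vars, joins merge their children's row id vars, Append adds a
	 * per-branch gp_subplan_id, and Motion forces gp_segment_id labeling.)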
*/ switch (path->pathtype) { case T_Append: - cdbpath_dedup_fixup_append((AppendPath *)path, ctx); + cdbpath_dedup_fixup_append((AppendPath *) path, ctx); break; case T_SeqScan: @@ -1536,40 +1563,40 @@ cdbpath_dedup_fixup_walker(Path *path, void *context) case T_HashJoin: case T_MergeJoin: case T_NestLoop: - cdbpath_dedup_fixup_joinrel((JoinPath *)path, ctx); + cdbpath_dedup_fixup_joinrel((JoinPath *) path, ctx); break; case T_Result: case T_Material: /* These nodes share child's RelOptInfo and don't need fixup. */ - return CdbVisit_Walk; /* visit descendants */ + return CdbVisit_Walk; /* visit descendants */ case T_Motion: - cdbpath_dedup_fixup_motion((CdbMotionPath *)path, ctx); + cdbpath_dedup_fixup_motion((CdbMotionPath *) path, ctx); break; default: Insist(0); } - return CdbVisit_Skip; /* already visited kids, don't revisit them */ -} /* cdbpath_dedup_fixup_walker */ + return CdbVisit_Skip; /* already visited kids, don't revisit them */ +} /* cdbpath_dedup_fixup_walker */ void cdbpath_dedup_fixup(PlannerInfo *root, Path *path) { - CdbpathDedupFixupContext context; + CdbpathDedupFixupContext context; - memset(&context, 0, sizeof(context)); + memset(&context, 0, sizeof(context)); - context.root = root; + context.root = root; - pathnode_walk_node(path, cdbpath_dedup_fixup_walker, &context); + pathnode_walk_node(path, cdbpath_dedup_fixup_walker, &context); - Assert(bms_is_empty(context.distinct_on_rowid_relids) && - !context.rowid_vars && - !context.need_segment_id && - !context.need_subplan_id); -} /* cdbpath_dedup_fixup */ + Assert(bms_is_empty(context.distinct_on_rowid_relids) && + !context.rowid_vars && + !context.need_segment_id && + !context.need_subplan_id); +} /* cdbpath_dedup_fixup */ /* * Does the path contain WorkTableScan? @@ -1577,15 +1604,15 @@ cdbpath_dedup_fixup(PlannerInfo *root, Path *path) bool cdbpath_contains_wts(Path *path) { - JoinPath *joinPath; + JoinPath *joinPath; AppendPath *appendPath; - ListCell *lc; + ListCell *lc; if (IsJoinPath(path)) { joinPath = (JoinPath *) path; if (cdbpath_contains_wts(joinPath->outerjoinpath) - || cdbpath_contains_wts(joinPath->innerjoinpath)) + || cdbpath_contains_wts(joinPath->innerjoinpath)) return true; else return false; diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c index 81382a343e..846a7a5e42 100644 --- a/src/backend/cdb/cdbpathlocus.c +++ b/src/backend/cdb/cdbpathlocus.c @@ -13,26 +13,25 @@ */ #include "postgres.h" -#include "catalog/gp_policy.h" /* GpPolicy */ -#include "cdb/cdbdef.h" /* CdbSwap() */ -#include "cdb/cdbpullup.h" /* cdbpullup_missing_var_walker() */ -#include "nodes/makefuncs.h" /* makeVar() */ +#include "catalog/gp_policy.h" /* GpPolicy */ +#include "cdb/cdbdef.h" /* CdbSwap() */ +#include "cdb/cdbpullup.h" /* cdbpullup_missing_var_walker() */ +#include "nodes/makefuncs.h" /* makeVar() */ #include "nodes/nodeFuncs.h" -#include "nodes/plannodes.h" /* Plan */ -#include "nodes/relation.h" /* RelOptInfo */ +#include "nodes/plannodes.h" /* Plan */ +#include "nodes/relation.h" /* RelOptInfo */ #include "optimizer/pathnode.h" /* Path */ -#include "optimizer/paths.h" /* cdb_build_distribution_pathkeys() */ -#include "optimizer/tlist.h" /* tlist_member() */ -#include "parser/parse_expr.h" /* exprType() and exprTypmod() */ +#include "optimizer/paths.h" /* cdb_build_distribution_pathkeys() */ +#include "optimizer/tlist.h" /* tlist_member() */ +#include "parser/parse_expr.h" /* exprType() and exprTypmod() */ #include "cdb/cdbvars.h" -#include "cdb/cdbpathlocus.h" /* me */ +#include 
"cdb/cdbpathlocus.h" /* me */ -static List * -cdb_build_distribution_pathkeys(PlannerInfo *root, - RelOptInfo *rel, - int nattrs, - AttrNumber *attrs); +static List *cdb_build_distribution_pathkeys(PlannerInfo *root, + RelOptInfo *rel, + int nattrs, + AttrNumber *attrs); /* @@ -44,7 +43,7 @@ cdb_build_distribution_pathkeys(PlannerInfo *root, * To use it, simply set it to true before running a catalog query, then set * it back to false. */ -bool cdbpathlocus_querysegmentcatalogs = false; +bool cdbpathlocus_querysegmentcatalogs = false; /* * Are two pathkeys equal? @@ -61,8 +60,8 @@ pathkeys_equal(List *apathkey, List *bpathkey) forboth(acell, apathkey, bcell, bpathkey) { - PathKey *apathkey = (PathKey *) lfirst(acell); - PathKey *bpathkey = (PathKey *) lfirst(bcell); + PathKey *apathkey = (PathKey *) lfirst(acell); + PathKey *bpathkey = (PathKey *) lfirst(bcell); Assert(IsA(apathkey, PathKey)); Assert(IsA(bpathkey, PathKey)); @@ -114,105 +113,105 @@ list_contains_pathkey(List *list, List *pathkey) * - Returns false otherwise. */ bool -cdbpathlocus_compare(CdbPathLocus_Comparison op, - CdbPathLocus a, - CdbPathLocus b) +cdbpathlocus_compare(CdbPathLocus_Comparison op, + CdbPathLocus a, + CdbPathLocus b) { - ListCell *acell; - ListCell *bcell; - ListCell *aequivpathkeycell; - ListCell *bequivpathkeycell; + ListCell *acell; + ListCell *bcell; + ListCell *aequivpathkeycell; + ListCell *bequivpathkeycell; - Assert(op == CdbPathLocus_Comparison_Equal || - op == CdbPathLocus_Comparison_Contains); + Assert(op == CdbPathLocus_Comparison_Equal || + op == CdbPathLocus_Comparison_Contains); - if (CdbPathLocus_IsStrewn(a) || - CdbPathLocus_IsStrewn(b)) - return false; + if (CdbPathLocus_IsStrewn(a) || + CdbPathLocus_IsStrewn(b)) + return false; - if (CdbPathLocus_IsEqual(a, b)) - return true; + if (CdbPathLocus_IsEqual(a, b)) + return true; - if (CdbPathLocus_Degree(a) == 0 || + if (CdbPathLocus_Degree(a) == 0 || CdbPathLocus_Degree(b) == 0 || - CdbPathLocus_Degree(a) != CdbPathLocus_Degree(b)) - return false; + CdbPathLocus_Degree(a) != CdbPathLocus_Degree(b)) + return false; - if (a.locustype == b.locustype) - { - if (CdbPathLocus_IsHashed(a)) + if (a.locustype == b.locustype) + { + if (CdbPathLocus_IsHashed(a)) return pathkeys_equal(a.partkey_h, b.partkey_h); - if (CdbPathLocus_IsHashedOJ(a)) - { - forboth(acell, a.partkey_oj, bcell, b.partkey_oj) - { - List *aequivpathkeylist = (List *) lfirst(acell); - List *bequivpathkeylist = (List *) lfirst(bcell); - - foreach(bequivpathkeycell, bequivpathkeylist) - { - List *bpathkey = (List *)lfirst(bequivpathkeycell); - - if (!list_contains_pathkey(aequivpathkeylist, bpathkey)) - return false; - } - if (op == CdbPathLocus_Comparison_Equal) - { - foreach(aequivpathkeycell, aequivpathkeylist) - { - List *apathkey = (List *)lfirst(aequivpathkeycell); - - if (!list_contains_pathkey(bequivpathkeylist, apathkey)) - return false; - } - } - } - return true; - } - } - - if (CdbPathLocus_IsHashedOJ(a) && - CdbPathLocus_IsHashed(b)) - { - if (op == CdbPathLocus_Comparison_Equal) - CdbSwap(CdbPathLocus, a, b); - else - { - forboth(acell, a.partkey_oj, bcell, b.partkey_h) - { - List *aequivpathkeylist = (List *)lfirst(acell); - List *bpathkey = (List *)lfirst(bcell); - - if (!list_member_ptr(aequivpathkeylist, bpathkey)) - return false; - } - return true; - } - } - - if (CdbPathLocus_IsHashed(a) && - CdbPathLocus_IsHashedOJ(b)) - { - forboth(acell, a.partkey_h, bcell, b.partkey_oj) - { - List *apathkey = (List *)lfirst(acell); - List *bequivpathkeylist = (List 
*)lfirst(bcell); - - foreach(bequivpathkeycell, bequivpathkeylist) - { - List *bpathkey = (List *)lfirst(bequivpathkeycell); - - if (apathkey != bpathkey) - return false; - } - } - return true; - } - - Assert(false); - return false; -} /* cdbpathlocus_compare */ + if (CdbPathLocus_IsHashedOJ(a)) + { + forboth(acell, a.partkey_oj, bcell, b.partkey_oj) + { + List *aequivpathkeylist = (List *) lfirst(acell); + List *bequivpathkeylist = (List *) lfirst(bcell); + + foreach(bequivpathkeycell, bequivpathkeylist) + { + List *bpathkey = (List *) lfirst(bequivpathkeycell); + + if (!list_contains_pathkey(aequivpathkeylist, bpathkey)) + return false; + } + if (op == CdbPathLocus_Comparison_Equal) + { + foreach(aequivpathkeycell, aequivpathkeylist) + { + List *apathkey = (List *) lfirst(aequivpathkeycell); + + if (!list_contains_pathkey(bequivpathkeylist, apathkey)) + return false; + } + } + } + return true; + } + } + + if (CdbPathLocus_IsHashedOJ(a) && + CdbPathLocus_IsHashed(b)) + { + if (op == CdbPathLocus_Comparison_Equal) + CdbSwap(CdbPathLocus, a, b); + else + { + forboth(acell, a.partkey_oj, bcell, b.partkey_h) + { + List *aequivpathkeylist = (List *) lfirst(acell); + List *bpathkey = (List *) lfirst(bcell); + + if (!list_member_ptr(aequivpathkeylist, bpathkey)) + return false; + } + return true; + } + } + + if (CdbPathLocus_IsHashed(a) && + CdbPathLocus_IsHashedOJ(b)) + { + forboth(acell, a.partkey_h, bcell, b.partkey_oj) + { + List *apathkey = (List *) lfirst(acell); + List *bequivpathkeylist = (List *) lfirst(bcell); + + foreach(bequivpathkeycell, bequivpathkeylist) + { + List *bpathkey = (List *) lfirst(bequivpathkeycell); + + if (apathkey != bpathkey) + return false; + } + } + return true; + } + + Assert(false); + return false; +} /* cdbpathlocus_compare */ /* * cdb_build_distribution_pathkeys @@ -229,70 +228,74 @@ cdbpathlocus_compare(CdbPathLocus_Comparison op, * other contexts. */ static List * -cdb_build_distribution_pathkeys(PlannerInfo *root, - RelOptInfo *rel, - int nattrs, - AttrNumber *attrs) +cdb_build_distribution_pathkeys(PlannerInfo *root, + RelOptInfo *rel, + int nattrs, + AttrNumber *attrs) { - List *retval = NIL; - List *eq = list_make1(makeString("=")); - int i; - bool isAppendChildRelation = false; - - isAppendChildRelation = (rel->reloptkind == RELOPT_OTHER_MEMBER_REL); - - for (i = 0; i < nattrs; ++i) - { - PathKey *cpathkey; - - /* Find or create a Var node that references the specified column. */ - Var *expr = find_indexkey_var(root, rel, attrs[i]); - Assert(expr); - - /* - * Find or create a pathkey. We distinguish two cases for performance reasons: - * 1) If the relation in question is a child relation under an append node, we don't care - * about ensuring that we return a canonicalized version of its pathkey item. - * Co-location of joins/group-bys happens at the append relation level. - * In create_append_path(), the call to cdbpathlocus_pull_above_projection() ensures - * that canonicalized pathkeys are created at the append relation level. - * (see MPP-3536). - * - * 2) For regular relations, we create a canonical pathkey so that we may identify - * co-location for joins/group-bys. - */ - if (isAppendChildRelation) - { - /** + List *retval = NIL; + List *eq = list_make1(makeString("=")); + int i; + bool isAppendChildRelation = false; + + isAppendChildRelation = (rel->reloptkind == RELOPT_OTHER_MEMBER_REL); + + for (i = 0; i < nattrs; ++i) + { + PathKey *cpathkey; + + /* Find or create a Var node that references the specified column. 
*/ + Var *expr = find_indexkey_var(root, rel, attrs[i]); + + Assert(expr); + + /* + * Find or create a pathkey. We distinguish two cases for performance + * reasons: 1) If the relation in question is a child relation under + * an append node, we don't care about ensuring that we return a + * canonicalized version of its pathkey item. Co-location of + * joins/group-bys happens at the append relation level. In + * create_append_path(), the call to + * cdbpathlocus_pull_above_projection() ensures that canonicalized + * pathkeys are created at the append relation level. (see MPP-3536). + * + * 2) For regular relations, we create a canonical pathkey so that we + * may identify co-location for joins/group-bys. + */ + if (isAppendChildRelation) + { + /** * Append child relation. */ #ifdef DISTRIBUTION_PATHKEYS_DEBUG - PathKey *canonicalPathKeyList = cdb_make_pathkey_for_expr(root, (Node *) expr, eq, true); - /* - * This assert ensures that we should not really find any equivalent keys - * during canonicalization for append child relations. - */ - Assert(list_length(canonicalPathKeyList) == 1); + PathKey *canonicalPathKeyList = cdb_make_pathkey_for_expr(root, (Node *) expr, eq, true); + + /* + * This assert ensures that we should not really find any + * equivalent keys during canonicalization for append child + * relations. + */ + Assert(list_length(canonicalPathKeyList) == 1); #endif - cpathkey = cdb_make_pathkey_for_expr(root, (Node *)expr, eq, false); - Assert(cpathkey); - } - else - { - /** - * Regular relation. + cpathkey = cdb_make_pathkey_for_expr(root, (Node *) expr, eq, false); + Assert(cpathkey); + } + else + { + /** + * Regular relation. */ cpathkey = cdb_make_pathkey_for_expr(root, (Node *) expr, eq, true); - } - Assert(cpathkey); + } + Assert(cpathkey); - /* Append to list of pathkeys. */ - retval = lappend(retval, cpathkey); - } + /* Append to list of pathkeys. */ + retval = lappend(retval, cpathkey); + } list_free_deep(eq); return retval; -} /* cdb_build_distribution_pathkeys */ +} /* cdb_build_distribution_pathkeys */ /* * cdbpathlocus_from_baserel @@ -300,46 +303,47 @@ cdb_build_distribution_pathkeys(PlannerInfo *root, * Returns a locus describing the distribution of a base relation. */ CdbPathLocus -cdbpathlocus_from_baserel(struct PlannerInfo *root, - struct RelOptInfo *rel) +cdbpathlocus_from_baserel(struct PlannerInfo *root, + struct RelOptInfo *rel) { - CdbPathLocus result; - GpPolicy *policy = rel->cdbpolicy; - - if ( Gp_role != GP_ROLE_DISPATCH ) + CdbPathLocus result; + GpPolicy *policy = rel->cdbpolicy; + + if (Gp_role != GP_ROLE_DISPATCH) { CdbPathLocus_MakeEntry(&result); return result; } - if (policy && - policy->ptype == POLICYTYPE_PARTITIONED) - { - /* Are the rows distributed by hashing on specified columns? */ - if (policy->nattrs > 0) - { - List *partkey = cdb_build_distribution_pathkeys(root, - rel, - policy->nattrs, - policy->attrs); - CdbPathLocus_MakeHashed(&result, partkey); - } + if (policy && + policy->ptype == POLICYTYPE_PARTITIONED) + { + /* Are the rows distributed by hashing on specified columns? */ + if (policy->nattrs > 0) + { + List *partkey = cdb_build_distribution_pathkeys(root, + rel, + policy->nattrs, + policy->attrs); - /* Rows are distributed on an unknown criterion (uniformly, we hope!) 
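[Note: the two-case comment above boils down to one boolean: canonicalization is skipped only for append children, whose co-location is resolved at the append-rel level (MPP-3536). The branch could be collapsed to a single call equivalent to the hunk:

	cpathkey = cdb_make_pathkey_for_expr(root, (Node *) expr, eq,
										 !isAppendChildRelation);
]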
*/ - else - CdbPathLocus_MakeStrewn(&result); - } + CdbPathLocus_MakeHashed(&result, partkey); + } - /* Kludge used internally for querying catalogs on segment dbs */ - else if (cdbpathlocus_querysegmentcatalogs) - CdbPathLocus_MakeStrewn(&result); + /* Rows are distributed on an unknown criterion (uniformly, we hope!) */ + else + CdbPathLocus_MakeStrewn(&result); + } + + /* Kludge used internally for querying catalogs on segment dbs */ + else if (cdbpathlocus_querysegmentcatalogs) + CdbPathLocus_MakeStrewn(&result); - /* Normal catalog access */ - else - CdbPathLocus_MakeEntry(&result); + /* Normal catalog access */ + else + CdbPathLocus_MakeEntry(&result); - return result; -} /* cdbpathlocus_from_baserel */ + return result; +} /* cdbpathlocus_from_baserel */ /* @@ -349,26 +353,26 @@ cdbpathlocus_from_baserel(struct PlannerInfo *root, */ CdbPathLocus cdbpathlocus_from_exprs(struct PlannerInfo *root, - List *hash_on_exprs) + List *hash_on_exprs) { - CdbPathLocus locus; - List *partkey = NIL; - List *eq = list_make1(makeString("=")); - ListCell *cell; + CdbPathLocus locus; + List *partkey = NIL; + List *eq = list_make1(makeString("=")); + ListCell *cell; - foreach(cell, hash_on_exprs) - { - Node *expr = (Node *)lfirst(cell); - PathKey *pathkey; + foreach(cell, hash_on_exprs) + { + Node *expr = (Node *) lfirst(cell); + PathKey *pathkey; - pathkey = cdb_make_pathkey_for_expr(root, expr, eq, true); - partkey = lappend(partkey, pathkey); - } + pathkey = cdb_make_pathkey_for_expr(root, expr, eq, true); + partkey = lappend(partkey, pathkey); + } - CdbPathLocus_MakeHashed(&locus, partkey); - list_free_deep(eq); - return locus; -} /* cdbpathlocus_from_exprs */ + CdbPathLocus_MakeHashed(&locus, partkey); + list_free_deep(eq); + return locus; +} /* cdbpathlocus_from_exprs */ /* @@ -382,67 +386,71 @@ cdbpathlocus_from_exprs(struct PlannerInfo *root, * building Var nodes that reference the subquery's result columns. */ CdbPathLocus -cdbpathlocus_from_subquery(struct PlannerInfo *root, - struct Plan *subqplan, - Index subqrelid) +cdbpathlocus_from_subquery(struct PlannerInfo *root, + struct Plan *subqplan, + Index subqrelid) { - CdbPathLocus locus; - Flow *flow = subqplan->flow; - - Insist(flow); - - /* Flow node was made from CdbPathLocus by cdbpathtoplan_create_flow() */ - switch (flow->flotype) - { - case FLOW_SINGLETON: - if (flow->segindex == -1) - CdbPathLocus_MakeEntry(&locus); - else - CdbPathLocus_MakeSingleQE(&locus); - break; - case FLOW_REPLICATED: - CdbPathLocus_MakeReplicated(&locus); - break; - case FLOW_PARTITIONED: - { - List *partkey = NIL; - ListCell *hashexprcell; - List *eq = list_make1(makeString("=")); - foreach(hashexprcell, flow->hashExpr) - { - Node *expr = (Node *) lfirst(hashexprcell); - TargetEntry *tle; - Var *var; - PathKey *pathkey; - - /* Look for hash key expr among the subquery result columns. 
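[Note: cdbpathlocus_from_exprs() above is the building block for "redistribute on these expressions". A hedged usage sketch; 'hashkey_expr' is a placeholder for any hashable expression, not a name from the patch:

	List	   *exprs = list_make1(hashkey_expr);	/* placeholder */
	CdbPathLocus locus = cdbpathlocus_from_exprs(root, exprs);

	Assert(CdbPathLocus_IsHashed(locus));
]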
*/ - tle = tlist_member_ignore_relabel(expr, subqplan->targetlist); - if (!tle) - break; - - Assert(tle->resno >= 1); - var = makeVar(subqrelid, - tle->resno, - exprType((Node *) tle->expr), - exprTypmod((Node *) tle->expr), - 0); - pathkey = cdb_make_pathkey_for_expr(root, (Node *)var, eq, true); - partkey = lappend(partkey, pathkey); - } - if (partkey && - !hashexprcell) - CdbPathLocus_MakeHashed(&locus, partkey); - else - CdbPathLocus_MakeStrewn(&locus); - list_free_deep(eq); - break; - } - default: - CdbPathLocus_MakeNull(&locus); - Insist(0); - } - return locus; -} /* cdbpathlocus_from_subquery */ + CdbPathLocus locus; + Flow *flow = subqplan->flow; + + Insist(flow); + + /* Flow node was made from CdbPathLocus by cdbpathtoplan_create_flow() */ + switch (flow->flotype) + { + case FLOW_SINGLETON: + if (flow->segindex == -1) + CdbPathLocus_MakeEntry(&locus); + else + CdbPathLocus_MakeSingleQE(&locus); + break; + case FLOW_REPLICATED: + CdbPathLocus_MakeReplicated(&locus); + break; + case FLOW_PARTITIONED: + { + List *partkey = NIL; + ListCell *hashexprcell; + List *eq = list_make1(makeString("=")); + + foreach(hashexprcell, flow->hashExpr) + { + Node *expr = (Node *) lfirst(hashexprcell); + TargetEntry *tle; + Var *var; + PathKey *pathkey; + + /* + * Look for hash key expr among the subquery result + * columns. + */ + tle = tlist_member_ignore_relabel(expr, subqplan->targetlist); + if (!tle) + break; + + Assert(tle->resno >= 1); + var = makeVar(subqrelid, + tle->resno, + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + 0); + pathkey = cdb_make_pathkey_for_expr(root, (Node *) var, eq, true); + partkey = lappend(partkey, pathkey); + } + if (partkey && + !hashexprcell) + CdbPathLocus_MakeHashed(&locus, partkey); + else + CdbPathLocus_MakeStrewn(&locus); + list_free_deep(eq); + break; + } + default: + CdbPathLocus_MakeNull(&locus); + Insist(0); + } + return locus; +} /* cdbpathlocus_from_subquery */ /* @@ -454,14 +462,14 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root, * partkey cannot be expressed in terms of the given relids and targetlist. */ List * -cdbpathlocus_get_partkey_exprs(CdbPathLocus locus, - Bitmapset *relids, - List *targetlist) +cdbpathlocus_get_partkey_exprs(CdbPathLocus locus, + Bitmapset *relids, + List *targetlist) { - List *result = NIL; - ListCell *partkeycell; + List *result = NIL; + ListCell *partkeycell; - Assert(cdbpathlocus_is_valid(locus)); + Assert(cdbpathlocus_is_valid(locus)); if (CdbPathLocus_IsHashed(locus)) { @@ -472,25 +480,28 @@ cdbpathlocus_get_partkey_exprs(CdbPathLocus locus, item = cdbpullup_findPathKeyExprInTargetList(pathkey, targetlist); - /* Fail if can't evaluate partkey in the context of this targetlist. */ + /* + * Fail if can't evaluate partkey in the context of this + * targetlist. 
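[Note: the switch in cdbpathlocus_from_subquery() above inverts cdbpathtoplan_create_flow() later in this patch. As implemented in the hunk, Flow maps back to locus as follows:

	FLOW_SINGLETON, segindex == -1  ->  Entry (dispatcher)
	FLOW_SINGLETON, segindex != -1  ->  SingleQE
	FLOW_REPLICATED                 ->  Replicated
	FLOW_PARTITIONED, every hashExpr found in the targetlist  ->  Hashed
	FLOW_PARTITIONED, otherwise     ->  Strewn
]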
+ */ if (!item) return NIL; result = lappend(result, item); } return result; - } + } else if (CdbPathLocus_IsHashedOJ(locus)) { foreach(partkeycell, locus.partkey_oj) { - List *pathkeylist = (List *)lfirst(partkeycell); + List *pathkeylist = (List *) lfirst(partkeycell); ListCell *pathkeylistcell; Expr *item = NULL; foreach(pathkeylistcell, pathkeylist) { - PathKey *pathkey = (PathKey *) lfirst(pathkeylistcell); + PathKey *pathkey = (PathKey *) lfirst(pathkeylistcell); item = cdbpullup_findPathKeyExprInTargetList(pathkey, targetlist); @@ -507,7 +518,7 @@ cdbpathlocus_get_partkey_exprs(CdbPathLocus locus, } else return NIL; -} /* cdbpathlocus_get_partkey_exprs */ +} /* cdbpathlocus_get_partkey_exprs */ /* @@ -529,25 +540,25 @@ cdbpathlocus_get_partkey_exprs(CdbPathLocus locus, * Ignored if 'newvarlist' is specified. */ CdbPathLocus -cdbpathlocus_pull_above_projection(struct PlannerInfo *root, - CdbPathLocus locus, - Bitmapset *relids, - List *targetlist, - List *newvarlist, - Index newrelid) +cdbpathlocus_pull_above_projection(struct PlannerInfo *root, + CdbPathLocus locus, + Bitmapset *relids, + List *targetlist, + List *newvarlist, + Index newrelid) { - CdbPathLocus newlocus; - ListCell *partkeycell; - List *newpartkey = NIL; + CdbPathLocus newlocus; + ListCell *partkeycell; + List *newpartkey = NIL; - Assert(cdbpathlocus_is_valid(locus)); + Assert(cdbpathlocus_is_valid(locus)); if (CdbPathLocus_IsHashed(locus)) { foreach(partkeycell, locus.partkey_h) { - PathKey *oldpathkey; - PathKey *newpathkey = NULL; + PathKey *oldpathkey; + PathKey *newpathkey = NULL; /* Get pathkey for key expr rewritten in terms of projection cols. */ oldpathkey = (PathKey *) lfirst(partkeycell); @@ -558,7 +569,10 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, newvarlist, newrelid); - /* Fail if can't evaluate partkey in the context of this targetlist. */ + /* + * Fail if can't evaluate partkey in the context of this + * targetlist. + */ if (!newpathkey) { CdbPathLocus_MakeStrewn(&newlocus); @@ -578,35 +592,39 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, /* For each column of the partitioning key... */ foreach(partkeycell, locus.partkey_oj) { - PathKey *oldpathkey; - PathKey *newpathkey = NULL; + PathKey *oldpathkey; + PathKey *newpathkey = NULL; /* Get pathkey for key expr rewritten in terms of projection cols. */ - List *pathkeylist = (List *)lfirst(partkeycell); + List *pathkeylist = (List *) lfirst(partkeycell); ListCell *pathkeylistcell; foreach(pathkeylistcell, pathkeylist) - { - oldpathkey = (PathKey *) lfirst(pathkeylistcell); - newpathkey = cdb_pull_up_pathkey(root, - oldpathkey, - relids, - targetlist, - newvarlist, - newrelid); - if (newpathkey) - break; - } - /* - * NB: Targetlist might include columns from both sides of - * outer join "=" comparison, in which case cdb_pull_up_pathkey - * might succeed on pathkeys from more than one pathkeylist. - * The pulled-up locus could then be a HashedOJ locus, perhaps - * saving a Motion when an outer join is followed by UNION ALL - * followed by a join or aggregate. For now, don't bother. - */ - - /* Fail if can't evaluate partkey in the context of this targetlist. 
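[Note: cdbpathlocus_get_partkey_exprs() above returns NIL as soon as one partkey column cannot be rebuilt from the given targetlist, so callers must treat NIL as "distribution not expressible here". Calling-convention sketch, mirroring its use in cdbpathtoplan_create_flow() later in this patch:

	hashExpr = cdbpathlocus_get_partkey_exprs(locus, relids, targetlist);
	if (hashExpr == NIL)
	{
		/* Partkey columns are not projected; treat the rows as Strewn. */
	}
]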
*/ + { + oldpathkey = (PathKey *) lfirst(pathkeylistcell); + newpathkey = cdb_pull_up_pathkey(root, + oldpathkey, + relids, + targetlist, + newvarlist, + newrelid); + if (newpathkey) + break; + } + + /* + * NB: Targetlist might include columns from both sides of outer + * join "=" comparison, in which case cdb_pull_up_pathkey might + * succeed on pathkeys from more than one pathkeylist. The + * pulled-up locus could then be a HashedOJ locus, perhaps saving + * a Motion when an outer join is followed by UNION ALL followed + * by a join or aggregate. For now, don't bother. + */ + + /* + * Fail if can't evaluate partkey in the context of this + * targetlist. + */ if (!newpathkey) { CdbPathLocus_MakeStrewn(&newlocus); @@ -623,7 +641,7 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, } else return locus; -} /* cdbpathlocus_pull_above_projection */ +} /* cdbpathlocus_pull_above_projection */ /* @@ -635,41 +653,41 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, CdbPathLocus cdbpathlocus_join(CdbPathLocus a, CdbPathLocus b) { - ListCell *acell; - ListCell *bcell; - List *equivpathkeylist; - CdbPathLocus ojlocus = {0}; + ListCell *acell; + ListCell *bcell; + List *equivpathkeylist; + CdbPathLocus ojlocus = {0}; Assert(cdbpathlocus_is_valid(a)); Assert(cdbpathlocus_is_valid(b)); - /* Do both input rels have same locus? */ - if (cdbpathlocus_compare(CdbPathLocus_Comparison_Equal, a, b)) - return a; + /* Do both input rels have same locus? */ + if (cdbpathlocus_compare(CdbPathLocus_Comparison_Equal, a, b)) + return a; - /* If one rel is general or replicated, result stays with the other rel. */ - if (CdbPathLocus_IsGeneral(a) || - CdbPathLocus_IsReplicated(a)) - return b; - if (CdbPathLocus_IsGeneral(b) || - CdbPathLocus_IsReplicated(b)) - return a; + /* If one rel is general or replicated, result stays with the other rel. */ + if (CdbPathLocus_IsGeneral(a) || + CdbPathLocus_IsReplicated(a)) + return b; + if (CdbPathLocus_IsGeneral(b) || + CdbPathLocus_IsReplicated(b)) + return a; - /* This is an outer join, or one or both inputs are outer join results. */ + /* This is an outer join, or one or both inputs are outer join results. */ - Assert(CdbPathLocus_Degree(a) > 0 && - CdbPathLocus_Degree(a) == CdbPathLocus_Degree(b)); + Assert(CdbPathLocus_Degree(a) > 0 && + CdbPathLocus_Degree(a) == CdbPathLocus_Degree(b)); - if (CdbPathLocus_IsHashed(a) && - CdbPathLocus_IsHashed(b)) - { - /* Zip the two pathkey lists together to make a HashedOJ locus. */ + if (CdbPathLocus_IsHashed(a) && + CdbPathLocus_IsHashed(b)) + { + /* Zip the two pathkey lists together to make a HashedOJ locus. 
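[Note: a worked example of the zip that follows. Joining Hashed(a.x) with Hashed(b.y) on a.x = b.y yields a HashedOJ locus whose single partkey column is the equivalence list {pathkey(a.x), pathkey(b.y)}: rows hashed on either expression land on the same segment, so no Motion is needed above the join. With apathkey and bpathkey standing for those two pathkeys, the construction is exactly:

	equivpathkeylist = list_make2(apathkey, bpathkey);
	partkey_oj = lappend(partkey_oj, equivpathkeylist);
	CdbPathLocus_MakeHashedOJ(&ojlocus, partkey_oj);
]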
*/ List *partkey_oj = NIL; forboth(acell, a.partkey_h, bcell, b.partkey_h) - { - PathKey *apathkey = (PathKey *) lfirst(acell); - PathKey *bpathkey = (PathKey *)lfirst(bcell); + { + PathKey *apathkey = (PathKey *) lfirst(acell); + PathKey *bpathkey = (PathKey *) lfirst(bcell); equivpathkeylist = list_make2(apathkey, bpathkey); partkey_oj = lappend(partkey_oj, equivpathkeylist); @@ -677,14 +695,14 @@ cdbpathlocus_join(CdbPathLocus a, CdbPathLocus b) CdbPathLocus_MakeHashedOJ(&ojlocus, partkey_oj); Assert(cdbpathlocus_is_valid(ojlocus)); return ojlocus; - } + } - if (!CdbPathLocus_IsHashedOJ(a)) - CdbSwap(CdbPathLocus, a, b); + if (!CdbPathLocus_IsHashedOJ(a)) + CdbSwap(CdbPathLocus, a, b); - Assert(CdbPathLocus_IsHashedOJ(a)); - Assert(CdbPathLocus_IsHashed(b) || - CdbPathLocus_IsHashedOJ(b)); + Assert(CdbPathLocus_IsHashedOJ(a)); + Assert(CdbPathLocus_IsHashed(b) || + CdbPathLocus_IsHashedOJ(b)); if (CdbPathLocus_IsHashed(b)) { @@ -693,11 +711,11 @@ cdbpathlocus_join(CdbPathLocus a, CdbPathLocus b) forboth(acell, a.partkey_oj, bcell, b.partkey_h) { List *aequivpathkeylist = (List *) lfirst(acell); - PathKey *bpathkey = (PathKey *) lfirst(bcell); + PathKey *bpathkey = (PathKey *) lfirst(bcell); - equivpathkeylist = lappend(list_copy(aequivpathkeylist), bpathkey); + equivpathkeylist = lappend(list_copy(aequivpathkeylist), bpathkey); partkey_oj = lappend(partkey_oj, equivpathkeylist); - } + } CdbPathLocus_MakeHashedOJ(&ojlocus, partkey_oj); } else if (CdbPathLocus_IsHashedOJ(b)) @@ -706,18 +724,18 @@ cdbpathlocus_join(CdbPathLocus a, CdbPathLocus b) forboth(acell, a.partkey_oj, bcell, b.partkey_oj) { - List *aequivpathkeylist = (List *) lfirst(acell); - List *bequivpathkeylist = (List *) lfirst(bcell); + List *aequivpathkeylist = (List *) lfirst(acell); + List *bequivpathkeylist = (List *) lfirst(bcell); - equivpathkeylist = list_union_ptr(aequivpathkeylist, - bequivpathkeylist); + equivpathkeylist = list_union_ptr(aequivpathkeylist, + bequivpathkeylist); partkey_oj = lappend(partkey_oj, equivpathkeylist); } CdbPathLocus_MakeHashedOJ(&ojlocus, partkey_oj); - } - Assert(cdbpathlocus_is_valid(ojlocus)); - return ojlocus; -} /* cdbpathlocus_join */ + } + Assert(cdbpathlocus_is_valid(ojlocus)); + return ojlocus; +} /* cdbpathlocus_join */ /* * cdbpathlocus_is_hashed_on_exprs @@ -731,9 +749,9 @@ cdbpathlocus_join(CdbPathLocus a, CdbPathLocus b) bool cdbpathlocus_is_hashed_on_exprs(CdbPathLocus locus, List *exprlist) { - ListCell *partkeycell; + ListCell *partkeycell; - Assert(cdbpathlocus_is_valid(locus)); + Assert(cdbpathlocus_is_valid(locus)); if (CdbPathLocus_IsHashed(locus)) { @@ -742,14 +760,15 @@ cdbpathlocus_is_hashed_on_exprs(CdbPathLocus locus, List *exprlist) bool found = false; ListCell *i; - /* Does pathkey have an expr that is equal() to one in exprlist? */ - PathKey *pathkey = (PathKey *) lfirst(partkeycell); + /* Does pathkey have an expr that is equal() to one in exprlist? */ + PathKey *pathkey = (PathKey *) lfirst(partkeycell); Assert(IsA(pathkey, PathKey)); foreach(i, pathkey->pk_eclass->ec_members) { EquivalenceMember *em = (EquivalenceMember *) lfirst(i); + if (list_member(exprlist, em->em_expr)) { found = true; @@ -758,7 +777,7 @@ cdbpathlocus_is_hashed_on_exprs(CdbPathLocus locus, List *exprlist) } if (!found) return false; - } + } /* Every column of the partkey contains an expr in exprlist. 
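[Note: taken together, the branches of cdbpathlocus_join() above combine loci by these rules:

	equal loci             ->  either input, unchanged
	General or Replicated  ->  the other input's locus
	Hashed   + Hashed      ->  HashedOJ, pairwise list_make2() zip
	HashedOJ + Hashed      ->  HashedOJ, lappend() onto each equivalence list
	HashedOJ + HashedOJ    ->  HashedOJ, list_union_ptr() of equivalence lists
]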
*/ return true; } @@ -766,38 +785,40 @@ cdbpathlocus_is_hashed_on_exprs(CdbPathLocus locus, List *exprlist) { foreach(partkeycell, locus.partkey_oj) { - List *pathkeylist = (List *) lfirst(partkeycell); - ListCell *pathkeylistcell; + List *pathkeylist = (List *) lfirst(partkeycell); + ListCell *pathkeylistcell; bool found = false; - foreach(pathkeylistcell, pathkeylist) - { - /* Does some expr in pathkey match some item in exprlist? */ - PathKey *item = (PathKey *) lfirst(pathkeylistcell); - ListCell *i; + + foreach(pathkeylistcell, pathkeylist) + { + /* Does some expr in pathkey match some item in exprlist? */ + PathKey *item = (PathKey *) lfirst(pathkeylistcell); + ListCell *i; Assert(IsA(item, PathKey)); foreach(i, item->pk_eclass->ec_members) { EquivalenceMember *em = (EquivalenceMember *) lfirst(i); + if (list_member(exprlist, em->em_expr)) { found = true; break; } - } + } if (found) break; - } - if (!found) - return false; - } + } + if (!found) + return false; + } /* Every column of the partkey contains an expr in exprlist. */ return true; } else return !CdbPathLocus_IsStrewn(locus); -} /* cdbpathlocus_is_hashed_on_exprs */ +} /* cdbpathlocus_is_hashed_on_exprs */ /* * cdbpathlocus_is_hashed_on_eclasses @@ -823,7 +844,7 @@ cdbpathlocus_is_hashed_on_eclasses(CdbPathLocus locus, List *eclasses, { foreach(partkeycell, locus.partkey_h) { - PathKey *pathkey = (PathKey *) lfirst(partkeycell); + PathKey *pathkey = (PathKey *) lfirst(partkeycell); bool found = false; EquivalenceClass *pk_ec; @@ -866,7 +887,7 @@ cdbpathlocus_is_hashed_on_eclasses(CdbPathLocus locus, List *eclasses, foreach(pathkeylistcell, pathkeylist) { - PathKey *pathkey = (PathKey *) lfirst(pathkeylistcell); + PathKey *pathkey = (PathKey *) lfirst(pathkeylistcell); EquivalenceClass *pk_ec; /* Does pathkey have an eclass that's not in 'eclasses'? */ @@ -903,7 +924,7 @@ cdbpathlocus_is_hashed_on_eclasses(CdbPathLocus locus, List *eclasses, } else return !CdbPathLocus_IsStrewn(locus); -} /* cdbpathlocus_is_hashed_on_exprs */ +} /* cdbpathlocus_is_hashed_on_exprs */ /* @@ -925,10 +946,10 @@ cdbpathlocus_is_hashed_on_eclasses(CdbPathLocus locus, List *eclasses, bool cdbpathlocus_is_hashed_on_relids(CdbPathLocus locus, Bitmapset *relids) { - ListCell *partkeycell; - ListCell *pathkeycell; + ListCell *partkeycell; + ListCell *pathkeycell; - Assert(cdbpathlocus_is_valid(locus)); + Assert(cdbpathlocus_is_valid(locus)); if (CdbPathLocus_IsHashed(locus)) { @@ -936,44 +957,51 @@ cdbpathlocus_is_hashed_on_relids(CdbPathLocus locus, Bitmapset *relids) { bool found = false; - /* Does pathkey contain a Var whose varno is in relids? */ - PathKey *pathkey = (PathKey *) lfirst(partkeycell); + /* Does pathkey contain a Var whose varno is in relids? */ + PathKey *pathkey = (PathKey *) lfirst(partkeycell); Assert(IsA(pathkey, PathKey)); - foreach(pathkeycell, pathkey->pk_eclass->ec_members) - { + foreach(pathkeycell, pathkey->pk_eclass->ec_members) + { EquivalenceMember *em = (EquivalenceMember *) lfirst(pathkeycell); - if (IsA(em->em_expr, Var) && bms_is_subset(em->em_relids, relids)) + + if (IsA(em->em_expr, Var) &&bms_is_subset(em->em_relids, relids)) { found = true; break; } - } + } if (!found) return false; - } - /* Every column of the partkey contains a Var whose varno is in relids. */ + } + + /* + * Every column of the partkey contains a Var whose varno is in + * relids. 
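[Note: cdbpathlocus_is_hashed_on_exprs(), _eclasses() and _relids() answer the same question in three vocabularies: expressions, equivalence classes, and varnos. The typical planner-side use is deciding whether a Redistribute Motion can be skipped. Illustrative sketch only; the grouping context is hypothetical, the call is the one above:

	if (cdbpathlocus_is_hashed_on_exprs(subpath->locus, groupexprs))
	{
		/* Grouping keys are co-located: aggregate in place, no Motion. */
	}
]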
+ */ return true; } else if (CdbPathLocus_IsHashedOJ(locus)) { - foreach(partkeycell, locus.partkey_oj) + foreach(partkeycell, locus.partkey_oj) { bool found = false; - List *pathkeylist = (List *) lfirst(partkeycell); - ListCell *pathkeylistcell; - foreach(pathkeylistcell, pathkeylist) - { - /* Does pathkey contain a Var whose varno is in relids? */ - PathKey *item = (PathKey *)lfirst(pathkeylistcell); - ListCell *i; + List *pathkeylist = (List *) lfirst(partkeycell); + ListCell *pathkeylistcell; + + foreach(pathkeylistcell, pathkeylist) + { + /* Does pathkey contain a Var whose varno is in relids? */ + PathKey *item = (PathKey *) lfirst(pathkeylistcell); + ListCell *i; Assert(IsA(item, PathKey)); foreach(i, item->pk_eclass->ec_members) { EquivalenceMember *em = (EquivalenceMember *) lfirst(i); - if (IsA(em->em_expr, Var) && bms_is_subset(em->em_relids, relids)) + + if (IsA(em->em_expr, Var) &&bms_is_subset(em->em_relids, relids)) { found = true; break; @@ -981,16 +1009,20 @@ cdbpathlocus_is_hashed_on_relids(CdbPathLocus locus, Bitmapset *relids) } if (found) break; - } - if (!found) - return false; - } - /* Every column of the partkey contains a Var whose varno is in relids. */ + } + if (!found) + return false; + } + + /* + * Every column of the partkey contains a Var whose varno is in + * relids. + */ return true; - } + } else return !CdbPathLocus_IsStrewn(locus); -} /* cdbpathlocus_is_hashed_on_relids */ +} /* cdbpathlocus_is_hashed_on_relids */ /* @@ -1001,14 +1033,14 @@ cdbpathlocus_is_hashed_on_relids(CdbPathLocus locus, Bitmapset *relids) bool cdbpathlocus_is_valid(CdbPathLocus locus) { - ListCell *partkeycell; + ListCell *partkeycell; if (!CdbLocusType_IsValid(locus.locustype)) goto bad; - if (!CdbPathLocus_IsHashed(locus) && locus.partkey_h != NIL) + if (!CdbPathLocus_IsHashed(locus) && locus.partkey_h != NIL) goto bad; - if (!CdbPathLocus_IsHashedOJ(locus) && locus.partkey_oj != NIL) + if (!CdbPathLocus_IsHashedOJ(locus) && locus.partkey_oj != NIL) goto bad; if (CdbPathLocus_IsHashed(locus)) @@ -1019,7 +1051,8 @@ cdbpathlocus_is_valid(CdbPathLocus locus) goto bad; foreach(partkeycell, locus.partkey_h) { - PathKey *item = (PathKey *) lfirst(partkeycell); + PathKey *item = (PathKey *) lfirst(partkeycell); + if (!item || !IsA(item, PathKey)) goto bad; } @@ -1033,13 +1066,13 @@ cdbpathlocus_is_valid(CdbPathLocus locus) foreach(partkeycell, locus.partkey_oj) { List *item = (List *) lfirst(partkeycell); + if (!item || !IsA(item, List)) goto bad; } } - return true; + return true; bad: - return false; -} /* cdbpathlocus_is_valid */ - + return false; +} /* cdbpathlocus_is_valid */ diff --git a/src/backend/cdb/cdbpathtoplan.c b/src/backend/cdb/cdbpathtoplan.c index dc43484560..d029da01b6 100644 --- a/src/backend/cdb/cdbpathtoplan.c +++ b/src/backend/cdb/cdbpathtoplan.c @@ -17,191 +17,192 @@ #include "optimizer/planmain.h" /* make_sort_from_pathkeys() */ #include "optimizer/tlist.h" -#include "cdb/cdbllize.h" /* makeFlow() */ -#include "cdb/cdbmutate.h" /* make_*_motion() */ +#include "cdb/cdbllize.h" /* makeFlow() */ +#include "cdb/cdbmutate.h" /* make_*_motion() */ #include "cdb/cdbutil.h" -#include "cdb/cdbvars.h" /* gp_singleton_segindex */ +#include "cdb/cdbvars.h" /* gp_singleton_segindex */ -#include "cdb/cdbpathtoplan.h" /* me */ +#include "cdb/cdbpathtoplan.h" /* me */ /* * cdbpathtoplan_create_flow */ Flow * -cdbpathtoplan_create_flow(PlannerInfo *root, - CdbPathLocus locus, - Relids relids, - List *pathkeys, - Plan *plan) +cdbpathtoplan_create_flow(PlannerInfo *root, + 
CdbPathLocus locus, + Relids relids, + List *pathkeys, + Plan *plan) { - Flow *flow = NULL; - - /* Distribution */ - if (CdbPathLocus_IsEntry(locus)) - { - flow = makeFlow(FLOW_SINGLETON); - flow->segindex = -1; - } - else if (CdbPathLocus_IsSingleQE(locus)) - { - flow = makeFlow(FLOW_SINGLETON); - flow->segindex = 0; - } - else if (CdbPathLocus_IsGeneral(locus)) - { - flow = makeFlow(FLOW_SINGLETON); - flow->segindex = 0; - } - else if (CdbPathLocus_IsReplicated(locus)) - { - flow = makeFlow(FLOW_REPLICATED); - } - else if (CdbPathLocus_IsHashed(locus) || - CdbPathLocus_IsHashedOJ(locus)) - { - flow = makeFlow(FLOW_PARTITIONED); - flow->hashExpr = cdbpathlocus_get_partkey_exprs(locus, - relids, - plan->targetlist); - /* - * hashExpr can be NIL if the rel is partitioned on columns that aren't - * projected (i.e. are not present in the result of this Path operator). - */ - } - else if (CdbPathLocus_IsStrewn(locus)) - flow = makeFlow(FLOW_PARTITIONED); - else - Insist(0); + Flow *flow = NULL; + + /* Distribution */ + if (CdbPathLocus_IsEntry(locus)) + { + flow = makeFlow(FLOW_SINGLETON); + flow->segindex = -1; + } + else if (CdbPathLocus_IsSingleQE(locus)) + { + flow = makeFlow(FLOW_SINGLETON); + flow->segindex = 0; + } + else if (CdbPathLocus_IsGeneral(locus)) + { + flow = makeFlow(FLOW_SINGLETON); + flow->segindex = 0; + } + else if (CdbPathLocus_IsReplicated(locus)) + { + flow = makeFlow(FLOW_REPLICATED); + } + else if (CdbPathLocus_IsHashed(locus) || + CdbPathLocus_IsHashedOJ(locus)) + { + flow = makeFlow(FLOW_PARTITIONED); + flow->hashExpr = cdbpathlocus_get_partkey_exprs(locus, + relids, + plan->targetlist); + + /* + * hashExpr can be NIL if the rel is partitioned on columns that + * aren't projected (i.e. are not present in the result of this Path + * operator). + */ + } + else if (CdbPathLocus_IsStrewn(locus)) + flow = makeFlow(FLOW_PARTITIONED); + else + Insist(0); - flow->req_move = MOVEMENT_NONE; + flow->req_move = MOVEMENT_NONE; flow->locustype = locus.locustype; - return flow; -} /* cdbpathtoplan_create_flow */ + return flow; +} /* cdbpathtoplan_create_flow */ /* * cdbpathtoplan_create_motion_plan */ Motion * -cdbpathtoplan_create_motion_plan(PlannerInfo *root, - CdbMotionPath *path, - Plan *subplan) +cdbpathtoplan_create_motion_plan(PlannerInfo *root, + CdbMotionPath *path, + Plan *subplan) { - Motion *motion = NULL; - Path *subpath = path->subpath; - - /* Send all tuples to a single process? */ - if (CdbPathLocus_IsBottleneck(path->path.locus)) - { - int destSegIndex = -1; /* to dispatcher */ - - if (CdbPathLocus_IsSingleQE(path->path.locus)) - destSegIndex = gp_singleton_segindex; /* to singleton qExec */ - - if (path->path.pathkeys) - { - /* - * Build a dummy Sort node. We'll take its sort key info to - * define our Merge Receive keys. Unchanged subplan ptr is - * returned to us if ordering is degenerate (all cols constant). - */ - Sort *sort = make_sort_from_pathkeys(root, - subplan, - path->path.pathkeys, - -1.0, - true); - - /* Merge Receive to preserve ordering */ - if (sort) - { - /* Result node might have been added below the Sort */ - subplan = sort->plan.lefttree; - motion = make_sorted_union_motion(root, + Motion *motion = NULL; + Path *subpath = path->subpath; + + /* Send all tuples to a single process? 
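[Note: cdbpathtoplan_create_flow() above maps each locus onto an executor-side Flow, completing the round trip with cdbpathlocus_from_subquery() earlier in this patch:

	Entry              ->  FLOW_SINGLETON, segindex = -1 (dispatcher)
	SingleQE, General  ->  FLOW_SINGLETON, segindex = 0
	Replicated         ->  FLOW_REPLICATED
	Hashed, HashedOJ   ->  FLOW_PARTITIONED, hashExpr from the partkey
	                       (NIL when the partkey is not projected)
	Strewn             ->  FLOW_PARTITIONED, no hashExpr
]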
*/ + if (CdbPathLocus_IsBottleneck(path->path.locus)) + { + int destSegIndex = -1; /* to dispatcher */ + + if (CdbPathLocus_IsSingleQE(path->path.locus)) + destSegIndex = gp_singleton_segindex; /* to singleton qExec */ + + if (path->path.pathkeys) + { + /* + * Build a dummy Sort node. We'll take its sort key info to + * define our Merge Receive keys. Unchanged subplan ptr is + * returned to us if ordering is degenerate (all cols constant). + */ + Sort *sort = make_sort_from_pathkeys(root, + subplan, + path->path.pathkeys, + -1.0, + true); + + /* Merge Receive to preserve ordering */ + if (sort) + { + /* Result node might have been added below the Sort */ + subplan = sort->plan.lefttree; + motion = make_sorted_union_motion(root, subplan, - destSegIndex, + destSegIndex, path->path.pathkeys, - false /* useExecutorVarFormat */ - ); - } - - /* Degenerate ordering... build unordered Union Receive */ - else - motion = make_union_motion(subplan, - destSegIndex, - false /* useExecutorVarFormat */ - ); - } - - /* Unordered Union Receive */ - else - motion = make_union_motion(subplan, - destSegIndex, - false /* useExecutorVarFormat */ - ); - } - - /* Send all of the tuples to all of the QEs in gang above... */ - else if (CdbPathLocus_IsReplicated(path->path.locus)) - motion = make_broadcast_motion(subplan, - false /* useExecutorVarFormat */ - ); - - /* Hashed redistribution to all QEs in gang above... */ - else if (CdbPathLocus_IsHashed(path->path.locus) || - CdbPathLocus_IsHashedOJ(path->path.locus)) - { - List *hashExpr = cdbpathlocus_get_partkey_exprs(path->path.locus, - path->path.parent->relids, - subplan->targetlist); - Insist(hashExpr); - - /** + false /* useExecutorVarFormat */ + ); + } + + /* Degenerate ordering... build unordered Union Receive */ + else + motion = make_union_motion(subplan, + destSegIndex, + false /* useExecutorVarFormat */ + ); + } + + /* Unordered Union Receive */ + else + motion = make_union_motion(subplan, + destSegIndex, + false /* useExecutorVarFormat */ + ); + } + + /* Send all of the tuples to all of the QEs in gang above... */ + else if (CdbPathLocus_IsReplicated(path->path.locus)) + motion = make_broadcast_motion(subplan, + false /* useExecutorVarFormat */ + ); + + /* Hashed redistribution to all QEs in gang above... */ + else if (CdbPathLocus_IsHashed(path->path.locus) || + CdbPathLocus_IsHashedOJ(path->path.locus)) + { + List *hashExpr = cdbpathlocus_get_partkey_exprs(path->path.locus, + path->path.parent->relids, + subplan->targetlist); + + Insist(hashExpr); + + /** * If there are subplans in the hashExpr, push it down to lower level. */ - if (contain_subplans((Node *) hashExpr)) + if (contain_subplans((Node *) hashExpr)) { /* make a Result node to do the projection if necessary */ if (!is_projection_capable_plan(subplan)) { - List *tlist = copyObject(subplan->targetlist); + List *tlist = copyObject(subplan->targetlist); subplan = (Plan *) make_result(root, tlist, NULL, subplan); } subplan->targetlist = add_to_flat_tlist(subplan->targetlist, hashExpr, - true /* resjunk */); - } - motion = make_hashed_motion(subplan, - hashExpr, - false /* useExecutorVarFormat */); - } - else - Insist(0); - - /* - * Decorate the subplan with a Flow node telling the plan slicer - * what kind of gang will be needed to execute the subplan. - */ - subplan->flow = cdbpathtoplan_create_flow(root, - subpath->locus, - subpath->parent - ? 
subpath->parent->relids - : NULL, - subpath->pathkeys, - subplan); + true /* resjunk */ ); + } + motion = make_hashed_motion(subplan, + hashExpr, + false /* useExecutorVarFormat */ ); + } + else + Insist(0); + + /* + * Decorate the subplan with a Flow node telling the plan slicer what kind + * of gang will be needed to execute the subplan. + */ + subplan->flow = cdbpathtoplan_create_flow(root, + subpath->locus, + subpath->parent + ? subpath->parent->relids + : NULL, + subpath->pathkeys, + subplan); /** * If plan has a flow node, and its child is projection capable, * then ensure all entries of hashExpr are in the targetlist. */ if (subplan->flow - && subplan->flow->hashExpr - && is_projection_capable_plan(subplan)) + && subplan->flow->hashExpr + && is_projection_capable_plan(subplan)) { - subplan->targetlist = add_to_flat_tlist(subplan->targetlist, subplan->flow->hashExpr, true /* resjunk */); + subplan->targetlist = add_to_flat_tlist(subplan->targetlist, subplan->flow->hashExpr, true /* resjunk */ ); } - return motion; -} /* cdbpathtoplan_create_motion_plan */ - - + return motion; +} /* cdbpathtoplan_create_motion_plan */ diff --git a/src/backend/cdb/cdbpersistentbuild.c b/src/backend/cdb/cdbpersistentbuild.c index 5c39a21acb..f24ba354a4 100755 --- a/src/backend/cdb/cdbpersistentbuild.c +++ b/src/backend/cdb/cdbpersistentbuild.c @@ -56,107 +56,108 @@ PersistentBuild_NonTransactionTruncate(RelFileNode *relFileNode) PersistentFileSysObjName fsObjName; PersistentFileSysObjName_SetRelationFile( - &fsObjName, - relFileNode, - /* segmentFileNum */ 0); + &fsObjName, + relFileNode, + /* segmentFileNum */ 0); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), - "Non-transaction truncate of '%s'", + elog(Persistent_DebugPrintLevel(), + "Non-transaction truncate of '%s'", PersistentFileSysObjName_ObjectName(&fsObjName)); - + smgrRelation = smgropen(*relFileNode); - + smgrtruncate( - smgrRelation, - 0, - /* isTemp */ true, - /* isLocalBuf */ false, - /* persistentTid */ NULL, - /* persistentSerialNum */ 0); - + smgrRelation, + 0, + /* isTemp */ true, + /* isLocalBuf */ false, + /* persistentTid */ NULL, + /* persistentSerialNum */ 0); + smgrclose(smgrRelation); } -static void PersistentBuild_ScanGpPersistentRelationNodeForGlobal( - Relation gp_relation_node, +static void +PersistentBuild_ScanGpPersistentRelationNodeForGlobal( + Relation gp_relation_node, - int64 *count) + int64 *count) { PersistentFileSysObjData *fileSysObjData; - PersistentFileSysObjSharedData *fileSysObjSharedData; + PersistentFileSysObjSharedData *fileSysObjSharedData; PersistentStoreScan storeScan; - - Datum values[Natts_gp_persistent_relation_node]; - + + Datum values[Natts_gp_persistent_relation_node]; + ItemPointerData persistentTid; - int64 persistentSerialNum; + int64 persistentSerialNum; PersistentFileSysObj_GetDataPtrs( - PersistentFsObjType_RelationFile, - &fileSysObjData, - &fileSysObjSharedData); - + PersistentFsObjType_RelationFile, + &fileSysObjData, + &fileSysObjSharedData); + PersistentStore_BeginScan( - &fileSysObjData->storeData, - &fileSysObjSharedData->storeSharedData, - &storeScan); + &fileSysObjData->storeData, + &fileSysObjSharedData->storeSharedData, + &storeScan); while (PersistentStore_GetNext( - &storeScan, - values, - &persistentTid, - &persistentSerialNum)) + &storeScan, + values, + &persistentTid, + &persistentSerialNum)) { - RelFileNode relFileNode; - int32 segmentFileNum; - - PersistentFileSysRelStorageMgr relationStorageManager; - PersistentFileSysState persistentState; - int64 
createMirrorDataLossTrackingSessionNum; - MirroredObjectExistenceState mirrorExistenceState; + RelFileNode relFileNode; + int32 segmentFileNum; + + PersistentFileSysRelStorageMgr relationStorageManager; + PersistentFileSysState persistentState; + int64 createMirrorDataLossTrackingSessionNum; + MirroredObjectExistenceState mirrorExistenceState; MirroredRelDataSynchronizationState mirrorDataSynchronizationState; - bool mirrorBufpoolMarkedForScanIncrementalResync; - int64 mirrorBufpoolResyncChangedPageCount; - XLogRecPtr mirrorBufpoolResyncCkptLoc; - BlockNumber mirrorBufpoolResyncCkptBlockNum; - int64 mirrorAppendOnlyLossEof; - int64 mirrorAppendOnlyNewEof; + bool mirrorBufpoolMarkedForScanIncrementalResync; + int64 mirrorBufpoolResyncChangedPageCount; + XLogRecPtr mirrorBufpoolResyncCkptLoc; + BlockNumber mirrorBufpoolResyncCkptBlockNum; + int64 mirrorAppendOnlyLossEof; + int64 mirrorAppendOnlyNewEof; PersistentFileSysRelBufpoolKind relBufpoolKind; - TransactionId parentXid; - int64 serialNum; - - PersistentFileSysObjName fsObjName; + TransactionId parentXid; + int64 serialNum; + + PersistentFileSysObjName fsObjName; GpPersistentRelationNode_GetValues( - values, - &relFileNode.spcNode, - &relFileNode.dbNode, - &relFileNode.relNode, - &segmentFileNum, - &relationStorageManager, - &persistentState, - &createMirrorDataLossTrackingSessionNum, - &mirrorExistenceState, - &mirrorDataSynchronizationState, - &mirrorBufpoolMarkedForScanIncrementalResync, - &mirrorBufpoolResyncChangedPageCount, - &mirrorBufpoolResyncCkptLoc, - &mirrorBufpoolResyncCkptBlockNum, - &mirrorAppendOnlyLossEof, - &mirrorAppendOnlyNewEof, - &relBufpoolKind, - &parentXid, - &serialNum); + values, + &relFileNode.spcNode, + &relFileNode.dbNode, + &relFileNode.relNode, + &segmentFileNum, + &relationStorageManager, + &persistentState, + &createMirrorDataLossTrackingSessionNum, + &mirrorExistenceState, + &mirrorDataSynchronizationState, + &mirrorBufpoolMarkedForScanIncrementalResync, + &mirrorBufpoolResyncChangedPageCount, + &mirrorBufpoolResyncCkptLoc, + &mirrorBufpoolResyncCkptBlockNum, + &mirrorAppendOnlyLossEof, + &mirrorAppendOnlyNewEof, + &relBufpoolKind, + &parentXid, + &serialNum); if (persistentState == PersistentFileSysState_Free) continue; PersistentFileSysObjName_SetRelationFile( - &fsObjName, - &relFileNode, - segmentFileNum); + &fsObjName, + &relFileNode, + segmentFileNum); if (relFileNode.spcNode != GLOBALTABLESPACE_OID) continue; @@ -165,15 +166,15 @@ static void PersistentBuild_ScanGpPersistentRelationNodeForGlobal( elog(ERROR, "Only expecting global tables to be Buffer Pool managed"); InsertGpRelationNodeTuple( - gp_relation_node, - relFileNode.relNode, // pg_class OID - /* relationName */ NULL, // Optional. - (relFileNode.spcNode == MyDatabaseTableSpace) ? 0:relFileNode.spcNode, - relFileNode.relNode, // pg_class relfilenode - /* segmentFileNum */ 0, - /* updateIndex */ false, - &persistentTid, - persistentSerialNum); + gp_relation_node, + relFileNode.relNode, //pg_class OID + /* relationName */ NULL, //Optional. + (relFileNode.spcNode == MyDatabaseTableSpace) ? 
0 : relFileNode.spcNode, + relFileNode.relNode, //pg_class relfilenode + /* segmentFileNum */ 0, + /* updateIndex */ false, + &persistentTid, + persistentSerialNum); (*count)++; } @@ -181,56 +182,57 @@ static void PersistentBuild_ScanGpPersistentRelationNodeForGlobal( PersistentStore_EndScan(&storeScan); } -static void PersistentBuild_PopulateGpRelationNode( - DatabaseInfo *info, +static void +PersistentBuild_PopulateGpRelationNode( + DatabaseInfo *info, - Oid defaultTablespace, + Oid defaultTablespace, - MirroredObjectExistenceState mirrorExistenceState, + MirroredObjectExistenceState mirrorExistenceState, - MirroredRelDataSynchronizationState relDataSynchronizationState, + MirroredRelDataSynchronizationState relDataSynchronizationState, - int64 *count) + int64 *count) { - Relation gp_relation_node; - int r; + Relation gp_relation_node; + int r; RelFileNode indexRelFileNode; - bool indexFound; - Relation gp_relation_node_index; + bool indexFound; + Relation gp_relation_node_index; struct IndexInfo *indexInfo; if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "PersistentBuild_PopulateGpRelationNode: Enter for dbOid %u", info->database); MemSet(&indexRelFileNode, 0, sizeof(RelFileNode)); indexFound = false; - - gp_relation_node = - DirectOpen_GpRelationNodeOpen( - defaultTablespace, - info->database); + + gp_relation_node = + DirectOpen_GpRelationNodeOpen( + defaultTablespace, + info->database); for (r = 0; r < info->dbInfoRelArrayCount; r++) { - DbInfoRel *dbInfoRel = &info->dbInfoRelArray[r]; - + DbInfoRel *dbInfoRel = &info->dbInfoRelArray[r]; + RelFileNode relFileNode; PersistentFileSysRelStorageMgr relStorageMgr; ItemPointerData persistentTid; - int64 persistentSerialNum; + int64 persistentSerialNum; if (dbInfoRel->dbInfoRelKey.reltablespace == GLOBALTABLESPACE_OID && info->database != TemplateDbOid) continue; relFileNode.spcNode = dbInfoRel->dbInfoRelKey.reltablespace; - relFileNode.dbNode = - (dbInfoRel->dbInfoRelKey.reltablespace == GLOBALTABLESPACE_OID ? - 0 : info->database); + relFileNode.dbNode = + (dbInfoRel->dbInfoRelKey.reltablespace == GLOBALTABLESPACE_OID ? + 0 : info->database); relFileNode.relNode = dbInfoRel->dbInfoRelKey.relfilenode; if (dbInfoRel->relationOid == GpRelationNodeOidIndexId) @@ -240,83 +242,85 @@ static void PersistentBuild_PopulateGpRelationNode( } relStorageMgr = ( - (dbInfoRel->relstorage == RELSTORAGE_AOROWS || - dbInfoRel->relstorage == RELSTORAGE_AOCOLS ) ? - PersistentFileSysRelStorageMgr_AppendOnly : - PersistentFileSysRelStorageMgr_BufferPool); + (dbInfoRel->relstorage == RELSTORAGE_AOROWS || + dbInfoRel->relstorage == RELSTORAGE_AOCOLS) ? + PersistentFileSysRelStorageMgr_AppendOnly : + PersistentFileSysRelStorageMgr_BufferPool); /* - * The gp_relation_node mapping table is empty, so use the physical files as - * the guide. + * The gp_relation_node mapping table is empty, so use the physical + * files as the guide. 
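[Note: from here the function emits one gp_persistent_relation_node entry per physical unit, and the shape differs by storage manager: a buffer-pool (heap) relation always gets a single segment_file_num = 0 entry no matter how many segment files exist on disk, while an append-only relation gets one entry per segment file, with the logical EOF merged in from the ao[cs]seg catalog. Schematically (values illustrative):

	heap relation :  (relfilenode, segfile 0)
	AO relation   :  (relfilenode, segfile 0), (relfilenode, segfile 1, eof1), ...
]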
*/ if (relStorageMgr == PersistentFileSysRelStorageMgr_BufferPool) { PersistentFileSysRelStorageMgr localRelStorageMgr; PersistentFileSysRelBufpoolKind relBufpoolKind; - + GpPersistentRelationNode_GetRelationInfo( - dbInfoRel->relkind, - dbInfoRel->relstorage, - dbInfoRel->relam, - &localRelStorageMgr, - &relBufpoolKind); + dbInfoRel->relkind, + dbInfoRel->relstorage, + dbInfoRel->relam, + &localRelStorageMgr, + &relBufpoolKind); Assert(localRelStorageMgr == PersistentFileSysRelStorageMgr_BufferPool); /* * Heap tables only ever add a single segment_file_num=0 entry to - * gp_persistent_relation regardless of how many segment files there - * really are. + * gp_persistent_relation regardless of how many segment files + * there really are. */ PersistentRelation_AddCreated( - &relFileNode, - /* segmentFileNum */ 0, - relStorageMgr, - relBufpoolKind, - mirrorExistenceState, - relDataSynchronizationState, - /* mirrorAppendOnlyLossEof */ 0, - /* mirrorAppendOnlyNewEof */ 0, - dbInfoRel->relname, - &persistentTid, - &persistentSerialNum, - /* flushToXLog */ false); - + &relFileNode, + /* segmentFileNum */ 0, + relStorageMgr, + relBufpoolKind, + mirrorExistenceState, + relDataSynchronizationState, + /* mirrorAppendOnlyLossEof */ 0, + /* mirrorAppendOnlyNewEof */ 0, + dbInfoRel->relname, + &persistentTid, + &persistentSerialNum, + /* flushToXLog */ false); + InsertGpRelationNodeTuple( - gp_relation_node, - dbInfoRel->relationOid, // pg_class OID - dbInfoRel->relname, - (dbInfoRel->dbInfoRelKey.reltablespace == MyDatabaseTableSpace) ? 0:dbInfoRel->dbInfoRelKey.reltablespace, - relFileNode.relNode, // pg_class relfilenode - /* segmentFileNum */ 0, - /* updateIndex */ false, - &persistentTid, - persistentSerialNum); - + gp_relation_node, + dbInfoRel->relationOid, //pg_class OID + dbInfoRel->relname, + (dbInfoRel->dbInfoRelKey.reltablespace == MyDatabaseTableSpace) ? 0 : dbInfoRel->dbInfoRelKey.reltablespace, + relFileNode.relNode, //pg_class relfilenode + /* segmentFileNum */ 0, + /* updateIndex */ false, + &persistentTid, + persistentSerialNum); + } else { - int a; - int p; + int a; + int p; /* * Append-Only. */ + /* - * Merge physical file existence and ao[cs]seg catalog logical EOFs . + * Merge physical file existence and ao[cs]seg catalog logical + * EOFs . */ a = 0; for (p = 0; p < dbInfoRel->physicalSegmentFilesCount; p++) { - int physicalSegmentFileNum = dbInfoRel->physicalSegmentFiles[p].segmentFileNum; + int physicalSegmentFileNum = dbInfoRel->physicalSegmentFiles[p].segmentFileNum; - bool haveCatalogInfo; - int64 logicalEof; + bool haveCatalogInfo; + int64 logicalEof; - /* - * There is mostly a 1:1 matching of physical files and logical - * files and we just have to match them up correctly. However - * there are several cases where this can diverge that we have - * to be able to handle. + /* + * There is mostly a 1:1 matching of physical files and + * logical files and we just have to match them up correctly. + * However there are several cases where this can diverge that + * we have to be able to handle. * * 1) Segment file 0 always exists as a physical file, but is * only cataloged when it actually contains data - this only @@ -324,46 +328,45 @@ static void PersistentBuild_PopulateGpRelationNode( * * 2) Files created in aborted transactions where an initial * frozen tuple never made it to disk may have a physical file - * with no logical file. - * XXX - These are leaked files that should probably be - * cleaned up at some point. + * with no logical file. 
XXX - These are leaked files that + * should probably be cleaned up at some point. * * 3) It is possible to have files that logically exist with a - * logical EOF of 0 but not exist in the filesystem. - * XXX - how does this happen, is it really safe? + * logical EOF of 0 but not exist in the filesystem. XXX - + * how does this happen, is it really safe? */ - logicalEof = 0; + logicalEof = 0; haveCatalogInfo = false; /* If we exhaust the loop then we are in case 2 */ while (a < dbInfoRel->appendOnlyCatalogSegmentInfoCount) { DbInfoAppendOnlyCatalogSegmentInfo *logicalSegInfo = \ - &dbInfoRel->appendOnlyCatalogSegmentInfo[a]; + &dbInfoRel->appendOnlyCatalogSegmentInfo[a]; /* Normal Case: both exist */ if (logicalSegInfo->segmentFileNum == physicalSegmentFileNum) { - logicalEof = logicalSegInfo->logicalEof; + logicalEof = logicalSegInfo->logicalEof; haveCatalogInfo = true; a++; - break; /* found */ + break; /* found */ } - + /* case 0 or case 2 */ else if (logicalSegInfo->segmentFileNum > physicalSegmentFileNum) { - logicalEof = 0; + logicalEof = 0; haveCatalogInfo = false; - break; /* not found */ + break; /* not found */ } /* case 3 - skip over logical segments w/o physical files */ else if (logicalSegInfo->logicalEof == 0) { a++; - continue; /* keep looking */ + continue; /* keep looking */ } /* otherwise it is an error */ @@ -379,38 +382,39 @@ static void PersistentBuild_PopulateGpRelationNode( Assert(false); } - /* - * case 2) Ignore segment file left over from pre-Release 4.0 aborted - * transaction whose initial frozen ao[cs]seg tuple never made it to - * disk. This will be a file that can result in an upgrade complaint... + /* + * case 2) Ignore segment file left over from pre-Release 4.0 + * aborted transaction whose initial frozen ao[cs]seg tuple + * never made it to disk. This will be a file that can result + * in an upgrade complaint... */ if (physicalSegmentFileNum > 0 && !haveCatalogInfo) continue; - + PersistentRelation_AddCreated( - &relFileNode, - physicalSegmentFileNum, - relStorageMgr, - PersistentFileSysRelBufpoolKind_None, - mirrorExistenceState, - relDataSynchronizationState, - /* mirrorAppendOnlyLossEof */ logicalEof, - /* mirrorAppendOnlyNewEof */ logicalEof, - dbInfoRel->relname, - &persistentTid, - &persistentSerialNum, - /* flushToXLog */ false); - + &relFileNode, + physicalSegmentFileNum, + relStorageMgr, + PersistentFileSysRelBufpoolKind_None, + mirrorExistenceState, + relDataSynchronizationState, + /* mirrorAppendOnlyLossEof */ logicalEof, + /* mirrorAppendOnlyNewEof */ logicalEof, + dbInfoRel->relname, + &persistentTid, + &persistentSerialNum, + /* flushToXLog */ false); + InsertGpRelationNodeTuple( - gp_relation_node, - dbInfoRel->relationOid, // pg_class OID - dbInfoRel->relname, - (dbInfoRel->dbInfoRelKey.reltablespace == MyDatabaseTableSpace) ? 0:dbInfoRel->dbInfoRelKey.reltablespace, - relFileNode.relNode, // pg_class relfilenode - physicalSegmentFileNum, - /* updateIndex */ false, - &persistentTid, - persistentSerialNum); + gp_relation_node, + dbInfoRel->relationOid, //pg_class OID + dbInfoRel->relname, + (dbInfoRel->dbInfoRelKey.reltablespace == MyDatabaseTableSpace) ? 
0 : dbInfoRel->dbInfoRelKey.reltablespace, + relFileNode.relNode, //pg_class relfilenode + physicalSegmentFileNum, + /* updateIndex */ false, + &persistentTid, + persistentSerialNum); } } (*count)++; @@ -419,40 +423,42 @@ static void PersistentBuild_PopulateGpRelationNode( if (info->database != TemplateDbOid) { PersistentBuild_ScanGpPersistentRelationNodeForGlobal( - gp_relation_node, - count); + gp_relation_node, + count); } /* - * Build the index for gp_relation_node. + * Build the index for gp_relation_node. * - * The problem is the session we are using is associated with one particular database - * of the cluster, but we need to iterate through all the databases. So, unfortunately, - * the solution has been to use the "Direct Open" stuff. + * The problem is the session we are using is associated with one + * particular database of the cluster, but we need to iterate through all + * the databases. So, unfortunately, the solution has been to use the + * "Direct Open" stuff. * - * We do this because MyDatabaseId, the default tablespace of the session should not be - * changed. The various caches and many other implicit things assume the object is for - * MyDatabaseId and the default tablespace. For example, we cannot use - * CatalogUpdateIndexes called in InsertGpRelationNodeTuple because it will not do - * the right thing. + * We do this because MyDatabaseId, the default tablespace of the session + * should not be changed. The various caches and many other implicit + * things assume the object is for MyDatabaseId and the default + * tablespace. For example, we cannot use CatalogUpdateIndexes called in + * InsertGpRelationNodeTuple because it will not do the right thing. * - * Also, if they re-indexed gp_relation_node, it will have a different relfilenode and so we - * must have found it (above) and open it with dynamically. + * Also, if they re-indexed gp_relation_node, it will have a different + * relfilenode and so we must have found it (above) and open it with + * dynamically. 
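[Note: the code below implements that workaround: truncate the index non-transactionally, reopen it by physical relfilenode with the "Direct Open" machinery, and rebuild it from the freshly populated heap. Skeleton of the sequence, using only calls from the hunks:

	PersistentBuild_NonTransactionTruncate(&indexRelFileNode);
	gp_relation_node_index =
		DirectOpen_GpRelationNodeIndexOpenDynamic(GpRelationNodeOidIndexId,
												  indexRelFileNode.spcNode,
												  indexRelFileNode.dbNode,
												  indexRelFileNode.relNode);
	index_build(gp_relation_node, gp_relation_node_index, indexInfo,
				false, true);
	DirectOpen_GpRelationNodeIndexClose(gp_relation_node_index);
]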
*/ Assert(indexFound); - + PersistentBuild_NonTransactionTruncate( - &indexRelFileNode); - - gp_relation_node_index = - DirectOpen_GpRelationNodeIndexOpenDynamic( - GpRelationNodeOidIndexId, - indexRelFileNode.spcNode, - indexRelFileNode.dbNode, - indexRelFileNode.relNode); + &indexRelFileNode); + + gp_relation_node_index = + DirectOpen_GpRelationNodeIndexOpenDynamic( + GpRelationNodeOidIndexId, + indexRelFileNode.spcNode, + indexRelFileNode.dbNode, + indexRelFileNode.relNode); indexInfo = makeNode(IndexInfo); - + indexInfo->ii_NumIndexAttrs = Natts_gp_relation_node_index; indexInfo->ii_KeyAttrNumbers[0] = 1; indexInfo->ii_KeyAttrNumbers[1] = 2; @@ -460,7 +466,7 @@ static void PersistentBuild_PopulateGpRelationNode( indexInfo->ii_Unique = true; if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "PersistentBuild_PopulateGpRelationNode: building gp_relation_node_index %u/%u/%u for gp_relation_node %u/%u/%u", gp_relation_node_index->rd_node.spcNode, gp_relation_node_index->rd_node.dbNode, @@ -470,11 +476,11 @@ static void PersistentBuild_PopulateGpRelationNode( gp_relation_node->rd_node.relNode); index_build( - gp_relation_node, - gp_relation_node_index, - indexInfo, - false, - true); + gp_relation_node, + gp_relation_node_index, + indexInfo, + false, + true); DirectOpen_GpRelationNodeIndexClose(gp_relation_node_index); @@ -482,7 +488,7 @@ static void PersistentBuild_PopulateGpRelationNode( if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "PersistentBuild_PopulateGpRelationNode: Exit for dbOid %u", info->database); @@ -490,34 +496,34 @@ static void PersistentBuild_PopulateGpRelationNode( static int64 PersistentBuild_BuildDb( - Oid dbOid, + Oid dbOid, - bool mirrored) + bool mirrored) { - MirroredObjectExistenceState mirrorExistenceState; - MirroredRelDataSynchronizationState relDataSynchronizationState; - - int64 count = 0; - Relation gp_global_sequence; - Relation pg_database; - HeapTuple tuple; - Form_pg_database form_pg_database; - DatabaseInfo *info; - Oid defaultTablespace; - int t; + MirroredObjectExistenceState mirrorExistenceState; + MirroredRelDataSynchronizationState relDataSynchronizationState; + + int64 count = 0; + Relation gp_global_sequence; + Relation pg_database; + HeapTuple tuple; + Form_pg_database form_pg_database; + DatabaseInfo *info; + Oid defaultTablespace; + int t; SysScanDesc sscan; /* * Turn this on so we don't try to fetch persistence information from - * gp_relation_node for gp_relation_node and its index until we've done the - * assignment with PersistentRelation_AddCreated. + * gp_relation_node for gp_relation_node and its index until we've done + * the assignment with PersistentRelation_AddCreated. */ gp_before_persistence_work = true; if (mirrored) { mirrorExistenceState = MirroredObjectExistenceState_MirrorCreated; - relDataSynchronizationState = + relDataSynchronizationState = MirroredRelDataSynchronizationState_DataSynchronized; } else @@ -526,10 +532,10 @@ PersistentBuild_BuildDb( relDataSynchronizationState = MirroredRelDataSynchronizationState_None; } - /* - * If the gp_global_sequence table hasn't been populated yet then we need + /* + * If the gp_global_sequence table hasn't been populated yet then we need * to populate it before we can proceed with building the rest of the - * persistent tables. + * persistent tables. 
* * SELECT * FROM gp_global_sequence FOR UPDATE */ @@ -538,12 +544,12 @@ PersistentBuild_BuildDb( tuple = systable_getnext(sscan); if (!HeapTupleIsValid(tuple)) { - Datum values[Natts_gp_global_sequence]; - bool nulls[Natts_gp_global_sequence]; + Datum values[Natts_gp_global_sequence]; + bool nulls[Natts_gp_global_sequence]; /* Insert N frozen tuples of value 0 */ MemSet(nulls, false, sizeof(nulls)); - values[Anum_gp_global_sequence_sequence_num-1] = Int64GetDatum(0); + values[Anum_gp_global_sequence_sequence_num - 1] = Int64GetDatum(0); tuple = heap_form_tuple(RelationGetDescr(gp_global_sequence), values, nulls); if (!HeapTupleIsValid(tuple)) @@ -563,58 +569,58 @@ PersistentBuild_BuildDb( if (!HeapTupleIsValid(tuple)) elog(ERROR, "could not find tuple for database %u", dbOid); form_pg_database = (Form_pg_database) GETSTRUCT(tuple); - + defaultTablespace = form_pg_database->dattablespace; if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "PersistentBuild_BuildDb: dbOid %u, '%s', mirror existence state '%s', " "data synchronization state '%s'", dbOid, NameStr(form_pg_database->datname), MirroredObjectExistenceState_Name(mirrorExistenceState), MirroredRelDataSynchronizationState_Name( - relDataSynchronizationState)); + relDataSynchronizationState)); /* - * Special call here to scan the persistent meta-data structures so we are open for - * business and then we can add information. + * Special call here to scan the persistent meta-data structures so we are + * open for business and then we can add information. */ PersistentFileSysObj_BuildInitScan(); info = DatabaseInfo_Collect( - dbOid, - defaultTablespace, - /* snapshot */ NULL, - /* collectGpRelationNodeInfo */ false, - /* collectAppendOnlyCatalogSegmentInfo */ true, - /* scanFileSystem */ true); + dbOid, + defaultTablespace, + /* snapshot */ NULL, + /* collectGpRelationNodeInfo */ false, + /* collectAppendOnlyCatalogSegmentInfo */ true, + /* scanFileSystem */ true); for (t = 0; t < info->tablespacesCount; t++) { - Oid tablespace = info->tablespaces[t]; - DbDirNode dbDirNode; + Oid tablespace = info->tablespaces[t]; + DbDirNode dbDirNode; ItemPointerData persistentTid; if (tablespace == GLOBALTABLESPACE_OID) continue; - + dbDirNode.tablespace = tablespace; dbDirNode.database = dbOid; PersistentDatabase_AddCreated( - &dbDirNode, - mirrorExistenceState, - &persistentTid, - /* flushToXLog */ false); - } - + &dbDirNode, + mirrorExistenceState, + &persistentTid, + /* flushToXLog */ false); + } + PersistentBuild_PopulateGpRelationNode( - info, - defaultTablespace, - mirrorExistenceState, - relDataSynchronizationState, - &count); + info, + defaultTablespace, + mirrorExistenceState, + relDataSynchronizationState, + &count); heap_close(pg_database, RowExclusiveLock); @@ -622,11 +628,12 @@ PersistentBuild_BuildDb( SIMPLE_FAULT_INJECTOR(RebuildPTDB); - /* + /* * Since we have written XLOG records with of zeroes because of the gp_before_persistence_work - * GUC, lets request a checkpoint to force out all buffer pool pages so we - * never try to redo those XLOG records in Crash Recovery. + * persistentSerialNum> of zeroes because of the + * gp_before_persistence_work GUC, lets request a checkpoint to force out + * all buffer pool pages so we never try to redo those XLOG records in + * Crash Recovery. 
*/ RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); @@ -636,11 +643,11 @@ PersistentBuild_BuildDb( Datum gp_persistent_build_db(PG_FUNCTION_ARGS) { - bool mirrored = PG_GETARG_BOOL(0); + bool mirrored = PG_GETARG_BOOL(0); PersistentBuild_BuildDb( - MyDatabaseId, - mirrored); + MyDatabaseId, + mirrored); PG_RETURN_INT32(1); } @@ -649,80 +656,80 @@ gp_persistent_build_db(PG_FUNCTION_ARGS) Datum gp_persistent_build_all(PG_FUNCTION_ARGS) { - bool mirrored = PG_GETARG_BOOL(0); + bool mirrored = PG_GETARG_BOOL(0); - Relation pg_tablespace; - Relation pg_database; - HeapTuple tuple; + Relation pg_tablespace; + Relation pg_database; + HeapTuple tuple; SysScanDesc sscan; - Datum *d; - bool *null; - - // UNDONE: Verify we are in some sort of single-user mode. + Datum *d; + bool *null; + + /* UNDONE: Verify we are in some sort of single-user mode. */ /* * Re-build tablespaces. */ d = (Datum *) palloc(sizeof(Datum) * Natts_pg_tablespace); - null = (bool *) palloc(sizeof(bool) * Natts_pg_tablespace); - + null = (bool *) palloc(sizeof(bool) * Natts_pg_tablespace); + pg_tablespace = heap_open(TableSpaceRelationId, AccessShareLock); sscan = systable_beginscan(pg_tablespace, InvalidOid, false, SnapshotNow, 0, NULL); while (HeapTupleIsValid(tuple = systable_getnext(sscan))) { - Oid tablespaceOid; - + Oid tablespaceOid; + if (!HeapTupleIsValid(tuple)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("tablespace tuple is invalid"))); - + tablespaceOid = HeapTupleGetOid(tuple); - + heap_deform_tuple(tuple, RelationGetDescr(pg_tablespace), d, null); - + if (tablespaceOid == DEFAULTTABLESPACE_OID || tablespaceOid == GLOBALTABLESPACE_OID) { if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "gp_persistent_build_all: skip pg_default and pg_global tablespaceOid %u", - tablespaceOid); + tablespaceOid); continue; } - + if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "gp_persistent_build_all: tablespaceOid %u filespaceOid %u", tablespaceOid, DatumGetInt32(d[Anum_pg_tablespace_spcfsoid - 1])); - + PersistentTablespace_AddCreated( DatumGetInt32(d[Anum_pg_tablespace_spcfsoid - 1]), tablespaceOid, mirrored ? - MirroredObjectExistenceState_MirrorCreated : - MirroredObjectExistenceState_NotMirrored, - /* flushToXLog */ false); + MirroredObjectExistenceState_MirrorCreated : + MirroredObjectExistenceState_NotMirrored, + /* flushToXLog */ false); } - + systable_endscan(sscan); - + heap_close(pg_tablespace, AccessShareLock); - + pfree(d); pfree(null); - + /* - * Re-build databases. - * Do template1 first since it will also populate the shared-object persistent objects. - */ + * Re-build databases. Do template1 first since it will also populate the + * shared-object persistent objects. 
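[Note: the RequestCheckpoint() above is forced because, while gp_before_persistence_work was set, XLOG records were written with zeroed <persistentTid, persistentSerialNum> values that must never be replayed; CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT flushes all buffer-pool pages out first. gp_persistent_build_all() then proceeds in a fixed order, tablespaces, template1, and only then every other database, because per the comment above the template1 pass also populates the shared-object persistent entries. Driver sketch; next_database() is a placeholder for the pg_database scan in the hunk:

	PersistentBuild_BuildDb(TemplateDbOid, mirrored);	/* shared objects too */
	while (OidIsValid(dbOid = next_database()))			/* placeholder scan */
	{
		if (dbOid == TemplateDbOid)
			continue;		/* already rebuilt first */
		PersistentBuild_BuildDb(dbOid, mirrored);
	}
]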
+ */ PersistentBuild_BuildDb( - TemplateDbOid, - mirrored); + TemplateDbOid, + mirrored); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "gp_persistent_build_all: template1 complete"); /* @@ -734,25 +741,25 @@ gp_persistent_build_all(PG_FUNCTION_ARGS) while (HeapTupleIsValid(tuple = systable_getnext(sscan))) { - Oid dbOid; - + Oid dbOid; + dbOid = HeapTupleGetOid(tuple); if (dbOid == TemplateDbOid) { if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "gp_persistent_build_all: skip template1"); continue; } if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "gp_persistent_build_all: dbOid %u", dbOid); - + PersistentBuild_BuildDb( - dbOid, - mirrored); + dbOid, + mirrored); } systable_endscan(sscan); @@ -765,34 +772,35 @@ gp_persistent_build_all(PG_FUNCTION_ARGS) static void PersistentBuild_FindGpRelationNodeIndex( - Oid database, + Oid database, - Oid defaultTablespace, + Oid defaultTablespace, - RelFileNode *relFileNode) + RelFileNode *relFileNode) { Relation pg_class_rel; SysScanDesc sscan; HeapTuple tuple; - bool found; + bool found; /* - * Iterate through all the relations of the database and find gp_relation_node_index. + * Iterate through all the relations of the database and find + * gp_relation_node_index. */ - pg_class_rel = - DirectOpen_PgClassOpen( - defaultTablespace, - database); + pg_class_rel = + DirectOpen_PgClassOpen( + defaultTablespace, + database); sscan = systable_beginscan(pg_class_rel, InvalidOid, false, SnapshotNow, 0, NULL); found = false; while (HeapTupleIsValid(tuple = systable_getnext(sscan))) { - Oid relationOid; + Oid relationOid; - Form_pg_class form_pg_class; + Form_pg_class form_pg_class; - Oid reltablespace; + Oid reltablespace; relationOid = HeapTupleGetOid(tuple); if (relationOid != GpRelationNodeOidIndexId) @@ -811,7 +819,7 @@ PersistentBuild_FindGpRelationNodeIndex( relFileNode->spcNode = reltablespace; relFileNode->dbNode = database; - relFileNode->relNode= form_pg_class->relfilenode; + relFileNode->relNode = form_pg_class->relfilenode; found = true; break; @@ -831,9 +839,9 @@ static int64 PersistentBuild_TruncateAllGpRelationNode(void) { Relation pg_database; - HeapTuple tuple; - SysScanDesc sscan; - int64 count; + HeapTuple tuple; + SysScanDesc sscan; + int64 count; /* * Truncate gp_relation_node and its index in each database. @@ -846,25 +854,25 @@ PersistentBuild_TruncateAllGpRelationNode(void) while (HeapTupleIsValid(tuple = systable_getnext(sscan))) { Form_pg_database form_pg_database = - (Form_pg_database)GETSTRUCT(tuple); + (Form_pg_database) GETSTRUCT(tuple); - Oid dbOid; - Oid dattablespace; + Oid dbOid; + Oid dattablespace; RelFileNode relFileNode; SMgrRelation smgrRelation; - Page btree_metapage; - + Page btree_metapage; + dbOid = HeapTupleGetOid(tuple); dattablespace = form_pg_database->dattablespace; if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "PersistentBuild_TruncateAllGpRelationNode: dbOid %u, '%s'", dbOid, NameStr(form_pg_database->datname)); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "Truncating gp_relation_node %u/%u/%u in database oid %u ('%s')", relFileNode.spcNode, relFileNode.dbNode, @@ -877,22 +885,23 @@ PersistentBuild_TruncateAllGpRelationNode(void) relFileNode.relNode = GpRelationNodeRelationId; /* - * Truncate WITHOUT generating an XLOG record (i.e. 
pretend it is a temp relation). + * Truncate WITHOUT generating an XLOG record (i.e. pretend it is a + * temp relation). */ PersistentBuild_NonTransactionTruncate(&relFileNode); count++; /* - * And, the index. Unfortunately, the relfilenode OID can change due to a - * REINDEX {TABLE|INDEX} command. + * And, the index. Unfortunately, the relfilenode OID can change due + * to a REINDEX {TABLE|INDEX} command. */ PersistentBuild_FindGpRelationNodeIndex( - dbOid, - dattablespace, - &relFileNode); + dbOid, + dattablespace, + &relFileNode); if (Debug_persistent_print) - elog(Persistent_DebugPrintLevel(), + elog(Persistent_DebugPrintLevel(), "Truncating gp_relation_node_index %u/%u/%u in database oid %u ('%s'). relfilenode different %s, tablespace different %s", relFileNode.spcNode, relFileNode.dbNode, @@ -904,17 +913,17 @@ PersistentBuild_TruncateAllGpRelationNode(void) PersistentBuild_NonTransactionTruncate(&relFileNode); - // The BTree needs an empty meta-data block. + /* The BTree needs an empty meta-data block. */ smgrRelation = smgropen(relFileNode); - btree_metapage = (Page)palloc(BLCKSZ); + btree_metapage = (Page) palloc(BLCKSZ); _bt_initmetapage(btree_metapage, P_NONE, 0); PageSetChecksumInplace(btree_metapage, 0); smgrwrite( - smgrRelation, - /* blockNum */ 0, - (char*)btree_metapage, - /* isTemp */ false); + smgrRelation, + /* blockNum */ 0, + (char *) btree_metapage, + /* isTemp */ false); smgrimmedsync(smgrRelation); pfree(btree_metapage); @@ -934,7 +943,7 @@ gp_persistent_reset_all(PG_FUNCTION_ARGS) { RelFileNode relFileNode; - // UNDONE: Verify we are in some sort of single-user mode. + /* UNDONE: Verify we are in some sort of single-user mode. */ /* * Truncate all database's gp_relation_node and their indices. @@ -942,28 +951,28 @@ gp_persistent_reset_all(PG_FUNCTION_ARGS) PersistentBuild_TruncateAllGpRelationNode(); /* - * Truncate the 4 persistent shared tables. - * 'gp_persistent_filespace_node' persistent table is not dropped - * since it cannot be re-built. 'pg_filespace' table does not exist - * on segments by design. + * Truncate the 4 persistent shared tables. 'gp_persistent_filespace_node' + * persistent table is not dropped since it cannot be re-built. + * 'pg_filespace' table does not exist on segments by design. */ relFileNode.spcNode = GLOBALTABLESPACE_OID; relFileNode.dbNode = 0; - + relFileNode.relNode = GpPersistentRelationNodeRelationId; PersistentBuild_NonTransactionTruncate(&relFileNode); - + relFileNode.relNode = GpPersistentDatabaseNodeRelationId; PersistentBuild_NonTransactionTruncate(&relFileNode); - + relFileNode.relNode = GpPersistentTablespaceNodeRelationId; PersistentBuild_NonTransactionTruncate(&relFileNode); - + relFileNode.relNode = GpPersistentRelationNodeRelationId; PersistentBuild_NonTransactionTruncate(&relFileNode); /* - * Reset the persistent shared-memory free list heads and all shared-memory hash-tables. + * Reset the persistent shared-memory free list heads and all + * shared-memory hash-tables. 
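/*
 * A minimal sketch, not part of the patch: how the index-truncation path
 * above leaves gp_relation_node_index as a single empty btree metapage,
 * bypassing the buffer pool. It uses the same smgr and nbtree calls that
 * appear in the hunk; the helper name is hypothetical.
 */
#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufpage.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"

static void
write_empty_btree_metapage(RelFileNode rnode)
{
	SMgrRelation srel = smgropen(rnode);
	Page		metapage = (Page) palloc(BLCKSZ);

	_bt_initmetapage(metapage, P_NONE, 0);	/* no root page yet, level 0 */
	PageSetChecksumInplace(metapage, 0);	/* checksum for block 0 */
	smgrwrite(srel, 0, (char *) metapage, false);	/* isTemp = false */
	smgrimmedsync(srel);					/* force the write to disk */
	pfree(metapage);
}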
*/ PersistentFileSysObj_Reset(); @@ -973,20 +982,20 @@ gp_persistent_reset_all(PG_FUNCTION_ARGS) Datum gp_persistent_repair_delete(PG_FUNCTION_ARGS) { - int fsObjType; - ItemPointerData persistentTid; + int fsObjType; + ItemPointerData persistentTid; fsObjType = PG_GETARG_INT32(0); persistentTid = PG_GETARG_TID(1); - if (fsObjType < PersistentFsObjType_First || + if (fsObjType < PersistentFsObjType_First || fsObjType > PersistentFsObjType_Last) - elog(ERROR, + elog(ERROR, "Persistent object type must be in the range 1..4 " - "(Relation, Database Dir, Tablespace Dir, Filespace Dir)"); + "(Relation, Database Dir, Tablespace Dir, Filespace Dir)"); PersistentFileSysObj_RepairDelete( - fsObjType, - &persistentTid); + fsObjType, + &persistentTid); PG_RETURN_INT32(0); } diff --git a/src/backend/cdb/cdbpersistentcheck.c b/src/backend/cdb/cdbpersistentcheck.c index 6010cf4544..68332e7e68 100644 --- a/src/backend/cdb/cdbpersistentcheck.c +++ b/src/backend/cdb/cdbpersistentcheck.c @@ -51,102 +51,105 @@ typedef struct PT_GpPersistentRelationNode { - Oid tablespaceOid; - Oid databaseOid; - Oid relfilenodeOid; - int32 segmentFileNum; - PersistentFileSysRelStorageMgr relationStorageManager; - PersistentFileSysState persistentState; - int64 createMirrorDataLossTrackingSessionNum; - MirroredObjectExistenceState mirrorExistenceState; + Oid tablespaceOid; + Oid databaseOid; + Oid relfilenodeOid; + int32 segmentFileNum; + PersistentFileSysRelStorageMgr relationStorageManager; + PersistentFileSysState persistentState; + int64 createMirrorDataLossTrackingSessionNum; + MirroredObjectExistenceState mirrorExistenceState; MirroredRelDataSynchronizationState mirrorDataSynchronizationState; - bool mirrorBufpoolMarkedForScanIncrementalResync; - int64 mirrorBufpoolResyncChangedPageCount; - XLogRecPtr mirrorBufpoolResyncCkptLoc; - BlockNumber mirrorBufpoolResyncCkptBlockNum; - int64 mirrorAppendOnlyLossEof; - int64 mirrorAppendOnlyNewEof; - PersistentFileSysRelBufpoolKind relBufpoolKind; - TransactionId parentXid; - int64 persistentSerialNum; + bool mirrorBufpoolMarkedForScanIncrementalResync; + int64 mirrorBufpoolResyncChangedPageCount; + XLogRecPtr mirrorBufpoolResyncCkptLoc; + BlockNumber mirrorBufpoolResyncCkptBlockNum; + int64 mirrorAppendOnlyLossEof; + int64 mirrorAppendOnlyNewEof; + PersistentFileSysRelBufpoolKind relBufpoolKind; + TransactionId parentXid; + int64 persistentSerialNum; } PT_GpPersistentRelationNode; -static inline void GpPersistentRelationNodeGetValues(Datum* values, PT_GpPersistentRelationNode *relnode) +static inline void +GpPersistentRelationNodeGetValues(Datum *values, PT_GpPersistentRelationNode *relnode) { GpPersistentRelationNode_GetValues( - values, - &relnode->tablespaceOid, - &relnode->databaseOid, - &relnode->relfilenodeOid, - &relnode->segmentFileNum, - &relnode->relationStorageManager, - &relnode->persistentState, - &relnode->createMirrorDataLossTrackingSessionNum, - &relnode->mirrorExistenceState, - &relnode->mirrorDataSynchronizationState, - &relnode->mirrorBufpoolMarkedForScanIncrementalResync, - &relnode->mirrorBufpoolResyncChangedPageCount, - &relnode->mirrorBufpoolResyncCkptLoc, - &relnode->mirrorBufpoolResyncCkptBlockNum, - &relnode->mirrorAppendOnlyLossEof, - &relnode->mirrorAppendOnlyNewEof, - &relnode->relBufpoolKind, - &relnode->parentXid, - &relnode->persistentSerialNum); + values, + &relnode->tablespaceOid, + &relnode->databaseOid, + &relnode->relfilenodeOid, + &relnode->segmentFileNum, + &relnode->relationStorageManager, + &relnode->persistentState, + 
&relnode->createMirrorDataLossTrackingSessionNum, + &relnode->mirrorExistenceState, + &relnode->mirrorDataSynchronizationState, + &relnode->mirrorBufpoolMarkedForScanIncrementalResync, + &relnode->mirrorBufpoolResyncChangedPageCount, + &relnode->mirrorBufpoolResyncCkptLoc, + &relnode->mirrorBufpoolResyncCkptBlockNum, + &relnode->mirrorAppendOnlyLossEof, + &relnode->mirrorAppendOnlyNewEof, + &relnode->relBufpoolKind, + &relnode->parentXid, + &relnode->persistentSerialNum); } typedef struct PT_GpPersistentDatabaseNode { - Oid tablespaceOid; - Oid databaseOid; - PersistentFileSysState persistentState; - int64 createMirrorDataLossTrackingSessionNum; + Oid tablespaceOid; + Oid databaseOid; + PersistentFileSysState persistentState; + int64 createMirrorDataLossTrackingSessionNum; MirroredObjectExistenceState mirrorExistenceState; - int32 reserved; - TransactionId parentXid; - int64 persistentSerialNum; + int32 reserved; + TransactionId parentXid; + int64 persistentSerialNum; } PT_GpPersistentDatabaseNode; -static inline void GpPersistentDatabaseNodeGetValues(Datum* values, PT_GpPersistentDatabaseNode *dbnode) +static inline void +GpPersistentDatabaseNodeGetValues(Datum *values, PT_GpPersistentDatabaseNode *dbnode) { GpPersistentDatabaseNode_GetValues( - values, - &dbnode->tablespaceOid, - &dbnode->databaseOid, - &dbnode->persistentState, - &dbnode->createMirrorDataLossTrackingSessionNum, - &dbnode->mirrorExistenceState, - &dbnode->reserved, - &dbnode->parentXid, - &dbnode->persistentSerialNum); + values, + &dbnode->tablespaceOid, + &dbnode->databaseOid, + &dbnode->persistentState, + &dbnode->createMirrorDataLossTrackingSessionNum, + &dbnode->mirrorExistenceState, + &dbnode->reserved, + &dbnode->parentXid, + &dbnode->persistentSerialNum); } typedef struct PT_GpPersistentTablespaceNode { - Oid filespaceOid; - Oid tablespaceOid; - PersistentFileSysState persistentState; - int64 createMirrorDataLossTrackingSessionNum; + Oid filespaceOid; + Oid tablespaceOid; + PersistentFileSysState persistentState; + int64 createMirrorDataLossTrackingSessionNum; MirroredObjectExistenceState mirrorExistenceState; - int32 reserved; - TransactionId parentXid; - int64 persistentSerialNum; + int32 reserved; + TransactionId parentXid; + int64 persistentSerialNum; } PT_GpPersistentTablespaceNode; -static inline void GpPersistentTablespaceNodeGetValues(Datum* values, PT_GpPersistentTablespaceNode *tablenode) +static inline void +GpPersistentTablespaceNodeGetValues(Datum *values, PT_GpPersistentTablespaceNode *tablenode) { GpPersistentTablespaceNode_GetValues( - values, - &tablenode->filespaceOid, - &tablenode->tablespaceOid, - &tablenode->persistentState, - &tablenode->createMirrorDataLossTrackingSessionNum, - &tablenode->mirrorExistenceState, - &tablenode->reserved, - &tablenode->parentXid, - &tablenode->persistentSerialNum); + values, + &tablenode->filespaceOid, + &tablenode->tablespaceOid, + &tablenode->persistentState, + &tablenode->createMirrorDataLossTrackingSessionNum, + &tablenode->mirrorExistenceState, + &tablenode->reserved, + &tablenode->parentXid, + &tablenode->persistentSerialNum); } @@ -154,17 +157,17 @@ static inline void GpPersistentTablespaceNodeGetValues(Datum* values, PT_GpPersi typedef struct { - char *quertTitle; - char *queryStr; - bool executeWhenInSyncOnly; + char *quertTitle; + char *queryStr; + bool executeWhenInSyncOnly; -}query; +} query; #define NONDB_SPECIFIC_PTCAT_VERIFICATION_NUM_QUERIES 14 -query NonDBSpecific_PTCat_Verification_queries[] = +query NonDBSpecific_PTCat_Verification_queries[] = 
{ -//1 - {"gp_persistent_filespace_node state check", +/* 1 */ + {"gp_persistent_filespace_node state check", " SELECT p.filespace_oid," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -187,9 +190,9 @@ query NonDBSpecific_PTCat_Verification_queries[] = " end as mirror_existence_state" " FROM gp_persistent_filespace_node p" " WHERE p.persistent_state not in (0, 2)" - " or p.mirror_existence_state not in (0,1,3)", true}, -//2 - {"gp_persistent_filespace_node <=> pg_filespace", + " or p.mirror_existence_state not in (0,1,3)", true}, +/* 2 */ + {"gp_persistent_filespace_node <=> pg_filespace", " SELECT coalesce(f.oid, p.filespace_oid) as filespace_oid," " f.fsname as filespace" " FROM (SELECT * FROM gp_persistent_filespace_node" @@ -197,9 +200,9 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FULL OUTER JOIN (SELECT oid, fsname FROM pg_filespace" " WHERE oid != 3052) f" " ON (p.filespace_oid = f.oid)" - " WHERE (p.filespace_oid is NULL OR f.oid is NULL)", false}, -//3 - {"gp_persistent_filespace_node <=> gp_global_sequence", + " WHERE (p.filespace_oid is NULL OR f.oid is NULL)", false}, +/* 3 */ + {"gp_persistent_filespace_node <=> gp_global_sequence", " SELECT p.filespace_oid, f.fsname as filespace," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -214,9 +217,9 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FROM gp_global_sequence s, gp_persistent_filespace_node p" " LEFT JOIN pg_filespace f ON (f.oid = p.filespace_oid)" " WHERE s.ctid = '(0,4)' and p.persistent_serial_num > s.sequence_num", - false}, -//4 - {"gp_persistent_database_node state check", + false}, +/* 4 */ + {"gp_persistent_database_node state check", " SELECT p.tablespace_oid, p.database_oid," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -239,18 +242,18 @@ query NonDBSpecific_PTCat_Verification_queries[] = " end as mirror_existence_state" " FROM gp_persistent_database_node p" " WHERE p.persistent_state not in (0, 2)" - " or p.mirror_existence_state not in (0,1,3)", true}, -//5 - {"gp_persistent_database_node <=> pg_database", + " or p.mirror_existence_state not in (0,1,3)", true}, +/* 5 */ + {"gp_persistent_database_node <=> pg_database", " SELECT coalesce(d.oid, p.database_oid) as database_oid," " d.datname as database" " FROM (SELECT * FROM gp_persistent_database_node" " WHERE persistent_state = 2) p" " FULL OUTER JOIN pg_database d" " ON (d.oid = p.database_oid)" - " WHERE (d.datname is null or p.database_oid is null)", false}, -//6 - {"gp_persistent_database_node <=> pg_tablespace", + " WHERE (d.datname is null or p.database_oid is null)", false}, +/* 6 */ + {"gp_persistent_database_node <=> pg_tablespace", " SELECT coalesce(t.oid, p.database_oid) as database_oid," " t.spcname as tablespace" " FROM (SELECT * FROM gp_persistent_database_node" @@ -258,9 +261,9 @@ query NonDBSpecific_PTCat_Verification_queries[] = " LEFT OUTER JOIN (SELECT oid, spcname FROM pg_tablespace" " WHERE oid != 1664) t" " ON (t.oid = p.tablespace_oid)" - " WHERE t.spcname is null", false}, -//7 - {"gp_persistent_database_node <=> gp_global_sequence", + " WHERE t.spcname is null", false}, +/* 7 */ + {"gp_persistent_database_node <=> gp_global_sequence", " SELECT p.database_oid, p.tablespace_oid, d.datname as database," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -275,9 +278,9 @@ query 
NonDBSpecific_PTCat_Verification_queries[] = " FROM gp_global_sequence s, gp_persistent_database_node p" " LEFT JOIN pg_database d ON (d.oid = p.database_oid)" " WHERE s.ctid = '(0,2)' and p.persistent_serial_num > s.sequence_num", - false}, -//8 - {"gp_persistent_tablespace_node state check", + false}, +/* 8 */ + {"gp_persistent_tablespace_node state check", " SELECT p.filespace_oid, p.tablespace_oid," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -301,9 +304,9 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FROM gp_persistent_tablespace_node p" " WHERE p.persistent_state not in (0, 2)" " or p.mirror_existence_state not in (0,1,3)", - true}, -//9 - {"gp_persistent_tablespace_node <=> pg_tablespace", + true}, +/* 9 */ + {"gp_persistent_tablespace_node <=> pg_tablespace", " SELECT coalesce(t.oid, p.tablespace_oid) as tablespace_oid," " t.spcname as tablespace" " FROM (SELECT * FROM gp_persistent_tablespace_node" @@ -311,17 +314,17 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FULL OUTER JOIN (" " SELECT oid, spcname FROM pg_tablespace WHERE oid not in (1663, 1664)" " ) t ON (t.oid = p.tablespace_oid)" - " WHERE t.spcname is null or p.tablespace_oid is null", false}, -//10 - {"gp_persistent_tablespace_node <=> pg_filespace", + " WHERE t.spcname is null or p.tablespace_oid is null", false}, +/* 10 */ + {"gp_persistent_tablespace_node <=> pg_filespace", " SELECT p.filespace_oid, f.fsname as filespace" " FROM (SELECT * FROM gp_persistent_tablespace_node" " WHERE persistent_state = 2) p" " LEFT OUTER JOIN pg_filespace f" " ON (f.oid = p.filespace_oid)" - " WHERE f.fsname is null", false}, -//11 - {"gp_persistent_tablespace_node <=> gp_global_sequence", + " WHERE f.fsname is null", false}, +/* 11 */ + {"gp_persistent_tablespace_node <=> gp_global_sequence", " SELECT p.filespace_oid, p.tablespace_oid, t.spcname as tablespace," " case when p.persistent_state = 0 then 'free'" " when p.persistent_state = 1 then 'create pending'" @@ -336,22 +339,22 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FROM gp_global_sequence s, gp_persistent_tablespace_node p" " LEFT JOIN pg_tablespace t ON (t.oid = p.tablespace_oid)" " WHERE s.ctid = '(0,3)' and p.persistent_serial_num > s.sequence_num", - false}, -//12 - {"gp_persistent_relation_node <=> pg_database", + false}, +/* 12 */ + {"gp_persistent_relation_node <=> pg_database", " SELECT datname, oid, count(*)" " FROM (" " SELECT d.datname as datname, p.database_oid as oid" " FROM (SELECT * FROM gp_persistent_relation_node" " WHERE database_oid != 0 and persistent_state = 2" " ) p" - " full outer join pg_database d ON (d.oid = p.database_oid)" + " full outer join pg_database d ON (d.oid = p.database_oid)" " ) x" " GROUP BY 1,2" " HAVING datname is null or oid is null or count(*) < 100", - false}, -//13 - {"gp_persistent_relation_node <=> gp_global_sequence", + false}, +/* 13 */ + {"gp_persistent_relation_node <=> gp_global_sequence", " SELECT p.tablespace_oid, p.database_oid, p.relfilenode_oid," " p.segment_file_num," " case when p.persistent_state = 0 then 'free'" @@ -367,134 +370,145 @@ query NonDBSpecific_PTCat_Verification_queries[] = " FROM gp_global_sequence s, gp_persistent_relation_node p" " LEFT JOIN pg_tablespace t ON (t.oid = p.tablespace_oid)" " WHERE s.ctid = '(0,1)' and p.persistent_serial_num > s.sequence_num", - false}, + false}, -//14 - {"pg_database <=> filesystem", +/* 14 */ + {"pg_database <=> filesystem", " SELECT tablespace_oid, database_oid, count(*)" " FROM 
gp_persistent_relation_node_check() p" " LEFT OUTER JOIN pg_database d" " ON (p.database_oid = d.oid)" " WHERE d.oid is null and database_oid != 0" " GROUP BY tablespace_oid, database_oid", - false} + false} }; #define DB_SPECIFIC_PTCAT_VERIFICATION_NUM_QUERIES 5 -static query DB_PTCat_Veritifcation_queries[]= +static query DB_PTCat_Veritifcation_queries[] = { -//1 - {"gp_persistent_relation_node state check", - " SELECT p.tablespace_oid, p.relfilenode_oid, p.segment_file_num," - " case when p.persistent_state = 0 then 'free'" - " when p.persistent_state = 1 then 'create pending'" - " when p.persistent_state = 2 then 'created'" - " when p.persistent_state = 3 then 'drop pending'" - " when p.persistent_state = 4 then 'abort create'" - " when p.persistent_state = 5 then 'JIT create pending'" - " when p.persistent_state = 6 then 'bulk load create pending'" - " else 'unknown state: ' || p.persistent_state" - " end as persistent_state," - " case when p.mirror_existence_state = 0 then 'mirror free'" - " when p.mirror_existence_state = 1 then 'not mirrored'" - " when p.mirror_existence_state = 2 then 'mirror create pending'" - " when p.mirror_existence_state = 3 then 'mirror created'" - " when p.mirror_existence_state = 4 then 'mirror down before create'" - " when p.mirror_existence_state = 5 then 'mirror down during create'" - " when p.mirror_existence_state = 6 then 'mirror drop pending'" - " when p.mirror_existence_state = 7 then 'mirror only drop remains'" - " else 'unknown state: ' || p.mirror_existence_state" - " end as mirror_existence_state" - " FROM gp_persistent_relation_node p" - " WHERE (p.persistent_state not in (0, 2)" - " or p.mirror_existence_state not in (0,1,3))" - " and p.database_oid in (" - " SELECT oid FROM pg_database WHERE datname = current_database()" - " )", true}, -//2 - {"gp_persistent_relation_node <=> pg_tablespace", - " SELECT distinct p.tablespace_oid" - " FROM (SELECT * FROM gp_persistent_relation_node" - " WHERE persistent_state = 2" - " AND database_oid in (" - " SELECT oid FROM pg_database" - " WHERE datname = current_database()" - " UNION ALL" - " SELECT 0)) p" - " LEFT OUTER JOIN pg_tablespace t" - " ON (t.oid = p.tablespace_oid)" - " WHERE t.oid is null", false}, -//3 - {"gp_persistent_relation_node <=> gp_relation_node", - " SELECT coalesce(p.relfilenode_oid, r.relfilenode_oid) as relfilenode," - " p.ctid, r.persistent_tid" - " FROM (" - " SELECT p.ctid, p.* FROM gp_persistent_relation_node p" - " WHERE persistent_state = 2 AND p.database_oid in (" - " SELECT oid FROM pg_database WHERE datname = current_database()" - " UNION ALL" - " SELECT 0" - " )" - " ) p" - " FULL OUTER JOIN gp_relation_node r" - " ON (p.relfilenode_oid = r.relfilenode_oid and" - " p.segment_file_num = r.segment_file_num)" - " WHERE (p.relfilenode_oid is NULL OR" - " r.relfilenode_oid is NULL OR" - " p.ctid != r.persistent_tid)", false}, -//4 - {"gp_persistent_relation_node <=> pg_class", - " SELECT coalesce(p.relfilenode_oid, c.relfilenode) as relfilenode," - " c.nspname, c.relname, c.relkind, c.relstorage" - " FROM (" - " SELECT * FROM gp_persistent_relation_node" - " WHERE persistent_state = 2 AND database_oid in (" - " SELECT oid FROM pg_database WHERE datname = current_database()" - " UNION ALL" - " SELECT 0" - " )" - " ) p" - " FULL OUTER JOIN (" - " SELECT n.nspname, c.relname, c.relfilenode, c.relstorage, c.relkind" - " FROM pg_class c" - " LEFT OUTER JOIN pg_namespace n ON (c.relnamespace = n.oid)" - " WHERE c.relstorage not in ('v', 'x', 'f')" - " ) c ON (p.relfilenode_oid = 
c.relfilenode)" - " WHERE p.relfilenode_oid is NULL OR c.relfilenode is NULL", false}, -//5 - {"gp_persistent_relation_node <=> filesystem", - " SELECT coalesce(a.tablespace_oid, b.tablespace_oid) as tablespace_oid," - " coalesce(a.database_oid, b.database_oid) as database_oid," - " coalesce(a.relfilenode_oid, b.relfilenode_oid) as relfilenode_oid," - " coalesce(a.segment_file_num, b.segment_file_num) as segment_file_num," - " a.relfilenode_oid is null as filesystem," - " b.relfilenode_oid is null as persistent," - " b.relkind, b.relstorage" - " FROM gp_persistent_relation_node a" - " FULL OUTER JOIN (" - " SELECT p.*, c.relkind, c.relstorage" - " FROM gp_persistent_relation_node_check() p" - " LEFT OUTER JOIN pg_class c" - " ON (p.relfilenode_oid = c.relfilenode)" - " WHERE (p.segment_file_num = 0 or c.relstorage != 'h')" - " ) b ON (a.tablespace_oid = b.tablespace_oid and" - " a.database_oid = b.database_oid and" - " a.relfilenode_oid = b.relfilenode_oid and" - " a.segment_file_num = b.segment_file_num)" - " WHERE (a.relfilenode_oid is null OR" - " (a.persistent_state = 2 and b.relfilenode_oid is null)) and" - " coalesce(a.database_oid, b.database_oid) in (" - " SELECT oid FROM pg_database WHERE datname = current_database()" - " UNION ALL" - " SELECT 0" - " )", false} +/* 1 */ + { + "gp_persistent_relation_node state check", + " SELECT p.tablespace_oid, p.relfilenode_oid, p.segment_file_num," + " case when p.persistent_state = 0 then 'free'" + " when p.persistent_state = 1 then 'create pending'" + " when p.persistent_state = 2 then 'created'" + " when p.persistent_state = 3 then 'drop pending'" + " when p.persistent_state = 4 then 'abort create'" + " when p.persistent_state = 5 then 'JIT create pending'" + " when p.persistent_state = 6 then 'bulk load create pending'" + " else 'unknown state: ' || p.persistent_state" + " end as persistent_state," + " case when p.mirror_existence_state = 0 then 'mirror free'" + " when p.mirror_existence_state = 1 then 'not mirrored'" + " when p.mirror_existence_state = 2 then 'mirror create pending'" + " when p.mirror_existence_state = 3 then 'mirror created'" + " when p.mirror_existence_state = 4 then 'mirror down before create'" + " when p.mirror_existence_state = 5 then 'mirror down during create'" + " when p.mirror_existence_state = 6 then 'mirror drop pending'" + " when p.mirror_existence_state = 7 then 'mirror only drop remains'" + " else 'unknown state: ' || p.mirror_existence_state" + " end as mirror_existence_state" + " FROM gp_persistent_relation_node p" + " WHERE (p.persistent_state not in (0, 2)" + " or p.mirror_existence_state not in (0,1,3))" + " and p.database_oid in (" + " SELECT oid FROM pg_database WHERE datname = current_database()" + " )", true + }, +/* 2 */ + { + "gp_persistent_relation_node <=> pg_tablespace", + " SELECT distinct p.tablespace_oid" + " FROM (SELECT * FROM gp_persistent_relation_node" + " WHERE persistent_state = 2" + " AND database_oid in (" + " SELECT oid FROM pg_database" + " WHERE datname = current_database()" + " UNION ALL" + " SELECT 0)) p" + " LEFT OUTER JOIN pg_tablespace t" + " ON (t.oid = p.tablespace_oid)" + " WHERE t.oid is null", false + }, +/* 3 */ + { + "gp_persistent_relation_node <=> gp_relation_node", + " SELECT coalesce(p.relfilenode_oid, r.relfilenode_oid) as relfilenode," + " p.ctid, r.persistent_tid" + " FROM (" + " SELECT p.ctid, p.* FROM gp_persistent_relation_node p" + " WHERE persistent_state = 2 AND p.database_oid in (" + " SELECT oid FROM pg_database WHERE datname = current_database()" + " 
UNION ALL" + " SELECT 0" + " )" + " ) p" + " FULL OUTER JOIN gp_relation_node r" + " ON (p.relfilenode_oid = r.relfilenode_oid and" + " p.segment_file_num = r.segment_file_num)" + " WHERE (p.relfilenode_oid is NULL OR" + " r.relfilenode_oid is NULL OR" + " p.ctid != r.persistent_tid)", false + }, +/* 4 */ + { + "gp_persistent_relation_node <=> pg_class", + " SELECT coalesce(p.relfilenode_oid, c.relfilenode) as relfilenode," + " c.nspname, c.relname, c.relkind, c.relstorage" + " FROM (" + " SELECT * FROM gp_persistent_relation_node" + " WHERE persistent_state = 2 AND database_oid in (" + " SELECT oid FROM pg_database WHERE datname = current_database()" + " UNION ALL" + " SELECT 0" + " )" + " ) p" + " FULL OUTER JOIN (" + " SELECT n.nspname, c.relname, c.relfilenode, c.relstorage, c.relkind" + " FROM pg_class c" + " LEFT OUTER JOIN pg_namespace n ON (c.relnamespace = n.oid)" + " WHERE c.relstorage not in ('v', 'x', 'f')" + " ) c ON (p.relfilenode_oid = c.relfilenode)" + " WHERE p.relfilenode_oid is NULL OR c.relfilenode is NULL", false + }, +/* 5 */ + { + "gp_persistent_relation_node <=> filesystem", + " SELECT coalesce(a.tablespace_oid, b.tablespace_oid) as tablespace_oid," + " coalesce(a.database_oid, b.database_oid) as database_oid," + " coalesce(a.relfilenode_oid, b.relfilenode_oid) as relfilenode_oid," + " coalesce(a.segment_file_num, b.segment_file_num) as segment_file_num," + " a.relfilenode_oid is null as filesystem," + " b.relfilenode_oid is null as persistent," + " b.relkind, b.relstorage" + " FROM gp_persistent_relation_node a" + " FULL OUTER JOIN (" + " SELECT p.*, c.relkind, c.relstorage" + " FROM gp_persistent_relation_node_check() p" + " LEFT OUTER JOIN pg_class c" + " ON (p.relfilenode_oid = c.relfilenode)" + " WHERE (p.segment_file_num = 0 or c.relstorage != 'h')" + " ) b ON (a.tablespace_oid = b.tablespace_oid and" + " a.database_oid = b.database_oid and" + " a.relfilenode_oid = b.relfilenode_oid and" + " a.segment_file_num = b.segment_file_num)" + " WHERE (a.relfilenode_oid is null OR" + " (a.persistent_state = 2 and b.relfilenode_oid is null)) and" + " coalesce(a.database_oid, b.database_oid) in (" + " SELECT oid FROM pg_database WHERE datname = current_database()" + " UNION ALL" + " SELECT 0" + " )", false + } }; static bool connected = false; -static ResourceOwner savedResourceOwner = NULL; -static MemoryContext oldMemoryContext = NULL; +static ResourceOwner savedResourceOwner = NULL; +static MemoryContext oldMemoryContext = NULL; + /*Pass 4 - End*/ /* Post DTM Recovery Verification related */ @@ -503,28 +517,31 @@ static PT_PostDTMRecv_Data *PT_PostDTMRecv_Info = NULL; /* * Function to check existence of entry in table with specified values. * If entry exist throw the error else just return fine. - * + * * Intention is to block any duplicate entries from getting IN, * hence must be called from every place trying to add entry to PT relation table. 
*/ -void PTCheck_BeforeAddingEntry( PersistentStoreData *storeData, Datum *values) +void +PTCheck_BeforeAddingEntry(PersistentStoreData *storeData, Datum *values) { PTCHECK_RETURN_IF_DISABLED(); elog(PTCHECK_LOG_LEVEL, "PTCheck: Checking before adding entry to PT"); - int nKey = 0; - bool allowDuplicates = false; - bool status = true; + int nKey = 0; + bool allowDuplicates = false; + bool status = true; ItemPointerData iptr; - ScanKey key = (*storeData->scanKeyInitCallback)(values, &nKey); + ScanKey key = (*storeData->scanKeyInitCallback) (values, &nKey); + if (key == NULL) { return; } - Datum *existing_values = (Datum*)palloc(storeData->numAttributes * sizeof(Datum)); + Datum *existing_values = (Datum *) palloc(storeData->numAttributes * sizeof(Datum)); + if (existing_values == NULL) { elog(LOG, "PTCheck: Failed to allocate memory for existing_values datastructure"); @@ -534,53 +551,55 @@ void PTCheck_BeforeAddingEntry( PersistentStoreData *storeData, Datum *values) /* * Lets scan the table and fetch the entry with the key */ - Relation persistentRel = (*storeData->openRel)(); + Relation persistentRel = (*storeData->openRel) (); HeapScanDesc scan = heap_beginscan(persistentRel, SnapshotNow, nKey, key); - HeapTuple tuple = heap_getnext(scan, ForwardScanDirection); + HeapTuple tuple = heap_getnext(scan, ForwardScanDirection); + while (tuple != NULL) { PersistentStore_DeformTuple(storeData, persistentRel->rd_att, tuple, existing_values); - allowDuplicates = (*storeData->allowDuplicateCallback)(existing_values, values); + allowDuplicates = (*storeData->allowDuplicateCallback) (existing_values, values); if (allowDuplicates == false) { /* * Not expecting duplicate, MUST error-out then */ - status = false; - break; + status = false; + break; } - (*storeData->printTupleCallback)(LOG, "PTCheck insert", &iptr, values); - (*storeData->printTupleCallback)(LOG, "PTCheck allowing with duplicate", &iptr, existing_values); - + (*storeData->printTupleCallback) (LOG, "PTCheck insert", &iptr, values); + (*storeData->printTupleCallback) (LOG, "PTCheck allowing with duplicate", &iptr, existing_values); + /* * Callback returned its exception case and allow duplicate for this, * hence proceed forward to check - */ + */ tuple = heap_getnext(scan, ForwardScanDirection); } heap_endscan(scan); - (*storeData->closeRel)(persistentRel); + (*storeData->closeRel) (persistentRel); if (status == false) { - (*storeData->printTupleCallback)(LOG, "PTCheck insert", &iptr, values); - (*storeData->printTupleCallback)(LOG, "PTCheck conflicts with duplicate", &iptr, existing_values); + (*storeData->printTupleCallback) (LOG, "PTCheck insert", &iptr, values); + (*storeData->printTupleCallback) (LOG, "PTCheck conflicts with duplicate", &iptr, existing_values); ereport(ERROR, - (ERRCODE_INTERNAL_ERROR, - errmsg("PTCheck: Failed object entry already exist."))); + (ERRCODE_INTERNAL_ERROR, + errmsg("PTCheck: Failed object entry already exist."))); } - + pfree(existing_values); pfree(key); } -ScanKey Persistent_RelationScanKeyInit(Datum *values, int *nKeys) +ScanKey +Persistent_RelationScanKeyInit(Datum *values, int *nKeys) { PT_GpPersistentRelationNode relnode; @@ -588,15 +607,16 @@ ScanKey Persistent_RelationScanKeyInit(Datum *values, int *nKeys) /* tablespace_oid, database_oid, relfilenode_oid and segment_file_num */ *nKeys = 4; - ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + if (key == NULL) { return NULL; } /* - * We needn't fill in sk_strategy or sk_subtype since these 
scankeys - * will never be passed to an index. + * We needn't fill in sk_strategy or sk_subtype since these scankeys will + * never be passed to an index. */ ScanKeyInit(&key[0], Anum_gp_persistent_relation_node_tablespace_oid, InvalidStrategy, F_OIDEQ, (Datum) relnode.tablespaceOid); ScanKeyInit(&key[1], Anum_gp_persistent_relation_node_database_oid, InvalidStrategy, F_OIDEQ, (Datum) relnode.databaseOid); @@ -614,14 +634,15 @@ ScanKey Persistent_RelationScanKeyInit(Datum *values, int *nKeys) * Currently, this code is shared between realtion, database, tablespace and filespace. * If in future exceptions vary for the same can write different routine for the same. */ -static inline bool -Persistent_AllowDuplicateEntry( PersistentFileSysState old_persistentState, - MirroredObjectExistenceState old_mirrorExistenceState, - PersistentFileSysState new_persistentState, - MirroredObjectExistenceState new_mirrorExistenceState ) +static inline bool +Persistent_AllowDuplicateEntry(PersistentFileSysState old_persistentState, + MirroredObjectExistenceState old_mirrorExistenceState, + PersistentFileSysState new_persistentState, + MirroredObjectExistenceState new_mirrorExistenceState) { /* - * Currently, only one exception hence coidng with ifs, as not many exceptions are expected. + * Currently, only one exception hence coidng with ifs, as not many + * exceptions are expected. */ if ((old_persistentState == PersistentFileSysState_AbortingCreate) && (old_mirrorExistenceState == MirroredObjectExistenceState_OnlyMirrorDropRemains)) @@ -636,7 +657,8 @@ Persistent_AllowDuplicateEntry( PersistentFileSysState old_persistentState, return false; } -bool Persistent_RelationAllowDuplicateEntry(Datum *exist_values, Datum *new_values) +bool +Persistent_RelationAllowDuplicateEntry(Datum *exist_values, Datum *new_values) { PT_GpPersistentRelationNode old_relnode; PT_GpPersistentRelationNode new_relnode; @@ -644,13 +666,14 @@ bool Persistent_RelationAllowDuplicateEntry(Datum *exist_values, Datum *new_valu GpPersistentRelationNodeGetValues(exist_values, &old_relnode); GpPersistentRelationNodeGetValues(new_values, &new_relnode); - return Persistent_AllowDuplicateEntry( old_relnode.persistentState, - old_relnode.mirrorExistenceState, - new_relnode.persistentState, - new_relnode.mirrorExistenceState ); + return Persistent_AllowDuplicateEntry(old_relnode.persistentState, + old_relnode.mirrorExistenceState, + new_relnode.persistentState, + new_relnode.mirrorExistenceState); } -ScanKey Persistent_DatabaseScanKeyInit(Datum *values, int *nKeys) +ScanKey +Persistent_DatabaseScanKeyInit(Datum *values, int *nKeys) { PT_GpPersistentDatabaseNode dbnode; @@ -658,15 +681,16 @@ ScanKey Persistent_DatabaseScanKeyInit(Datum *values, int *nKeys) /* tablespace_oid and database_oid */ *nKeys = 2; - ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + if (key == NULL) { return NULL; } /* - * We needn't fill in sk_strategy or sk_subtype since these scankeys - * will never be passed to an index. + * We needn't fill in sk_strategy or sk_subtype since these scankeys will + * never be passed to an index. 
*/ ScanKeyInit(&key[0], Anum_gp_persistent_database_node_tablespace_oid, InvalidStrategy, F_OIDEQ, (Datum) dbnode.tablespaceOid); ScanKeyInit(&key[1], Anum_gp_persistent_database_node_database_oid, InvalidStrategy, F_OIDEQ, (Datum) dbnode.databaseOid); @@ -674,19 +698,21 @@ ScanKey Persistent_DatabaseScanKeyInit(Datum *values, int *nKeys) return key; } -bool Persistent_DatabaseAllowDuplicateEntry(Datum *exist_values, Datum *new_values) +bool +Persistent_DatabaseAllowDuplicateEntry(Datum *exist_values, Datum *new_values) { /* - * Any exceptions intended to allow duplicate entries need to be checked here - * and for those specific cases returned TRUE and logged. - * For all rest of the cases default is to return false to avoid duplicate entry. + * Any exceptions intended to allow duplicate entries need to be checked + * here and for those specific cases returned TRUE and logged. For all + * rest of the cases default is to return false to avoid duplicate entry. */ /* For Database we expect no duplicates */ return false; } -ScanKey Persistent_TablespaceScanKeyInit(Datum *values, int *nKeys) +ScanKey +Persistent_TablespaceScanKeyInit(Datum *values, int *nKeys) { PT_GpPersistentTablespaceNode tablespacenode; @@ -694,28 +720,30 @@ ScanKey Persistent_TablespaceScanKeyInit(Datum *values, int *nKeys) /* filespace_oid and tablespace_oid */ *nKeys = 2; - ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + ScanKey key = palloc0(*nKeys * sizeof(ScanKeyData)); + if (key == NULL) { return NULL; } /* - * We needn't fill in sk_strategy or sk_subtype since these scankeys - * will never be passed to an index. + * We needn't fill in sk_strategy or sk_subtype since these scankeys will + * never be passed to an index. */ - ScanKeyInit(&key[0], Anum_gp_persistent_tablespace_node_filespace_oid, InvalidStrategy, F_OIDEQ, (Datum) tablespacenode.filespaceOid ); - ScanKeyInit(&key[1], Anum_gp_persistent_tablespace_node_tablespace_oid, InvalidStrategy, F_OIDEQ, (Datum) tablespacenode.tablespaceOid ); + ScanKeyInit(&key[0], Anum_gp_persistent_tablespace_node_filespace_oid, InvalidStrategy, F_OIDEQ, (Datum) tablespacenode.filespaceOid); + ScanKeyInit(&key[1], Anum_gp_persistent_tablespace_node_tablespace_oid, InvalidStrategy, F_OIDEQ, (Datum) tablespacenode.tablespaceOid); return key; } -bool Persistent_TablespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_values) +bool +Persistent_TablespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_values) { /* - * Any exceptions intended to allow duplicate entries need to be checked here - * and for those specific cases returned TRUE and logged. - * For all rest of the cases default is to return false to avoid duplicate entry. + * Any exceptions intended to allow duplicate entries need to be checked + * here and for those specific cases returned TRUE and logged. For all + * rest of the cases default is to return false to avoid duplicate entry. */ @@ -723,20 +751,22 @@ bool Persistent_TablespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_va return false; } -ScanKey Persistent_FilespaceScanKeyInit(Datum *values, int *nKeys) +ScanKey +Persistent_FilespaceScanKeyInit(Datum *values, int *nKeys) { return NULL; } -bool Persistent_FilespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_values) +bool +Persistent_FilespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_values) { /* - * Any exceptions intended to allow duplicate entries need to be checked here - * and for those specific cases returned TRUE and logged. 
- * For all rest of the cases default is to return false to avoid duplicate entry. + * Any exceptions intended to allow duplicate entries need to be checked + * here and for those specific cases returned TRUE and logged. For all + * rest of the cases default is to return false to avoid duplicate entry. */ - return false; + return false; } /* @@ -753,9 +783,9 @@ bool Persistent_FilespaceAllowDuplicateEntry(Datum *exist_values, Datum *new_val Size Persistent_PostDTMRecv_ShmemSize(void) { - Size size; + Size size; - size = hash_estimate_size((Size)PT_MAX_NUM_POSTDTMRECV_DB, sizeof(postDTMRecv_dbTblSpc_Hash_Entry)); + size = hash_estimate_size((Size) PT_MAX_NUM_POSTDTMRECV_DB, sizeof(postDTMRecv_dbTblSpc_Hash_Entry)); size = add_size(size, sizeof(PT_PostDTMRecv_Data)); return size; @@ -764,21 +794,21 @@ Persistent_PostDTMRecv_ShmemSize(void) void Persistent_PostDTMRecv_ShmemInit(void) { - HASHCTL info; - int hash_flags; - bool foundPtr; + HASHCTL info; + int hash_flags; + bool foundPtr; PT_PostDTMRecv_Info = - (PT_PostDTMRecv_Data *) - ShmemInitStruct("Post DTM Recovery Checks Info", - sizeof(PT_PostDTMRecv_Data), - &foundPtr); + (PT_PostDTMRecv_Data *) + ShmemInitStruct("Post DTM Recovery Checks Info", + sizeof(PT_PostDTMRecv_Data), + &foundPtr); if (PT_PostDTMRecv_Info == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - (errmsg("not enough shared memory for post DTM recv. checks")))); + (errmsg("not enough shared memory for post DTM recv. checks")))); } if (!foundPtr) @@ -793,37 +823,38 @@ Persistent_PostDTMRecv_ShmemInit(void) hash_flags = (HASH_ELEM | HASH_FUNCTION); PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash = - ShmemInitHash("Post DTM Recv dbtblspc hash", - PT_MAX_NUM_POSTDTMRECV_DB, - PT_MAX_NUM_POSTDTMRECV_DB, - &info, - hash_flags); + ShmemInitHash("Post DTM Recv dbtblspc hash", + PT_MAX_NUM_POSTDTMRECV_DB, + PT_MAX_NUM_POSTDTMRECV_DB, + &info, + hash_flags); if (PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - (errmsg("not enough shared memory for post DTM recv. checks")))); + (errmsg("not enough shared memory for post DTM recv. 
checks")))); } } postDTMRecv_dbTblSpc_Hash_Entry * Persistent_PostDTMRecv_InsertHashEntry(Oid dbId, postDTMRecv_dbTblSpc_Hash_Entry *values, bool *exists) { - bool foundPtr; + bool foundPtr; postDTMRecv_dbTblSpc_Hash_Entry *entry; + Insist(PT_PostDTMRecv_Info); Insist(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash != NULL); entry = (postDTMRecv_dbTblSpc_Hash_Entry *) hash_search( - PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash, - (void *) &dbId, - HASH_ENTER, - &foundPtr); + PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash, + (void *) &dbId, + HASH_ENTER, + &foundPtr); if (entry == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - (errmsg("Not enough shared memory")))); + (errmsg("Not enough shared memory")))); } if (foundPtr) @@ -833,7 +864,7 @@ Persistent_PostDTMRecv_InsertHashEntry(Oid dbId, postDTMRecv_dbTblSpc_Hash_Entry *exists = FALSE; entry->database = values->database; entry->tablespace = values->tablespace; - elog(LOG,"Added %d database %d tablespace to Hash",entry->database, entry->tablespace); + elog(LOG, "Added %d database %d tablespace to Hash", entry->database, entry->tablespace); } return entry; @@ -842,7 +873,8 @@ Persistent_PostDTMRecv_InsertHashEntry(Oid dbId, postDTMRecv_dbTblSpc_Hash_Entry void Persistent_PostDTMRecv_RemoveHashEntry(Oid dbId) { - bool foundPtr; + bool foundPtr; + Insist(PT_PostDTMRecv_Info); Insist(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash != NULL); @@ -855,7 +887,7 @@ Persistent_PostDTMRecv_RemoveHashEntry(Oid dbId) postDTMRecv_dbTblSpc_Hash_Entry * Persistent_PostDTMRecv_LookupHashEntry(Oid dbId, bool *exists) { - bool foundPtr; + bool foundPtr; postDTMRecv_dbTblSpc_Hash_Entry *entry; Insist(PT_PostDTMRecv_Info); @@ -863,9 +895,9 @@ Persistent_PostDTMRecv_LookupHashEntry(Oid dbId, bool *exists) *exists = true; entry = (postDTMRecv_dbTblSpc_Hash_Entry *) hash_search(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash, - (void *) &dbId, - HASH_FIND, - &foundPtr); + (void *) &dbId, + HASH_FIND, + &foundPtr); if (!foundPtr) { *exists = false; @@ -910,7 +942,7 @@ Persistent_PrintHash() while ((entry = (postDTMRecv_dbTblSpc_Hash_Entry *) hash_seq_search(&status)) != NULL) { - elog(LOG,"Database : %d, Tablespace : %d", entry->database,entry->tablespace); + elog(LOG, "Database : %d, Tablespace : %d", entry->database, entry->tablespace); } } @@ -946,23 +978,23 @@ Persistent_Pre_ExecuteQuery() int Persistent_ExecuteQuery(char const *query, bool readOnlyQuery) { - StringInfoData sqlstmt; - int ret; - int proc = 0; + StringInfoData sqlstmt; + int ret; + int proc = 0; - Assert (query); + Assert(query); Insist(connected); - /*Initializations*/ + /* Initializations */ sqlstmt.data = NULL; /* Assemble our query string */ initStringInfo(&sqlstmt); - appendStringInfo(&sqlstmt,"%s",query); + appendStringInfo(&sqlstmt, "%s", query); PG_TRY(); { - /*XXX: Need to set the snapshot here. Reason - Unknown*/ + /* XXX: Need to set the snapshot here. Reason - Unknown */ ActiveSnapshot = SnapshotNow; /* Run the query. 
*/ @@ -971,22 +1003,23 @@ Persistent_ExecuteQuery(char const *query, bool readOnlyQuery) if (ret > 0 && SPI_tuptable != NULL) { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable* tuptable = SPI_tuptable; - int i,j; - char localbuf[8192]; + TupleDesc tupdesc = SPI_tuptable->tupdesc; + SPITupleTable *tuptable = SPI_tuptable; + int i, + j; + char localbuf[8192]; - for (j = 0; j< proc; j++) + for (j = 0; j < proc; j++) { - HeapTuple tuple = tuptable->vals[j]; + HeapTuple tuple = tuptable->vals[j]; for (i = 1, localbuf[0] = '\0'; i <= tupdesc->natts; i++) { - snprintf(localbuf + strlen (localbuf), sizeof(localbuf) - strlen(localbuf), " %s%s", - SPI_getvalue(tuple, tupdesc, i), - (i == tupdesc->natts) ? " " : " |"); + snprintf(localbuf + strlen(localbuf), sizeof(localbuf) - strlen(localbuf), " %s%s", + SPI_getvalue(tuple, tupdesc, i), + (i == tupdesc->natts) ? " " : " |"); } - elog (LOG, "==>: %s", localbuf); + elog(LOG, "==>: %s", localbuf); } } } @@ -1012,7 +1045,7 @@ Persistent_Post_ExecuteQuery(void) Insist(connected); SPI_finish(); - connected=false; + connected = false; MemoryContextSwitchTo(oldMemoryContext); CurrentResourceOwner = savedResourceOwner; @@ -1031,7 +1064,7 @@ Persistent_ExecuteQuery_Cleanup(void) if (connected) { SPI_finish(); - connected=false; + connected = false; } AbortCurrentTransaction(); @@ -1048,12 +1081,15 @@ Persistent_ExecuteQuery_Cleanup(void) bool Persistent_NonDBSpecificPTCatVerification(void) { - int querynum = 0; - bool testSucceeded = true; + int querynum = 0; + bool testSucceeded = true; - /*DataState is needed because some cross cons. queries should be run only in InSync mode*/ + /* + * DataState is needed because some cross cons. queries should be run only + * in InSync mode + */ getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); - bool isDataInSync = (dataState == DataStateInSync); + bool isDataInSync = (dataState == DataStateInSync); Persistent_Pre_ExecuteQuery(); PG_TRY(); @@ -1063,7 +1099,7 @@ Persistent_NonDBSpecificPTCatVerification(void) if (!isDataInSync && NonDBSpecific_PTCat_Verification_queries[querynum].executeWhenInSyncOnly) continue; - elog(LOG,"%s",NonDBSpecific_PTCat_Verification_queries[querynum].quertTitle); + elog(LOG, "%s", NonDBSpecific_PTCat_Verification_queries[querynum].quertTitle); if (Persistent_ExecuteQuery(NonDBSpecific_PTCat_Verification_queries[querynum].queryStr, true) > 0) testSucceeded = false; } @@ -1083,14 +1119,18 @@ Persistent_NonDBSpecificPTCatVerification(void) * Performs database specific PersistentTables-Catalog verification * Return true if all the verifications pass else returns false */ -bool Persistent_DBSpecificPTCatVerification (void) +bool +Persistent_DBSpecificPTCatVerification(void) { - int querynum = 0; - bool testSucceeded = true; + int querynum = 0; + bool testSucceeded = true; - /*DataState is needed because some cross cons. queries should be run only in InSync mode*/ + /* + * DataState is needed because some cross cons. 
queries should be run only + * in InSync mode + */ getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); - bool isDataInSync = (dataState == DataStateInSync); + bool isDataInSync = (dataState == DataStateInSync); Persistent_Pre_ExecuteQuery(); PG_TRY(); @@ -1100,7 +1140,7 @@ bool Persistent_DBSpecificPTCatVerification (void) if (!isDataInSync && DB_PTCat_Veritifcation_queries[querynum].executeWhenInSyncOnly) continue; - elog(LOG,"%s",DB_PTCat_Veritifcation_queries[querynum].quertTitle); + elog(LOG, "%s", DB_PTCat_Veritifcation_queries[querynum].quertTitle); if (Persistent_ExecuteQuery(DB_PTCat_Veritifcation_queries[querynum].queryStr, true) > 0) testSucceeded = false; } @@ -1145,9 +1185,9 @@ Persistent_PostDTMRecv_DBSpecificPTCatVerification(void) Datum gp_dbspecific_ptcat_verification(PG_FUNCTION_ARGS) { - elog(LOG,"DB specific PersistentTable-Catalog Verification using DB %d", MyDatabaseId); + elog(LOG, "DB specific PersistentTable-Catalog Verification using DB %d", MyDatabaseId); if (!Persistent_DBSpecificPTCatVerification()) - elog(ERROR,"DB specific PersistentTable-Catalog verifications failed."); + elog(ERROR, "DB specific PersistentTable-Catalog verifications failed."); PG_RETURN_BOOL(true); } @@ -1155,10 +1195,9 @@ gp_dbspecific_ptcat_verification(PG_FUNCTION_ARGS) Datum gp_nondbspecific_ptcat_verification(PG_FUNCTION_ARGS) { - elog(LOG,"Non-DB specific PersistentTable-Catalog Verification using DB %d", MyDatabaseId); + elog(LOG, "Non-DB specific PersistentTable-Catalog Verification using DB %d", MyDatabaseId); if (!Persistent_NonDBSpecificPTCatVerification()) - elog(ERROR,"Non-DB specific PersistentTable-Catalog verifications failed."); + elog(ERROR, "Non-DB specific PersistentTable-Catalog verifications failed."); PG_RETURN_BOOL(true); } - diff --git a/src/backend/cdb/cdbresynchronizechangetracking.c b/src/backend/cdb/cdbresynchronizechangetracking.c index 214dd1c737..1d6b5b5c47 100644 --- a/src/backend/cdb/cdbresynchronizechangetracking.c +++ b/src/backend/cdb/cdbresynchronizechangetracking.c @@ -49,114 +49,124 @@ /* * Global Variables */ -char* changeTrackingMainBuffer; /* buffer for writing into main file (full) */ -char* changeTrackingCompactingBuffer; /* buffer for using when compacting files */ -char* changeTrackingXlogDataBuffer; -char metabuf[CHANGETRACKING_METABUFLEN]; -ChangeTrackingBufStatusData* CTMainWriteBufStatus; /* describes state of changeTrackingMainBuffer */ -ChangeTrackingBufStatusData* CTCompactWriteBufStatus; /* describes state of changeTrackingCompactingBuffer */ -ChangeTrackingResyncMetaData* changeTrackingResyncMeta; -ChangeTrackingLogCompactingStateData* changeTrackingCompState; /* state of data compacting in log files */ - -extern bool enable_groupagg; /* from guc.h */ +char *changeTrackingMainBuffer; /* buffer for writing into main file + * (full) */ +char *changeTrackingCompactingBuffer; /* buffer for using when + * compacting files */ +char *changeTrackingXlogDataBuffer; +char metabuf[CHANGETRACKING_METABUFLEN]; +ChangeTrackingBufStatusData *CTMainWriteBufStatus; /* describes state of + * changeTrackingMainBuffer */ +ChangeTrackingBufStatusData *CTCompactWriteBufStatus; /* describes state of + * changeTrackingCompactingBuffer */ +ChangeTrackingResyncMetaData *changeTrackingResyncMeta; +ChangeTrackingLogCompactingStateData *changeTrackingCompState; /* state of data + * compacting in log + * files */ + +extern bool enable_groupagg; /* from guc.h */ /* * Local functions */ -static int ChangeTracking_WriteBuffer(File file, CTFType 
ftype); -static void ChangeTracking_AddBufferPoolChange(CTFType ctype, - XLogRecPtr* xlogLocation, - RelFileNode* relFileNode, - BlockNumber blockNum, - ItemPointerData persistentTid, - int64 persistentSerialNum); -//static IncrementalChangeList* ChangeTracking_InitIncrementalChangeList(int count); -static ChangeTrackingResult* ChangeTracking_FormResult(int count); -static void ChangeTracking_AddResultEntry(ChangeTrackingResult *result, - Oid space, - Oid db, - Oid rel, - BlockNumber blocknum, - XLogRecPtr* lsn_end); +static int ChangeTracking_WriteBuffer(File file, CTFType ftype); +static void ChangeTracking_AddBufferPoolChange(CTFType ctype, + XLogRecPtr *xlogLocation, + RelFileNode *relFileNode, + BlockNumber blockNum, + ItemPointerData persistentTid, + int64 persistentSerialNum); + +/* static IncrementalChangeList* ChangeTracking_InitIncrementalChangeList(int count); */ +static ChangeTrackingResult *ChangeTracking_FormResult(int count); +static void ChangeTracking_AddResultEntry(ChangeTrackingResult *result, + Oid space, + Oid db, + Oid rel, + BlockNumber blocknum, + XLogRecPtr *lsn_end); static void ChangeTracking_MarkFullResyncLockAcquired(void); static void ChangeTracking_HandleWriteError(CTFType ft); static void ChangeTracking_CreateTransientLogIfNeeded(void); -static void ChangeTracking_ResetBufStatus(ChangeTrackingBufStatusData* bufstat); -static void ChangeTracking_ResetCompactingStatus(ChangeTrackingLogCompactingStateData* compstat); +static void ChangeTracking_ResetBufStatus(ChangeTrackingBufStatusData *bufstat); +static void ChangeTracking_ResetCompactingStatus(ChangeTrackingLogCompactingStateData *compstat); /* * Return the required shared-memory size for this module. */ -extern Size ChangeTrackingShmemSize(void) +extern Size +ChangeTrackingShmemSize(void) { Size size = 0; - - size = add_size(size, 2 * CHANGETRACKING_BLCKSZ); /* two 32kB shmem buffers */ - size = add_size(size, CHANGETRACKING_XLOGDATASZ); + + size = add_size(size, 2 * CHANGETRACKING_BLCKSZ); /* two 32kB shmem + * buffers */ + size = add_size(size, CHANGETRACKING_XLOGDATASZ); size = add_size(size, 2 * sizeof(ChangeTrackingBufStatusData)); /* the 2 buffer status */ - size = add_size(size, sizeof(ChangeTrackingResyncMetaData)); /* the resync metadata */ + size = add_size(size, sizeof(ChangeTrackingResyncMetaData)); /* the resync metadata */ size = add_size(size, sizeof(ChangeTrackingLogCompactingStateData)); - + return size; } - + /* * Initialize the shared-memory for this module. 
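/*
 * A minimal sketch, not part of the patch: the overflow-safe add_size
 * arithmetic that ChangeTrackingShmemSize (above) and
 * Persistent_PostDTMRecv_ShmemSize (earlier) perform, with
 * hash_estimate_size covering a ShmemInitHash table. Parameters and the
 * helper name are hypothetical.
 */
#include "postgres.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"

static Size
module_shmem_size(Size fixed_state, long hash_entries, Size entry_size)
{
	Size		size = 0;

	size = add_size(size, fixed_state);	/* fixed-size shared structs */
	size = add_size(size, hash_estimate_size(hash_entries, entry_size));
	return size;						/* add_size ereports on overflow */
}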
*/ -extern void ChangeTrackingShmemInit(void) +extern void +ChangeTrackingShmemInit(void) { - bool foundBuffer1, - foundBuffer2, - foundStatus1, - foundStatus2, - foundMeta, - foundXlogData, - foundCompState; - size_t bufsize1 = CHANGETRACKING_BLCKSZ; - size_t bufsize2 = CHANGETRACKING_XLOGDATASZ; - - changeTrackingMainBuffer = (char *) - ShmemInitStruct("Change Tracking Main (Full) Log Buffer", - bufsize1, + bool foundBuffer1, + foundBuffer2, + foundStatus1, + foundStatus2, + foundMeta, + foundXlogData, + foundCompState; + size_t bufsize1 = CHANGETRACKING_BLCKSZ; + size_t bufsize2 = CHANGETRACKING_XLOGDATASZ; + + changeTrackingMainBuffer = (char *) + ShmemInitStruct("Change Tracking Main (Full) Log Buffer", + bufsize1, &foundBuffer1); - changeTrackingCompactingBuffer = (char *) - ShmemInitStruct("Change Tracking Log Buffer for compacting operations", - bufsize1, + changeTrackingCompactingBuffer = (char *) + ShmemInitStruct("Change Tracking Log Buffer for compacting operations", + bufsize1, &foundBuffer2); - CTMainWriteBufStatus = (ChangeTrackingBufStatusData *) - ShmemInitStruct("Change Tracking Full Log Buffer Status", - sizeof(ChangeTrackingBufStatusData), + CTMainWriteBufStatus = (ChangeTrackingBufStatusData *) + ShmemInitStruct("Change Tracking Full Log Buffer Status", + sizeof(ChangeTrackingBufStatusData), &foundStatus1); - CTCompactWriteBufStatus = (ChangeTrackingBufStatusData *) - ShmemInitStruct("Change Tracking Compact Log Buffer Status", - sizeof(ChangeTrackingBufStatusData), + CTCompactWriteBufStatus = (ChangeTrackingBufStatusData *) + ShmemInitStruct("Change Tracking Compact Log Buffer Status", + sizeof(ChangeTrackingBufStatusData), &foundStatus2); - changeTrackingResyncMeta = (ChangeTrackingResyncMetaData *) - ShmemInitStruct("Change Tracking Resync Meta Data", - sizeof(ChangeTrackingResyncMetaData), + changeTrackingResyncMeta = (ChangeTrackingResyncMetaData *) + ShmemInitStruct("Change Tracking Resync Meta Data", + sizeof(ChangeTrackingResyncMetaData), &foundMeta); - changeTrackingXlogDataBuffer = (char *) - ShmemInitStruct("Change Tracking Xlog Data Buffer", - bufsize2, - &foundXlogData); + changeTrackingXlogDataBuffer = (char *) + ShmemInitStruct("Change Tracking Xlog Data Buffer", + bufsize2, + &foundXlogData); - changeTrackingCompState = (ChangeTrackingLogCompactingStateData *) - ShmemInitStruct("Change Tracking Compacting state", - sizeof(ChangeTrackingLogCompactingStateData), + changeTrackingCompState = (ChangeTrackingLogCompactingStateData *) + ShmemInitStruct("Change Tracking Compacting state", + sizeof(ChangeTrackingLogCompactingStateData), &foundCompState); /* See if we are already initialized */ - if (foundBuffer1 || foundBuffer2 || foundStatus1 || + if (foundBuffer1 || foundBuffer2 || foundStatus1 || foundStatus2 || foundMeta || foundXlogData || foundCompState) { /* all should be present or neither */ - Assert(foundBuffer1 && foundBuffer2 && foundBuffer1 && + Assert(foundBuffer1 && foundBuffer2 && foundBuffer1 && foundBuffer2 && foundMeta && foundXlogData && foundCompState); return; } @@ -165,7 +175,7 @@ extern void ChangeTrackingShmemInit(void) memset(changeTrackingMainBuffer, 0, bufsize1); memset(changeTrackingCompactingBuffer, 0, bufsize1); memset(changeTrackingXlogDataBuffer, 0, bufsize2); - + /* init buffer status */ CTMainWriteBufStatus->maxbufsize = bufsize1; ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); @@ -181,7 +191,7 @@ extern void ChangeTrackingShmemInit(void) changeTrackingResyncMeta->insync_transition_completed = false; 
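/*
 * A minimal sketch, not part of the patch: the ShmemInitStruct
 * attach-or-create idiom that ChangeTrackingShmemInit applies to each of
 * its seven shared areas. Every backend gets the same pointer; only the
 * first one through (foundPtr == false) initializes the contents. The
 * struct and helper name are hypothetical.
 */
#include "postgres.h"
#include "storage/shmem.h"

typedef struct DemoShmemState
{
	int			counter;
} DemoShmemState;

static DemoShmemState *
attach_demo_state(void)
{
	bool		found;
	DemoShmemState *state = (DemoShmemState *)
	ShmemInitStruct("Demo Module State", sizeof(DemoShmemState), &found);

	if (!found)
		state->counter = 0;		/* first attach: initialize */
	return state;
}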
ChangeTracking_ResetCompactingStatus(changeTrackingCompState); - + ereport(DEBUG1, (errmsg("initialized changetracking shared memory structures"))); return; @@ -190,35 +200,37 @@ extern void ChangeTrackingShmemInit(void) /* * Reset shmem variables to zero. */ -extern void ChangeTrackingShmemReset(void) +extern void +ChangeTrackingShmemReset(void) { ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); ChangeTracking_ResetCompactingStatus(changeTrackingCompState); - + return; } /* - * This procedure will be called when mirror loss has been detected AND + * This procedure will be called when mirror loss has been detected AND * the master has told the primary it is carrying on. - * It scans the xlog starting from the most recent checkpoint and collects + * It scans the xlog starting from the most recent checkpoint and collects * all the interesting changes for the changelog. add them to the changelog. */ -void ChangeTracking_CreateInitialFromPreviousCheckpoint( - XLogRecPtr *lastChangeTrackingEndLoc) -{ +void +ChangeTracking_CreateInitialFromPreviousCheckpoint( + XLogRecPtr *lastChangeTrackingEndLoc) +{ if (gp_change_tracking) { - int count = XLogAddRecordsToChangeTracking(lastChangeTrackingEndLoc); + int count = XLogAddRecordsToChangeTracking(lastChangeTrackingEndLoc); elog(LOG, "scanned through %d initial xlog records since last checkpoint " - "for writing into the resynchronize change log", count); + "for writing into the resynchronize change log", count); } else { elog(WARNING, "Change logging is disabled. This should only occur after " - "a manual intervention of an administrator, and only with " - "guidance from greenplum support."); + "a manual intervention of an administrator, and only with " + "guidance from greenplum support."); } } @@ -229,28 +241,29 @@ void ChangeTracking_CreateInitialFromPreviousCheckpoint( * xlogLocation - The XLOG LSN of the record that describes the page change. * relFileNode - The tablespace, database, and relation OIDs for the changed relation. * blockNum - the block that was changed. - * + * */ -static void ChangeTracking_AddBufferPoolChange(CTFType ftype, - XLogRecPtr* xlogLocation, - RelFileNode* relFileNode, - BlockNumber blockNum, - ItemPointerData persistentTid, - int64 persistentSerialNum) +static void +ChangeTracking_AddBufferPoolChange(CTFType ftype, + XLogRecPtr *xlogLocation, + RelFileNode *relFileNode, + BlockNumber blockNum, + ItemPointerData persistentTid, + int64 persistentSerialNum) { ChangeTrackingRecord rec; ChangeTrackingBufStatusData *bufstat; - char* buf; - int freespace = 0; + char *buf; + int freespace = 0; /* gp_persistent relation change? we shouldn't log it. 
exit early */ - if(GpPersistent_SkipXLogInfo(relFileNode->relNode)) + if (GpPersistent_SkipXLogInfo(relFileNode->relNode)) return; - + Assert(ftype != CTF_META); Assert(ftype != CTF_LOG_TRANSIENT); - - if(ftype == CTF_LOG_FULL) + + if (ftype == CTF_LOG_FULL) { /* this is a regular write from xlog */ bufstat = CTMainWriteBufStatus; @@ -262,8 +275,8 @@ static void ChangeTracking_AddBufferPoolChange(CTFType ftype, bufstat = CTCompactWriteBufStatus; buf = changeTrackingCompactingBuffer; } - - + + /* populate a new change log record */ rec.xlogLocation = *xlogLocation; rec.relFileNode = *relFileNode; @@ -272,41 +285,44 @@ static void ChangeTracking_AddBufferPoolChange(CTFType ftype, rec.persistentSerialNum = persistentSerialNum; LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - - if(bufstat->bufsize == 0) - bufstat->bufsize = sizeof(ChangeTrackingPageHeader); /* leave room for header (first time around only) */ - + + if (bufstat->bufsize == 0) + bufstat->bufsize = sizeof(ChangeTrackingPageHeader); /* leave room for header + * (first time around + * only) */ + /* copy to our shared memory buffer */ memcpy(buf + bufstat->bufsize, &rec, sizeof(rec)); - + /* update state in shared memory */ bufstat->recordcount++; bufstat->bufsize += sizeof(rec); - /* - * check if buffer is full. if it is pad it with zeros, add a - * header and write it to the change log file. We don't flush - * it yet, it will be done during checkpoint. + /* + * check if buffer is full. if it is pad it with zeros, add a header and + * write it to the change log file. We don't flush it yet, it will be done + * during checkpoint. */ freespace = bufstat->maxbufsize - bufstat->bufsize; - if(freespace < sizeof(ChangeTrackingRecord)) + if (freespace < sizeof(ChangeTrackingRecord)) { /* - * NOTE: We open the file, write it, and close it each time a buffer gets - * written. Why? The 'File' reference used to be kept in shmem but - * when the background writer comes in with a checkpoint the fd.c - * cache of the bgwriter process didn't know about this file. so, - * for now we keep it in the local fd.c cache and open and close each - * time until a better solution is found. + * NOTE: We open the file, write it, and close it each time a buffer + * gets written. Why? The 'File' reference used to be kept in shmem + * but when the background writer comes in with a checkpoint the fd.c + * cache of the bgwriter process didn't know about this file. so, for + * now we keep it in the local fd.c cache and open and close each time + * until a better solution is found. */ - File file = ChangeTracking_OpenFile(ftype); + File file = ChangeTracking_OpenFile(ftype); + if (ChangeTracking_WriteBuffer(file, ftype) < 0) ChangeTracking_HandleWriteError(ftype); ChangeTracking_CloseFile(file); } - + LWLockRelease(ChangeTrackingWriteLock); - + if (Debug_filerep_print) { elog(LOG, @@ -327,114 +343,116 @@ static void ChangeTracking_AddBufferPoolChange(CTFType ftype, * return it in a buffer. The data is stored in a buffer * in a similar structure to how an xlog record data section * looks like. - * + * * Normally the data is stored in a shared memory buffer * 'changeTrackingXlogDataBuffer'. however, there are special * cases (currently gist is the only one) which may require * a much larger buffer to store the data. In that case we - * dynamically allocate a buffer and populate it, while + * dynamically allocate a buffer and populate it, while * marking 'iscopy' to true to let the caller know they * need to pfree it themselves. 
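+ *
+ * The expected caller pattern, sketched with illustrative variable names:
+ *
+ *		bool	iscopy;
+ *		char   *buf = ChangeTracking_CopyRdataBuffers(rdata, rmid, info,
+ *													  &iscopy);
+ *
+ *		if (buf != NULL)
+ *		{
+ *			... use buf ...
+ *
+ *			if (iscopy)
+ *				pfree(buf);
+ *		}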
*/ -char* ChangeTracking_CopyRdataBuffers(XLogRecData *rdata, - RmgrId rmid, - uint8 info, - bool* iscopy) +char * +ChangeTracking_CopyRdataBuffers(XLogRecData *rdata, + RmgrId rmid, + uint8 info, + bool *iscopy) { - XLogRecData* ptr = rdata; /* don't want to change rdata, use another ptr */ - char* gist_data_buf = NULL; - int pos = 0; - bool gist_split_page = (rmid == RM_GIST_ID && - (info & ~XLR_INFO_MASK) == XLOG_GIST_PAGE_SPLIT); - + XLogRecData *ptr = rdata; /* don't want to change rdata, use another ptr */ + char *gist_data_buf = NULL; + int pos = 0; + bool gist_split_page = (rmid == RM_GIST_ID && + (info & ~XLR_INFO_MASK) == XLOG_GIST_PAGE_SPLIT); + while (ptr->data == NULL && ptr->next != NULL) ptr = ptr->next; - + if (ptr->data == NULL) { *iscopy = false; return NULL; } - + /* Copy the main (first) rdata data block */ Assert(ptr->len <= CHANGETRACKING_XLOGDATASZ); memcpy(changeTrackingXlogDataBuffer, ptr->data, ptr->len); pos += ptr->len; - + /* ok, we're done! ... unless there's a special case to handle */ - + /* special case: gist split has data we need in the next rdata blocks */ - if(gist_split_page) + if (gist_split_page) { - XLogRecData* ptr_save_loc = ptr; - int gist_data_len = ptr->len; /* previous data */ - + XLogRecData *ptr_save_loc = ptr; + int gist_data_len = ptr->len; /* previous data */ + /* pre-calculate buf size we will need */ - while(ptr->next != NULL) + while (ptr->next != NULL) { ptr = ptr->next; - - if(ptr->data != NULL) + + if (ptr->data != NULL) gist_data_len += ptr->len; } /* allocate a buffer. copy all previously copied data */ gist_data_buf = (char *) palloc(gist_data_len * sizeof(char)); memcpy(gist_data_buf, changeTrackingXlogDataBuffer, pos); - + /* now copy the rest of the gist data */ ptr = ptr_save_loc; - while(ptr->next != NULL) + while (ptr->next != NULL) { ptr = ptr->next; - - if(ptr->data != NULL) + + if (ptr->data != NULL) { memcpy(gist_data_buf + pos, ptr->data, ptr->len); - pos += ptr->len; + pos += ptr->len; } } - + *iscopy = true; return gist_data_buf; } - + *iscopy = false; return changeTrackingXlogDataBuffer; } /* - * When a new xlog record is created and we're in changetracking mode this + * When a new xlog record is created and we're in changetracking mode this * function gets called in order to create a changetracking record as well. * If the passed in xlog record is uninteresting to us, the function will * not log it and will return normally. - * + * * We pass in the actual RM data *separately* from the XLogRecord. We normally * wouldn't need to do that, because the data follows the XLogRecord header, * however it turns out that XLogInsert() will break apart an xlog record if in * buffer boundaries and load some of it in the end of current buffer and the * rest, therefore leaving it no longer contigious in memory. 
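+ *
+ * Put differently, the record payload must be reassembled from the
+ * XLogRecData chain before it is handed in here; the total payload length
+ * is accumulated over the chain, roughly:
+ *
+ *		XLogRecData *ptr;
+ *		uint32		len = 0;
+ *
+ *		for (ptr = rdata; ptr != NULL; ptr = ptr->next)
+ *			if (ptr->data != NULL)
+ *				len += ptr->len;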
*/ -void ChangeTracking_AddRecordFromXlog(RmgrId rmid, - uint8 info, - void* data, - XLogRecPtr* loc) +void +ChangeTracking_AddRecordFromXlog(RmgrId rmid, + uint8 info, + void *data, + XLogRecPtr *loc) { - int relationChangeInfoArrayCount; - int i; - int arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(rmid, info); - RelationChangeInfo relationChangeInfoArray[arrlen]; - + int relationChangeInfoArrayCount; + int i; + int arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(rmid, info); + RelationChangeInfo relationChangeInfoArray[arrlen]; + Assert(gp_change_tracking); - + ChangeTracking_GetRelationChangeInfoFromXlog( - rmid, - info, - data, - relationChangeInfoArray, - &relationChangeInfoArrayCount, - arrlen); + rmid, + info, + data, + relationChangeInfoArray, + &relationChangeInfoArrayCount, + arrlen); for (i = 0; i < relationChangeInfoArrayCount; i++) ChangeTracking_AddBufferPoolChange(CTF_LOG_FULL, @@ -445,38 +463,39 @@ void ChangeTracking_AddRecordFromXlog(RmgrId rmid, relationChangeInfoArray[i].persistentSerialNum); } -bool ChangeTracking_PrintRelationChangeInfo( - RmgrId xl_rmid, - uint8 xl_info, - void *data, - XLogRecPtr *loc, - bool weAreGeneratingXLogNow, - bool printSkipIssuesOnly) +bool +ChangeTracking_PrintRelationChangeInfo( + RmgrId xl_rmid, + uint8 xl_info, + void *data, + XLogRecPtr *loc, + bool weAreGeneratingXLogNow, + bool printSkipIssuesOnly) { - bool atLeastOneSkipIssue = false; - int relationChangeInfoArrayCount; - int i; - int arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(xl_rmid, xl_info); - RelationChangeInfo relationChangeInfoArray[arrlen]; - + bool atLeastOneSkipIssue = false; + int relationChangeInfoArrayCount; + int i; + int arrlen = ChangeTracking_GetInfoArrayDesiredMaxLength(xl_rmid, xl_info); + RelationChangeInfo relationChangeInfoArray[arrlen]; + ChangeTracking_GetRelationChangeInfoFromXlog( - xl_rmid, - xl_info, - data, - relationChangeInfoArray, - &relationChangeInfoArrayCount, - arrlen); + xl_rmid, + xl_info, + data, + relationChangeInfoArray, + &relationChangeInfoArrayCount, + arrlen); for (i = 0; i < relationChangeInfoArrayCount; i++) { - RelationChangeInfo *relationChangeInfo; - int64 maxPersistentSerialNum; - bool skip; - bool zeroTid = false; - bool invalidTid = false; - bool zeroSerialNum = false; - bool invalidSerialNum = false; - bool skipIssue = false; + RelationChangeInfo *relationChangeInfo; + int64 maxPersistentSerialNum; + bool skip; + bool zeroTid = false; + bool invalidTid = false; + bool zeroSerialNum = false; + bool invalidSerialNum = false; + bool skipIssue = false; relationChangeInfo = &relationChangeInfoArray[i]; @@ -497,7 +516,8 @@ bool ChangeTracking_PrintRelationChangeInfo( invalidSerialNum = (relationChangeInfo->persistentSerialNum < 0); /* - * If we have'nt done the scan yet... do not do upper range check. + * If we have'nt done the scan yet... do not do upper range + * check. 
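+			 * (A maxPersistentSerialNum of 0 is taken here to mean that the
+			 * persistent table scan has not been performed yet, so no upper
+			 * bound on serial numbers is known.)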
*/ if (maxPersistentSerialNum != 0 && relationChangeInfo->persistentSerialNum > maxPersistentSerialNum) @@ -507,25 +527,25 @@ bool ChangeTracking_PrintRelationChangeInfo( } if (!printSkipIssuesOnly || skipIssue) - elog(LOG, + elog(LOG, "ChangeTracking_PrintRelationChangeInfo: [%d] xl_rmid %d, xl_info 0x%X, %u/%u/%u, block number %u, LSN %s, persistent serial num " INT64_FORMAT ", TID %s, maxPersistentSerialNum " INT64_FORMAT ", skip %s, zeroTid %s, invalidTid %s, zeroSerialNum %s, invalidSerialNum %s, skipIssue %s", - i, - xl_rmid, - xl_info, - relationChangeInfo->relFileNode.spcNode, - relationChangeInfo->relFileNode.dbNode, - relationChangeInfo->relFileNode.relNode, - relationChangeInfo->blockNumber, - XLogLocationToString(loc), - relationChangeInfo->persistentSerialNum, - ItemPointerToString(&relationChangeInfo->persistentTid), - maxPersistentSerialNum, - (skip ? "true" : "false"), - (zeroTid ? "true" : "false"), - (invalidTid ? "true" : "false"), - (zeroSerialNum ? "true" : "false"), - (invalidSerialNum ? "true" : "false"), - (skipIssue ? "true" : "false")); + i, + xl_rmid, + xl_info, + relationChangeInfo->relFileNode.spcNode, + relationChangeInfo->relFileNode.dbNode, + relationChangeInfo->relFileNode.relNode, + relationChangeInfo->blockNumber, + XLogLocationToString(loc), + relationChangeInfo->persistentSerialNum, + ItemPointerToString(&relationChangeInfo->persistentTid), + maxPersistentSerialNum, + (skip ? "true" : "false"), + (zeroTid ? "true" : "false"), + (invalidTid ? "true" : "false"), + (zeroSerialNum ? "true" : "false"), + (invalidSerialNum ? "true" : "false"), + (skipIssue ? "true" : "false")); if (skipIssue) atLeastOneSkipIssue = true; @@ -535,54 +555,56 @@ bool ChangeTracking_PrintRelationChangeInfo( } -static void ChangeTracking_AddRelationChangeInfo( - RelationChangeInfo *relationChangeInfoArray, - int *relationChangeInfoArrayCount, - int relationChangeInfoMaxSize, - RelFileNode *relFileNode, - BlockNumber blockNumber, - ItemPointer persistentTid, - int64 persistentSerialNum) +static void +ChangeTracking_AddRelationChangeInfo( + RelationChangeInfo *relationChangeInfoArray, + int *relationChangeInfoArrayCount, + int relationChangeInfoMaxSize, + RelFileNode *relFileNode, + BlockNumber blockNumber, + ItemPointer persistentTid, + int64 persistentSerialNum) { - RelationChangeInfo *relationChangeInfo; - - Assert (*relationChangeInfoArrayCount < relationChangeInfoMaxSize); + RelationChangeInfo *relationChangeInfo; + + Assert(*relationChangeInfoArrayCount < relationChangeInfoMaxSize); relationChangeInfo = &relationChangeInfoArray[*relationChangeInfoArrayCount]; - relationChangeInfo->relFileNode = *relFileNode; - relationChangeInfo->blockNumber = blockNumber; - relationChangeInfo->persistentTid = *persistentTid; - relationChangeInfo->persistentSerialNum = persistentSerialNum; + relationChangeInfo->relFileNode = *relFileNode; + relationChangeInfo->blockNumber = blockNumber; + relationChangeInfo->persistentTid = *persistentTid; + relationChangeInfo->persistentSerialNum = persistentSerialNum; (*relationChangeInfoArrayCount)++; } -void ChangeTracking_GetRelationChangeInfoFromXlog( - RmgrId xl_rmid, - uint8 xl_info, - void *data, - RelationChangeInfo *relationChangeInfoArray, - int *relationChangeInfoArrayCount, - int relationChangeInfoMaxSize) +void +ChangeTracking_GetRelationChangeInfoFromXlog( + RmgrId xl_rmid, + uint8 xl_info, + void *data, + RelationChangeInfo *relationChangeInfoArray, + int *relationChangeInfoArrayCount, + int relationChangeInfoMaxSize) { - uint8 info = xl_info & 
~XLR_INFO_MASK; - uint8 op = 0; + uint8 info = xl_info & ~XLR_INFO_MASK; + uint8 op = 0; MemSet(relationChangeInfoArray, 0, sizeof(RelationChangeInfo) * relationChangeInfoMaxSize); *relationChangeInfoArrayCount = 0; /* - * Find the RM for this xlog record and see whether we are - * interested in logging it as a buffer pool change or not. + * Find the RM for this xlog record and see whether we are interested in + * logging it as a buffer pool change or not. */ switch (xl_rmid) { - /* - * The following changes aren't interesting to the change log - */ + /* + * The following changes aren't interesting to the change log + */ case RM_CLOG_ID: case RM_MULTIXACT_ID: case RM_XACT_ID: @@ -594,23 +616,28 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( #ifdef USE_SEGWALREP case RM_APPEND_ONLY_ID: -#endif /* USE_SEGWALREP */ +#endif /* USE_SEGWALREP */ break; - /* - * These aren't supported in GPDB - */ + /* + * These aren't supported in GPDB + */ case RM_HASH_ID: elog(ERROR, "internal error: unsupported RM ID (%d) in ChangeTracking_GetRelationChangeInfoFromXlog", xl_rmid); break; case RM_GIN_ID: - /* keep LOG severity till crash recovery or GIN is implemented in order to avoid double failures during cdbfast */ + + /* + * keep LOG severity till crash recovery or GIN is implemented in + * order to avoid double failures during cdbfast + */ elog(LOG, "internal error: unsupported RM ID (%d) in ChangeTracking_GetRelationChangeInfoFromXlog", xl_rmid); break; - /* - * The following changes must be logged in the change log. - */ + + /* + * The following changes must be logged in the change log. + */ case RM_XLOG_ID: if (info == XLOG_HINT) { @@ -618,13 +645,13 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( memcpy(&bkpbwithpt, data, sizeof(BkpBlockWithPT)); ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(bkpbwithpt.bkpb.node), - bkpbwithpt.bkpb.block, - &bkpbwithpt.persistentTid, - bkpbwithpt.persistentSerialNum); + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(bkpbwithpt.bkpb.node), + bkpbwithpt.bkpb.block, + &bkpbwithpt.persistentTid, + bkpbwithpt.persistentSerialNum); } break; @@ -633,34 +660,34 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( switch (op) { case XLOG_HEAP2_FREEZE: - { - xl_heap_freeze *xlrec = (xl_heap_freeze *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->heapnode.node), - xlrec->block, - &xlrec->heapnode.persistentTid, - xlrec->heapnode.persistentSerialNum); - break; - } + { + xl_heap_freeze *xlrec = (xl_heap_freeze *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->heapnode.node), + xlrec->block, + &xlrec->heapnode.persistentTid, + xlrec->heapnode.persistentSerialNum); + break; + } case XLOG_HEAP2_CLEAN: case XLOG_HEAP2_CLEAN_MOVE: - { - xl_heap_clean *xlrec = (xl_heap_clean *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->heapnode.node), - xlrec->block, - &xlrec->heapnode.persistentTid, - xlrec->heapnode.persistentSerialNum); - break; - } + { + xl_heap_clean *xlrec = (xl_heap_clean *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->heapnode.node), + 
xlrec->block, + &xlrec->heapnode.persistentTid, + xlrec->heapnode.persistentSerialNum); + break; + } default: elog(ERROR, "internal error: unsupported RM_HEAP2_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", info); } @@ -670,108 +697,108 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( switch (op) { case XLOG_HEAP_INSERT: - { - xl_heap_insert *xlrec = (xl_heap_insert *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - break; - } + { + xl_heap_insert *xlrec = (xl_heap_insert *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + break; + } case XLOG_HEAP_DELETE: - { - xl_heap_delete *xlrec = (xl_heap_delete *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - break; - } + { + xl_heap_delete *xlrec = (xl_heap_delete *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + break; + } case XLOG_HEAP_HOT_UPDATE: case XLOG_HEAP_UPDATE: case XLOG_HEAP_MOVE: - { - xl_heap_update *xlrec = (xl_heap_update *) data; - - BlockNumber oldblock = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - BlockNumber newblock = ItemPointerGetBlockNumber(&(xlrec->newtid)); - bool samepage = (oldblock == newblock); - - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - newblock, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - if(!samepage) + { + xl_heap_update *xlrec = (xl_heap_update *) data; + + BlockNumber oldblock = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + BlockNumber newblock = ItemPointerGetBlockNumber(&(xlrec->newtid)); + bool samepage = (oldblock == newblock); + + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - oldblock, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - break; - } + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + newblock, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + if (!samepage) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + oldblock, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + break; + } case XLOG_HEAP_NEWPAGE: - { - xl_heap_newpage *xlrec = (xl_heap_newpage *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->heapnode.node), - xlrec->blkno, - &xlrec->heapnode.persistentTid, 
- xlrec->heapnode.persistentSerialNum); - break; - } + { + xl_heap_newpage *xlrec = (xl_heap_newpage *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->heapnode.node), + xlrec->blkno, + &xlrec->heapnode.persistentTid, + xlrec->heapnode.persistentSerialNum); + break; + } case XLOG_HEAP_LOCK: - { - xl_heap_lock *xlrec = (xl_heap_lock *) data; - BlockNumber block = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - block, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - break; - } + { + xl_heap_lock *xlrec = (xl_heap_lock *) data; + BlockNumber block = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + block, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + break; + } case XLOG_HEAP_INPLACE: - { - xl_heap_inplace *xlrec = (xl_heap_inplace *) data; - BlockNumber block = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - block, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - break; - } + { + xl_heap_inplace *xlrec = (xl_heap_inplace *) data; + BlockNumber block = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + block, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + break; + } default: elog(ERROR, "internal error: unsupported RM_HEAP_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", op); @@ -784,167 +811,167 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( case XLOG_BTREE_INSERT_LEAF: case XLOG_BTREE_INSERT_UPPER: case XLOG_BTREE_INSERT_META: - { - xl_btree_insert *xlrec = (xl_btree_insert *) data; - BlockIdData blkid = xlrec->target.tid.ip_blkid; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - BlockIdGetBlockNumber(&blkid), - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - if(info == XLOG_BTREE_INSERT_META) + { + xl_btree_insert *xlrec = (xl_btree_insert *) data; + BlockIdData blkid = xlrec->target.tid.ip_blkid; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - BTREE_METAPAGE, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + BlockIdGetBlockNumber(&blkid), + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + if (info == XLOG_BTREE_INSERT_META) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + BTREE_METAPAGE, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); - break; - } + break; + } case XLOG_BTREE_SPLIT_L: case XLOG_BTREE_SPLIT_L_ROOT: case XLOG_BTREE_SPLIT_R: case 
XLOG_BTREE_SPLIT_R_ROOT: - { - xl_btree_split *xlrec = (xl_btree_split *) data; - - /* orig page / new left page */ - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, &(xlrec->node), - xlrec->leftsib, - &xlrec->persistentTid, - xlrec->persistentSerialNum); - - /* new right page */ - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, &(xlrec->node), - xlrec->rightsib, - &xlrec->persistentTid, - xlrec->persistentSerialNum); - - /* next block (orig page's rightlink) */ - if (xlrec->rnext != P_NONE) { + xl_btree_split *xlrec = (xl_btree_split *) data; + + /* orig page / new left page */ + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, &(xlrec->node), + xlrec->leftsib, + &xlrec->persistentTid, + xlrec->persistentSerialNum); + + /* new right page */ ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, &(xlrec->node), - xlrec->rnext, - &xlrec->persistentTid, - xlrec->persistentSerialNum); + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, &(xlrec->node), + xlrec->rightsib, + &xlrec->persistentTid, + xlrec->persistentSerialNum); + + /* next block (orig page's rightlink) */ + if (xlrec->rnext != P_NONE) + { + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, &(xlrec->node), + xlrec->rnext, + &xlrec->persistentTid, + xlrec->persistentSerialNum); + } + break; } - break; - } case XLOG_BTREE_DELETE: - { - xl_btree_delete *xlrec = (xl_btree_delete *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->btreenode.node), - xlrec->block, - &xlrec->btreenode.persistentTid, - xlrec->btreenode.persistentSerialNum); - break; - } + { + xl_btree_delete *xlrec = (xl_btree_delete *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->btreenode.node), + xlrec->block, + &xlrec->btreenode.persistentTid, + xlrec->btreenode.persistentSerialNum); + break; + } case XLOG_BTREE_DELETE_PAGE: case XLOG_BTREE_DELETE_PAGE_HALF: case XLOG_BTREE_DELETE_PAGE_META: - { - xl_btree_delete_page *xlrec = (xl_btree_delete_page *) data; - BlockIdData blkid = xlrec->target.tid.ip_blkid; - BlockNumber block = BlockIdGetBlockNumber(&blkid); - - if (block != P_NONE) - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - block, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - if (xlrec->rightblk != P_NONE) - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - xlrec->rightblk, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - if (xlrec->leftblk != P_NONE) - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - xlrec->leftblk, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - if (xlrec->deadblk != P_NONE) + { + xl_btree_delete_page *xlrec = (xl_btree_delete_page *) data; + 
BlockIdData blkid = xlrec->target.tid.ip_blkid; + BlockNumber block = BlockIdGetBlockNumber(&blkid); + + if (block != P_NONE) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + block, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + if (xlrec->rightblk != P_NONE) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + xlrec->rightblk, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + if (xlrec->leftblk != P_NONE) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + xlrec->leftblk, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + if (xlrec->deadblk != P_NONE) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + xlrec->deadblk, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + + if (info == XLOG_BTREE_DELETE_PAGE_META) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->target.node), + BTREE_METAPAGE, + &xlrec->target.persistentTid, + xlrec->target.persistentSerialNum); + break; + } + case XLOG_BTREE_NEWROOT: + { + xl_btree_newroot *xlrec = (xl_btree_newroot *) data; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - xlrec->deadblk, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - - if (info == XLOG_BTREE_DELETE_PAGE_META) + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->btreenode.node), + xlrec->rootblk, + &xlrec->btreenode.persistentTid, + xlrec->btreenode.persistentSerialNum); + + /* newroot always updates the meta page */ ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->target.node), - BTREE_METAPAGE, - &xlrec->target.persistentTid, - xlrec->target.persistentSerialNum); - break; - } - case XLOG_BTREE_NEWROOT: - { - xl_btree_newroot *xlrec = (xl_btree_newroot *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->btreenode.node), - xlrec->rootblk, - &xlrec->btreenode.persistentTid, - xlrec->btreenode.persistentSerialNum); - - /* newroot always updates the meta page */ - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->btreenode.node), - BTREE_METAPAGE, - &xlrec->btreenode.persistentTid, - xlrec->btreenode.persistentSerialNum); - - break; - } + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->btreenode.node), + BTREE_METAPAGE, + &xlrec->btreenode.persistentTid, + xlrec->btreenode.persistentSerialNum); + + break; + } default: elog(ERROR, "internal error: unsupported RM_BTREE_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", info); @@ -954,153 +981,153 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( switch (info) { case XLOG_BITMAP_INSERT_NEWLOV: - { - xl_bm_newpage *xlrec = (xl_bm_newpage *) data; - - 
ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_new_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } + { + xl_bm_newpage *xlrec = (xl_bm_newpage *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_new_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } case XLOG_BITMAP_INSERT_LOVITEM: - { - xl_bm_lovitem *xlrec = (xl_bm_lovitem *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_lov_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - if (xlrec->bm_is_new_lov_blkno) + { + xl_bm_lovitem *xlrec = (xl_bm_lovitem *) data; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - BM_METAPAGE, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_lov_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + if (xlrec->bm_is_new_lov_blkno) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + BM_METAPAGE, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } case XLOG_BITMAP_INSERT_META: - { - xl_bm_metapage *xlrec = (xl_bm_metapage *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - BM_METAPAGE, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } + { + xl_bm_metapage *xlrec = (xl_bm_metapage *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + BM_METAPAGE, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } case XLOG_BITMAP_INSERT_BITMAP_LASTWORDS: - { - xl_bm_bitmap_lastwords *xlrec = (xl_bm_bitmap_lastwords *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_lov_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } + { + xl_bm_bitmap_lastwords *xlrec = (xl_bm_bitmap_lastwords *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_lov_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } case XLOG_BITMAP_INSERT_WORDS: - { - xl_bm_bitmapwords *xlrec = (xl_bm_bitmapwords *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_lov_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - if 
(!xlrec->bm_is_last) + { + xl_bm_bitmapwords *xlrec = (xl_bm_bitmapwords *) data; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_next_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_lov_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + if (!xlrec->bm_is_last) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_next_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } case XLOG_BITMAP_UPDATEWORD: - { - xl_bm_updateword *xlrec = (xl_bm_updateword *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - break; - } - case XLOG_BITMAP_UPDATEWORDS: - { - xl_bm_updatewords *xlrec = (xl_bm_updatewords *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_first_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - if (xlrec->bm_two_pages) + { + xl_bm_updateword *xlrec = (xl_bm_updateword *) data; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_second_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - if (xlrec->bm_new_lastpage) + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + break; + } + case XLOG_BITMAP_UPDATEWORDS: + { + xl_bm_updatewords *xlrec = (xl_bm_updatewords *) data; + ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->bm_node), - xlrec->bm_lov_blkno, - &xlrec->bm_persistentTid, - xlrec->bm_persistentSerialNum); - - break; - } + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_first_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + if (xlrec->bm_two_pages) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_second_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + if (xlrec->bm_new_lastpage) + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->bm_node), + xlrec->bm_lov_blkno, + &xlrec->bm_persistentTid, + xlrec->bm_persistentSerialNum); + + break; + } default: elog(ERROR, "internal error: unsupported RM_BITMAP_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", info); } @@ -1109,124 +1136,131 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( switch 
(info) { case XLOG_SEQ_LOG: - { - xl_seq_rec *xlrec = (xl_seq_rec *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xlrec->node), - 0, /* seq_redo touches block 0 only */ - &xlrec->persistentTid, - xlrec->persistentSerialNum); + { + xl_seq_rec *xlrec = (xl_seq_rec *) data; - break; - } + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xlrec->node), + 0, /* seq_redo touches + * block 0 only */ + &xlrec->persistentTid, + xlrec->persistentSerialNum); + + break; + } default: elog(ERROR, "internal error: unsupported RM_SEQ_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", info); } break; - + case RM_GIST_ID: switch (info) { case XLOG_GIST_PAGE_UPDATE: case XLOG_GIST_NEW_ROOT: - { - gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xldata->node), - xldata->blkno, - &xldata->persistentTid, - xldata->persistentSerialNum); - break; - } + { + gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xldata->node), + xldata->blkno, + &xldata->persistentTid, + xldata->persistentSerialNum); + break; + } case XLOG_GIST_PAGE_DELETE: - { - gistxlogPageDelete *xldata = (gistxlogPageDelete *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xldata->node), - xldata->blkno, - &xldata->persistentTid, - xldata->persistentSerialNum); - break; - } + { + gistxlogPageDelete *xldata = (gistxlogPageDelete *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xldata->node), + xldata->blkno, + &xldata->persistentTid, + xldata->persistentSerialNum); + break; + } case XLOG_GIST_PAGE_SPLIT: - { - gistxlogPageSplit* xldata = (gistxlogPageSplit *) data; - char* ptr; - int j, - i = 0; - - /* first, log the splitted page */ - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xldata->node), - xldata->origblkno, - &xldata->persistentTid, - xldata->persistentSerialNum); - - /* now log all the pages that we split into */ - ptr = (char *)data + sizeof(gistxlogPageSplit); - - for (i = 0; i < xldata->npage; i++) { - gistxlogPage* gistp; - - gistp = (gistxlogPage *) ptr; - ptr += sizeof(gistxlogPage); + gistxlogPageSplit *xldata = (gistxlogPageSplit *) data; + char *ptr; + int j, + i = 0; - //elog(LOG, "CHANGETRACKING GIST SPLIT: block [%d/%d]:%d", i+1,xldata->npage, gistp->blkno); + /* first, log the splitted page */ ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xldata->node), - gistp->blkno, - &xldata->persistentTid, - xldata->persistentSerialNum); - - /* skip over all index tuples. 
we only care about block numbers */ - j = 0; - while (j < gistp->num) + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xldata->node), + xldata->origblkno, + &xldata->persistentTid, + xldata->persistentSerialNum); + + /* now log all the pages that we split into */ + ptr = (char *) data + sizeof(gistxlogPageSplit); + + for (i = 0; i < xldata->npage; i++) { - ptr += IndexTupleSize((IndexTuple) ptr); - j++; + gistxlogPage *gistp; + + gistp = (gistxlogPage *) ptr; + ptr += sizeof(gistxlogPage); + + /* + * elog(LOG, "CHANGETRACKING GIST SPLIT: block + * [%d/%d]:%d", i+1,xldata->npage, gistp->blkno); + */ + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xldata->node), + gistp->blkno, + &xldata->persistentTid, + xldata->persistentSerialNum); + + /* + * skip over all index tuples. we only care about + * block numbers + */ + j = 0; + while (j < gistp->num) + { + ptr += IndexTupleSize((IndexTuple) ptr); + j++; + } } + + break; } - - break; - } case XLOG_GIST_CREATE_INDEX: - { - gistxlogCreateIndex* xldata = (gistxlogCreateIndex *) data; - - ChangeTracking_AddRelationChangeInfo( - relationChangeInfoArray, - relationChangeInfoArrayCount, - relationChangeInfoMaxSize, - &(xldata->node), - GIST_ROOT_BLKNO, - &xldata->persistentTid, - xldata->persistentSerialNum); - break; - } + { + gistxlogCreateIndex *xldata = (gistxlogCreateIndex *) data; + + ChangeTracking_AddRelationChangeInfo( + relationChangeInfoArray, + relationChangeInfoArrayCount, + relationChangeInfoMaxSize, + &(xldata->node), + GIST_ROOT_BLKNO, + &xldata->persistentTid, + xldata->persistentSerialNum); + break; + } case XLOG_GIST_INSERT_COMPLETE: - { - /* nothing to be done here */ - break; - } + { + /* nothing to be done here */ + break; + } default: elog(ERROR, "internal error: unsupported RM_GIST_ID op (%u) in ChangeTracking_GetRelationChangeInfoFromXlog", info); } @@ -1239,72 +1273,76 @@ void ChangeTracking_GetRelationChangeInfoFromXlog( } -void ChangeTracking_FsyncDataIntoLog(CTFType ftype) +void +ChangeTracking_FsyncDataIntoLog(CTFType ftype) { - File file; + File file; ChangeTrackingBufStatusData *bufstat; - + Assert(ftype != CTF_META); Assert(ftype != CTF_LOG_TRANSIENT); - - if(ftype == CTF_LOG_FULL) + + if (ftype == CTF_LOG_FULL) bufstat = CTMainWriteBufStatus; else bufstat = CTCompactWriteBufStatus; LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - + file = ChangeTracking_OpenFile(ftype); - + /* write the existing (non-full) shmem buffer */ - if(bufstat->recordcount > 0) + if (bufstat->recordcount > 0) { - if(ChangeTracking_WriteBuffer(file, ftype) < 0) + if (ChangeTracking_WriteBuffer(file, ftype) < 0) ChangeTracking_HandleWriteError(ftype); } - + #ifdef FAULT_INJECTOR if (FaultInjector_InjectFaultIfSet( - ChangeTrackingDisable, - DDLNotSpecified, - "" /* databaseName */, - "" /* tableName */) == FaultInjectorTypeSkip) - ChangeTracking_HandleWriteError(ftype); // disable Change Tracking -#endif - + ChangeTrackingDisable, + DDLNotSpecified, + "" /* databaseName */ , + "" /* tableName */ ) == FaultInjectorTypeSkip) + ChangeTracking_HandleWriteError(ftype); + /* disable Change Tracking */ +#endif + errno = 0; /* time to fsync change log to disk */ - if(FileSync(file) < 0) + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); ChangeTracking_CloseFile(file); LWLockRelease(ChangeTrackingWriteLock); } + /* * CheckPointChangeTracking - * + * * When the system wide checkpoint is performed, this 
function is called. * We write and fsync the current page we have (partially full) into disk. * Also, we save this partially full buffer in shared memory, so that after * this checkpoint we will continue filling it up, and when it is full use * it to overwrite this partial page with a full one. - * + * * In addition, we check if this full log file is ready to be compacted. * if it is, we create a transient log file out of it for the filerep * process that will do the compacting. */ -void CheckPointChangeTracking(void) +void +CheckPointChangeTracking(void) { - + /* do nothing if not in proper operating mode */ - if(!ChangeTracking_ShouldTrackChanges()) + if (!ChangeTracking_ShouldTrackChanges()) return; - + /* force data into disk */ ChangeTracking_FsyncDataIntoLog(CTF_LOG_FULL); - + ChangeTracking_CreateTransientLogIfNeeded(); } @@ -1312,65 +1350,68 @@ void CheckPointChangeTracking(void) * API for getting incremental change entries for gp_persistent relation ************************************************************************/ -IncrementalChangeList* ChangeTracking_InitIncrementalChangeList(int count) +IncrementalChangeList * +ChangeTracking_InitIncrementalChangeList(int count) { - IncrementalChangeList* result; - - result = (IncrementalChangeList*) palloc0(sizeof(IncrementalChangeList)); + IncrementalChangeList *result; + + result = (IncrementalChangeList *) palloc0(sizeof(IncrementalChangeList)); result->count = count; - result->entries = (IncrementalChangeEntry*) palloc(sizeof(IncrementalChangeEntry) * count); - + result->entries = (IncrementalChangeEntry *) palloc(sizeof(IncrementalChangeEntry) * count); + return result; } /* - * Get an ordered list of [persistent_tid,persistent_serialnum, numblocks] - * from the change tracking log, with unique tid values, each paired with + * Get an ordered list of [persistent_tid,persistent_serialnum, numblocks] + * from the change tracking log, with unique tid values, each paired with * the newest serial num found for it, and number of blocks changed for that * serial number. 
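+ *
+ * A sketch of how a caller can walk the returned list (illustrative only):
+ *
+ *		IncrementalChangeList *iclist = ChangeTracking_GetIncrementalChangeList();
+ *
+ *		if (iclist != NULL)
+ *		{
+ *			int		i;
+ *
+ *			for (i = 0; i < iclist->count; i++)
+ *			{
+ *				IncrementalChangeEntry *entry = &iclist->entries[i];
+ *
+ *				... use entry->persistentTid, entry->persistentSerialNum
+ *					and entry->numblocks ...
+ *			}
+ *			ChangeTracking_FreeIncrementalChangeList(iclist);
+ *		}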
*/ IncrementalChangeList * ChangeTracking_GetIncrementalChangeList(void) { - IncrementalChangeList* result = NULL; - StringInfoData sqlstmt; - int ret; - int proc; - volatile bool connected = false; - ResourceOwner save = CurrentResourceOwner; - MemoryContext oldcontext = CurrentMemoryContext; - CTFType ftype = CTF_LOG_COMPACT; /* always read from the compact file only */ - - Assert(dataState == DataStateInResync); + IncrementalChangeList *result = NULL; + StringInfoData sqlstmt; + int ret; + int proc; + volatile bool connected = false; + ResourceOwner save = CurrentResourceOwner; + MemoryContext oldcontext = CurrentMemoryContext; + CTFType ftype = CTF_LOG_COMPACT; /* always read from the compact + * file only */ + + Assert(dataState == DataStateInResync); Assert(gp_change_tracking); - + /* assemble our query string */ initStringInfo(&sqlstmt); - + /* TODO: there must be a more efficient query to use here */ - appendStringInfo(&sqlstmt, "SELECT t1.persistent_tid, t1.persistent_sn, t1.numblocks " - "FROM (SELECT persistent_tid, persistent_sn, count(distinct blocknum) as numblocks " - " FROM gp_changetracking_log(%d) " - " GROUP BY persistent_tid, persistent_sn) as t1, " - " (SELECT persistent_tid, max(persistent_sn) as persistent_sn " - " FROM gp_changetracking_log(%d) " - " GROUP BY persistent_tid) as t2 " - "WHERE t1.persistent_tid = t2.persistent_tid " - "AND t1.persistent_sn = t2.persistent_sn", - ftype, ftype); - - /* - * NOTE: here's a cleaner version of the same query. compare which runs more efficiently. - * minimal testing shows it's the one above, but by small margin + appendStringInfo(&sqlstmt, "SELECT t1.persistent_tid, t1.persistent_sn, t1.numblocks " + "FROM (SELECT persistent_tid, persistent_sn, count(distinct blocknum) as numblocks " + " FROM gp_changetracking_log(%d) " + " GROUP BY persistent_tid, persistent_sn) as t1, " + " (SELECT persistent_tid, max(persistent_sn) as persistent_sn " + " FROM gp_changetracking_log(%d) " + " GROUP BY persistent_tid) as t2 " + "WHERE t1.persistent_tid = t2.persistent_tid " + "AND t1.persistent_sn = t2.persistent_sn", + ftype, ftype); + + /* + * NOTE: here's a cleaner version of the same query. compare which runs + * more efficiently. 
minimal testing shows it's the one above, but by + * small margin */ -// appendStringInfo(&sqlstmt, "SELECT persistent_tid, persistent_sn, count(distinct blocknum) " -// "FROM gp_changetracking_log(%d) " -// "GROUP BY persistent_tid, persistent_sn " -// "HAVING (persistent_tid, persistent_sn) " -// "IN (SELECT persistent_tid, max(persistent_sn) " -// "FROM gp_changetracking_log(%d) " -// "GROUP BY persistent_tid", ftype, ftype); - +/* appendStringInfo(&sqlstmt, "SELECT persistent_tid, persistent_sn, count(distinct blocknum) " */ +/* "FROM gp_changetracking_log(%d) " */ +/* "GROUP BY persistent_tid, persistent_sn " */ +/* "HAVING (persistent_tid, persistent_sn) " */ +/* "IN (SELECT persistent_tid, max(persistent_sn) " */ +/* "FROM gp_changetracking_log(%d) " */ +/* "GROUP BY persistent_tid", ftype, ftype); */ + PG_TRY(); { @@ -1394,52 +1435,55 @@ ChangeTracking_GetIncrementalChangeList(void) if (ret > 0 && SPI_tuptable != NULL) { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable* tuptable = SPI_tuptable; - MemoryContext cxt_save; - int i; + TupleDesc tupdesc = SPI_tuptable->tupdesc; + SPITupleTable *tuptable = SPI_tuptable; + MemoryContext cxt_save; + int i; /* * Iterate through each result tuple */ for (i = 0; i < proc; i++) { - HeapTuple tuple = tuptable->vals[i]; - - IncrementalChangeEntry* entry; - ItemPointer persistentTid; - int64 persistentSerialNum; - int64 numblocks; - char* str_tid; - char* str_sn; - char* str_numb; - + HeapTuple tuple = tuptable->vals[i]; + + IncrementalChangeEntry *entry; + ItemPointer persistentTid; + int64 persistentSerialNum; + int64 numblocks; + char *str_tid; + char *str_sn; + char *str_numb; + /* get result columns from SPI (as strings) */ str_tid = SPI_getvalue(tuple, tupdesc, 1); str_sn = SPI_getvalue(tuple, tupdesc, 2); str_numb = SPI_getvalue(tuple, tupdesc, 3); - - //elog(LOG,"tuple %d: tid %s sn %s numb %s", i, str_tid, str_sn, str_numb); + + /* + * elog(LOG,"tuple %d: tid %s sn %s numb %s", i, str_tid, + * str_sn, str_numb); + */ /* use our own context so that SPI won't free our stuff later */ cxt_save = MemoryContextSwitchTo(oldcontext); /* init the result memory on first pass */ - if(i == 0) + if (i == 0) result = ChangeTracking_InitIncrementalChangeList(proc); - + /* convert to desired data type */ persistentTid = (ItemPointer) DatumGetPointer( - DirectFunctionCall1(tidin, CStringGetDatum(str_tid))); + DirectFunctionCall1(tidin, CStringGetDatum(str_tid))); persistentSerialNum = DatumGetInt64(DirectFunctionCall1(int8in, CStringGetDatum(str_sn))); numblocks = DatumGetInt64(DirectFunctionCall1(int8in, CStringGetDatum(str_numb))); - + /* populate this entry */ entry = &(result->entries[i]); entry->persistentTid = *persistentTid; entry->persistentSerialNum = persistentSerialNum; entry->numblocks = numblocks; - + MemoryContextSwitchTo(cxt_save); } } @@ -1451,10 +1495,10 @@ ChangeTracking_GetIncrementalChangeList(void) connected = false; SPI_finish(); - + CommitTransactionCommand(); } - + /* Clean up in case of error. 
*/ PG_CATCH(); { @@ -1476,12 +1520,13 @@ ChangeTracking_GetIncrementalChangeList(void) return result; } -void ChangeTracking_FreeIncrementalChangeList(IncrementalChangeList* iclist) +void +ChangeTracking_FreeIncrementalChangeList(IncrementalChangeList *iclist) { Assert(iclist); - + pfree(iclist->entries); - pfree(iclist); + pfree(iclist); } /************************************* @@ -1492,15 +1537,16 @@ void ChangeTracking_FreeIncrementalChangeList(IncrementalChangeList* iclist) * Allocate memory for a request that includes information about * the objects of interest. */ -ChangeTrackingRequest* ChangeTracking_FormRequest(int max_count) +ChangeTrackingRequest * +ChangeTracking_FormRequest(int max_count) { - ChangeTrackingRequest* request; - - request = (ChangeTrackingRequest*) palloc0(sizeof(ChangeTrackingRequest)); + ChangeTrackingRequest *request; + + request = (ChangeTrackingRequest *) palloc0(sizeof(ChangeTrackingRequest)); request->count = 0; request->max_count = max_count; - request->entries = (ChangeTrackingRequestEntry*) palloc(sizeof(ChangeTrackingRequestEntry) * max_count); - + request->entries = (ChangeTrackingRequestEntry *) palloc(sizeof(ChangeTrackingRequestEntry) * max_count); + return request; } @@ -1510,14 +1556,15 @@ ChangeTrackingRequest* ChangeTracking_FormRequest(int max_count) * The block number last_fetched indicates the last block number of this * relation fetched and sent to mirror by a resync worker. */ -void ChangeTracking_AddRequestEntry(ChangeTrackingRequest *request, - RelFileNode relFileNode) +void +ChangeTracking_AddRequestEntry(ChangeTrackingRequest *request, + RelFileNode relFileNode) { - ChangeTrackingRequestEntry* entry; - - if(request->count + 1 > request->max_count) + ChangeTrackingRequestEntry *entry; + + if (request->count + 1 > request->max_count) elog(ERROR, "ChangeTracking: trying to add more request entries than originally requested"); - + entry = &(request->entries[request->count]); entry->relFileNode.relNode = relFileNode.relNode; entry->relFileNode.spcNode = relFileNode.spcNode; @@ -1529,7 +1576,8 @@ void ChangeTracking_AddRequestEntry(ChangeTrackingRequest *request, /* * Deallocate memory associated with the request. */ -void ChangeTracking_FreeRequest(ChangeTrackingRequest *request) +void +ChangeTracking_FreeRequest(ChangeTrackingRequest *request) { pfree(request->entries); pfree(request); @@ -1540,50 +1588,52 @@ void ChangeTracking_FreeRequest(ChangeTrackingRequest *request) * by the resync changetracking after the synchronizer requested information, * the result of the request will be stored here and passed back to the caller. 
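+ *
+ * The request/result round trip, sketched (illustrative; error handling
+ * omitted):
+ *
+ *		ChangeTrackingRequest *request = ChangeTracking_FormRequest(n);
+ *		ChangeTrackingResult *result;
+ *
+ *		ChangeTracking_AddRequestEntry(request, relFileNode);
+ *		result = ChangeTracking_GetChanges(request);
+ *
+ *		... consume result->entries[0 .. result->count - 1], re-requesting
+ *			with an updated last_fetched while result->ask_for_more ...
+ *
+ *		ChangeTracking_FreeRequest(request);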
*/ -static ChangeTrackingResult* ChangeTracking_FormResult(int max_count) +static ChangeTrackingResult * +ChangeTracking_FormResult(int max_count) { - ChangeTrackingResult* result; - - result = (ChangeTrackingResult*) palloc0(sizeof(ChangeTrackingResult)); + ChangeTrackingResult *result; + + result = (ChangeTrackingResult *) palloc0(sizeof(ChangeTrackingResult)); result->count = 0; result->max_count = max_count; result->ask_for_more = false; - result->entries = (ChangeTrackingResultEntry*) palloc(sizeof(ChangeTrackingResultEntry) * max_count); - + result->entries = (ChangeTrackingResultEntry *) palloc(sizeof(ChangeTrackingResultEntry) * max_count); + return result; } -static void ChangeTracking_AddResultEntry(ChangeTrackingResult *result, - Oid space, - Oid db, - Oid rel, - BlockNumber blocknum, - XLogRecPtr* lsn_end) +static void +ChangeTracking_AddResultEntry(ChangeTrackingResult *result, + Oid space, + Oid db, + Oid rel, + BlockNumber blocknum, + XLogRecPtr *lsn_end) { - ChangeTrackingResultEntry* entry; - - if(result->count + 1 > result->max_count) + ChangeTrackingResultEntry *entry; + + if (result->count + 1 > result->max_count) elog(ERROR, "ChangeTracking: trying to add more result entries than originally requested"); - + entry = &(result->entries[result->count]); entry->relFileNode.spcNode = space; entry->relFileNode.dbNode = db; entry->relFileNode.relNode = rel; entry->block_num = blocknum; entry->lsn_end = *lsn_end; - + result->count++; } /* * We are in resync mode and the synchronizer module is asking - * us for the information we have gathered. - * - * The synchronizer passes in a list of relfilenodes, each with - * a start and end LSN. For each of those relations that are + * us for the information we have gathered. + * + * The synchronizer passes in a list of relfilenodes, each with + * a start and end LSN. For each of those relations that are * found in the change tracking log file this routine will return * the list of block numbers and the end LSN of each. - * + * * We restrict the total number of changes that this routine returns to * gp_filerep_ct_batch_size, in order to not overflow memory. If a specific * relation is expected to have more than this number of changes, this routine @@ -1593,41 +1643,43 @@ static void ChangeTracking_AddResultEntry(ChangeTrackingResult *result, * happens the caller should set last_fetched in the relation's request to the * highest block number seen so far. 
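The contract spelled out above -- at most gp_filerep_ct_batch_size changes per call, with ask_for_more signalling that the same single-relation request should be resubmitted -- implies a caller loop along these lines. This is a sketch, not code from the patch; the helper name and the consume step are placeholders, and last_fetched is assumed to be set directly on the request entry, as the comment on ChangeTracking_AddRequestEntry describes:

    /* Drain all tracked changes for one relation, batch by batch. */
    static void
    drain_changes(RelFileNode rnode)
    {
        BlockNumber last_fetched = 0;
        bool        more = true;

        while (more)
        {
            ChangeTrackingRequest *req = ChangeTracking_FormRequest(1);
            ChangeTrackingResult *res;

            ChangeTracking_AddRequestEntry(req, rnode);
            req->entries[0].last_fetched = last_fetched;

            res = ChangeTracking_GetChanges(req);
            more = (res != NULL && res->ask_for_more);

            if (res != NULL && res->count > 0)
            {
                /* ... ship res->entries[] to the mirror ... */

                /* entries arrive ordered by block number */
                last_fetched = res->entries[res->count - 1].block_num;
            }

            ChangeTracking_FreeResult(res);
            ChangeTracking_FreeRequest(req);
        }
    }

Note that ask_for_more is only ever set for a single-relation request, which is why the loop rebuilds a one-entry request each round.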
*/ -ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) +ChangeTrackingResult * +ChangeTracking_GetChanges(ChangeTrackingRequest *request) { - ChangeTrackingResult* result = NULL; - StringInfoData sqlstmt; - int ret; - int proc; - int i; - volatile bool connected = false; /* needs to survive PG_TRY()/CATCH() */ - ResourceOwner save = CurrentResourceOwner; - MemoryContext oldcontext = CurrentMemoryContext; - CTFType ftype = CTF_LOG_COMPACT; /* always read from the compact log only */ - - Assert(dataState == DataStateInResync); + ChangeTrackingResult *result = NULL; + StringInfoData sqlstmt; + int ret; + int proc; + int i; + volatile bool connected = false; /* needs to survive PG_TRY()/CATCH() */ + ResourceOwner save = CurrentResourceOwner; + MemoryContext oldcontext = CurrentMemoryContext; + CTFType ftype = CTF_LOG_COMPACT; /* always read from the compact + * log only */ + + Assert(dataState == DataStateInResync); Assert(gp_change_tracking); /* assemble our query string */ initStringInfo(&sqlstmt); appendStringInfo(&sqlstmt, "SELECT space, db, rel, blocknum, max(xlogloc) " - "FROM gp_changetracking_log(%d) " - "WHERE ", ftype); + "FROM gp_changetracking_log(%d) " + "WHERE ", ftype); - for(i = 0 ; i < request->count ; i++) + for (i = 0; i < request->count; i++) { - Oid space = request->entries[i].relFileNode.spcNode; - Oid db = request->entries[i].relFileNode.dbNode; - Oid rel = request->entries[i].relFileNode.relNode; + Oid space = request->entries[i].relFileNode.spcNode; + Oid db = request->entries[i].relFileNode.dbNode; + Oid rel = request->entries[i].relFileNode.relNode; BlockNumber last_fetched = request->entries[i].last_fetched; - if(i != 0) + if (i != 0) appendStringInfo(&sqlstmt, "OR "); - + appendStringInfo(&sqlstmt, "(space = %u AND " - "db = %u AND " - "rel = %u", - space, db, rel); + "db = %u AND " + "rel = %u", + space, db, rel); if (last_fetched > 0) appendStringInfo(&sqlstmt, " AND blocknum > %u) ", last_fetched); else @@ -1635,21 +1687,25 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) } appendStringInfo(&sqlstmt, "GROUP BY space, db, rel, blocknum " - "ORDER BY space, db, rel, blocknum "); - + "ORDER BY space, db, rel, blocknum "); + /* - * We limit the result to our max value so that we don't use too much memory both - * in the query result and in the actual result returned to the caller. - * - * The +1 is there in order to "peek" if there's more data to be returned. Therefore - * if MAX+1 records were returned from the query we return MAX records to the caller - * and indicate that there are more records to return in the next call with the same - * request (we don't return the last record found. we'll return it next time). + * We limit the result to our max value so that we don't use too much + * memory both in the query result and in the actual result returned to + * the caller. + * + * The +1 is there in order to "peek" if there's more data to be returned. + * Therefore if MAX+1 records were returned from the query we return MAX + * records to the caller and indicate that there are more records to + * return in the next call with the same request (we don't return the last + * record found. we'll return it next time). 
*/ appendStringInfo(&sqlstmt, "LIMIT %d", gp_filerep_ct_batch_size + 1); - bool old_enable_groupagg = enable_groupagg; - enable_groupagg = false; /* disable sort group agg -- our query works better with hash agg */ - + bool old_enable_groupagg = enable_groupagg; + + enable_groupagg = false; /* disable sort group agg -- our query works + * better with hash agg */ + PG_TRY(); { /* must be in a transaction in order to use SPI */ @@ -1672,37 +1728,37 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) if (ret > 0 && SPI_tuptable != NULL) { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable* tuptable = SPI_tuptable; - MemoryContext cxt_save; - int i; - - /* - * if got gp_filerep_ct_batch_size changes or less, it means we - * satisfied all the requests. If not, it means there are still more - * results to return in the next calls. + TupleDesc tupdesc = SPI_tuptable->tupdesc; + SPITupleTable *tuptable = SPI_tuptable; + MemoryContext cxt_save; + int i; + + /* + * if got gp_filerep_ct_batch_size changes or less, it means we + * satisfied all the requests. If not, it means there are still + * more results to return in the next calls. */ - bool satisfied_request = (proc <= gp_filerep_ct_batch_size); + bool satisfied_request = (proc <= gp_filerep_ct_batch_size); /* * Iterate through each result tuple */ for (i = 0; i < proc; i++) { - HeapTuple tuple = tuptable->vals[i]; + HeapTuple tuple = tuptable->vals[i]; BlockNumber blocknum; - XLogRecPtr* endlsn; - Oid space; - Oid db; - Oid rel; - char* str_space; - char* str_db; - char* str_rel; - char* str_blocknum; - char* str_endlsn; - - + XLogRecPtr *endlsn; + Oid space; + Oid db; + Oid rel; + char *str_space; + char *str_db; + char *str_rel; + char *str_blocknum; + char *str_endlsn; + + /* get result columns from SPI (as strings) */ str_space = SPI_getvalue(tuple, tupdesc, 1); str_db = SPI_getvalue(tuple, tupdesc, 2); @@ -1710,13 +1766,16 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) str_blocknum = SPI_getvalue(tuple, tupdesc, 4); str_endlsn = SPI_getvalue(tuple, tupdesc, 5); - //elog(NOTICE,"tuple %d: %s %s %s block %s lsn %s", i, str_space, str_db, str_rel, str_blocknum, str_endlsn); + /* + * elog(NOTICE,"tuple %d: %s %s %s block %s lsn %s", i, + * str_space, str_db, str_rel, str_blocknum, str_endlsn); + */ /* use our own context so that SPI won't free our stuff later */ cxt_save = MemoryContextSwitchTo(oldcontext); /* init the result memory on first pass */ - if(i == 0) + if (i == 0) { if (satisfied_request) { @@ -1726,32 +1785,39 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) else { /* prepare memory for partial result */ - if(request->count != 1) + if (request->count != 1) elog(ERROR, "internal error in ChangeTracking_GetChanges(): caller " - "passed in an invalid request (expecting more than %d " - "result entries for more than a single relation)", - gp_filerep_ct_batch_size); - + "passed in an invalid request (expecting more than %d " + "result entries for more than a single relation)", + gp_filerep_ct_batch_size); + result = ChangeTracking_FormResult(gp_filerep_ct_batch_size); - - /* tell caller to call us again with the same relation (but different start lsn) */ + + /* + * tell caller to call us again with the same relation + * (but different start lsn) + */ result->ask_for_more = true; } } - - + + /* convert to desired data type */ space = DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(str_space))); db = 
DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(str_db))); rel = DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(str_rel))); blocknum = DatumGetUInt32(DirectFunctionCall1(int4in, CStringGetDatum(str_blocknum))); - endlsn = (XLogRecPtr*) DatumGetPointer(DirectFunctionCall1(gpxloglocin, CStringGetDatum(str_endlsn))); - /* TODO: in the above should use DatumGetXLogLoc instead, but it's not public */ - - /* + endlsn = (XLogRecPtr *) DatumGetPointer(DirectFunctionCall1(gpxloglocin, CStringGetDatum(str_endlsn))); + + /* + * TODO: in the above should use DatumGetXLogLoc instead, but + * it's not public + */ + + /* * skip the last "extra" entry if satisfied_request is false */ - if(i == gp_filerep_ct_batch_size) + if (i == gp_filerep_ct_batch_size) { Assert(!satisfied_request); Assert(result->ask_for_more); @@ -1760,13 +1826,13 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) } /* add our entry to the result */ - ChangeTracking_AddResultEntry(result, + ChangeTracking_AddResultEntry(result, space, db, rel, blocknum, endlsn); - + MemoryContextSwitchTo(cxt_save); } } @@ -1778,7 +1844,7 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) connected = false; SPI_finish(); - + CommitTransactionCommand(); enable_groupagg = old_enable_groupagg; } @@ -1790,7 +1856,7 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) SPI_finish(); AbortCurrentTransaction(); - + enable_groupagg = old_enable_groupagg; /* Carry on with error handling. */ @@ -1809,12 +1875,13 @@ ChangeTrackingResult* ChangeTracking_GetChanges(ChangeTrackingRequest *request) /* * Free all memory associated with the result, after using it. */ -void ChangeTracking_FreeResult(ChangeTrackingResult *result) +void +ChangeTracking_FreeResult(ChangeTrackingResult *result) { - if(result) + if (result) { pfree(result->entries); - pfree(result); + pfree(result); } } @@ -1822,7 +1889,8 @@ void ChangeTracking_FreeResult(ChangeTrackingResult *result) * File I/O routines *************************************/ -static void ChangeTracking_SetPathByType(CTFType ftype, char *path) +static void +ChangeTracking_SetPathByType(CTFType ftype, char *path) { if (ftype == CTF_LOG_FULL) ChangeTrackingFullLogFilePath(path); @@ -1837,12 +1905,13 @@ static void ChangeTracking_SetPathByType(CTFType ftype, char *path) /* * Open change tracking log file for write, and seek if needed */ -File ChangeTracking_OpenFile(CTFType ftype) +File +ChangeTracking_OpenFile(CTFType ftype) { - File file; - char path[MAXPGPATH]; - struct stat st; - + File file; + char path[MAXPGPATH]; + struct stat st; + if (stat(CHANGETRACKINGDIR, &st) < 0) { errno = 0; @@ -1858,46 +1927,47 @@ File ChangeTracking_OpenFile(CTFType ftype) } ChangeTracking_SetPathByType(ftype, path); - + switch (ftype) { case CTF_META: - + /* open it (and create if doesn't exist) */ - file = PathNameOpenFile(path, - O_RDWR | O_CREAT | PG_BINARY, - S_IRUSR | S_IWUSR); + file = PathNameOpenFile(path, + O_RDWR | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); if (file == -1) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); + errmsg("could not open file \"%s\": %m", path))); - /* + /* * seek to beginning of file. The meta file only has a single * block. we will overwrite it each time with new meta data. 
*/ - FileSeek(file, 0, SEEK_SET); + FileSeek(file, 0, SEEK_SET); break; case CTF_LOG_FULL: case CTF_LOG_COMPACT: case CTF_LOG_TRANSIENT: - - /* - * open it (create if doesn't exist). seek to eof for appending. - * (can't use O_APPEND because we may like to reposition later on). + + /* + * open it (create if doesn't exist). seek to eof for appending. + * (can't use O_APPEND because we may like to reposition later + * on). */ - file = PathNameOpenFile(path, - O_RDWR | O_CREAT | PG_BINARY, - S_IRUSR | S_IWUSR); - + file = PathNameOpenFile(path, + O_RDWR | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); + if (file == -1) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); + errmsg("could not open file \"%s\": %m", path))); - FileSeek(file, 0, SEEK_END); + FileSeek(file, 0, SEEK_END); break; default: @@ -1908,56 +1978,60 @@ File ChangeTracking_OpenFile(CTFType ftype) return file; } -void ChangeTracking_CloseFile(File file) +void +ChangeTracking_CloseFile(File file) { FileClose(file); } -bool ChangeTracking_DoesFileExist(CTFType ftype) +bool +ChangeTracking_DoesFileExist(CTFType ftype) { - File file; - char path[MAXPGPATH]; + File file; + char path[MAXPGPATH]; /* set up correct path */ ChangeTracking_SetPathByType(ftype, path); - + /* open it (don't create if doesn't exist) */ - file = PathNameOpenFile(path, - O_RDONLY | PG_BINARY, - S_IRUSR | S_IWUSR); - - if(file < 0) + file = PathNameOpenFile(path, + O_RDONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + + if (file < 0) return false; else ChangeTracking_CloseFile(file); - + return true; } /* * Rename one log file to another */ -static void ChangeTracking_RenameLogFile(CTFType source, CTFType dest) +static void +ChangeTracking_RenameLogFile(CTFType source, CTFType dest) { - char fn1[MAXPGPATH]; - char fn2[MAXPGPATH]; - + char fn1[MAXPGPATH]; + char fn2[MAXPGPATH]; + ChangeTracking_SetPathByType(source, fn1); ChangeTracking_SetPathByType(dest, fn2); - + /* Rename the FULL log file to TRANSIENT log */ if (rename(fn1, fn2)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", - fn1, fn2))); + fn1, fn2))); Assert(ChangeTracking_DoesFileExist(source) == false); } -static void ChangeTracking_DropLogFile(CTFType ftype) +static void +ChangeTracking_DropLogFile(CTFType ftype) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; Assert(ftype != CTF_META); @@ -1967,21 +2041,23 @@ static void ChangeTracking_DropLogFile(CTFType ftype) elog(LOG, "could not unlink file \"%s\": %m", path); } -static void ChangeTracking_DropLogFiles(void) +static void +ChangeTracking_DropLogFiles(void) { ChangeTracking_DropLogFile(CTF_LOG_FULL); ChangeTracking_DropLogFile(CTF_LOG_TRANSIENT); ChangeTracking_DropLogFile(CTF_LOG_COMPACT); - + ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); ChangeTracking_ResetBufStatus(CTCompactWriteBufStatus); ChangeTracking_ResetCompactingStatus(changeTrackingCompState); } -static void ChangeTracking_DropMetaFile(void) +static void +ChangeTracking_DropMetaFile(void) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; changeTrackingResyncMeta->resync_mode_full = false; setFullResync(changeTrackingResyncMeta->resync_mode_full); @@ -2000,17 +2076,19 @@ static void ChangeTracking_DropMetaFile(void) /* * Drop all change tracking files (log and meta). 
*/ -void ChangeTracking_DropAll(void) +void +ChangeTracking_DropAll(void) { LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - + ChangeTracking_DropLogFiles(); ChangeTracking_DropMetaFile(); - + LWLockRelease(ChangeTrackingWriteLock); } -static void ChangeTracking_ResetBufStatus(ChangeTrackingBufStatusData* bufstat) +static void +ChangeTracking_ResetBufStatus(ChangeTrackingBufStatusData *bufstat) { bufstat->bufsize = 0; bufstat->recordcount = 0; @@ -2018,7 +2096,8 @@ static void ChangeTracking_ResetBufStatus(ChangeTrackingBufStatusData* bufstat) bufstat->fileseg = 0; } -static void ChangeTracking_ResetCompactingStatus(ChangeTrackingLogCompactingStateData* compstat) +static void +ChangeTracking_ResetCompactingStatus(ChangeTrackingLogCompactingStateData *compstat) { compstat->ctcompact_bufs_added = 0; compstat->ctfull_bufs_added = 0; @@ -2028,27 +2107,28 @@ static void ChangeTracking_ResetCompactingStatus(ChangeTrackingLogCompactingStat /* * Write our shared memory buffer to the log file. This will be - * called if we have a buffer full of change log records or + * called if we have a buffer full of change log records or * during a checkpoint (a checkpoint will flush it too). - * + * * NOTE: caller must hold a LW lock before calling this function. * NOTE: caller must also verify that the buffer isn't empty. */ -static int ChangeTracking_WriteBuffer(File file, CTFType ftype) +static int +ChangeTracking_WriteBuffer(File file, CTFType ftype) { ChangeTrackingBufStatusData *bufstat; ChangeTrackingPageHeader *headerptr; - char* buf; - int freespace; - int wrote = 0; - int64 restartpos; - int64 actualpos = 0; - pg_crc32 crc; - + char *buf; + int freespace; + int wrote = 0; + int64 restartpos; + int64 actualpos = 0; + pg_crc32 crc; + Assert(ftype != CTF_META); Assert(ftype != CTF_LOG_TRANSIENT); - - if(ftype == CTF_LOG_FULL) + + if (ftype == CTF_LOG_FULL) { /* this is for a regular write from xlog */ bufstat = CTMainWriteBufStatus; @@ -2063,11 +2143,11 @@ static int ChangeTracking_WriteBuffer(File file, CTFType ftype) freespace = bufstat->maxbufsize - bufstat->bufsize; - if(bufstat->recordcount == 0) + if (bufstat->recordcount == 0) elog(ERROR, "ChangeTracking_WriteBuffer called with empty buffer"); /* Set the checksum to 0 to include header also in the crc calculation */ - headerptr = (ChangeTrackingPageHeader*)buf; + headerptr = (ChangeTrackingPageHeader *) buf; headerptr->blockversion = CHANGETRACKING_STORAGE_VERSION; headerptr->numrecords = bufstat->recordcount; headerptr->checksum = 0; @@ -2075,26 +2155,26 @@ static int ChangeTracking_WriteBuffer(File file, CTFType ftype) /* pad end of page with zeros */ MemSet(buf + bufstat->bufsize, 0, freespace); - /* Calculate checksum for the whole buffer, including the header.*/ + /* Calculate checksum for the whole buffer, including the header. */ Assert(bufstat->maxbufsize == CHANGETRACKING_BLCKSZ); INIT_CRC32C(crc); COMP_CRC32C(crc, buf, bufstat->maxbufsize); FIN_CRC32C(crc); - + headerptr->checksum = crc; - + /* - * 1) set the file write position with FileSeek. - * 2) save the position in 'restartpos' for the next round (if the page - * we'll write soon isn't full). see nextwritepos at the bottom of - * this function for more information. + * 1) set the file write position with FileSeek. 2) save the position in + * 'restartpos' for the next round (if the page we'll write soon isn't + * full). see nextwritepos at the bottom of this function for more + * information. 
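The checksum scheme a few lines up is easy to miss in the re-indented hunk: the header's checksum field is zeroed, the unused tail of the page is zero-padded, and CRC-32C is computed over the whole fixed-size block, header included. A condensed sketch of that scheme (the helper name is illustrative; the macros and types are the ones used above):

    static pg_crc32
    checksum_ct_block(char *buf, int used)
    {
        ChangeTrackingPageHeader *header = (ChangeTrackingPageHeader *) buf;
        pg_crc32    crc;

        header->checksum = 0;   /* the old CRC must not feed the new one */
        MemSet(buf + used, 0, CHANGETRACKING_BLCKSZ - used);    /* deterministic pad */

        INIT_CRC32C(crc);
        COMP_CRC32C(crc, buf, CHANGETRACKING_BLCKSZ);
        FIN_CRC32C(crc);

        return crc;
    }

Zeroing the checksum field before computing is what lets a reader verify a block by comparing a freshly computed CRC against the stored one, as the end-of-log scan further down does.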
*/ if (bufstat->nextwritepos != CHANGETRACKING_FILE_EOF) restartpos = FileSeek(file, bufstat->nextwritepos, SEEK_SET); else restartpos = FileSeek(file, 0, SEEK_END); - + /* write the complete buffer to file */ errno = 0; wrote = FileWrite(file, buf, bufstat->maxbufsize); @@ -2102,67 +2182,72 @@ static int ChangeTracking_WriteBuffer(File file, CTFType ftype) /* Verify if the number of bytes written is what we expected */ if (wrote != bufstat->maxbufsize) { - /* The callers of this module should handle the error return value. But still adding an extra - * elog message for debug purposes.*/ + /* + * The callers of this module should handle the error return value. + * But still adding an extra elog message for debug purposes. + */ elog(WARNING, "Unable to write %d bytes to change tracking %s. Actual bytes written = %d." - ,bufstat->maxbufsize - ,ChangeTracking_FtypeToString(ftype) - ,wrote); + ,bufstat->maxbufsize + ,ChangeTracking_FtypeToString(ftype) + ,wrote); return -1; } - + /* - * check if there's room for more records in this buffer. + * check if there's room for more records in this buffer. */ if (freespace < sizeof(ChangeTrackingRecord)) { /* block written and is full. reset it to zero */ ChangeTracking_ResetBufStatus(bufstat); - + /* update count of bufs added since last compact operation */ - if(ftype == CTF_LOG_FULL) + if (ftype == CTF_LOG_FULL) changeTrackingCompState->ctfull_bufs_added++; else changeTrackingCompState->ctcompact_bufs_added++; - /* This is an extra check to see if we can correctly seek to the end of file. - * If the seek to the end of the file is broken, there is a chance that in - * the next block writing cycle the system may start writing from a stale EOF - * causing corruption. */ + /* + * This is an extra check to see if we can correctly seek to the end + * of file. If the seek to the end of the file is broken, there is a + * chance that in the next block writing cycle the system may start + * writing from a stale EOF causing corruption. + */ actualpos = FileSeek(file, 0, SEEK_END); if (restartpos + bufstat->maxbufsize != actualpos) { - ereport(WARNING,(errmsg("FileSeek to the end looks broken." - " Expected EOF = " INT64_FORMAT - " but after FileSeek(SEEK_END) we get EOF = " INT64_FORMAT - ,(restartpos + bufstat->maxbufsize) - ,actualpos))); + ereport(WARNING, (errmsg("FileSeek to the end looks broken." + " Expected EOF = " INT64_FORMAT + " but after FileSeek(SEEK_END) we get EOF = " INT64_FORMAT + ,(restartpos + bufstat->maxbufsize) + ,actualpos))); } } else { - /* + /* * written block is partially full (probably due to a checkpoint). - * Don't reset the buffer in shared memory, as we'd like to fill - * it up and later overwrite the block we just wrote. Set the next - * write position to the beginning of this page we just wrote, to - * use in the next round. + * Don't reset the buffer in shared memory, as we'd like to fill it up + * and later overwrite the block we just wrote. Set the next write + * position to the beginning of this page we just wrote, to use in the + * next round. */ bufstat->nextwritepos = restartpos; } - + return 0; } /* * Write a meta record to the meta file. 
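A pattern worth noting before the function itself: every meta update in the resync routines later in this file follows the same write-then-fsync sequence, funneling failures into ChangeTracking_HandleWriteError(). Condensed into one hypothetical helper (the name is not from this patch):

    static void
    write_meta_and_sync(void)
    {
        File        file = ChangeTracking_OpenFile(CTF_META);

        if (ChangeTracking_WriteMeta(file) < 0)
            ChangeTracking_HandleWriteError(CTF_META);

        errno = 0;

        /* the LSN and flags must reach persistent media, hence the fsync */
        if (FileSync(file) < 0)
            ChangeTracking_HandleWriteError(CTF_META);

        ChangeTracking_CloseFile(file);
    }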
*/ -static int ChangeTracking_WriteMeta(File file) +static int +ChangeTracking_WriteMeta(File file) { - + ChangeTrackingMetaRecord rec; - int wrote = 0; + int wrote = 0; Assert(CHANGETRACKING_METABUFLEN >= sizeof(rec)); @@ -2171,8 +2256,8 @@ static int ChangeTracking_WriteMeta(File file) rec.resync_mode_full = changeTrackingResyncMeta->resync_mode_full; rec.resync_transition_completed = changeTrackingResyncMeta->resync_transition_completed; rec.insync_transition_completed = changeTrackingResyncMeta->insync_transition_completed; - - /* init buffer and copy rec into it */ + + /* init buffer and copy rec into it */ MemSet(metabuf, 0, CHANGETRACKING_METABUFLEN); memcpy(metabuf, &rec, sizeof(rec)); @@ -2183,22 +2268,22 @@ static int ChangeTracking_WriteMeta(File file) /* Verify if the number of bytes written are what we expected. */ if (wrote != CHANGETRACKING_METABUFLEN) return -1; - + return 0; } /* * Read a meta record from the Meta file. */ -static ChangeTrackingMetaRecord* -ChangeTracking_ReadMeta(File file, ChangeTrackingMetaRecord* rec) +static ChangeTrackingMetaRecord * +ChangeTracking_ReadMeta(File file, ChangeTrackingMetaRecord *rec) { - char path[MAXPGPATH]; - int nbytes = 0; + char path[MAXPGPATH]; + int nbytes = 0; - Assert(CHANGETRACKING_METABUFLEN >= sizeof(ChangeTrackingMetaRecord)); - - /* init buffer */ + Assert(CHANGETRACKING_METABUFLEN >= sizeof(ChangeTrackingMetaRecord)); + + /* init buffer */ MemSet(metabuf, 0, CHANGETRACKING_METABUFLEN); /* read the record from the meta file, if any */ @@ -2213,29 +2298,30 @@ ChangeTracking_ReadMeta(File file, ChangeTrackingMetaRecord* rec) else if (nbytes < 0) { ChangeTrackingMetaFilePath(path); - ereport(WARNING, + ereport(WARNING, (errcode_for_file_access(), errmsg("unable to read change tracking meta file \"%s\", " - "change tracking disabled : %m", - path), + "change tracking disabled : %m", + path), errSendAlert(true))); return NULL; } - + /* populate a the meta record with data from the file */ - rec->resync_lsn_end = ((ChangeTrackingMetaRecord *)metabuf)->resync_lsn_end; - rec->resync_mode_full = ((ChangeTrackingMetaRecord *)metabuf)->resync_mode_full; - rec->resync_transition_completed = ((ChangeTrackingMetaRecord *)metabuf)->resync_transition_completed; - rec->insync_transition_completed = ((ChangeTrackingMetaRecord *)metabuf)->insync_transition_completed; + rec->resync_lsn_end = ((ChangeTrackingMetaRecord *) metabuf)->resync_lsn_end; + rec->resync_mode_full = ((ChangeTrackingMetaRecord *) metabuf)->resync_mode_full; + rec->resync_transition_completed = ((ChangeTrackingMetaRecord *) metabuf)->resync_transition_completed; + rec->insync_transition_completed = ((ChangeTrackingMetaRecord *) metabuf)->insync_transition_completed; return rec; } -bool ChangeTracking_ShouldTrackChanges(void) +bool +ChangeTracking_ShouldTrackChanges(void) { return (FileRep_IsInChangeTracking() && gp_change_tracking); -} - +} + /************************************* * Resync related routines. *************************************/ @@ -2244,8 +2330,8 @@ bool ChangeTracking_ShouldTrackChanges(void) * The routine mark in shared memory and persistently in Change Tracking log file * that full resync is required and disable change tracking. * - * The routine is called - * a) if failure is detected during Change Tracking + * The routine is called + * a) if failure is detected during Change Tracking * b) if user requested gprecoverseg with --FULL option (i.e. 
mirror node replacement) * c) if user performed gpaddmirrors * @@ -2255,70 +2341,73 @@ bool ChangeTracking_ShouldTrackChanges(void) /* * Drop all change tracking files (log and meta). */ -void ChangeTracking_MarkFullResync(void) +void +ChangeTracking_MarkFullResync(void) { LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - - ChangeTracking_MarkFullResyncLockAcquired(); /* no need to check return code here */ - + + ChangeTracking_MarkFullResyncLockAcquired(); /* no need to check return + * code here */ + LWLockRelease(ChangeTrackingWriteLock); } static void ChangeTracking_MarkFullResyncLockAcquired(void) { - File file; - CTFType ftype = CTF_META; - bool emit_error = false; + File file; + CTFType ftype = CTF_META; + bool emit_error = false; - /* - * Insert an entry into the meta file that mark FullResync needed, and fsync. + /* + * Insert an entry into the meta file that mark FullResync needed, and + * fsync. */ FileRep_InsertConfigLogEntry("marking full resync "); - + changeTrackingResyncMeta->resync_mode_full = true; setFullResync(changeTrackingResyncMeta->resync_mode_full); changeTrackingResyncMeta->resync_lsn_end.xlogid = 0; changeTrackingResyncMeta->resync_lsn_end.xrecoff = 0; changeTrackingResyncMeta->resync_transition_completed = false; changeTrackingResyncMeta->insync_transition_completed = false; - - file = ChangeTracking_OpenFile(ftype); - + + file = ChangeTracking_OpenFile(ftype); + /* - * Write and fsync the file. If we fail here we can't recover - * from the error. Must call FileRep_SetPostmasterReset() + * Write and fsync the file. If we fail here we can't recover from the + * error. Must call FileRep_SetPostmasterReset() */ if (ChangeTracking_WriteMeta(file) < 0) { emit_error = true; - FileRep_SetPostmasterReset(); + FileRep_SetPostmasterReset(); } - + if (FileSync(file) < 0) { emit_error = true; - FileRep_SetPostmasterReset(); + FileRep_SetPostmasterReset(); } - + if (emit_error) { - ereport(WARNING, + ereport(WARNING, (errcode_for_file_access(), errmsg("write error for change tracking meta file in " "ChangeTracking_MarkFullResyncLockAcquired. 
" "Change Tracking disabled : %m"), - errSendAlert(true))); + errSendAlert(true))); } ChangeTracking_CloseFile(file); - + /* Delete Change Tracking log file (if exists) */ ChangeTracking_DropLogFiles(); - + /* set full resync flag in configuration shared memory */ - setFullResync(changeTrackingResyncMeta->resync_mode_full); - + setFullResync(changeTrackingResyncMeta->resync_mode_full); + getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL); FileRep_SetSegmentState(SegmentStateChangeTrackingDisabled, FaultTypeNotInitialized); @@ -2330,33 +2419,34 @@ ChangeTracking_MarkFullResyncLockAcquired(void) /* * Mark incremental resync, reset metadata and don't drop change tracking log files */ -void ChangeTracking_MarkIncrResync(void) +void +ChangeTracking_MarkIncrResync(void) { File file; CTFType ftype = CTF_META; FileRep_InsertConfigLogEntry("marking incremental resync "); - + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - + changeTrackingResyncMeta->resync_mode_full = false; setFullResync(changeTrackingResyncMeta->resync_mode_full); changeTrackingResyncMeta->resync_lsn_end.xlogid = 0; changeTrackingResyncMeta->resync_lsn_end.xrecoff = 0; changeTrackingResyncMeta->resync_transition_completed = false; changeTrackingResyncMeta->insync_transition_completed = false; - - file = ChangeTracking_OpenFile(ftype); - - if(ChangeTracking_WriteMeta(file) < 0) + + file = ChangeTracking_OpenFile(ftype); + + if (ChangeTracking_WriteMeta(file) < 0) ChangeTracking_HandleWriteError(ftype); - + errno = 0; - + /* fsync meta file to disk */ - if(FileSync(file) < 0) + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); - + ChangeTracking_CloseFile(file); LWLockRelease(ChangeTrackingWriteLock); @@ -2367,26 +2457,26 @@ ChangeTracking_MarkTransitionToResyncCompleted(void) { File file; CTFType ftype = CTF_META; - + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); FileRep_InsertConfigLogEntry("setting resync_transition_completed to true "); changeTrackingResyncMeta->resync_mode_full = false; changeTrackingResyncMeta->resync_transition_completed = true; - file = ChangeTracking_OpenFile(ftype); - - if(ChangeTracking_WriteMeta(file) < 0) + file = ChangeTracking_OpenFile(ftype); + + if (ChangeTracking_WriteMeta(file) < 0) ChangeTracking_HandleWriteError(ftype); errno = 0; - + /* fsync meta file to disk */ - if(FileSync(file) < 0) + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); - + ChangeTracking_CloseFile(file); - + LWLockRelease(ChangeTrackingWriteLock); } @@ -2395,114 +2485,114 @@ ChangeTracking_MarkTransitionToInsyncCompleted(void) { File file; CTFType ftype = CTF_META; - + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); FileRep_InsertConfigLogEntry("setting insync_transition_completed to true "); - + changeTrackingResyncMeta->insync_transition_completed = true; - file = ChangeTracking_OpenFile(ftype); - + file = ChangeTracking_OpenFile(ftype); + if (ChangeTracking_WriteMeta(file) < 0) ChangeTracking_HandleWriteError(ftype); errno = 0; - + /* fsync meta file to disk */ - if(FileSync(file) < 0) + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); - + ChangeTracking_CloseFile(file); - + LWLockRelease(ChangeTrackingWriteLock); } /* - * Store last change tracked LSN into shared memory and + * Store last change tracked LSN into shared memory and * in Change Tracking meta file (LSN has to be fsync-ed to persistent media). * Also make sure to write any remaining log file data and fsync it. 
*/ -void +void ChangeTracking_RecordLastChangeTrackedLoc(void) { File file; CTFType ftype = CTF_META; - XLogRecPtr endResyncLSN; - + XLogRecPtr endResyncLSN; + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - - if (! isFullResync()) + + if (!isFullResync()) { setFullResync(changeTrackingResyncMeta->resync_mode_full); } endResyncLSN = XLogLastChangeTrackedLoc(); - + /* the routine stores last LSN in shared memory */ - FileRepResyncManager_SetEndResyncLSN(endResyncLSN); + FileRepResyncManager_SetEndResyncLSN(endResyncLSN); - /* + /* * append "endResyncLSN" (last LSN recorded in Change Tracking log files) - * in Change Tracking meta file. "endResyncLSN" also marks the last entry - * in Change Tracking before resync takes place. Later If transition from + * in Change Tracking meta file. "endResyncLSN" also marks the last entry + * in Change Tracking before resync takes place. Later If transition from * Resync to Change tracking occurs then new changes will be appended. */ FileRep_InsertConfigLogEntry("setting resync lsn "); changeTrackingResyncMeta->resync_lsn_end = endResyncLSN; - file = ChangeTracking_OpenFile(ftype); - - if(ChangeTracking_WriteMeta(file) < 0) + file = ChangeTracking_OpenFile(ftype); + + if (ChangeTracking_WriteMeta(file) < 0) ChangeTracking_HandleWriteError(ftype); errno = 0; - + /* fsync meta file to disk */ - if(FileSync(file) < 0) + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); - + ChangeTracking_CloseFile(file); - - /* - * log file: write the existing (non-full) buffer and fsync it + + /* + * log file: write the existing (non-full) buffer and fsync it */ ftype = CTF_LOG_FULL; - file = ChangeTracking_OpenFile(ftype); - - if(CTMainWriteBufStatus->recordcount > 0) + file = ChangeTracking_OpenFile(ftype); + + if (CTMainWriteBufStatus->recordcount > 0) { - if(ChangeTracking_WriteBuffer(file, ftype) < 0) + if (ChangeTracking_WriteBuffer(file, ftype) < 0) ChangeTracking_HandleWriteError(ftype); } { - char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN]; - - snprintf(tmpBuf, sizeof(tmpBuf), - "number of ct records to flush count '%u' size '%u' max '%u' offset " INT64_FORMAT " fileseg '%u' ", - CTMainWriteBufStatus->recordcount, - CTMainWriteBufStatus->bufsize, - CTMainWriteBufStatus->maxbufsize, - CTMainWriteBufStatus->nextwritepos, - CTMainWriteBufStatus->fileseg); - - FileRep_InsertConfigLogEntry(tmpBuf); + char tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN]; + + snprintf(tmpBuf, sizeof(tmpBuf), + "number of ct records to flush count '%u' size '%u' max '%u' offset " INT64_FORMAT " fileseg '%u' ", + CTMainWriteBufStatus->recordcount, + CTMainWriteBufStatus->bufsize, + CTMainWriteBufStatus->maxbufsize, + CTMainWriteBufStatus->nextwritepos, + CTMainWriteBufStatus->fileseg); + + FileRep_InsertConfigLogEntry(tmpBuf); } - - if(FileSync(file) < 0) + + if (FileSync(file) < 0) ChangeTracking_HandleWriteError(ftype); - + ChangeTracking_CloseFile(file); LWLockRelease(ChangeTrackingWriteLock); } /* - * The routine is called during gpstart if dataState == DataStateInResync. + * The routine is called during gpstart if dataState == DataStateInResync. 
* Responsibilities: * *) determines when crash recovery will be performed by checking * if lastChangeTrackedLoc is recorded - * *) if lastChangeTrackedLoc is recorded then + * *) if lastChangeTrackedLoc is recorded then * 1) resync recovers to ready state * 2) WAL replays to primary and mirror * *) if lastChangeTrackedLoc is NOT recorded then @@ -2512,46 +2602,46 @@ ChangeTracking_RecordLastChangeTrackedLoc(void) */ bool ChangeTracking_RetrieveLastChangeTrackedLoc(void) -{ - ChangeTrackingMetaRecord rec; - ChangeTrackingMetaRecord* recptr = NULL; - XLogRecPtr DummyRecPtr = {0, 0}; - File file; - - file = ChangeTracking_OpenFile(CTF_META); +{ + ChangeTrackingMetaRecord rec; + ChangeTrackingMetaRecord *recptr = NULL; + XLogRecPtr DummyRecPtr = {0, 0}; + File file; + + file = ChangeTracking_OpenFile(CTF_META); recptr = ChangeTracking_ReadMeta(file, &rec); - + /* is there a meta record in the meta file? */ if (recptr) { elog(LOG, "CHANGETRACKING: found an MD record. is full resync %d, last lsn (%d/%d) " - "is transition to resync completed %d, is transition to insync completed %d", - recptr->resync_mode_full, - recptr->resync_lsn_end.xlogid, - recptr->resync_lsn_end.xrecoff, - recptr->resync_transition_completed, - recptr->insync_transition_completed); + "is transition to resync completed %d, is transition to insync completed %d", + recptr->resync_mode_full, + recptr->resync_lsn_end.xlogid, + recptr->resync_lsn_end.xrecoff, + recptr->resync_transition_completed, + recptr->insync_transition_completed); setFullResync(recptr->resync_mode_full); changeTrackingResyncMeta->resync_mode_full = recptr->resync_mode_full; - - /* - * if resync_lsn_end isn't {0,0} then we have a valid value - * that was set earlier. in that case set it in the resync - * manager shared memory + + /* + * if resync_lsn_end isn't {0,0} then we have a valid value that was + * set earlier. in that case set it in the resync manager shared + * memory */ if (!XLByteEQ(recptr->resync_lsn_end, DummyRecPtr)) { - FileRepResyncManager_SetEndResyncLSN(recptr->resync_lsn_end); - return recptr->resync_transition_completed; + FileRepResyncManager_SetEndResyncLSN(recptr->resync_lsn_end); + return recptr->resync_transition_completed; } } else FileRep_InsertConfigLogEntry("pg_changetracking meta data record not found "); - + ChangeTracking_CloseFile(file); - + return false; } @@ -2560,36 +2650,36 @@ ChangeTracking_RetrieveLastChangeTrackedLoc(void) */ bool ChangeTracking_RetrieveIsTransitionToInsync(void) -{ - ChangeTrackingMetaRecord rec; - ChangeTrackingMetaRecord* recptr = NULL; - File file; - bool res = false; - - file = ChangeTracking_OpenFile(CTF_META); +{ + ChangeTrackingMetaRecord rec; + ChangeTrackingMetaRecord *recptr = NULL; + File file; + bool res = false; + + file = ChangeTracking_OpenFile(CTF_META); recptr = ChangeTracking_ReadMeta(file, &rec); - + /* is there a meta record in the meta file? */ if (recptr) { - res = recptr->insync_transition_completed; - + res = recptr->insync_transition_completed; + /* set full resync flag in configuration shared memory */ - setFullResync(recptr->resync_mode_full); + setFullResync(recptr->resync_mode_full); + - elog(LOG, "CHANGETRACKING: ChangeTracking_RetrieveIsTransitionToInsync() found " - "insync_transition_completed:'%s' full resync:'%s' ", + "insync_transition_completed:'%s' full resync:'%s' ", (res == TRUE) ? "true" : "false", (recptr->resync_mode_full == TRUE) ? 
"true" : "false"); - + } else FileRep_InsertConfigLogEntry("pg_changetracking meta data record not found "); ChangeTracking_CloseFile(file); - + return res; } @@ -2598,44 +2688,45 @@ ChangeTracking_RetrieveIsTransitionToInsync(void) */ bool ChangeTracking_RetrieveIsTransitionToResync(void) -{ - ChangeTrackingMetaRecord rec; - ChangeTrackingMetaRecord* recptr = NULL; - File file; - bool res = false; - - file = ChangeTracking_OpenFile(CTF_META); - +{ + ChangeTrackingMetaRecord rec; + ChangeTrackingMetaRecord *recptr = NULL; + File file; + bool res = false; + + file = ChangeTracking_OpenFile(CTF_META); + recptr = ChangeTracking_ReadMeta(file, &rec); - + /* is there a meta record in the meta file? */ if (recptr) { - res = recptr->resync_transition_completed; - + res = recptr->resync_transition_completed; + /* set full resync flag in configuration shared memory */ - setFullResync(recptr->resync_mode_full); - - elog(LOG, + setFullResync(recptr->resync_mode_full); + + elog(LOG, "CHANGETRACKING: ChangeTracking_RetrieveIsTransitionToResync() found " - "resync_transition_completed:'%s' full resync:'%s' ", + "resync_transition_completed:'%s' full resync:'%s' ", (res == TRUE) ? "true" : "false", (recptr->resync_mode_full == TRUE) ? "true" : "false"); } else FileRep_InsertConfigLogEntry("pg_changetracking meta data record not found "); - + ChangeTracking_CloseFile(file); - + return res; } /** * Get the total amount of space, in bytes, used by the changetracking information. */ -int64 ChangeTracking_GetTotalSpaceUsedOnDisk(void) +int64 +ChangeTracking_GetTotalSpaceUsedOnDisk(void) { - return db_dir_size(CHANGETRACKINGDIR); + return db_dir_size(CHANGETRACKINGDIR); } /************************************* @@ -2644,31 +2735,32 @@ int64 ChangeTracking_GetTotalSpaceUsedOnDisk(void) /* * ChangeTracking_doesFileNeedCompacting - * + * * If a log file was appended more than CHANGETRACKING_COMPACT_THRESHOLD bytes (currently * 1GB) since the last time it was compacted, return true. otherwise return false. */ -bool ChangeTracking_doesFileNeedCompacting(CTFType ftype) +bool +ChangeTracking_doesFileNeedCompacting(CTFType ftype) { - bool needs_compacting = false; - + bool needs_compacting = false; + Assert(ftype != CTF_LOG_TRANSIENT); Assert(ftype != CTF_META); - - switch(ftype) + + switch (ftype) { case CTF_LOG_FULL: - needs_compacting = (changeTrackingCompState->ctfull_bufs_added * CHANGETRACKING_BLCKSZ > + needs_compacting = (changeTrackingCompState->ctfull_bufs_added * CHANGETRACKING_BLCKSZ > CHANGETRACKING_COMPACT_THRESHOLD); break; case CTF_LOG_COMPACT: - needs_compacting = (changeTrackingCompState->ctcompact_bufs_added * CHANGETRACKING_BLCKSZ > + needs_compacting = (changeTrackingCompState->ctcompact_bufs_added * CHANGETRACKING_BLCKSZ > CHANGETRACKING_COMPACT_THRESHOLD); break; default: elog(ERROR, "internal error in ChangeTracking_doesFileNeedCompacting (used %d)", ftype); } - + return needs_compacting; } @@ -2676,34 +2768,35 @@ bool ChangeTracking_doesFileNeedCompacting(CTFType ftype) * Create a transient log file for the filerep process * to use for compacting. Do that if all of the following * conditions exist: - * + * * 1) The full log file is larger than the threshold for * compacting (currently 1GB). 
- * + * * 2) Filerep compacting operation isn't currently in progress - * + * * 3) There isn't already a transient file we previously created, - * since this means that the filerep process didn't get to + * since this means that the filerep process didn't get to * compacting it yet, and we don't want to run it over. - * + * */ -static void ChangeTracking_CreateTransientLogIfNeeded(void) +static void +ChangeTracking_CreateTransientLogIfNeeded(void) { LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE); - LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - - if (ChangeTracking_doesFileNeedCompacting(CTF_LOG_FULL) && /* condition (1) */ - changeTrackingCompState->in_progress == false && /* condition (2) */ - ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT) == false) /* condition (3) */ + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); + + if (ChangeTracking_doesFileNeedCompacting(CTF_LOG_FULL) && /* condition (1) */ + changeTrackingCompState->in_progress == false && /* condition (2) */ + ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT) == false) /* condition (3) */ { ChangeTracking_RenameLogFile(CTF_LOG_FULL, CTF_LOG_TRANSIENT); - + /* we must now reset our full log write as we'll start a new file */ ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); - + changeTrackingCompState->ctfull_bufs_added = 0; } - + LWLockRelease(ChangeTrackingWriteLock); LWLockRelease(ChangeTrackingCompactLock); } @@ -2712,23 +2805,23 @@ static void ChangeTracking_CreateTransientLogIfNeeded(void) * During crash recovery append records from CT_LOG_FULL to CT_LOG_TRANSIENT in order to run compacting * that will discard records that have higher lsn than the highest lsn in xlog. */ -void +void ChangeTracking_CreateTransientLog(void) { LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE); - LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); - - if (ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT) == false) + LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); + + if (ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT) == false) { FileRep_InsertConfigLogEntry("rename full to transient change tracking log file"); - ChangeTracking_RenameLogFile(CTF_LOG_FULL, CTF_LOG_TRANSIENT); - + ChangeTracking_RenameLogFile(CTF_LOG_FULL, CTF_LOG_TRANSIENT); + /* we must now reset our full log write as we'll start a new file */ ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); - - changeTrackingCompState->ctfull_bufs_added = 0; + + changeTrackingCompState->ctfull_bufs_added = 0; } - else + else { File fileTransient = 0; File fileFull = 0; @@ -2739,20 +2832,20 @@ ChangeTracking_CreateTransientLog(void) int64 positionFullEnd = 0; int nbytes = 0; - char *buf = NULL; - + char *buf = NULL; + FileRep_InsertConfigLogEntry("append records from full to transient change tracking log file"); while (1) { errno = 0; fileFull = ChangeTracking_OpenFile(ftype); - + if (fileFull > 0) { positionFullEnd = FileSeek(fileFull, 0, SEEK_END); - positionFull = FileSeek(fileFull, 0, SEEK_SET); - + positionFull = FileSeek(fileFull, 0, SEEK_SET); + if (positionFullEnd < 0 || positionFull < 0) { ereport(WARNING, @@ -2760,45 +2853,45 @@ ChangeTracking_CreateTransientLog(void) errmsg("unable to seek to begin " INT64_FORMAT " or end " INT64_FORMAT " in change tracking '%s' file : %m", positionFull, positionFullEnd, - ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; - } + } } else { ereport(WARNING, (errcode_for_file_access(), errmsg("unable to open change tracking '%s' file : %m", - 
ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; } - + ftype = CTF_LOG_TRANSIENT; errno = 0; fileTransient = ChangeTracking_OpenFile(ftype); - + if (fileTransient > 0) { - position = FileSeek(fileTransient, 0, SEEK_END); - + position = FileSeek(fileTransient, 0, SEEK_END); + if (position < 0) { ereport(WARNING, (errcode_for_file_access(), errmsg("unable to seek to end in change tracking '%s' file : %m", - ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; - } + } } else { ereport(WARNING, (errcode_for_file_access(), errmsg("unable to open change tracking '%s' file : %m", - ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; } - + buf = MemoryContextAlloc(TopMemoryContext, CHANGETRACKING_BLCKSZ); if (buf == NULL) { @@ -2807,112 +2900,115 @@ ChangeTracking_CreateTransientLog(void) LWLockRelease(ChangeTrackingWriteLock); LWLockRelease(ChangeTrackingCompactLock); - + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), (errmsg("could not allocate memory for change tracking log buffer")))); } - + MemSet(buf, 0, CHANGETRACKING_BLCKSZ); - + while (positionFull < positionFullEnd) { errno = 0; nbytes = FileRead(fileFull, buf, CHANGETRACKING_BLCKSZ); - + if (nbytes == CHANGETRACKING_BLCKSZ) - { + { nbytes = FileWrite(fileTransient, buf, CHANGETRACKING_BLCKSZ); - + if (nbytes < CHANGETRACKING_BLCKSZ) { - ChangeTracking_HandleWriteError(ftype); + ChangeTracking_HandleWriteError(ftype); break; } } else { - ChangeTracking_HandleWriteError(CTF_LOG_FULL); + ChangeTracking_HandleWriteError(CTF_LOG_FULL); break; } positionFull += CHANGETRACKING_BLCKSZ; } - + if (positionFull < positionFullEnd) { break; } - + errno = 0; if (FileSync(fileTransient) < 0) { - ChangeTracking_HandleWriteError(ftype); + ChangeTracking_HandleWriteError(ftype); } - + ChangeTracking_DropLogFile(CTF_LOG_FULL); - + /* we must now reset our full log write as we'll start a new file */ ChangeTracking_ResetBufStatus(CTMainWriteBufStatus); - - changeTrackingCompState->ctfull_bufs_added = 0; - + + changeTrackingCompState->ctfull_bufs_added = 0; + break; - } // while(1) - - if (fileTransient) - { - ChangeTracking_CloseFile(fileTransient); - } - + } //while (1) + + if (fileTransient) + { + ChangeTracking_CloseFile(fileTransient); + } + if (fileFull) { ChangeTracking_CloseFile(fileFull); } - + if (buf) { pfree(buf); } } - + LWLockRelease(ChangeTrackingWriteLock); LWLockRelease(ChangeTrackingCompactLock); } /* * This routine will normally be called by an external process. - * + * * It will do the following: - * - * 1) if a transient file exists (therefore waiting to be compacted), - * compact it into the compact log file and remove the transient + * + * 1) if a transient file exists (therefore waiting to be compacted), + * compact it into the compact log file and remove the transient * file when done. - * + * * 2) if a compact file is ready to be compacted further, rename it to * transient, compact transient into compact log and remove transient * when done. - * - * note that we take the LW compacting lock only very briefly just to - * set "in progress" flag. Don't want to hold it for the duration of - * compacting operation since that will slow down changetracking. + * + * note that we take the LW compacting lock only very briefly just to + * set "in progress" flag. Don't want to hold it for the duration of + * compacting operation since that will slow down changetracking. 
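The locking discipline described above is the usual claim-under-lock, work-outside-it pattern: the LWLock protects only the test-and-set of in_progress, never the compacting itself. Condensed (and slightly simplified) from the first half of the routine that follows:

    static void
    compact_transient_if_any(void)
    {
        bool        claimed = false;

        /* hold the lock only long enough to claim the work */
        LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE);
        if (ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT))
        {
            claimed = true;
            changeTrackingCompState->in_progress = true;
        }
        LWLockRelease(ChangeTrackingCompactLock);

        if (claimed)
        {
            /* long-running compacting runs with no LWLock held */
            ChangeTracking_CompactLogFile(CTF_LOG_TRANSIENT, CTF_LOG_COMPACT, NULL);
            ChangeTracking_DropLogFile(CTF_LOG_TRANSIENT);
            changeTrackingCompState->in_progress = false;
        }
    }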
*/ -void ChangeTracking_CompactLogsIfPossible(void) +void +ChangeTracking_CompactLogsIfPossible(void) { - bool compact_transient = false; /* should compact transient log file? */ - bool compact_compact = false; /* should compact compact log file? */ - + bool compact_transient = false; /* should compact transient log + * file? */ + bool compact_compact = false; /* should compact compact log + * file? */ + /* -- transient log -- */ - + LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE); - + if (ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT)) { compact_transient = true; changeTrackingCompState->in_progress = true; } - + LWLockRelease(ChangeTrackingCompactLock); - + if (compact_transient) { /* Now do the actual compacting. Remove transient log file when done */ @@ -2920,23 +3016,23 @@ void ChangeTracking_CompactLogsIfPossible(void) ChangeTracking_ResetBufStatus(CTCompactWriteBufStatus); ChangeTracking_CompactLogFile(CTF_LOG_TRANSIENT, CTF_LOG_COMPACT, NULL); - + ChangeTracking_DropLogFile(CTF_LOG_TRANSIENT); - + } - + /* -- compact log -- */ LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE); - + if (ChangeTracking_doesFileNeedCompacting(CTF_LOG_COMPACT)) { compact_compact = true; changeTrackingCompState->in_progress = true; } - + LWLockRelease(ChangeTrackingCompactLock); - + if (compact_compact) { /* Now do the actual compacting. Remove transient log file when done */ @@ -2945,19 +3041,19 @@ void ChangeTracking_CompactLogsIfPossible(void) ChangeTracking_ResetBufStatus(CTCompactWriteBufStatus); ChangeTracking_RenameLogFile(CTF_LOG_COMPACT, CTF_LOG_TRANSIENT); ChangeTracking_CompactLogFile(CTF_LOG_TRANSIENT, CTF_LOG_COMPACT, NULL); - changeTrackingCompState->ctcompact_bufs_added = 0; /* reset for next round */ + changeTrackingCompState->ctcompact_bufs_added = 0; /* reset for next round */ } - + changeTrackingCompState->in_progress = false; - + } bool ChangeTrackingIsCompactingInProgress(void) { - /* - * no lock is required since that information is required only for status report - * for test automation + /* + * no lock is required since that information is required only for status + * report for test automation */ return changeTrackingCompState->in_progress; } @@ -2968,16 +3064,16 @@ ChangeTrackingSetXLogEndLocation(XLogRecPtr upto_lsn) LWLockAcquire(ChangeTrackingCompactLock, LW_EXCLUSIVE); changeTrackingCompState->xlog_end_location = upto_lsn; - + LWLockRelease(ChangeTrackingCompactLock); } -void +void ChangeTracking_DoFullCompactingRoundIfNeeded(void) { - if (! (changeTrackingCompState->xlog_end_location.xlogid == 0 && - changeTrackingCompState->xlog_end_location.xrecoff == 0)) + if (!(changeTrackingCompState->xlog_end_location.xlogid == 0 && + changeTrackingCompState->xlog_end_location.xrecoff == 0)) { ChangeTracking_DoFullCompactingRound(&changeTrackingCompState->xlog_end_location); } @@ -2987,58 +3083,59 @@ ChangeTracking_DoFullCompactingRoundIfNeeded(void) * This routine must be called before resync is started * and after the last call to ChangeTracking_CompactLogsIfPossible(), * or during recovery start. - * + * * It will take care of compacting all the left over pieces. * (for example, an outstanding transient file, or some file that * didn't make it past the 1GB threshold). - * + * * if the passed in 'uptolsn' is something other than {0,0}, then we * tell the compacting routines to ignore any changetracking records * with lsn > uptolsn when doing the compacting logic (therefore these * records will be gone forever). 
- * + * * The logic of this routine is done in the following order: - * - * (1) if transient file exists, compact it into compact file. remove + * + * (1) if transient file exists, compact it into compact file. remove * transient file. - * + * * (2) rename the full log file into transient and compact it into the - * compact file. note that it is possible for the full log file to not - * exist, in the case where it was just renamed to transient (in - * ChangeTracking_CompactLogsIfPossible) and no more writes were made - * into it. Even though it should be a rare case we must check for it + * compact file. note that it is possible for the full log file to not + * exist, in the case where it was just renamed to transient (in + * ChangeTracking_CompactLogsIfPossible) and no more writes were made + * into it. Even though it should be a rare case we must check for it * here and skip this stage if needed. - * + * * (3) compact the compact file itself. - * + * * NOTE: we don't care about taking any locks or setting the in_progress * flag. this is because this routine should be run after change tracking * mode is complete, so we don't expect change tracking module to do any * more writes. */ -void ChangeTracking_DoFullCompactingRound(XLogRecPtr* upto_lsn) +void +ChangeTracking_DoFullCompactingRound(XLogRecPtr *upto_lsn) { /* - * we should normally have in_progress == false now, but if the filerep - * process that did compacting in the background was killed while compacting - * we issue a warning, mainly for tracking purposes. this should be harmless - * though - resulting in few duplicate entries. compacting state is reset - * few lines down. + * we should normally have in_progress == false now, but if the filerep + * process that did compacting in the background was killed while + * compacting we issue a warning, mainly for tracking purposes. this + * should be harmless though - resulting in few duplicate entries. + * compacting state is reset few lines down. */ - if(changeTrackingCompState->in_progress) + if (changeTrackingCompState->in_progress) elog(LOG, "ChangeTracking: warning - routine compacting was shut off abnormally"); - + FileRep_InsertConfigLogEntry("running a full round of compacting the logs "); - + if (upto_lsn != NULL) - elog(LOG, "ChangeTracking: discarding records with LSN higher than %s", - XLogLocationToString(upto_lsn)); - + elog(LOG, "ChangeTracking: discarding records with LSN higher than %s", + XLogLocationToString(upto_lsn)); + /* compacting state is no longer needed. reset it to be safe */ ChangeTracking_ResetCompactingStatus(changeTrackingCompState); - - + + /* step (1) */ if (ChangeTracking_DoesFileExist(CTF_LOG_TRANSIENT)) { @@ -3046,7 +3143,7 @@ void ChangeTracking_DoFullCompactingRound(XLogRecPtr* upto_lsn) ChangeTracking_ResetBufStatus(CTCompactWriteBufStatus); ChangeTracking_CompactLogFile(CTF_LOG_TRANSIENT, CTF_LOG_COMPACT, upto_lsn); ChangeTracking_DropLogFile(CTF_LOG_TRANSIENT); - + upto_lsn = NULL; changeTrackingCompState->xlog_end_location.xlogid = 0; changeTrackingCompState->xlog_end_location.xrecoff = 0; @@ -3080,41 +3177,45 @@ void ChangeTracking_DoFullCompactingRound(XLogRecPtr* upto_lsn) /* * ChangeTracking_CompactLogFile - * + * * compact the source CT log file into the dest CT log file. - * if 'uptolsn' is not NULL, then discard any record with + * if 'uptolsn' is not NULL, then discard any record with * lsn > uptolsn when compacting. 
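For concreteness, when 'uptolsn' is supplied the statement assembled inside this routine takes roughly the following shape; the ftype argument and the LSN literal are illustrative placeholders:

    SELECT space, db, rel, blocknum, max(xlogloc), persistent_tid, persistent_sn
    FROM gp_changetracking_log(<CTF_LOG_TRANSIENT>)
    WHERE xlogloc <= '(0/AB12CD34)'
    GROUP BY space, db, rel, blocknum, persistent_tid, persistent_sn

Each tracked block thus survives into the compact log essentially once, carrying the highest LSN recorded for it.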
*/ -int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* uptolsn) -{ - StringInfoData sqlstmt; - int ret; - int proc; - bool connected = false; - int64 count = 0; - ResourceOwner save = CurrentResourceOwner; - MemoryContext oldcontext = CurrentMemoryContext; - - /* as of right now the only compacting operation possible is transient-->compact */ +int +ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr *uptolsn) +{ + StringInfoData sqlstmt; + int ret; + int proc; + bool connected = false; + int64 count = 0; + ResourceOwner save = CurrentResourceOwner; + MemoryContext oldcontext = CurrentMemoryContext; + + /* + * as of right now the only compacting operation possible is + * transient-->compact + */ Assert(source == CTF_LOG_TRANSIENT); Assert(dest == CTF_LOG_COMPACT); - + /* find out if full log file exists, if not return error */ - if(!ChangeTracking_DoesFileExist(source)) + if (!ChangeTracking_DoesFileExist(source)) return -1; - + /* assemble our query string */ initStringInfo(&sqlstmt); - + appendStringInfo(&sqlstmt, "SELECT space, db, rel, blocknum, max(xlogloc), persistent_tid, persistent_sn " - "FROM gp_changetracking_log(%d) ", source); - + "FROM gp_changetracking_log(%d) ", source); + /* filter xlogloc higher than uptolsn if requested to do so */ - if(uptolsn != NULL) + if (uptolsn != NULL) appendStringInfo(&sqlstmt, "WHERE xlogloc <= '(%X/%X)' ", uptolsn->xlogid, uptolsn->xrecoff); - - appendStringInfo(&sqlstmt, "GROUP BY space, db, rel, blocknum, persistent_tid, persistent_sn"); - + + appendStringInfo(&sqlstmt, "GROUP BY space, db, rel, blocknum, persistent_tid, persistent_sn"); + PG_TRY(); { @@ -3138,28 +3239,28 @@ int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* upto if ((segmentState != SegmentStateChangeTrackingDisabled) && ret > 0 && SPI_tuptable != NULL) { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable* tuptable = SPI_tuptable; - MemoryContext cxt_save; - int i; + TupleDesc tupdesc = SPI_tuptable->tupdesc; + SPITupleTable *tuptable = SPI_tuptable; + MemoryContext cxt_save; + int i; for (i = 0; i < proc; i++) { - HeapTuple tuple = tuptable->vals[i]; + HeapTuple tuple = tuptable->vals[i]; RelFileNode relfile; - - BlockNumber blocknum; - XLogRecPtr* endlsn; - ItemPointer persistentTid; - int64 persistentSerialNum; - char* str_space; - char* str_db; - char* str_rel; - char* str_blocknum; - char* str_endlsn; - char* str_tid; - char* str_sn; - + + BlockNumber blocknum; + XLogRecPtr *endlsn; + ItemPointer persistentTid; + int64 persistentSerialNum; + char *str_space; + char *str_db; + char *str_rel; + char *str_blocknum; + char *str_endlsn; + char *str_tid; + char *str_sn; + /* get result columns from SPI (as strings) */ str_space = SPI_getvalue(tuple, tupdesc, 1); str_db = SPI_getvalue(tuple, tupdesc, 2); @@ -3168,41 +3269,44 @@ int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* upto str_endlsn = SPI_getvalue(tuple, tupdesc, 5); str_tid = SPI_getvalue(tuple, tupdesc, 6); str_sn = SPI_getvalue(tuple, tupdesc, 7); - - //elog(NOTICE,"tuple %d: %s %s %s block %s lsn %s", i, str_space, str_db, str_rel, str_blocknum, str_endlsn); - + + /* + * elog(NOTICE,"tuple %d: %s %s %s block %s lsn %s", i, + * str_space, str_db, str_rel, str_blocknum, str_endlsn); + */ + /* use our own context so that SPI won't free our stuff later */ cxt_save = MemoryContextSwitchTo(oldcontext); - + /* convert to desired data type */ relfile.spcNode = DatumGetObjectId(DirectFunctionCall1(oidin, 
CStringGetDatum(str_space))); relfile.dbNode = DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(str_db))); relfile.relNode = DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(str_rel))); blocknum = DatumGetUInt32(DirectFunctionCall1(int4in, CStringGetDatum(str_blocknum))); - endlsn = (XLogRecPtr*) DatumGetPointer(DirectFunctionCall1(gpxloglocin, CStringGetDatum(str_endlsn))); + endlsn = (XLogRecPtr *) DatumGetPointer(DirectFunctionCall1(gpxloglocin, CStringGetDatum(str_endlsn))); persistentTid = (ItemPointer) DatumGetPointer( - DirectFunctionCall1(tidin, CStringGetDatum(str_tid))); + DirectFunctionCall1(tidin, CStringGetDatum(str_tid))); persistentSerialNum = DatumGetInt64(DirectFunctionCall1(int8in, CStringGetDatum(str_sn))); SIMPLE_FAULT_INJECTOR(FileRepChangeTrackingCompacting); /* write this record to the compact file */ ChangeTracking_AddBufferPoolChange(dest, - endlsn, - &relfile, - blocknum, - *persistentTid, + endlsn, + &relfile, + blocknum, + *persistentTid, persistentSerialNum); - + count++; MemoryContextSwitchTo(cxt_save); } } - + connected = false; SPI_finish(); - + CommitTransactionCommand(); } @@ -3213,7 +3317,7 @@ int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* upto SPI_finish(); AbortCurrentTransaction(); - + /* Carry on with error handling. */ PG_RE_THROW(); } @@ -3221,9 +3325,9 @@ int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* upto /* done writing to the compact file. must fsync now */ ChangeTracking_FsyncDataIntoLog(dest); - + elog(LOG, "ChangeTracking done creating the compact version. reduced to " INT64_FORMAT " records", count); - + MemoryContextSwitchTo(oldcontext); CurrentResourceOwner = save; @@ -3232,7 +3336,7 @@ int ChangeTracking_CompactLogFile(CTFType source, CTFType dest, XLogRecPtr* upto return 0; } -/* +/* * find last LSN recorded in Change Tracking Full Log file */ bool @@ -3243,7 +3347,7 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE int64 position = 0; int64 numBlocks = 0; int nbytes = 0; - char *buf = NULL; + char *buf = NULL; bool retval = true; LWLockAcquire(ChangeTrackingWriteLock, LW_EXCLUSIVE); @@ -3251,18 +3355,18 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE while (1) { errno = 0; - file = ChangeTracking_OpenFile(ftype); - + file = ChangeTracking_OpenFile(ftype); + if (file > 0) { - position = FileSeek(file, 0, SEEK_END); + position = FileSeek(file, 0, SEEK_END); if (position < 0) { ereport(WARNING, (errcode_for_file_access(), errmsg("unable to seek to end in change tracking '%s' file : %m", - ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; } @@ -3275,9 +3379,9 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE /* * CT files are always written in term of CHANGETRACKING_BLCKSZ, - * so while reading must have it aligned to same. If not - * something went missing or is extra in file and hence treat it - * as corruption and act accordingly. + * so while reading must have it aligned to same. If not something + * went missing or is extra in file and hence treat it as + * corruption and act accordingly. 
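+			 * For example, a file holding exactly three full blocks passes,
+			 * and we then seek to the start of its last block; a trailing
+			 * partial block trips the modulo test below and change tracking
+			 * is disabled.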
*/ if (position % CHANGETRACKING_BLCKSZ) { @@ -3288,7 +3392,11 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE position))); ChangeTracking_CloseFile(file); file = 0; - /* Marks segment state as CT disabled and deletes all the CT files */ + + /* + * Marks segment state as CT disabled and deletes all the CT + * files + */ ChangeTracking_MarkFullResyncLockAcquired(); retval = false; break; @@ -3296,40 +3404,40 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE numBlocks = position / CHANGETRACKING_BLCKSZ; position = (numBlocks - 1) * CHANGETRACKING_BLCKSZ; - FileSeek(file, position, SEEK_SET); + FileSeek(file, position, SEEK_SET); } else { ereport(WARNING, (errcode_for_file_access(), errmsg("unable to open change tracking '%s' file : %m", - ChangeTracking_FtypeToString(ftype)))); + ChangeTracking_FtypeToString(ftype)))); break; } - + buf = MemoryContextAlloc(TopMemoryContext, CHANGETRACKING_BLCKSZ); if (buf == NULL) { ChangeTracking_CloseFile(file); LWLockRelease(ChangeTrackingWriteLock); - + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), (errmsg("could not allocate memory for change tracking log buffer")))); } - + MemSet(buf, 0, CHANGETRACKING_BLCKSZ); - + errno = 0; nbytes = FileRead(file, buf, CHANGETRACKING_BLCKSZ); - + if (nbytes == CHANGETRACKING_BLCKSZ) - { - ChangeTrackingPageHeader *header; - ChangeTrackingRecord *record; - char *bufTemp = buf; - pg_crc32 read_checksum; - pg_crc32 calc_checksum; + { + ChangeTrackingPageHeader *header; + ChangeTrackingRecord *record; + char *bufTemp = buf; + pg_crc32 read_checksum; + pg_crc32 calc_checksum; header = (ChangeTrackingPageHeader *) bufTemp; read_checksum = header->checksum; @@ -3343,15 +3451,19 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE (errcode(ERRCODE_DATA_CORRUPTED), errmsg("changetracking log (CTF_LOG_FULL) corrupted, disabling changetracking"), errdetail("checksum mismatch read:0x%08X compute:0x%08X", - read_checksum, calc_checksum))); + read_checksum, calc_checksum))); ChangeTracking_CloseFile(file); file = 0; - /* Marks segment state as CT disabled and deletes all the CT files */ + + /* + * Marks segment state as CT disabled and deletes all the CT + * files + */ ChangeTracking_MarkFullResyncLockAcquired(); retval = false; break; } - + bufTemp += sizeof(ChangeTrackingPageHeader) + sizeof(ChangeTrackingRecord) * (header->numrecords - 1); record = (ChangeTrackingRecord *) bufTemp; *lastChangeTrackingLogEndLoc = record->xlogLocation; @@ -3370,12 +3482,12 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE } break; } - + if (file) { ChangeTracking_CloseFile(file); } - + LWLockRelease(ChangeTrackingWriteLock); if (buf) @@ -3384,7 +3496,7 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE } return retval; -} +} /* * Any RM will not change more than 5 blocks per xlog record. The only exception @@ -3392,24 +3504,30 @@ ChangeTracking_GetLastChangeTrackingLogEndLoc(XLogRecPtr *lastChangeTrackingLogE * This function will return the maximum number of change infos that could occur, so * that we could set the array size accordingly. */ -int ChangeTracking_GetInfoArrayDesiredMaxLength(RmgrId rmid, uint8 info) +int +ChangeTracking_GetInfoArrayDesiredMaxLength(RmgrId rmid, uint8 info) { - int MaxRelChangeInfoReturns = 5; - int MaxRelChangeInfoReturns_GistSplit = 1024; //TODO: this is some sort of a very large guess. check if realistic. 
- bool gist_split = ((rmid == RM_GIST_ID && (info & ~XLR_INFO_MASK) == XLOG_GIST_PAGE_SPLIT)); - int arrLen = (!gist_split ? MaxRelChangeInfoReturns : MaxRelChangeInfoReturns_GistSplit); - + int MaxRelChangeInfoReturns = 5; + + /* TODO:this is some sort of a very large guess.check if realistic */ + int MaxRelChangeInfoReturns_GistSplit = 1024; + + bool gist_split = ((rmid == RM_GIST_ID && (info & ~XLR_INFO_MASK) == XLOG_GIST_PAGE_SPLIT)); + int arrLen = (!gist_split ? MaxRelChangeInfoReturns : MaxRelChangeInfoReturns_GistSplit); + return arrLen; } -static void ChangeTracking_HandleWriteError(CTFType ft) +static void +ChangeTracking_HandleWriteError(CTFType ft) { ChangeTracking_MarkFullResyncLockAcquired(); } -char *ChangeTracking_FtypeToString(CTFType ftype) +char * +ChangeTracking_FtypeToString(CTFType ftype) { - switch(ftype) + switch (ftype) { case CTF_LOG_FULL: return "full log"; diff --git a/src/backend/cdb/cdbsetop.c b/src/backend/cdb/cdbsetop.c index 7dcf66a110..7176a754b5 100644 --- a/src/backend/cdb/cdbsetop.c +++ b/src/backend/cdb/cdbsetop.c @@ -45,27 +45,29 @@ static List *makeHashExprsFromNonjunkTargets(List *targetList); * * See the comments in cdbsetop.h for discussion of types of setop plan. */ -GpSetOpType choose_setop_type(List *planlist) +GpSetOpType +choose_setop_type(List *planlist) { - ListCell *cell; - Plan *subplan = NULL; - bool ok_general = TRUE; - bool ok_partitioned = TRUE; - bool ok_replicated = TRUE; - bool ok_single_qe = TRUE; - bool has_partitioned = FALSE; - - Assert( Gp_role == GP_ROLE_DISPATCH || Gp_role == GP_ROLE_UTILITY ); - + ListCell *cell; + Plan *subplan = NULL; + bool ok_general = TRUE; + bool ok_partitioned = TRUE; + bool ok_replicated = TRUE; + bool ok_single_qe = TRUE; + bool has_partitioned = FALSE; + + Assert(Gp_role == GP_ROLE_DISPATCH || Gp_role == GP_ROLE_UTILITY); + foreach(cell, planlist) { - Flow *subplanflow; - subplan = (Plan*)lfirst(cell); - subplanflow = subplan->flow; + Flow *subplanflow; - Assert(is_plan_node((Node*)subplan)); + subplan = (Plan *) lfirst(cell); + subplanflow = subplan->flow; + + Assert(is_plan_node((Node *) subplan)); Assert(subplanflow != NULL); - switch ( subplanflow->locustype ) + switch (subplanflow->locustype) { case CdbLocusType_Hashed: case CdbLocusType_HashedOJ: @@ -73,15 +75,15 @@ GpSetOpType choose_setop_type(List *planlist) ok_general = ok_replicated = FALSE; has_partitioned = TRUE; break; - + case CdbLocusType_Entry: ok_general = ok_partitioned = ok_replicated = ok_single_qe = FALSE; break; - + case CdbLocusType_SingleQE: ok_general = ok_replicated = FALSE; break; - + case CdbLocusType_General: break; @@ -91,12 +93,12 @@ GpSetOpType choose_setop_type(List *planlist) return PSETOP_NONE; } } - - if ( ok_general ) + + if (ok_general) return PSETOP_GENERAL; - else if ( ok_partitioned && has_partitioned ) + else if (ok_partitioned && has_partitioned) return PSETOP_PARALLEL_PARTITIONED; - else if ( ok_single_qe ) + else if (ok_single_qe) return PSETOP_SEQUENTIAL_QE; return PSETOP_SEQUENTIAL_QD; @@ -106,135 +108,139 @@ GpSetOpType choose_setop_type(List *planlist) void adjust_setop_arguments(PlannerInfo *root, List *planlist, GpSetOpType setop_type) { - ListCell *cell; - Plan *subplan; - Plan *adjusted_plan; - - foreach ( cell, planlist ) + ListCell *cell; + Plan *subplan; + Plan *adjusted_plan; + + foreach(cell, planlist) { - Flow* subplanflow; - subplan = (Plan*)lfirst(cell); - subplanflow = subplan->flow; + Flow *subplanflow; - Assert(is_plan_node((Node*)subplan)); + subplan = (Plan *) lfirst(cell); + 
subplanflow = subplan->flow; + + Assert(is_plan_node((Node *) subplan)); Assert(subplanflow != NULL); - + adjusted_plan = subplan; - switch ( setop_type ) + switch (setop_type) { - case PSETOP_GENERAL: - /* This only occurs when all arguments are general. */ - break; + case PSETOP_GENERAL: + /* This only occurs when all arguments are general. */ + break; - case PSETOP_PARALLEL_PARTITIONED: - switch ( subplanflow->locustype ) - { - case CdbLocusType_Hashed: - case CdbLocusType_HashedOJ: - case CdbLocusType_Strewn: - Assert( subplanflow->flotype == FLOW_PARTITIONED ); - break; - case CdbLocusType_SingleQE: - case CdbLocusType_General: - Assert( subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex > -1 ); - /* The setop itself will run on an N-gang, so we need to - * arrange for the singleton input to be separately dispatched - * to a 1-gang and collect its result on one of our N QEs. - * Hence ... */ - adjusted_plan = (Plan *)make_motion_hash_all_targets(NULL, subplan); - break; - case CdbLocusType_Null: - case CdbLocusType_Entry: - case CdbLocusType_Replicated: - default: - ereport(ERROR, ( - errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected argument locus to set operation") )); - break; - } - break; - - case PSETOP_SEQUENTIAL_QD: - switch ( subplanflow->locustype ) - { - case CdbLocusType_Hashed: - case CdbLocusType_HashedOJ: - case CdbLocusType_Strewn: - Assert( subplanflow->flotype == FLOW_PARTITIONED ); - adjusted_plan = (Plan*)make_motion_gather_to_QD(root, subplan, NULL); - break; - - case CdbLocusType_SingleQE: - Assert( subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex == 0 ); - adjusted_plan = (Plan*)make_motion_gather_to_QD(root, subplan, NULL); - break; - - case CdbLocusType_Entry: - case CdbLocusType_General: - break; - - case CdbLocusType_Null: - case CdbLocusType_Replicated: - default: - ereport(ERROR, ( - errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected argument locus to set operation") )); - break; - } - break; - - case PSETOP_SEQUENTIAL_QE: - switch ( subplanflow->locustype ) - { - case CdbLocusType_Hashed: - case CdbLocusType_HashedOJ: - case CdbLocusType_Strewn: - Assert( subplanflow->flotype == FLOW_PARTITIONED ); - /* Gather to QE. No need to keep ordering. */ - adjusted_plan = (Plan*)make_motion_gather_to_QE(root, subplan, NULL); - break; - - case CdbLocusType_SingleQE: - Assert( subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex != -1 ); - break; - - case CdbLocusType_General: - break; - - case CdbLocusType_Entry: - case CdbLocusType_Null: - case CdbLocusType_Replicated: - default: - ereport(ERROR, ( - errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected argument locus to set operation") )); - break; - } - break; - - case PSETOP_PARALLEL_REPLICATED: - /* Only when all args are replicated. */ - ereport(ERROR, (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected replicated intermediate result"), - errdetail("argument to set operation may not be replicated") )); - break; - - default: - /* Can't happen! 
*/ - ereport(ERROR, ( - errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("unexpected arguments to set operation") )); - break; + case PSETOP_PARALLEL_PARTITIONED: + switch (subplanflow->locustype) + { + case CdbLocusType_Hashed: + case CdbLocusType_HashedOJ: + case CdbLocusType_Strewn: + Assert(subplanflow->flotype == FLOW_PARTITIONED); + break; + case CdbLocusType_SingleQE: + case CdbLocusType_General: + Assert(subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex > -1); + + /* + * The setop itself will run on an N-gang, so we need + * to arrange for the singleton input to be separately + * dispatched to a 1-gang and collect its result on + * one of our N QEs. Hence ... + */ + adjusted_plan = (Plan *) make_motion_hash_all_targets(NULL, subplan); + break; + case CdbLocusType_Null: + case CdbLocusType_Entry: + case CdbLocusType_Replicated: + default: + ereport(ERROR, ( + errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected argument locus to set operation"))); + break; + } + break; + + case PSETOP_SEQUENTIAL_QD: + switch (subplanflow->locustype) + { + case CdbLocusType_Hashed: + case CdbLocusType_HashedOJ: + case CdbLocusType_Strewn: + Assert(subplanflow->flotype == FLOW_PARTITIONED); + adjusted_plan = (Plan *) make_motion_gather_to_QD(root, subplan, NULL); + break; + + case CdbLocusType_SingleQE: + Assert(subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex == 0); + adjusted_plan = (Plan *) make_motion_gather_to_QD(root, subplan, NULL); + break; + + case CdbLocusType_Entry: + case CdbLocusType_General: + break; + + case CdbLocusType_Null: + case CdbLocusType_Replicated: + default: + ereport(ERROR, ( + errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected argument locus to set operation"))); + break; + } + break; + + case PSETOP_SEQUENTIAL_QE: + switch (subplanflow->locustype) + { + case CdbLocusType_Hashed: + case CdbLocusType_HashedOJ: + case CdbLocusType_Strewn: + Assert(subplanflow->flotype == FLOW_PARTITIONED); + /* Gather to QE. No need to keep ordering. */ + adjusted_plan = (Plan *) make_motion_gather_to_QE(root, subplan, NULL); + break; + + case CdbLocusType_SingleQE: + Assert(subplanflow->flotype == FLOW_SINGLETON && subplanflow->segindex != -1); + break; + + case CdbLocusType_General: + break; + + case CdbLocusType_Entry: + case CdbLocusType_Null: + case CdbLocusType_Replicated: + default: + ereport(ERROR, ( + errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected argument locus to set operation"))); + break; + } + break; + + case PSETOP_PARALLEL_REPLICATED: + /* Only when all args are replicated. */ + ereport(ERROR, (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected replicated intermediate result"), + errdetail("argument to set operation may not be replicated"))); + break; + + default: + /* Can't happen! */ + ereport(ERROR, ( + errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("unexpected arguments to set operation"))); + break; } - + /* If we made changes, inject them into the argument list. */ - if ( subplan != adjusted_plan ) + if (subplan != adjusted_plan) { subplan = adjusted_plan; cell->data.ptr_value = subplan; } } - + return; } @@ -247,10 +253,11 @@ adjust_setop_arguments(PlannerInfo *root, List *planlist, GpSetOpType setop_type * * A NULL result indicates either a NULL argument or a problem. 
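 *
 * A hypothetical caller that wants just the partitioning information of a
 * subplan might do, e.g.,
 *
 *		new_flow = copyFlow(subplan->flow, true, false);
 *
 * copying the hash expressions but leaving out any sort specification.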
*/ -Flow *copyFlow(Flow *model_flow, bool withExprs, bool withSort) +Flow * +copyFlow(Flow *model_flow, bool withExprs, bool withSort) { - Flow *new_flow = NULL; - + Flow *new_flow = NULL; + if (model_flow == NULL) return NULL; @@ -303,7 +310,7 @@ Motion * make_motion_gather_to_QE(PlannerInfo *root, Plan *subplan, List *sortPathKeys) { return make_motion_gather(root, subplan, gp_singleton_segindex, sortPathKeys); -} +} /* * make_motion_gather @@ -314,7 +321,7 @@ make_motion_gather_to_QE(PlannerInfo *root, Plan *subplan, List *sortPathKeys) Motion * make_motion_gather(PlannerInfo *root, Plan *subplan, int segindex, List *sortPathKeys) { - Motion *motion; + Motion *motion; Assert(subplan->flow != NULL); Assert(subplan->flow->flotype == FLOW_PARTITIONED || @@ -323,19 +330,19 @@ make_motion_gather(PlannerInfo *root, Plan *subplan, int segindex, List *sortPat if (sortPathKeys) { motion = make_sorted_union_motion(root, - subplan, - segindex, - sortPathKeys, - false /* useExecutorVarFormat */); + subplan, + segindex, + sortPathKeys, + false /* useExecutorVarFormat */ ); } else { motion = make_union_motion( - subplan, - segindex, - false /* useExecutorVarFormat */); + subplan, + segindex, + false /* useExecutorVarFormat */ ); } - + return motion; } @@ -348,7 +355,8 @@ make_motion_gather(PlannerInfo *root, Plan *subplan, int segindex, List *sortPat Motion * make_motion_hash_all_targets(PlannerInfo *root, Plan *subplan) { - List *hashexprs = makeHashExprsFromNonjunkTargets(subplan->targetlist); + List *hashexprs = makeHashExprsFromNonjunkTargets(subplan->targetlist); + return make_motion_hash(root, subplan, hashexprs); } @@ -359,16 +367,16 @@ make_motion_hash_all_targets(PlannerInfo *root, Plan *subplan) * motion should only be applied to a non-replicated, non-root subplan. */ Motion * -make_motion_hash(PlannerInfo *root __attribute__((unused)) , Plan *subplan, List *hashexprs) +make_motion_hash(PlannerInfo *root __attribute__((unused)), Plan *subplan, List *hashexprs) { - Motion *motion; - + Motion *motion; + Assert(subplan->flow != NULL); - + motion = make_hashed_motion( - subplan, - hashexprs, - false /* useExecutorVarFormat */); + subplan, + hashexprs, + false /* useExecutorVarFormat */ ); return motion; } @@ -382,10 +390,11 @@ make_motion_hash(PlannerInfo *root __attribute__((unused)) , Plan *subplan, List * * Returns the newly allocate expression list for a Motion node. */ -List *makeHashExprsFromNonjunkTargets(List *targetlist) +List * +makeHashExprsFromNonjunkTargets(List *targetlist) { ListCell *cell; - List *hashlist = NIL; + List *hashlist = NIL; foreach(cell, targetlist) { @@ -404,66 +413,69 @@ List *makeHashExprsFromNonjunkTargets(List *targetlist) * Marks an Append plan with its locus based on the set operation * type determined during examination of the arguments. 
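 *
 * For example, PSETOP_PARALLEL_PARTITIONED marks the Append strewn across
 * the segments, whereas PSETOP_SEQUENTIAL_QD marks it with the entry
 * (QD-only) locus; see the switch below.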
*/ -void mark_append_locus(Plan *plan, GpSetOpType optype) +void +mark_append_locus(Plan *plan, GpSetOpType optype) { - switch ( optype ) + switch (optype) { - case PSETOP_GENERAL: - mark_plan_general(plan); - break; - case PSETOP_PARALLEL_PARTITIONED: - mark_plan_strewn(plan); - break; - case PSETOP_PARALLEL_REPLICATED: - mark_plan_replicated(plan); - break; - case PSETOP_SEQUENTIAL_QD: - mark_plan_entry(plan); - break; - case PSETOP_SEQUENTIAL_QE: - mark_plan_singleQE(plan); - case PSETOP_NONE: - break; + case PSETOP_GENERAL: + mark_plan_general(plan); + break; + case PSETOP_PARALLEL_PARTITIONED: + mark_plan_strewn(plan); + break; + case PSETOP_PARALLEL_REPLICATED: + mark_plan_replicated(plan); + break; + case PSETOP_SEQUENTIAL_QD: + mark_plan_entry(plan); + break; + case PSETOP_SEQUENTIAL_QE: + mark_plan_singleQE(plan); + case PSETOP_NONE: + break; } } -void mark_passthru_locus(Plan *plan, bool with_hash, bool with_sort) +void +mark_passthru_locus(Plan *plan, bool with_hash, bool with_sort) { - Flow *flow; - Plan *subplan = NULL; - bool is_subquery = IsA(plan, SubqueryScan); + Flow *flow; + Plan *subplan = NULL; + bool is_subquery = IsA(plan, SubqueryScan); + + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); - - if ( is_subquery ) + if (is_subquery) { - subplan = ((SubqueryScan*)plan)->subplan; + subplan = ((SubqueryScan *) plan)->subplan; } else { subplan = plan->lefttree; } - - Assert( subplan != NULL && subplan->flow != NULL); - + + Assert(subplan != NULL && subplan->flow != NULL); + flow = copyFlow(subplan->flow, with_hash && !is_subquery, with_sort); - - if ( is_subquery && with_hash && flow->flotype == FLOW_PARTITIONED ) + + if (is_subquery && with_hash && flow->flotype == FLOW_PARTITIONED) { - ListCell *c; - List *hash = NIL; - Index varno = ((Scan*)plan)->scanrelid; + ListCell *c; + List *hash = NIL; + Index varno = ((Scan *) plan)->scanrelid; - Flow *subplanflow = subplan->flow; + Flow *subplanflow = subplan->flow; - /* Make sure all the expressions the flow thinks we're hashed on - * occur in the subplan targetlist. + /* + * Make sure all the expressions the flow thinks we're hashed on occur + * in the subplan targetlist. 
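+		 * For example, a Var the flow claims we hash on must be translated
+		 * through the subplan targetlist (via cdbpullup_expr below) so that
+		 * it refers to the SubqueryScan's own scanrelid; each rewritten
+		 * expression is collected into 'hash'.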
*/ - foreach( c, subplanflow->hashExpr ) + foreach(c, subplanflow->hashExpr) { - Node *x = (Node*)lfirst(c); + Node *x = (Node *) lfirst(c); - Expr *exprNew = cdbpullup_expr((Expr *) x, subplan->targetlist, NULL, varno); + Expr *exprNew = cdbpullup_expr((Expr *) x, subplan->targetlist, NULL, varno); hash = lappend(hash, exprNew); } @@ -475,44 +487,50 @@ void mark_passthru_locus(Plan *plan, bool with_hash, bool with_sort) } -void mark_sort_locus(Plan *plan) +void +mark_sort_locus(Plan *plan) { plan->flow = pull_up_Flow(plan, plan->lefttree); -} +} -void mark_plan_general(Plan* plan) +void +mark_plan_general(Plan *plan) { - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); plan->flow = makeFlow(FLOW_SINGLETON); plan->flow->segindex = 0; plan->flow->locustype = CdbLocusType_General; } -void mark_plan_strewn(Plan* plan) +void +mark_plan_strewn(Plan *plan) { - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); plan->flow = makeFlow(FLOW_PARTITIONED); plan->flow->locustype = CdbLocusType_Strewn; } -void mark_plan_replicated(Plan* plan) +void +mark_plan_replicated(Plan *plan) { - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); plan->flow = makeFlow(FLOW_REPLICATED); plan->flow->locustype = CdbLocusType_Replicated; } -void mark_plan_entry(Plan* plan) +void +mark_plan_entry(Plan *plan) { - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); plan->flow = makeFlow(FLOW_SINGLETON); plan->flow->segindex = -1; plan->flow->locustype = CdbLocusType_Entry; } -void mark_plan_singleQE(Plan* plan) +void +mark_plan_singleQE(Plan *plan) { - Assert( is_plan_node((Node*)plan) && plan->flow == NULL ); + Assert(is_plan_node((Node *) plan) && plan->flow == NULL); plan->flow = makeFlow(FLOW_SINGLETON); plan->flow->segindex = 0; plan->flow->locustype = CdbLocusType_SingleQE; diff --git a/src/backend/cdb/cdbshareddoublylinked.c b/src/backend/cdb/cdbshareddoublylinked.c index 0d85791d61..879ec76b5e 100644 --- a/src/backend/cdb/cdbshareddoublylinked.c +++ b/src/backend/cdb/cdbshareddoublylinked.c @@ -18,10 +18,10 @@ void SharedListBase_Init( - SharedListBase *base, - void *data, - int size, - int offsetToDoubleLinks) + SharedListBase *base, + void *data, + int size, + int offsetToDoubleLinks) { base->data = data; base->size = size; @@ -30,7 +30,7 @@ SharedListBase_Init( void SharedDoublyLinkedHead_Init( - SharedDoublyLinkedHead *head) + SharedDoublyLinkedHead *head) { head->count = 0; head->first = -1; @@ -39,47 +39,47 @@ SharedDoublyLinkedHead_Init( void SharedDoubleLinks_Init( - SharedDoubleLinks *doubleLinks, - int index) + SharedDoubleLinks *doubleLinks, + int index) { doubleLinks->index = index; doubleLinks->prev = -1; doubleLinks->next = -1; } -SharedDoubleLinks* -SharedDoubleLinks_FromElement( - SharedListBase *base, - void *current) +SharedDoubleLinks * +SharedDoubleLinks_FromElement( + SharedListBase *base, + void *current) { - uint8 *uint8Current = (uint8*)current; + uint8 *uint8Current = (uint8 *) current; - return (SharedDoubleLinks*)(uint8Current + base->offsetToDoubleLinks); + return (SharedDoubleLinks *) (uint8Current + base->offsetToDoubleLinks); } -void* -SharedListBase_ToElement( - SharedListBase *base, - int index) +void * +SharedListBase_ToElement( + SharedListBase *base, + int index) { Assert(base != NULL); Assert(index >= 0); 
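 	/* 'index' is an element slot number, not a byte offset */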
- + return base->data + (index * base->size); } -void* +void * SharedDoublyLinkedHead_First( - SharedListBase *base, - SharedDoublyLinkedHead *head) + SharedListBase *base, + SharedDoublyLinkedHead *head) { Assert(base != NULL); Assert(head != NULL); if (head->first != -1) { - void *firstEle; - SharedDoubleLinks *firstDoubleLinks; + void *firstEle; + SharedDoubleLinks *firstDoubleLinks; Assert(head->first >= 0); firstEle = SharedListBase_ToElement(base, head->first); @@ -92,32 +92,32 @@ SharedDoublyLinkedHead_First( return NULL; } -SharedDoubleLinks* -SharedListBase_ToDoubleLinks( - SharedListBase *base, - int index) +SharedDoubleLinks * +SharedListBase_ToDoubleLinks( + SharedListBase *base, + int index) { SharedDoubleLinks *sharedDoubleLinks; - + Assert(base != NULL); Assert(index >= 0); - + sharedDoubleLinks = - (SharedDoubleLinks*) - (base->data + (index * base->size) + base->offsetToDoubleLinks); + (SharedDoubleLinks *) + (base->data + (index * base->size) + base->offsetToDoubleLinks); Assert(sharedDoubleLinks->index == index); return sharedDoubleLinks; } -void* +void * SharedDoubleLinks_Next( - SharedListBase *base, - SharedDoublyLinkedHead *head, - void *currentEle) + SharedListBase *base, + SharedDoublyLinkedHead *head, + void *currentEle) { - SharedDoubleLinks *currentDoubleLinks; - + SharedDoubleLinks *currentDoubleLinks; + Assert(base != NULL); Assert(head != NULL); Assert(currentEle != NULL); @@ -131,9 +131,9 @@ SharedDoubleLinks_Next( } else { - void *nextEle; - SharedDoubleLinks *nextDoubleLinks; - + void *nextEle; + SharedDoubleLinks *nextDoubleLinks; + Assert(currentDoubleLinks->next >= 0); nextEle = SharedListBase_ToElement(base, currentDoubleLinks->next); nextDoubleLinks = SharedDoubleLinks_FromElement(base, nextEle); @@ -146,20 +146,20 @@ SharedDoubleLinks_Next( void SharedDoubleLinks_Remove( - SharedListBase *base, - SharedDoublyLinkedHead *head, - void *removeEle) + SharedListBase *base, + SharedDoublyLinkedHead *head, + void *removeEle) { - SharedDoubleLinks *removeDoubleLinks; - int index; - SharedDoubleLinks *prevDoubleLinks = NULL; - SharedDoubleLinks *nextDoubleLinks = NULL; + SharedDoubleLinks *removeDoubleLinks; + int index; + SharedDoubleLinks *prevDoubleLinks = NULL; + SharedDoubleLinks *nextDoubleLinks = NULL; + - Assert(base != NULL); Assert(head != NULL); Assert(removeEle != NULL); - + removeDoubleLinks = SharedDoubleLinks_FromElement(base, removeEle); index = removeDoubleLinks->index; @@ -181,26 +181,26 @@ SharedDoubleLinks_Remove( * Removing the first element. */ Assert(head->first == index); - - nextDoubleLinks = + + nextDoubleLinks = SharedListBase_ToDoubleLinks(base, removeDoubleLinks->next); Assert(nextDoubleLinks->prev == index); nextDoubleLinks->prev = -1; - + head->first = nextDoubleLinks->index; } else if (removeDoubleLinks->next == -1) { Assert(head->last == index); - + /* * Removing the last element. */ - prevDoubleLinks = + prevDoubleLinks = SharedListBase_ToDoubleLinks(base, removeDoubleLinks->prev); Assert(prevDoubleLinks->next == index); prevDoubleLinks->next = -1; - + head->last = prevDoubleLinks->index; } else @@ -208,12 +208,12 @@ SharedDoubleLinks_Remove( /* * Removing a middle element. 
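		 * For example, removing B from A <-> B <-> C points C's prev back at
		 * A and A's next at C; head->first and head->last stay unchanged.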
*/ - nextDoubleLinks = + nextDoubleLinks = SharedListBase_ToDoubleLinks(base, removeDoubleLinks->next); Assert(nextDoubleLinks->prev == index); nextDoubleLinks->prev = removeDoubleLinks->prev; - prevDoubleLinks = + prevDoubleLinks = SharedListBase_ToDoubleLinks(base, removeDoubleLinks->prev); Assert(prevDoubleLinks->next == index); prevDoubleLinks->next = removeDoubleLinks->next; @@ -221,19 +221,19 @@ SharedDoubleLinks_Remove( Assert(head->count >= 1); head->count--; - + removeDoubleLinks->prev = -1; removeDoubleLinks->next = -1; } void SharedDoublyLinkedHead_AddFirst( - SharedListBase *base, - SharedDoublyLinkedHead *head, - void *ele) + SharedListBase *base, + SharedDoublyLinkedHead *head, + void *ele) { - SharedDoubleLinks *eleDoubleLinks; - + SharedDoubleLinks *eleDoubleLinks; + Assert(base != NULL); Assert(head != NULL); Assert(ele != NULL); @@ -249,10 +249,10 @@ SharedDoublyLinkedHead_AddFirst( } else { - SharedDoubleLinks *firstDoubleLinks; - + SharedDoubleLinks *firstDoubleLinks; + Assert(head->count > 0); - firstDoubleLinks = + firstDoubleLinks = SharedListBase_ToDoubleLinks(base, head->first); Assert(firstDoubleLinks->prev == -1); @@ -260,18 +260,18 @@ SharedDoublyLinkedHead_AddFirst( head->first = eleDoubleLinks->index; firstDoubleLinks->prev = eleDoubleLinks->index; } - + head->count++; } void SharedDoublyLinkedHead_AddLast( - SharedListBase *base, - SharedDoublyLinkedHead *head, - void *ele) + SharedListBase *base, + SharedDoublyLinkedHead *head, + void *ele) { - SharedDoubleLinks *eleDoubleLinks; - + SharedDoubleLinks *eleDoubleLinks; + Assert(base != NULL); Assert(head != NULL); Assert(ele != NULL); @@ -287,13 +287,13 @@ SharedDoublyLinkedHead_AddLast( } else { - SharedDoubleLinks *lastDoubleLinks; - + SharedDoubleLinks *lastDoubleLinks; + Assert(head->count > 0); Assert(head->first >= 0); Assert(head->last >= 0); - - lastDoubleLinks = + + lastDoubleLinks = SharedListBase_ToDoubleLinks(base, head->last); Assert(lastDoubleLinks->next == -1); @@ -301,39 +301,38 @@ SharedDoublyLinkedHead_AddLast( head->last = eleDoubleLinks->index; lastDoubleLinks->next = eleDoubleLinks->index; } - + head->count++; } -void* +void * SharedDoublyLinkedHead_RemoveFirst( - SharedListBase *base, - SharedDoublyLinkedHead *head) + SharedListBase *base, + SharedDoublyLinkedHead *head) { - void* firstEle; - SharedDoubleLinks *firstDoubleLinks; - + void *firstEle; + SharedDoubleLinks *firstDoubleLinks; + Assert(base != NULL); Assert(head != NULL); - + if (head->first == -1) { Assert(head->count == 0); return NULL; } - + Assert(head->first >= 0); firstEle = SharedListBase_ToElement(base, head->first); firstDoubleLinks = SharedDoubleLinks_FromElement(base, firstEle); Assert(firstDoubleLinks->index == head->first); SharedDoubleLinks_Remove( - base, - head, - firstEle); + base, + head, + firstEle); return firstEle; } -//****************************************************************************** - +/* ****************************************************************************** */ diff --git a/src/backend/cdb/cdbsharedoidsearch.c b/src/backend/cdb/cdbsharedoidsearch.c index 84f147d836..a978aed601 100755 --- a/src/backend/cdb/cdbsharedoidsearch.c +++ b/src/backend/cdb/cdbsharedoidsearch.c @@ -17,33 +17,35 @@ #include "cdb/cdbshareddoublylinked.h" #include "cdb/cdbsharedoidsearch.h" -// ----------------------------------------------------------------------------- -// Free Pool -// ----------------------------------------------------------------------------- +/* 
----------------------------------------------------------------------------- */ +/* Free Pool */ +/* ----------------------------------------------------------------------------- */ -static void SharedOidSearch_MakeFreeObjPool( - SharedOidSearchFreeObjPool *sharedFreeObjPool, - /* The shared free object pool to initialize. */ +static void +SharedOidSearch_MakeFreeObjPool( + SharedOidSearchFreeObjPool *sharedFreeObjPool, + /* The shared free object pool to initialize. */ - void *freeObjectArray, - /* The shared-memory to use. */ + void *freeObjectArray, + /* The shared-memory to use. */ - int32 freeObjectCount, - /* The byte length of the shared-memory. */ + int32 freeObjectCount, + /* The byte length of the shared-memory. */ - int32 objectLen) - /* - * The total length of the objects that includes the embedded header - * SharedOidSearchObjHeader. - */ + int32 objectLen) + + /* + * The total length of the objects that includes the embedded header + * SharedOidSearchObjHeader. + */ { - int32 i; + int32 i; SharedListBase_Init( - &sharedFreeObjPool->listBase, - freeObjectArray, - objectLen, - offsetof(SharedOidSearchObjHeader,private.links)); + &sharedFreeObjPool->listBase, + freeObjectArray, + objectLen, + offsetof(SharedOidSearchObjHeader, private.links)); SharedDoublyLinkedHead_Init(&sharedFreeObjPool->freeList); @@ -54,60 +56,64 @@ static void SharedOidSearch_MakeFreeObjPool( { SharedOidSearchObjHeader *ele; - ele = (SharedOidSearchObjHeader*) - SharedListBase_ToElement( - &sharedFreeObjPool->listBase, i); - + ele = (SharedOidSearchObjHeader *) + SharedListBase_ToElement( + &sharedFreeObjPool->listBase, i); + SharedDoubleLinks_Init(&ele->private.links, i); SharedDoublyLinkedHead_AddLast( - &sharedFreeObjPool->listBase, - &sharedFreeObjPool->freeList, - ele); + &sharedFreeObjPool->listBase, + &sharedFreeObjPool->freeList, + ele); } } -// ----------------------------------------------------------------------------- -// Initialize -// ----------------------------------------------------------------------------- +/* ----------------------------------------------------------------------------- */ +/* Initialize */ +/* ----------------------------------------------------------------------------- */ -int32 SharedOidSearch_TableLen( - int32 hashSize, - /* The hash array size */ +int32 +SharedOidSearch_TableLen( + int32 hashSize, + /* The hash array size */ - int32 freeObjectCount, + int32 freeObjectCount, - int32 objectLen) - /* - * The total length of the objects that includes the embedded header - * SharedOidSearchObjHeader. - */ + int32 objectLen) + + /* + * The total length of the objects that includes the embedded header + * SharedOidSearchObjHeader. + */ { - return MAXALIGN(offsetof(SharedOidSearchTable,private.buckets)) + - MAXALIGN(hashSize * sizeof(SharedOidSearchHashBucket)) + - MAXALIGN(freeObjectCount * objectLen); + return MAXALIGN(offsetof(SharedOidSearchTable, private.buckets)) + + MAXALIGN(hashSize * sizeof(SharedOidSearchHashBucket)) + + MAXALIGN(freeObjectCount * objectLen); } -void SharedOidSearch_InitTable( - SharedOidSearchTable *table, - /* The shared search tables to initialize. */ +void +SharedOidSearch_InitTable( + SharedOidSearchTable *table, + /* The shared search tables to initialize. */ + + int32 hashSize, + /* The hash array size */ - int32 hashSize, - /* The hash array size */ + int32 freeObjectCount, - int32 freeObjectCount, + int32 objectLen) - int32 objectLen) - /* - * The total length of the objects that includes the embedded header - * SharedOidSearchObjHeader. 
- */ + /* + * The total length of the objects that includes the embedded header + * SharedOidSearchObjHeader. + */ { - SharedOidSearchHashBucket *bucketArray; - int32 freePoolObjectArrayOffset; - void *freePoolObjectArray; + SharedOidSearchHashBucket *bucketArray; + int32 freePoolObjectArrayOffset; + void *freePoolObjectArray; - int32 i; + int32 i; table->private.hashSize = hashSize; @@ -117,69 +123,72 @@ void SharedOidSearch_InitTable( SharedDoublyLinkedHead_Init(&bucketArray[i].bucketListHead); freePoolObjectArrayOffset = - MAXALIGN( - (int32) - ((uint8*)&table->private.buckets[hashSize] - (uint8*)table)); - - freePoolObjectArray = - (void*) - (((uint8*)table) + freePoolObjectArrayOffset); - + MAXALIGN( + (int32) + ((uint8 *) &table->private.buckets[hashSize] - (uint8 *) table)); + + freePoolObjectArray = + (void *) + (((uint8 *) table) + freePoolObjectArrayOffset); + SharedOidSearch_MakeFreeObjPool( - &table->private.freePool, - freePoolObjectArray, - freeObjectCount, - objectLen); + &table->private.freePool, + freePoolObjectArray, + freeObjectCount, + objectLen); } -// ----------------------------------------------------------------------------- -// Helpers -// ----------------------------------------------------------------------------- +/* ----------------------------------------------------------------------------- */ +/* Helpers */ +/* ----------------------------------------------------------------------------- */ -static SharedOidSearchHashBucket *SharedOidSearch_GetBucket( - SharedOidSearchTable *table, - Oid oid1) +static SharedOidSearchHashBucket * +SharedOidSearch_GetBucket( + SharedOidSearchTable *table, + Oid oid1) { - int32 bucketNum; - + int32 bucketNum; + bucketNum = oid1 % table->private.hashSize; return &table->private.buckets[bucketNum]; } -static SharedOidSearchObjHeader *SharedOidSearch_FindInBucket( - SharedOidSearchTable *table, - SharedOidSearchHashBucket *bucket, - Oid oid1, - Oid oid2) +static SharedOidSearchObjHeader * +SharedOidSearch_FindInBucket( + SharedOidSearchTable *table, + SharedOidSearchHashBucket *bucket, + Oid oid1, + Oid oid2) { - SharedListBase *listBase = &table->private.freePool.listBase; - SharedDoublyLinkedHead *listHead; - SharedOidSearchObjHeader *ele; + SharedListBase *listBase = &table->private.freePool.listBase; + SharedDoublyLinkedHead *listHead; + SharedOidSearchObjHeader *ele; listHead = &bucket->bucketListHead; - ele = (SharedOidSearchObjHeader*) - SharedDoublyLinkedHead_First(listBase, listHead); + ele = (SharedOidSearchObjHeader *) + SharedDoublyLinkedHead_First(listBase, listHead); while (ele != NULL) { if (ele->oid1 == oid1 && ele->oid2 == oid2) return ele; - ele = (SharedOidSearchObjHeader*) - SharedDoubleLinks_Next(listBase, listHead, ele); + ele = (SharedOidSearchObjHeader *) + SharedDoubleLinks_Next(listBase, listHead, ele); } return NULL; } -static void SharedOidSearch_RemoveFromBucket( - SharedOidSearchTable *table, - SharedOidSearchHashBucket *bucket, - SharedOidSearchObjHeader *ele) +static void +SharedOidSearch_RemoveFromBucket( + SharedOidSearchTable *table, + SharedOidSearchHashBucket *bucket, + SharedOidSearchObjHeader *ele) { - SharedListBase *listBase = &table->private.freePool.listBase; - SharedDoublyLinkedHead *listHead; + SharedListBase *listBase = &table->private.freePool.listBase; + SharedDoublyLinkedHead *listHead; Assert(ele->private.isDeleted); Assert(ele->private.pinCount == 0); @@ -187,26 +196,27 @@ static void SharedOidSearch_RemoveFromBucket( listHead = &bucket->bucketListHead; 
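 	/* Unlink from the bucket's chain, then hand the object back to the free list. */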
SharedDoubleLinks_Remove(listBase, listHead, ele); - + SharedDoublyLinkedHead_AddLast( - &table->private.freePool.listBase, - &table->private.freePool.freeList, - ele); + &table->private.freePool.listBase, + &table->private.freePool.freeList, + ele); } -// ----------------------------------------------------------------------------- -// Add, Find, Probe, and Delete -// ----------------------------------------------------------------------------- +/* ----------------------------------------------------------------------------- */ +/* Add, Find, Probe, and Delete */ +/* ----------------------------------------------------------------------------- */ -SharedOidSearchAddResult SharedOidSearch_Add( - SharedOidSearchTable *table, - Oid oid1, - Oid oid2, - SharedOidSearchObjHeader **header) +SharedOidSearchAddResult +SharedOidSearch_Add( + SharedOidSearchTable *table, + Oid oid1, + Oid oid2, + SharedOidSearchObjHeader **header) { - SharedOidSearchHashBucket *bucket; - SharedOidSearchObjHeader *ele; + SharedOidSearchHashBucket *bucket; + SharedOidSearchObjHeader *ele; bucket = SharedOidSearch_GetBucket(table, oid1); @@ -214,11 +224,11 @@ SharedOidSearchAddResult SharedOidSearch_Add( if (ele != NULL) return SharedOidSearchAddResult_Exists; - ele = - (SharedOidSearchObjHeader*) - SharedDoublyLinkedHead_RemoveFirst( - &table->private.freePool.listBase, - &table->private.freePool.freeList); + ele = + (SharedOidSearchObjHeader *) + SharedDoublyLinkedHead_RemoveFirst( + &table->private.freePool.listBase, + &table->private.freePool.freeList); if (ele == NULL) return SharedOidSearchAddResult_NoMemory; @@ -228,21 +238,22 @@ SharedOidSearchAddResult SharedOidSearch_Add( ele->oid2 = oid2; SharedDoublyLinkedHead_AddLast( - &table->private.freePool.listBase, - &bucket->bucketListHead, - ele); + &table->private.freePool.listBase, + &bucket->bucketListHead, + ele); *header = ele; return SharedOidSearchAddResult_Ok; } -SharedOidSearchObjHeader *SharedOidSearch_Find( - SharedOidSearchTable *table, - Oid oid1, - Oid oid2) +SharedOidSearchObjHeader * +SharedOidSearch_Find( + SharedOidSearchTable *table, + Oid oid1, + Oid oid2) { - SharedOidSearchHashBucket *bucket; - SharedOidSearchObjHeader *ele; + SharedOidSearchHashBucket *bucket; + SharedOidSearchObjHeader *ele; bucket = SharedOidSearch_GetBucket(table, oid1); @@ -253,48 +264,51 @@ SharedOidSearchObjHeader *SharedOidSearch_Find( return ele; } -static bool SharedOidSearch_NextBucket( - SharedOidSearchTable *table, - SharedOidSearchHashBucket **bucket) +static bool +SharedOidSearch_NextBucket( + SharedOidSearchTable *table, + SharedOidSearchHashBucket **bucket) { - int32 hashSize = table->private.hashSize; + int32 hashSize = table->private.hashSize; - if (*bucket == &table->private.buckets[hashSize-1]) + if (*bucket == &table->private.buckets[hashSize - 1]) return false; (*bucket)++; return true; } -static bool SharedOidSearch_FindNonDeletedInBucket( - SharedListBase *listBase, - SharedDoublyLinkedHead *listHead, - SharedOidSearchObjHeader **header) +static bool +SharedOidSearch_FindNonDeletedInBucket( + SharedListBase *listBase, + SharedDoublyLinkedHead *listHead, + SharedOidSearchObjHeader **header) { while (true) { if (!(*header)->private.isDeleted) return true; - - *header = - (SharedOidSearchObjHeader*) - SharedDoubleLinks_Next( - listBase, - listHead, - *header); + + *header = + (SharedOidSearchObjHeader *) + SharedDoubleLinks_Next( + listBase, + listHead, + *header); if (*header == NULL) return false; } } -void SharedOidSearch_Iterate( - 
SharedOidSearchTable *table, - SharedOidSearchObjHeader **header) +void +SharedOidSearch_Iterate( + SharedOidSearchTable *table, + SharedOidSearchObjHeader **header) { - SharedListBase *listBase = &table->private.freePool.listBase; - SharedOidSearchHashBucket *bucket; - SharedOidSearchObjHeader *current = *header; + SharedListBase *listBase = &table->private.freePool.listBase; + SharedOidSearchHashBucket *bucket; + SharedOidSearchObjHeader *current = *header; if (current != NULL) { @@ -305,23 +319,23 @@ void SharedOidSearch_Iterate( Assert(current->private.pinCount > 0); current->private.pinCount--; - - *header = - (SharedOidSearchObjHeader*) - SharedDoubleLinks_Next( - listBase, - &bucket->bucketListHead, - current); + + *header = + (SharedOidSearchObjHeader *) + SharedDoubleLinks_Next( + listBase, + &bucket->bucketListHead, + current); if (current->private.isDeleted && current->private.pinCount == 0) SharedOidSearch_RemoveFromBucket(table, bucket, current); - + if (*header != NULL && SharedOidSearch_FindNonDeletedInBucket( - listBase, - &bucket->bucketListHead, - header)) + listBase, + &bucket->bucketListHead, + header)) { (*header)->private.pinCount++; return; @@ -330,9 +344,10 @@ void SharedOidSearch_Iterate( Assert(*header == NULL); if (!SharedOidSearch_NextBucket( - table, - &bucket)) - return; // No more buckets. + table, + &bucket)) + return; + //No more buckets. } else bucket = &table->private.buckets[0]; @@ -340,18 +355,18 @@ void SharedOidSearch_Iterate( /* * Find next non-empty bucket. */ - while (true) + while (true) { - *header = - (SharedOidSearchObjHeader*) - SharedDoublyLinkedHead_First( - listBase, - &bucket->bucketListHead); + *header = + (SharedOidSearchObjHeader *) + SharedDoublyLinkedHead_First( + listBase, + &bucket->bucketListHead); if (*header != NULL && SharedOidSearch_FindNonDeletedInBucket( - listBase, - &bucket->bucketListHead, - header)) + listBase, + &bucket->bucketListHead, + header)) { (*header)->private.pinCount++; return; @@ -360,19 +375,21 @@ void SharedOidSearch_Iterate( Assert(*header == NULL); if (!SharedOidSearch_NextBucket( - table, - &bucket)) - return; // No more buckets. + table, + &bucket)) + return; + //No more buckets. } } -void SharedOidSearch_ReleaseIterator( - SharedOidSearchTable *table, - SharedOidSearchObjHeader **header) +void +SharedOidSearch_ReleaseIterator( + SharedOidSearchTable *table, + SharedOidSearchObjHeader **header) { - SharedOidSearchObjHeader *current = *header; - SharedOidSearchHashBucket *bucket; - + SharedOidSearchObjHeader *current = *header; + SharedOidSearchHashBucket *bucket; + Assert(current->private.pinCount > 0); current->private.pinCount--; @@ -382,23 +399,25 @@ void SharedOidSearch_ReleaseIterator( bucket = SharedOidSearch_GetBucket(table, current->oid1); SharedOidSearch_RemoveFromBucket(table, bucket, current); } - + *header = NULL; } -void SharedOidSearch_Delete( - SharedOidSearchTable *table, - SharedOidSearchObjHeader *header) +void +SharedOidSearch_Delete( + SharedOidSearchTable *table, + SharedOidSearchObjHeader *header) { - SharedOidSearchHashBucket *bucket; + SharedOidSearchHashBucket *bucket; Assert(!header->private.isDeleted); header->private.isDeleted = true; if (header->private.pinCount > 0) - return; // Let the last iterator turn out the light. + return; + //Let the last iterator turn out the light. 
- bucket = SharedOidSearch_GetBucket(table, header->oid1); + bucket = SharedOidSearch_GetBucket(table, header->oid1); SharedOidSearch_RemoveFromBucket(table, bucket, header); } diff --git a/src/backend/cdb/cdbsreh.c b/src/backend/cdb/cdbsreh.c index bcb4317e6c..87dbaedace 100644 --- a/src/backend/cdb/cdbsreh.c +++ b/src/backend/cdb/cdbsreh.c @@ -43,7 +43,7 @@ #include "utils/builtins.h" #include "utils/bytea.h" -static int GetNextSegid(CdbSreh *cdbsreh); +static int GetNextSegid(CdbSreh *cdbsreh); static void PreprocessByteaData(char *src); static void ErrorLogWrite(CdbSreh *cdbsreh); @@ -56,8 +56,8 @@ static void ErrorLogWrite(CdbSreh *cdbsreh); */ typedef struct ReadErrorLogContext { - FILE *fp; /* file pointer to the error log */ - char filename[MAXPGPATH];/* filename of fp */ + FILE *fp; /* file pointer to the error log */ + char filename[MAXPGPATH]; /* filename of fp */ } ReadErrorLogContext; typedef enum RejectLimitCode @@ -68,7 +68,7 @@ typedef enum RejectLimitCode REJECT_UNPARSABLE_CSV, } RejectLimitCode; -int gp_initial_bad_row_limit = 1000; +int gp_initial_bad_row_limit = 1000; /* * makeCdbSreh @@ -82,10 +82,10 @@ makeCdbSreh(int rejectlimit, bool is_limit_in_rows, char *filename, char *relname, bool log_to_file) { - CdbSreh *h; + CdbSreh *h; h = palloc(sizeof(CdbSreh)); - + h->errmsg = NULL; h->rawdata = NULL; h->linenumber = 0; @@ -110,21 +110,21 @@ makeCdbSreh(int rejectlimit, bool is_limit_in_rows, * anyway. */ h->badrowcontext = AllocSetContextCreate(CurrentMemoryContext, - "SrehMemCtxt", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - + "SrehMemCtxt", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + return h; } void destroyCdbSreh(CdbSreh *cdbsreh) { - + /* delete the bad row context */ - MemoryContextDelete(cdbsreh->badrowcontext); - + MemoryContextDelete(cdbsreh->badrowcontext); + pfree(cdbsreh); } @@ -140,24 +140,24 @@ destroyCdbSreh(CdbSreh *cdbsreh) * - If QD COPY send the bad row to the QE COPY to deal with. * */ -void HandleSingleRowError(CdbSreh *cdbsreh) +void +HandleSingleRowError(CdbSreh *cdbsreh) { - - /* increment total number of errors for this segment */ + + /* increment total number of errors for this segment */ cdbsreh->rejectcount++; - - /* - * if reached the segment reject limit don't do anything. - * (this will get checked and handled later on by the caller). + + /* + * if reached the segment reject limit don't do anything. (this will get + * checked and handled later on by the caller). */ - if(IsRejectLimitReached(cdbsreh)) + if (IsRejectLimitReached(cdbsreh)) return; /* - * If not specified table or file, do nothing. Otherwise, - * record the error: - * QD - send the bad data row to a random QE (via roundrobin). - * QE - log the error in the error log file. + * If not specified table or file, do nothing. Otherwise, record the + * error: QD - send the bad data row to a random QE (via roundrobin). QE - + * log the error in the error log file. 
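+	 * For example, on the QD each bad row is shipped to a segment chosen
+	 * round-robin (see GetNextSegid), while on a QE the row is appended to
+	 * the error log via ErrorLogWrite.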
*/ if (cdbsreh->log_to_file) { @@ -167,16 +167,16 @@ void HandleSingleRowError(CdbSreh *cdbsreh) GetNextSegid(cdbsreh), cdbsreh->rawdata, strlen(cdbsreh->rawdata)); - + } else { ErrorLogWrite(cdbsreh); } - + } - - return; /* OK */ + + return; /* OK */ } /* @@ -193,7 +193,7 @@ GetErrorTupleDesc(void) */ if (tupdesc == NULL) { - TupleDesc tmp; + TupleDesc tmp; MemoryContext oldcontext = MemoryContextSwitchTo(CacheMemoryContext); tmp = CreateTemplateTupleDesc(NUM_ERRORTABLE_ATTR, false); @@ -220,17 +220,17 @@ FormErrorTuple(CdbSreh *cdbsreh) bool nulls[NUM_ERRORTABLE_ATTR]; Datum values[NUM_ERRORTABLE_ATTR]; MemoryContext oldcontext; - + oldcontext = MemoryContextSwitchTo(cdbsreh->badrowcontext); - + /* Initialize all values for row to NULL */ MemSet(values, 0, NUM_ERRORTABLE_ATTR * sizeof(Datum)); MemSet(nulls, true, NUM_ERRORTABLE_ATTR * sizeof(bool)); - + /* command start time */ values[errtable_cmdtime - 1] = TimestampTzGetDatum(GetCurrentStatementStartTimestamp()); nulls[errtable_cmdtime - 1] = false; - + /* line number */ if (cdbsreh->linenumber > 0) { @@ -238,7 +238,7 @@ FormErrorTuple(CdbSreh *cdbsreh) nulls[errtable_linenum - 1] = false; } - if(cdbsreh->is_server_enc) + if (cdbsreh->is_server_enc) { /* raw data */ values[errtable_rawdata - 1] = CStringGetTextDatum(cdbsreh->rawdata); @@ -259,35 +259,36 @@ FormErrorTuple(CdbSreh *cdbsreh) /* relation name */ values[errtable_relname - 1] = CStringGetTextDatum(cdbsreh->relname); nulls[errtable_relname - 1] = false; - + /* error message */ values[errtable_errmsg - 1] = CStringGetTextDatum(cdbsreh->errmsg); nulls[errtable_errmsg - 1] = false; - - + + MemoryContextSwitchTo(oldcontext); - + /* * And now we can form the input tuple. */ return heap_form_tuple(GetErrorTupleDesc(), values, nulls); } -/* +/* * ReportSrehResults * * When necessary emit a NOTICE that describes the end result of the * SREH operations. Information includes the total number of rejected * rows, and whether rows were ignored or logged into an error log file. */ -void ReportSrehResults(CdbSreh *cdbsreh, int total_rejected) +void +ReportSrehResults(CdbSreh *cdbsreh, int total_rejected) { - if(total_rejected > 0) + if (total_rejected > 0) { ereport(NOTICE, - (errmsg("Found %d data formatting errors (%d or more " - "input rows). Rejected related input data.", - total_rejected, total_rejected))); + (errmsg("Found %d data formatting errors (%d or more " + "input rows). 
Rejected related input data.", + total_rejected, total_rejected))); } } @@ -295,15 +296,16 @@ static void sendnumrows_internal(int numrejected, int numcompleted) { StringInfoData buf; - + if (Gp_role != GP_ROLE_EXECUTE) elog(FATAL, "SendNumRows: called outside of execute context."); pq_beginmessage(&buf, 'j'); /* 'j' is the msg code for rejected records */ pq_sendint(&buf, numrejected, 4); - if (numcompleted > 0) /* optional send completed num for COPY FROM ON SEGMENT */ + if (numcompleted > 0) /* optional send completed num for COPY FROM + * ON SEGMENT */ pq_sendint(&buf, numcompleted, 4); - pq_endmessage(&buf); + pq_endmessage(&buf); } /* @@ -315,7 +317,7 @@ sendnumrows_internal(int numrejected, int numcompleted) void SendNumRowsRejected(int numrejected) { - sendnumrows_internal(numrejected, 0); + sendnumrows_internal(numrejected, 0); } /* @@ -327,7 +329,7 @@ SendNumRowsRejected(int numrejected) void SendNumRows(int numrejected, int numcompleted) { - sendnumrows_internal(numrejected, numcompleted); + sendnumrows_internal(numrejected, numcompleted); } /* Identify the reject limit type */ @@ -341,11 +343,11 @@ GetRejectLimitCode(CdbSreh *cdbsreh) return REJECT_FIRST_BAD_LIMIT; /* special case: check for un-parsable csv format errors */ - if(CSV_IS_UNPARSABLE(cdbsreh)) + if (CSV_IS_UNPARSABLE(cdbsreh)) return REJECT_UNPARSABLE_CSV; /* now check if actual reject limit is reached */ - if(cdbsreh->is_limit_in_rows) + if (cdbsreh->is_limit_in_rows) { /* limit is in ROWS */ if (cdbsreh->rejectcount >= cdbsreh->rejectlimit) @@ -354,11 +356,11 @@ GetRejectLimitCode(CdbSreh *cdbsreh) else { /* limit is in PERCENT */ - + /* calculate the percent only if threshold is satisfied */ - if(cdbsreh->processed > gp_reject_percent_threshold) + if (cdbsreh->processed > gp_reject_percent_threshold) { - if( (cdbsreh->rejectcount * 100) / cdbsreh->processed >= cdbsreh->rejectlimit) + if ((cdbsreh->rejectcount * 100) / cdbsreh->processed >= cdbsreh->rejectlimit) code = REJECT_LIMIT_REACHED; } } @@ -373,7 +375,7 @@ GetRejectLimitCode(CdbSreh *cdbsreh) void ErrorIfRejectLimitReached(CdbSreh *cdbsreh, CdbCopy *cdbCopy) { - RejectLimitCode code; + RejectLimitCode code; code = GetRejectLimitCode(cdbsreh); @@ -458,13 +460,14 @@ IsRejectLimitReached(CdbSreh *cdbsreh) * Return the next sequential segment id of available segids (roundrobin). */ static -int GetNextSegid(CdbSreh *cdbsreh) +int +GetNextSegid(CdbSreh *cdbsreh) { - int total_segs = cdbsreh->cdbcopy->total_segs; - - if(cdbsreh->lastsegid == total_segs) + int total_segs = cdbsreh->cdbcopy->total_segs; + + if (cdbsreh->lastsegid == total_segs) cdbsreh->lastsegid = 0; /* start over from first segid */ - + return (cdbsreh->lastsegid++ % total_segs); } @@ -486,10 +489,11 @@ int GetNextSegid(CdbSreh *cdbsreh) * NOTE: code is copied from esc_dec_len() in encode.c and slightly modified. */ static -void PreprocessByteaData(char *src) +void +PreprocessByteaData(char *src) { const char *end = src + strlen(src); - + while (src < end) { if (src[0] != '\\') @@ -520,23 +524,24 @@ void PreprocessByteaData(char *src) */ src[0] = ' '; src++; - } + } } - + } /* * IsRejectLimitValid - * + * * verify that the the reject limit specified by the user is within the * allowed values for ROWS or PERCENT. 
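 *
 * For example, a limit given in ROWS must be at least 2, so a declaration
 * such as SEGMENT REJECT LIMIT 1 ROWS is rejected here with an error.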
*/ -void VerifyRejectLimit(char rejectlimittype, int rejectlimit) +void +VerifyRejectLimit(char rejectlimittype, int rejectlimit) { - if(rejectlimittype == 'r') + if (rejectlimittype == 'r') { /* ROWS */ - if(rejectlimit < 2) + if (rejectlimit < 2) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("Segment reject limit in ROWS " @@ -594,10 +599,7 @@ ErrorLogWrite(CdbSreh *cdbsreh) ereport(ERROR, (errmsg("could not open \"%s\": %m", filename))); /* - * format: - * 0-4: length - * 5-8: crc - * 9-n: tuple data + * format: 0-4: length 5-8: crc 9-n: tuple data */ if (fwrite(&tuple->t_len, 1, sizeof(tuple->t_len), fp) != sizeof(tuple->t_len)) elog(ERROR, "could not write tuple length: %m"); @@ -630,8 +632,8 @@ ErrorLogRead(FILE *fp, pg_crc32 *crc) break; /* - * The tuple is "in-memory" format of HeapTuple. Allocate - * the whole chunk consecutively. + * The tuple is "in-memory" format of HeapTuple. Allocate the whole + * chunk consecutively. */ tuple = palloc(HEAPTUPLESIZE + t_len); tuple->t_len = t_len; @@ -650,7 +652,7 @@ ErrorLogRead(FILE *fp, pg_crc32 *crc) tuple = NULL; break; } - } while(0); + } while (0); LWLockRelease(ErrorLogLock); @@ -672,8 +674,8 @@ ResultToDatum(PGresult *result, int row, AttrNumber attnum, PGFunction func, boo { *isnull = false; return DirectFunctionCall3(func, - CStringGetDatum(PQgetvalue(result, row, attnum)), - ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)); + CStringGetDatum(PQgetvalue(result, row, attnum)), + ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)); } } @@ -691,10 +693,10 @@ gp_read_error_log(PG_FUNCTION_ARGS) Datum result; /* - * This function is marked as EXECUTE ON ALL SEGMENTS, so we - * should not get here in the dispatcher. + * This function is marked as EXECUTE ON ALL SEGMENTS, so we should not + * get here in the dispatcher. */ - Assert (Gp_role != GP_ROLE_DISPATCH); + Assert(Gp_role != GP_ROLE_DISPATCH); /* * First call setup @@ -748,7 +750,9 @@ gp_read_error_log(PG_FUNCTION_ARGS) */ if (context->fp) { - pg_crc32 crc, written_crc; + pg_crc32 crc, + written_crc; + tuple = ErrorLogRead(context->fp, &written_crc); /* @@ -763,20 +767,20 @@ gp_read_error_log(PG_FUNCTION_ARGS) if (!EQ_CRC32C(crc, written_crc)) { elog(LOG, "incorrect checksum in error log %s", - context->filename); + context->filename); tuple = NULL; } } /* - * If we found a valid tuple, return it. Otherwise, fall through - * in the DONE routine. + * If we found a valid tuple, return it. Otherwise, fall through in + * the DONE routine. */ if (HeapTupleIsValid(tuple)) { /* - * We need to set typmod for the executor to understand - * its type we just blessed. + * We need to set typmod for the executor to understand its type + * we just blessed. */ HeapTupleHeaderSetTypMod(tuple->t_data, funcctx->tuple_desc->tdtypmod); @@ -811,19 +815,19 @@ ErrorLogDelete(Oid databaseId, Oid relationId) if (!OidIsValid(relationId)) { - DIR *dir; - struct dirent *de; - char *dirpath = ErrorLogDir; - char prefix[MAXPGPATH]; - int len; + DIR *dir; + struct dirent *de; + char *dirpath = ErrorLogDir; + char prefix[MAXPGPATH]; + int len; if (OidIsValid(databaseId)) snprintf(prefix, sizeof(prefix), "%u_", databaseId); dir = AllocateDir(dirpath); /* - * If we cannot open the directory, most likely it does not exist. - * Do nothing. + * If we cannot open the directory, most likely it does not exist. Do + * nothing. 
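+	 *
+	 * Error log files are named "<database_oid>_<relation_oid>", so when
+	 * only a database OID is given we match directory entries against the
+	 * "%u_" prefix built above and delete each matching file.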
*/ if (dir == NULL) return false; @@ -844,9 +848,9 @@ ErrorLogDelete(Oid databaseId, Oid relationId) if (len >= (MAXPGPATH - 1)) { ereport(WARNING, - (errcode(ERRCODE_GP_INTERNAL_ERROR), - (errmsg("log filename truncation on \"%s\", unable to delete error log", - de->d_name)))); + (errcode(ERRCODE_GP_INTERNAL_ERROR), + (errmsg("log filename truncation on \"%s\", unable to delete error log", + de->d_name)))); continue; } LWLockAcquire(ErrorLogLock, LW_EXCLUSIVE); @@ -860,11 +864,13 @@ ErrorLogDelete(Oid databaseId, Oid relationId) */ if (strncmp(de->d_name, prefix, strlen(prefix)) == 0) { - int res; - Oid dummyDbId, relid; + int res; + Oid dummyDbId, + relid; res = sscanf(de->d_name, "%u_%u", &dummyDbId, &relid); Assert(dummyDbId == databaseId); + /* * Recursively delete the file. */ @@ -896,8 +902,8 @@ gp_truncate_error_log(PG_FUNCTION_ARGS) { text *relname; char *relname_str; - RangeVar *relrv; - Oid relid; + RangeVar *relrv; + Oid relid; bool allResults = true; relname = PG_GETARG_TEXT_P(0); @@ -911,7 +917,7 @@ gp_truncate_error_log(PG_FUNCTION_ARGS) if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to delete all error log files")))); + (errmsg("must be superuser to delete all error log files")))); ErrorLogDelete(InvalidOid, InvalidOid); } @@ -954,7 +960,7 @@ gp_truncate_error_log(PG_FUNCTION_ARGS) if (Gp_role == GP_ROLE_DISPATCH) { int i = 0; - StringInfoData sql; + StringInfoData sql; CdbPgResults cdb_pgresults = {NULL, 0}; initStringInfo(&sql); diff --git a/src/backend/cdb/cdbsrlz.c b/src/backend/cdb/cdbsrlz.c index 4e6e474a17..ad4f25d83c 100644 --- a/src/backend/cdb/cdbsrlz.c +++ b/src/backend/cdb/cdbsrlz.c @@ -36,9 +36,9 @@ static char *uncompress_string(const char *src, int size, int *uncompressed_len) char * serializeNode(Node *node, int *size, int *uncompressed_size_out) { - char *pszNode; - char *sNode; - int uncompressed_size; + char *pszNode; + char *sNode; + int uncompressed_size; Assert(node != NULL); Assert(size != NULL); @@ -46,7 +46,7 @@ serializeNode(Node *node, int *size, int *uncompressed_size_out) { pszNode = nodeToBinaryStringFast(node, &uncompressed_size); Assert(pszNode != NULL); - + if (NULL != uncompressed_size_out) { *uncompressed_size_out = uncompressed_size; @@ -67,9 +67,9 @@ serializeNode(Node *node, int *size, int *uncompressed_size_out) Node * deserializeNode(const char *strNode, int size) { - char *sNode; - Node *node; - int uncompressed_len; + char *sNode; + Node *node; + int uncompressed_len; Assert(strNode != NULL); @@ -80,7 +80,7 @@ deserializeNode(const char *strNode, int size) Assert(sNode != NULL); node = readNodeFromBinaryString(sNode, uncompressed_len); - + pfree(sNode); } END_MEMORY_ACCOUNT(); @@ -96,33 +96,34 @@ deserializeNode(const char *strNode, int size) static char * compress_string(const char *src, int uncompressed_size, int *size) { - int level = 3; + int level = 3; unsigned long compressed_size; - int status; + int status; - Bytef *result; + Bytef *result; Assert(size != NULL); - + if (src == NULL) { *size = 0; return NULL; } - - compressed_size = gp_compressBound(uncompressed_size); /* worst case */ - + + compressed_size = gp_compressBound(uncompressed_size); /* worst case */ + result = palloc(compressed_size + sizeof(int)); - memcpy(result, &uncompressed_size, sizeof(int)); /* save the original length */ - - status = gp_compress2(result + sizeof(int), &compressed_size, (Bytef *)src, uncompressed_size, level); + memcpy(result, &uncompressed_size, sizeof(int)); /* save the original + 
* length */ + + status = gp_compress2(result + sizeof(int), &compressed_size, (Bytef *) src, uncompressed_size, level); if (status != Z_OK) - elog(ERROR,"Compression failed: %s (errno=%d) uncompressed len %d, compressed %d", - zError(status), status, uncompressed_size, (int)compressed_size); - + elog(ERROR, "Compression failed: %s (errno=%d) uncompressed len %d, compressed %d", + zError(status), status, uncompressed_size, (int) compressed_size); + *size = compressed_size + sizeof(int); - return (char *)result; + return (char *) result; } /* @@ -131,25 +132,26 @@ compress_string(const char *src, int uncompressed_size, int *size) static char * uncompress_string(const char *src, int size, int *uncompressed_len) { - Bytef *result; + Bytef *result; unsigned long resultlen; - int status; + int status; + *uncompressed_len = 0; - + if (src == NULL) return NULL; - + Assert(size >= sizeof(int)); - + memcpy(uncompressed_len, src, sizeof(int)); - + resultlen = *uncompressed_len; result = palloc(resultlen); - - status = gp_uncompress(result, &resultlen, (Bytef *)(src + sizeof(int)), size - sizeof(int)); + + status = gp_uncompress(result, &resultlen, (Bytef *) (src + sizeof(int)), size - sizeof(int)); if (status != Z_OK) - elog(ERROR,"Uncompress failed: %s (errno=%d compressed len %d, uncompressed %d)", + elog(ERROR, "Uncompress failed: %s (errno=%d compressed len %d, uncompressed %d)", zError(status), status, size, *uncompressed_len); - - return (char *)result; + + return (char *) result; } diff --git a/src/backend/cdb/partitionselection.c b/src/backend/cdb/partitionselection.c index f808a885bc..11b7a30dc4 100644 --- a/src/backend/cdb/partitionselection.c +++ b/src/backend/cdb/partitionselection.c @@ -30,7 +30,7 @@ typedef struct AttrMapContext { const AttrNumber *newattno; /* The mapping table to remap the varattno */ - Index varno; /* Which rte's varattno to re-map */ + Index varno; /* Which rte's varattno to re-map */ } AttrMapContext; static bool change_varattnos_varno_walker(Node *node, const AttrMapContext *attrMapCxt); @@ -46,13 +46,15 @@ static bool change_varattnos_varno_walker(Node *node, const AttrMapContext *attr static int32 eval_propagation_expression(PartitionSelectorState *node, Oid part_oid) { - ExprState *propagationExprState = node->propagationExprState; + ExprState *propagationExprState = node->propagationExprState; ExprContext *econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); - bool isNull = false; + bool isNull = false; ExprDoneCond isDone = ExprSingleResult; - Datum result = ExecEvalExpr(propagationExprState, econtext, &isNull, &isDone); + Datum result = ExecEvalExpr(propagationExprState, econtext, &isNull, &isDone); + return DatumGetInt32(result); } @@ -72,13 +74,14 @@ eval_part_qual(ExprState *exprstate, PartitionSelectorState *node, TupleTableSlo { /* evaluate generalPredicate */ ExprContext *econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); econtext->ecxt_outertuple = inputTuple; econtext->ecxt_scantuple = inputTuple; - List *qualList = list_make1(exprstate); + List *qualList = list_make1(exprstate); - return ExecQual(qualList, econtext, false /* result is not for null */); + return ExecQual(qualList, econtext, false /* result is not for null */ ); } /* ---------------------------------------------------------------- @@ -98,24 +101,27 @@ eval_part_qual(ExprState *exprstate, PartitionSelectorState *node, TupleTableSlo * * ---------------------------------------------------------------- */ -static PartitionRule* +static PartitionRule * 
partition_selection(PartitionNode *pn, PartitionAccessMethods *accessMethods, Oid root_oid, Datum value, Oid exprTypid, bool isNull) { - Assert (NULL != pn); - Assert (NULL != accessMethods); - Partition *part = pn->part; + Assert(NULL != pn); + Assert(NULL != accessMethods); + Partition *part = pn->part; + + Assert(1 == part->parnatts); + AttrNumber partAttno = part->paratts[0]; - Assert (1 == part->parnatts); - AttrNumber partAttno = part->paratts[0]; - Assert (0 < partAttno); + Assert(0 < partAttno); + + Relation rel = relation_open(root_oid, NoLock); + TupleDesc tupDesc = RelationGetDescr(rel); - Relation rel = relation_open(root_oid, NoLock); - TupleDesc tupDesc = RelationGetDescr(rel); Assert(tupDesc->natts >= partAttno); int i; Datum *values = palloc0(partAttno * sizeof(Datum)); bool *isnull = palloc(partAttno * sizeof(bool)); + for (i = 0; i < partAttno - 1; i++) isnull[i] = true; isnull[partAttno - 1] = isNull; @@ -140,21 +146,27 @@ partition_selection(PartitionNode *pn, PartitionAccessMethods *accessMethods, Oi */ static List * partition_rules_for_general_predicate(PartitionSelectorState *node, int level, - TupleTableSlot *inputTuple, PartitionNode *parentNode) + TupleTableSlot *inputTuple, PartitionNode *parentNode) { - Assert (NULL != node); - Assert (NULL != parentNode); + Assert(NULL != node); + Assert(NULL != parentNode); + + List *result = NIL; + ListCell *lc = NULL; - List *result = NIL; - ListCell *lc = NULL; - foreach (lc, parentNode->rules) + foreach(lc, parentNode->rules) { PartitionRule *rule = (PartitionRule *) lfirst(lc); - /* We need to register it to allLevelParts to evaluate the current predicate */ + + /* + * We need to register it to allLevelParts to evaluate the current + * predicate + */ node->levelPartRules[level] = rule; /* evaluate generalPredicate */ - ExprState *exprstate = (ExprState *) lfirst(list_nth_cell(node->levelExprStates, level)); + ExprState *exprstate = (ExprState *) lfirst(list_nth_cell(node->levelExprStates, level)); + if (eval_part_qual(exprstate, node, inputTuple)) { result = lappend(result, rule); @@ -163,11 +175,15 @@ partition_rules_for_general_predicate(PartitionSelectorState *node, int level, if (parentNode->default_part) { - /* We need to register it to allLevelParts to evaluate the current predicate */ + /* + * We need to register it to allLevelParts to evaluate the current + * predicate + */ node->levelPartRules[level] = parentNode->default_part; /* evaluate generalPredicate */ - ExprState *exprstate = (ExprState *) lfirst(list_nth_cell(node->levelExprStates, level)); + ExprState *exprstate = (ExprState *) lfirst(list_nth_cell(node->levelExprStates, level)); + if (eval_part_qual(exprstate, node, inputTuple)) { result = lappend(result, parentNode->default_part); @@ -189,32 +205,35 @@ partition_rules_for_general_predicate(PartitionSelectorState *node, int level, */ static PartitionRule * partition_rules_for_equality_predicate(PartitionSelectorState *node, int level, - TupleTableSlot *inputTuple, PartitionNode *parentNode) + TupleTableSlot *inputTuple, PartitionNode *parentNode) { - Assert (NULL != node); - Assert (NULL != node->ps.plan); - Assert (NULL != parentNode); + Assert(NULL != node); + Assert(NULL != node->ps.plan); + Assert(NULL != parentNode); PartitionSelector *ps = (PartitionSelector *) node->ps.plan; - Assert (level < ps->nLevels); + + Assert(level < ps->nLevels); /* evaluate equalityPredicate to get partition identifier value */ - ExprState *exprState = (ExprState *) lfirst(list_nth_cell(node->levelEqExprStates, 
level)); + ExprState *exprState = (ExprState *) lfirst(list_nth_cell(node->levelEqExprStates, level)); ExprContext *econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); econtext->ecxt_outertuple = inputTuple; econtext->ecxt_scantuple = inputTuple; - bool isNull = false; + bool isNull = false; ExprDoneCond isDone = ExprSingleResult; - Datum value = ExecEvalExpr(exprState, econtext, &isNull, &isDone); + Datum value = ExecEvalExpr(exprState, econtext, &isNull, &isDone); /* - * Compute the type of the expression result. Sometimes this can be different - * than the type of the partition rules (MPP-25707), and we'll need this type - * to choose the correct comparator. + * Compute the type of the expression result. Sometimes this can be + * different than the type of the partition rules (MPP-25707), and we'll + * need this type to choose the correct comparator. */ - Oid exprTypid = exprType((Node *) exprState->expr); + Oid exprTypid = exprType((Node *) exprState->expr); + return partition_selection(parentNode, node->accessMethods, ps->relid, value, exprTypid, isNull); } @@ -245,34 +264,38 @@ SelectedParts * processLevel(PartitionSelectorState *node, int level, TupleTableSlot *inputTuple) { SelectedParts *selparts = makeNode(SelectedParts); + selparts->partOids = NIL; selparts->scanIds = NIL; - Assert (NULL != node->ps.plan); + Assert(NULL != node->ps.plan); PartitionSelector *ps = (PartitionSelector *) node->ps.plan; - Assert (level < ps->nLevels); + + Assert(level < ps->nLevels); /* get equality and general predicate for the current level */ - Expr *equalityPredicate = (Expr *) lfirst(list_nth_cell(ps->levelEqExpressions, level)); - Expr *generalPredicate = (Expr *) lfirst(list_nth_cell(ps->levelExpressions, level)); + Expr *equalityPredicate = (Expr *) lfirst(list_nth_cell(ps->levelEqExpressions, level)); + Expr *generalPredicate = (Expr *) lfirst(list_nth_cell(ps->levelExpressions, level)); /* get parent PartitionNode if in level 0, it's the root PartitionNode */ PartitionNode *parentNode = node->rootPartitionNode; + if (0 != level) { - Assert (NULL != node->levelPartRules[level - 1]); + Assert(NULL != node->levelPartRules[level - 1]); parentNode = node->levelPartRules[level - 1]->children; } /* list of PartitionRule that satisfied the predicates */ - List *satisfiedRules = NIL; + List *satisfiedRules = NIL; /* If equalityPredicate exists */ if (NULL != equalityPredicate) { - Assert (NULL == generalPredicate); + Assert(NULL == generalPredicate); PartitionRule *chosenRule = partition_rules_for_equality_predicate(node, level, inputTuple, parentNode); + if (chosenRule != NULL) { satisfiedRules = lappend(satisfiedRules, chosenRule); @@ -281,27 +304,30 @@ processLevel(PartitionSelectorState *node, int level, TupleTableSlot *inputTuple /* If generalPredicate exists */ else if (NULL != generalPredicate) { - List *chosenRules = partition_rules_for_general_predicate(node, level, inputTuple, parentNode); + List *chosenRules = partition_rules_for_general_predicate(node, level, inputTuple, parentNode); + satisfiedRules = list_concat(satisfiedRules, chosenRules); } /* None of the predicate exists */ else { /* - * Neither equality predicate nor general predicate - * exists. Return all the next level PartitionRule. + * Neither equality predicate nor general predicate exists. Return all + * the next level PartitionRule. * - * WARNING: Do NOT use list_concat with satisfiedRules - * and parentNode->rules. 
list_concat will destructively modify - * satisfiedRules to point to parentNode->rules, which will - * then be freed when we free satisfiedRules. This does not - * apply when we execute partition_rules_for_general_predicate - * as it creates its own list. + * WARNING: Do NOT use list_concat with satisfiedRules and + * parentNode->rules. list_concat will destructively modify + * satisfiedRules to point to parentNode->rules, which will then be + * freed when we free satisfiedRules. This does not apply when we + * execute partition_rules_for_general_predicate as it creates its own + * list. */ - ListCell* lc = NULL; - foreach (lc, parentNode->rules) + ListCell *lc = NULL; + + foreach(lc, parentNode->rules) { PartitionRule *rule = (PartitionRule *) lfirst(lc); + satisfiedRules = lappend(satisfiedRules, rule); } @@ -311,25 +337,29 @@ processLevel(PartitionSelectorState *node, int level, TupleTableSlot *inputTuple } } - /* Based on the satisfied PartitionRules, go to next - * level or propagate PartOids if we are in the leaf level + /* + * Based on the satisfied PartitionRules, go to next level or propagate + * PartOids if we are in the leaf level */ - ListCell* lc = NULL; - foreach (lc, satisfiedRules) + ListCell *lc = NULL; + + foreach(lc, satisfiedRules) { PartitionRule *rule = (PartitionRule *) lfirst(lc); + node->levelPartRules[level] = rule; /* If we already in the leaf level */ if (level == ps->nLevels - 1) { - bool shouldPropagate = true; + bool shouldPropagate = true; /* if residual predicate exists */ if (NULL != ps->residualPredicate) { /* evaluate residualPredicate */ - ExprState *exprstate = node->residualPredicateExprState; + ExprState *exprstate = node->residualPredicateExprState; + shouldPropagate = eval_part_qual(exprstate, node, inputTuple); } @@ -340,16 +370,22 @@ processLevel(PartitionSelectorState *node, int level, TupleTableSlot *inputTuple if (!list_member_oid(selparts->partOids, rule->parchildrelid)) { selparts->partOids = lappend_oid(selparts->partOids, rule->parchildrelid); - int scanId = eval_propagation_expression(node, rule->parchildrelid); + int scanId = eval_propagation_expression(node, rule->parchildrelid); + selparts->scanIds = lappend_int(selparts->scanIds, scanId); } } } } - /* Recursively call this function for next level's partition elimination */ + + /* + * Recursively call this function for next level's partition + * elimination + */ else { - SelectedParts *selpartsChild = processLevel(node, level+1, inputTuple); + SelectedParts *selpartsChild = processLevel(node, level + 1, inputTuple); + selparts->partOids = list_concat(selparts->partOids, selpartsChild->partOids); selparts->scanIds = list_concat(selparts->scanIds, selpartsChild->scanIds); pfree(selpartsChild); @@ -376,38 +412,40 @@ initPartitionSelection(PartitionSelector *node, EState *estate) { /* create and initialize PartitionSelectorState structure */ PartitionSelectorState *psstate; - ListCell *lc; + ListCell *lc; psstate = makeNode(PartitionSelectorState); psstate->ps.plan = (Plan *) node; psstate->ps.state = estate; - psstate->levelPartRules = (PartitionRule**) palloc0(node->nLevels * sizeof(PartitionRule*)); + psstate->levelPartRules = (PartitionRule **) palloc0(node->nLevels * sizeof(PartitionRule *)); /* ExprContext initialization */ ExecAssignExprContext(estate, &psstate->ps); /* initialize ExprState for evaluating expressions */ - foreach (lc, node->levelEqExpressions) + foreach(lc, node->levelEqExpressions) { - Expr *eqExpr = (Expr *) lfirst(lc); + Expr *eqExpr = (Expr *) lfirst(lc); + 
psstate->levelEqExprStates = lappend(psstate->levelEqExprStates, - ExecInitExpr(eqExpr, (PlanState *) psstate)); + ExecInitExpr(eqExpr, (PlanState *) psstate)); } - foreach (lc, node->levelExpressions) + foreach(lc, node->levelExpressions) { - Expr *generalExpr = (Expr *) lfirst(lc); + Expr *generalExpr = (Expr *) lfirst(lc); + psstate->levelExprStates = lappend(psstate->levelExprStates, - ExecInitExpr(generalExpr, (PlanState *) psstate)); + ExecInitExpr(generalExpr, (PlanState *) psstate)); } psstate->residualPredicateExprState = ExecInitExpr((Expr *) node->residualPredicate, - (PlanState *) psstate); + (PlanState *) psstate); psstate->propagationExprState = ExecInitExpr((Expr *) node->propagationExpression, - (PlanState *) psstate); + (PlanState *) psstate); psstate->ps.targetlist = (List *) ExecInitExpr((Expr *) node->plan.targetlist, - (PlanState *) psstate); + (PlanState *) psstate); return psstate; } @@ -421,7 +459,7 @@ initPartitionSelection(PartitionSelector *node, EState *estate) */ void getPartitionNodeAndAccessMethod(Oid rootOid, List *partsMetadata, MemoryContext memoryContext, - PartitionNode **partsAndRules, PartitionAccessMethods **accessMethods) + PartitionNode **partsAndRules, PartitionAccessMethods **accessMethods) { Assert(NULL != partsMetadata); findPartitionMetadataEntry(partsMetadata, rootOid, partsAndRules, accessMethods); @@ -454,17 +492,17 @@ static_part_selection(PartitionSelector *ps) psstate = initPartitionSelection(ps, estate); getPartitionNodeAndAccessMethod - ( - ps->relid, - partsMetadata, - estate->es_query_cxt, - &psstate->rootPartitionNode, - &psstate->accessMethods - ); + ( + ps->relid, + partsMetadata, + estate->es_query_cxt, + &psstate->rootPartitionNode, + &psstate->accessMethods + ); MemoryContextSwitchTo(oldcxt); - selparts = processLevel(psstate, 0 /* level */, NULL /*inputSlot*/); + selparts = processLevel(psstate, 0 /* level */ , NULL /* inputSlot */ ); /* cleanup */ FreeExecutorState(estate); @@ -489,7 +527,7 @@ varattnos_map(TupleDesc old, TupleDesc new) int i, j; - bool mapRequired = false; + bool mapRequired = false; attmap = (AttrNumber *) palloc0(sizeof(AttrNumber) * old->natts); for (i = 1; i <= old->natts; i++) @@ -538,10 +576,15 @@ varattnos_map(TupleDesc old, TupleDesc new) void change_varattnos_of_a_node(Node *node, const AttrNumber *newattno) { - /* Only attempt re-mapping if re-mapping is necessary (i.e., non-null newattno map) */ + /* + * Only attempt re-mapping if re-mapping is necessary (i.e., non-null + * newattno map) + */ if (newattno) { - change_varattnos_of_a_varno(node, newattno, 1 /* varno is hard-coded to 1 (i.e., only first RTE) */); + change_varattnos_of_a_varno(node, newattno, 1 /* varno is hard-coded + * to 1 (i.e., only + * first RTE) */ ); } } -- GitLab
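A note on the GetRejectLimitCode() hunk above: with a limit in PERCENT the check is pure integer arithmetic, (rejectcount * 100) / processed >= rejectlimit, and it is only evaluated once the processed-row count exceeds gp_reject_percent_threshold. A minimal stand-alone sketch of that branch (the wrapper function here is hypothetical; the field and GUC names are taken from the hunk):

#include <stdbool.h>

/*
 * Hypothetical stand-alone rendering of the PERCENT branch of
 * GetRejectLimitCode(): below the threshold the percentage is not
 * considered meaningful, so the limit is treated as not reached.
 */
static bool
percent_limit_reached(int rejectcount, int processed,
                      int rejectlimit, int threshold)
{
    if (processed <= threshold)
        return false;

    /* Integer division, exactly as in the hunk above. */
    return (rejectcount * 100) / processed >= rejectlimit;
}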
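The ErrorLogWrite()/ErrorLogRead() hunks depend on a simple on-disk record framing, spelled out in the comment there: bytes 0-4 hold the tuple length, bytes 5-8 the CRC, and the remainder the HeapTuple data. Below is a self-contained sketch of a reader for that framing, with stdio and malloc standing in for the server's file and palloc machinery; CRC verification is left to the caller, just as ErrorLogRead() leaves it to gp_read_error_log():

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * Read one record framed as [uint32 length][uint32 crc][length bytes].
 * Returns malloc'd tuple data, or NULL on EOF or truncation; the
 * caller checks *crc_out against the payload and frees the result.
 */
static void *
read_error_log_record(FILE *fp, uint32_t *len_out, uint32_t *crc_out)
{
    uint32_t    t_len;
    void       *data;

    if (fread(&t_len, sizeof(t_len), 1, fp) != 1)
        return NULL;            /* clean EOF */
    if (fread(crc_out, sizeof(*crc_out), 1, fp) != 1)
        return NULL;            /* truncated header */

    data = malloc(t_len);
    if (data == NULL || fread(data, 1, t_len, fp) != t_len)
    {
        free(data);
        return NULL;            /* truncated payload */
    }
    *len_out = t_len;
    return data;
}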
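compress_string()/uncompress_string() in the cdbsrlz.c hunks use an analogous framing for serialized plan trees: a 4-byte native int carrying the uncompressed length, followed by the zlib stream (compression level 3, as in the hunk). A sketch of both directions with stock zlib, where compress2/compressBound/uncompress stand in for the gp_compress2/gp_compressBound/gp_uncompress wrappers and malloc for palloc:

#include <string.h>
#include <stdlib.h>
#include <zlib.h>

/* Compress: [4-byte original length][zlib deflate stream]. */
static char *
frame_compress(const char *src, int uncompressed_size, int *size)
{
    uLongf      clen = compressBound((uLong) uncompressed_size);
    Bytef      *result = malloc(clen + sizeof(int));

    memcpy(result, &uncompressed_size, sizeof(int));
    if (compress2(result + sizeof(int), &clen,
                  (const Bytef *) src, (uLong) uncompressed_size, 3) != Z_OK)
    {
        free(result);
        return NULL;
    }
    *size = (int) (clen + sizeof(int));
    return (char *) result;
}

/* Uncompress: read the length prefix, then inflate the remainder. */
static char *
frame_uncompress(const char *src, int size, int *uncompressed_len)
{
    uLongf      rlen;
    Bytef      *result;

    memcpy(uncompressed_len, src, sizeof(int));
    rlen = (uLongf) *uncompressed_len;
    result = malloc(rlen);
    if (uncompress(result, &rlen,
                   (const Bytef *) (src + sizeof(int)),
                   (uLong) (size - sizeof(int))) != Z_OK)
    {
        free(result);
        return NULL;
    }
    return (char *) result;
}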
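Finally, the processLevel() changes reformat the core of partition elimination, which is easiest to read as a recursion: at each level, collect the child rules that pass that level's predicate (all of them when no predicate exists), then either report leaf partitions or descend one level. The toy below models only that control flow; the Rule struct, the predicate callbacks, and printf are simplified stand-ins for PartitionRule, the executor's ExprState evaluation, and SelectedParts:

#include <stdio.h>

#define MAX_CHILDREN 4

typedef struct Rule
{
    int         partid;         /* leaf partition id */
    int         nchildren;
    struct Rule *children[MAX_CHILDREN];
} Rule;

typedef int (*LevelPred) (const Rule *rule);

static void
process_level(Rule **rules, int nrules,
              const LevelPred *preds, int level, int nlevels)
{
    for (int i = 0; i < nrules; i++)
    {
        /* A NULL predicate selects every rule, as in processLevel(). */
        if (preds[level] && !preds[level] (rules[i]))
            continue;
        if (level == nlevels - 1)
            printf("selected partition %d\n", rules[i]->partid);
        else
            process_level(rules[i]->children, rules[i]->nchildren,
                          preds, level + 1, nlevels);
    }
}

static int
even_only(const Rule *r)
{
    return r->partid % 2 == 0;
}

int
main(void)
{
    Rule        leaf1 = {11, 0, {0}};
    Rule        leaf2 = {12, 0, {0}};
    Rule        mid = {1, 2, {&leaf1, &leaf2}};
    Rule       *roots[] = {&mid};
    LevelPred   preds[] = {NULL, even_only};

    /* Prints "selected partition 12": leaf 11 fails the level-1 predicate. */
    process_level(roots, 1, preds, 0, 2);
    return 0;
}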