diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2002-08-11 21:17:35 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2002-08-11 21:17:35 +0000 |
commit | e44beef712144316cb83d32ccf3231a1503c9655 (patch) | |
tree | 1b8cd8ecdc55866d61fcb44f4b5162303d9f023c /src/backend/commands/cluster.c | |
parent | 9bccdf17f725550e463fbc9fddf0acf2ed3a8e66 (diff) | |
download | postgresql-e44beef712144316cb83d32ccf3231a1503c9655.tar.gz postgresql-e44beef712144316cb83d32ccf3231a1503c9655.zip |
Code review of CLUSTER patch. Clean up problems with relcache getting
confused, toasted data getting lost, etc.
Diffstat (limited to 'src/backend/commands/cluster.c')
-rw-r--r-- | src/backend/commands/cluster.c | 428 |
1 files changed, 262 insertions, 166 deletions
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 9362c8f2496..7ca8e1dd329 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1,29 +1,25 @@ /*------------------------------------------------------------------------- * * cluster.c - * Paul Brown's implementation of cluster index. + * CLUSTER a table on an index. + * + * There is hardly anything left of Paul Brown's original implementation... * - * I am going to use the rename function as a model for this in the - * parser and executor, and the vacuum code as an example in this - * file. As I go - in contrast to the rest of postgres - there will - * be BUCKETS of comments. This is to allow reviewers to understand - * my (probably bogus) assumptions about the way this works. - * [pbrown '94] * * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.85 2002/08/10 21:00:34 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.86 2002/08/11 21:17:34 tgl Exp $ * *------------------------------------------------------------------------- */ - #include "postgres.h" #include "access/genam.h" #include "access/heapam.h" +#include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" #include "catalog/index.h" @@ -40,25 +36,23 @@ /* * We need one of these structs for each index in the relation to be * clustered. It's basically the data needed by index_create() so - * we can recreate the indexes after destroying the old heap. + * we can rebuild the indexes on the new heap. */ typedef struct { + Oid indexOID; char *indexName; IndexInfo *indexInfo; Oid accessMethodOID; Oid *classOID; - Oid indexOID; - bool isPrimary; } IndexAttrs; -static Oid copy_heap(Oid OIDOldHeap, const char *NewName); -static void rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex); -static List *get_indexattr_list (Oid OIDOldHeap); +static Oid make_new_heap(Oid OIDOldHeap, const char *NewName); +static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex); +static List *get_indexattr_list(Relation OldHeap); static void recreate_indexattr(Oid OIDOldHeap, List *indexes); static void swap_relfilenodes(Oid r1, Oid r2); -Relation RelationIdGetRelation(Oid relationId); /* * cluster @@ -67,17 +61,14 @@ Relation RelationIdGetRelation(Oid relationId); * swapping the relfilenodes of the new table and the old table, so * the OID of the original table is preserved. Thus we do not lose * GRANT, inheritance nor references to this table (this was a bug - * in releases thru 7.3) + * in releases thru 7.3). * - * Also create new indexes and swap the filenodes with the old indexes - * the same way we do for the relation. + * Also create new indexes and swap the filenodes with the old indexes the + * same way we do for the relation. Since we are effectively bulk-loading + * the new table, it's better to create the indexes afterwards than to fill + * them incrementally while we load the table. * - * TODO: - * maybe we can get away with AccessShareLock for the table. - * Concurrency would be much improved. Only acquire - * AccessExclusiveLock right before swapping the filenodes. - * This would allow users to CLUSTER on a regular basis, - * practically eliminating the need for auto-clustered indexes. + * Permissions checks were done already. */ void cluster(RangeVar *oldrelation, char *oldindexname) @@ -105,43 +96,60 @@ cluster(RangeVar *oldrelation, char *oldindexname) RelationGetNamespace(OldHeap)); if (!OidIsValid(OIDOldIndex)) elog(ERROR, "CLUSTER: cannot find index \"%s\" for table \"%s\"", - oldindexname, oldrelation->relname); + oldindexname, RelationGetRelationName(OldHeap)); OldIndex = index_open(OIDOldIndex); LockRelation(OldIndex, AccessExclusiveLock); /* * Check that index is in fact an index on the given relation */ - if (OldIndex->rd_index->indrelid != OIDOldHeap) + if (OldIndex->rd_index == NULL || + OldIndex->rd_index->indrelid != OIDOldHeap) elog(ERROR, "CLUSTER: \"%s\" is not an index for table \"%s\"", - oldindexname, oldrelation->relname); + RelationGetRelationName(OldIndex), + RelationGetRelationName(OldHeap)); - /* Drop relcache refcnts, but do NOT give up the locks */ - heap_close(OldHeap, NoLock); - index_close(OldIndex); + /* + * Disallow clustering system relations. This will definitely NOT work + * for shared relations (we have no way to update pg_class rows in other + * databases), nor for nailed-in-cache relations (the relfilenode values + * for those are hardwired, see relcache.c). It might work for other + * system relations, but I ain't gonna risk it. + */ + if (IsSystemRelation(OldHeap)) + elog(ERROR, "CLUSTER: cannot cluster system relation \"%s\"", + RelationGetRelationName(OldHeap)); /* Save the information of all indexes on the relation. */ - indexes = get_indexattr_list(OIDOldHeap); + indexes = get_indexattr_list(OldHeap); + + /* Drop relcache refcnts, but do NOT give up the locks */ + index_close(OldIndex); + heap_close(OldHeap, NoLock); /* - * Create the new heap with a temporary name. + * Create the new heap, using a temporary name in the same namespace + * as the existing table. NOTE: there is some risk of collision with user + * relnames. Working around this seems more trouble than it's worth; in + * particular, we can't create the new heap in a different namespace from + * the old, or we will have problems with the TEMP status of temp tables. */ - snprintf(NewHeapName, NAMEDATALEN, "temp_%u", OIDOldHeap); + snprintf(NewHeapName, NAMEDATALEN, "pg_temp_%u", OIDOldHeap); - OIDNewHeap = copy_heap(OIDOldHeap, NewHeapName); + OIDNewHeap = make_new_heap(OIDOldHeap, NewHeapName); - /* We do not need CommandCounterIncrement() because copy_heap did it. */ + /* We don't need CommandCounterIncrement() because make_new_heap did it. */ /* * Copy the heap data into the new table in the desired order. */ - rebuildheap(OIDNewHeap, OIDOldHeap, OIDOldIndex); + copy_heap_data(OIDNewHeap, OIDOldHeap, OIDOldIndex); - /* To make the new heap's data visible. */ + /* To make the new heap's data visible (probably not needed?). */ CommandCounterIncrement(); /* Swap the relfilenodes of the old and new heaps. */ - swap_relfilenodes(OIDNewHeap, OIDOldHeap); + swap_relfilenodes(OIDOldHeap, OIDNewHeap); CommandCounterIncrement(); @@ -150,21 +158,26 @@ cluster(RangeVar *oldrelation, char *oldindexname) object.objectId = OIDNewHeap; object.objectSubId = 0; - /* The relation is local to our transaction and we know nothin + /* + * The new relation is local to our transaction and we know nothing * depends on it, so DROP_RESTRICT should be OK. */ performDeletion(&object, DROP_RESTRICT); /* performDeletion does CommandCounterIncrement at end */ - /* Recreate the indexes on the relation. We do not need - * CommandCounterIncrement() because recreate_indexattr does it. - */ - recreate_indexattr(OIDOldHeap, indexes); + /* + * Recreate each index on the relation. We do not need + * CommandCounterIncrement() because recreate_indexattr does it. + */ + recreate_indexattr(OIDOldHeap, indexes); } +/* + * Create the new table that we will fill with correctly-ordered data. + */ static Oid -copy_heap(Oid OIDOldHeap, const char *NewName) +make_new_heap(Oid OIDOldHeap, const char *NewName) { TupleDesc OldHeapDesc, tupdesc; @@ -206,27 +219,29 @@ copy_heap(Oid OIDOldHeap, const char *NewName) return OIDNewHeap; } +/* + * Do the physical copying of heap data. + */ static void -rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) +copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) { - Relation LocalNewHeap, - LocalOldHeap, - LocalOldIndex; - IndexScanDesc ScanDesc; - HeapTuple LocalHeapTuple; + Relation NewHeap, + OldHeap, + OldIndex; + IndexScanDesc scan; + HeapTuple tuple; /* * Open the relations I need. Scan through the OldHeap on the OldIndex * and insert each tuple into the NewHeap. */ - LocalNewHeap = heap_open(OIDNewHeap, AccessExclusiveLock); - LocalOldHeap = heap_open(OIDOldHeap, AccessExclusiveLock); - LocalOldIndex = index_open(OIDOldIndex); + NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock); + OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock); + OldIndex = index_open(OIDOldIndex); - ScanDesc = index_beginscan(LocalOldHeap, LocalOldIndex, - SnapshotNow, 0, (ScanKey) NULL); + scan = index_beginscan(OldHeap, OldIndex, SnapshotNow, 0, (ScanKey) NULL); - while ((LocalHeapTuple = index_getnext(ScanDesc, ForwardScanDirection)) != NULL) + while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL) { /* * We must copy the tuple because heap_insert() will overwrite @@ -234,199 +249,280 @@ rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) * retrieved tuple will actually be in a disk buffer! Thus, * the source relation would get trashed, which is bad news if * we abort later on. (This was a bug in releases thru 7.0) + * + * Note that the copied tuple will have the original OID, if any, + * so this does preserve OIDs. */ - HeapTuple copiedTuple = heap_copytuple(LocalHeapTuple); + HeapTuple copiedTuple = heap_copytuple(tuple); + + simple_heap_insert(NewHeap, copiedTuple); - simple_heap_insert(LocalNewHeap, copiedTuple); heap_freetuple(copiedTuple); CHECK_FOR_INTERRUPTS(); } - index_endscan(ScanDesc); + index_endscan(scan); - index_close(LocalOldIndex); - heap_close(LocalOldHeap, NoLock); - heap_close(LocalNewHeap, NoLock); + index_close(OldIndex); + heap_close(OldHeap, NoLock); + heap_close(NewHeap, NoLock); } -/* Get the necessary info about the indexes in the relation and - * return a List of IndexAttrs. +/* + * Get the necessary info about the indexes of the relation and + * return a list of IndexAttrs structures. */ -List * -get_indexattr_list (Oid OIDOldHeap) +static List * +get_indexattr_list(Relation OldHeap) { - ScanKeyData entry; - HeapScanDesc scan; - Relation indexRelation; - HeapTuple indexTuple; List *indexes = NIL; - IndexAttrs *attrs; - HeapTuple tuple; - Form_pg_index index; - - /* Grab the index tuples by looking into RelationRelationName - * by the OID of the old heap. - */ - indexRelation = heap_openr(IndexRelationName, AccessShareLock); - ScanKeyEntryInitialize(&entry, 0, Anum_pg_index_indrelid, - F_OIDEQ, ObjectIdGetDatum(OIDOldHeap)); - scan = heap_beginscan(indexRelation, SnapshotNow, 1, &entry); - while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + List *indlist; + + /* Ask the relcache to produce a list of the indexes of the old rel */ + foreach(indlist, RelationGetIndexList(OldHeap)) { - index = (Form_pg_index) GETSTRUCT(indexTuple); + Oid indexOID = (Oid) lfirsti(indlist); + HeapTuple indexTuple; + HeapTuple classTuple; + Form_pg_index indexForm; + Form_pg_class classForm; + IndexAttrs *attrs; + + indexTuple = SearchSysCache(INDEXRELID, + ObjectIdGetDatum(indexOID), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "Cache lookup failed for index %u", indexOID); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + Assert(indexForm->indexrelid == indexOID); attrs = (IndexAttrs *) palloc(sizeof(IndexAttrs)); - attrs->indexInfo = BuildIndexInfo(index); - attrs->isPrimary = index->indisprimary; - attrs->indexOID = index->indexrelid; - - /* The opclasses are copied verbatim from the original indexes. - */ - attrs->classOID = (Oid *)palloc(sizeof(Oid) * - attrs->indexInfo->ii_NumIndexAttrs); - memcpy(attrs->classOID, index->indclass, - sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs); - - /* Name and access method of each index come from - * RelationRelationName. - */ - tuple = SearchSysCache(RELOID, - ObjectIdGetDatum(attrs->indexOID), - 0, 0, 0); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "CLUSTER: cannot find index %u", attrs->indexOID); - attrs->indexName = pstrdup(NameStr(((Form_pg_class) GETSTRUCT(tuple))->relname)); - attrs->accessMethodOID = ((Form_pg_class) GETSTRUCT(tuple))->relam; - ReleaseSysCache(tuple); + attrs->indexOID = indexOID; + attrs->indexInfo = BuildIndexInfo(indexForm); + attrs->classOID = (Oid *) + palloc(sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs); + memcpy(attrs->classOID, indexForm->indclass, + sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs); + + /* Name and access method of each index come from pg_class */ + classTuple = SearchSysCache(RELOID, + ObjectIdGetDatum(indexOID), + 0, 0, 0); + if (!HeapTupleIsValid(classTuple)) + elog(ERROR, "Cache lookup failed for index %u", indexOID); + classForm = (Form_pg_class) GETSTRUCT(classTuple); + + attrs->indexName = pstrdup(NameStr(classForm->relname)); + attrs->accessMethodOID = classForm->relam; + + ReleaseSysCache(classTuple); + ReleaseSysCache(indexTuple); /* Cons the gathered data into the list. We do not care about * ordering, and this is more efficient than append. */ - indexes=lcons((void *)attrs, indexes); + indexes = lcons(attrs, indexes); } - heap_endscan(scan); - heap_close(indexRelation, AccessShareLock); + return indexes; } -/* Create new indexes and swap the filenodes with old indexes. Then drop - * the new index (carrying the old heap along). +/* + * Create new indexes and swap the filenodes with old indexes. Then drop + * the new index (carrying the old index filenode along). */ -void +static void recreate_indexattr(Oid OIDOldHeap, List *indexes) { - IndexAttrs *attrs; List *elem; - Oid newIndexOID; - char newIndexName[NAMEDATALEN]; - ObjectAddress object; - foreach (elem, indexes) + foreach(elem, indexes) { - attrs=(IndexAttrs *) lfirst(elem); + IndexAttrs *attrs = (IndexAttrs *) lfirst(elem); + Oid newIndexOID; + char newIndexName[NAMEDATALEN]; + ObjectAddress object; /* Create the new index under a temporary name */ - snprintf(newIndexName, NAMEDATALEN, "temp_%u", attrs->indexOID); + snprintf(newIndexName, NAMEDATALEN, "pg_temp_%u", attrs->indexOID); - /* The new index will have constraint status set to false, + /* + * The new index will have primary and constraint status set to false, * but since we will only use its filenode it doesn't matter: * after the filenode swap the index will keep the constraint * status of the old index. */ newIndexOID = index_create(OIDOldHeap, newIndexName, attrs->indexInfo, attrs->accessMethodOID, - attrs->classOID, attrs->isPrimary, + attrs->classOID, false, false, allowSystemTableMods); CommandCounterIncrement(); /* Swap the filenodes. */ - swap_relfilenodes(newIndexOID, attrs->indexOID); - setRelhasindex(OIDOldHeap, true, attrs->isPrimary, InvalidOid); + swap_relfilenodes(attrs->indexOID, newIndexOID); + + CommandCounterIncrement(); /* Destroy new index with old filenode */ object.classId = RelOid_pg_class; object.objectId = newIndexOID; object.objectSubId = 0; - /* The relation is local to our transaction and we know + /* + * The relation is local to our transaction and we know * nothing depends on it, so DROP_RESTRICT should be OK. */ performDeletion(&object, DROP_RESTRICT); /* performDeletion does CommandCounterIncrement() at its end */ - - pfree(attrs->classOID); - pfree(attrs); } - freeList(indexes); } -/* Swap the relfilenodes for two given relations. +/* + * Swap the relfilenodes for two given relations. + * + * Also swap any TOAST links, so that the toast data moves along with + * the main-table data. */ -void +static void swap_relfilenodes(Oid r1, Oid r2) { - /* I can probably keep RelationRelationName open in the main - * function and pass the Relation around so I don't have to open - * it every time. - */ Relation relRelation, rel; - HeapTuple reltup[2]; - Oid tempRFNode; + HeapTuple reltup1, + reltup2; + Form_pg_class relform1, + relform2; + Oid swaptemp; int i; CatalogIndexState indstate; - /* We need both RelationRelationName tuples. */ + /* We need writable copies of both pg_class tuples. */ relRelation = heap_openr(RelationRelationName, RowExclusiveLock); - reltup[0] = SearchSysCacheCopy(RELOID, - ObjectIdGetDatum(r1), - 0, 0, 0); - if (!HeapTupleIsValid(reltup[0])) + reltup1 = SearchSysCacheCopy(RELOID, + ObjectIdGetDatum(r1), + 0, 0, 0); + if (!HeapTupleIsValid(reltup1)) elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r1); - reltup[1] = SearchSysCacheCopy(RELOID, - ObjectIdGetDatum(r2), - 0, 0, 0); - if (!HeapTupleIsValid(reltup[1])) + relform1 = (Form_pg_class) GETSTRUCT(reltup1); + + reltup2 = SearchSysCacheCopy(RELOID, + ObjectIdGetDatum(r2), + 0, 0, 0); + if (!HeapTupleIsValid(reltup2)) elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r2); + relform2 = (Form_pg_class) GETSTRUCT(reltup2); - /* The buffer manager gets confused if we swap relfilenodes for + /* + * The buffer manager gets confused if we swap relfilenodes for * relations that are not both local or non-local to this transaction. * Flush the buffers on both relations so the buffer manager can - * forget about'em. + * forget about'em. (XXX this might not be necessary anymore?) */ - - rel = RelationIdGetRelation(r1); + rel = relation_open(r1, NoLock); i = FlushRelationBuffers(rel, 0); if (i < 0) elog(ERROR, "CLUSTER: FlushRelationBuffers returned %d", i); - RelationClose(rel); - rel = RelationIdGetRelation(r1); + relation_close(rel, NoLock); + + rel = relation_open(r2, NoLock); i = FlushRelationBuffers(rel, 0); if (i < 0) elog(ERROR, "CLUSTER: FlushRelationBuffers returned %d", i); - RelationClose(rel); + relation_close(rel, NoLock); + + /* + * Actually swap the filenode and TOAST fields in the two tuples + */ + swaptemp = relform1->relfilenode; + relform1->relfilenode = relform2->relfilenode; + relform2->relfilenode = swaptemp; - /* Actually swap the filenodes */ + swaptemp = relform1->reltoastrelid; + relform1->reltoastrelid = relform2->reltoastrelid; + relform2->reltoastrelid = swaptemp; - tempRFNode = ((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode; - ((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode = - ((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode; - ((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode = tempRFNode; + /* we should not swap reltoastidxid */ - /* Update the RelationRelationName tuples */ - simple_heap_update(relRelation, &reltup[1]->t_self, reltup[1]); - simple_heap_update(relRelation, &reltup[0]->t_self, reltup[0]); + /* Update the tuples in pg_class */ + simple_heap_update(relRelation, &reltup1->t_self, reltup1); + simple_heap_update(relRelation, &reltup2->t_self, reltup2); - /* To keep system catalogs current. */ + /* Keep system catalogs current */ indstate = CatalogOpenIndexes(relRelation); - CatalogIndexInsert(indstate, reltup[1]); - CatalogIndexInsert(indstate, reltup[0]); + CatalogIndexInsert(indstate, reltup1); + CatalogIndexInsert(indstate, reltup2); CatalogCloseIndexes(indstate); - heap_close(relRelation, NoLock); - heap_freetuple(reltup[0]); - heap_freetuple(reltup[1]); + /* + * If we have toast tables associated with the relations being swapped, + * change their dependency links to re-associate them with their new + * owning relations. Otherwise the wrong one will get dropped ... + * + * NOTE: for now, we can assume the new table will have a TOAST table + * if and only if the old one does. This logic might need work if we + * get smarter about dropped columns. + * + * NOTE: at present, a TOAST table's only dependency is the one on + * its owning table. If more are ever created, we'd need to use something + * more selective than deleteDependencyRecordsFor() to get rid of only + * the link we want. + */ + if (relform1->reltoastrelid || relform2->reltoastrelid) + { + ObjectAddress baseobject, + toastobject; + long count; + + if (!(relform1->reltoastrelid && relform2->reltoastrelid)) + elog(ERROR, "CLUSTER: expected both swapped tables to have TOAST tables"); + + /* Delete old dependencies */ + count = deleteDependencyRecordsFor(RelOid_pg_class, + relform1->reltoastrelid); + if (count != 1) + elog(ERROR, "CLUSTER: expected one dependency record for TOAST table, found %ld", + count); + count = deleteDependencyRecordsFor(RelOid_pg_class, + relform2->reltoastrelid); + if (count != 1) + elog(ERROR, "CLUSTER: expected one dependency record for TOAST table, found %ld", + count); + + /* Register new dependencies */ + baseobject.classId = RelOid_pg_class; + baseobject.objectId = r1; + baseobject.objectSubId = 0; + toastobject.classId = RelOid_pg_class; + toastobject.objectId = relform1->reltoastrelid; + toastobject.objectSubId = 0; + + recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); + + baseobject.objectId = r2; + toastobject.objectId = relform2->reltoastrelid; + + recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); + } + + /* + * Blow away the old relcache entries now. We need this kluge because + * relcache.c indexes relcache entries by rd_node as well as OID. + * It will get confused if it is asked to (re)build an entry with a new + * rd_node value when there is still another entry laying about with + * that same rd_node value. (Fortunately, since one of the entries + * is local in our transaction, it's sufficient to clear out our own + * relcache this way; the problem cannot arise for other backends when + * they see our update on the non-local relation.) + */ + RelationForgetRelation(r1); + RelationForgetRelation(r2); + + /* Clean up. */ + heap_freetuple(reltup1); + heap_freetuple(reltup2); + + heap_close(relRelation, RowExclusiveLock); } |