mirror of
https://github.com/postgres/postgres.git
synced 2025-09-02 04:21:28 +03:00
Increase width of RelFileNumbers from 32 bits to 56 bits.
RelFileNumbers are now assigned using a separate counter, instead of being assigned from the OID counter. This counter never wraps around: if all 2^56 possible RelFileNumbers are used, an internal error occurs. As the cluster is limited to 2^64 total bytes of WAL, this limitation should not cause a problem in practice.

If the counter were 64 bits wide rather than 56 bits wide, we would need to increase the width of the BufferTag, which might adversely impact buffer lookup performance. Also, this lets us use bigint for pg_class.relfilenode and other places where these values are exposed at the SQL level without worrying about overflow.

This should remove the need to keep "tombstone" files around until the next checkpoint when relations are removed. We do that to keep RelFileNumbers from being recycled, but now that won't happen anyway. However, this patch doesn't actually change anything in this area; it just makes it possible for a future patch to do so.

Dilip Kumar, based on an idea from Andres Freund, who also reviewed some earlier versions of the patch. Further review and some wordsmithing by me. Also reviewed at various points by Ashutosh Sharma, Vignesh C, Amul Sul, Álvaro Herrera, and Tom Lane.

Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
This commit is contained in:
@@ -31,7 +31,7 @@ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
|
||||
|
||||
typedef struct
|
||||
{
|
||||
Oid reloid; /* hash key */
|
||||
RelFileNumber relnumber; /* hash key */
|
||||
} unlogged_relation_entry;
|
||||
|
||||
/*
|
||||
@@ -184,10 +184,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
* need to be reset. Otherwise, this cleanup operation would be
|
||||
* O(n^2).
|
||||
*/
|
||||
ctl.keysize = sizeof(Oid);
|
||||
ctl.keysize = sizeof(RelFileNumber);
|
||||
ctl.entrysize = sizeof(unlogged_relation_entry);
|
||||
ctl.hcxt = CurrentMemoryContext;
|
||||
hash = hash_create("unlogged relation OIDs", 32, &ctl,
|
||||
hash = hash_create("unlogged relation RelFileNumbers", 32, &ctl,
|
||||
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
||||
|
||||
/* Scan the directory. */
|
||||
@@ -208,10 +208,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Put the OID portion of the name into the hash table, if it
|
||||
* isn't already.
|
||||
* Put the RELFILENUMBER portion of the name into the hash table,
|
||||
* if it isn't already.
|
||||
*/
|
||||
ent.reloid = atooid(de->d_name);
|
||||
ent.relnumber = atorelnumber(de->d_name);
|
||||
(void) hash_search(hash, &ent, HASH_ENTER, NULL);
|
||||
}
|
||||
|
||||
@@ -248,10 +248,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* See whether the OID portion of the name shows up in the hash
|
||||
* table. If so, nuke it!
|
||||
* See whether the RELFILENUMBER portion of the name shows up in
|
||||
* the hash table. If so, nuke it!
|
||||
*/
|
||||
ent.reloid = atooid(de->d_name);
|
||||
ent.relnumber = atorelnumber(de->d_name);
|
||||
if (hash_search(hash, &ent, HASH_FIND, NULL))
|
||||
{
|
||||
snprintf(rm_path, sizeof(rm_path), "%s/%s",
|
||||
@@ -286,7 +286,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
{
|
||||
ForkNumber forkNum;
|
||||
int relnumchars;
|
||||
char relnumbuf[OIDCHARS + 1];
|
||||
char relnumbuf[RELNUMBERCHARS + 1];
|
||||
char srcpath[MAXPGPATH * 2];
|
||||
char dstpath[MAXPGPATH];
|
||||
|
||||
@@ -329,7 +329,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
{
|
||||
ForkNumber forkNum;
|
||||
int relnumchars;
|
||||
char relnumbuf[OIDCHARS + 1];
|
||||
char relnumbuf[RELNUMBERCHARS + 1];
|
||||
char mainpath[MAXPGPATH];
|
||||
|
||||
/* Skip anything that doesn't look like a relation data file. */
|
||||
@@ -372,8 +372,8 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
|
||||
* for a non-temporary relation and false otherwise.
|
||||
*
|
||||
* NB: If this function returns true, the caller is entitled to assume that
|
||||
* *relnumchars has been set to a value no more than OIDCHARS, and thus
|
||||
* that a buffer of OIDCHARS+1 characters is sufficient to hold the
|
||||
* *relnumchars has been set to a value no more than RELNUMBERCHARS, and thus
|
||||
* that a buffer of RELNUMBERCHARS+1 characters is sufficient to hold the
|
||||
* RelFileNumber portion of the filename. This is critical to protect against
|
||||
* a possible buffer overrun.
|
||||
*/
|
||||
@@ -386,7 +386,7 @@ parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
|
||||
/* Look for a non-empty string of digits (that isn't too long). */
|
||||
for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
|
||||
;
|
||||
if (pos == 0 || pos > OIDCHARS)
|
||||
if (pos == 0 || pos > RELNUMBERCHARS)
|
||||
return false;
|
||||
*relnumchars = pos;
|
||||
|
||||
|
@@ -273,7 +273,7 @@ restart:
|
||||
BlockNumber blknum;
|
||||
|
||||
BufferGetTag(buf, &rlocator, &forknum, &blknum);
|
||||
elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
|
||||
elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/" UINT64_FORMAT,
|
||||
blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber);
|
||||
|
||||
/* make sure we hold an exclusive lock */
|
||||
|
@@ -53,3 +53,4 @@ XactTruncationLock 44
|
||||
# 45 was XactTruncationLock until removal of BackendRandomLock
|
||||
WrapLimitsVacuumLock 46
|
||||
NotifyQueueTailLock 47
|
||||
RelFileNumberGenLock 48
|
@@ -257,6 +257,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
* next checkpoint, we prevent reassignment of the relfilenumber until it's
|
||||
* safe, because relfilenumber assignment skips over any existing file.
|
||||
*
|
||||
* XXX. Although all of this was true when relfilenumbers were 32 bits wide,
|
||||
* they are now 56 bits wide and do not wrap around, so in the future we can
|
||||
* change the code to immediately unlink the first segment of the relation
|
||||
* along with all the others. We still do reuse relfilenumbers when createdb()
|
||||
* is performed using the file-copy method or during movedb(), but the scenario
|
||||
* described above can only happen when creating a new relation.
|
||||
*
|
||||
* We do not need to go through this dance for temp relations, though, because
|
||||
* we never make WAL entries for temp rels, and so a temp rel poses no threat
|
||||
* to the health of a regular rel that has taken over its relfilenumber.
|
||||
|
@@ -154,7 +154,7 @@ smgropen(RelFileLocator rlocator, BackendId backend)
|
||||
/* First time through: initialize the hash table */
|
||||
HASHCTL ctl;
|
||||
|
||||
ctl.keysize = sizeof(RelFileLocatorBackend);
|
||||
ctl.keysize = SizeOfRelFileLocatorBackend;
|
||||
ctl.entrysize = sizeof(SMgrRelationData);
|
||||
SMgrRelationHash = hash_create("smgr relation table", 400,
|
||||
&ctl, HASH_ELEM | HASH_BLOBS);
|
||||
|
Reference in New Issue
Block a user