diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 566d559c78a..4a11d0c419b 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.187 2006/11/05 22:42:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.187.2.1 2007/01/27 20:15:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,6 +40,7 @@ #include "postmaster/bgwriter.h" #include "storage/freespace.h" #include "storage/procarray.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/flatfiles.h" @@ -643,6 +644,12 @@ dropdb(const char *dbname, bool missing_ok) */ FreeSpaceMapForgetDatabase(db_id); + /* + * Tell bgwriter to forget any pending fsync requests for files in the + * database; else it'll fail at next checkpoint. + */ + ForgetDatabaseFsyncRequests(db_id); + /* * On Windows, force a checkpoint so that the bgwriter doesn't hold any * open files, which would cause rmdir() to fail. diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 9062b4f7deb..1fec1e4b11b 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -2,7 +2,7 @@ * * bgwriter.c * - * The background writer (bgwriter) is new in Postgres 8.0. It attempts + * The background writer (bgwriter) is new as of Postgres 8.0. It attempts * to keep regular backends from having to write out dirty shared buffers * (which they would only do when needing to free a shared buffer to read in * another page). In the best scenario all writes from shared buffers will @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.33 2006/12/01 19:55:28 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.33.2.1 2007/01/27 20:15:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -103,8 +103,8 @@ typedef struct { RelFileNode rnode; - BlockNumber segno; - /* might add a request-type field later */ + BlockNumber segno; /* see md.c for special values */ + /* might add a real request-type field later; not needed yet */ } BgWriterRequest; typedef struct @@ -700,6 +700,11 @@ RequestCheckpoint(bool waitforit, bool warnontime) * the backend calls this routine to pass over knowledge that the relation * is dirty and must be fsync'd before next checkpoint. * + * segno specifies which segment (not block!) of the relation needs to be + * fsync'd. (Since the valid range is much less than BlockNumber, we can + * use high values for special flags; that's all internal to md.c, which + * see for details.) + * * If we are unable to pass over the request (at present, this can happen * if the shared memory queue is full), we return false. That forces * the backend to do its own fsync. We hope that will be even more seldom. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index f58ab03ce42..71ecaab01ae 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123.2.1 2007/01/27 20:15:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -30,6 +30,23 @@ /* interval for calling AbsorbFsyncRequests in mdsync */ #define FSYNCS_PER_ABSORB 10 +/* special values for the segno arg to RememberFsyncRequest */ +#define FORGET_RELATION_FSYNC (InvalidBlockNumber) +#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) + +/* + * On Windows, we have to interpret EACCES as possibly meaning the same as + * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, + * that's what you get. Ugh. This code is designed so that we don't + * actually believe these cases are okay without further evidence (namely, + * a pending fsync request getting revoked ... see mdsync). + */ +#ifndef WIN32 +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) +#else +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) +#endif + /* * The magnetic disk storage manager keeps track of open file * descriptors in its own descriptor pool. This is done to make it @@ -92,9 +109,8 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint. This hash - * table remembers the pending operations. We use a hash table not because - * we want to look up individual operations, but simply as a convenient way - * of eliminating duplicate requests. + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of eliminating duplicate requests. * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) @@ -103,6 +119,12 @@ typedef struct { RelFileNode rnode; /* the targeted relation */ BlockNumber segno; /* which segment */ +} PendingOperationTag; + +typedef struct +{ + PendingOperationTag tag; /* hash table key (must be first!) */ + int failures; /* number of failed attempts to fsync */ } PendingOperationEntry; static HTAB *pendingOpsTable = NULL; @@ -145,7 +167,7 @@ mdinit(void) HASHCTL hash_ctl; MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(PendingOperationEntry); + hash_ctl.keysize = sizeof(PendingOperationTag); hash_ctl.entrysize = sizeof(PendingOperationEntry); hash_ctl.hash = tag_hash; hash_ctl.hcxt = MdCxt; @@ -228,6 +250,12 @@ mdunlink(RelFileNode rnode, bool isRedo) int save_errno = 0; char *path; + /* + * We have to clean out any pending fsync requests for the doomed relation, + * else the next mdsync() will fail. + */ + ForgetRelationFsyncRequests(rnode); + path = relpath(rnode); /* Delete the first segment, or only segment if not doing segmenting */ @@ -374,7 +402,7 @@ mdopen(SMgrRelation reln, bool allowNotFound) if (fd < 0) { pfree(path); - if (allowNotFound && errno == ENOENT) + if (allowNotFound && FILE_POSSIBLY_DELETED(errno)) return NULL; ereport(ERROR, (errcode_for_file_access(), @@ -727,97 +755,130 @@ mdimmedsync(SMgrRelation reln) bool mdsync(void) { - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - int absorb_counter; + bool need_retry; if (!pendingOpsTable) return false; /* - * If we are in the bgwriter, the sync had better include all fsync - * requests that were queued by backends before the checkpoint REDO point - * was determined. We go that a little better by accepting all requests - * queued up to the point where we start fsync'ing. + * The fsync table could contain requests to fsync relations that have + * been deleted (unlinked) by the time we get to them. Rather than + * just hoping an ENOENT (or EACCES on Windows) error can be ignored, + * what we will do is retry the whole process after absorbing fsync + * request messages again. Since mdunlink() queues a "revoke" message + * before actually unlinking, the fsync request is guaranteed to be gone + * the second time if it really was this case. DROP DATABASE likewise + * has to tell us to forget fsync requests before it starts deletions. */ - AbsorbFsyncRequests(); + do { + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + int absorb_counter; + + need_retry = false; - absorb_counter = FSYNCS_PER_ABSORB; - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { /* - * If fsync is off then we don't have to bother opening the file at - * all. (We delay checking until this point so that changing fsync on - * the fly behaves sensibly.) + * If we are in the bgwriter, the sync had better include all fsync + * requests that were queued by backends before the checkpoint REDO + * point was determined. We go that a little better by accepting all + * requests queued up to the point where we start fsync'ing. */ - if (enableFsync) + AbsorbFsyncRequests(); + + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { - SMgrRelation reln; - MdfdVec *seg; - /* - * If in bgwriter, absorb pending requests every so often to - * prevent overflow of the fsync request queue. The hashtable - * code does not specify whether entries added by this will be - * visited by our search, but we don't really care: it's OK if we - * do, and OK if we don't. + * If fsync is off then we don't have to bother opening the file + * at all. (We delay checking until this point so that changing + * fsync on the fly behaves sensibly.) */ - if (--absorb_counter <= 0) + if (enableFsync) { - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; - } + SMgrRelation reln; + MdfdVec *seg; - /* - * Find or create an smgr hash entry for this relation. This may - * seem a bit unclean -- md calling smgr? But it's really the - * best solution. It ensures that the open file reference isn't - * permanently leaked if we get an error here. (You may say "but - * an unreferenced SMgrRelation is still a leak!" Not really, - * because the only case in which a checkpoint is done by a - * process that isn't about to shut down is in the bgwriter, and - * it will periodically do smgrcloseall(). This fact justifies - * our not closing the reln in the success path either, which is a - * good thing since in non-bgwriter cases we couldn't safely do - * that.) Furthermore, in many cases the relation will have been - * dirtied through this same smgr relation, and so we can save a - * file open/close cycle. - */ - reln = smgropen(entry->rnode); - - /* - * It is possible that the relation has been dropped or truncated - * since the fsync request was entered. Therefore, we have to - * allow file-not-found errors. This applies both during - * _mdfd_getseg() and during FileSync, since fd.c might have - * closed the file behind our back. - */ - seg = _mdfd_getseg(reln, - entry->segno * ((BlockNumber) RELSEG_SIZE), - true); - if (seg) - { - if (FileSync(seg->mdfd_vfd) < 0 && - errno != ENOENT) + /* + * If in bgwriter, we want to absorb pending requests every so + * often to prevent overflow of the fsync request queue. This + * could result in deleting the current entry out from under + * our hashtable scan, so the procedure is to fall out of the + * scan and start over from the top of the function. + */ + if (--absorb_counter <= 0) { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not fsync segment %u of relation %u/%u/%u: %m", - entry->segno, - entry->rnode.spcNode, - entry->rnode.dbNode, - entry->rnode.relNode))); - return false; + need_retry = true; + break; + } + + /* + * Find or create an smgr hash entry for this relation. This + * may seem a bit unclean -- md calling smgr? But it's really + * the best solution. It ensures that the open file reference + * isn't permanently leaked if we get an error here. (You may + * say "but an unreferenced SMgrRelation is still a leak!" Not + * really, because the only case in which a checkpoint is done + * by a process that isn't about to shut down is in the + * bgwriter, and it will periodically do smgrcloseall(). This + * fact justifies our not closing the reln in the success path + * either, which is a good thing since in non-bgwriter cases + * we couldn't safely do that.) Furthermore, in many cases + * the relation will have been dirtied through this same smgr + * relation, and so we can save a file open/close cycle. + */ + reln = smgropen(entry->tag.rnode); + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. Therefore, + * allow ENOENT, but only if we didn't fail once already on + * this file. This applies both during _mdfd_getseg() and + * during FileSync, since fd.c might have closed the file + * behind our back. + */ + seg = _mdfd_getseg(reln, + entry->tag.segno * ((BlockNumber) RELSEG_SIZE), + true); + if (seg == NULL || + FileSync(seg->mdfd_vfd) < 0) + { + /* + * XXX is there any point in allowing more than one try? + * Don't see one at the moment, but easy to change the + * test here if so. + */ + if (!FILE_POSSIBLY_DELETED(errno) || + ++(entry->failures) > 1) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u/%u: %m", + entry->tag.segno, + entry->tag.rnode.spcNode, + entry->tag.rnode.dbNode, + entry->tag.rnode.relNode))); + return false; + } + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", + entry->tag.segno, + entry->tag.rnode.spcNode, + entry->tag.rnode.dbNode, + entry->tag.rnode.relNode))); + need_retry = true; + continue; /* don't delete the hashtable entry */ } } - } - /* Okay, delete this entry */ - if (hash_search(pendingOpsTable, entry, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "pendingOpsTable corrupted"); - } + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, &entry->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + } while (need_retry); return true; } @@ -839,14 +900,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg) { if (pendingOpsTable) { - PendingOperationEntry entry; - - /* ensure any pad bytes in the struct are zeroed */ - MemSet(&entry, 0, sizeof(entry)); - entry.rnode = reln->smgr_rnode; - entry.segno = seg->mdfd_segno; - - (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL); + /* push it into local pending-ops table */ + RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno); return true; } else @@ -865,22 +920,135 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg) * * We stuff the fsync request into the local hash table for execution * during the bgwriter's next checkpoint. + * + * The range of possible segment numbers is way less than the range of + * BlockNumber, so we can reserve high values of segno for special purposes. + * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for + * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for + * a whole database. (These are a tad slow because the hash table has to be + * searched linearly, but it doesn't seem worth rethinking the table structure + * for them.) */ void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) { - PendingOperationEntry entry; - Assert(pendingOpsTable); - /* ensure any pad bytes in the struct are zeroed */ - MemSet(&entry, 0, sizeof(entry)); - entry.rnode = rnode; - entry.segno = segno; + if (segno == FORGET_RELATION_FSYNC) + { + /* Remove any pending requests for the entire relation */ + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; - (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL); + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + if (RelFileNodeEquals(entry->tag.rnode, rnode)) + { + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, &entry->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + } + } + else if (segno == FORGET_DATABASE_FSYNC) + { + /* Remove any pending requests for the entire database */ + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + if (entry->tag.rnode.dbNode == rnode.dbNode) + { + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, &entry->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + } + } + else + { + /* Normal case: enter a request to fsync this segment */ + PendingOperationTag key; + PendingOperationEntry *entry; + bool found; + + /* ensure any pad bytes in the hash key are zeroed */ + MemSet(&key, 0, sizeof(key)); + key.rnode = rnode; + key.segno = segno; + + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &key, + HASH_ENTER, + &found); + if (!found) /* new entry, so initialize it */ + entry->failures = 0; + } } +/* + * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten + */ +void +ForgetRelationFsyncRequests(RelFileNode rnode) +{ + if (pendingOpsTable) + { + /* standalone backend or startup process: fsync state is local */ + RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC); + } + else if (IsUnderPostmaster) + { + /* + * Notify the bgwriter about it. If we fail to queue the revoke + * message, we have to sleep and try again ... ugly, but hopefully + * won't happen often. + * + * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with + * an error would leave the no-longer-used file still present on + * disk, which would be bad, so I'm inclined to assume that the + * bgwriter will always empty the queue soon. + */ + while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC)) + pg_usleep(10000L); /* 10 msec seems a good number */ + /* + * Note we don't wait for the bgwriter to actually absorb the + * revoke message; see mdsync() for the implications. + */ + } +} + +/* + * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten + */ +void +ForgetDatabaseFsyncRequests(Oid dbid) +{ + RelFileNode rnode; + + rnode.dbNode = dbid; + rnode.spcNode = 0; + rnode.relNode = 0; + + if (pendingOpsTable) + { + /* standalone backend or startup process: fsync state is local */ + RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC); + } + else if (IsUnderPostmaster) + { + /* see notes in ForgetRelationFsyncRequests */ + while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC)) + pg_usleep(10000L); /* 10 msec seems a good number */ + } +} + + /* * _fdvec_alloc() -- Make a MdfdVec object. */ @@ -984,7 +1152,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound) (segstogo == 1 || InRecovery) ? O_CREAT : 0); if (v->mdfd_chain == NULL) { - if (allowNotFound && errno == ENOENT) + if (allowNotFound && FILE_POSSIBLY_DELETED(errno)) return NULL; ereport(ERROR, (errcode_for_file_access(), diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index b768a5b5304..a28010d7916 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.55 2006/03/24 04:32:13 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.55.2.1 2007/01/27 20:15:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -107,6 +107,8 @@ extern bool mdimmedsync(SMgrRelation reln); extern bool mdsync(void); extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno); +extern void ForgetRelationFsyncRequests(RelFileNode rnode); +extern void ForgetDatabaseFsyncRequests(Oid dbid); /* smgrtype.c */ extern Datum smgrout(PG_FUNCTION_ARGS);