1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00

Per previous discussions, get rid of use of sync(2) in favor of

explicitly fsync'ing every (non-temp) file we have written since the
last checkpoint.  In the vast majority of cases, the burden of the
fsyncs should fall on the bgwriter process not on backends.  (To this
end, we assume that an fsync issued by the bgwriter will force out
blocks written to the same file by other processes using other file
descriptors.  Anyone have a problem with that?)  This makes the world
safe for WIN32, which ain't even got sync(2), and really makes the world
safe for Unixen as well, because sync(2) never had the semantics we need:
it offers no way to wait for the requested I/O to finish.

Along the way, fix a bug I recently introduced in xlog recovery:
file truncation replay failed to clear bufmgr buffers for the dropped
blocks, which could result in 'PANIC:  heap_delete_redo: no block'
later on in xlog replay.
This commit is contained in:
Tom Lane
2004-05-31 03:48:10 +00:00
parent f024086db3
commit 9b178555fc
13 changed files with 779 additions and 250 deletions

View File

@@ -27,14 +27,17 @@
*
* If the bgwriter exits unexpectedly, the postmaster treats that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
* should be killed by SIGQUIT and then a recovery cycle started.
* should be killed by SIGQUIT and then a recovery cycle started. (Even if
* shared memory isn't corrupted, we have lost information about which
* files need to be fsync'd for the next checkpoint, and so a system
* restart needs to be forced.)
*
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -55,13 +58,54 @@
#include "utils/guc.h"
/*
/*----------
* Shared memory area for communication between bgwriter and backends
*
* The ckpt counters allow backends to watch for completion of a checkpoint
* request they send. Here's how it works:
* * At start of a checkpoint, bgwriter increments ckpt_started.
* * On completion of a checkpoint, bgwriter sets ckpt_done to
* equal ckpt_started.
* * On failure of a checkpoint, bgwrite first increments ckpt_failed,
* then sets ckpt_done to equal ckpt_started.
* All three fields are declared sig_atomic_t to ensure they can be read
* and written without explicit locking. The algorithm for backends is:
* 1. Record current values of ckpt_failed and ckpt_started (in that
* order!).
* 2. Send signal to request checkpoint.
* 3. Sleep until ckpt_started changes. Now you know a checkpoint has
* begun since you started this algorithm (although *not* that it was
* specifically initiated by your signal).
* 4. Record new value of ckpt_started.
* 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo
* arithmetic here in case counters wrap around.) Now you know a
* checkpoint has started and completed, but not whether it was
* successful.
* 6. If ckpt_failed is different from the originally saved value,
* assume request failed; otherwise it was definitely successful.
*
* The requests array holds fsync requests sent by backends and not yet
* absorbed by the bgwriter.
*----------
*/
typedef struct
{
RelFileNode rnode;
BlockNumber segno;
/* might add a request-type field later */
} BgWriterRequest;
typedef struct
{
pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */
sig_atomic_t checkpoint_count; /* advances when checkpoint done */
sig_atomic_t ckpt_started; /* advances when checkpoint starts */
sig_atomic_t ckpt_done; /* advances when checkpoint done */
sig_atomic_t ckpt_failed; /* advances when checkpoint fails */
int num_requests; /* current # of requests */
int max_requests; /* allocated array size */
BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */
} BgWriterShmemStruct;
static BgWriterShmemStruct *BgWriterShmem;
@@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false;
/*
* Private state
*/
static bool am_bg_writer = false;
static bool ckpt_active = false;
static time_t last_checkpoint_time;
@@ -106,6 +154,7 @@ BackgroundWriterMain(void)
{
Assert(BgWriterShmem != NULL);
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
/*
* Properly accept or ignore signals the postmaster might send us
@@ -180,6 +229,17 @@ BackgroundWriterMain(void)
*/
InError = false;
/* Warn any waiting backends that the checkpoint failed. */
if (ckpt_active)
{
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
bgs->ckpt_failed++;
bgs->ckpt_done = bgs->ckpt_started;
ckpt_active = false;
}
/*
* Exit interrupt holdoff section we implicitly established above.
*/
@@ -214,8 +274,17 @@ BackgroundWriterMain(void)
long udelay;
/*
* Process any signals received recently.
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
*/
if (!PostmasterIsAlive(true))
exit(1);
/*
* Process any requests or signals received recently.
*/
AbsorbFsyncRequests();
if (got_SIGHUP)
{
got_SIGHUP = false;
@@ -265,8 +334,20 @@ BackgroundWriterMain(void)
errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
}
/*
* Indicate checkpoint start to any waiting backends.
*/
ckpt_active = true;
BgWriterShmem->ckpt_started++;
CreateCheckPoint(false, force_checkpoint);
/*
* Indicate checkpoint completion to any waiting backends.
*/
BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
ckpt_active = false;
/*
* Note we record the checkpoint start time not end time as
* last_checkpoint_time. This is so that time-driven checkpoints
@@ -274,14 +355,11 @@ BackgroundWriterMain(void)
*/
last_checkpoint_time = now;
/*
* Indicate checkpoint completion to any waiting backends.
*/
BgWriterShmem->checkpoint_count++;
/*
* After any checkpoint, close all smgr files. This is so we
* won't hang onto smgr references to deleted files indefinitely.
* (It is safe to do this because this process does not have a
* relcache, and so no dangling references could remain.)
*/
smgrcloseall();
@@ -301,6 +379,8 @@ BackgroundWriterMain(void)
* we respond reasonably promptly when someone signals us,
* break down the sleep into 1-second increments, and check for
* interrupts after each nap.
*
* We absorb pending requests after each short sleep.
*/
udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
while (udelay > 1000000L)
@@ -308,17 +388,11 @@ BackgroundWriterMain(void)
if (got_SIGHUP || checkpoint_requested || shutdown_requested)
break;
pg_usleep(1000000L);
AbsorbFsyncRequests();
udelay -= 1000000L;
}
if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
pg_usleep(udelay);
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
*/
if (!PostmasterIsAlive(true))
exit(1);
}
}
@@ -387,10 +461,11 @@ int
BgWriterShmemSize(void)
{
/*
* This is not worth measuring right now, but may become so after we
* add fsync signaling ...
* Currently, the size of the requests[] array is arbitrarily set
* equal to NBuffers. This may prove too large or small ...
*/
return MAXALIGN(sizeof(BgWriterShmemStruct));
return MAXALIGN(sizeof(BgWriterShmemStruct) +
(NBuffers - 1) * sizeof(BgWriterRequest));
}
/*
@@ -404,7 +479,7 @@ BgWriterShmemInit(void)
BgWriterShmem = (BgWriterShmemStruct *)
ShmemInitStruct("Background Writer Data",
sizeof(BgWriterShmemStruct),
BgWriterShmemSize(),
&found);
if (BgWriterShmem == NULL)
ereport(FATAL,
@@ -414,6 +489,7 @@ BgWriterShmemInit(void)
return; /* already initialized */
MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
BgWriterShmem->max_requests = NBuffers;
}
/*
@@ -427,8 +503,10 @@ BgWriterShmemInit(void)
void
RequestCheckpoint(bool waitforit)
{
volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
sig_atomic_t old_count = *count_ptr;
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
sig_atomic_t old_failed = bgs->ckpt_failed;
sig_atomic_t old_started = bgs->ckpt_started;
/*
* Send signal to request checkpoint. When waitforit is false,
@@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit)
"could not signal for checkpoint: %m");
/*
* If requested, wait for completion. We detect completion by
* observing a change in checkpoint_count in shared memory.
* If requested, wait for completion. We detect completion according
* to the algorithm given above.
*/
if (waitforit)
{
while (*count_ptr == old_count)
while (bgs->ckpt_started == old_started)
{
CHECK_FOR_INTERRUPTS();
pg_usleep(1000000L);
pg_usleep(100000L);
}
old_started = bgs->ckpt_started;
/*
* We are waiting for ckpt_done >= old_started, in a modulo
* sense. This is a little tricky since we don't know the
* width or signedness of sig_atomic_t. We make the lowest
* common denominator assumption that it is only as wide
* as "char". This means that this algorithm will cope
* correctly as long as we don't sleep for more than 127
* completed checkpoints. (If we do, we will get another
* chance to exit after 128 more checkpoints...)
*/
while (((signed char) (bgs->ckpt_done - old_started)) < 0)
{
CHECK_FOR_INTERRUPTS();
pg_usleep(100000L);
}
if (bgs->ckpt_failed != old_failed)
ereport(ERROR,
(errmsg("checkpoint request failed"),
errhint("Consult the postmaster log for details.")));
}
}
/*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
*
* Whenever a backend is compelled to write directly to a relation
* (which should be seldom, if the bgwriter is getting its job done),
* the backend calls this routine to pass over knowledge that the relation
* is dirty and must be fsync'd before next checkpoint.
*
* If we are unable to pass over the request (at present, this can happen
* if the shared memory queue is full), we return false. That forces
* the backend to do its own fsync. We hope that will be even more seldom.
*
* Note: we presently make no attempt to eliminate duplicate requests
* in the requests[] queue. The bgwriter will have to eliminate dups
* internally anyway, so we may as well avoid holding the lock longer
* than we have to here.
*/
bool
ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
BgWriterRequest *request;
if (!IsUnderPostmaster)
return false; /* probably shouldn't even get here */
Assert(BgWriterShmem != NULL);
LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
if (BgWriterShmem->bgwriter_pid == 0 ||
BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
{
LWLockRelease(BgWriterCommLock);
return false;
}
request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];
request->rnode = rnode;
request->segno = segno;
LWLockRelease(BgWriterCommLock);
return true;
}
/*
* AbsorbFsyncRequests
* Retrieve queued fsync requests and pass them to local smgr.
*
* This is exported because it must be called during CreateCheckpoint;
* we have to be sure we have accepted all pending requests *after* we
* establish the checkpoint redo pointer. Since CreateCheckpoint
* sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
*/
void
AbsorbFsyncRequests(void)
{
BgWriterRequest *requests = NULL;
BgWriterRequest *request;
int n;
if (!am_bg_writer)
return;
/*
* We try to avoid holding the lock for a long time by copying the
* request array.
*/
LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
n = BgWriterShmem->num_requests;
if (n > 0)
{
requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
}
BgWriterShmem->num_requests = 0;
LWLockRelease(BgWriterCommLock);
for (request = requests; n > 0; request++, n--)
{
RememberFsyncRequest(request->rnode, request->segno);
}
if (requests)
pfree(requests);
}