mirror of
https://github.com/postgres/postgres.git
synced 2025-09-03 15:22:11 +03:00
977 lines
25 KiB
C
977 lines
25 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* md.c
|
|
* This code manages relations that reside on magnetic disk.
|
|
*
|
|
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118 2005/10/15 02:49:26 momjian Exp $
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <fcntl.h>
|
|
#include <sys/file.h>
|
|
|
|
#include "catalog/catalog.h"
|
|
#include "miscadmin.h"
|
|
#include "postmaster/bgwriter.h"
|
|
#include "storage/fd.h"
|
|
#include "storage/smgr.h"
|
|
#include "utils/hsearch.h"
|
|
#include "utils/memutils.h"
|
|
|
|
|
|
/*
|
|
* The magnetic disk storage manager keeps track of open file
|
|
* descriptors in its own descriptor pool. This is done to make it
|
|
* easier to support relations that are larger than the operating
|
|
* system's file size limit (often 2GBytes). In order to do that,
|
|
* we break relations up into chunks of < 2GBytes and store one chunk
|
|
* in each of several files that represent the relation. See the
|
|
* BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
|
|
* All chunks except the last MUST have size exactly equal to RELSEG_SIZE
|
|
* blocks --- see mdnblocks() and mdtruncate().
|
|
*
|
|
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
|
|
* cache is, therefore, just the head of a list of MdfdVec objects.
|
|
* But note the md_fd pointer can be NULL, indicating relation not open.
|
|
*
|
|
* Note that mdfd_chain == NULL does not necessarily mean the relation
|
|
* doesn't have another segment after this one; we may just not have
|
|
* opened the next segment yet. (We could not have "all segments are
|
|
* in the chain" as an invariant anyway, since another backend could
|
|
* extend the relation when we weren't looking.)
|
|
*
|
|
* All MdfdVec objects are palloc'd in the MdCxt memory context.
|
|
*/
|
|
|
|
typedef struct _MdfdVec
|
|
{
|
|
File mdfd_vfd; /* fd number in fd.c's pool */
|
|
BlockNumber mdfd_segno; /* segment number, from 0 */
|
|
#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
|
|
struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
|
|
#endif
|
|
} MdfdVec;
|
|
|
|
static MemoryContext MdCxt; /* memory context for all md.c allocations; created by mdinit() */
|
|
|
|
|
|
/*
|
|
* In some contexts (currently, standalone backends and the bgwriter process)
|
|
* we keep track of pending fsync operations: we need to remember all relation
|
|
* segments that have been written since the last checkpoint, so that we can
|
|
* fsync them down to disk before completing the next checkpoint. This hash
|
|
* table remembers the pending operations. We use a hash table not because
|
|
* we want to look up individual operations, but simply as a convenient way
|
|
* of eliminating duplicate requests.
|
|
*
|
|
* (Regular backends do not track pending operations locally, but forward
|
|
* them to the bgwriter.)
|
|
*
|
|
* XXX for WIN32, may want to expand this to track pending deletes, too.
|
|
*/
|
|
typedef struct
|
|
{
|
|
RelFileNode rnode; /* the targeted relation */
|
|
BlockNumber segno; /* which segment */
|
|
} PendingOperationEntry;
|
|
|
|
/* Pending-fsync request table; stays NULL unless mdinit() creates it */
static HTAB *pendingOpsTable = NULL;
|
|
|
|
|
|
/* local routines */
|
|
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
|
|
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
|
|
static MdfdVec *_fdvec_alloc(void);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
|
|
int oflags);
|
|
#endif
|
|
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
|
|
bool allowNotFound);
|
|
static BlockNumber _mdnblocks(File file, Size blcksz);
|
|
|
|
|
|
/*
|
|
* mdinit() -- Initialize private state for magnetic disk storage manager.
|
|
*/
|
|
bool
|
|
mdinit(void)
|
|
{
|
|
MdCxt = AllocSetContextCreate(TopMemoryContext,
|
|
"MdSmgr",
|
|
ALLOCSET_DEFAULT_MINSIZE,
|
|
ALLOCSET_DEFAULT_INITSIZE,
|
|
ALLOCSET_DEFAULT_MAXSIZE);
|
|
|
|
/*
|
|
* Create pending-operations hashtable if we need it. Currently, we need
|
|
* it if we are standalone (not under a postmaster) OR if we are a
|
|
* bootstrap-mode subprocess of a postmaster (that is, a startup or
|
|
* bgwriter process).
|
|
*/
|
|
if (!IsUnderPostmaster || IsBootstrapProcessingMode())
|
|
{
|
|
HASHCTL hash_ctl;
|
|
|
|
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
|
|
hash_ctl.keysize = sizeof(PendingOperationEntry);
|
|
hash_ctl.entrysize = sizeof(PendingOperationEntry);
|
|
hash_ctl.hash = tag_hash;
|
|
hash_ctl.hcxt = MdCxt;
|
|
pendingOpsTable = hash_create("Pending Ops Table",
|
|
100L,
|
|
&hash_ctl,
|
|
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdcreate() -- Create a new relation on magnetic disk.
|
|
*
|
|
* If isRedo is true, it's okay for the relation to exist already.
|
|
*/
|
|
bool
|
|
mdcreate(SMgrRelation reln, bool isRedo)
|
|
{
|
|
char *path;
|
|
File fd;
|
|
|
|
if (isRedo && reln->md_fd != NULL)
|
|
return true; /* created and opened already... */
|
|
|
|
Assert(reln->md_fd == NULL);
|
|
|
|
path = relpath(reln->smgr_rnode);
|
|
|
|
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
|
|
|
|
if (fd < 0)
|
|
{
|
|
int save_errno = errno;
|
|
|
|
/*
|
|
* During bootstrap, there are cases where a system relation will be
|
|
* accessed (by internal backend processes) before the bootstrap
|
|
* script nominally creates it. Therefore, allow the file to exist
|
|
* already, even if isRedo is not set. (See also mdopen)
|
|
*/
|
|
if (isRedo || IsBootstrapProcessingMode())
|
|
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
|
|
if (fd < 0)
|
|
{
|
|
pfree(path);
|
|
/* be sure to return the error reported by create, not open */
|
|
errno = save_errno;
|
|
return false;
|
|
}
|
|
errno = 0;
|
|
}
|
|
|
|
pfree(path);
|
|
|
|
reln->md_fd = _fdvec_alloc();
|
|
|
|
reln->md_fd->mdfd_vfd = fd;
|
|
reln->md_fd->mdfd_segno = 0;
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
reln->md_fd->mdfd_chain = NULL;
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdunlink() -- Unlink a relation.
|
|
*
|
|
* Note that we're passed a RelFileNode --- by the time this is called,
|
|
* there won't be an SMgrRelation hashtable entry anymore.
|
|
*
|
|
* If isRedo is true, it's okay for the relation to be already gone.
|
|
*/
|
|
bool
|
|
mdunlink(RelFileNode rnode, bool isRedo)
|
|
{
|
|
bool status = true;
|
|
int save_errno = 0;
|
|
char *path;
|
|
|
|
path = relpath(rnode);
|
|
|
|
/* Delete the first segment, or only segment if not doing segmenting */
|
|
if (unlink(path) < 0)
|
|
{
|
|
if (!isRedo || errno != ENOENT)
|
|
{
|
|
status = false;
|
|
save_errno = errno;
|
|
}
|
|
}
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
/* Get the additional segments, if any */
|
|
if (status)
|
|
{
|
|
char *segpath = (char *) palloc(strlen(path) + 12);
|
|
BlockNumber segno;
|
|
|
|
for (segno = 1;; segno++)
|
|
{
|
|
sprintf(segpath, "%s.%u", path, segno);
|
|
if (unlink(segpath) < 0)
|
|
{
|
|
/* ENOENT is expected after the last segment... */
|
|
if (errno != ENOENT)
|
|
{
|
|
status = false;
|
|
save_errno = errno;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
pfree(segpath);
|
|
}
|
|
#endif
|
|
|
|
pfree(path);
|
|
|
|
errno = save_errno;
|
|
return status;
|
|
}
|
|
|
|
/*
|
|
* mdextend() -- Add a block to the specified relation.
|
|
*
|
|
* The semantics are basically the same as mdwrite(): write at the
|
|
* specified position. However, we are expecting to extend the
|
|
* relation (ie, blocknum is the current EOF), and so in case of
|
|
* failure we clean up by truncating.
|
|
*
|
|
* This routine returns true or false, with errno set as appropriate.
|
|
*
|
|
* Note: this routine used to call mdnblocks() to get the block position
|
|
* to write at, but that's pretty silly since the caller needs to know where
|
|
* the block will be written, and accordingly must have done mdnblocks()
|
|
* already. Might as well pass in the position and save a seek.
|
|
*/
|
|
bool
|
|
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
|
|
{
|
|
long seekpos;
|
|
int nbytes;
|
|
MdfdVec *v;
|
|
|
|
v = _mdfd_getseg(reln, blocknum, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
|
|
Assert(seekpos < BLCKSZ * RELSEG_SIZE);
|
|
#else
|
|
seekpos = (long) (BLCKSZ * (blocknum));
|
|
#endif
|
|
|
|
/*
|
|
* Note: because caller obtained blocknum by calling _mdnblocks, which did
|
|
* a seek(SEEK_END), this seek is often redundant and will be optimized
|
|
* away by fd.c. It's not redundant, however, if there is a partial page
|
|
* at the end of the file. In that case we want to try to overwrite the
|
|
* partial page with a full page. It's also not redundant if bufmgr.c had
|
|
* to dump another buffer of the same file to make room for the new page's
|
|
* buffer.
|
|
*/
|
|
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
|
|
return false;
|
|
|
|
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
|
|
{
|
|
if (nbytes > 0)
|
|
{
|
|
int save_errno = errno;
|
|
|
|
/* Remove the partially-written page */
|
|
FileTruncate(v->mdfd_vfd, seekpos);
|
|
FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
|
|
errno = save_errno;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
if (!isTemp)
|
|
{
|
|
if (!register_dirty_segment(reln, v))
|
|
return false;
|
|
}
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdopen() -- Open the specified relation. ereport's on failure.
|
|
* (Optionally, can return NULL instead of ereport for ENOENT.)
|
|
*
|
|
* Note we only open the first segment, when there are multiple segments.
|
|
*/
|
|
static MdfdVec *
|
|
mdopen(SMgrRelation reln, bool allowNotFound)
|
|
{
|
|
MdfdVec *mdfd;
|
|
char *path;
|
|
File fd;
|
|
|
|
/* No work if already open */
|
|
if (reln->md_fd)
|
|
return reln->md_fd;
|
|
|
|
path = relpath(reln->smgr_rnode);
|
|
|
|
fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
|
|
|
|
if (fd < 0)
|
|
{
|
|
/*
|
|
* During bootstrap, there are cases where a system relation will be
|
|
* accessed (by internal backend processes) before the bootstrap
|
|
* script nominally creates it. Therefore, accept mdopen() as a
|
|
* substitute for mdcreate() in bootstrap mode only. (See mdcreate)
|
|
*/
|
|
if (IsBootstrapProcessingMode())
|
|
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
|
|
if (fd < 0)
|
|
{
|
|
pfree(path);
|
|
if (allowNotFound && errno == ENOENT)
|
|
return NULL;
|
|
ereport(ERROR,
|
|
(errcode_for_file_access(),
|
|
errmsg("could not open relation %u/%u/%u: %m",
|
|
reln->smgr_rnode.spcNode,
|
|
reln->smgr_rnode.dbNode,
|
|
reln->smgr_rnode.relNode)));
|
|
}
|
|
}
|
|
|
|
pfree(path);
|
|
|
|
reln->md_fd = mdfd = _fdvec_alloc();
|
|
|
|
mdfd->mdfd_vfd = fd;
|
|
mdfd->mdfd_segno = 0;
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
mdfd->mdfd_chain = NULL;
|
|
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
|
|
#endif
|
|
|
|
return mdfd;
|
|
}
|
|
|
|
/*
|
|
* mdclose() -- Close the specified relation, if it isn't closed already.
|
|
*
|
|
* Returns true or false with errno set as appropriate.
|
|
*/
|
|
bool
|
|
mdclose(SMgrRelation reln)
|
|
{
|
|
MdfdVec *v = reln->md_fd;
|
|
|
|
/* No work if already closed */
|
|
if (v == NULL)
|
|
return true;
|
|
|
|
reln->md_fd = NULL; /* prevent dangling pointer after error */
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
while (v != NULL)
|
|
{
|
|
MdfdVec *ov = v;
|
|
|
|
/* if not closed already */
|
|
if (v->mdfd_vfd >= 0)
|
|
FileClose(v->mdfd_vfd);
|
|
/* Now free vector */
|
|
v = v->mdfd_chain;
|
|
pfree(ov);
|
|
}
|
|
#else
|
|
if (v->mdfd_vfd >= 0)
|
|
FileClose(v->mdfd_vfd);
|
|
pfree(v);
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdread() -- Read the specified block from a relation.
|
|
*/
|
|
bool
|
|
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
|
|
{
|
|
bool status;
|
|
long seekpos;
|
|
int nbytes;
|
|
MdfdVec *v;
|
|
|
|
v = _mdfd_getseg(reln, blocknum, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
|
|
Assert(seekpos < BLCKSZ * RELSEG_SIZE);
|
|
#else
|
|
seekpos = (long) (BLCKSZ * (blocknum));
|
|
#endif
|
|
|
|
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
|
|
return false;
|
|
|
|
status = true;
|
|
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
|
|
{
|
|
/*
|
|
* If we are at or past EOF, return zeroes without complaining. Also
|
|
* substitute zeroes if we found a partial block at EOF.
|
|
*
|
|
* XXX this is really ugly, bad design. However the current
|
|
* implementation of hash indexes requires it, because hash index
|
|
* pages are initialized out-of-order.
|
|
*/
|
|
if (nbytes == 0 ||
|
|
(nbytes > 0 && mdnblocks(reln) == blocknum))
|
|
MemSet(buffer, 0, BLCKSZ);
|
|
else
|
|
status = false;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
/*
|
|
* mdwrite() -- Write the supplied block at the appropriate location.
|
|
*/
|
|
bool
|
|
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
|
|
{
|
|
long seekpos;
|
|
MdfdVec *v;
|
|
|
|
v = _mdfd_getseg(reln, blocknum, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
|
|
Assert(seekpos < BLCKSZ * RELSEG_SIZE);
|
|
#else
|
|
seekpos = (long) (BLCKSZ * (blocknum));
|
|
#endif
|
|
|
|
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
|
|
return false;
|
|
|
|
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
|
|
return false;
|
|
|
|
if (!isTemp)
|
|
{
|
|
if (!register_dirty_segment(reln, v))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdnblocks() -- Get the number of blocks stored in a relation.
|
|
*
|
|
* Important side effect: all segments of the relation are opened
|
|
* and added to the mdfd_chain list. If this routine has not been
|
|
* called, then only segments up to the last one actually touched
|
|
* are present in the chain...
|
|
*
|
|
* Returns # of blocks, or InvalidBlockNumber on error.
|
|
*/
|
|
BlockNumber
|
|
mdnblocks(SMgrRelation reln)
|
|
{
|
|
MdfdVec *v = mdopen(reln, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
BlockNumber nblocks;
|
|
BlockNumber segno = 0;
|
|
|
|
/*
|
|
* Skip through any segments that aren't the last one, to avoid redundant
|
|
* seeks on them. We have previously verified that these segments are
|
|
* exactly RELSEG_SIZE long, and it's useless to recheck that each time.
|
|
* (NOTE: this assumption could only be wrong if another backend has
|
|
* truncated the relation. We rely on higher code levels to handle that
|
|
* scenario by closing and re-opening the md fd.)
|
|
*/
|
|
while (v->mdfd_chain != NULL)
|
|
{
|
|
segno++;
|
|
v = v->mdfd_chain;
|
|
}
|
|
|
|
for (;;)
|
|
{
|
|
nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
|
|
if (nblocks > ((BlockNumber) RELSEG_SIZE))
|
|
elog(FATAL, "segment too big");
|
|
if (nblocks < ((BlockNumber) RELSEG_SIZE))
|
|
return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
|
|
|
|
/*
|
|
* If segment is exactly RELSEG_SIZE, advance to next one.
|
|
*/
|
|
segno++;
|
|
|
|
if (v->mdfd_chain == NULL)
|
|
{
|
|
/*
|
|
* Because we pass O_CREAT, we will create the next segment (with
|
|
* zero length) immediately, if the last segment is of length
|
|
* REL_SEGSIZE. This is unnecessary but harmless, and testing for
|
|
* the case would take more cycles than it seems worth.
|
|
*/
|
|
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
|
|
if (v->mdfd_chain == NULL)
|
|
return InvalidBlockNumber; /* failed? */
|
|
}
|
|
|
|
v = v->mdfd_chain;
|
|
}
|
|
#else
|
|
return _mdnblocks(v->mdfd_vfd, BLCKSZ);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* mdtruncate() -- Truncate relation to specified number of blocks.
|
|
*
|
|
* Returns # of blocks or InvalidBlockNumber on error.
|
|
*/
|
|
BlockNumber
|
|
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
|
|
{
|
|
MdfdVec *v;
|
|
BlockNumber curnblk;
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
BlockNumber priorblocks;
|
|
#endif
|
|
|
|
/*
|
|
* NOTE: mdnblocks makes sure we have opened all existing segments, so
|
|
* that truncate/delete loop will get them all!
|
|
*/
|
|
curnblk = mdnblocks(reln);
|
|
if (curnblk == InvalidBlockNumber)
|
|
return InvalidBlockNumber; /* mdnblocks failed */
|
|
if (nblocks > curnblk)
|
|
return InvalidBlockNumber; /* bogus request */
|
|
if (nblocks == curnblk)
|
|
return nblocks; /* no work */
|
|
|
|
v = mdopen(reln, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
priorblocks = 0;
|
|
while (v != NULL)
|
|
{
|
|
MdfdVec *ov = v;
|
|
|
|
if (priorblocks > nblocks)
|
|
{
|
|
/*
|
|
* This segment is no longer wanted at all (and has already been
|
|
* unlinked from the mdfd_chain). We truncate the file before
|
|
* deleting it because if other backends are holding the file
|
|
* open, the unlink will fail on some platforms. Better a
|
|
* zero-size file gets left around than a big file...
|
|
*/
|
|
FileTruncate(v->mdfd_vfd, 0);
|
|
FileUnlink(v->mdfd_vfd);
|
|
v = v->mdfd_chain;
|
|
Assert(ov != reln->md_fd); /* we never drop the 1st segment */
|
|
pfree(ov);
|
|
}
|
|
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
|
|
{
|
|
/*
|
|
* This is the last segment we want to keep. Truncate the file to
|
|
* the right length, and clear chain link that points to any
|
|
* remaining segments (which we shall zap). NOTE: if nblocks is
|
|
* exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
|
|
* segment to 0 length but keep it. This is mainly so that the
|
|
* right thing happens if nblocks==0.
|
|
*/
|
|
BlockNumber lastsegblocks = nblocks - priorblocks;
|
|
|
|
if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
|
|
return InvalidBlockNumber;
|
|
if (!isTemp)
|
|
{
|
|
if (!register_dirty_segment(reln, v))
|
|
return InvalidBlockNumber;
|
|
}
|
|
v = v->mdfd_chain;
|
|
ov->mdfd_chain = NULL;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* We still need this segment and 0 or more blocks beyond it, so
|
|
* nothing to do here.
|
|
*/
|
|
v = v->mdfd_chain;
|
|
}
|
|
priorblocks += RELSEG_SIZE;
|
|
}
|
|
#else
|
|
if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
|
|
return InvalidBlockNumber;
|
|
if (!isTemp)
|
|
{
|
|
if (!register_dirty_segment(reln, v))
|
|
return InvalidBlockNumber;
|
|
}
|
|
#endif
|
|
|
|
return nblocks;
|
|
}
|
|
|
|
/*
|
|
* mdimmedsync() -- Immediately sync a relation to stable storage.
|
|
*
|
|
* Note that only writes already issued are synced; this routine knows
|
|
* nothing of dirty buffers that may exist inside the buffer manager.
|
|
*/
|
|
bool
|
|
mdimmedsync(SMgrRelation reln)
|
|
{
|
|
MdfdVec *v;
|
|
BlockNumber curnblk;
|
|
|
|
/*
|
|
* NOTE: mdnblocks makes sure we have opened all existing segments, so
|
|
* that fsync loop will get them all!
|
|
*/
|
|
curnblk = mdnblocks(reln);
|
|
if (curnblk == InvalidBlockNumber)
|
|
return false; /* mdnblocks failed */
|
|
|
|
v = mdopen(reln, false);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
while (v != NULL)
|
|
{
|
|
if (FileSync(v->mdfd_vfd) < 0)
|
|
return false;
|
|
v = v->mdfd_chain;
|
|
}
|
|
#else
|
|
if (FileSync(v->mdfd_vfd) < 0)
|
|
return false;
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* mdsync() -- Sync previous writes to stable storage.
|
|
*
|
|
* This is only called during checkpoints, and checkpoints should only
|
|
* occur in processes that have created a pendingOpsTable.
|
|
*/
|
|
bool
|
|
mdsync(void)
|
|
{
|
|
HASH_SEQ_STATUS hstat;
|
|
PendingOperationEntry *entry;
|
|
|
|
if (!pendingOpsTable)
|
|
return false;
|
|
|
|
/*
|
|
* If we are in the bgwriter, the sync had better include all fsync
|
|
* requests that were queued by backends before the checkpoint REDO point
|
|
* was determined. We go that a little better by accepting all requests
|
|
* queued up to the point where we start fsync'ing.
|
|
*/
|
|
AbsorbFsyncRequests();
|
|
|
|
hash_seq_init(&hstat, pendingOpsTable);
|
|
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
|
|
{
|
|
/*
|
|
* If fsync is off then we don't have to bother opening the file at
|
|
* all. (We delay checking until this point so that changing fsync on
|
|
* the fly behaves sensibly.)
|
|
*/
|
|
if (enableFsync)
|
|
{
|
|
SMgrRelation reln;
|
|
MdfdVec *seg;
|
|
|
|
/*
|
|
* Find or create an smgr hash entry for this relation. This may
|
|
* seem a bit unclean -- md calling smgr? But it's really the
|
|
* best solution. It ensures that the open file reference isn't
|
|
* permanently leaked if we get an error here. (You may say "but
|
|
* an unreferenced SMgrRelation is still a leak!" Not really,
|
|
* because the only case in which a checkpoint is done by a
|
|
* process that isn't about to shut down is in the bgwriter, and
|
|
* it will periodically do smgrcloseall(). This fact justifies
|
|
* our not closing the reln in the success path either, which is a
|
|
* good thing since in non-bgwriter cases we couldn't safely do
|
|
* that.) Furthermore, in many cases the relation will have been
|
|
* dirtied through this same smgr relation, and so we can save a
|
|
* file open/close cycle.
|
|
*/
|
|
reln = smgropen(entry->rnode);
|
|
|
|
/*
|
|
* It is possible that the relation has been dropped or truncated
|
|
* since the fsync request was entered. Therefore, we have to
|
|
* allow file-not-found errors. This applies both during
|
|
* _mdfd_getseg() and during FileSync, since fd.c might have
|
|
* closed the file behind our back.
|
|
*/
|
|
seg = _mdfd_getseg(reln,
|
|
entry->segno * ((BlockNumber) RELSEG_SIZE),
|
|
true);
|
|
if (seg)
|
|
{
|
|
if (FileSync(seg->mdfd_vfd) < 0 &&
|
|
errno != ENOENT)
|
|
{
|
|
ereport(LOG,
|
|
(errcode_for_file_access(),
|
|
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
|
|
entry->segno,
|
|
entry->rnode.spcNode,
|
|
entry->rnode.dbNode,
|
|
entry->rnode.relNode)));
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Okay, delete this entry */
|
|
if (hash_search(pendingOpsTable, entry,
|
|
HASH_REMOVE, NULL) == NULL)
|
|
elog(ERROR, "pendingOpsTable corrupted");
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* register_dirty_segment() -- Mark a relation segment as needing fsync
|
|
*
|
|
* If there is a local pending-ops table, just make an entry in it for
|
|
* mdsync to process later. Otherwise, try to pass off the fsync request
|
|
* to the background writer process. If that fails, just do the fsync
|
|
* locally before returning (we expect this will not happen often enough
|
|
* to be a performance problem).
|
|
*
|
|
* A false result implies I/O failure during local fsync. errno will be
|
|
* valid for error reporting.
|
|
*/
|
|
static bool
|
|
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
|
|
{
|
|
if (pendingOpsTable)
|
|
{
|
|
PendingOperationEntry entry;
|
|
|
|
/* ensure any pad bytes in the struct are zeroed */
|
|
MemSet(&entry, 0, sizeof(entry));
|
|
entry.rnode = reln->smgr_rnode;
|
|
entry.segno = seg->mdfd_segno;
|
|
|
|
(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
|
|
return true;
|
|
}
|
|
|
|
if (FileSync(seg->mdfd_vfd) < 0)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
|
|
*
|
|
* We stuff the fsync request into the local hash table for execution
|
|
* during the bgwriter's next checkpoint.
|
|
*/
|
|
void
|
|
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
|
|
{
|
|
PendingOperationEntry entry;
|
|
|
|
Assert(pendingOpsTable);
|
|
|
|
/* ensure any pad bytes in the struct are zeroed */
|
|
MemSet(&entry, 0, sizeof(entry));
|
|
entry.rnode = rnode;
|
|
entry.segno = segno;
|
|
|
|
(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
|
|
}
|
|
|
|
/*
|
|
* _fdvec_alloc() -- Make a MdfdVec object.
|
|
*/
|
|
static MdfdVec *
|
|
_fdvec_alloc(void)
|
|
{
|
|
return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
|
|
}
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
|
|
/*
|
|
* Open the specified segment of the relation,
|
|
* and make a MdfdVec object for it. Returns NULL on failure.
|
|
*/
|
|
static MdfdVec *
|
|
_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
|
|
{
|
|
MdfdVec *v;
|
|
int fd;
|
|
char *path,
|
|
*fullpath;
|
|
|
|
path = relpath(reln->smgr_rnode);
|
|
|
|
if (segno > 0)
|
|
{
|
|
/* be sure we have enough space for the '.segno' */
|
|
fullpath = (char *) palloc(strlen(path) + 12);
|
|
sprintf(fullpath, "%s.%u", path, segno);
|
|
pfree(path);
|
|
}
|
|
else
|
|
fullpath = path;
|
|
|
|
/* open the file */
|
|
fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
|
|
|
|
pfree(fullpath);
|
|
|
|
if (fd < 0)
|
|
return NULL;
|
|
|
|
/* allocate an mdfdvec entry for it */
|
|
v = _fdvec_alloc();
|
|
|
|
/* fill the entry */
|
|
v->mdfd_vfd = fd;
|
|
v->mdfd_segno = segno;
|
|
v->mdfd_chain = NULL;
|
|
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
|
|
|
|
/* all done */
|
|
return v;
|
|
}
|
|
#endif /* LET_OS_MANAGE_FILESIZE */
|
|
|
|
/*
|
|
* _mdfd_getseg() -- Find the segment of the relation holding the
|
|
* specified block. ereport's on failure.
|
|
* (Optionally, can return NULL instead of ereport for ENOENT.)
|
|
*/
|
|
static MdfdVec *
|
|
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
|
|
{
|
|
MdfdVec *v = mdopen(reln, allowNotFound);
|
|
|
|
#ifndef LET_OS_MANAGE_FILESIZE
|
|
BlockNumber segstogo;
|
|
BlockNumber nextsegno;
|
|
|
|
if (!v)
|
|
return NULL; /* only possible if allowNotFound */
|
|
|
|
for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
|
|
segstogo > 0;
|
|
nextsegno++, segstogo--)
|
|
{
|
|
if (v->mdfd_chain == NULL)
|
|
{
|
|
/*
|
|
* We will create the next segment only if the target block is
|
|
* within it. This prevents Sorcerer's Apprentice syndrome if a
|
|
* bug at higher levels causes us to be handed a ridiculously
|
|
* large blkno --- otherwise we could create many thousands of
|
|
* empty segment files before reaching the "target" block. We
|
|
* should never need to create more than one new segment per call,
|
|
* so this restriction seems reasonable.
|
|
*
|
|
* BUT: when doing WAL recovery, disable this logic and create
|
|
* segments unconditionally. In this case it seems better to
|
|
* assume the given blkno is good (it presumably came from a
|
|
* CRC-checked WAL record); furthermore this lets us cope in the
|
|
* case where we are replaying WAL data that has a write into a
|
|
* high-numbered segment of a relation that was later deleted. We
|
|
* want to go ahead and create the segments so we can finish out
|
|
* the replay.
|
|
*/
|
|
v->mdfd_chain = _mdfd_openseg(reln,
|
|
nextsegno,
|
|
(segstogo == 1 || InRecovery) ? O_CREAT : 0);
|
|
if (v->mdfd_chain == NULL)
|
|
{
|
|
if (allowNotFound && errno == ENOENT)
|
|
return NULL;
|
|
ereport(ERROR,
|
|
(errcode_for_file_access(),
|
|
errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
|
|
nextsegno,
|
|
reln->smgr_rnode.spcNode,
|
|
reln->smgr_rnode.dbNode,
|
|
reln->smgr_rnode.relNode,
|
|
blkno)));
|
|
}
|
|
}
|
|
v = v->mdfd_chain;
|
|
}
|
|
#endif
|
|
|
|
return v;
|
|
}
|
|
|
|
/*
|
|
* Get number of blocks present in a single disk file
|
|
*/
|
|
static BlockNumber
|
|
_mdnblocks(File file, Size blcksz)
|
|
{
|
|
long len;
|
|
|
|
len = FileSeek(file, 0L, SEEK_END);
|
|
if (len < 0)
|
|
return 0; /* on failure, assume file is empty */
|
|
return (BlockNumber) (len / blcksz);
|
|
}
|