1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-28 11:55:03 +03:00
Files
postgres/src/backend/storage/smgr/md.c
PostgreSQL Daemon 2ff501590b Tag appropriate files for rc3
Also performed an initial run through of upgrading our Copyright date to
extend to 2005 ... first run here was very simple ... change everything
where: grep 1996-2004 && the word 'Copyright' ... scanned through the
generated list with 'less' first, and after, to make sure that I only
picked up the right entries ...
2004-12-31 22:04:05 +00:00

983 lines
25 KiB
C

/*-------------------------------------------------------------------------
*
* md.c
* This code manages relations that reside on magnetic disk.
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114 2004/12/31 22:01:13 pgsql Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
* easier to support relations that are larger than the operating
* system's file size limit (often 2GBytes). In order to do that,
* we break relations up into chunks of < 2GBytes and store one chunk
* in each of several files that represent the relation. See the
* BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
* All chunks except the last MUST have size exactly equal to RELSEG_SIZE
* blocks --- see mdnblocks() and mdtruncate().
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
* cache is, therefore, just the head of a list of MdfdVec objects.
* But note the md_fd pointer can be NULL, indicating relation not open.
*
* Note that mdfd_chain == NULL does not necessarily mean the relation
* doesn't have another segment after this one; we may just not have
* opened the next segment yet. (We could not have "all segments are
* in the chain" as an invariant anyway, since another backend could
* extend the relation when we weren't looking.)
*
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
/*
 * MdfdVec -- bookkeeping for one open segment file of a relation.
 *
 * The SMgrRelation's md_fd field points at the entry for segment 0; when
 * relations are split into segments (LET_OS_MANAGE_FILESIZE not defined),
 * further segments hang off it through mdfd_chain.
 */
typedef struct _MdfdVec
{
File mdfd_vfd; /* fd number in fd.c's pool */
BlockNumber mdfd_segno; /* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
struct _MdfdVec *mdfd_chain; /* next segment, or NULL if none opened yet */
#endif
} MdfdVec;
/* Set up by mdinit(); _fdvec_alloc() and the pending-ops table allocate here. */
static MemoryContext MdCxt; /* context for all md.c allocations */
/*
* In some contexts (currently, standalone backends and the bgwriter process)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
* table remembers the pending operations. We use a hash table not because
* we want to look up individual operations, but simply as a convenient way
* of eliminating duplicate requests.
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
*
* XXX for WIN32, may want to expand this to track pending deletes, too.
*/
/*
 * PendingOperationEntry -- identifies one relation segment awaiting fsync.
 *
 * The whole struct is used as the hash key (mdinit sets keysize to
 * sizeof(PendingOperationEntry)), so callers zero the struct before filling
 * it in, to keep any padding bytes from making equal requests look distinct.
 */
typedef struct
{
RelFileNode rnode; /* the targeted relation */
BlockNumber segno; /* which segment */
} PendingOperationEntry;
/* NULL unless mdinit() decided this process tracks fsync requests itself. */
static HTAB *pendingOpsTable = NULL;
/* local routines */
/* open segment 0 of a relation (no-op if already open) */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
/* remember, forward, or immediately perform an fsync for a written segment */
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
/* allocate an uninitialized MdfdVec in MdCxt */
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
/* open one specific segment file; returns NULL on failure */
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
/* locate (opening as needed) the segment that holds block blkno */
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
bool allowNotFound);
/* number of whole blocks in a single file, found by seeking to its end */
static BlockNumber _mdnblocks(File file, Size blcksz);
/*
 *	mdinit() -- Set up the private state of the magnetic disk storage manager.
 *
 * Creates the MdCxt memory context as a child of TopMemoryContext and, in
 * processes that have to track fsync requests themselves, the
 * pending-operations hash table.  Returns true; no failure is reported
 * through the return value.
 */
bool
mdinit(void)
{
	HASHCTL		info;

	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_MINSIZE,
								  ALLOCSET_DEFAULT_INITSIZE,
								  ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * The pending-operations table is needed only when we are standalone
	 * (not under a postmaster) or a bootstrap-mode subprocess of a
	 * postmaster (that is, a startup or bgwriter process).  Everybody else
	 * is finished at this point.
	 */
	if (IsUnderPostmaster && !IsBootstrapProcessingMode())
		return true;

	/* The entire entry doubles as the hash key, hence equal key/entry sizes. */
	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(PendingOperationEntry);
	info.entrysize = sizeof(PendingOperationEntry);
	info.hash = tag_hash;
	info.hcxt = MdCxt;

	pendingOpsTable = hash_create("Pending Ops Table",
								  100L,
								  &info,
								  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);

	return true;
}
/*
 *	mdcreate() -- Create the first segment file of a new relation.
 *
 * Returns true on success.  On failure returns false, with errno holding the
 * error from the exclusive-create attempt (not from any fallback open).
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
bool
mdcreate(SMgrRelation reln, bool isRedo)
{
	char	   *relfile;
	File		vfd;
	MdfdVec    *seg;

	/* During redo, a relation we already hold open needs nothing more. */
	if (isRedo && reln->md_fd != NULL)
		return true;

	Assert(reln->md_fd == NULL);

	relfile = relpath(reln->smgr_rnode);
	vfd = FileNameOpenFile(relfile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (vfd < 0)
	{
		int			create_errno = errno;

		/*
		 * During bootstrap, a system relation may be accessed (by internal
		 * backend processes) before the bootstrap script nominally creates
		 * it.  So tolerate a pre-existing file in bootstrap mode as well as
		 * in redo.  (mdopen has the matching special case.)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			vfd = FileNameOpenFile(relfile, O_RDWR | PG_BINARY, 0600);

		if (vfd < 0)
		{
			pfree(relfile);
			/* report the create failure rather than the open failure */
			errno = create_errno;
			return false;
		}
		errno = 0;
	}

	pfree(relfile);

	/* Build the descriptor entry for segment 0 and attach it to the reln. */
	seg = _fdvec_alloc();
	seg->mdfd_vfd = vfd;
	seg->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	seg->mdfd_chain = NULL;
#endif
	reln->md_fd = seg;

	return true;
}
/*
 *	mdunlink() -- Remove the file(s) belonging to a relation.
 *
 * We are handed a RelFileNode rather than an SMgrRelation: by the time this
 * runs, the SMgrRelation hashtable entry is already gone.
 *
 * If isRedo is true, a missing first segment is not treated as an error.
 *
 * Returns true on success; on failure returns false with errno set to the
 * error of the unlink() call that failed.
 */
bool
mdunlink(RelFileNode rnode, bool isRedo)
{
	char	   *basepath = relpath(rnode);
	bool		ok = true;
	int			failure_errno = 0;

	/* Remove the first segment (the only one when not segmenting). */
	if (unlink(basepath) < 0 && (!isRedo || errno != ENOENT))
	{
		ok = false;
		failure_errno = errno;
	}

#ifndef LET_OS_MANAGE_FILESIZE
	/* Then remove segments 1, 2, ... until one of them cannot be unlinked. */
	if (ok)
	{
		/* 12 extra bytes: '.', up to 10 digits of segment number, and NUL */
		char	   *segpath = (char *) palloc(strlen(basepath) + 12);
		BlockNumber segno = 1;
		int			rc;

		do
		{
			sprintf(segpath, "%s.%u", basepath, segno++);
			rc = unlink(segpath);
		} while (rc >= 0);

		/* Running off the end (ENOENT) is expected; anything else is not. */
		if (errno != ENOENT)
		{
			ok = false;
			failure_errno = errno;
		}

		pfree(segpath);
	}
#endif

	pfree(basepath);

	errno = failure_errno;
	return ok;
}
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 * The semantics are basically the same as mdwrite(): write at the
 * specified position.  However, we are expecting to extend the
 * relation (ie, blocknum is the current EOF), and so in case of
 * failure we clean up by truncating.
 *
 * reln:	 relation to extend
 * blocknum: block number at which to write (expected to be the current EOF)
 * buffer:	 BLCKSZ bytes of page data
 * isTemp:	 if true, the segment is not registered for fsync
 *
 * This routine returns true or false, with errno set as appropriate.
 *
 * Note: this routine used to call mdnblocks() to get the block position
 * to write at, but that's pretty silly since the caller needs to know where
 * the block will be written, and accordingly must have done mdnblocks()
 * already.  Might as well pass in the position and save a seek.
 */
bool
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* offset within the segment; always below BLCKSZ * RELSEG_SIZE */
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else

	/*
	 * Widen before multiplying.  BLCKSZ * blocknum on its own is computed
	 * in 32-bit unsigned arithmetic and would wrap for offsets of 4GB and
	 * up, even where long is wide enough to hold the true offset.
	 */
	seekpos = (long) ((unsigned long) BLCKSZ * (unsigned long) blocknum);
#endif

	/*
	 * Note: because caller obtained blocknum by calling _mdnblocks, which
	 * did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.  It's not redundant, however, if there is a
	 * partial page at the end of the file.  In that case we want to try
	 * to overwrite the partial page with a full page.  It's also not
	 * redundant if bufmgr.c had to dump another buffer of the same file
	 * to make room for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes > 0)
		{
			/* keep the write's errno; the cleanup calls may overwrite it */
			int			save_errno = errno;

			/* Remove the partially-written page */
			FileTruncate(v->mdfd_vfd, seekpos);
			FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
			errno = save_errno;
		}
		return false;
	}

	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return false;
	}

#ifndef LET_OS_MANAGE_FILESIZE
	Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return true;
}
/*
 *	mdopen() -- Open the first segment of a relation and cache its entry.
 *
 * Returns the existing entry right away if the relation is already open.
 * Failure to open is reported with ereport(ERROR), except that when
 * allowNotFound is true an ENOENT failure yields NULL instead.
 *
 * Only segment 0 is opened here, even if the relation has several segments.
 */
static MdfdVec *
mdopen(SMgrRelation reln, bool allowNotFound)
{
	char	   *relfile;
	File		vfd;
	MdfdVec    *seg;

	/* Already open?  Then there is nothing to do. */
	if (reln->md_fd)
		return reln->md_fd;

	relfile = relpath(reln->smgr_rnode);
	vfd = FileNameOpenFile(relfile, O_RDWR | PG_BINARY, 0600);

	/*
	 * During bootstrap, a system relation may be accessed (by internal
	 * backend processes) before the bootstrap script nominally creates it.
	 * So in bootstrap mode only, let mdopen() stand in for mdcreate() by
	 * retrying as a create.  (mdcreate has the matching special case.)
	 */
	if (vfd < 0 && IsBootstrapProcessingMode())
		vfd = FileNameOpenFile(relfile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	pfree(relfile);

	if (vfd < 0)
	{
		if (allowNotFound && errno == ENOENT)
			return NULL;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open relation %u/%u/%u: %m",
						reln->smgr_rnode.spcNode,
						reln->smgr_rnode.dbNode,
						reln->smgr_rnode.relNode)));
	}

	seg = _fdvec_alloc();
	reln->md_fd = seg;

	seg->mdfd_vfd = vfd;
	seg->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	seg->mdfd_chain = NULL;
	Assert(_mdnblocks(vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return seg;
}
/*
 *	mdclose() -- Close a relation's files and release its MdfdVec entries.
 *
 * Does nothing if the relation is not open.  Returns true in every path
 * visible here.
 */
bool
mdclose(SMgrRelation reln)
{
	MdfdVec    *seg = reln->md_fd;

	if (seg == NULL)
		return true;			/* already closed */

	/* Detach first, so an error below cannot leave a dangling pointer. */
	reln->md_fd = NULL;

#ifndef LET_OS_MANAGE_FILESIZE
	{
		MdfdVec    *next;

		/* Walk the segment chain, closing each file and freeing its entry. */
		for (; seg != NULL; seg = next)
		{
			next = seg->mdfd_chain;

			/* a negative vfd means this one is closed already */
			if (seg->mdfd_vfd >= 0)
				FileClose(seg->mdfd_vfd);

			pfree(seg);
		}
	}
#else
	if (seg->mdfd_vfd >= 0)
		FileClose(seg->mdfd_vfd);

	pfree(seg);
#endif

	return true;
}
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * reln:	 relation to read from
 * blocknum: block number to read
 * buffer:	 receives BLCKSZ bytes
 *
 * Returns true on success, including the cases where the buffer is filled
 * with zeroes (see below); returns false if the seek or the read fails.
 */
bool
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
	bool		status;
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* offset within the segment; always below BLCKSZ * RELSEG_SIZE */
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else

	/*
	 * Widen before multiplying.  BLCKSZ * blocknum on its own is computed
	 * in 32-bit unsigned arithmetic and would wrap for offsets of 4GB and
	 * up, even where long is wide enough to hold the true offset.
	 */
	seekpos = (long) ((unsigned long) BLCKSZ * (unsigned long) blocknum);
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	status = true;
	if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		/*
		 * If we are at or past EOF, return zeroes without complaining.
		 * Also substitute zeroes if we found a partial block at EOF.
		 *
		 * XXX this is really ugly, bad design.  However the current
		 * implementation of hash indexes requires it, because hash index
		 * pages are initialized out-of-order.
		 */
		if (nbytes == 0 ||
			(nbytes > 0 && mdnblocks(reln) == blocknum))
			MemSet(buffer, 0, BLCKSZ);
		else
			status = false;
	}

	return status;
}
/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 * reln:	 relation to write to
 * blocknum: block number to write
 * buffer:	 BLCKSZ bytes of page data
 * isTemp:	 if true, the segment is not registered for fsync
 *
 * Returns true on success; false if the seek, the write, or the fsync
 * registration fails.
 */
bool
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* offset within the segment; always below BLCKSZ * RELSEG_SIZE */
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else

	/*
	 * Widen before multiplying.  BLCKSZ * blocknum on its own is computed
	 * in 32-bit unsigned arithmetic and would wrap for offsets of 4GB and
	 * up, even where long is wide enough to hold the true offset.
	 */
	seekpos = (long) ((unsigned long) BLCKSZ * (unsigned long) blocknum);
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return false;

	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return false;
	}

	return true;
}
/*
 *	mdnblocks() -- Report how many blocks a relation currently holds.
 *
 * Important side effect: every segment of the relation gets opened and
 * linked into the mdfd_chain list.  Without a call to this routine, the
 * chain only reaches as far as the last segment actually touched.
 *
 * Returns the block count, or InvalidBlockNumber if a following segment
 * could not be opened.
 */
BlockNumber
mdnblocks(SMgrRelation reln)
{
	MdfdVec    *seg = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	const BlockNumber segcapacity = (BlockNumber) RELSEG_SIZE;
	BlockNumber segno = 0;

	/*
	 * Hop straight to the last segment we know about, to avoid redundant
	 * seeks on the earlier ones.  Those were previously verified to be
	 * exactly RELSEG_SIZE blocks long, and rechecking each time is useless.
	 * (NOTE: the assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd.)
	 */
	for (; seg->mdfd_chain != NULL; seg = seg->mdfd_chain)
		segno++;

	for (;;)
	{
		BlockNumber segblocks = _mdnblocks(seg->mdfd_vfd, BLCKSZ);

		if (segblocks > segcapacity)
			elog(FATAL, "segment too big");

		/* A short segment is the last one, so the total is now known. */
		if (segblocks < segcapacity)
			return (segno * segcapacity) + segblocks;

		/* Segment is exactly full; move on to the one after it. */
		segno++;

		if (seg->mdfd_chain == NULL)
		{
			/*
			 * Because we pass O_CREAT, the next segment is created (with
			 * zero length) immediately whenever the last segment is exactly
			 * RELSEG_SIZE blocks long.  This is unnecessary but harmless,
			 * and testing for the case would take more cycles than it
			 * seems worth.
			 */
			MdfdVec    *next = _mdfd_openseg(reln, segno, O_CREAT);

			seg->mdfd_chain = next;
			if (next == NULL)
				return InvalidBlockNumber;		/* failed? */
		}

		seg = seg->mdfd_chain;
	}
#else
	return _mdnblocks(seg->mdfd_vfd, BLCKSZ);
#endif
}
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * reln:	relation to truncate
 * nblocks: desired length in blocks; must not exceed the current length
 * isTemp:	if true, the truncated segment is not registered for fsync
 *
 * Returns nblocks on success, or InvalidBlockNumber on error (including a
 * request to "truncate" to more blocks than the relation has).
 */
BlockNumber
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
{
	MdfdVec    *v;
	BlockNumber curnblk;

#ifndef LET_OS_MANAGE_FILESIZE
	BlockNumber priorblocks;
#endif

	/*
	 * NOTE: mdnblocks makes sure we have opened all existing segments, so
	 * that truncate/delete loop will get them all!
	 */
	curnblk = mdnblocks(reln);
	if (curnblk == InvalidBlockNumber)
		return InvalidBlockNumber;		/* mdnblocks failed */
	if (nblocks > curnblk)
		return InvalidBlockNumber;		/* bogus request */
	if (nblocks == curnblk)
		return nblocks;			/* no work */

	v = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* priorblocks = number of blocks in the segments before the current one */
	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer wanted at all (and has already
			 * been unlinked from the mdfd_chain).  We truncate the file
			 * before deleting it because if other backends are holding
			 * the file open, the unlink will fail on some platforms.
			 * Better a zero-size file gets left around than a big file...
			 */
			FileTruncate(v->mdfd_vfd, 0);
			FileUnlink(v->mdfd_vfd);
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd);	/* we never drop the 1st segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep.  Truncate the file
			 * to the right length, and clear chain link that points to
			 * any remaining segments (which we shall zap).  NOTE: if
			 * nblocks is exactly a multiple K of RELSEG_SIZE, we will
			 * truncate the K+1st segment to 0 length but keep it.  This is
			 * mainly so that the right thing happens if nblocks==0.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			/* lastsegblocks < RELSEG_SIZE here, so the product cannot wrap */
			if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
				return InvalidBlockNumber;
			if (!isTemp)
			{
				if (!register_dirty_segment(reln, v))
					return InvalidBlockNumber;
			}
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it,
			 * so nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
#else

	/*
	 * Widen before multiplying.  nblocks * BLCKSZ on its own is computed in
	 * 32-bit unsigned arithmetic and would wrap for lengths of 4GB and up,
	 * truncating the file to the wrong size even where long is wide enough
	 * to hold the true length.
	 */
	if (FileTruncate(v->mdfd_vfd,
					 (long) ((unsigned long) nblocks * (unsigned long) BLCKSZ)) < 0)
		return InvalidBlockNumber;
	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return InvalidBlockNumber;
	}
#endif

	return nblocks;
}
/*
 *	mdimmedsync() -- Force a relation's files out to stable storage right now.
 *
 * Returns true on success; false if the relation's size could not be
 * determined or if syncing any segment fails.
 */
bool
mdimmedsync(SMgrRelation reln)
{
	MdfdVec    *seg;

	/*
	 * Calling mdnblocks first guarantees that every existing segment has
	 * been opened, so the fsync pass below will reach all of them.
	 */
	if (mdnblocks(reln) == InvalidBlockNumber)
		return false;			/* mdnblocks failed */

	seg = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	for (; seg != NULL; seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;
	}
#else
	if (FileSync(seg->mdfd_vfd) < 0)
		return false;
#endif

	return true;
}
/*
 *	mdsync() -- Flush previously written segments to stable storage.
 *
 * Only called during checkpoints, and checkpoints are only expected in
 * processes that created a pendingOpsTable.
 *
 * Returns true once every pending entry has been processed and removed.
 * Returns false if there is no pendingOpsTable, or as soon as an fsync
 * fails (after logging it); entries not yet processed stay in the table.
 */
bool
mdsync(void)
{
	HASH_SEQ_STATUS scan;
	PendingOperationEntry *pending;

	if (pendingOpsTable == NULL)
		return false;

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends before the checkpoint REDO
	 * point was determined.  We do a little better than that by accepting
	 * everything queued up to the moment we begin fsync'ing.
	 */
	AbsorbFsyncRequests();

	hash_seq_init(&scan, pendingOpsTable);
	for (;;)
	{
		pending = (PendingOperationEntry *) hash_seq_search(&scan);
		if (pending == NULL)
			break;

		/*
		 * With fsync turned off there is no need to open the file at all.
		 * (The check is deferred to this point so that changing fsync on
		 * the fly behaves sensibly.)
		 */
		if (enableFsync)
		{
			SMgrRelation reln;
			MdfdVec    *seg;

			/*
			 * Find or create an smgr hash entry for this relation.  md
			 * calling smgr may look unclean, but it ensures the open file
			 * reference isn't permanently leaked if we error out here.  An
			 * unreferenced SMgrRelation is not really a leak either: the
			 * only process that checkpoints without being about to shut
			 * down is the bgwriter, which periodically does
			 * smgrcloseall().  That also justifies not closing the reln on
			 * the success path, which non-bgwriter cases could not do
			 * safely anyway.  Besides, the relation will often have been
			 * dirtied through this same smgr relation, saving a file
			 * open/close cycle.
			 */
			reln = smgropen(pending->rnode);

			/*
			 * The relation may have been dropped or truncated since the
			 * fsync request was entered, so file-not-found must be
			 * tolerated both in _mdfd_getseg() and in FileSync (fd.c might
			 * have closed the file behind our back).
			 */
			seg = _mdfd_getseg(reln,
							   pending->segno * ((BlockNumber) RELSEG_SIZE),
							   true);

			if (seg != NULL &&
				FileSync(seg->mdfd_vfd) < 0 &&
				errno != ENOENT)
			{
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
								pending->segno,
								pending->rnode.spcNode,
								pending->rnode.dbNode,
								pending->rnode.relNode)));
				return false;
			}
		}

		/* This request is dealt with; drop it from the table. */
		if (hash_search(pendingOpsTable, pending,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}

	return true;
}
/*
 *	register_dirty_segment() -- Note that a relation segment needs an fsync.
 *
 * With a local pending-ops table, simply enter the segment there for mdsync
 * to handle later.  Without one, try to hand the request to the background
 * writer.  If neither works, fsync the file ourselves before returning (we
 * expect this to be too rare to be a performance problem).
 *
 * A false result means the local fsync failed; errno is then valid for
 * error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable == NULL)
	{
		/* No local table: ask the bgwriter to take care of it. */
		if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
			return true;
	}
	else
	{
		PendingOperationEntry request;

		/* zero the whole struct first so that pad bytes are well-defined */
		MemSet(&request, 0, sizeof(request));
		request.rnode = reln->smgr_rnode;
		request.segno = seg->mdfd_segno;

		if (hash_search(pendingOpsTable, &request, HASH_ENTER, NULL) != NULL)
			return true;

		/* out of memory: fall out and do the fsync locally */
	}

	/* Last resort: sync the file right here. */
	return FileSync(seg->mdfd_vfd) >= 0;
}
/*
 *	RememberFsyncRequest() -- bgwriter-side callback for an fsync request
 *
 * Records the request in the local hash table, to be carried out during the
 * bgwriter's next checkpoint.  Being unable to add the entry is reported as
 * a FATAL out-of-memory error.
 */
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
	PendingOperationEntry request;
	void	   *stored;

	Assert(pendingOpsTable);

	/* zero the whole struct first so that pad bytes are well-defined */
	MemSet(&request, 0, sizeof(request));
	request.rnode = rnode;
	request.segno = segno;

	stored = hash_search(pendingOpsTable, &request, HASH_ENTER, NULL);
	if (stored != NULL)
		return;

	ereport(FATAL,
			(errcode(ERRCODE_OUT_OF_MEMORY),
			 errmsg("out of memory")));
}
/*
 *	_fdvec_alloc() -- Allocate an MdfdVec object in the MdCxt context.
 *
 * The fields of the returned object are not initialized here; the caller
 * fills them in.
 */
static MdfdVec *
_fdvec_alloc(void)
{
	MdfdVec    *vec;

	vec = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(*vec));
	return vec;
}
#ifndef LET_OS_MANAGE_FILESIZE
/*
 *	_mdfd_openseg() -- Open one segment file of a relation.
 *
 * Segment 0 lives at the relation's base path; segment N > 0 at
 * "<basepath>.N".  oflags is OR'ed into the open flags (callers pass
 * O_CREAT or 0).
 *
 * Returns a freshly allocated MdfdVec for the segment, or NULL if the file
 * could not be opened.
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
{
	char	   *segpath = relpath(reln->smgr_rnode);
	int			fd;
	MdfdVec    *seg;

	if (segno > 0)
	{
		/* 12 extra bytes: '.', up to 10 digits of segment number, and NUL */
		char	   *basepath = segpath;

		segpath = (char *) palloc(strlen(basepath) + 12);
		sprintf(segpath, "%s.%u", basepath, segno);
		pfree(basepath);
	}

	/* Open the file; the path string is not needed after that. */
	fd = FileNameOpenFile(segpath, O_RDWR | PG_BINARY | oflags, 0600);
	pfree(segpath);

	if (fd < 0)
		return NULL;

	/* Make and fill in the descriptor entry for this segment. */
	seg = _fdvec_alloc();
	seg->mdfd_vfd = fd;
	seg->mdfd_segno = segno;
	seg->mdfd_chain = NULL;
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));

	return seg;
}
#endif /* LET_OS_MANAGE_FILESIZE */
/*
 *	_mdfd_getseg() -- Locate the segment of the relation that holds a block.
 *
 * Walks the segment chain from segment 0, opening segments that are not in
 * the chain yet.  Failure is reported with ereport(ERROR), except that when
 * allowNotFound is true an ENOENT failure yields NULL instead.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
{
	MdfdVec    *seg = mdopen(reln, allowNotFound);

#ifndef LET_OS_MANAGE_FILESIZE
	BlockNumber targetsegno;
	BlockNumber nextsegno;

	if (seg == NULL)
		return NULL;			/* only possible if allowNotFound */

	targetsegno = blkno / ((BlockNumber) RELSEG_SIZE);

	for (nextsegno = 1; nextsegno <= targetsegno; nextsegno++)
	{
		if (seg->mdfd_chain == NULL)
		{
			/*
			 * Normally we create a missing segment only if it is the one
			 * containing the target block.  This prevents Sorcerer's
			 * Apprentice syndrome if a bug at higher levels hands us a
			 * ridiculously large blkno --- otherwise we could create many
			 * thousands of empty segment files on the way to the "target"
			 * block.  We should never need to create more than one new
			 * segment per call, so the restriction seems reasonable.
			 *
			 * BUT: during WAL recovery this logic is disabled and segments
			 * are created unconditionally.  There it seems better to
			 * assume the given blkno is good (it presumably came from a
			 * CRC-checked WAL record); it also lets us cope with replaying
			 * a write into a high-numbered segment of a relation that was
			 * later deleted.  We want to go ahead and create the segments
			 * so we can finish out the replay.
			 */
			int			createflag;

			if (nextsegno == targetsegno || InRecovery)
				createflag = O_CREAT;
			else
				createflag = 0;

			seg->mdfd_chain = _mdfd_openseg(reln, nextsegno, createflag);

			if (seg->mdfd_chain == NULL)
			{
				if (allowNotFound && errno == ENOENT)
					return NULL;
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
								nextsegno,
								reln->smgr_rnode.spcNode,
								reln->smgr_rnode.dbNode,
								reln->smgr_rnode.relNode,
								blkno)));
			}
		}

		seg = seg->mdfd_chain;
	}
#endif

	return seg;
}
/*
 *	_mdnblocks() -- Count the whole blocks present in a single disk file.
 *
 * The file length is found by seeking to the end of the file.  A failed
 * seek is treated as an empty file, so 0 is returned in that case.
 */
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
	long		endpos = FileSeek(file, 0L, SEEK_END);

	if (endpos >= 0)
		return (BlockNumber) (endpos / blcksz);

	return 0;					/* on failure, assume file is empty */
}