1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-03 15:22:11 +03:00
Files
postgres/src/backend/storage/smgr/md.c
2005-10-15 02:49:52 +00:00

977 lines
25 KiB
C

/*-------------------------------------------------------------------------
*
* md.c
* This code manages relations that reside on magnetic disk.
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118 2005/10/15 02:49:26 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
* easier to support relations that are larger than the operating
* system's file size limit (often 2GBytes). In order to do that,
* we break relations up into chunks of < 2GBytes and store one chunk
* in each of several files that represent the relation. See the
* BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
* All chunks except the last MUST have size exactly equal to RELSEG_SIZE
* blocks --- see mdnblocks() and mdtruncate().
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
* cache is, therefore, just the head of a list of MdfdVec objects.
* But note the md_fd pointer can be NULL, indicating relation not open.
*
* Note that mdfd_chain == NULL does not necessarily mean the relation
* doesn't have another segment after this one; we may just not have
* opened the next segment yet. (We could not have "all segments are
* in the chain" as an invariant anyway, since another backend could
* extend the relation when we weren't looking.)
*
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
/*
 * MdfdVec describes one open segment file of a relation.  When
 * LET_OS_MANAGE_FILESIZE is not defined, the entries for a relation's
 * segments are linked together through mdfd_chain.
 */
typedef struct _MdfdVec
{
File mdfd_vfd; /* fd number in fd.c's pool */
BlockNumber mdfd_segno; /* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
#endif
} MdfdVec;
/* Created by mdinit(); _fdvec_alloc() and the pending-ops table allocate here */
static MemoryContext MdCxt; /* context for all md.c allocations */
/*
* In some contexts (currently, standalone backends and the bgwriter process)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
* table remembers the pending operations. We use a hash table not because
* we want to look up individual operations, but simply as a convenient way
* of eliminating duplicate requests.
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
*
* XXX for WIN32, may want to expand this to track pending deletes, too.
*/
/*
 * Hash table entry (and key: mdinit sets keysize == entrysize) identifying
 * one relation segment with an fsync pending.
 */
typedef struct
{
RelFileNode rnode; /* the targeted relation */
BlockNumber segno; /* which segment */
} PendingOperationEntry;
/* NULL unless mdinit() decided this process tracks fsync requests locally */
static HTAB *pendingOpsTable = NULL;
/*
 * local routines
 *
 * _mdfd_openseg is only declared (and defined) when LET_OS_MANAGE_FILESIZE
 * is not defined, i.e. when relations are split into multiple segment files.
 */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
bool allowNotFound);
static BlockNumber _mdnblocks(File file, Size blcksz);
/*
 *	mdinit() -- Set up the private state of the magnetic disk storage manager.
 *
 * Creates the MdCxt memory context and, in processes that track fsync
 * requests themselves, the pending-operations hash table.  Always returns
 * true.
 */
bool
mdinit(void)
{
	HASHCTL		ctl;

	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_MINSIZE,
								  ALLOCSET_DEFAULT_INITSIZE,
								  ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * The pending-operations table is needed only when we are standalone
	 * (not under a postmaster) or when we are a bootstrap-mode subprocess of
	 * a postmaster (that is, a startup or bgwriter process).  Everyone else
	 * is done at this point.
	 */
	if (IsUnderPostmaster && !IsBootstrapProcessingMode())
		return true;

	MemSet(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(PendingOperationEntry);
	ctl.entrysize = sizeof(PendingOperationEntry);
	ctl.hash = tag_hash;
	ctl.hcxt = MdCxt;

	pendingOpsTable = hash_create("Pending Ops Table",
								  100L,
								  &ctl,
								  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);

	return true;
}
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, the relation is allowed to exist already.
 *
 * Returns true on success.  On failure returns false with errno set to the
 * error reported by the exclusive-create attempt.
 */
bool
mdcreate(SMgrRelation reln, bool isRedo)
{
	char	   *relfile;
	File		vfd;
	MdfdVec    *seg;

	/* During redo the relation may have been created and opened already */
	if (isRedo && reln->md_fd != NULL)
		return true;

	Assert(reln->md_fd == NULL);

	relfile = relpath(reln->smgr_rnode);

	vfd = PathNameOpenFile(relfile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (vfd < 0)
	{
		int			create_errno = errno;

		/*
		 * During bootstrap, a system relation can be accessed (by internal
		 * backend processes) before the bootstrap script nominally creates
		 * it.  So tolerate a pre-existing file in bootstrap mode as well as
		 * in redo.  (See also mdopen)
		 */
		bool		may_exist = isRedo || IsBootstrapProcessingMode();

		if (may_exist)
			vfd = PathNameOpenFile(relfile, O_RDWR | PG_BINARY, 0600);

		if (vfd < 0)
		{
			pfree(relfile);
			/* report the create failure, not the plain-open failure */
			errno = create_errno;
			return false;
		}
		errno = 0;
	}

	pfree(relfile);

	/* Build the descriptor for segment 0 and attach it to the relation */
	seg = _fdvec_alloc();
	seg->mdfd_vfd = vfd;
	seg->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	seg->mdfd_chain = NULL;
#endif
	reln->md_fd = seg;

	return true;
}
/*
 *	mdunlink() -- Unlink a relation.
 *
 * We are handed a RelFileNode rather than an SMgrRelation: by the time this
 * is called, there won't be an SMgrRelation hashtable entry anymore.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 *
 * Returns true on success; on failure returns false with errno set to the
 * error from the unlink() that failed.
 */
bool
mdunlink(RelFileNode rnode, bool isRedo)
{
	char	   *relfile = relpath(rnode);
	int			failed_errno = 0;
	bool		ok = true;

	/* Remove the first segment (the only one when not segmenting) */
	if (unlink(relfile) < 0 && !(isRedo && errno == ENOENT))
	{
		ok = false;
		failed_errno = errno;
	}

#ifndef LET_OS_MANAGE_FILESIZE
	/* Remove segments 1, 2, ... until one of them cannot be unlinked */
	if (ok)
	{
		/* room for '.', up to ten digits of segment number, and the NUL */
		char	   *segfile = (char *) palloc(strlen(relfile) + 12);
		BlockNumber segno = 1;

		while (true)
		{
			sprintf(segfile, "%s.%u", relfile, segno);
			if (unlink(segfile) >= 0)
			{
				segno++;
				continue;
			}

			/* ENOENT just means we ran off the end of the segment list */
			if (errno != ENOENT)
			{
				ok = false;
				failed_errno = errno;
			}
			break;
		}

		pfree(segfile);
	}
#endif

	pfree(relfile);

	errno = failed_errno;
	return ok;
}
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 * Semantically this is mdwrite(): the block is written at the specified
 * position.  The difference is that the caller expects to be extending the
 * relation (blocknum is the current EOF), so a partially written page is
 * cleaned up by truncating it away again.
 *
 * Returns true or false, with errno set as appropriate.
 *
 * Note: the block position is passed in rather than computed here with
 * mdnblocks(), because the caller must already know where the block goes
 * and passing it saves a seek.
 */
bool
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	MdfdVec    *seg;
	long		offset;
	int			written;

	seg = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(offset < BLCKSZ * RELSEG_SIZE);
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	/*
	 * The caller got blocknum from _mdnblocks, which did a seek(SEEK_END),
	 * so this seek is often redundant and fd.c will optimize it away.  It is
	 * needed when the file ends in a partial page (which we want to
	 * overwrite with a full page), or when bufmgr.c had to dump another
	 * buffer of the same file to make room for the new page's buffer.
	 */
	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return false;

	written = FileWrite(seg->mdfd_vfd, buffer, BLCKSZ);
	if (written != BLCKSZ)
	{
		if (written > 0)
		{
			int			write_errno = errno;

			/* Get rid of the partially-written page, keeping the errno */
			FileTruncate(seg->mdfd_vfd, offset);
			FileSeek(seg->mdfd_vfd, offset, SEEK_SET);
			errno = write_errno;
		}
		return false;
	}

	if (!isTemp && !register_dirty_segment(reln, seg))
		return false;

#ifndef LET_OS_MANAGE_FILESIZE
	Assert(_mdnblocks(seg->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return true;
}
/*
 *	mdopen() -- Open the specified relation.  ereport's on failure.
 *		(Optionally, can return NULL instead of ereport for ENOENT.)
 *
 * reln:			relation to open; its md_fd is filled in on success.
 * allowNotFound:	if true, a file-not-found failure yields a NULL result
 *					instead of an error.
 *
 * Returns the MdfdVec for the first segment.  If the relation is already
 * open, the existing MdfdVec is returned without touching the filesystem.
 *
 * Note we only open the first segment, when there are multiple segments.
 */
static MdfdVec *
mdopen(SMgrRelation reln, bool allowNotFound)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd)
		return reln->md_fd;

	path = relpath(reln->smgr_rnode);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/*
			 * Preserve the open failure's errno across pfree(): it is
			 * examined just below and reported via %m, and freeing memory
			 * is not guaranteed to leave errno untouched.  (mdcreate
			 * restores errno after pfree on its failure path, too.)
			 */
			int			save_errno = errno;

			pfree(path);
			errno = save_errno;

			if (allowNotFound && errno == ENOENT)
				return NULL;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open relation %u/%u/%u: %m",
							reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							reln->smgr_rnode.relNode)));
		}
	}

	pfree(path);

	reln->md_fd = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return mdfd;
}
/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 *
 * Closes every segment file currently in the relation's chain and frees the
 * MdfdVec entries.  Returns true or false with errno set as appropriate
 * (as written, it always returns true).
 */
bool
mdclose(SMgrRelation reln)
{
	MdfdVec    *seg = reln->md_fd;

	/* Nothing to do for a relation that is not open */
	if (seg == NULL)
		return true;

	/* Detach first, so no dangling pointer is left behind after an error */
	reln->md_fd = NULL;

#ifndef LET_OS_MANAGE_FILESIZE
	while (seg != NULL)
	{
		MdfdVec    *next = seg->mdfd_chain;

		/* close the file unless that already happened */
		if (seg->mdfd_vfd >= 0)
			FileClose(seg->mdfd_vfd);

		pfree(seg);
		seg = next;
	}
#else
	if (seg->mdfd_vfd >= 0)
		FileClose(seg->mdfd_vfd);
	pfree(seg);
#endif

	return true;
}
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * Returns true if the buffer was filled (possibly with zeroes, see below),
 * false if the seek or read failed.
 */
bool
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
	MdfdVec    *seg;
	long		offset;
	int			nread;

	seg = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(offset < BLCKSZ * RELSEG_SIZE);
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
		return false;

	nread = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (nread == BLCKSZ)
		return true;

	/*
	 * A read at or past EOF yields zeroes without complaint, and so does a
	 * partial block found at EOF.
	 *
	 * XXX this is really ugly, bad design.  However the current
	 * implementation of hash indexes requires it, because hash index pages
	 * are initialized out-of-order.
	 */
	if (nread == 0 ||
		(nread > 0 && mdnblocks(reln) == blocknum))
	{
		MemSet(buffer, 0, BLCKSZ);
		return true;
	}

	return false;
}
/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 * For non-temp relations the written segment is also registered as needing
 * fsync.  Returns true on success, false if the seek, the write, or the
 * registration failed.
 */
bool
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	MdfdVec    *seg;
	long		offset;

	seg = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	offset = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(offset < BLCKSZ * RELSEG_SIZE);
#else
	offset = (long) (BLCKSZ * (blocknum));
#endif

	/* position, then write; the write is skipped when the seek fails */
	if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset ||
		FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return false;

	/* temp relations are never registered for fsync */
	if (isTemp)
		return true;

	return register_dirty_segment(reln, seg);
}
/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all segments of the relation are opened
 *		and added to the mdfd_chain list.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the chain...
 *
 *		Returns # of blocks, or InvalidBlockNumber on error (which here
 *		means the next segment could not be opened/created).  A segment
 *		larger than RELSEG_SIZE blocks is reported with elog(FATAL).
 */
BlockNumber
mdnblocks(SMgrRelation reln)
{
/* allowNotFound is false, so mdopen ereports instead of returning NULL */
MdfdVec *v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
/* number of segments preceding v in the chain */
BlockNumber segno = 0;
/*
 * Skip through any segments that aren't the last one, to avoid redundant
 * seeks on them. We have previously verified that these segments are
 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 * (NOTE: this assumption could only be wrong if another backend has
 * truncated the relation. We rely on higher code levels to handle that
 * scenario by closing and re-opening the md fd.)
 */
while (v->mdfd_chain != NULL)
{
segno++;
v = v->mdfd_chain;
}
/*
 * Starting from the last segment already in the chain, measure each
 * segment; a short one ends the relation, a full one means we must look
 * at the following segment too.
 */
for (;;)
{
nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
elog(FATAL, "segment too big");
if (nblocks < ((BlockNumber) RELSEG_SIZE))
return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
/*
 * If segment is exactly RELSEG_SIZE, advance to next one.
 */
segno++;
if (v->mdfd_chain == NULL)
{
/*
 * Because we pass O_CREAT, we will create the next segment (with
 * zero length) immediately, if the last segment is of length
 * RELSEG_SIZE. This is unnecessary but harmless, and testing for
 * the case would take more cycles than it seems worth.
 */
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
if (v->mdfd_chain == NULL)
return InvalidBlockNumber; /* failed? */
}
v = v->mdfd_chain;
}
#else
/* single-file case: the relation's size is just the size of its one file */
return _mdnblocks(v->mdfd_vfd, BLCKSZ);
#endif
}
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * reln:	relation to truncate
 * nblocks:	desired length in blocks; must not exceed the current length
 * isTemp:	if true, the truncated segment is not registered for fsync
 *
 * Returns # of blocks or InvalidBlockNumber on error.  Errors are: the
 * current length could not be determined, nblocks is larger than the
 * current length, or truncating/registering the last kept segment failed.
 */
BlockNumber
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
{
MdfdVec *v;
BlockNumber curnblk;
#ifndef LET_OS_MANAGE_FILESIZE
/* total capacity, in blocks, of the segments preceding v */
BlockNumber priorblocks;
#endif
/*
 * NOTE: mdnblocks makes sure we have opened all existing segments, so
 * that truncate/delete loop will get them all!
 */
curnblk = mdnblocks(reln);
if (curnblk == InvalidBlockNumber)
return InvalidBlockNumber; /* mdnblocks failed */
if (nblocks > curnblk)
return InvalidBlockNumber; /* bogus request */
if (nblocks == curnblk)
return nblocks; /* no work */
/* relation is already open after mdnblocks; this just fetches the chain head */
v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
/*
 * Walk the whole chain.  Each segment falls into one of three classes:
 * entirely beyond the new end (delete it), containing the new end
 * (shorten it and cut the chain after it), or entirely before the new end
 * (leave it alone).
 */
while (v != NULL)
{
MdfdVec *ov = v;
if (priorblocks > nblocks)
{
/*
 * This segment is no longer wanted at all (and has already been
 * unlinked from the mdfd_chain). We truncate the file before
 * deleting it because if other backends are holding the file
 * open, the unlink will fail on some platforms. Better a
 * zero-size file gets left around than a big file...
 */
FileTruncate(v->mdfd_vfd, 0);
FileUnlink(v->mdfd_vfd);
/* fetch the successor before freeing this entry */
v = v->mdfd_chain;
Assert(ov != reln->md_fd); /* we never drop the 1st segment */
pfree(ov);
}
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
{
/*
 * This is the last segment we want to keep. Truncate the file to
 * the right length, and clear chain link that points to any
 * remaining segments (which we shall zap). NOTE: if nblocks is
 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 * segment to 0 length but keep it. This is mainly so that the
 * right thing happens if nblocks==0.
 */
BlockNumber lastsegblocks = nblocks - priorblocks;
if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
return InvalidBlockNumber;
if (!isTemp)
{
if (!register_dirty_segment(reln, v))
return InvalidBlockNumber;
}
/* keep walking the old tail via v, but detach it from the kept segment */
v = v->mdfd_chain;
ov->mdfd_chain = NULL;
}
else
{
/*
 * We still need this segment and 0 or more blocks beyond it, so
 * nothing to do here.
 */
v = v->mdfd_chain;
}
priorblocks += RELSEG_SIZE;
}
#else
/* single-file case: just shorten the one file */
if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
return InvalidBlockNumber;
if (!isTemp)
{
if (!register_dirty_segment(reln, v))
return InvalidBlockNumber;
}
#endif
return nblocks;
}
/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Only writes that have already been issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 *
 * Returns true on success, false if the relation's size could not be
 * determined or an fsync failed.
 */
bool
mdimmedsync(SMgrRelation reln)
{
	MdfdVec    *seg;

	/*
	 * NOTE: mdnblocks is called for its side effect of opening every
	 * existing segment, so that the fsync loop below reaches them all.
	 */
	if (mdnblocks(reln) == InvalidBlockNumber)
		return false;			/* mdnblocks failed */

	seg = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	for (; seg != NULL; seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;
	}
#else
	if (FileSync(seg->mdfd_vfd) < 0)
		return false;
#endif

	return true;
}
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * This is only called during checkpoints, and checkpoints should only
 * occur in processes that have created a pendingOpsTable.
 *
 * Returns false if there is no pendingOpsTable, or if an fsync fails with
 * an error other than ENOENT (the failure is logged first).  Returns true
 * once every pending entry has been processed and removed from the table.
 */
bool
mdsync(void)
{
HASH_SEQ_STATUS hstat;
PendingOperationEntry *entry;
if (!pendingOpsTable)
return false;
/*
 * If we are in the bgwriter, the sync had better include all fsync
 * requests that were queued by backends before the checkpoint REDO point
 * was determined. We go that a little better by accepting all requests
 * queued up to the point where we start fsync'ing.
 */
AbsorbFsyncRequests();
/* Visit every pending entry; each is removed once it has been handled */
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
/*
 * If fsync is off then we don't have to bother opening the file at
 * all. (We delay checking until this point so that changing fsync on
 * the fly behaves sensibly.)
 */
if (enableFsync)
{
SMgrRelation reln;
MdfdVec *seg;
/*
 * Find or create an smgr hash entry for this relation. This may
 * seem a bit unclean -- md calling smgr? But it's really the
 * best solution. It ensures that the open file reference isn't
 * permanently leaked if we get an error here. (You may say "but
 * an unreferenced SMgrRelation is still a leak!" Not really,
 * because the only case in which a checkpoint is done by a
 * process that isn't about to shut down is in the bgwriter, and
 * it will periodically do smgrcloseall(). This fact justifies
 * our not closing the reln in the success path either, which is a
 * good thing since in non-bgwriter cases we couldn't safely do
 * that.) Furthermore, in many cases the relation will have been
 * dirtied through this same smgr relation, and so we can save a
 * file open/close cycle.
 */
reln = smgropen(entry->rnode);
/*
 * It is possible that the relation has been dropped or truncated
 * since the fsync request was entered. Therefore, we have to
 * allow file-not-found errors. This applies both during
 * _mdfd_getseg() and during FileSync, since fd.c might have
 * closed the file behind our back.
 */
/* the segment's first block number is enough to locate the segment */
seg = _mdfd_getseg(reln,
entry->segno * ((BlockNumber) RELSEG_SIZE),
true);
if (seg)
{
if (FileSync(seg->mdfd_vfd) < 0 &&
errno != ENOENT)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
entry->segno,
entry->rnode.spcNode,
entry->rnode.dbNode,
entry->rnode.relNode)));
/* the failed entry and any unvisited ones remain in the table */
return false;
}
}
}
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, entry,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
return true;
}
/*
 *	register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * With a local pending-ops table, an entry is made there for mdsync to
 * process later.  Without one, the request is handed to the background
 * writer process; if that hand-off fails, the segment is fsync'd right here
 * before returning (this is not expected to happen often enough to be a
 * performance problem).
 *
 * A false result implies I/O failure during the local fsync; errno will be
 * valid for error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable != NULL)
	{
		PendingOperationEntry key;

		/* zero first, so that any pad bytes in the struct are zeroed too */
		MemSet(&key, 0, sizeof(key));
		key.rnode = reln->smgr_rnode;
		key.segno = seg->mdfd_segno;

		(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);
		return true;
	}

	/* No local table: try to pass the request to the bgwriter */
	if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
		return true;

	/* Could not forward it, so do the fsync ourselves */
	return FileSync(seg->mdfd_vfd) >= 0;
}
/*
 *	RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
 * Records the request in the local hash table, to be executed during the
 * bgwriter's next checkpoint.  The table must already exist.
 */
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
	PendingOperationEntry key;

	Assert(pendingOpsTable);

	/* zero the whole struct first so that any pad bytes are zeroed */
	MemSet(&key, 0, sizeof(key));
	key.segno = segno;
	key.rnode = rnode;

	(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);
}
/*
 *	_fdvec_alloc() -- Make a MdfdVec object.
 *
 * The object is allocated in MdCxt; its fields are left for the caller to
 * fill in.
 */
static MdfdVec *
_fdvec_alloc(void)
{
	MdfdVec    *vec;

	vec = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(*vec));

	return vec;
}
#ifndef LET_OS_MANAGE_FILESIZE

/*
 * Open the specified segment of the relation,
 * and make a MdfdVec object for it.  Returns NULL on failure.
 *
 * reln:	relation the segment belongs to
 * segno:	segment number; segment 0 is the bare relation path, later
 *			segments are named "<path>.<segno>"
 * oflags:	extra open(2) flags OR'd into O_RDWR | PG_BINARY (callers pass
 *			O_CREAT or 0)
 *
 * On failure errno is that of the failed open, so callers can test it and
 * report it with %m.  The returned entry is not linked into any chain;
 * that is the caller's job.
 */
static MdfdVec *
_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
{
	MdfdVec    *v;
	File		fd;
	char	   *path,
			   *fullpath;

	path = relpath(reln->smgr_rnode);

	if (segno > 0)
	{
		/* be sure we have enough space for the '.segno' */
		fullpath = (char *) palloc(strlen(path) + 12);
		sprintf(fullpath, "%s.%u", path, segno);
		pfree(path);
	}
	else
		fullpath = path;

	/* open the file */
	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);

	if (fd < 0)
	{
		/*
		 * Keep the open failure's errno intact across pfree(): the caller
		 * inspects errno (ENOENT test, %m), and freeing memory is not
		 * guaranteed to leave it untouched.
		 */
		int			save_errno = errno;

		pfree(fullpath);
		errno = save_errno;
		return NULL;
	}

	pfree(fullpath);

	/* allocate an mdfdvec entry for it */
	v = _fdvec_alloc();

	/* fill the entry */
	v->mdfd_vfd = fd;
	v->mdfd_segno = segno;
	v->mdfd_chain = NULL;
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));

	/* all done */
	return v;
}
#endif   /* LET_OS_MANAGE_FILESIZE */
/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
 *		specified block.  ereport's on failure.
 *		(Optionally, can return NULL instead of ereport for ENOENT.)
 *
 * reln:			relation to look in (opened here if necessary)
 * blkno:			block number whose segment is wanted
 * allowNotFound:	if true, a file-not-found failure yields NULL instead
 *					of an error
 *
 * Segments not yet in the relation's chain are opened and appended to it
 * along the way.  When LET_OS_MANAGE_FILESIZE is defined there is only one
 * file, and the result is simply what mdopen returns.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
{
MdfdVec *v = mdopen(reln, allowNotFound);
#ifndef LET_OS_MANAGE_FILESIZE
/* segstogo: chain links still to follow; nextsegno: number of the next segment */
BlockNumber segstogo;
BlockNumber nextsegno;
if (!v)
return NULL; /* only possible if allowNotFound */
for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
segstogo > 0;
nextsegno++, segstogo--)
{
if (v->mdfd_chain == NULL)
{
/*
 * We will create the next segment only if the target block is
 * within it. This prevents Sorcerer's Apprentice syndrome if a
 * bug at higher levels causes us to be handed a ridiculously
 * large blkno --- otherwise we could create many thousands of
 * empty segment files before reaching the "target" block. We
 * should never need to create more than one new segment per call,
 * so this restriction seems reasonable.
 *
 * BUT: when doing WAL recovery, disable this logic and create
 * segments unconditionally. In this case it seems better to
 * assume the given blkno is good (it presumably came from a
 * CRC-checked WAL record); furthermore this lets us cope in the
 * case where we are replaying WAL data that has a write into a
 * high-numbered segment of a relation that was later deleted. We
 * want to go ahead and create the segments so we can finish out
 * the replay.
 */
/* segstogo == 1 means the segment being opened is the target itself */
v->mdfd_chain = _mdfd_openseg(reln,
nextsegno,
(segstogo == 1 || InRecovery) ? O_CREAT : 0);
if (v->mdfd_chain == NULL)
{
/* errno here is expected to be the one left by _mdfd_openseg's open */
if (allowNotFound && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
nextsegno,
reln->smgr_rnode.spcNode,
reln->smgr_rnode.dbNode,
reln->smgr_rnode.relNode,
blkno)));
}
}
v = v->mdfd_chain;
}
#endif
return v;
}
/*
 * Get number of blocks present in a single disk file.
 *
 * The file length is found by seeking to its end; a trailing partial block
 * is not counted.  If the seek fails the file is assumed to be empty and 0
 * is returned.
 */
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
	long		eof = FileSeek(file, 0L, SEEK_END);

	return (eof < 0) ? 0 : (BlockNumber) (eof / blcksz);
}