mirror of
https://github.com/postgres/postgres.git
synced 2025-05-03 22:24:49 +03:00
Improve scalability of md.c for large relations.
So far md.c used a linked list of segments. That proved to be a problem when processing large relations, because every smgr.c/md.c level access to a page required walking through a linked list of all preceding segments, making page access O(#segments). Replace the linked list of segments hanging off SMgrRelationData with an array of opened segments. That allows O(1) access to individual segments, if they've previously been opened. Discussion: <20140331101001.GE13135@alap3.anarazel.de> Reviewed-By: Peter Geoghegan, Tom Lane (in an older version)
This commit is contained in:
parent
417fefaf08
commit
45e191e3aa
@ -92,27 +92,23 @@
|
|||||||
* out to an unlinked old copy of a segment file that will eventually
|
* out to an unlinked old copy of a segment file that will eventually
|
||||||
* disappear.
|
* disappear.
|
||||||
*
|
*
|
||||||
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
|
* File descriptors are stored in the per-fork md_seg_fds arrays inside
|
||||||
* cache is, therefore, just the head of a list of MdfdVec objects, one
|
* SMgrRelation. The length of these arrays is stored in md_num_open_segs.
|
||||||
* per segment. But note the md_fd pointer can be NULL, indicating
|
* Note that a fork's md_num_open_segs having a specific value does not
|
||||||
* relation not open.
|
* necessarily mean the relation doesn't have additional segments; we may
|
||||||
*
|
* just not have opened the next segment yet. (We could not have "all
|
||||||
* Also note that mdfd_chain == NULL does not necessarily mean the relation
|
* segments are in the array" as an invariant anyway, since another backend
|
||||||
* doesn't have another segment after this one; we may just not have
|
* could extend the relation while we aren't looking.) We do not have
|
||||||
* opened the next segment yet. (We could not have "all segments are
|
|
||||||
* in the chain" as an invariant anyway, since another backend could
|
|
||||||
* extend the relation when we weren't looking.) We do not make chain
|
|
||||||
* entries for inactive segments, however; as soon as we find a partial
|
* entries for inactive segments, however; as soon as we find a partial
|
||||||
* segment, we assume that any subsequent segments are inactive.
|
* segment, we assume that any subsequent segments are inactive.
|
||||||
*
|
*
|
||||||
* All MdfdVec objects are palloc'd in the MdCxt memory context.
|
* The entire MdfdVec array is palloc'd in the MdCxt memory context.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct _MdfdVec
|
typedef struct _MdfdVec
|
||||||
{
|
{
|
||||||
File mdfd_vfd; /* fd number in fd.c's pool */
|
File mdfd_vfd; /* fd number in fd.c's pool */
|
||||||
BlockNumber mdfd_segno; /* segment number, from 0 */
|
BlockNumber mdfd_segno; /* segment number, from 0 */
|
||||||
struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
|
|
||||||
} MdfdVec;
|
} MdfdVec;
|
||||||
|
|
||||||
static MemoryContext MdCxt; /* context for all MdfdVec objects */
|
static MemoryContext MdCxt; /* context for all MdfdVec objects */
|
||||||
@ -189,7 +185,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
|
|||||||
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
|
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
|
||||||
MdfdVec *seg);
|
MdfdVec *seg);
|
||||||
static void register_unlink(RelFileNodeBackend rnode);
|
static void register_unlink(RelFileNodeBackend rnode);
|
||||||
static MdfdVec *_fdvec_alloc(void);
|
static void _fdvec_resize(SMgrRelation reln,
|
||||||
|
ForkNumber forknum,
|
||||||
|
int nseg);
|
||||||
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
|
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
|
||||||
BlockNumber segno);
|
BlockNumber segno);
|
||||||
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
|
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
|
||||||
@ -294,13 +292,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
void
|
void
|
||||||
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||||
{
|
{
|
||||||
|
MdfdVec *mdfd;
|
||||||
char *path;
|
char *path;
|
||||||
File fd;
|
File fd;
|
||||||
|
|
||||||
if (isRedo && reln->md_fd[forkNum] != NULL)
|
if (isRedo && reln->md_num_open_segs[forkNum] > 0)
|
||||||
return; /* created and opened already... */
|
return; /* created and opened already... */
|
||||||
|
|
||||||
Assert(reln->md_fd[forkNum] == NULL);
|
Assert(reln->md_num_open_segs[forkNum] == 0);
|
||||||
|
|
||||||
path = relpath(reln->smgr_rnode, forkNum);
|
path = relpath(reln->smgr_rnode, forkNum);
|
||||||
|
|
||||||
@ -330,11 +329,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
|||||||
|
|
||||||
pfree(path);
|
pfree(path);
|
||||||
|
|
||||||
reln->md_fd[forkNum] = _fdvec_alloc();
|
_fdvec_resize(reln, forkNum, 1);
|
||||||
|
mdfd = &reln->md_seg_fds[forkNum][0];
|
||||||
reln->md_fd[forkNum]->mdfd_vfd = fd;
|
mdfd->mdfd_vfd = fd;
|
||||||
reln->md_fd[forkNum]->mdfd_segno = 0;
|
mdfd->mdfd_segno = 0;
|
||||||
reln->md_fd[forkNum]->mdfd_chain = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -579,8 +577,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
|
|||||||
File fd;
|
File fd;
|
||||||
|
|
||||||
/* No work if already open */
|
/* No work if already open */
|
||||||
if (reln->md_fd[forknum])
|
if (reln->md_num_open_segs[forknum] > 0)
|
||||||
return reln->md_fd[forknum];
|
return &reln->md_seg_fds[forknum][0];
|
||||||
|
|
||||||
path = relpath(reln->smgr_rnode, forknum);
|
path = relpath(reln->smgr_rnode, forknum);
|
||||||
|
|
||||||
@ -612,11 +610,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
|
|||||||
|
|
||||||
pfree(path);
|
pfree(path);
|
||||||
|
|
||||||
reln->md_fd[forknum] = mdfd = _fdvec_alloc();
|
_fdvec_resize(reln, forknum, 1);
|
||||||
|
mdfd = &reln->md_seg_fds[forknum][0];
|
||||||
mdfd->mdfd_vfd = fd;
|
mdfd->mdfd_vfd = fd;
|
||||||
mdfd->mdfd_segno = 0;
|
mdfd->mdfd_segno = 0;
|
||||||
mdfd->mdfd_chain = NULL;
|
|
||||||
Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
|
Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
|
||||||
|
|
||||||
return mdfd;
|
return mdfd;
|
||||||
@ -628,25 +626,29 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
|
|||||||
void
|
void
|
||||||
mdclose(SMgrRelation reln, ForkNumber forknum)
|
mdclose(SMgrRelation reln, ForkNumber forknum)
|
||||||
{
|
{
|
||||||
MdfdVec *v = reln->md_fd[forknum];
|
int nopensegs = reln->md_num_open_segs[forknum];
|
||||||
|
|
||||||
/* No work if already closed */
|
/* No work if already closed */
|
||||||
if (v == NULL)
|
if (nopensegs == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
reln->md_fd[forknum] = NULL; /* prevent dangling pointer after error */
|
/* close segments starting from the end */
|
||||||
|
while (nopensegs > 0)
|
||||||
while (v != NULL)
|
|
||||||
{
|
{
|
||||||
MdfdVec *ov = v;
|
MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
|
||||||
|
|
||||||
/* if not closed already */
|
/* if not closed already */
|
||||||
if (v->mdfd_vfd >= 0)
|
if (v->mdfd_vfd >= 0)
|
||||||
|
{
|
||||||
FileClose(v->mdfd_vfd);
|
FileClose(v->mdfd_vfd);
|
||||||
/* Now free vector */
|
v->mdfd_vfd = -1;
|
||||||
v = v->mdfd_chain;
|
}
|
||||||
pfree(ov);
|
|
||||||
|
nopensegs--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* resize just once, avoids pointless reallocations */
|
||||||
|
_fdvec_resize(reln, forknum, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -862,9 +864,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
|||||||
* mdnblocks() -- Get the number of blocks stored in a relation.
|
* mdnblocks() -- Get the number of blocks stored in a relation.
|
||||||
*
|
*
|
||||||
* Important side effect: all active segments of the relation are opened
|
* Important side effect: all active segments of the relation are opened
|
||||||
* and added to the mdfd_chain list. If this routine has not been
|
* and added to the md_seg_fds array. If this routine has not been
|
||||||
* called, then only segments up to the last one actually touched
|
* called, then only segments up to the last one actually touched
|
||||||
* are present in the chain.
|
* are present in the array.
|
||||||
*/
|
*/
|
||||||
BlockNumber
|
BlockNumber
|
||||||
mdnblocks(SMgrRelation reln, ForkNumber forknum)
|
mdnblocks(SMgrRelation reln, ForkNumber forknum)
|
||||||
@ -873,24 +875,24 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
BlockNumber nblocks;
|
BlockNumber nblocks;
|
||||||
BlockNumber segno = 0;
|
BlockNumber segno = 0;
|
||||||
|
|
||||||
|
/* mdopen has opened the first segment */
|
||||||
|
Assert(reln->md_num_open_segs[forknum] > 0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Skip through any segments that aren't the last one, to avoid redundant
|
* Start from the last open segment, to avoid redundant seeks. We have
|
||||||
* seeks on them. We have previously verified that these segments are
|
* previously verified that these segments are exactly RELSEG_SIZE long,
|
||||||
* exactly RELSEG_SIZE long, and it's useless to recheck that each time.
|
* and it's useless to recheck that each time.
|
||||||
*
|
*
|
||||||
* NOTE: this assumption could only be wrong if another backend has
|
* NOTE: this assumption could only be wrong if another backend has
|
||||||
* truncated the relation. We rely on higher code levels to handle that
|
* truncated the relation. We rely on higher code levels to handle that
|
||||||
* scenario by closing and re-opening the md fd, which is handled via
|
* scenario by closing and re-opening the md fd, which is handled via
|
||||||
* relcache flush. (Since the checkpointer doesn't participate in
|
* relcache flush. (Since the checkpointer doesn't participate in
|
||||||
* relcache flush, it could have segment chain entries for inactive
|
* relcache flush, it could have segment entries for inactive segments;
|
||||||
* segments; that's OK because the checkpointer never needs to compute
|
* that's OK because the checkpointer never needs to compute relation
|
||||||
* relation size.)
|
* size.)
|
||||||
*/
|
*/
|
||||||
while (v->mdfd_chain != NULL)
|
segno = reln->md_num_open_segs[forknum] - 1;
|
||||||
{
|
v = &reln->md_seg_fds[forknum][segno];
|
||||||
segno++;
|
|
||||||
v = v->mdfd_chain;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
@ -905,21 +907,16 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
*/
|
*/
|
||||||
segno++;
|
segno++;
|
||||||
|
|
||||||
if (v->mdfd_chain == NULL)
|
/*
|
||||||
{
|
* We used to pass O_CREAT here, but that has the disadvantage that
|
||||||
/*
|
* it might create a segment which has vanished through some operating
|
||||||
* We used to pass O_CREAT here, but that's has the disadvantage
|
* system misadventure. In such a case, creating the segment here
|
||||||
* that it might create a segment which has vanished through some
|
* undermines _mdfd_getseg's attempts to notice and report an error
|
||||||
* operating system misadventure. In such a case, creating the
|
* upon access to a missing segment.
|
||||||
* segment here undermines _mdfd_getseg's attempts to notice and
|
*/
|
||||||
* report an error upon access to a missing segment.
|
v = _mdfd_openseg(reln, forknum, segno, 0);
|
||||||
*/
|
if (v == NULL)
|
||||||
v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, 0);
|
return segno * ((BlockNumber) RELSEG_SIZE);
|
||||||
if (v->mdfd_chain == NULL)
|
|
||||||
return segno * ((BlockNumber) RELSEG_SIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
v = v->mdfd_chain;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -929,9 +926,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
void
|
void
|
||||||
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||||
{
|
{
|
||||||
MdfdVec *v;
|
|
||||||
BlockNumber curnblk;
|
BlockNumber curnblk;
|
||||||
BlockNumber priorblocks;
|
BlockNumber priorblocks;
|
||||||
|
int curopensegs;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
||||||
@ -951,19 +948,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
if (nblocks == curnblk)
|
if (nblocks == curnblk)
|
||||||
return; /* no work */
|
return; /* no work */
|
||||||
|
|
||||||
v = mdopen(reln, forknum, EXTENSION_FAIL);
|
/*
|
||||||
|
* Truncate segments, starting at the last one. Starting at the end makes
|
||||||
priorblocks = 0;
|
* managing the memory for the fd array easier, should there be errors.
|
||||||
while (v != NULL)
|
*/
|
||||||
|
curopensegs = reln->md_num_open_segs[forknum];
|
||||||
|
while (curopensegs > 0)
|
||||||
{
|
{
|
||||||
MdfdVec *ov = v;
|
MdfdVec *v;
|
||||||
|
|
||||||
|
priorblocks = (curopensegs - 1) * RELSEG_SIZE;
|
||||||
|
|
||||||
|
v = &reln->md_seg_fds[forknum][curopensegs - 1];
|
||||||
|
|
||||||
if (priorblocks > nblocks)
|
if (priorblocks > nblocks)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* This segment is no longer active (and has already been unlinked
|
* This segment is no longer active. We truncate the file, but do
|
||||||
* from the mdfd_chain). We truncate the file, but do not delete
|
* not delete it, for reasons explained in the header comments.
|
||||||
* it, for reasons explained in the header comments.
|
|
||||||
*/
|
*/
|
||||||
if (FileTruncate(v->mdfd_vfd, 0) < 0)
|
if (FileTruncate(v->mdfd_vfd, 0) < 0)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
@ -973,21 +975,21 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
|
|
||||||
if (!SmgrIsTemp(reln))
|
if (!SmgrIsTemp(reln))
|
||||||
register_dirty_segment(reln, forknum, v);
|
register_dirty_segment(reln, forknum, v);
|
||||||
v = v->mdfd_chain;
|
|
||||||
Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
|
/* we never drop the 1st segment */
|
||||||
* segment */
|
Assert(v != &reln->md_seg_fds[forknum][0]);
|
||||||
FileClose(ov->mdfd_vfd);
|
|
||||||
pfree(ov);
|
FileClose(v->mdfd_vfd);
|
||||||
|
_fdvec_resize(reln, forknum, curopensegs - 1);
|
||||||
}
|
}
|
||||||
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
|
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* This is the last segment we want to keep. Truncate the file to
|
* This is the last segment we want to keep. Truncate the file to
|
||||||
* the right length, and clear chain link that points to any
|
* the right length. NOTE: if nblocks is exactly a multiple K of
|
||||||
* remaining segments (which we shall zap). NOTE: if nblocks is
|
* RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
|
||||||
* exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
|
* keep it. This adheres to the invariant given in the header
|
||||||
* segment to 0 length but keep it. This adheres to the invariant
|
* comments.
|
||||||
* given in the header comments.
|
|
||||||
*/
|
*/
|
||||||
BlockNumber lastsegblocks = nblocks - priorblocks;
|
BlockNumber lastsegblocks = nblocks - priorblocks;
|
||||||
|
|
||||||
@ -999,18 +1001,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
nblocks)));
|
nblocks)));
|
||||||
if (!SmgrIsTemp(reln))
|
if (!SmgrIsTemp(reln))
|
||||||
register_dirty_segment(reln, forknum, v);
|
register_dirty_segment(reln, forknum, v);
|
||||||
v = v->mdfd_chain;
|
|
||||||
ov->mdfd_chain = NULL;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* We still need this segment and 0 or more blocks beyond it, so
|
* We still need this segment, so nothing to do for this and any
|
||||||
* nothing to do here.
|
* earlier segment.
|
||||||
*/
|
*/
|
||||||
v = v->mdfd_chain;
|
break;
|
||||||
}
|
}
|
||||||
priorblocks += RELSEG_SIZE;
|
curopensegs--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1023,7 +1023,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
|||||||
void
|
void
|
||||||
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
||||||
{
|
{
|
||||||
MdfdVec *v;
|
int segno;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
* NOTE: mdnblocks makes sure we have opened all active segments, so that
|
||||||
@ -1031,16 +1031,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
|
|||||||
*/
|
*/
|
||||||
mdnblocks(reln, forknum);
|
mdnblocks(reln, forknum);
|
||||||
|
|
||||||
v = mdopen(reln, forknum, EXTENSION_FAIL);
|
segno = reln->md_num_open_segs[forknum];
|
||||||
|
|
||||||
while (v != NULL)
|
while (segno > 0)
|
||||||
{
|
{
|
||||||
|
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
|
||||||
|
|
||||||
if (FileSync(v->mdfd_vfd) < 0)
|
if (FileSync(v->mdfd_vfd) < 0)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode_for_file_access(),
|
(errcode_for_file_access(),
|
||||||
errmsg("could not fsync file \"%s\": %m",
|
errmsg("could not fsync file \"%s\": %m",
|
||||||
FilePathName(v->mdfd_vfd))));
|
FilePathName(v->mdfd_vfd))));
|
||||||
v = v->mdfd_chain;
|
segno--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1703,12 +1705,40 @@ ForgetDatabaseFsyncRequests(Oid dbid)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _fdvec_alloc() -- Make a MdfdVec object.
|
* _fdvec_resize() -- Resize the fork's open segments array
|
||||||
*/
|
*/
|
||||||
static MdfdVec *
|
static void
|
||||||
_fdvec_alloc(void)
|
_fdvec_resize(SMgrRelation reln,
|
||||||
|
ForkNumber forknum,
|
||||||
|
int nseg)
|
||||||
{
|
{
|
||||||
return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
|
if (nseg == 0)
|
||||||
|
{
|
||||||
|
if (reln->md_num_open_segs[forknum] > 0)
|
||||||
|
{
|
||||||
|
pfree(reln->md_seg_fds[forknum]);
|
||||||
|
reln->md_seg_fds[forknum] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (reln->md_num_open_segs[forknum] == 0)
|
||||||
|
{
|
||||||
|
reln->md_seg_fds[forknum] =
|
||||||
|
MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* It doesn't seem worthwhile complicating the code by having a more
|
||||||
|
* aggressive growth strategy here; the number of segments doesn't
|
||||||
|
* grow that fast, and the memory context internally will sometimes
|
||||||
|
* avoid doing an actual reallocation.
|
||||||
|
*/
|
||||||
|
reln->md_seg_fds[forknum] =
|
||||||
|
repalloc(reln->md_seg_fds[forknum],
|
||||||
|
sizeof(MdfdVec) * nseg);
|
||||||
|
}
|
||||||
|
|
||||||
|
reln->md_num_open_segs[forknum] = nseg;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1756,13 +1786,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
|
|||||||
if (fd < 0)
|
if (fd < 0)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* allocate an mdfdvec entry for it */
|
if (segno <= reln->md_num_open_segs[forknum])
|
||||||
v = _fdvec_alloc();
|
_fdvec_resize(reln, forknum, segno + 1);
|
||||||
|
|
||||||
/* fill the entry */
|
/* fill the entry */
|
||||||
|
v = &reln->md_seg_fds[forknum][segno];
|
||||||
v->mdfd_vfd = fd;
|
v->mdfd_vfd = fd;
|
||||||
v->mdfd_segno = segno;
|
v->mdfd_segno = segno;
|
||||||
v->mdfd_chain = NULL;
|
|
||||||
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
|
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
|
||||||
|
|
||||||
/* all done */
|
/* all done */
|
||||||
@ -1781,7 +1812,7 @@ static MdfdVec *
|
|||||||
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||||
bool skipFsync, int behavior)
|
bool skipFsync, int behavior)
|
||||||
{
|
{
|
||||||
MdfdVec *v = mdopen(reln, forknum, behavior);
|
MdfdVec *v;
|
||||||
BlockNumber targetseg;
|
BlockNumber targetseg;
|
||||||
BlockNumber nextsegno;
|
BlockNumber nextsegno;
|
||||||
|
|
||||||
@ -1789,98 +1820,116 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
|||||||
Assert(behavior &
|
Assert(behavior &
|
||||||
(EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
|
(EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
|
||||||
|
|
||||||
if (!v)
|
|
||||||
return NULL; /* if behavior & EXTENSION_RETURN_NULL */
|
|
||||||
|
|
||||||
targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
|
targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
|
||||||
for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
|
|
||||||
|
/* if an existing and opened segment, we're done */
|
||||||
|
if (targetseg < reln->md_num_open_segs[forknum])
|
||||||
{
|
{
|
||||||
|
v = &reln->md_seg_fds[forknum][targetseg];
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The target segment is not yet open. Iterate over all the segments
|
||||||
|
* between the last opened and the target segment. This way missing
|
||||||
|
* segments either raise an error, or get created (according to
|
||||||
|
* 'behavior'). Start with either the last opened, or the first segment if
|
||||||
|
* none was opened before.
|
||||||
|
*/
|
||||||
|
if (reln->md_num_open_segs[forknum] > 0)
|
||||||
|
v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
|
||||||
|
else
|
||||||
|
{
|
||||||
|
v = mdopen(reln, forknum, behavior);
|
||||||
|
if (!v)
|
||||||
|
return NULL; /* if behavior & EXTENSION_RETURN_NULL */
|
||||||
|
}
|
||||||
|
|
||||||
|
for (nextsegno = reln->md_num_open_segs[forknum];
|
||||||
|
nextsegno <= targetseg; nextsegno++)
|
||||||
|
{
|
||||||
|
BlockNumber nblocks = _mdnblocks(reln, forknum, v);
|
||||||
|
int flags = 0;
|
||||||
|
|
||||||
Assert(nextsegno == v->mdfd_segno + 1);
|
Assert(nextsegno == v->mdfd_segno + 1);
|
||||||
|
|
||||||
if (v->mdfd_chain == NULL)
|
if (nblocks > ((BlockNumber) RELSEG_SIZE))
|
||||||
|
elog(FATAL, "segment too big");
|
||||||
|
|
||||||
|
if ((behavior & EXTENSION_CREATE) ||
|
||||||
|
(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
|
||||||
{
|
{
|
||||||
BlockNumber nblocks = _mdnblocks(reln, forknum, v);
|
/*
|
||||||
int flags = 0;
|
* Normally we will create new segments only if authorized by the
|
||||||
|
* caller (i.e., we are doing mdextend()). But when doing WAL
|
||||||
|
* recovery, create segments anyway; this allows cases such as
|
||||||
|
* replaying WAL data that has a write into a high-numbered
|
||||||
|
* segment of a relation that was later deleted. We want to go
|
||||||
|
* ahead and create the segments so we can finish out the replay.
|
||||||
|
* However if the caller has specified
|
||||||
|
* EXTENSION_REALLY_RETURN_NULL, then extension is not desired
|
||||||
|
* even in recovery; we won't reach this point in that case.
|
||||||
|
*
|
||||||
|
* We have to maintain the invariant that segments before the last
|
||||||
|
* active segment are of size RELSEG_SIZE; therefore, if
|
||||||
|
* extending, pad them out with zeroes if needed. (This only
|
||||||
|
* matters if in recovery, or if the caller is extending the
|
||||||
|
* relation discontiguously, but that can happen in hash indexes.)
|
||||||
|
*/
|
||||||
|
if (nblocks < ((BlockNumber) RELSEG_SIZE))
|
||||||
|
{
|
||||||
|
char *zerobuf = palloc0(BLCKSZ);
|
||||||
|
|
||||||
if (nblocks > ((BlockNumber) RELSEG_SIZE))
|
mdextend(reln, forknum,
|
||||||
elog(FATAL, "segment too big");
|
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
|
||||||
|
zerobuf, skipFsync);
|
||||||
if ((behavior & EXTENSION_CREATE) ||
|
pfree(zerobuf);
|
||||||
(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
|
}
|
||||||
|
flags = O_CREAT;
|
||||||
|
}
|
||||||
|
else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
|
||||||
|
nblocks < ((BlockNumber) RELSEG_SIZE))
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* When not extending (or explicitly including truncated
|
||||||
|
* segments), only open the next segment if the current one is
|
||||||
|
* exactly RELSEG_SIZE. If not (this branch), either return NULL
|
||||||
|
* or fail.
|
||||||
|
*/
|
||||||
|
if (behavior & EXTENSION_RETURN_NULL)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Normally we will create new segments only if authorized by
|
* Some callers discern between reasons for _mdfd_getseg()
|
||||||
* the caller (i.e., we are doing mdextend()). But when doing
|
* returning NULL based on errno. As there's no failing
|
||||||
* WAL recovery, create segments anyway; this allows cases
|
* syscall involved in this case, explicitly set errno to
|
||||||
* such as replaying WAL data that has a write into a
|
* ENOENT, as that seems the closest interpretation.
|
||||||
* high-numbered segment of a relation that was later deleted.
|
|
||||||
* We want to go ahead and create the segments so we can
|
|
||||||
* finish out the replay. However if the caller has specified
|
|
||||||
* EXTENSION_REALLY_RETURN_NULL, then extension is not desired
|
|
||||||
* even in recovery; we won't reach this point in that case.
|
|
||||||
*
|
|
||||||
* We have to maintain the invariant that segments before the
|
|
||||||
* last active segment are of size RELSEG_SIZE; therefore, if
|
|
||||||
* extending, pad them out with zeroes if needed. (This only
|
|
||||||
* matters if in recovery, or if the caller is extending the
|
|
||||||
* relation discontiguously, but that can happen in hash
|
|
||||||
* indexes.)
|
|
||||||
*/
|
*/
|
||||||
if (nblocks < ((BlockNumber) RELSEG_SIZE))
|
errno = ENOENT;
|
||||||
{
|
return NULL;
|
||||||
char *zerobuf = palloc0(BLCKSZ);
|
|
||||||
|
|
||||||
mdextend(reln, forknum,
|
|
||||||
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
|
|
||||||
zerobuf, skipFsync);
|
|
||||||
pfree(zerobuf);
|
|
||||||
}
|
|
||||||
flags = O_CREAT;
|
|
||||||
}
|
|
||||||
else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
|
|
||||||
nblocks < ((BlockNumber) RELSEG_SIZE))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* When not extending (or explicitly including truncated
|
|
||||||
* segments), only open the next segment if the current one is
|
|
||||||
* exactly RELSEG_SIZE. If not (this branch), either return
|
|
||||||
* NULL or fail.
|
|
||||||
*/
|
|
||||||
if (behavior & EXTENSION_RETURN_NULL)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Some callers discern between reasons for _mdfd_getseg()
|
|
||||||
* returning NULL based on errno. As there's no failing
|
|
||||||
* syscall involved in this case, explicitly set errno to
|
|
||||||
* ENOENT, as that seems the closest interpretation.
|
|
||||||
*/
|
|
||||||
errno = ENOENT;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode_for_file_access(),
|
|
||||||
errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
|
|
||||||
_mdfd_segpath(reln, forknum, nextsegno),
|
|
||||||
blkno, nblocks)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, flags);
|
ereport(ERROR,
|
||||||
|
(errcode_for_file_access(),
|
||||||
|
errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
|
||||||
|
_mdfd_segpath(reln, forknum, nextsegno),
|
||||||
|
blkno, nblocks)));
|
||||||
|
}
|
||||||
|
|
||||||
if (v->mdfd_chain == NULL)
|
v = _mdfd_openseg(reln, forknum, nextsegno, flags);
|
||||||
{
|
|
||||||
if ((behavior & EXTENSION_RETURN_NULL) &&
|
if (v == NULL)
|
||||||
FILE_POSSIBLY_DELETED(errno))
|
{
|
||||||
return NULL;
|
if ((behavior & EXTENSION_RETURN_NULL) &&
|
||||||
ereport(ERROR,
|
FILE_POSSIBLY_DELETED(errno))
|
||||||
(errcode_for_file_access(),
|
return NULL;
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode_for_file_access(),
|
||||||
errmsg("could not open file \"%s\" (target block %u): %m",
|
errmsg("could not open file \"%s\" (target block %u): %m",
|
||||||
_mdfd_segpath(reln, forknum, nextsegno),
|
_mdfd_segpath(reln, forknum, nextsegno),
|
||||||
blkno)));
|
blkno)));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
v = v->mdfd_chain;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,7 +174,7 @@ smgropen(RelFileNode rnode, BackendId backend)
|
|||||||
|
|
||||||
/* mark it not open */
|
/* mark it not open */
|
||||||
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||||
reln->md_fd[forknum] = NULL;
|
reln->md_num_open_segs[forknum] = 0;
|
||||||
|
|
||||||
/* it has no owner yet */
|
/* it has no owner yet */
|
||||||
add_to_unowned_list(reln);
|
add_to_unowned_list(reln);
|
||||||
@ -379,7 +379,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
|||||||
* Exit quickly in WAL replay mode if we've already opened the file. If
|
* Exit quickly in WAL replay mode if we've already opened the file. If
|
||||||
* it's open, it surely must exist.
|
* it's open, it surely must exist.
|
||||||
*/
|
*/
|
||||||
if (isRedo && reln->md_fd[forknum] != NULL)
|
if (isRedo && reln->md_num_open_segs[forknum] > 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -64,8 +64,12 @@ typedef struct SMgrRelationData
|
|||||||
*/
|
*/
|
||||||
int smgr_which; /* storage manager selector */
|
int smgr_which; /* storage manager selector */
|
||||||
|
|
||||||
/* for md.c; NULL for forks that are not open */
|
/*
|
||||||
struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
|
* for md.c; per-fork arrays of the number of open segments
|
||||||
|
* (md_num_open_segs) and the segments themselves (md_seg_fds).
|
||||||
|
*/
|
||||||
|
int md_num_open_segs[MAX_FORKNUM + 1];
|
||||||
|
struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
|
||||||
|
|
||||||
/* if unowned, list link in list of all unowned SMgrRelations */
|
/* if unowned, list link in list of all unowned SMgrRelations */
|
||||||
struct SMgrRelationData *next_unowned_reln;
|
struct SMgrRelationData *next_unowned_reln;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user