
Postgres95 1.01 Distribution - Virgin Sources

Author: Marc G. Fournier
Date: 1996-07-09 06:22:35 +00:00
commit d31084e9d1
868 changed files with 242656 additions and 0 deletions

src/backend/storage/Makefile.inc

@@ -0,0 +1,31 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for the storage modules
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
#
#-------------------------------------------------------------------------
stordir= $(CURDIR)/storage
VPATH:= $(VPATH):$(stordir):$(stordir)/buffer:$(stordir)/file:$(stordir)/ipc:\
$(stordir)/large_object:$(stordir)/lmgr:$(stordir)/page:$(stordir)/smgr
SUBSRCS=
include $(stordir)/buffer/Makefile.inc
include $(stordir)/file/Makefile.inc
include $(stordir)/ipc/Makefile.inc
include $(stordir)/large_object/Makefile.inc
include $(stordir)/lmgr/Makefile.inc
include $(stordir)/page/Makefile.inc
include $(stordir)/smgr/Makefile.inc
SRCS_STORAGE:= $(SUBSRCS)
HEADERS+= backendid.h block.h buf.h buf_internals.h bufmgr.h bufpage.h \
fd.h ipc.h item.h itemid.h itempos.h \
itemptr.h large_object.h lmgr.h lock.h multilev.h off.h page.h \
pagenum.h pos.h proc.h shmem.h sinval.h sinvaladt.h smgr.h spin.h

src/backend/storage/backendid.h

@@ -0,0 +1,32 @@
/*-------------------------------------------------------------------------
*
* backendid.h--
* POSTGRES backend id communication definitions
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: backendid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BACKENDID_H
#define BACKENDID_H
/* ----------------
* pulled out of sinval.h to temporarily reduce #include nesting.
* -cim 8/17/90
* ----------------
*/
typedef int16 BackendId; /* unique currently active backend identifier */
#define InvalidBackendId (-1)
typedef int32 BackendTag; /* unique backend identifier */
#define InvalidBackendTag (-1)
extern BackendId MyBackendId; /* backend id of this backend */
extern BackendTag MyBackendTag; /* backend tag of this backend */
#endif /* BACKENDID_H */

src/backend/storage/block.h

@@ -0,0 +1,114 @@
/*-------------------------------------------------------------------------
*
* block.h--
* POSTGRES disk block definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: block.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BLOCK_H
#define BLOCK_H
#include "c.h"
/*
* BlockNumber:
*
* each data file (heap or index) is divided into postgres disk blocks
* (which may be thought of as the unit of i/o -- a postgres buffer
* contains exactly one disk block). the blocks are numbered
* sequentially, 0 to 0xFFFFFFFE.
*
* InvalidBlockNumber is the same thing as P_NEW in bufmgr.h.
*
* the access methods, the buffer manager and the storage manager are
* more or less the only pieces of code that should be accessing disk
* blocks directly.
*/
typedef uint32 BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
/*
* BlockId:
*
* this is a storage type for BlockNumber. in other words, this type
* is used for on-disk structures (e.g., in HeapTupleData) whereas
* BlockNumber is the type on which calculations are performed (e.g.,
* in access method code).
*
* there doesn't appear to be any reason to have separate types except
* for the fact that BlockIds can be SHORTALIGN'd (and therefore any
* structures that contain them, such as ItemPointerData, can also be
* SHORTALIGN'd). this is an important consideration for reducing the
* space requirements of the line pointer (ItemIdData) array on each
* page and the header of each heap or index tuple, so it doesn't seem
* wise to change this without good reason.
*/
typedef struct BlockIdData {
uint16 bi_hi;
uint16 bi_lo;
} BlockIdData;
typedef BlockIdData *BlockId; /* block identifier */
/* ----------------
* support macros
* ----------------
*/
/*
* BlockNumberIsValid --
* True iff blockNumber is valid.
*/
#define BlockNumberIsValid(blockNumber) \
((bool) ((int32) (blockNumber) != InvalidBlockNumber))
/*
* BlockIdIsValid --
* True iff the block identifier is valid.
*/
#define BlockIdIsValid(blockId) \
((bool) PointerIsValid(blockId))
/*
* BlockIdSet --
* Sets a block identifier to the specified value.
*/
#define BlockIdSet(blockId, blockNumber) \
{ \
	Assert(PointerIsValid(blockId)); \
	(blockId)->bi_hi = (blockNumber) >> 16; \
	(blockId)->bi_lo = (blockNumber) & 0xffff; \
}
/*
* BlockIdCopy --
* Copy a block identifier.
*/
#define BlockIdCopy(toBlockId, fromBlockId) \
{ \
	Assert(PointerIsValid(toBlockId)); \
	Assert(PointerIsValid(fromBlockId)); \
	(toBlockId)->bi_hi = (fromBlockId)->bi_hi; \
	(toBlockId)->bi_lo = (fromBlockId)->bi_lo; \
}
/*
* BlockIdEquals --
* Check for block number equality.
*/
#define BlockIdEquals(blockId1, blockId2) \
((blockId1)->bi_hi == (blockId2)->bi_hi && \
(blockId1)->bi_lo == (blockId2)->bi_lo)
/*
* BlockIdGetBlockNumber --
* Retrieve the block number from a block identifier.
*/
#define BlockIdGetBlockNumber(blockId) \
(AssertMacro(BlockIdIsValid(blockId)) ? \
(BlockNumber) (((blockId)->bi_hi << 16) | ((uint16) (blockId)->bi_lo)) : \
(BlockNumber) InvalidBlockNumber)
#endif /* BLOCK_H */
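
To make the round-trip concrete, here is a minimal standalone sketch; the typedefs stand in for the ones in c.h, and the two macros above are inlined:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t uint16;    /* stand-ins for the typedefs in c.h */
typedef uint32_t uint32;

typedef struct BlockIdData {
    uint16 bi_hi;
    uint16 bi_lo;
} BlockIdData;

int main(void)
{
    uint32 blockNum = 0x0001FFFEu;  /* any value in 0..0xFFFFFFFE */
    BlockIdData id;

    /* BlockIdSet: split into the two SHORTALIGN'able halves */
    id.bi_hi = blockNum >> 16;
    id.bi_lo = blockNum & 0xffff;

    /* BlockIdGetBlockNumber: reassemble */
    uint32 out = ((uint32) id.bi_hi << 16) | id.bi_lo;

    assert(out == blockNum);
    printf("bi_hi=%u bi_lo=%u -> %u\n",
           (unsigned) id.bi_hi, (unsigned) id.bi_lo, (unsigned) out);
    return 0;
}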

src/backend/storage/buf.h

@@ -0,0 +1,47 @@
/*-------------------------------------------------------------------------
*
* buf.h--
* Basic buffer manager data types.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: buf.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUF_H
#define BUF_H
#define InvalidBuffer (0)
#define UnknownBuffer (-99999)
typedef long Buffer;
/*
* BufferIsInvalid --
* True iff the buffer is invalid.
*/
#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
/*
* BufferIsUnknown --
* True iff the buffer is unknown.
*/
#define BufferIsUnknown(buffer) ((buffer) == UnknownBuffer)
/*
* BufferIsLocal --
* True iff the buffer is local (not visible to other servers).
*/
#define BufferIsLocal(buffer) ((buffer) < 0)
/*
* If NO_BUFFERISVALID is defined, all error checking using BufferIsValid()
* is suppressed. Decision-making using BufferIsValid is not affected.
* This should be set only if one is sure there will be no errors.
* - plai 9/10/90
*/
#undef NO_BUFFERISVALID
#endif /* BUF_H */
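
The sign conventions can be exercised in isolation. A minimal standalone sketch with the definitions above copied out (the sample values are arbitrary):

#include <stdio.h>

typedef long Buffer;

#define InvalidBuffer   (0)
#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
#define BufferIsLocal(buffer)   ((buffer) < 0)

int main(void)
{
    Buffer shared = 7;   /* shared buffers are numbered 1..NBuffers */
    Buffer local = -1;   /* local buffers are numbered -1, -2, ... */

    printf("shared: invalid=%d local=%d\n",
           BufferIsInvalid(shared), BufferIsLocal(shared));
    printf("local:  invalid=%d local=%d\n",
           BufferIsInvalid(local), BufferIsLocal(local));
    return 0;
}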

src/backend/storage/buf_internals.h

@@ -0,0 +1,220 @@
/*-------------------------------------------------------------------------
*
* buf_internals.h--
* Internal definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: buf_internals.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
* NOTE
* If BUFFERPAGE0 is defined, then 0 will be used as a
* valid buffer page number.
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
#include "postgres.h"
#include "storage/buf.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "utils/rel.h"
#include "utils/relcache.h"
/* Buf Mgr constants */
/* in bufmgr.c */
extern int NBuffers;
extern int Data_Descriptors;
extern int Free_List_Descriptor;
extern int Lookup_List_Descriptor;
extern int Num_Descriptors;
/*
* Flags for buffer descriptors
*/
#define BM_DIRTY (1 << 0)
#define BM_PRIVATE (1 << 1)
#define BM_VALID (1 << 2)
#define BM_DELETED (1 << 3)
#define BM_FREE (1 << 4)
#define BM_IO_IN_PROGRESS (1 << 5)
#define BM_IO_ERROR (1 << 6)
typedef bits16 BufFlags;
typedef struct sbufdesc BufferDesc;
typedef struct sbufdesc BufferHdr;
typedef struct buftag BufferTag;
/* long * so alignment will be correct */
typedef long **BufferBlock;
struct buftag{
LRelId relId;
BlockNumber blockNum; /* blknum relative to begin of reln */
};
#define CLEAR_BUFFERTAG(a) \
{ \
	(a)->relId.dbId = InvalidOid; \
	(a)->relId.relId = InvalidOid; \
	(a)->blockNum = InvalidBlockNumber; \
}
#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
{ \
(a)->blockNum = xx_blockNum;\
(a)->relId = RelationGetLRelId(xx_reln); \
}
#define COPY_BUFFERTAG(a,b)\
{ \
(a)->blockNum = (b)->blockNum;\
LRelIdAssign(*(a),*(b));\
}
#define EQUAL_BUFFERTAG(a,b) \
(((a)->blockNum == (b)->blockNum) &&\
(OID_Equal((a)->relId.relId,(b)->relId.relId)))
#define BAD_BUFFER_ID(bid) (((bid) < 1) || ((bid) > NBuffers))
#define INVALID_DESCRIPTOR (-3)
/*
* bletch hack -- anyplace that we declare space for relation or
* database names, we just use '16', not a symbolic constant, to
* specify their lengths. BM_NAMESIZE is the length of these names,
* and is used in the buffer manager code. somebody with lots of
* spare time should do this for all the other modules, too.
*/
#define BM_NAMESIZE 16
/*
* struct sbufdesc -- shared buffer cache metadata for a single
* shared buffer descriptor.
*
* We keep the name of the database and relation in which this
* buffer appears in order to avoid a catalog lookup on cache
* flush if we don't have the reldesc in the cache. It is also
* possible that the relation to which this buffer belongs is
* not visible to all backends at the time that it gets flushed.
* Dbname, relname, dbid, and relid are enough to determine where
* to put the buffer, for all storage managers.
*/
struct sbufdesc {
Buffer freeNext; /* link for freelist chain */
Buffer freePrev;
SHMEM_OFFSET data; /* pointer to data in buf pool */
/* tag and id must be together for table lookup to work */
BufferTag tag; /* file/block identifier */
int buf_id; /* maps global desc to local desc */
BufFlags flags; /* described below */
int16 bufsmgr; /* storage manager id for buffer */
unsigned refcount; /* # of times buffer is pinned */
char *sb_dbname; /* name of db in which buf belongs */
char *sb_relname; /* name of reln */
#ifdef HAS_TEST_AND_SET
/* can afford a dedicated lock if test-and-set locks are available */
slock_t io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */
/*
* I padded this structure to a power of 2 (128 bytes on a MIPS) because
* BufferDescriptorGetBuffer is called a billion times and it does a
* C pointer subtraction (i.e., "x - y" -> array index of x relative
* to y, which is calculated using division by struct size). Integer
* ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...
* this hack cut 10% off of the time to create the Wisconsin database!
* It eats up more shared memory, of course, but we're (allegedly)
* going to make some of these types bigger soon anyway... -pma 1/2/93
*/
#if defined(PORTNAME_ultrix4)
char sb_pad[60]; /* no slock_t */
#endif /* mips */
#if defined(PORTNAME_sparc) || defined(PORTNAME_sparc_solaris) || defined(PORTNAME_irix5)
char sb_pad[56]; /* has slock_t */
#endif /* sparc || irix5 */
#if defined(PORTNAME_hpux)
char sb_pad[44]; /* has slock_t */
#endif /* hpux */
#if defined(PORTNAME_alpha)
char sb_pad[40]; /* has slock_t */
#endif /* alpha */
};
/*
* mao tracing buffer allocation
*/
/*#define BMTRACE*/
#ifdef BMTRACE
typedef struct _bmtrace {
int bmt_pid;
long bmt_buf;
long bmt_dbid;
long bmt_relid;
int bmt_blkno;
int bmt_op;
#define BMT_NOTUSED 0
#define BMT_ALLOCFND 1
#define BMT_ALLOCNOTFND 2
#define BMT_DEALLOC 3
} bmtrace;
#endif /* BMTRACE */
/*
* Bufmgr Interface:
*/
/* Internal routines: only called by buf.c */
/*freelist.c*/
extern void AddBufferToFreelist(BufferDesc *bf);
extern void PinBuffer(BufferDesc *buf);
extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern void UnpinBuffer(BufferDesc *buf);
extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern BufferDesc *GetFreeBuffer(void);
extern void InitFreeList(bool init);
extern void DBG_FreeListCheck(int nfree);
/* buf_table.c */
extern void InitBufTable(void);
extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
extern bool BufTableDelete(BufferDesc *buf);
extern bool BufTableInsert(BufferDesc *buf);
extern void DBG_LookupListCheck(int nlookup);
/* bufmgr.c */
extern BufferDesc *BufferDescriptors;
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern long *LastRefCount;
extern SPINLOCK BufMgrLock;
/* localbuf.c */
extern long *LocalRefCount;
extern BufferDesc *LocalBufferDescriptors;
extern int NLocBuffer;
extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
extern int WriteLocalBuffer(Buffer buffer, bool release);
extern int FlushLocalBuffer(Buffer buffer);
extern void InitLocalBuffer();
extern void LocalBufferSync();
extern void ResetLocalBufferPool();
#endif /* BUFMGR_INTERNALS_H */
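
The power-of-two padding rationale in sbufdesc can be seen directly: subtracting two pointers into an array divides by the element size, and the compiler can turn that division into a shift only when the size is a power of two. A standalone sketch with a hypothetical 128-byte descriptor:

#include <stdio.h>

/* hypothetical stand-in, padded so sizeof(Desc) is exactly 128 */
typedef struct Desc {
    long freeNext;
    long freePrev;
    int  buf_id;
    char pad[128 - 2 * sizeof(long) - sizeof(int)];
} Desc;

static Desc descriptors[16];

/* analogous to BufferDescriptorGetBuffer: recover the array index */
static long DescGetIndex(Desc *d)
{
    return (long) (d - descriptors);  /* a shift, not a div, at 2^k size */
}

int main(void)
{
    printf("sizeof(Desc) = %zu\n", sizeof(Desc));
    printf("index of &descriptors[5] = %ld\n", DescGetIndex(&descriptors[5]));
    return 0;
}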

src/backend/storage/buffer/Makefile.inc

@@ -0,0 +1,16 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/buffer
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c
SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c

src/backend/storage/buffer/buf_init.c

@@ -0,0 +1,280 @@
/*-------------------------------------------------------------------------
*
* buf_init.c--
* buffer manager initialization routines
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/file.h>
#include <stdio.h>
#include <math.h>
#include <signal.h>
/* declarations split between these three files */
#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/smgr.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "utils/memutils.h"
#include "executor/execdebug.h" /* for NDirectFileRead */
#include "catalog/catalog.h"
/*
* if BMTRACE is defined, we trace the last 200 buffer allocations and
* deallocations in a circular buffer in shared memory.
*/
#ifdef BMTRACE
bmtrace *TraceBuf;
long *CurTraceBuf;
#define BMT_LIMIT 200
#endif /* BMTRACE */
int ShowPinTrace = 0;
int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */
int Data_Descriptors;
int Free_List_Descriptor;
int Lookup_List_Descriptor;
int Num_Descriptors;
BufferDesc *BufferDescriptors;
BufferBlock BufferBlocks;
#ifndef HAS_TEST_AND_SET
long *NWaitIOBackendP;
#endif
extern IpcSemaphoreId WaitIOSemId;
long *PrivateRefCount; /* also used in freelist.c */
long *LastRefCount; /* refcounts of last ExecMain level */
/*
* Data Structures:
* buffers live in a freelist and a lookup data structure.
*
*
* Buffer Lookup:
* Two important notes. First, the buffer has to be
* available for lookup BEFORE an IO begins. Otherwise
* a second process trying to read the buffer will
* allocate its own copy and the buffer pool will
* become inconsistent.
*
* Buffer Replacement:
* see freelist.c. A buffer cannot be replaced while in
* use either by data manager or during IO.
*
* WriteBufferBack:
* currently, a buffer is only written back at the time
* it is selected for replacement. It should
* be done sooner if possible to reduce latency of
* BufferAlloc(). Maybe there should be a daemon process.
*
* Synchronization/Locking:
*
* BufMgrLock lock -- must be acquired before manipulating the
* buffer queues (lookup/freelist). Must be released
* before exit and before doing any IO.
*
* IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
* It must be set when an IO is initiated and cleared at
* the end of the IO. It is there to make sure that one
* process doesn't start to use a buffer while another is
* faulting it in. see IOWait/IOSignal.
*
* refcount -- A buffer is pinned during IO and immediately
* after a BufferAlloc(). A buffer is always either pinned
* or on the freelist but never both. The buffer must be
* released, written, or flushed before the end of
* transaction.
*
* PrivateRefCount -- Each buffer also has a private refcount that keeps
* track of the number of times the buffer is pinned in the current
* process. This is used for two purposes: first, if we pin a buffer
* more than once, we only need to change the shared refcount once,
* and thus lock the buffer pool only once; second, when a transaction
* aborts, it should unpin the buffers exactly the number of times it
* has pinned them, so that it will not blow away buffers of another
* backend. (A backend that pinned the same buffer three times, for
* example, has bumped the shared refcount once and its private count
* to three.)
*
*/
SPINLOCK BufMgrLock;
/* delayed write: TRUE on, FALSE off */
int LateWrite = TRUE;
int ReadBufferCount;
int BufferHitCount;
int BufferFlushCount;
/*
* Initialize module:
*
* should calculate size of pool dynamically based on the
* amount of available memory.
*/
void
InitBufferPool(IPCKey key)
{
bool foundBufs,foundDescs;
int i;
Data_Descriptors = NBuffers;
Free_List_Descriptor = Data_Descriptors;
Lookup_List_Descriptor = Data_Descriptors + 1;
Num_Descriptors = Data_Descriptors + 1;
SpinAcquire(BufMgrLock);
#ifdef BMTRACE
CurTraceBuf = (long *) ShmemInitStruct("Buffer trace",
(BMT_LIMIT * sizeof(bmtrace)) + sizeof(long),
&foundDescs);
if (!foundDescs)
memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long));
TraceBuf = (bmtrace *) &(CurTraceBuf[1]);
#endif
BufferDescriptors = (BufferDesc *)
ShmemInitStruct("Buffer Descriptors",
Num_Descriptors*sizeof(BufferDesc),&foundDescs);
BufferBlocks = (BufferBlock)
ShmemInitStruct("Buffer Blocks",
NBuffers*BLCKSZ,&foundBufs);
#ifndef HAS_TEST_AND_SET
{
bool foundNWaitIO;
NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO",
sizeof(long),
&foundNWaitIO);
if (!foundNWaitIO)
*NWaitIOBackendP = 0;
}
#endif
if (foundDescs || foundBufs) {
/* both should be present or neither */
Assert(foundDescs && foundBufs);
} else {
BufferDesc *buf;
unsigned long block;
buf = BufferDescriptors;
block = (unsigned long) BufferBlocks;
/*
* link the buffers into a circular, doubly-linked list to
* initialize free list. Still don't know anything about
* replacement strategy in this file.
*/
for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) {
Assert(ShmemIsValid((unsigned long)block));
buf->freeNext = i+1;
buf->freePrev = i-1;
CLEAR_BUFFERTAG(&(buf->tag));
buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
buf->refcount = 0;
buf->buf_id = i;
#ifdef HAS_TEST_AND_SET
S_INIT_LOCK(&(buf->io_in_progress_lock));
#endif
}
/* close the circular queue */
BufferDescriptors[0].freePrev = Data_Descriptors-1;
BufferDescriptors[Data_Descriptors-1].freeNext = 0;
}
/* Init the rest of the module */
InitBufTable();
InitFreeList(!foundDescs);
SpinRelease(BufMgrLock);
#ifndef HAS_TEST_AND_SET
{
int status;
WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key),
1, IPCProtection, 0, 1, &status);
}
#endif
PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
LastRefCount = (long *) calloc(NBuffers, sizeof(long));
}
/* -----------------------------------------------------
* BufferShmemSize
*
* compute the size of shared memory for the buffer pool including
* data pages, buffer descriptors, hash tables, etc.
* ----------------------------------------------------
*/
int
BufferShmemSize()
{
int size = 0;
int nbuckets;
int nsegs;
int tmp;
nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1);
nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);
/* size of shmem binding table */
size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */
size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
size += BUCKET_ALLOC_INCR *
(MAXALIGN(sizeof(BUCKET_INDEX)) +
MAXALIGN(BTABLE_KEYSIZE) +
MAXALIGN(BTABLE_DATASIZE));
/* size of buffer descriptors */
size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc));
/* size of data pages */
size += NBuffers * MAXALIGN(BLCKSZ);
/* size of buffer hash table */
size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */
size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR);
size += tmp * BUCKET_ALLOC_INCR *
(MAXALIGN(sizeof(BUCKET_INDEX)) +
MAXALIGN(sizeof(BufferTag)) +
MAXALIGN(sizeof(Buffer)));
#ifdef BMTRACE
size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long);
#endif
return size;
}
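
InitBufferPool leans on the found flag from ShmemInitStruct so that whichever process attaches first initializes each structure exactly once. A process-local model of that protocol (get_or_create is a hypothetical stand-in for the shared-memory allocator, nothing more):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* toy stand-in: return the named region, reporting whether it existed */
static void *
get_or_create(const char *name, size_t size, bool *foundPtr)
{
    static struct { const char *name; void *ptr; } registry[8];
    for (int i = 0; i < 8; i++) {
        if (registry[i].name && strcmp(registry[i].name, name) == 0) {
            *foundPtr = true;
            return registry[i].ptr;
        }
        if (registry[i].name == NULL) {
            registry[i].name = name;
            registry[i].ptr = calloc(1, size);
            *foundPtr = false;
            return registry[i].ptr;
        }
    }
    return NULL;
}

int main(void)
{
    bool found;
    long *val = get_or_create("Buffer Descriptors", sizeof(long), &found);
    if (!found)
        *val = 42;    /* the first comer initializes, as InitBufferPool does */
    val = get_or_create("Buffer Descriptors", sizeof(long), &found);
    printf("second lookup: found=%d value=%ld\n", found, *val);
    return 0;
}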

src/backend/storage/buffer/buf_table.c

@@ -0,0 +1,162 @@
/*-------------------------------------------------------------------------
*
* buf_table.c--
* routines for finding buffers in the buffer pool.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* OLD COMMENTS
*
* Data Structures:
*
* Buffers are identified by their BufferTag (buf_internals.h). This
* file contains routines for allocating a shmem hash table to
* map buffer tags to buffer descriptors.
*
* Synchronization:
*
* All routines in this file assume buffer manager spinlock is
* held by their caller.
*/
#include "storage/bufmgr.h"
#include "storage/buf_internals.h" /* where the declarations go */
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
static HTAB *SharedBufHash;
extern HTAB *ShmemInitHash();
typedef struct lookup {
BufferTag key;
Buffer id;
} LookupEnt;
/*
* Initialize shmem hash table for mapping buffers
*/
void
InitBufTable()
{
HASHCTL info;
int hash_flags;
/* assume lock is held */
/* BufferTag maps to Buffer */
info.keysize = sizeof(BufferTag);
info.datasize = sizeof(Buffer);
info.hash = tag_hash;
hash_flags = (HASH_ELEM | HASH_FUNCTION);
SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table",
NBuffers,NBuffers,
&info,hash_flags);
if (! SharedBufHash) {
elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl");
exit(1);
}
}
BufferDesc *
BufTableLookup(BufferTag *tagPtr)
{
LookupEnt * result;
bool found;
if (tagPtr->blockNum == P_NEW)
return(NULL);
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found);
if (! result){
elog(WARN,"BufTableLookup: BufferLookup table corrupted");
return(NULL);
}
if (! found) {
return(NULL);
}
return(&(BufferDescriptors[result->id]));
}
/*
* BufTableDelete
*/
bool
BufTableDelete(BufferDesc *buf)
{
LookupEnt * result;
bool found;
/* buffer not initialized or has been removed from
* table already. BM_DELETED keeps us from removing
* buffer twice.
*/
if (buf->flags & BM_DELETED) {
return(TRUE);
}
buf->flags |= BM_DELETED;
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found);
if (! (result && found)) {
elog(WARN,"BufTableDelete: BufferLookup table corrupted");
return(FALSE);
}
return(TRUE);
}
bool
BufTableInsert(BufferDesc *buf)
{
LookupEnt * result;
bool found;
/* cannot insert it twice */
Assert (buf->flags & BM_DELETED);
buf->flags &= ~(BM_DELETED);
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found);
if (! result) {
Assert(0);
elog(WARN,"BufTableInsert: BufferLookup table corrupted");
return(FALSE);
}
/* found something else in the table ! */
if (found) {
Assert(0);
elog(WARN,"BufTableInsert: BufferLookup table corrupted");
return(FALSE);
}
result->id = buf->buf_id;
return(TRUE);
}
/* prints out collision stats for the buf table */
void
DBG_LookupListCheck(int nlookup)
{
nlookup = 10;
hash_stats("Shared",SharedBufHash);
}
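
The find-or-insert-with-found-flag convention of hash_search has a rough standard-library analogue in POSIX <search.h>; a runnable illustration of the same lookup pattern (POSIX hsearch has no counterpart to HASH_REMOVE, and the key string here is made up):

#include <search.h>
#include <stdio.h>

int main(void)
{
    ENTRY item, *e;

    hcreate(32);                    /* like ShmemInitHash, minus the shmem */

    item.key = "rel=123,blk=4";     /* stands in for a BufferTag */
    item.data = (void *) 7L;        /* stands in for a buffer id */

    e = hsearch(item, FIND);        /* like HASH_FIND */
    printf("before insert: %s\n", e ? "found" : "not found");

    hsearch(item, ENTER);           /* like HASH_ENTER */
    e = hsearch(item, FIND);
    printf("after insert: buffer id = %ld\n", (long) e->data);

    hdestroy();
    return 0;
}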

(File diff suppressed because it is too large.)

src/backend/storage/buffer/freelist.c

@@ -0,0 +1,285 @@
/*-------------------------------------------------------------------------
*
* freelist.c--
* routines for manipulating the buffer pool's replacement strategy
* freelist.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* OLD COMMENTS
*
* Data Structures:
* SharedFreeList is a circular queue. Notice that this
* is a shared memory queue so the next/prev "ptrs" are
* buffer ids, not addresses.
*
* Sync: all routines in this file assume that the buffer
* semaphore has been acquired by the caller.
*/
#include <stdio.h>
#include "storage/bufmgr.h"
#include "storage/buf_internals.h" /* where declarations go */
#include "storage/spin.h"
#include "utils/elog.h"
static BufferDesc *SharedFreeList;
/* only actually used in debugging. The lock
* should be acquired before calling the freelist manager.
*/
extern SPINLOCK BufMgrLock;
#define IsInQueue(bf) \
Assert((bf->freeNext != INVALID_DESCRIPTOR));\
Assert((bf->freePrev != INVALID_DESCRIPTOR));\
Assert((bf->flags & BM_FREE))
#define NotInQueue(bf) \
Assert((bf->freeNext == INVALID_DESCRIPTOR));\
Assert((bf->freePrev == INVALID_DESCRIPTOR));\
Assert(! (bf->flags & BM_FREE))
/*
* AddBufferToFreelist --
*
* In theory, this is the only routine that needs to be changed
* if the buffer replacement strategy changes. Just change
* the manner in which buffers are added to the freelist queue.
* Currently, they are added on an LRU basis.
*/
void
AddBufferToFreelist(BufferDesc *bf)
{
#ifdef BMTRACE
_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
#endif /* BMTRACE */
NotInQueue(bf);
/* link bf in between the current tail of the free list and its head */
bf->freePrev = SharedFreeList->freePrev;
bf->freeNext = Free_List_Descriptor;
/* insert new into chain */
BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
}
#undef PinBuffer
/*
* PinBuffer -- make buffer unavailable for replacement.
*/
void
PinBuffer(BufferDesc *buf)
{
long b;
/* Assert (buf->refcount < 25); */
if (buf->refcount == 0) {
IsInQueue(buf);
/* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
/* mark buffer as no longer free */
buf->flags &= ~BM_FREE;
} else {
NotInQueue(buf);
}
b = BufferDescriptorGetBuffer(buf) - 1;
Assert(PrivateRefCount[b] >= 0);
if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
buf->refcount++;
PrivateRefCount[b]++;
}
void
PinBuffer_Debug(char *file, int line, BufferDesc *buf)
{
PinBuffer(buf);
if (ShowPinTrace) {
Buffer buffer = BufferDescriptorGetBuffer(buf);
fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
#undef UnpinBuffer
/*
* UnpinBuffer -- make buffer available for replacement.
*/
void
UnpinBuffer(BufferDesc *buf)
{
long b = BufferDescriptorGetBuffer(buf) - 1;
Assert(buf->refcount);
Assert(PrivateRefCount[b] > 0);
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
buf->refcount--;
NotInQueue(buf);
if (buf->refcount == 0) {
AddBufferToFreelist(buf);
buf->flags |= BM_FREE;
} else {
/* do nothing */
}
}
void
UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
{
UnpinBuffer(buf);
if (ShowPinTrace) {
Buffer buffer = BufferDescriptorGetBuffer(buf);
fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
/*
* GetFreeBuffer() -- get the 'next' buffer from the freelist.
*
*/
BufferDesc *
GetFreeBuffer()
{
BufferDesc *buf;
if (Free_List_Descriptor == SharedFreeList->freeNext) {
/* queue is empty. All buffers in the buffer pool are pinned. */
elog(WARN,"out of free buffers: time to abort !\n");
return(NULL);
}
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
/* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
buf->flags &= ~(BM_FREE);
return(buf);
}
/*
* InitFreeList -- initialize the dummy buffer descriptor used
* as a freelist head.
*
* Assume: All of the buffers are already linked in a circular
* queue. Only called by postmaster and only during
* initialization.
*/
void
InitFreeList(bool init)
{
SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
if (init) {
/* we only do this once, normally the postmaster */
SharedFreeList->data = INVALID_OFFSET;
SharedFreeList->flags = 0;
SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
SharedFreeList->buf_id = Free_List_Descriptor;
/* insert it into a random spot in the circular queue */
SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
SharedFreeList->freePrev = 0;
BufferDescriptors[SharedFreeList->freeNext].freePrev =
BufferDescriptors[SharedFreeList->freePrev].freeNext =
Free_List_Descriptor;
}
}
/*
* print out the free list and check for breaks.
*/
void
DBG_FreeListCheck(int nfree)
{
int i;
BufferDesc *buf;
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) {
if (! (buf->flags & (BM_FREE))){
if (buf != SharedFreeList) {
printf("\tfree list corrupted: %d flags %x\n",
buf->buf_id,buf->flags);
} else {
printf("\tfree list corrupted: too short -- %d not %d\n",
i,nfree);
}
}
if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) ||
(BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) {
printf("\tfree list links corrupted: %d %ld %ld\n",
buf->buf_id,buf->freePrev,buf->freeNext);
}
}
if (buf != SharedFreeList) {
printf("\tfree list corrupted: %d-th buffer is %d\n",
nfree,buf->buf_id);
}
}
/*
* PrintBufferFreeList -
* prints the buffer free list, for debugging
*/
void
PrintBufferFreeList()
{
BufferDesc *buf;
if (SharedFreeList->freeNext == Free_List_Descriptor) {
printf("free list is empty.\n");
return;
}
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (;;) {
int i = (buf - BufferDescriptors);
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
i, buf->sb_relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i],
buf->freeNext, buf->freePrev);
if (buf->freeNext == Free_List_Descriptor)
break;
buf = &(BufferDescriptors[buf->freeNext]);
}
}
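
A process-local model of the two-level refcounting that PinBuffer and UnpinBuffer implement: the shared count moves only on a backend's first pin and last unpin, while the private count tracks every pin. (A simplified sketch for one buffer that omits LastRefCount, the freelist, and all locking.)

#include <assert.h>
#include <stdio.h>

static unsigned sharedRefcount; /* buf->refcount: one per pinning backend */
static long privateRefCount;    /* this backend's PrivateRefCount[b] */

static void pin(void)
{
    if (privateRefCount == 0)
        sharedRefcount++;       /* first pin by this backend */
    privateRefCount++;
}

static void unpin(void)
{
    assert(privateRefCount > 0);
    privateRefCount--;
    if (privateRefCount == 0)
        sharedRefcount--;       /* last local pin released */
}

int main(void)
{
    pin(); pin(); pin();        /* three local pins, one shared bump */
    printf("shared=%u private=%ld\n", sharedRefcount, privateRefCount);

    unpin(); unpin(); unpin();  /* abort-time cleanup unpins exactly as often */
    printf("shared=%u private=%ld\n", sharedRefcount, privateRefCount);
    return 0;
}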

src/backend/storage/buffer/localbuf.c

@@ -0,0 +1,284 @@
/*-------------------------------------------------------------------------
*
* localbuf.c--
* local buffer manager. Fast buffer manager for temporary tables
* or special cases when the operation is not visible to other backends.
*
* When a relation is being created, its descriptor will have rd_islocal
* set to indicate that the local buffer manager should be used. During
* the same transaction in which the relation is created, any inserts
* into or selects from the newly created relation will use the local
* buffer pool. rd_islocal is reset at the end of the transaction
* (commit/abort). This is useful for queries like SELECT INTO TABLE
* and CREATE INDEX.
*
* Copyright (c) 1994-5, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/file.h>
#include <stdio.h>
#include <math.h>
#include <signal.h>
/* declarations split between these three files */
#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/smgr.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "utils/memutils.h"
#include "executor/execdebug.h" /* for NDirectFileRead */
#include "catalog/catalog.h"
int NLocBuffer = 64;
BufferDesc *LocalBufferDescriptors = NULL;
long *LocalRefCount = NULL;
static int nextFreeLocalBuf = 0;
/*#define LBDEBUG*/
/*
* LocalBufferAlloc -
* allocate a local buffer. We do round robin allocation for now.
*/
BufferDesc *
LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
{
int i;
BufferDesc *bufHdr = (BufferDesc *) NULL;
if (blockNum == P_NEW) {
blockNum = reln->rd_nblocks;
reln->rd_nblocks++;
}
/* a low tech search for now -- not optimized for scans */
for (i=0; i < NLocBuffer; i++) {
if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id &&
LocalBufferDescriptors[i].tag.blockNum == blockNum) {
#ifdef LBDEBUG
fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
reln->rd_id, blockNum, -i-1);
#endif
LocalRefCount[i]++;
*foundPtr = TRUE;
return &LocalBufferDescriptors[i];
}
}
#ifdef LBDEBUG
fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
reln->rd_id, blockNum, -nextFreeLocalBuf-1);
#endif
/* need to get a new buffer (round robin for now) */
for(i=0; i < NLocBuffer; i++) {
int b = (nextFreeLocalBuf + i) % NLocBuffer;
if (LocalRefCount[b]==0) {
bufHdr = &LocalBufferDescriptors[b];
LocalRefCount[b]++;
nextFreeLocalBuf = (b + 1) % NLocBuffer;
break;
}
}
if (bufHdr==NULL)
elog(WARN, "no empty local buffer.");
/*
* this buffer is not referenced but it might still be dirty (the
* last transaction to touch it doesn't need its contents but has
* not flushed it). if that's the case, write it out before
* reusing it!
*/
if (bufHdr->flags & BM_DIRTY) {
Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
Assert(bufrel != NULL);
/* flush this page */
smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
/*
* it's all ours now.
*/
bufHdr->tag.relId.relId = reln->rd_id;
bufHdr->tag.blockNum = blockNum;
bufHdr->flags &= ~BM_DIRTY;
/*
* lazy memory allocation. (see MAKE_PTR for why we need to do
* MAKE_OFFSET.)
*/
if (bufHdr->data == (SHMEM_OFFSET)0) {
char *data = (char *)malloc(BLCKSZ);
bufHdr->data = MAKE_OFFSET(data);
}
*foundPtr = FALSE;
return bufHdr;
}
/*
* WriteLocalBuffer -
* writes out a local buffer
*/
int
WriteLocalBuffer(Buffer buffer, bool release)
{
int bufid;
Assert(BufferIsLocal(buffer));
#ifdef LBDEBUG
fprintf(stderr, "LB WRITE %d\n", buffer);
#endif
bufid = - (buffer + 1);
LocalBufferDescriptors[bufid].flags |= BM_DIRTY;
if (release) {
Assert(LocalRefCount[bufid] > 0);
LocalRefCount[bufid]--;
}
return true;
}
/*
* FlushLocalBuffer -
* flushes a local buffer
*/
int
FlushLocalBuffer(Buffer buffer)
{
int bufid;
Relation bufrel;
BufferDesc *bufHdr;
Assert(BufferIsLocal(buffer));
#ifdef LBDEBUG
fprintf(stderr, "LB FLUSH %d\n", buffer);
#endif
bufid = - (buffer + 1);
bufHdr = &LocalBufferDescriptors[bufid];
bufHdr->flags &= ~BM_DIRTY;
bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
Assert(bufrel != NULL);
smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
Assert(LocalRefCount[bufid] > 0);
LocalRefCount[bufid]--;
return true;
}
/*
* InitLocalBuffer -
* init the local buffer cache. Since most queries (esp. multi-user ones)
* don't involve local buffers, we delay allocating memory for the
* actual buffers until we need them.
*/
void
InitLocalBuffer()
{
int i;
/*
* these aren't going away. I'm not gonna use palloc.
*/
LocalBufferDescriptors =
(BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer);
memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
nextFreeLocalBuf = 0;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
/*
* negative to indicate local buffer. This is tricky: shared buffers
* start with 0. We have to start with -2. (Note that the routine
* BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id
* is -1.)
*/
buf->buf_id = - i - 2;
}
LocalRefCount =
(long *)malloc(sizeof(long) * NLocBuffer);
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
/*
* LocalBufferSync -
* flush all dirty buffers in the local buffer cache. Since the buffer
* cache is only used for keeping relations visible during a transaction,
* we will not need these buffers again.
*/
void
LocalBufferSync()
{
int i;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
Relation bufrel;
if (buf->flags & BM_DIRTY) {
#ifdef LBDEBUG
fprintf(stderr, "LB SYNC %d\n", -i-1);
#endif
bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId);
Assert(bufrel != NULL);
smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum,
(char *) MAKE_PTR(buf->data));
buf->tag.relId.relId = InvalidOid;
buf->flags &= ~BM_DIRTY;
}
}
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
void
ResetLocalBufferPool()
{
int i;
memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
nextFreeLocalBuf = 0;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
/* just like InitLocalBuffer() */
buf->buf_id = - i - 2;
}
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
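
The negative-id arithmetic above round-trips as follows: InitLocalBuffer stores buf_id = -i - 2, BufferDescriptorGetBuffer hands out buf_id + 1 = -i - 1, and WriteLocalBuffer decodes the slot with -(buffer + 1). A standalone check:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int i = 0; i < 4; i++) {
        int  buf_id = -i - 2;        /* as set in InitLocalBuffer() */
        long buffer = buf_id + 1;    /* BufferDescriptorGetBuffer() */
        long bufid  = -(buffer + 1); /* decoded as in WriteLocalBuffer() */

        assert(buffer < 0);          /* so BufferIsLocal() holds */
        assert(bufid == i);          /* and we get the array slot back */
        printf("slot %d: buf_id=%d buffer=%ld\n", i, buf_id, buffer);
    }
    return 0;
}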

src/backend/storage/bufmgr.h

@@ -0,0 +1,112 @@
/*-------------------------------------------------------------------------
*
* bufmgr.h--
* POSTGRES buffer manager definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: bufmgr.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_H
#define BUFMGR_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "utils/rel.h"
#include "storage/buf_internals.h" /* UGLY! -- ay */
/*
* the maximum size of a disk block for any possible installation.
*
* in theory this could be anything, but in practice this is actually
* limited to 2^13 bytes because we have limited ItemIdData.lp_off and
* ItemIdData.lp_len to 13 bits (see itemid.h).
*/
#define MAXBLCKSZ 8192
typedef void *Block;
/* special pageno for bget */
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
typedef bits16 BufferLock;
/**********************************************************************
the rest is function defns in the bufmgr that are externally callable
**********************************************************************/
/*
* These routines are beaten on quite heavily, hence the macroization.
* See buf_internals.h for a related comment.
*/
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
/*
* BufferIsPinned --
* True iff the buffer is pinned (and therefore valid)
*
* Note:
* Semantics are identical to BufferIsValid
* XXX - need to remove either one eventually.
*/
#define BufferIsPinned BufferIsValid
extern int ShowPinTrace;
/*
* prototypes for functions in bufmgr.c
*/
extern Buffer RelationGetBufferWithBuffer(Relation relation,
BlockNumber blockNumber, Buffer buffer);
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
extern Buffer ReadBuffer_Debug(char *file, int line, Relation reln,
BlockNumber blockNum);
extern int WriteBuffer(Buffer buffer);
extern void WriteBuffer_Debug(char *file, int line, Buffer buffer);
extern void DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno,
char *dest);
extern int WriteNoReleaseBuffer(Buffer buffer);
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
BlockNumber blockNum);
extern void InitBufferPool(IPCKey key);
extern void PrintBufferUsage(FILE *statfp);
extern void ResetBufferUsage(void);
extern void ResetBufferPool(void);
extern int BufferPoolCheckLeak(void);
extern void FlushBufferPool(int StableMainMemoryFlag);
extern bool BufferIsValid(Buffer bufnum);
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
extern Relation BufferGetRelation(Buffer buffer);
extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
extern Block BufferGetBlock(Buffer buffer);
extern void ReleaseTmpRelBuffers(Relation tempreldesc);
extern void DropBuffers(Oid dbid);
extern void PrintBufferDescs(void);
extern void PrintPinnedBufs(void);
extern int BufferShmemSize(void);
extern void BufferPoolBlowaway(void);
extern void IncrBufferRefCount(Buffer buffer);
extern int ReleaseBuffer(Buffer buffer);
extern void IncrBufferRefCount_Debug(char *file, int line, Buffer buffer);
extern void ReleaseBuffer_Debug(char *file, int line, Buffer buffer);
extern int ReleaseAndReadBuffer_Debug(char *file,
int line,
Buffer buffer,
Relation relation,
BlockNumber blockNum);
extern void BufferRefCountReset(int *refcountsave);
extern void BufferRefCountRestore(int *refcountsave);
#endif /* BUFMGR_H */
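
Read together, the prototypes imply the usual call pattern. A sketch only, assuming a backend context with a relation and block number in hand; touch_block is a made-up name, and WriteBuffer is taken to mark the page dirty and release the pin, per the delayed-write notes in buf_init.c:

/* sketch: compiles against the declarations above, runs only in the backend */
void
touch_block(Relation reln, BlockNumber blkno)
{
    Buffer buf = ReadBuffer(reln, blkno);   /* pins the buffer */

    if (BufferIsValid(buf)) {
        Block block = BufferGetBlock(buf);  /* raw block contents */

        /* ... modify the block in place ... */
        (void) block;
        WriteBuffer(buf);   /* assumed: mark dirty and release the pin */
    }
}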

src/backend/storage/bufpage.h

@@ -0,0 +1,256 @@
/*-------------------------------------------------------------------------
*
* bufpage.h--
* Standard POSTGRES buffer page definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: bufpage.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUFPAGE_H
#define BUFPAGE_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "storage/buf.h"
#include "storage/item.h"
#include "storage/itemid.h"
#include "storage/itemptr.h"
/*
* a postgres disk page is an abstraction layered on top of a postgres
* disk block (which is simply a unit of i/o, see block.h).
*
* specifically, while a disk block can be unformatted, a postgres
* disk page is always a slotted page of the form:
*
* +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
* | ... linpN | |
* +-----------+--------------------------------------+
* | ^ pd_lower |
* | |
* | v pd_upper |
* +-------------+------------------------------------+
* | | tupleN ... |
* +-------------+------------------+-----------------+
* | ... tuple2 tuple1 tuple0 | "special space" |
* +--------------------------------+-----------------+
* ^ pd_special
*
* a page is full when nothing can be added between pd_lower and
* pd_upper.
*
* all blocks written out by an access method must be disk pages.
*
* EXCEPTIONS:
*
* obviously, a page is not formatted before it is initialized by
* a call to PageInit.
*
* the contents of the special pg_variable/pg_time/pg_log tables are
* raw disk blocks with special formats. these are the only "access
* methods" that need not write disk pages.
*
* NOTES:
*
* linp0..N form an ItemId array. ItemPointers point into this array
* rather than pointing directly to a tuple.
*
* tuple0..N are added "backwards" on the page. because a tuple's
* ItemPointer points to its ItemId entry rather than its actual
* byte-offset position, tuples can be physically shuffled on a page
* whenever the need arises.
*
* AM-generic per-page information is kept in the pd_opaque field of
* the PageHeaderData. (this is currently only the page size.)
* AM-specific per-page data is kept in the area marked "special
* space"; each AM has an "opaque" structure defined somewhere that is
* stored as the page trailer. an access method should always
* initialize its pages with PageInit and then set its own opaque
* fields.
*/
typedef Pointer Page;
/*
* PageIsValid --
* True iff page is valid.
*/
#define PageIsValid(page) PointerIsValid(page)
/*
* location (byte offset) within a page.
*
* note that this is actually limited to 2^13 because we have limited
* ItemIdData.lp_off and ItemIdData.lp_len to 13 bits (see itemid.h).
*/
typedef uint16 LocationIndex;
/*
* space management information generic to any page
*
* od_pagesize - size in bytes.
* in reality, we need at least 64B to fit the
* page header, opaque space and a minimal tuple;
* on the high end, we can only support pages up
* to 8KB because lp_off/lp_len are 13 bits.
*/
typedef struct OpaqueData {
uint16 od_pagesize;
} OpaqueData;
typedef OpaqueData *Opaque;
/*
* disk page organization
*/
typedef struct PageHeaderData {
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
OpaqueData pd_opaque; /* AM-generic information */
ItemIdData pd_linp[1]; /* line pointers */
} PageHeaderData;
typedef PageHeaderData *PageHeader;
typedef enum {
ShufflePageManagerMode,
OverwritePageManagerMode
} PageManagerMode;
/* ----------------
* misc support macros
* ----------------
*/
/*
* XXX this is wrong -- ignores padding/alignment, variable page size,
* AM-specific opaque space at the end of the page (as in btrees), ...
* however, it at least serves as an upper bound for heap pages.
*/
#define MAXTUPLEN (BLCKSZ - sizeof (PageHeaderData))
/* ----------------------------------------------------------------
* page support macros
* ----------------------------------------------------------------
*/
/*
* PageIsValid -- This is defined in page.h.
*/
/*
* PageIsUsed --
* True iff the page is used.
*
* Note:
* Assumes page is valid.
*/
#define PageIsUsed(page) \
(AssertMacro(PageIsValid(page)) ? \
((bool) (((PageHeader) (page))->pd_lower != 0)) : false)
/*
* PageIsEmpty --
* returns true iff no itemid has been allocated on the page
*/
#define PageIsEmpty(page) \
(((PageHeader) (page))->pd_lower == \
(sizeof(PageHeaderData) - sizeof(ItemIdData)) ? true : false)
/*
* PageGetItemId --
* Returns an item identifier of a page.
*/
#define PageGetItemId(page, offsetNumber) \
((ItemId) (&((PageHeader) (page))->pd_linp[(-1) + (offsetNumber)]))
/* ----------------
* macros to access opaque space
* ----------------
*/
/*
* PageSizeIsValid --
* True iff the page size is valid.
*
* XXX currently all page sizes are "valid" but we only actually
* use BLCKSZ.
*/
#define PageSizeIsValid(pageSize) 1
/*
* PageGetPageSize --
* Returns the page size of a page.
*
* this can only be called on a formatted page (unlike
* BufferGetPageSize, which can be called on an unformatted page).
* however, it can be called on a page for which there is no buffer.
*/
#define PageGetPageSize(page) \
((Size) ((PageHeader) (page))->pd_opaque.od_pagesize)
/*
* PageSetPageSize --
* Sets the page size of a page.
*/
#define PageSetPageSize(page, size) \
((PageHeader) (page))->pd_opaque.od_pagesize = (size)
/* ----------------
* page special data macros
* ----------------
*/
/*
* PageGetSpecialSize --
* Returns size of special space on a page.
*
* Note:
* Assumes page is locked.
*/
#define PageGetSpecialSize(page) \
((uint16) (PageGetPageSize(page) - ((PageHeader)page)->pd_special))
/*
* PageGetSpecialPointer --
* Returns pointer to special space on a page.
*
* Note:
* Assumes page is locked.
*/
#define PageGetSpecialPointer(page) \
(AssertMacro(PageIsValid(page)) ? \
(char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \
: (char *) 0)
/* ----------------------------------------------------------------
* extern declarations
* ----------------------------------------------------------------
*/
extern Size BufferGetPageSize(Buffer buffer);
extern Page BufferGetPage(Buffer buffer);
extern void PageInit(Page page, Size pageSize, Size specialSize);
extern Item PageGetItem(Page page, ItemId itemId);
extern OffsetNumber PageAddItem(Page page, Item item, Size size,
OffsetNumber offsetNumber, ItemIdFlags flags);
extern Page PageGetTempPage(Page page, Size specialSize);
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
extern OffsetNumber PageGetMaxOffsetNumber(Page page);
extern void PageRepairFragmentation(Page page);
extern Size PageGetFreeSpace(Page page);
extern void PageManagerModeSet(PageManagerMode mode);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
extern void PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
char *location, Size size);
#endif /* BUFPAGE_H */
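
A sketch, assuming a backend context, of how an access method would format a fresh block with these routines; format_block is a made-up name and the item arguments are elided:

/* sketch: compiles against the declarations above, runs only in the backend */
void
format_block(Buffer buf)
{
    Page page = BufferGetPage(buf);

    /* lay down the slotted-page header; no AM-specific special space */
    PageInit(page, BufferGetPageSize(buf), 0);
    Assert(PageIsEmpty(page));

    /* tuples then go in with
     *   PageAddItem(page, item, size, offsetNumber, flags);
     * while PageGetFreeSpace(page) tracks the room left between
     * pd_lower and pd_upper. */
}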

src/backend/storage/fd.h

@@ -0,0 +1,96 @@
/*-------------------------------------------------------------------------
*
* fd.h--
* Virtual file descriptor definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: fd.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
* use them for all file activity...
*
* File fd;
* fd = PathNameOpenFile("foo", O_RDONLY, 0666);
*
* use AllocateFile if you need a file descriptor in some other context.
* it will make sure that there is a file descriptor free
*
* use FreeFile to let the virtual file descriptor package know that
* there is now a free fd (when you are done with it)
*
* AllocateFile();
* FreeFile();
*/
#ifndef FD_H
#define FD_H
/*
* FileOpen uses the standard UNIX open(2) flags.
*/
#include <fcntl.h> /* for O_ on most */
#ifndef O_RDONLY
#include <sys/file.h> /* for O_ on the rest */
#endif /* O_RDONLY */
/*
* FileSeek uses the standard UNIX lseek(2) flags.
*/
#ifndef WIN32
#include <unistd.h> /* for SEEK_ on most */
#else
#ifndef SEEK_SET
#include <stdio.h> /* for SEEK_ on the rest */
#endif /* SEEK_SET */
#endif /* WIN32 */
#include "c.h"
#include "storage/block.h"
typedef char *FileName;
typedef int File;
/* originally in libpq-fs.h */
struct pgstat { /* just the fields we need from stat structure */
int st_ino;
int st_mode;
unsigned int st_size;
unsigned int st_sizehigh; /* high order bits */
/* 2^64 == 1.8 x 10^20 bytes */
int st_uid;
int st_atime_s; /* just the seconds */
int st_mtime_s; /* since SysV and the new BSD both have */
int st_ctime_s; /* usec fields.. */
};
/*
* prototypes for functions in fd.c
*/
extern void FileInvalidate(File file);
extern File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern void FileClose(File file);
extern void FileUnlink(File file);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
extern long FileSeek(File file, long offset, int whence);
extern long FileTell(File file);
extern int FileTruncate(File file, int offset);
extern int FileSync(File file);
extern int FileNameUnlink(char *filename);
extern void AllocateFile(void);
extern void FreeFile(void);
extern void closeAllVfds(void);
extern void closeOneVfd(void);
#endif /* FD_H */
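
A sketch of the intended call sequence, assuming a backend context; scribble is a made-up name, and per the notes above every subsequent operation on the File must go through these wrappers rather than the raw UNIX calls:

/* sketch: compiles against the declarations above, runs only in the backend */
void
scribble(void)
{
    char buf[5];
    File fd = PathNameOpenFile("foo", O_CREAT | O_RDWR, 0666);

    if (fd < 0)          /* assumed failure convention: negative File */
        return;
    FileWrite(fd, "hello", 5);
    FileSeek(fd, 0L, SEEK_SET);
    FileRead(fd, buf, 5);
    FileSync(fd);
    FileClose(fd);
}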

src/backend/storage/file/Makefile.inc

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/file
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/file/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= fd.c

src/backend/storage/file/fd.c

@@ -0,0 +1,888 @@
/*-------------------------------------------------------------------------
*
* fd.c--
* Virtual file descriptor code.
*
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Id: fd.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a file is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
#include <stdio.h>
#include <sys/file.h>
#include <sys/param.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <unistd.h>
#include "c.h"
#include "miscadmin.h" /* for DataDir */
#include "utils/palloc.h"
#ifdef PORTNAME_sparc
/*
* the SunOS 4 NOFILE is a lie, because the default limit is *not* the
* maximum number of file descriptors you can have open.
*
* we have to either use this number (the default dtablesize) or
* explicitly call setrlimit(RLIMIT_NOFILE, NOFILE).
*/
#include <sys/user.h>
#undef NOFILE
#define NOFILE NOFILE_IN_U
#endif /* PORTNAME_sparc */
/*
* Problem: Postgres does a system(ld...) to do dynamic loading. This
* will open several extra files in addition to those used by
* Postgres. We need to do this hack to guarantee that there are file
* descriptors free for ld to use.
*
* The current solution is to limit the number of file descriptors
* that this code will allocate at one time (it leaves
* RESERVE_FOR_LD free).
*
* (Even though most dynamic loaders now use dlopen(3) or the
* equivalent, the OS must still open several files to perform the
* dynamic loading. Keep this here.)
*/
#define RESERVE_FOR_LD 10
/*
* If we are using weird storage managers, we may need to keep real
* file descriptors open so that the jukebox server doesn't think we
* have gone away (and no longer care about a platter or file that
* we've been using). This might be an actual file descriptor for a
* local jukebox interface that uses paths, or a socket connection for
* a network jukebox server. Since we can't be opening and closing
* these descriptors at whim, we must make allowances for them.
*/
#ifdef HP_JUKEBOX
#define RESERVE_FOR_JB 25
#define MAXFILES ((NOFILE - RESERVE_FOR_LD) - RESERVE_FOR_JB)
#else /* HP_JUKEBOX */
#define MAXFILES (NOFILE - RESERVE_FOR_LD)
#endif /* HP_JUKEBOX */
/* Debugging.... */
#ifdef FDDEBUG
# define DO_DB(A) A
#else
# define DO_DB(A) /* A */
#endif
#define VFD_CLOSED -1
#include "storage/fd.h"
#include "utils/elog.h"
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
typedef struct vfd {
signed short fd;
unsigned short fdstate;
#define FD_DIRTY (1 << 0)
File nextFree;
File lruMoreRecently;
File lruLessRecently;
long seekPos;
char *fileName;
int fileFlags;
int fileMode;
} Vfd;
/*
* Virtual File Descriptor array pointer and size. This grows as
* needed.
*/
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
* Minimum number of file descriptors known to be free.
*/
static int FreeFd = 0;
/*
* Number of file descriptors known to be open.
*/
static int nfile = 0;
/*
* we use the name of the null device in various places, mostly so
* that we can open it and find out if we really have any descriptors
* available or not.
*/
#ifndef WIN32
static char *Nulldev = "/dev/null";
static char Sep_char = '/';
#else
static char *Nulldev = "NUL";
static char Sep_char = '\\';
#endif /* WIN32 */
/*
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open
* AssertLruRoom - make sure that there is a free fd.
*
* the Least Recently Used ring is a doubly linked list that begins and
* ends on element zero.
*
* example:
*
* /--less----\ /---------\
* v \ v \
* #0 --more---> LeastRecentlyUsed --more-\ \
* ^\ | |
* \\less--> MostRecentlyUsedFile <---/ |
* \more---/ \--less--/
*
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
*/
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert (File file);
static void AssertLruRoom(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
static char *filepath(char *filename);
#if defined(FDDEBUG)
static void
_dump_lru()
{
int mru = VfdCache[0].lruLessRecently;
Vfd *vfdP = &VfdCache[mru];
printf("MOST %d ", mru);
while (mru != 0)
{
mru = vfdP->lruLessRecently;
vfdP = &VfdCache[mru];
printf("%d ", mru);
}
printf("LEAST\n");
}
#endif /* FDDEBUG */
static void
Delete(File file)
{
Vfd *fileP;
DO_DB(printf("DEBUG: Delete %d (%s)\n",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
Assert(file != 0);
fileP = &VfdCache[file];
VfdCache[fileP->lruLessRecently].lruMoreRecently =
VfdCache[file].lruMoreRecently;
VfdCache[fileP->lruMoreRecently].lruLessRecently =
VfdCache[file].lruLessRecently;
DO_DB(_dump_lru());
}
static void
LruDelete(File file)
{
Vfd *fileP;
int returnValue;
DO_DB(printf("DEBUG: LruDelete %d (%s)\n",
file, VfdCache[file].fileName));
Assert(file != 0);
fileP = &VfdCache[file];
/* delete the vfd record from the LRU ring */
Delete(file);
/* save the seek position */
fileP->seekPos = lseek(fileP->fd, 0L, SEEK_CUR);
Assert( fileP->seekPos != -1);
/* if we have written to the file, sync it */
if (fileP->fdstate & FD_DIRTY) {
returnValue = fsync(fileP->fd);
Assert(returnValue != -1);
fileP->fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(fileP->fd);
Assert(returnValue != -1);
--nfile;
fileP->fd = VFD_CLOSED;
/* note that there is now one more free real file descriptor */
FreeFd++;
}
static void
Insert(File file)
{
Vfd *vfdP;
DO_DB(printf("DEBUG: Insert %d (%s)\n",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
vfdP->lruMoreRecently = 0;
vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
VfdCache[0].lruLessRecently = file;
VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
DO_DB(_dump_lru());
}
static int
LruInsert (File file)
{
Vfd *vfdP;
int returnValue;
DO_DB(printf("DEBUG: LruInsert %d (%s)\n",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file)) {
int tmpfd;
/*
* Note, we check to see if there's a free file descriptor
* before attempting to open a file. One general way to do
* this is to try to open the null device which everybody
		 * should be able to open all the time.  If this fails, we
		 * assume there are no free file descriptors left.
*/
tryAgain:
tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
if (tmpfd < 0) {
FreeFd = 0;
errno = 0;
AssertLruRoom();
goto tryAgain;
} else {
close(tmpfd);
}
vfdP->fd = open(vfdP->fileName,vfdP->fileFlags,vfdP->fileMode);
if (vfdP->fd < 0) {
DO_DB(printf("RE_OPEN FAILED: %d\n",
errno));
return (vfdP->fd);
} else {
DO_DB(printf("RE_OPEN SUCCESS\n"));
++nfile;
}
/* seek to the right position */
if (vfdP->seekPos != 0L) {
returnValue =
lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
/* init state on open */
vfdP->fdstate = 0x0;
/* note that a file descriptor has been used up */
if (FreeFd > 0)
FreeFd--;
}
/*
* put it at the head of the Lru ring
*/
Insert(file);
return (0);
}
static void
AssertLruRoom()
{
DO_DB(printf("DEBUG: AssertLruRoom (FreeFd = %d)\n",
FreeFd));
if (FreeFd <= 0 || nfile >= MAXFILES) {
LruDelete(VfdCache[0].lruMoreRecently);
}
}
static File
AllocateVfd()
{
Index i;
File file;
DO_DB(printf("DEBUG: AllocateVfd\n"));
if (SizeVfdCache == 0) {
/* initialize */
		VfdCache = (Vfd *)malloc(sizeof(Vfd));
		Assert(VfdCache != NULL);
VfdCache->nextFree = 0;
VfdCache->lruMoreRecently = 0;
VfdCache->lruLessRecently = 0;
VfdCache->fd = VFD_CLOSED;
VfdCache->fdstate = 0x0;
SizeVfdCache = 1;
}
if (VfdCache[0].nextFree == 0) {
/*
* The free list is empty so it is time to increase the
* size of the array
*/
VfdCache =(Vfd *)realloc(VfdCache, sizeof(Vfd)*SizeVfdCache*2);
Assert(VfdCache != NULL);
/*
* Set up the free list for the new entries
*/
for (i = SizeVfdCache; i < 2*SizeVfdCache; i++) {
memset((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0]));
VfdCache[i].nextFree = i+1;
VfdCache[i].fd = VFD_CLOSED;
}
/*
* Element 0 is the first and last element of the free
* list
*/
VfdCache[0].nextFree = SizeVfdCache;
VfdCache[2*SizeVfdCache-1].nextFree = 0;
/*
* Record the new size
*/
SizeVfdCache *= 2;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
static void
FreeVfd(File file)
{
DO_DB(printf("DB: FreeVfd: %d (%s)\n",
file, VfdCache[file].fileName));
VfdCache[file].nextFree = VfdCache[0].nextFree;
VfdCache[0].nextFree = file;
}
static char *
filepath(char *filename)
{
char *buf;
char basename[16];
int len;
#ifndef WIN32
if (*filename != Sep_char) {
#else
if (!(filename[1] == ':' && filename[2] == Sep_char)) {
#endif /* WIN32 */
/* Either /base/ or \base\ */
sprintf(basename, "%cbase%c", Sep_char, Sep_char);
len = strlen(DataDir) + strlen(basename) + strlen(GetDatabaseName())
+ strlen(filename) + 2;
buf = (char*) palloc(len);
sprintf(buf, "%s%s%s%c%s",
DataDir, basename, GetDatabaseName(), Sep_char, filename);
} else {
buf = (char *) palloc(strlen(filename) + 1);
strcpy(buf, filename);
}
return(buf);
}
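/*
 * Example (paths assumed for illustration): with DataDir
 * "/usr/local/pgsql/data" and database "test", filepath("pg_class")
 * yields "/usr/local/pgsql/data/base/test/pg_class", while an
 * absolute name such as "/tmp/foo" is simply copied.
 */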
static int
FileAccess(File file)
{
int returnValue;
DO_DB(printf("DB: FileAccess %d (%s)\n",
file, VfdCache[file].fileName));
/*
* Is the file open? If not, close the least recently used,
* then open it and stick it at the head of the used ring
*/
if (FileIsNotOpen(file)) {
AssertLruRoom();
returnValue = LruInsert(file);
if (returnValue != 0)
return returnValue;
} else {
/*
		 * We now know that the file is open, so we move it to the
		 * head of the Lru ring to mark it most recently used.
*/
Delete(file);
Insert(file);
}
return (0);
}
/*
* Called when we get a shared invalidation message on some relation.
*/
void
FileInvalidate(File file)
{
if (!FileIsNotOpen(file)) {
LruDelete(file);
}
}
/* VARARGS2 */
static File
fileNameOpenFile(FileName fileName,
int fileFlags,
int fileMode)
{
static int osRanOut = 0;
File file;
Vfd *vfdP;
int tmpfd;
DO_DB(printf("DEBUG: FileNameOpenFile: %s %x %o\n",
fileName, fileFlags, fileMode));
file = AllocateVfd();
vfdP = &VfdCache[file];
if (nfile >= MAXFILES || (FreeFd == 0 && osRanOut)) {
AssertLruRoom();
}
tryAgain:
tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
if (tmpfd < 0) {
DO_DB(printf("DB: not enough descs, retry, er= %d\n",
errno));
errno = 0;
FreeFd = 0;
osRanOut = 1;
AssertLruRoom();
goto tryAgain;
} else {
close(tmpfd);
}
#ifdef WIN32
fileFlags |= _O_BINARY;
#endif /* WIN32 */
vfdP->fd = open(fileName,fileFlags,fileMode);
vfdP->fdstate = 0x0;
if (vfdP->fd < 0) {
FreeVfd(file);
return -1;
}
++nfile;
DO_DB(printf("DB: FNOF success %d\n",
vfdP->fd));
(void)LruInsert(file);
vfdP->fileName = malloc(strlen(fileName)+1);
strcpy(vfdP->fileName,fileName);
vfdP->fileFlags = fileFlags & ~(O_TRUNC|O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
return file;
}
/*
* open a file in the database directory ($PGDATA/base/...)
*/
File
FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
File fd;
char *fname;
fname = filepath(fileName);
fd = fileNameOpenFile(fname, fileFlags, fileMode);
pfree(fname);
return(fd);
}
/*
* open a file in an arbitrary directory
*/
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
return(fileNameOpenFile(fileName, fileFlags, fileMode));
}
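/*
 * Minimal usage sketch of the virtual file descriptor interface,
 * compiled out with #if 0.  The file name is assumed for
 * illustration; a File is an index into VfdCache, not an OS
 * descriptor, so callers may hold many more of these open than
 * MAXFILES real descriptors.
 */
#if 0
static void
vfd_example()
{
	File f;
	char buf[128];

	f = FileNameOpenFile("vfd_example", O_RDWR|O_CREAT, 0600);
	if (f < 0)
		elog(WARN, "vfd_example: open failed");
	memset(buf, 0, sizeof(buf));
	(void) FileWrite(f, buf, sizeof(buf));
	(void) FileSeek(f, 0L, SEEK_SET);
	(void) FileRead(f, buf, sizeof(buf));
	FileClose(f);
}
#endif /* 0 */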
void
FileClose(File file)
{
int returnValue;
DO_DB(printf("DEBUG: FileClose: %d (%s)\n",
file, VfdCache[file].fileName));
if (!FileIsNotOpen(file)) {
/* remove the file from the lru ring */
Delete(file);
/* record the new free operating system file descriptor */
FreeFd++;
/* if we did any writes, sync the file before closing */
if (VfdCache[file].fdstate & FD_DIRTY) {
returnValue = fsync(VfdCache[file].fd);
Assert(returnValue != -1);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(VfdCache[file].fd);
Assert(returnValue != -1);
--nfile;
VfdCache[file].fd = VFD_CLOSED;
}
/*
* Add the Vfd slot to the free list
*/
FreeVfd(file);
/*
* Free the filename string
*/
free(VfdCache[file].fileName);
}
void
FileUnlink(File file)
{
int returnValue;
DO_DB(printf("DB: FileClose: %d (%s)\n",
file, VfdCache[file].fileName));
if (!FileIsNotOpen(file)) {
/* remove the file from the lru ring */
Delete(file);
/* record the new free operating system file descriptor */
FreeFd++;
/* if we did any writes, sync the file before closing */
if (VfdCache[file].fdstate & FD_DIRTY) {
returnValue = fsync(VfdCache[file].fd);
Assert(returnValue != -1);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(VfdCache[file].fd);
Assert(returnValue != -1);
--nfile;
VfdCache[file].fd = VFD_CLOSED;
}
/* add the Vfd slot to the free list */
FreeVfd(file);
	/* unlink the physical file, then free the filename string */
unlink(VfdCache[file].fileName);
free(VfdCache[file].fileName);
}
int
FileRead(File file, char *buffer, int amount)
{
int returnCode;
DO_DB(printf("DEBUG: FileRead: %d (%s) %d 0x%x\n",
file, VfdCache[file].fileName, amount, buffer));
FileAccess(file);
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0) {
VfdCache[file].seekPos += returnCode;
}
return returnCode;
}
int
FileWrite(File file, char *buffer, int amount)
{
int returnCode;
DO_DB(printf("DB: FileWrite: %d (%s) %d 0x%lx\n",
file, VfdCache[file].fileName, amount, buffer));
FileAccess(file);
returnCode = write(VfdCache[file].fd, buffer, amount);
if (returnCode > 0) { /* changed by Boris with Mao's advice */
VfdCache[file].seekPos += returnCode;
}
/* record the write */
VfdCache[file].fdstate |= FD_DIRTY;
return returnCode;
}
long
FileSeek(File file, long offset, int whence)
{
	long returnCode;
DO_DB(printf("DEBUG: FileSeek: %d (%s) %d %d\n",
file, VfdCache[file].fileName, offset, whence));
if (FileIsNotOpen(file)) {
switch(whence) {
case SEEK_SET:
VfdCache[file].seekPos = offset;
return offset;
case SEEK_CUR:
			VfdCache[file].seekPos += offset;
return VfdCache[file].seekPos;
case SEEK_END:
FileAccess(file);
returnCode = VfdCache[file].seekPos =
lseek(VfdCache[file].fd, offset, whence);
return returnCode;
default:
elog(WARN, "FileSeek: invalid whence: %d", whence);
break;
}
} else {
returnCode = VfdCache[file].seekPos =
lseek(VfdCache[file].fd, offset, whence);
return returnCode;
}
/*NOTREACHED*/
return(-1L);
}
/*
* XXX not actually used but here for completeness
*/
long
FileTell(File file)
{
DO_DB(printf("DEBUG: FileTell %d (%s)\n",
file, VfdCache[file].fileName));
return VfdCache[file].seekPos;
}
int
FileTruncate(File file, int offset)
{
int returnCode;
DO_DB(printf("DEBUG: FileTruncate %d (%s)\n",
file, VfdCache[file].fileName));
(void) FileSync(file);
(void) FileAccess(file);
returnCode = ftruncate(VfdCache[file].fd, offset);
return(returnCode);
}
int
FileSync(File file)
{
int returnCode;
/*
* If the file isn't open, then we don't need to sync it; we
* always sync files when we close them. Also, if we haven't
* done any writes that we haven't already synced, we can ignore
* the request.
*/
if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) {
returnCode = 0;
} else {
returnCode = fsync(VfdCache[file].fd);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
return returnCode;
}
int
FileNameUnlink(char *filename)
{
int retval;
char *fname;
fname = filepath(filename);
retval = unlink(fname);
pfree(fname);
return(retval);
}
/*
* if we want to be sure that we have a real file descriptor available
* (e.g., we want to know this in psort) we call AllocateFile to force
* availability. when we are done we call FreeFile to deallocate the
* descriptor.
*
* allocatedFiles keeps track of how many have been allocated so we
* can give a warning if there are too few left.
*/
static int allocatedFiles = 0;
void
AllocateFile()
{
int fd;
int fdleft;
while ((fd = open(Nulldev,O_WRONLY,0)) < 0) {
if (errno == EMFILE) {
errno = 0;
FreeFd = 0;
AssertLruRoom();
} else {
elog(WARN,"Open: %s in %s line %d\n", Nulldev,
__FILE__, __LINE__);
}
}
close(fd);
++allocatedFiles;
fdleft = MAXFILES - allocatedFiles;
if (fdleft < 6) {
elog(DEBUG,"warning: few usable file descriptors left (%d)", fdleft);
}
DO_DB(printf("DEBUG: AllocatedFile. FreeFd = %d\n",
FreeFd));
}
/*
* XXX What happens if FreeFile() is called without a previous
* AllocateFile()?
*/
void
FreeFile()
{
DO_DB(printf("DEBUG: FreeFile. FreeFd now %d\n",
FreeFd));
FreeFd++;
nfile++; /* dangerous */
Assert(allocatedFiles > 0);
--allocatedFiles;
}
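/*
 * Sketch of the intended AllocateFile/FreeFile pairing, compiled out
 * with #if 0.  Code that needs a real descriptor (e.g., for stdio)
 * reserves one first and releases it after closing; the scratch file
 * name is assumed for illustration.
 */
#if 0
static void
stdio_example()
{
	FILE *fp;

	AllocateFile();			/* force a real descriptor free */
	fp = fopen("/tmp/pg_scratch", "w");
	if (fp != NULL) {
		fputs("spill data\n", fp);
		fclose(fp);
	}
	FreeFile();			/* give the descriptor back */
}
#endif /* 0 */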
void
closeAllVfds()
{
int i;
for (i=0; i<SizeVfdCache; i++) {
if (!FileIsNotOpen(i))
LruDelete(i);
}
}
void
closeOneVfd()
{
int tmpfd;
tmpfd = open(Nulldev, O_CREAT | O_RDWR, 0666);
if (tmpfd < 0) {
FreeFd = 0;
AssertLruRoom();
FreeFd = 0;
}
else
close(tmpfd);
}

285
src/backend/storage/ipc.h Normal file
View File

@@ -0,0 +1,285 @@
/*-------------------------------------------------------------------------
*
* ipc.h--
* POSTGRES inter-process communication definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: ipc.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
* NOTES
* This file is very architecture-specific. This stuff should actually
* be factored into the port/ directories.
*
*-------------------------------------------------------------------------
*/
#ifndef IPC_H
#define IPC_H
#include <sys/types.h>
#ifndef _IPC_
#define _IPC_
#include <sys/ipc.h>
#endif
#include "c.h"
/*
* Many architectures have support for user-level spinlocks (i.e., an
* atomic test-and-set instruction). However, we have only written
* spinlock code for the architectures listed.
*/
#if defined(PORTNAME_aix) || \
defined(PORTNAME_alpha) || \
defined(PORTNAME_hpux) || \
defined(PORTNAME_irix5) || \
defined(PORTNAME_next) || \
defined(PORTNAME_sparc) || \
defined(PORTNAME_sparc_solaris) || \
(defined(__i386__) && defined(__GNUC__))
#define HAS_TEST_AND_SET
#endif
#if defined(HAS_TEST_AND_SET)
#if defined(PORTNAME_next)
/*
* Use Mach mutex routines since these are, in effect, test-and-set
* spinlocks.
*/
#undef NEVER /* definition in cthreads.h conflicts with parse.h */
#include <mach/cthreads.h>
typedef struct mutex slock_t;
#else /* next */
#if defined(PORTNAME_aix)
/*
* The AIX C library has the cs(3) builtin for compare-and-set that
* operates on ints.
*/
typedef unsigned int slock_t;
#else /* aix */
#if defined(PORTNAME_alpha)
#include <sys/mman.h>
typedef msemaphore slock_t;
#else /* alpha */
#if defined(PORTNAME_hpux)
/*
* The PA-RISC "semaphore" for the LDWCX instruction is 4 bytes aligned
* to a 16-byte boundary.
*/
typedef struct { int sem[4]; } slock_t;
#else /* hpux */
#if defined(PORTNAME_irix5)
#include <abi_mutex.h>
typedef abilock_t slock_t;
#else /* irix5 */
/*
* On all other architectures spinlocks are a single byte.
*/
typedef unsigned char slock_t;
#endif /* irix5 */
#endif /* hpux */
#endif /* alpha */
#endif /* aix */
#endif /* next */
extern void S_LOCK(slock_t *lock);
extern void S_UNLOCK(slock_t *lock);
extern void S_INIT_LOCK(slock_t *lock);
#if defined(PORTNAME_hpux) || defined(PORTNAME_alpha) || defined(PORTNAME_irix5) || defined(PORTNAME_next)
extern int S_LOCK_FREE(slock_t *lock);
#else /* PORTNAME_hpux */
#define S_LOCK_FREE(lock) ((*lock) == 0)
#endif /* PORTNAME_hpux */
#endif /* HAS_TEST_AND_SET */
/*
* On architectures for which we have not implemented spinlocks (or
* cannot do so), we use System V semaphores. We also use them for
* long locks. For some reason union semun is never defined in the
* System V header files so we must do it ourselves.
*/
#if defined(sequent) || \
defined(PORTNAME_aix) || \
defined(PORTNAME_alpha) || \
defined(PORTNAME_hpux) || \
defined(PORTNAME_sparc_solaris) || \
defined(WIN32) || \
defined(PORTNAME_ultrix4)
union semun {
int val;
struct semid_ds *buf;
unsigned short *array;
};
#endif
typedef uint16 SystemPortAddress;
/* semaphore definitions */
#define IPCProtection (0600) /* access/modify by user only */
#define IPC_NMAXSEM 25 /* maximum number of semaphores */
#define IpcSemaphoreDefaultStartValue 255
#define IpcSharedLock (-1)
#define IpcExclusiveLock (-255)
#define IpcUnknownStatus (-1)
#define IpcInvalidArgument (-2)
#define IpcSemIdExist (-3)
#define IpcSemIdNotExist (-4)
typedef uint32 IpcSemaphoreKey; /* semaphore key */
typedef int IpcSemaphoreId;
/* shared memory definitions */
#define IpcMemCreationFailed (-1)
#define IpcMemIdGetFailed (-2)
#define IpcMemAttachFailed 0
typedef uint32 IPCKey;
#define PrivateIPCKey IPC_PRIVATE
#define DefaultIPCKey 17317
typedef uint32 IpcMemoryKey; /* shared memory key */
typedef int IpcMemoryId;
/* ipc.c */
extern void exitpg(int code);
extern void quasi_exitpg(void);
extern int on_exitpg(void (*function)(), caddr_t arg);
extern IpcSemaphoreId IpcSemaphoreCreate(IpcSemaphoreKey semKey,
int semNum, int permission, int semStartValue,
int removeOnExit, int *status);
extern void IpcSemaphoreSet(int semId, int semno, int value);
extern void IpcSemaphoreKill(IpcSemaphoreKey key);
extern void IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock);
extern void IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock);
extern int IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem);
extern int IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem);
extern IpcMemoryId IpcMemoryCreate(IpcMemoryKey memKey, uint32 size,
int permission);
extern IpcMemoryId IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size);
extern void IpcMemoryDetach(int status, char *shmaddr);
extern char *IpcMemoryAttach(IpcMemoryId memId);
extern void IpcMemoryKill(IpcMemoryKey memKey);
extern void CreateAndInitSLockMemory(IPCKey key);
extern void AttachSLockMemory(IPCKey key);
#ifdef HAS_TEST_AND_SET
#define NSLOCKS 2048
#define NOLOCK 0
#define SHAREDLOCK 1
#define EXCLUSIVELOCK 2
typedef enum _LockId_ {
BUFMGRLOCKID,
LOCKLOCKID,
OIDGENLOCKID,
SHMEMLOCKID,
BINDINGLOCKID,
LOCKMGRLOCKID,
SINVALLOCKID,
#ifdef MAIN_MEMORY
MMCACHELOCKID,
#endif /* MAIN_MEMORY */
PROCSTRUCTLOCKID,
FIRSTFREELOCKID
} _LockId_;
#define MAX_SPINS FIRSTFREELOCKID
typedef struct slock {
slock_t locklock;
unsigned char flag;
short nshlocks;
slock_t shlock;
slock_t exlock;
slock_t comlock;
struct slock *next;
} SLock;
extern void ExclusiveLock(int lockid);
extern void ExclusiveUnlock(int lockid);
extern bool LockIsFree(int lockid);
#else /* HAS_TEST_AND_SET */
typedef enum _LockId_ {
SHMEMLOCKID,
BINDINGLOCKID,
BUFMGRLOCKID,
LOCKMGRLOCKID,
SINVALLOCKID,
#ifdef MAIN_MEMORY
MMCACHELOCKID,
#endif /* MAIN_MEMORY */
PROCSTRUCTLOCKID,
OIDGENLOCKID,
FIRSTFREELOCKID
} _LockId_;
#define MAX_SPINS FIRSTFREELOCKID
#endif /* HAS_TEST_AND_SET */
/*
* the following are originally in ipci.h but the prototypes have circular
* dependencies and most files include both ipci.h and ipc.h anyway, hence
* combined.
*
*/
/*
* Note:
* These must not hash to DefaultIPCKey or PrivateIPCKey.
*/
#define SystemPortAddressGetIPCKey(address) \
(28597 * (address) + 17491)
/*
* these keys are originally numbered from 1 to 12 consecutively but not
* all are used. The unused ones are removed. - ay 4/95.
*/
#define IPCKeyGetBufferMemoryKey(key) \
((key == PrivateIPCKey) ? key : 1 + (key))
#define IPCKeyGetSIBufferMemoryBlock(key) \
((key == PrivateIPCKey) ? key : 7 + (key))
#define IPCKeyGetSLockSharedMemoryKey(key) \
((key == PrivateIPCKey) ? key : 10 + (key))
#define IPCKeyGetSpinLockSemaphoreKey(key) \
((key == PrivateIPCKey) ? key : 11 + (key))
#define IPCKeyGetWaitIOSemaphoreKey(key) \
((key == PrivateIPCKey) ? key : 12 + (key))
/* --------------------------
* NOTE: This macro must always give the highest numbered key as every backend
* process forked off by the postmaster will be trying to acquire a semaphore
* with a unique key value starting at key+14 and incrementing up. Each
* backend uses the current key value then increments it by one.
* --------------------------
*/
#define IPCGetProcessSemaphoreInitKey(key) \
((key == PrivateIPCKey) ? key : 14 + (key))
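/*
 * For example, with DefaultIPCKey 17317: the buffer pool segment gets
 * key 17318, the SI buffer 17324, the slock segment 17327, the
 * spinlock semaphores 17328, the WaitIO semaphore 17329, and the
 * per-backend process semaphores start at 17331 and count upward.
 */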
/* ipci.c */
extern IPCKey SystemPortAddressCreateIPCKey(SystemPortAddress address);
extern void CreateSharedMemoryAndSemaphores(IPCKey key);
extern void AttachSharedMemoryAndSemaphores(IPCKey key);
#endif /* IPC_H */

15
src/backend/storage/ipc/Makefile.inc Normal file
View File

@@ -0,0 +1,15 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/ipc
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= ipc.c ipci.c s_lock.c shmem.c shmqueue.c sinval.c \
sinvaladt.c spin.c

31
src/backend/storage/ipc/README Normal file
View File

@@ -0,0 +1,31 @@
$Header: /cvsroot/pgsql/src/backend/storage/ipc/README,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
Mon Jul 18 11:09:22 PDT 1988 W.KLAS
Cache invalidation synchronization routines:
===========================================
The cache synchronization is done using a message queue. Every
backend can register a message which then has to be read by
all backends. A message read by all backends is removed from the
queue automatically. If a message has been lost because the buffer
was full, all backends that haven't read this message will be
notified that they have to reset their cache state. This is done
at the time when they try to read the message queue.
The message queue is implemented as a shared buffer segment. The
queue is circular, to allow fast insertion, reading (of invalidation
data), and maintenance of the buffer.
Access to this shared message buffer is synchronized by the lock manager.
The lock manager treats the buffer as a regular relation and sets
relation level locks (with mode = LockWait) to block backends while
another backend is writing or reading the buffer. The identifiers used
for this special 'relation' are database id = 0 and relation id = 0.
The current implementation prints regular (e)log information
when a message has been removed from the buffer because the buffer
is full, and a backend has to reset its cache state. The elog level
is NOTICE. This can be used to improve the behavior of backends
when invalidating or resetting their cache state.
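A minimal sketch of the bookkeeping (illustrative only -- the names,
sizes, and MAXBACKENDS below are not the actual sinvaladt.c
structures):

    typedef struct {
        int maxMsgs;                 /* capacity of the circular buffer */
        int msgsInserted;            /* total messages ever inserted */
        int msgsRead[MAXBACKENDS];   /* per-backend count of messages read */
    } SISketch;

    /* backend b has lost messages, and must reset its cache state,
       when msgsInserted - msgsRead[b] > maxMsgs */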

718
src/backend/storage/ipc/ipc.c Normal file
View File

@@ -0,0 +1,718 @@
/*-------------------------------------------------------------------------
*
* ipc.c--
* POSTGRES inter-process communication definitions.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
* NOTES
*
* Currently, semaphores are used (my understanding anyway) in two
* different ways:
* 1. as mutexes on machines that don't have test-and-set (eg.
* mips R3000).
* 2. for putting processes to sleep when waiting on a lock
* and waking them up when the lock is free.
* The number of semaphores in (1) is fixed and those are shared
* among all backends. In (2), there is 1 semaphore per process and those
* are not shared with anyone else.
* -ay 4/95
*
*-------------------------------------------------------------------------
*/
#include <sys/types.h>
#include <sys/file.h>
#include <stdio.h>
#include <errno.h>
/* XXX - the following dependency should be moved into the defaults.mk file */
#ifndef _IPC_
#define _IPC_
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>
#endif
#include "storage/ipc.h"
#include "utils/memutils.h"
#include "utils/elog.h"
#if defined(PORTNAME_bsd44)
int UsePrivateMemory = 1;
#else
int UsePrivateMemory = 0;
#endif
#if defined(PORTNAME_bsdi)
/* hacka, hacka, hacka (XXX) */
union semun {
int val; /* value for SETVAL */
struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
ushort *array; /* array for GETALL & SETALL */
};
#endif
/* ----------------------------------------------------------------
* exit() handling stuff
* ----------------------------------------------------------------
*/
#define MAX_ON_EXITS 20
static struct ONEXIT {
void (*function)();
caddr_t arg;
} onexit_list[ MAX_ON_EXITS ];
static int onexit_index;
typedef struct _PrivateMemStruct {
int id;
char *memptr;
} PrivateMem;
PrivateMem IpcPrivateMem[16];
static int
PrivateMemoryCreate(IpcMemoryKey memKey,
uint32 size)
{
static int memid = 0;
UsePrivateMemory = 1;
IpcPrivateMem[memid].id = memid;
IpcPrivateMem[memid].memptr = malloc(size);
if (IpcPrivateMem[memid].memptr == NULL)
elog(WARN, "PrivateMemoryCreate: not enough memory to malloc");
memset(IpcPrivateMem[memid].memptr, 0, size); /* XXX PURIFY */
return (memid++);
}
static char *
PrivateMemoryAttach(IpcMemoryId memid)
{
return ( IpcPrivateMem[memid].memptr );
}
/* ----------------------------------------------------------------
* exitpg
*
* this function calls all the callbacks registered
* for it (to free resources) and then calls exit.
* This should be the only function to call exit().
* -cim 2/6/90
* ----------------------------------------------------------------
*/
static int exitpg_inprogress = 0;
void
exitpg(int code)
{
int i;
/* ----------------
	 * if exitpg_inprogress is true, then it means that we
* are being invoked from within an on_exit() handler
* and so we return immediately to avoid recursion.
* ----------------
*/
if (exitpg_inprogress)
return;
exitpg_inprogress = 1;
/* ----------------
* call all the callbacks registered before calling exit().
* ----------------
*/
for (i = onexit_index - 1; i >= 0; --i)
(*onexit_list[i].function)(code, onexit_list[i].arg);
exit(code);
}
/* ------------------
* Run all of the on_exitpg routines but don't exit in the end.
* This is used by the postmaster to re-initialize shared memory and
* semaphores after a backend dies horribly
* ------------------
*/
void
quasi_exitpg()
{
int i;
/* ----------------
	 * if exitpg_inprogress is true, then it means that we
* are being invoked from within an on_exit() handler
* and so we return immediately to avoid recursion.
* ----------------
*/
if (exitpg_inprogress)
return;
exitpg_inprogress = 1;
/* ----------------
	 * call all the callbacks registered, but don't call exit().
* ----------------
*/
for (i = onexit_index - 1; i >= 0; --i)
(*onexit_list[i].function)(0, onexit_list[i].arg);
onexit_index = 0;
exitpg_inprogress = 0;
}
/* ----------------------------------------------------------------
* on_exitpg
*
* this function adds a callback function to the list of
* functions invoked by exitpg(). -cim 2/6/90
* ----------------------------------------------------------------
*/
int
on_exitpg(void (*function)(), caddr_t arg)
{
if (onexit_index >= MAX_ON_EXITS)
return(-1);
onexit_list[ onexit_index ].function = function;
onexit_list[ onexit_index ].arg = arg;
++onexit_index;
return(0);
}
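/*
 * Usage sketch, compiled out with #if 0: callbacks run in reverse
 * order of registration, each called with the exit code and its
 * registered argument.  The callback and argument are assumed for
 * illustration.
 */
#if 0
static void
say_goodbye(int code, caddr_t arg)
{
	fprintf(stderr, "exiting (%d): %s\n", code, (char *) arg);
}

static void
exitpg_example()
{
	(void) on_exitpg(say_goodbye, (caddr_t) "cleanup");
	exitpg(0);	/* runs say_goodbye(0, "cleanup"), then exit(0) */
}
#endif /* 0 */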
/****************************************************************************/
/* IPCPrivateSemaphoreKill(status, semId) */
/* */
/****************************************************************************/
static void
IPCPrivateSemaphoreKill(int status,
int semId) /* caddr_t */
{
union semun semun;
semctl(semId, 0, IPC_RMID, semun);
}
/****************************************************************************/
/* IPCPrivateMemoryKill(status, shmId) */
/* */
/****************************************************************************/
static void
IPCPrivateMemoryKill(int status,
int shmId) /* caddr_t */
{
if ( UsePrivateMemory ) {
/* free ( IpcPrivateMem[shmId].memptr ); */
} else {
if (shmctl(shmId, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
elog(NOTICE, "IPCPrivateMemoryKill: shmctl(%d, %d, 0) failed: %m",
shmId, IPC_RMID);
}
}
}
/****************************************************************************/
/* IpcSemaphoreCreate(semKey, semNum, permission, semStartValue) */
/* */
/* - returns a semaphore identifier: */
/* */
/* if key doesn't exist: return a new id, status:= IpcSemIdNotExist */
/* if key exists: return the old id, status:= IpcSemIdExist */
/* if semNum > MAX : return # of argument, status:=IpcInvalidArgument */
/* */
/****************************************************************************/
/*
* Note:
* XXX This should be split into two different calls. One should
* XXX be used to create a semaphore set. The other to "attach" a
* XXX existing set. It should be an error for the semaphore set
 * XXX	to already exist or for it not to, respectively.
*
* Currently, the semaphore sets are "attached" and an error
* is detected only when a later shared memory attach fails.
*/
IpcSemaphoreId
IpcSemaphoreCreate(IpcSemaphoreKey semKey,
int semNum,
int permission,
int semStartValue,
int removeOnExit,
int *status)
{
int i;
int errStatus;
int semId;
u_short array[IPC_NMAXSEM];
union semun semun;
/* get a semaphore if non-existent */
/* check arguments */
if (semNum > IPC_NMAXSEM || semNum <= 0) {
*status = IpcInvalidArgument;
return(2); /* returns the number of the invalid argument */
}
semId = semget(semKey, 0, 0);
if (semId == -1) {
*status = IpcSemIdNotExist; /* there doesn't exist a semaphore */
#ifdef DEBUG_IPC
fprintf(stderr,"calling semget with %d, %d , %d\n",
semKey,
semNum,
IPC_CREAT|permission );
#endif
semId = semget(semKey, semNum, IPC_CREAT|permission);
if (semId < 0) {
perror("semget");
exitpg(3);
}
for (i = 0; i < semNum; i++) {
array[i] = semStartValue;
}
semun.array = array;
errStatus = semctl(semId, 0, SETALL, semun);
if (errStatus == -1) {
perror("semctl");
}
if (removeOnExit)
on_exitpg(IPCPrivateSemaphoreKill, (caddr_t)semId);
} else {
/* there is a semaphore id for this key */
*status = IpcSemIdExist;
}
#ifdef DEBUG_IPC
fprintf(stderr,"\nIpcSemaphoreCreate, status %d, returns %d\n",
*status,
semId );
fflush(stdout);
fflush(stderr);
#endif
return(semId);
}
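/*
 * Usage sketch, compiled out with #if 0 (the key is assumed for
 * illustration): create a one-semaphore set, then take and release a
 * shared lock on it.  Note that the "lock" arguments are the semop
 * increments, so IpcSharedLock (-1) decrements on lock and
 * IpcSemaphoreUnlock negates it again.
 */
#if 0
static void
sem_example()
{
	int status;
	IpcSemaphoreId id;

	id = IpcSemaphoreCreate((IpcSemaphoreKey) 17999, 1, IPCProtection,
				IpcSemaphoreDefaultStartValue, 1, &status);
	IpcSemaphoreLock(id, 0, IpcSharedLock);
	IpcSemaphoreUnlock(id, 0, IpcSharedLock);
}
#endif /* 0 */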
/****************************************************************************/
/* IpcSemaphoreSet() - sets the initial value of the semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreSet_return;
void
IpcSemaphoreSet(int semId, int semno, int value)
{
int errStatus;
union semun semun;
semun.val = value;
errStatus = semctl(semId, semno, SETVAL, semun);
IpcSemaphoreSet_return = errStatus;
if (errStatus == -1)
perror("semctl");
}
/****************************************************************************/
/* IpcSemaphoreKill(key) - removes a semaphore */
/* */
/****************************************************************************/
void
IpcSemaphoreKill(IpcSemaphoreKey key)
{
int semId;
union semun semun;
/* kill semaphore if existent */
semId = semget(key, 0, 0);
if (semId != -1)
semctl(semId, 0, IPC_RMID, semun);
}
/****************************************************************************/
/* IpcSemaphoreLock(semId, sem, lock) - locks a semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreLock_return;
void
IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock)
{
extern int errno;
int errStatus;
struct sembuf sops;
sops.sem_op = lock;
sops.sem_flg = 0;
sops.sem_num = sem;
/* ----------------
* Note: if errStatus is -1 and errno == EINTR then it means we
* returned from the operation prematurely because we were
* sent a signal. So we try and lock the semaphore again.
* I am not certain this is correct, but the semantics aren't
	 * clear; in any case it fixes problems with parallel abort synchronization,
* namely that after processing an abort signal, the semaphore
* call returns with -1 (and errno == EINTR) before it should.
* -cim 3/28/90
* ----------------
*/
do {
errStatus = semop(semId, &sops, 1);
} while (errStatus == -1 && errno == EINTR);
IpcSemaphoreLock_return = errStatus;
if (errStatus == -1) {
perror("semop");
exitpg(255);
}
}
/****************************************************************************/
/* IpcSemaphoreUnlock(semId, sem, lock) - unlocks a semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreUnlock_return;
void
IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock)
{
extern int errno;
int errStatus;
struct sembuf sops;
sops.sem_op = -lock;
sops.sem_flg = 0;
sops.sem_num = sem;
/* ----------------
* Note: if errStatus is -1 and errno == EINTR then it means we
* returned from the operation prematurely because we were
	 * sent a signal.  So we try and unlock the semaphore again.
	 * I am not certain this is correct, but the semantics aren't
	 * clear; in any case it fixes problems with parallel abort synchronization,
* namely that after processing an abort signal, the semaphore
* call returns with -1 (and errno == EINTR) before it should.
* -cim 3/28/90
* ----------------
*/
do {
errStatus = semop(semId, &sops, 1);
} while (errStatus == -1 && errno == EINTR);
IpcSemaphoreUnlock_return = errStatus;
if (errStatus == -1) {
perror("semop");
exitpg(255);
}
}
int
IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem)
{
int semncnt;
union semun dummy; /* for Solaris */
semncnt = semctl(semId, sem, GETNCNT, dummy);
return semncnt;
}
int
IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem)
{
int semval;
union semun dummy; /* for Solaris */
semval = semctl(semId, sem, GETVAL, dummy);
return semval;
}
/****************************************************************************/
/* IpcMemoryCreate(memKey) */
/* */
/* - returns the memory identifier, if creation succeeds */
/* returns IpcMemCreationFailed, if failure */
/****************************************************************************/
IpcMemoryId
IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, int permission)
{
IpcMemoryId shmid;
if (memKey == PrivateIPCKey) {
/* private */
shmid = PrivateMemoryCreate(memKey, size);
}else {
shmid = shmget(memKey, size, IPC_CREAT|permission);
}
if (shmid < 0) {
fprintf(stderr,"IpcMemoryCreate: memKey=%d , size=%d , permission=%d",
memKey, size , permission );
perror("IpcMemoryCreate: shmget(..., create, ...) failed");
return(IpcMemCreationFailed);
}
/* if (memKey == PrivateIPCKey) */
on_exitpg(IPCPrivateMemoryKill, (caddr_t)shmid);
return(shmid);
}
/****************************************************************************/
/* IpcMemoryIdGet(memKey, size) returns the shared memory Id */
/* or IpcMemIdGetFailed */
/****************************************************************************/
IpcMemoryId
IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size)
{
IpcMemoryId shmid;
shmid = shmget(memKey, size, 0);
if (shmid < 0) {
fprintf(stderr,"IpcMemoryIdGet: memKey=%d , size=%d , permission=%d",
memKey, size , 0 );
perror("IpcMemoryIdGet: shmget() failed");
return(IpcMemIdGetFailed);
}
return(shmid);
}
/****************************************************************************/
/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
/* from a backend address space */
/* (only called by backends running under the postmaster) */
/****************************************************************************/
void
IpcMemoryDetach(int status, char *shmaddr)
{
if (shmdt(shmaddr) < 0) {
elog(NOTICE, "IpcMemoryDetach: shmdt(0x%x): %m", shmaddr);
}
}
/****************************************************************************/
/* IpcMemoryAttach(memId) returns the address of shared memory		*/
/* or IpcMemAttachFailed */
/* */
/* CALL IT: addr = (struct <MemoryStructure> *) IpcMemoryAttach(memId); */
/* */
/****************************************************************************/
char *
IpcMemoryAttach(IpcMemoryId memId)
{
char *memAddress;
if (UsePrivateMemory) {
memAddress = (char *) PrivateMemoryAttach(memId);
} else {
memAddress = (char *) shmat(memId, 0, 0);
}
/* if ( *memAddress == -1) { XXX ??? */
if ( memAddress == (char *)-1) {
perror("IpcMemoryAttach: shmat() failed");
return(IpcMemAttachFailed);
}
if (!UsePrivateMemory)
on_exitpg(IpcMemoryDetach, (caddr_t) memAddress);
return((char *) memAddress);
}
/****************************************************************************/
/* IpcMemoryKill(memKey) removes a shared memory segment */
/* (only called by the postmaster and standalone backends) */
/****************************************************************************/
void
IpcMemoryKill(IpcMemoryKey memKey)
{
IpcMemoryId shmid;
if (!UsePrivateMemory && (shmid = shmget(memKey, 0, 0)) >= 0) {
if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
elog(NOTICE, "IpcMemoryKill: shmctl(%d, %d, 0) failed: %m",
shmid, IPC_RMID);
}
}
}
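/*
 * Usage sketch, compiled out with #if 0 (the key and size are assumed
 * for illustration): create a segment, attach it, then remove it.
 */
#if 0
static void
shmem_example()
{
	IpcMemoryId id;
	char *addr;

	id = IpcMemoryCreate((IpcMemoryKey) 17998, 1024, IPCProtection);
	addr = IpcMemoryAttach(id);
	if (addr != IpcMemAttachFailed)
		addr[0] = 'x';
	IpcMemoryKill((IpcMemoryKey) 17998);
}
#endif /* 0 */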
#ifdef HAS_TEST_AND_SET
/* ------------------
* use hardware locks to replace semaphores for sequent machines
* to avoid costs of swapping processes and to provide unlimited
* supply of locks.
* ------------------
*/
static SLock *SLockArray = NULL;
static SLock **FreeSLockPP;
static int *UnusedSLockIP;
static slock_t *SLockMemoryLock;
static IpcMemoryId SLockMemoryId = -1;
struct ipcdummy { /* to get alignment/size right */
SLock *free;
int unused;
slock_t memlock;
SLock slocks[NSLOCKS];
};
static int SLockMemorySize = sizeof(struct ipcdummy);
void
CreateAndInitSLockMemory(IPCKey key)
{
int id;
SLock *slckP;
SLockMemoryId = IpcMemoryCreate(key,
SLockMemorySize,
0700);
AttachSLockMemory(key);
*FreeSLockPP = NULL;
*UnusedSLockIP = (int)FIRSTFREELOCKID;
for (id=0; id<(int)FIRSTFREELOCKID; id++) {
slckP = &(SLockArray[id]);
S_INIT_LOCK(&(slckP->locklock));
slckP->flag = NOLOCK;
slckP->nshlocks = 0;
S_INIT_LOCK(&(slckP->shlock));
S_INIT_LOCK(&(slckP->exlock));
S_INIT_LOCK(&(slckP->comlock));
slckP->next = NULL;
}
return;
}
void
AttachSLockMemory(IPCKey key)
{
struct ipcdummy *slockM;
if (SLockMemoryId == -1)
SLockMemoryId = IpcMemoryIdGet(key,SLockMemorySize);
if (SLockMemoryId == -1)
elog(FATAL, "SLockMemory not in shared memory");
slockM = (struct ipcdummy *) IpcMemoryAttach(SLockMemoryId);
if (slockM == IpcMemAttachFailed)
elog(FATAL, "AttachSLockMemory: could not attach segment");
FreeSLockPP = (SLock **) &(slockM->free);
UnusedSLockIP = (int *) &(slockM->unused);
SLockMemoryLock = (slock_t *) &(slockM->memlock);
S_INIT_LOCK(SLockMemoryLock);
SLockArray = (SLock *) &(slockM->slocks[0]);
return;
}
#ifdef LOCKDEBUG
#define PRINT_LOCK(LOCK) printf("(locklock = %d, flag = %d, nshlocks = %d, \
shlock = %d, exlock =%d)\n", LOCK->locklock, \
LOCK->flag, LOCK->nshlocks, LOCK->shlock, \
LOCK->exlock)
#endif
void
ExclusiveLock(int lockid)
{
SLock *slckP;
slckP = &(SLockArray[lockid]);
#ifdef LOCKDEBUG
printf("ExclusiveLock(%d)\n", lockid);
printf("IN: ");
PRINT_LOCK(slckP);
#endif
ex_try_again:
S_LOCK(&(slckP->locklock));
switch (slckP->flag) {
case NOLOCK:
slckP->flag = EXCLUSIVELOCK;
S_LOCK(&(slckP->exlock));
S_LOCK(&(slckP->shlock));
S_UNLOCK(&(slckP->locklock));
#ifdef LOCKDEBUG
printf("OUT: ");
PRINT_LOCK(slckP);
#endif
return;
case SHAREDLOCK:
case EXCLUSIVELOCK:
S_UNLOCK(&(slckP->locklock));
S_LOCK(&(slckP->exlock));
S_UNLOCK(&(slckP->exlock));
goto ex_try_again;
}
}
void
ExclusiveUnlock(int lockid)
{
SLock *slckP;
slckP = &(SLockArray[lockid]);
#ifdef LOCKDEBUG
printf("ExclusiveUnlock(%d)\n", lockid);
printf("IN: ");
PRINT_LOCK(slckP);
#endif
S_LOCK(&(slckP->locklock));
/* -------------
* give favor to read processes
* -------------
*/
slckP->flag = NOLOCK;
if (slckP->nshlocks > 0) {
while (slckP->nshlocks > 0) {
S_UNLOCK(&(slckP->shlock));
S_LOCK(&(slckP->comlock));
}
S_UNLOCK(&(slckP->shlock));
}
else {
S_UNLOCK(&(slckP->shlock));
}
S_UNLOCK(&(slckP->exlock));
S_UNLOCK(&(slckP->locklock));
#ifdef LOCKDEBUG
printf("OUT: ");
PRINT_LOCK(slckP);
#endif
return;
}
bool
LockIsFree(int lockid)
{
return(SLockArray[lockid].flag == NOLOCK);
}
#endif /* HAS_TEST_AND_SET */

149
src/backend/storage/ipc/ipci.c Normal file
View File

@@ -0,0 +1,149 @@
/*-------------------------------------------------------------------------
*
* ipci.c--
* POSTGRES inter-process communication initialization code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "storage/ipc.h"
#include "storage/multilev.h"
#include "utils/elog.h"
#include "storage/sinval.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/lock.h"
#include "miscadmin.h" /* for DebugLvl */
/*
* SystemPortAddressCreateMemoryKey --
* Returns a memory key given a port address.
*/
IPCKey
SystemPortAddressCreateIPCKey(SystemPortAddress address)
{
Assert(address < 32768); /* XXX */
return (SystemPortAddressGetIPCKey(address));
}
/*
* CreateSharedMemoryAndSemaphores --
* Creates and initializes shared memory and semaphores.
*/
/**************************************************
CreateSharedMemoryAndSemaphores
is called exactly *ONCE* by the postmaster.
It is *NEVER* called by the postgres backend
0) destroy any existing semaphores for both buffer
and lock managers.
1) create the appropriate *SHARED* memory segments
for the two resource managers.
**************************************************/
void
CreateSharedMemoryAndSemaphores(IPCKey key)
{
int size;
#ifdef HAS_TEST_AND_SET
/* ---------------
* create shared memory for slocks
* --------------
*/
CreateAndInitSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
#endif
/* ----------------
* kill and create the buffer manager buffer pool (and semaphore)
* ----------------
*/
CreateSpinlocks(IPCKeyGetSpinLockSemaphoreKey(key));
size = BufferShmemSize() + LockShmemSize();
#ifdef MAIN_MEMORY
size += MMShmemSize();
#endif /* MAIN_MEMORY */
if (DebugLvl > 1) {
fprintf(stderr, "binding ShmemCreate(key=%x, size=%d)\n",
IPCKeyGetBufferMemoryKey(key), size);
}
ShmemCreate(IPCKeyGetBufferMemoryKey(key), size);
ShmemBindingTabReset();
InitShmem(key, size);
InitBufferPool(key);
/* ----------------
* do the lock table stuff
* ----------------
*/
	InitLocks();
	if (InitMultiLevelLockm() == INVALID_TABLEID)
		elog(FATAL, "Couldn't create the lock table");
/* ----------------
* do process table stuff
* ----------------
*/
InitProcGlobal(key);
on_exitpg(ProcFreeAllSemaphores, 0);
CreateSharedInvalidationState(key);
}
/*
* AttachSharedMemoryAndSemaphores --
 *	Attaches to existing shared memory and semaphores.
*/
void
AttachSharedMemoryAndSemaphores(IPCKey key)
{
int size;
/* ----------------
* create rather than attach if using private key
* ----------------
*/
if (key == PrivateIPCKey) {
CreateSharedMemoryAndSemaphores(key);
return;
}
#ifdef HAS_TEST_AND_SET
/* ----------------
* attach the slock shared memory
* ----------------
*/
AttachSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
#endif
/* ----------------
* attach the buffer manager buffer pool (and semaphore)
* ----------------
*/
size = BufferShmemSize() + LockShmemSize();
InitShmem(key, size);
InitBufferPool(key);
/* ----------------
* initialize lock table stuff
* ----------------
*/
InitLocks();
if (InitMultiLevelLockm() == INVALID_TABLEID)
elog(FATAL, "Couldn't attach to the lock table");
AttachSharedInvalidationState(key);
}

440
src/backend/storage/ipc/s_lock.c Normal file
View File

@@ -0,0 +1,440 @@
/*-------------------------------------------------------------------------
*
* s_lock.c--
* This file contains the implementation (if any) for spinlocks.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/s_lock.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* DESCRIPTION
* The following code fragment should be written (in assembly
* language) on machines that have a native test-and-set instruction:
*
* void
* S_LOCK(char_address)
* char *char_address;
* {
* while (test_and_set(char_address))
* ;
* }
*
* If this is not done, POSTGRES will default to using System V
* semaphores (and take a large performance hit -- around 40% of
* its time on a DS5000/240 is spent in semop(3)...).
*
* NOTES
* AIX has a test-and-set but the recommended interface is the cs(3)
* system call. This provides an 8-instruction (plus system call
* overhead) uninterruptible compare-and-set operation. True
* spinlocks might be faster but using cs(3) still speeds up the
* regression test suite by about 25%. I don't have an assembler
* manual for POWER in any case.
*
*/
#ifdef WIN32
#include <windows.h>
#endif /* WIN32 */
#include "storage/ipc.h"
#if defined(HAS_TEST_AND_SET)
#if defined (PORTNAME_next)
/*
* NEXTSTEP (mach)
* slock_t is defined as a struct mutex.
*/
void
S_LOCK(slock_t *lock)
{
mutex_lock(lock);
}
void
S_UNLOCK(slock_t *lock)
{
mutex_unlock(lock);
}
void
S_INIT_LOCK(slock_t *lock)
{
mutex_init(lock);
}
/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
int
S_LOCK_FREE(slock_t *lock)
{
/* For Mach, we have to delve inside the entrails of `struct
mutex'. Ick! */
return (lock->lock == 0);
}
#endif /* PORTNAME_next */
#if defined(PORTNAME_irix5)
/*
* SGI IRIX 5
* slock_t is defined as a struct abilock_t, which has a single unsigned long
* member.
*
* This stuff may be supplemented in the future with Masato Kataoka's MIPS-II
* assembly from his NECEWS SVR4 port, but we probably ought to retain this
* for the R3000 chips out there.
*/
void
S_LOCK(slock_t *lock)
{
/* spin_lock(lock); */
while (!acquire_lock(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
(void)release_lock(lock);
}
void
S_INIT_LOCK(slock_t *lock)
{
(void)init_lock(lock);
}
/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
int
S_LOCK_FREE(slock_t *lock)
{
return(stat_lock(lock)==UNLOCKED);
}
#endif /* PORTNAME_irix5 */
/*
* OSF/1 (Alpha AXP)
*
* Note that slock_t on the Alpha AXP is msemaphore instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_alpha)
void
S_LOCK(slock_t *lock)
{
while (msem_lock(lock, MSEM_IF_NOWAIT) < 0)
;
}
void
S_UNLOCK(slock_t *lock)
{
(void) msem_unlock(lock, 0);
}
void
S_INIT_LOCK(slock_t *lock)
{
(void) msem_init(lock, MSEM_UNLOCKED);
}
int
S_LOCK_FREE(slock_t *lock)
{
return(lock->msem_state ? 0 : 1);
}
#endif /* PORTNAME_alpha */
/*
* Solaris 2
*/
#if defined(PORTNAME_sparc_solaris)
/* defined in port/.../tas.s */
extern int tas(slock_t *lock);
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_sparc_solaris */
/*
* AIX (POWER)
*
* Note that slock_t on POWER/POWER2/PowerPC is int instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_aix)
void
S_LOCK(slock_t *lock)
{
while (cs((int *) lock, 0, 1))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_aix */
/*
* HP-UX (PA-RISC)
*
* Note that slock_t on PA-RISC is a structure instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_hpux)
/* defined in port/.../tas.s */
extern int tas(slock_t *lock);
/*
* a "set" slock_t has a single word cleared. a "clear" slock_t has
* all words set to non-zero.
*/
static slock_t clear_lock = { -1, -1, -1, -1 };
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = clear_lock; /* struct assignment */
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
int
S_LOCK_FREE(slock_t *lock)
{
register int *lock_word = (int *) (((long) lock + 15) & ~15);
return(*lock_word != 0);
}
#endif /* PORTNAME_hpux */
/*
* sun3
*/
#if (defined(sun) && ! defined(sparc))
void
S_LOCK(slock_t *lock)
{
while (tas(lock));
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
static int
tas_dummy()
{
asm("LLA0:");
asm(" .data");
asm(" .text");
asm("|#PROC# 04");
asm(" .globl _tas");
asm("_tas:");
asm("|#PROLOGUE# 1");
asm(" movel sp@(0x4),a0");
asm(" tas a0@");
asm(" beq LLA1");
asm(" moveq #-128,d0");
asm(" rts");
asm("LLA1:");
asm(" moveq #0,d0");
asm(" rts");
asm(" .data");
}
#endif
/*
* SPARC (SunOS 4)
*/
#if defined(PORTNAME_sparc)
/* if we're using -ansi w/ gcc, use __asm__ instead of asm */
#if defined(__STRICT_ANSI__)
#define asm(x) __asm__(x)
#endif
static int
tas_dummy()
{
asm(".seg \"data\"");
asm(".seg \"text\"");
asm(".global _tas");
asm("_tas:");
/*
* Sparc atomic test and set (sparc calls it "atomic load-store")
*/
asm("ldstub [%r8], %r8");
/*
* Did test and set actually do the set?
*/
asm("tst %r8");
asm("be,a ReturnZero");
/*
* otherwise, just return.
*/
asm("clr %r8");
asm("mov 0x1, %r8");
asm("ReturnZero:");
asm("retl");
asm("nop");
}
void
S_LOCK(unsigned char *addr)
{
while (tas(addr));
}
/*
* addr should be as in the above S_LOCK routine
*/
void
S_UNLOCK(unsigned char *addr)
{
*addr = 0;
}
void
S_INIT_LOCK(unsigned char *addr)
{
*addr = 0;
}
#endif /* PORTNAME_sparc */
/*
* Linux and friends
*/
#if defined(PORTNAME_linux) || defined(PORTNAME_BSD44_derived)
int
tas(slock_t *m)
{
slock_t res;
__asm__("xchgb %0,%1":"=q" (res),"=m" (*m):"0" (0x1));
return(res);
}
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_linux || PORTNAME_BSD44_derived */
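/*
 * Usage sketch, compiled out with #if 0: whatever the port-specific
 * implementation, callers see the same three-call protocol --
 * initialize once, then bracket very short critical sections.
 */
#if 0
static slock_t example_lock;

static void
slock_example()
{
	S_INIT_LOCK(&example_lock);
	S_LOCK(&example_lock);
	/* ... a few instructions of critical section ... */
	S_UNLOCK(&example_lock);
}
#endif /* 0 */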
#endif /* HAS_TEST_AND_SET */
#ifdef WIN32
void
S_LOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
void
S_UNLOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
void
S_INIT_LOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
#endif /*WIN32*/

561
src/backend/storage/ipc/shmem.c Normal file
View File

@@ -0,0 +1,561 @@
/*-------------------------------------------------------------------------
*
* shmem.c--
* create shared memory and initialize shared memory data structures.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* POSTGRES processes share one or more regions of shared memory.
* The shared memory is created by a postmaster and is "attached to"
* by each of the backends. The routines in this file are used for
* allocating and binding to shared memory data structures.
*
* NOTES:
* (a) There are three kinds of shared memory data structures
* available to POSTGRES: fixed-size structures, queues and hash
* tables. Fixed-size structures contain things like global variables
* for a module and should never be allocated after the process
* initialization phase. Hash tables have a fixed maximum size, but
* their actual size can vary dynamically. When entries are added
* to the table, more space is allocated. Queues link data structures
* that have been allocated either as fixed size structures or as hash
* buckets. Each shared data structure has a string name to identify
* it (assigned in the module that declares it).
*
* (b) During initialization, each module looks for its
* shared data structures in a hash table called the "Binding Table".
* If the data structure is not present, the caller can allocate
* a new one and initialize it. If the data structure is present,
* the caller "attaches" to the structure by initializing a pointer
* in the local address space.
* The binding table has two purposes: first, it gives us
* a simple model of how the world looks when a backend process
* initializes. If something is present in the binding table,
* it is initialized. If it is not, it is uninitialized. Second,
* the binding table allows us to allocate shared memory on demand
* instead of trying to preallocate structures and hard-wire the
* sizes and locations in header files. If you are using a lot
* of shared memory in a lot of different places (and changing
* things during development), this is important.
*
* (c) memory allocation model: shared memory can never be
* freed, once allocated. Each hash table has its own free list,
* so hash buckets can be reused when an item is deleted. However,
* if one hash table grows very large and then shrinks, its space
* cannot be redistributed to other tables. We could build a simple
* hash bucket garbage collector if need be. Right now, it seems
* unnecessary.
*
* See InitSem() in sem.c for an example of how to use the
* binding table.
*
*/
#include <stdio.h>
#include <string.h>
#include "postgres.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
/* shared memory global variables */
unsigned long ShmemBase = 0;		/* start address of shared memory */
static unsigned long ShmemEnd = 0;
static unsigned long ShmemSize = 0; /* current size (and default) */
SPINLOCK ShmemLock; /* lock for shared memory allocation */
SPINLOCK BindingLock; /* lock for binding table access */
static unsigned long *ShmemFreeStart = NULL; /* pointer to the OFFSET of
* first free shared memory
*/
static unsigned long *ShmemBindingTabOffset = NULL; /* start of the binding
* table (for bootstrap)
*/
static int ShmemBootstrap = FALSE; /* flag becomes true when shared mem
* is created by POSTMASTER
*/
static HTAB *BindingTable = NULL;
/* ---------------------
* ShmemBindingTabReset() - Resets the binding table to NULL....
* useful when the postmaster destroys existing shared memory
* and creates all new segments after a backend crash.
* ----------------------
*/
void
ShmemBindingTabReset()
{
BindingTable = (HTAB *)NULL;
}
/*
* CreateSharedRegion() --
*
* This routine is called once by the postmaster to
* initialize the shared buffer pool. Assume there is
* only one postmaster so no synchronization is necessary
* until after this routine completes successfully.
*
* key is a unique identifier for the shmem region.
* size is the size of the region.
*/
static IpcMemoryId ShmemId;
void
ShmemCreate(unsigned int key, unsigned int size)
{
if (size)
ShmemSize = size;
/* create shared mem region */
if ((ShmemId=IpcMemoryCreate(key,ShmemSize,IPCProtection))
==IpcMemCreationFailed) {
elog(FATAL,"ShmemCreate: cannot create region");
exit(1);
}
/* ShmemBootstrap is true if shared memory has been
* created, but not yet initialized. Only the
* postmaster/creator-of-all-things should have
* this flag set.
*/
ShmemBootstrap = TRUE;
}
/*
* InitShmem() -- map region into process address space
* and initialize shared data structures.
*
*/
int
InitShmem(unsigned int key, unsigned int size)
{
Pointer sharedRegion;
unsigned long currFreeSpace;
HASHCTL info;
int hash_flags;
BindingEnt * result,item;
bool found;
IpcMemoryId shmid;
/* if zero key, use default memory size */
if (size)
ShmemSize = size;
/* default key is 0 */
/* attach to shared memory region (SysV or BSD OS specific) */
if (ShmemBootstrap && key == PrivateIPCKey)
/* if we are running backend alone */
shmid = ShmemId;
else
shmid = IpcMemoryIdGet(IPCKeyGetBufferMemoryKey(key), ShmemSize);
sharedRegion = IpcMemoryAttach(shmid);
if (sharedRegion == NULL) {
elog(FATAL,"AttachSharedRegion: couldn't attach to shmem\n");
return(FALSE);
}
/* get pointers to the dimensions of shared memory */
ShmemBase = (unsigned long) sharedRegion;
ShmemEnd = (unsigned long) sharedRegion + ShmemSize;
currFreeSpace = 0;
/* First long in shared memory is the count of available space */
ShmemFreeStart = (unsigned long *) ShmemBase;
/* next is a shmem pointer to the binding table */
ShmemBindingTabOffset = ShmemFreeStart + 1;
currFreeSpace +=
sizeof(ShmemFreeStart) + sizeof(ShmemBindingTabOffset);
/* bootstrap initialize spin locks so we can start to use the
* allocator and binding table.
*/
if (! InitSpinLocks(ShmemBootstrap, IPCKeyGetSpinLockSemaphoreKey(key))) {
return(FALSE);
}
/* We have just allocated additional space for two spinlocks.
* Now setup the global free space count
*/
if (ShmemBootstrap) {
*ShmemFreeStart = currFreeSpace;
}
	/* if the free space count is still zero, the allocator won't work */
Assert(*ShmemFreeStart);
/* create OR attach to the shared memory binding table */
info.keysize = BTABLE_KEYSIZE;
info.datasize = BTABLE_DATASIZE;
hash_flags = (HASH_ELEM);
/* This will acquire the binding table lock, but not release it. */
BindingTable = ShmemInitHash("BindingTable",
BTABLE_SIZE,BTABLE_SIZE,
&info,hash_flags);
if (! BindingTable) {
elog(FATAL,"InitShmem: couldn't initialize Binding Table");
return(FALSE);
}
/* Now, check the binding table for an entry to the binding
* table. If there is an entry there, someone else created
* the table. Otherwise, we did and we have to initialize it.
*/
memset(item.key, 0, BTABLE_KEYSIZE);
strncpy(item.key,"BindingTable",BTABLE_KEYSIZE);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item,HASH_ENTER, &found);
if (! result ) {
elog(FATAL,"InitShmem: corrupted binding table");
return(FALSE);
}
if (! found) {
/* bootstrapping shmem: we have to initialize the
* binding table now.
*/
Assert(ShmemBootstrap);
result->location = MAKE_OFFSET(BindingTable->hctl);
*ShmemBindingTabOffset = result->location;
result->size = BTABLE_SIZE;
ShmemBootstrap = FALSE;
} else {
Assert(! ShmemBootstrap);
}
	/* now release the lock acquired in ShmemInitHash */
SpinRelease (BindingLock);
Assert (result->location == MAKE_OFFSET(BindingTable->hctl));
return(TRUE);
}
/*
* ShmemAlloc -- allocate word-aligned byte string from
* shared memory
*
* Assumes ShmemLock and ShmemFreeStart are initialized.
* Returns: real pointer to memory or NULL if we are out
* of space. Has to return a real pointer in order
 *	to be compatible with malloc().
*/
long *
ShmemAlloc(unsigned long size)
{
unsigned long tmpFree;
long *newSpace;
/*
* ensure space is word aligned.
*
* Word-alignment is not good enough. We have to be more
* conservative: doubles need 8-byte alignment. (We probably only need
* this on RISC platforms but this is not a big waste of space.)
* - ay 12/94
*/
if (size % sizeof(double))
size += sizeof(double) - (size % sizeof(double));
Assert(*ShmemFreeStart);
SpinAcquire(ShmemLock);
tmpFree = *ShmemFreeStart + size;
if (tmpFree <= ShmemSize) {
newSpace = (long *)MAKE_PTR(*ShmemFreeStart);
*ShmemFreeStart += size;
} else {
newSpace = NULL;
}
SpinRelease(ShmemLock);
if (! newSpace) {
elog(NOTICE,"ShmemAlloc: out of memory ");
}
return(newSpace);
}
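/*
 * Usage sketch (the struct name is hypothetical): callers request raw
 * bytes and cast the result.
 *
 *	MyShared *ptr = (MyShared *) ShmemAlloc(sizeof(MyShared));
 *	if (ptr == NULL)
 *		elog(NOTICE, "could not allocate MyShared");
 *
 * The requested size is first rounded up to a multiple of
 * sizeof(double), so successive allocations stay 8-byte aligned.
 */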
/*
* ShmemIsValid -- test if an offset refers to valid shared memory
*
* Returns TRUE if the pointer is valid.
*/
int
ShmemIsValid(unsigned long addr)
{
return ((addr<ShmemEnd) && (addr>=ShmemBase));
}
/*
* ShmemInitHash -- Create/Attach to and initialize
* shared memory hash table.
*
* Notes:
*
* assume caller is doing some kind of synchronization
 * so that two people don't try to create/initialize the
* table at once. Use SpinAlloc() to create a spinlock
* for the structure before creating the structure itself.
*/
HTAB *
ShmemInitHash(char *name, /* table string name for binding */
long init_size, /* initial size */
long max_size, /* max size of the table */
HASHCTL *infoP, /* info about key and bucket size */
int hash_flags) /* info about infoP */
{
bool found;
long * location;
/* shared memory hash tables have a fixed max size so that the
* control structures don't try to grow. The segbase is for
* calculating pointer values. The shared memory allocator
* must be specified.
*/
infoP->segbase = (long *) ShmemBase;
infoP->alloc = ShmemAlloc;
infoP->max_size = max_size;
hash_flags |= HASH_SHARED_MEM;
/* look it up in the binding table */
location =
ShmemInitStruct(name,my_log2(max_size) + sizeof(HHDR),&found);
/* binding table is corrupted. Let someone else give the
* error message since they have more information
*/
if (location == NULL) {
return(0);
}
/* it already exists, attach to it rather than allocate and
* initialize new space
*/
if (found) {
hash_flags |= HASH_ATTACH;
}
/* these structures were allocated or bound in ShmemInitStruct */
/* control information and parameters */
infoP->hctl = (long *) location;
/* directory for hash lookup */
infoP->dir = (long *) (location + sizeof(HHDR));
  return(hash_create(init_size, infoP, hash_flags));
}
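/*
 * Usage sketch, modeled on the BindingTable setup in InitShmem above
 * (the table name and the size constants are hypothetical):
 *
 *	HASHCTL info;
 *	HTAB *table;
 *
 *	info.keysize = MY_KEYSIZE;
 *	info.datasize = MY_DATASIZE;
 *	table = ShmemInitHash("MyTable", MY_SIZE, MY_SIZE,
 *			      &info, HASH_ELEM);
 *	if (! table)
 *		elog(FATAL, "could not initialize MyTable");
 */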
/*
* ShmemPIDLookup -- lookup process data structure using process id
*
* Returns: TRUE if no error. locationPtr is initialized if PID is
* found in the binding table.
*
* NOTES:
 *	the only indication of whether the PID was already registered
 *	is the value left in locationPtr.
*/
bool
ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr)
{
BindingEnt * result,item;
bool found;
Assert (BindingTable);
memset(item.key, 0, BTABLE_KEYSIZE);
sprintf(item.key,"PID %d",pid);
SpinAcquire(BindingLock);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item, HASH_ENTER, &found);
if (! result) {
SpinRelease(BindingLock);
elog(WARN,"ShmemInitPID: BindingTable corrupted");
return(FALSE);
}
if (found) {
*locationPtr = result->location;
} else {
result->location = *locationPtr;
}
SpinRelease(BindingLock);
return (TRUE);
}
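/*
 * Usage sketch (hypothetical names; myStruct is assumed to live in
 * shared memory).  On entry *locationPtr holds the offset to register;
 * if the PID is already present, on exit it holds the offset that was
 * registered earlier.
 *
 *	SHMEM_OFFSET location = MAKE_OFFSET(myStruct);
 *	(void) ShmemPIDLookup(myPid, &location);
 *
 * and later, from another process:
 *
 *	SHMEM_OFFSET found = INVALID_OFFSET;
 *	if (ShmemPIDLookup(otherPid, &found) && found != INVALID_OFFSET)
 *		otherStruct = (MyStruct *) MAKE_PTR(found);
 */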
/*
* ShmemPIDDestroy -- destroy binding table entry for process
* using process id
*
* Returns: offset of the process struct in shared memory or
* INVALID_OFFSET if not found.
*
* Side Effect: removes the entry from the binding table
*/
SHMEM_OFFSET
ShmemPIDDestroy(int pid)
{
BindingEnt * result,item;
bool found;
SHMEM_OFFSET location;
Assert(BindingTable);
memset(item.key, 0, BTABLE_KEYSIZE);
sprintf(item.key,"PID %d",pid);
SpinAcquire(BindingLock);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item, HASH_REMOVE, &found);
if (found)
location = result->location;
SpinRelease(BindingLock);
if (! result) {
elog(WARN,"ShmemPIDDestroy: PID table corrupted");
return(INVALID_OFFSET);
}
if (found)
return (location);
else {
return(INVALID_OFFSET);
}
}
/*
* ShmemInitStruct -- Create/attach to a structure in shared
* memory.
*
* This is called during initialization to find or allocate
* a data structure in shared memory. If no other processes
* have created the structure, this routine allocates space
* for it. If it exists already, a pointer to the existing
* table is returned.
*
* Returns: real pointer to the object. FoundPtr is TRUE if
* the object is already in the binding table (hence, already
* initialized).
*/
long *
ShmemInitStruct(char *name, unsigned long size, bool *foundPtr)
{
BindingEnt * result,item;
long * structPtr;
strncpy(item.key,name,BTABLE_KEYSIZE);
item.location = BAD_LOCATION;
SpinAcquire(BindingLock);
if (! BindingTable) {
    /* Assert() is a macro now; it substitutes its argument inside
     * quotes, so keep the string in a variable. */
char *strname = "BindingTable";
    /* If the binding table doesn't exist, we fake it.
     *
     * If we are creating the first binding table, then let
     * ShmemAlloc() allocate the space for a new HTAB. Otherwise,
* find the old one and return that. Notice that the
* BindingLock is held until the binding table has been completely
* initialized.
*/
Assert (! strcmp(name,strname)) ;
if (ShmemBootstrap) {
/* in POSTMASTER/Single process */
*foundPtr = FALSE;
return((long *)ShmemAlloc(size));
} else {
Assert (ShmemBindingTabOffset);
*foundPtr = TRUE;
return((long *)MAKE_PTR(*ShmemBindingTabOffset));
}
} else {
    /* look it up in the binding table */
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item,HASH_ENTER, foundPtr);
}
if (! result) {
SpinRelease(BindingLock);
elog(WARN,"ShmemInitStruct: Binding Table corrupted");
return(NULL);
} else if (*foundPtr) {
/*
* Structure is in the binding table so someone else has allocated
* it already. The size better be the same as the size we are
* trying to initialize to or there is a name conflict (or worse).
*/
if (result->size != size) {
SpinRelease(BindingLock);
elog(NOTICE,"ShmemInitStruct: BindingTable entry size is wrong");
/* let caller print its message too */
return(NULL);
}
structPtr = (long *)MAKE_PTR(result->location);
} else {
    /* It isn't in the table yet; allocate and initialize it */
structPtr = ShmemAlloc((long)size);
if (! structPtr) {
/* out of memory */
Assert (BindingTable);
(void) hash_search(BindingTable,(char *) &item,HASH_REMOVE, foundPtr);
SpinRelease(BindingLock);
*foundPtr = FALSE;
elog(NOTICE,"ShmemInitStruct: cannot allocate '%s'",
name);
return(NULL);
}
result->size = size;
result->location = MAKE_OFFSET(structPtr);
}
Assert (ShmemIsValid((unsigned long)structPtr));
SpinRelease(BindingLock);
return(structPtr);
}
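/*
 * Usage sketch of the create-or-attach pattern (hypothetical names).
 * Only the first process to arrive sees found == FALSE and must
 * initialize the contents:
 *
 *	bool found;
 *	MyShared *shared;
 *
 *	shared = (MyShared *) ShmemInitStruct("My Shared State",
 *					      sizeof(MyShared), &found);
 *	if (! shared)
 *		elog(FATAL, "could not find or allocate My Shared State");
 *	if (! found)
 *		shared->counter = 0;
 */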

View File

@@ -0,0 +1,251 @@
/*-------------------------------------------------------------------------
*
* shmqueue.c--
* shared memory linked lists
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmqueue.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
* NOTES
*
* Package for managing doubly-linked lists in shared memory.
* The only tricky thing is that SHM_QUEUE will usually be a field
* in a larger record. SHMQueueGetFirst has to return a pointer
* to the record itself instead of a pointer to the SHMQueue field
* of the record. It takes an extra pointer and does some extra
* pointer arithmetic to do this correctly.
*
* NOTE: These are set up so they can be turned into macros some day.
*
*-------------------------------------------------------------------------
*/
#include <stdio.h> /* for sprintf() */
#include "postgres.h"
#include "storage/shmem.h" /* where the declarations go */
#include "utils/elog.h"
/*#define SHMQUEUE_DEBUG*/
#ifdef SHMQUEUE_DEBUG
#define SHMQUEUE_DEBUG_DEL /* deletions */
#define SHMQUEUE_DEBUG_HD /* head inserts */
#define SHMQUEUE_DEBUG_TL /* tail inserts */
#define SHMQUEUE_DEBUG_ELOG NOTICE
#endif /* SHMQUEUE_DEBUG */
/*
 * SHMQueueInit -- make the head of a new queue point
* to itself
*/
void
SHMQueueInit(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
(queue)->prev = (queue)->next = MAKE_OFFSET(queue);
}
/*
* SHMQueueIsDetached -- TRUE if element is not currently
* in a queue.
*/
bool
SHMQueueIsDetached(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
return ((queue)->prev == INVALID_OFFSET);
}
/*
* SHMQueueElemInit -- clear an element's links
*/
void
SHMQueueElemInit(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
(queue)->prev = (queue)->next = INVALID_OFFSET;
}
/*
* SHMQueueDelete -- remove an element from the queue and
* close the links
*/
void
SHMQueueDelete(SHM_QUEUE *queue)
{
SHM_QUEUE *nextElem = (SHM_QUEUE *) MAKE_PTR((queue)->next);
SHM_QUEUE *prevElem = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(nextElem));
Assert(SHM_PTR_VALID(prevElem));
#ifdef SHMQUEUE_DEBUG_DEL
dumpQ(queue, "in SHMQueueDelete: begin");
#endif /* SHMQUEUE_DEBUG_DEL */
prevElem->next = (queue)->next;
nextElem->prev = (queue)->prev;
#ifdef SHMQUEUE_DEBUG_DEL
dumpQ((SHM_QUEUE *)MAKE_PTR(queue->prev), "in SHMQueueDelete: end");
#endif /* SHMQUEUE_DEBUG_DEL */
}
#ifdef SHMQUEUE_DEBUG
void
dumpQ(SHM_QUEUE *q, char *s)
{
char elem[16];
char buf[1024];
SHM_QUEUE *start = q;
int count = 0;
sprintf(buf, "q prevs: %x", MAKE_OFFSET(q));
q = (SHM_QUEUE *)MAKE_PTR(q->prev);
while (q != start)
{
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
q = (SHM_QUEUE *)MAKE_PTR(q->prev);
if (q->prev == MAKE_OFFSET(q))
break;
if (count++ > 40)
{
strcat(buf, "BAD PREV QUEUE!!");
break;
}
}
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
sprintf(buf, "q nexts: %x", MAKE_OFFSET(q));
count = 0;
q = (SHM_QUEUE *)MAKE_PTR(q->next);
while (q != start)
{
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
q = (SHM_QUEUE *)MAKE_PTR(q->next);
if (q->next == MAKE_OFFSET(q))
break;
if (count++ > 10)
{
strcat(buf, "BAD NEXT QUEUE!!");
break;
}
}
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
}
#endif /* SHMQUEUE_DEBUG */
/*
* SHMQueueInsertHD -- put elem in queue between the queue head
* and its "prev" element.
*/
void
SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem)
{
SHM_QUEUE *prevPtr = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(elem));
#ifdef SHMQUEUE_DEBUG_HD
dumpQ(queue, "in SHMQueueInsertHD: begin");
#endif /* SHMQUEUE_DEBUG_HD */
(elem)->next = prevPtr->next;
(elem)->prev = queue->prev;
(queue)->prev = elemOffset;
prevPtr->next = elemOffset;
#ifdef SHMQUEUE_DEBUG_HD
dumpQ(queue, "in SHMQueueInsertHD: end");
#endif /* SHMQUEUE_DEBUG_HD */
}
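/*
 * SHMQueueInsertTL -- put elem in queue between the queue head
 *	and its "next" element.
 */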
void
SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem)
{
SHM_QUEUE *nextPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(elem));
#ifdef SHMQUEUE_DEBUG_TL
dumpQ(queue, "in SHMQueueInsertTL: begin");
#endif /* SHMQUEUE_DEBUG_TL */
(elem)->prev = nextPtr->prev;
(elem)->next = queue->next;
(queue)->next = elemOffset;
nextPtr->prev = elemOffset;
#ifdef SHMQUEUE_DEBUG_TL
dumpQ(queue, "in SHMQueueInsertTL: end");
#endif /* SHMQUEUE_DEBUG_TL */
}
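/*
 * Usage sketch (hypothetical element type; the head and the elements
 * are all assumed to live in shared memory):
 *
 *	typedef struct {
 *		int		stuff;
 *		SHM_QUEUE	elem;
 *	} ELEMType;
 *
 *	SHM_QUEUE head;
 *	ELEMType *e1;
 *
 *	SHMQueueInit(&head);
 *	SHMQueueElemInit(&e1->elem);
 *	SHMQueueInsertTL(&head, &e1->elem);
 *	...
 *	SHMQueueDelete(&e1->elem);
 */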
/*
* SHMQueueFirst -- Get the first element from a queue
*
* First element is queue->next. If SHMQueue is part of
* a larger structure, we want to return a pointer to the
* whole structure rather than a pointer to its SHMQueue field.
* I.E. struct {
* int stuff;
* SHMQueue elem;
* } ELEMType;
 *	when this element is in a queue, (queue->next) points to struct.elem.
* nextQueue allows us to calculate the offset of the SHMQueue
* field in the structure.
*
 * A call to SHMQueueFirst should take these parameters:
*
* &(queueHead),&firstElem,&(firstElem->next)
*
 * Note that firstElem may well be uninitialized.  If firstElem
* is initially K, &(firstElem->next) will be K+ the offset to
* next.
*/
void
SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, SHM_QUEUE *nextQueue)
{
SHM_QUEUE *elemPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
Assert(SHM_PTR_VALID(queue));
*nextPtrPtr = (Pointer) (((unsigned long) *nextPtrPtr) +
((unsigned long) elemPtr) - ((unsigned long) nextQueue));
  /*
    nextPtrPtr is a ptr to a structure linked in the queue;
    nextQueue is the SHMQueue field of that structure.
    *nextPtrPtr - nextQueue is 0 minus the offset of the queue
      field in the record.
    elemPtr + (*nextPtrPtr - nextQueue) is the start of the
      structure containing elemPtr.
   */
}
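/*
 * Usage sketch, following the calling convention described above
 * (ELEMType as sketched after SHMQueueInsertTL):
 *
 *	ELEMType *first;
 *
 *	SHMQueueFirst(&head, (Pointer *) &first, &first->elem);
 *
 * first need not point anywhere valid before the call; only the
 * constant difference between first and &first->elem enters the
 * arithmetic, so the result is a pointer to the enclosing ELEMType.
 */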
/*
* SHMQueueEmpty -- TRUE if queue head is only element, FALSE otherwise
*/
bool
SHMQueueEmpty(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
if (queue->prev == MAKE_OFFSET(queue))
{
	Assert(queue->next == MAKE_OFFSET(queue));
return(TRUE);
}
return(FALSE);
}

View File

@@ -0,0 +1,169 @@
/*-------------------------------------------------------------------------
*
* sinval.c--
* POSTGRES shared cache invalidation communication code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/* #define INVALIDDEBUG 1 */
#include "postgres.h"
#include "storage/sinval.h"
#include "storage/sinvaladt.h"
#include "storage/spin.h"
#include "utils/elog.h"
extern SISeg *shmInvalBuffer;	/* the shared buffer segment,
				 * set by SISegmentAttach() */
extern BackendId MyBackendId;
extern BackendTag MyBackendTag;
SPINLOCK SInvalLock = (SPINLOCK) NULL;
/****************************************************************************/
/* CreateSharedInvalidationState(key) Create a buffer segment */
/* */
/* should be called only by the POSTMASTER */
/****************************************************************************/
void
CreateSharedInvalidationState(IPCKey key)
{
int status;
/* REMOVED
SISyncKill(IPCKeyGetSIBufferMemorySemaphoreKey(key));
SISyncInit(IPCKeyGetSIBufferMemorySemaphoreKey(key));
*/
/* SInvalLock gets set in spin.c, during spinlock init */
status = SISegmentInit(true, IPCKeyGetSIBufferMemoryBlock(key));
if (status == -1) {
elog(FATAL, "CreateSharedInvalidationState: failed segment init");
}
}
/****************************************************************************/
/* AttachSharedInvalidationState(key) Attach a buffer segment */
/* */
/* should be called only by the POSTMASTER */
/****************************************************************************/
void
AttachSharedInvalidationState(IPCKey key)
{
int status;
if (key == PrivateIPCKey) {
CreateSharedInvalidationState(key);
return;
}
/* SInvalLock gets set in spin.c, during spinlock init */
status = SISegmentInit(false, IPCKeyGetSIBufferMemoryBlock(key));
if (status == -1) {
elog(FATAL, "AttachSharedInvalidationState: failed segment init");
}
}
void
InitSharedInvalidationState()
{
SpinAcquire(SInvalLock);
if (!SIBackendInit(shmInvalBuffer))
{
SpinRelease(SInvalLock);
elog(FATAL, "Backend cache invalidation initialization failed");
}
SpinRelease(SInvalLock);
}
/*
* RegisterSharedInvalid --
 *	Adds an invalidation message to the shared buffer.
*
* Note:
* Assumes hash index is valid.
* Assumes item pointer is valid.
*/
/****************************************************************************/
/* RegisterSharedInvalid(cacheId, hashIndex, pointer) */
/* */
/* register a message in the buffer */
/* should be called by a backend */
/****************************************************************************/
void
RegisterSharedInvalid(int cacheId, /* XXX */
Index hashIndex,
ItemPointer pointer)
{
SharedInvalidData newInvalid;
/*
* This code has been hacked to accept two types of messages. This might
* be treated more generally in the future.
*
* (1)
* cacheId= system cache id
* hashIndex= system cache hash index for a (possibly) cached tuple
* pointer= pointer of (possibly) cached tuple
*
* (2)
* cacheId= special non-syscache id
* hashIndex= object id contained in (possibly) cached relation descriptor
* pointer= null
*/
newInvalid.cacheId = cacheId;
newInvalid.hashIndex = hashIndex;
if (ItemPointerIsValid(pointer)) {
ItemPointerCopy(pointer, &newInvalid.pointerData);
} else {
ItemPointerSetInvalid(&newInvalid.pointerData);
}
SpinAcquire(SInvalLock);
if (!SISetDataEntry(shmInvalBuffer, &newInvalid)) {
/* buffer full */
/* release a message, mark process cache states to be invalid */
SISetProcStateInvalid(shmInvalBuffer);
if (!SIDelDataEntry(shmInvalBuffer)) {
	    /* inconsistent buffer state -- should never happen */
SpinRelease(SInvalLock);
elog(FATAL, "RegisterSharedInvalid: inconsistent buffer state");
}
/* write again */
(void) SISetDataEntry(shmInvalBuffer, &newInvalid);
}
SpinRelease(SInvalLock);
}
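/*
 * Usage sketch of the two message forms described above (all names
 * are hypothetical):
 *
 *	(1) a (possibly) cached system cache tuple:
 *
 *	RegisterSharedInvalid(sysCacheId, tupleHashIndex, tuplePointer);
 *
 *	(2) a cached relation descriptor, keyed by object id:
 *
 *	RegisterSharedInvalid(specialCacheId, relationObjectId, NULL);
 */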
/*
* InvalidateSharedInvalid --
* Processes all entries in a shared cache invalidation state.
*/
/****************************************************************************/
/* InvalidateSharedInvalid(invalFunction, resetFunction) */
/* */
/* invalidate a message in the buffer (read and clean up) */
/* should be called by a backend */
/****************************************************************************/
void
InvalidateSharedInvalid(void (*invalFunction)(),
void (*resetFunction)())
{
SpinAcquire(SInvalLock);
SIReadEntryData(shmInvalBuffer, MyBackendId,
invalFunction, resetFunction);
SIDelExpiredDataEntries(shmInvalBuffer);
SpinRelease(SInvalLock);
}
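/*
 * A sketch of the callback contract, inferred from SIReadEntryData in
 * sinvaladt.c (function names are hypothetical): invalFunction gets the
 * fields of each unread message and should drop the matching local
 * cache entry; resetFunction is called instead when this backend's
 * cache state was marked reset, and should discard the whole cache.
 *
 *	static void
 *	MyInvalFunction(int cacheId, Index hashIndex, ItemPointer pointer)
 *	{
 *		...
 *	}
 *
 *	static void
 *	MyResetFunction()
 *	{
 *		...
 *	}
 *
 *	InvalidateSharedInvalid(MyInvalFunction, MyResetFunction);
 */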

View File

@@ -0,0 +1,797 @@
/*-------------------------------------------------------------------------
*
* sinvaladt.c--
* POSTGRES shared cache invalidation segment definitions.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/ipc.h"
#include "storage/sinvaladt.h"
#include "storage/lmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
/* ----------------
* global variable notes
*
* SharedInvalidationSemaphore
*
* shmInvalBuffer
* the shared buffer segment, set by SISegmentAttach()
*
* MyBackendId
* might be removed later, used only for
* debugging in debug routines (end of file)
*
* SIDbId
* identification of buffer (disappears)
*
* SIRelId \
* SIDummyOid \ identification of buffer
* SIXidData /
* SIXid /
*
* XXX This file really needs to be cleaned up. We switched to using
* spinlocks to protect critical sections (as opposed to using fake
* relations and going through the lock manager) and some of the old
* cruft was 'ifdef'ed out, while other parts (now unused) are still
* compiled into the system. -mer 5/24/92
* ----------------
*/
#ifdef HAS_TEST_AND_SET
int SharedInvalidationLockId;
#else
IpcSemaphoreId SharedInvalidationSemaphore;
#endif
SISeg *shmInvalBuffer;
extern BackendId MyBackendId;
static void CleanupInvalidationState(int status, SISeg *segInOutP);
static BackendId SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag);
static int SIGetNumEntries(SISeg *segP);
/************************************************************************/
/* SISetActiveProcess(segP, backendId) set the backend status active */
/* should be called only by the postmaster when creating a backend */
/************************************************************************/
/* XXX I suspect that the segP parameter is extraneous. -hirohama */
static void
SISetActiveProcess(SISeg *segInOutP, BackendId backendId)
{
/* mark all messages as read */
/* Assert(segP->procState[backendId - 1].tag == MyBackendTag); */
segInOutP->procState[backendId - 1].resetState = false;
segInOutP->procState[backendId - 1].limit = SIGetNumEntries(segInOutP);
}
/****************************************************************************/
/* SIBackendInit() initializes a backend to operate on the buffer */
/****************************************************************************/
int
SIBackendInit(SISeg *segInOutP)
{
LRelId LtCreateRelId();
TransactionId LMITransactionIdCopy();
Assert(MyBackendTag > 0);
MyBackendId = SIAssignBackendId(segInOutP, MyBackendTag);
if (MyBackendId == InvalidBackendTag)
return 0;
#ifdef INVALIDDEBUG
elog(DEBUG, "SIBackendInit: backend tag %d; backend id %d.",
MyBackendTag, MyBackendId);
#endif /* INVALIDDEBUG */
SISetActiveProcess(segInOutP, MyBackendId);
on_exitpg(CleanupInvalidationState, (caddr_t)segInOutP);
return 1;
}
/* ----------------
* SIAssignBackendId
* ----------------
*/
static BackendId
SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag)
{
Index index;
ProcState *stateP;
stateP = NULL;
for (index = 0; index < MaxBackendId; index += 1) {
if (segInOutP->procState[index].tag == InvalidBackendTag ||
segInOutP->procState[index].tag == backendTag)
{
stateP = &segInOutP->procState[index];
break;
}
if (!PointerIsValid(stateP) ||
(segInOutP->procState[index].resetState &&
(!stateP->resetState ||
stateP->tag < backendTag)) ||
(!stateP->resetState &&
(segInOutP->procState[index].limit <
stateP->limit ||
stateP->tag < backendTag)))
{
stateP = &segInOutP->procState[index];
}
}
    /* verify that all remaining "procState" entries are checked for matching tags */
for (index += 1; index < MaxBackendId; index += 1) {
if (segInOutP->procState[index].tag == backendTag) {
elog (FATAL, "SIAssignBackendId: tag %d found twice",
backendTag);
}
}
if (stateP->tag != InvalidBackendTag) {
if (stateP->tag == backendTag) {
elog(NOTICE, "SIAssignBackendId: reusing tag %d",
backendTag);
} else {
elog(NOTICE,
"SIAssignBackendId: discarding tag %d",
stateP->tag);
return InvalidBackendTag;
}
}
stateP->tag = backendTag;
return (1 + stateP - &segInOutP->procState[0]);
}
/************************************************************************/
/* The following function should be called only by the postmaster !! */
/************************************************************************/
/************************************************************************/
/* SISetDeadProcess(segP, backendId) set the backend status DEAD */
/* should be called only by the postmaster when a backend died */
/************************************************************************/
static void
SISetDeadProcess(SISeg *segP, int backendId)
{
/* XXX call me.... */
segP->procState[backendId - 1].resetState = false;
segP->procState[backendId - 1].limit = -1;
segP->procState[backendId - 1].tag = InvalidBackendTag;
}
/*
* CleanupInvalidationState --
* Note:
* This is a temporary hack. ExitBackend should call this instead
* of exit (via on_exitpg).
*/
static void
CleanupInvalidationState(int status, /* XXX */
SISeg *segInOutP) /* XXX style */
{
Assert(PointerIsValid(segInOutP));
SISetDeadProcess(segInOutP, MyBackendId);
}
/************************************************************************/
/* SIComputeSize() - returns the size of a buffer segment		*/
/************************************************************************/
static SISegOffsets *
SIComputeSize(int *segSize)
{
int A, B, a, b, totalSize;
SISegOffsets *oP;
A = 0;
a = SizeSISeg; /* offset to first data entry */
b = SizeOfOneSISegEntry * MAXNUMMESSAGES;
B = A + a + b;
totalSize = B - A;
*segSize = totalSize;
oP = (SISegOffsets *) palloc(sizeof(SISegOffsets));
oP->startSegment = A;
    oP->offsetToFirstEntry = a;		/* relative to A */
oP->offsetToEndOfSegemnt = totalSize; /* relative to A */
return(oP);
}
/************************************************************************/
/* SISetStartEntrySection(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartEntrySection(SISeg *segP, Offset offset)
{
segP->startEntrySection = offset;
}
/************************************************************************/
/* SIGetStartEntrySection(segP) - returns the offset			*/
/************************************************************************/
static Offset
SIGetStartEntrySection(SISeg *segP)
{
return(segP->startEntrySection);
}
/************************************************************************/
/* SISetEndEntrySection(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetEndEntrySection(SISeg *segP, Offset offset)
{
segP->endEntrySection = offset;
}
/************************************************************************/
/* SISetEndEntryChain(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetEndEntryChain(SISeg *segP, Offset offset)
{
segP->endEntryChain = offset;
}
/************************************************************************/
/* SIGetEndEntryChain(segP) - returns the offset			*/
/************************************************************************/
static Offset
SIGetEndEntryChain(SISeg *segP)
{
return(segP->endEntryChain);
}
/************************************************************************/
/* SISetStartEntryChain(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartEntryChain(SISeg *segP, Offset offset)
{
segP->startEntryChain = offset;
}
/************************************************************************/
/* SIGetStartEntryChain(segP) - returns the offset */
/************************************************************************/
static Offset
SIGetStartEntryChain(SISeg *segP)
{
return(segP->startEntryChain);
}
/************************************************************************/
/* SISetNumEntries(segP, num) sets the current number of entries	*/
/************************************************************************/
static bool
SISetNumEntries(SISeg *segP, int num)
{
if ( num <= MAXNUMMESSAGES) {
segP->numEntries = num;
return(true);
} else {
return(false); /* table full */
}
}
/************************************************************************/
/* SIGetNumEntries(segP) - returns the current number of entries	*/
/************************************************************************/
static int
SIGetNumEntries(SISeg *segP)
{
return(segP->numEntries);
}
/************************************************************************/
/* SISetMaxNumEntries(segP, num) sets the maximal number of entries */
/************************************************************************/
static bool
SISetMaxNumEntries(SISeg *segP, int num)
{
if ( num <= MAXNUMMESSAGES) {
segP->maxNumEntries = num;
return(true);
} else {
return(false); /* wrong number */
}
}
/************************************************************************/
/* SIGetProcStateLimit(segP, i) returns the limit of read messages */
/************************************************************************/
static int
SIGetProcStateLimit(SISeg *segP, int i)
{
return(segP->procState[i].limit);
}
/************************************************************************/
/* SIIncNumEntries(segP, num) increments the current number of entries	*/
/************************************************************************/
static bool
SIIncNumEntries(SISeg *segP, int num)
{
if ((segP->numEntries + num) <= MAXNUMMESSAGES) {
segP->numEntries = segP->numEntries + num;
return(true);
} else {
return(false); /* table full */
}
}
/************************************************************************/
/* SIDecNumEntries(segP, num) decrements the current number of entries	*/
/************************************************************************/
static bool
SIDecNumEntries(SISeg *segP, int num)
{
if ((segP->numEntries - num) >= 0) {
segP->numEntries = segP->numEntries - num;
return(true);
} else {
return(false); /* not enough entries in table */
}
}
/************************************************************************/
/* SISetStartFreeSpace(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartFreeSpace(SISeg *segP, Offset offset)
{
segP->startFreeSpace = offset;
}
/************************************************************************/
/* SIGetStartFreeSpace(segP) - returns the offset */
/************************************************************************/
static Offset
SIGetStartFreeSpace(SISeg *segP)
{
return(segP->startFreeSpace);
}
/************************************************************************/
/* SIGetFirstDataEntry(segP) returns first data entry */
/************************************************************************/
static SISegEntry *
SIGetFirstDataEntry(SISeg *segP)
{
SISegEntry *eP;
Offset startChain;
startChain = SIGetStartEntryChain(segP);
if (startChain == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
startChain );
return(eP);
}
/************************************************************************/
/* SIGetLastDataEntry(segP) returns last data entry in the chain */
/************************************************************************/
static SISegEntry *
SIGetLastDataEntry(SISeg *segP)
{
SISegEntry *eP;
Offset endChain;
endChain = SIGetEndEntryChain(segP);
if (endChain == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
endChain );
return(eP);
}
/************************************************************************/
/* SIGetNextDataEntry(segP, offset) returns next data entry */
/************************************************************************/
static SISegEntry *
SIGetNextDataEntry(SISeg *segP, Offset offset)
{
SISegEntry *eP;
if (offset == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
offset);
return(eP);
}
/************************************************************************/
/* SIGetNthDataEntry(segP, n) returns the n-th data entry in chain */
/************************************************************************/
static SISegEntry *
SIGetNthDataEntry(SISeg *segP,
		  int n)	/* must range from 1 to MAXNUMMESSAGES */
{
SISegEntry *eP;
int i;
if (n <= 0) return(NULL);
eP = SIGetFirstDataEntry(segP);
for (i = 1; i < n; i++) {
/* skip one and get the next */
eP = SIGetNextDataEntry(segP, eP->next);
}
return(eP);
}
/************************************************************************/
/* SIEntryOffset(segP, entryP) returns the offset for an entry pointer	*/
/************************************************************************/
static Offset
SIEntryOffset(SISeg *segP, SISegEntry *entryP)
{
/* relative to B !! */
return ((Offset) ((Pointer) entryP -
(Pointer) segP -
SIGetStartEntrySection(segP) ));
}
/************************************************************************/
/* SISetDataEntry(segP, data) - stores a message in the segment	*/
/************************************************************************/
bool
SISetDataEntry(SISeg *segP, SharedInvalidData *data)
{
Offset offsetToNewData;
SISegEntry *eP, *lastP;
bool SISegFull();
Offset SIEntryOffset();
Offset SIGetStartFreeSpace();
SISegEntry *SIGetFirstDataEntry();
SISegEntry *SIGetNextDataEntry();
SISegEntry *SIGetLastDataEntry();
if (!SIIncNumEntries(segP, 1))
return(false); /* no space */
/* get a free entry */
offsetToNewData = SIGetStartFreeSpace(segP);
eP = SIGetNextDataEntry(segP, offsetToNewData); /* it's a free one */
SISetStartFreeSpace(segP, eP->next);
/* fill it up */
eP->entryData = *data;
eP->isfree = false;
eP->next = InvalidOffset;
/* handle insertion point at the end of the chain !!*/
lastP = SIGetLastDataEntry(segP);
if (lastP == NULL) {
/* there is no chain, insert the first entry */
SISetStartEntryChain(segP, SIEntryOffset(segP, eP));
} else {
/* there is a last entry in the chain */
lastP->next = SIEntryOffset(segP, eP);
}
SISetEndEntryChain(segP, SIEntryOffset(segP, eP));
return(true);
}
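/*
 * Worked example (illustrative): if the free list is F1 -> F2 -> ...
 * and the chain is C1 -> C2 (end), SISetDataEntry pops F1 off the
 * free list (startFreeSpace := F1->next), fills it in, links it after
 * C2 (C2->next := offset of F1), and makes it the new end of chain.
 * SIDelDataEntry below performs the inverse operation at the head.
 */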
/************************************************************************/
/* SIDecProcLimit(segP, num) decrements all process limits */
/************************************************************************/
static void
SIDecProcLimit(SISeg *segP, int num)
{
int i;
for (i=0; i < MaxBackendId; i++) {
/* decrement only, if there is a limit > 0 */
if (segP->procState[i].limit > 0) {
segP->procState[i].limit = segP->procState[i].limit - num;
if (segP->procState[i].limit < 0) {
/* limit was not high enough, reset to zero */
/* negative means it's a dead backend */
segP->procState[i].limit = 0;
}
}
}
}
/************************************************************************/
/* SIDelDataEntry(segP) - free the FIRST entry */
/************************************************************************/
bool
SIDelDataEntry(SISeg *segP)
{
SISegEntry *e1P;
SISegEntry *SIGetFirstDataEntry();
if (!SIDecNumEntries(segP, 1)) {
/* no entries in buffer */
return(false);
}
e1P = SIGetFirstDataEntry(segP);
SISetStartEntryChain(segP, e1P->next);
if (SIGetStartEntryChain(segP) == InvalidOffset) {
/* it was the last entry */
SISetEndEntryChain(segP, InvalidOffset);
}
/* free the entry */
e1P->isfree = true;
e1P->next = SIGetStartFreeSpace(segP);
SISetStartFreeSpace(segP, SIEntryOffset(segP, e1P));
SIDecProcLimit(segP, 1);
return(true);
}
/************************************************************************/
/* SISetProcStateInvalid(segP) checks and marks a backend's state as	*/
/* invalid */
/************************************************************************/
void
SISetProcStateInvalid(SISeg *segP)
{
int i;
for (i=0; i < MaxBackendId; i++) {
if (segP->procState[i].limit == 0) {
/* backend i didn't read any message */
segP->procState[i].resetState = true;
/*XXX signal backend that it has to reset its internal cache ? */
}
}
}
/************************************************************************/
/* SIReadEntryData(segP, backendId, function) */
/* - marks messages to be read by id */
/* and executes function */
/************************************************************************/
void
SIReadEntryData(SISeg *segP,
int backendId,
void (*invalFunction)(),
void (*resetFunction)())
{
int i = 0;
SISegEntry *data;
Assert(segP->procState[backendId - 1].tag == MyBackendTag);
if (!segP->procState[backendId - 1].resetState) {
	/* invalidate data, but only those you have not seen yet !! */
/* therefore skip read messages */
data = SIGetNthDataEntry(segP,
SIGetProcStateLimit(segP, backendId - 1) + 1);
while (data != NULL) {
i++;
segP->procState[backendId - 1].limit++; /* one more message read */
invalFunction(data->entryData.cacheId,
data->entryData.hashIndex,
&data->entryData.pointerData);
data = SIGetNextDataEntry(segP, data->next);
}
/* SIDelExpiredDataEntries(segP); */
} else {
	/* backend must not read messages; its own state has to be reset */
	elog(NOTICE, "SIReadEntryData: cache state reset");
resetFunction(); /* XXXX call it here, parameters? */
/* new valid state--mark all messages "read" */
segP->procState[backendId - 1].resetState = false;
segP->procState[backendId - 1].limit = SIGetNumEntries(segP);
}
    /* sanity check: we cannot have read more messages than the buffer holds */
if (i > MAXNUMMESSAGES) {
elog(FATAL, "SIReadEntryData: Invalid segment state");
}
}
/************************************************************************/
/* SIDelExpiredDataEntries (segP) - removes irrelevant messages */
/************************************************************************/
void
SIDelExpiredDataEntries(SISeg *segP)
{
int min, i, h;
min = 9999999;
for (i = 0; i < MaxBackendId; i++) {
h = SIGetProcStateLimit(segP, i);
if (h >= 0) { /* backend active */
if (h < min ) min = h;
}
}
if (min != 9999999) {
/* we can remove min messages */
for (i = 1; i <= min; i++) {
/* this adjusts also the state limits!*/
if (!SIDelDataEntry(segP)) {
elog(FATAL, "SIDelExpiredDataEntries: Invalid segment state");
}
}
}
}
/************************************************************************/
/* SISegInit(segP) - initializes the segment */
/************************************************************************/
static void
SISegInit(SISeg *segP)
{
SISegOffsets *oP;
int segSize, i;
SISegEntry *eP;
oP = SIComputeSize(&segSize);
    /* set semaphore ids in the segment */
/* XXX */
SISetStartEntrySection(segP, oP->offsetToFirstEntry);
SISetEndEntrySection(segP, oP->offsetToEndOfSegemnt);
SISetStartFreeSpace(segP, 0);
SISetStartEntryChain(segP, InvalidOffset);
SISetEndEntryChain(segP, InvalidOffset);
(void) SISetNumEntries(segP, 0);
(void) SISetMaxNumEntries(segP, MAXNUMMESSAGES);
for (i = 0; i < MaxBackendId; i++) {
segP->procState[i].limit = -1; /* no backend active !!*/
segP->procState[i].resetState = false;
segP->procState[i].tag = InvalidBackendTag;
}
/* construct a chain of free entries */
for (i = 1; i < MAXNUMMESSAGES; i++) {
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
(i - 1) * sizeof(SISegEntry));
eP->isfree = true;
eP->next = i * sizeof(SISegEntry); /* relative to B */
}
    /* handle the last free entry separately */
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
(MAXNUMMESSAGES - 1) * sizeof(SISegEntry));
eP->isfree = true;
eP->next = InvalidOffset; /* it's the end of the chain !! */
/*
* Be tidy
*/
pfree(oP);
}
/************************************************************************/
/* SISegmentKill(key) - kill any segment */
/************************************************************************/
static void
SISegmentKill(int key) /* the corresponding key for the segment */
{
IpcMemoryKill(key);
}
/************************************************************************/
/* SISegmentGet(key, size) - get a shared segment of size <size> */
/* returns a segment id */
/************************************************************************/
static IpcMemoryId
SISegmentGet(int key, /* the corresponding key for the segment */
int size, /* size of segment in bytes */
bool create)
{
IpcMemoryId shmid;
if (create) {
shmid = IpcMemoryCreate(key, size, IPCProtection);
} else {
shmid = IpcMemoryIdGet(key, size);
}
return(shmid);
}
/************************************************************************/
/* SISegmentAttach(shmid) - attach a shared segment with id shmid */
/************************************************************************/
static void
SISegmentAttach(IpcMemoryId shmid)
{
shmInvalBuffer = (struct SISeg *) IpcMemoryAttach(shmid);
if (shmInvalBuffer == IpcMemAttachFailed) {
/* XXX use validity function */
elog(NOTICE, "SISegmentAttach: Could not attach segment");
elog(FATAL, "SISegmentAttach: %m");
}
}
/************************************************************************/
/* SISegmentInit(killExistingSegment, key) initialize segment */
/************************************************************************/
int
SISegmentInit(bool killExistingSegment, IPCKey key)
{
SISegOffsets *oP;
int segSize;
IpcMemoryId shmId;
bool create;
if (killExistingSegment) {
/* Kill existing segment */
/* set semaphore */
SISegmentKill(key);
/* Get a shared segment */
oP = SIComputeSize(&segSize);
/*
* Be tidy
*/
pfree(oP);
create = true;
shmId = SISegmentGet(key,segSize, create);
if (shmId < 0) {
perror("SISegmentGet: failed");
return(-1); /* an error */
}
/* Attach the shared cache invalidation segment */
/* sets the global variable shmInvalBuffer */
SISegmentAttach(shmId);
/* Init shared memory table */
SISegInit(shmInvalBuffer);
} else {
/* use an existing segment */
create = false;
shmId = SISegmentGet(key, 0, create);
if (shmId < 0) {
perror("SISegmentGet: getting an existent segment failed");
return(-1); /* an error */
}
/* Attach the shared cache invalidation segment */
SISegmentAttach(shmId);
}
return(1);
}

View File

@@ -0,0 +1,247 @@
/*-------------------------------------------------------------------------
*
* spin.c--
* routines for managing spin locks
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/spin.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* POSTGRES has two kinds of locks: semaphores (which put the
* process to sleep) and spinlocks (which are supposed to be
* short term locks). Currently both are implemented as SysV
* semaphores, but presumably this can change if we move to
 * a machine with a test-and-set (TAS) instruction.  It's probably
* a good idea to think about (and allocate) short term and long
* term semaphores separately anyway.
*
* NOTE: These routines are not supposed to be widely used in Postgres.
* They are preserved solely for the purpose of porting Mark Sullivan's
* buffer manager to Postgres.
*/
#include <errno.h>
#include "postgres.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/proc.h"
#include "utils/elog.h"
/* globals used in this file */
IpcSemaphoreId SpinLockId;
#ifdef HAS_TEST_AND_SET
/* real spin lock implementations */
bool
CreateSpinlocks(IPCKey key)
{
/* the spin lock shared memory must have been created by now */
return(TRUE);
}
bool
AttachSpinLocks(IPCKey key)
{
/* the spin lock shared memory must have been attached by now */
return(TRUE);
}
bool
InitSpinLocks(int init, IPCKey key)
{
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
extern SPINLOCK BufMgrLock;
extern SPINLOCK LockMgrLock;
extern SPINLOCK ProcStructLock;
extern SPINLOCK SInvalLock;
extern SPINLOCK OidGenLockId;
#ifdef MAIN_MEMORY
extern SPINLOCK MMCacheLock;
#endif /* MAIN_MEMORY */
  /* These spinlocks have fixed locations in shmem */
ShmemLock = (SPINLOCK) SHMEMLOCKID;
BindingLock = (SPINLOCK) BINDINGLOCKID;
BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
SInvalLock = (SPINLOCK) SINVALLOCKID;
OidGenLockId = (SPINLOCK) OIDGENLOCKID;
#ifdef MAIN_MEMORY
MMCacheLock = (SPINLOCK) MMCACHELOCKID;
#endif /* MAIN_MEMORY */
return(TRUE);
}
void
SpinAcquire(SPINLOCK lock)
{
ExclusiveLock(lock);
PROC_INCR_SLOCK(lock);
}
void
SpinRelease(SPINLOCK lock)
{
PROC_DECR_SLOCK(lock);
ExclusiveUnlock(lock);
}
bool
SpinIsLocked(SPINLOCK lock)
{
return(!LockIsFree(lock));
}
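/*
 * A sketch of the sort of test-and-set spinlock the file comment above
 * alludes to, written with present-day C11 atomics purely for
 * illustration; tas_acquire() spins until it observes the flag clear:
 *
 *	#include <stdatomic.h>
 *
 *	static atomic_flag tas_lock = ATOMIC_FLAG_INIT;
 *
 *	void tas_acquire() {
 *		while (atomic_flag_test_and_set(&tas_lock))
 *			;
 *	}
 *
 *	void tas_release() {
 *		atomic_flag_clear(&tas_lock);
 *	}
 */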
#else /* HAS_TEST_AND_SET */
/* Spinlocks are implemented using SysV semaphores */
/*
* SpinAcquire -- try to grab a spinlock
*
* FAILS if the semaphore is corrupted.
*/
void
SpinAcquire(SPINLOCK lock)
{
IpcSemaphoreLock(SpinLockId, lock, IpcExclusiveLock);
PROC_INCR_SLOCK(lock);
}
/*
* SpinRelease -- release a spin lock
*
* FAILS if the semaphore is corrupted
*/
void
SpinRelease(SPINLOCK lock)
{
  Assert(SpinIsLocked(lock));
PROC_DECR_SLOCK(lock);
IpcSemaphoreUnlock(SpinLockId, lock, IpcExclusiveLock);
}
bool
SpinIsLocked(SPINLOCK lock)
{
int semval;
semval = IpcSemaphoreGetValue(SpinLockId, lock);
return(semval < IpcSemaphoreDefaultStartValue);
}
/*
* CreateSpinlocks -- Create a sysV semaphore array for
* the spinlocks
*
*/
bool
CreateSpinlocks(IPCKey key)
{
int status;
IpcSemaphoreId semid;
semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
IpcSemaphoreDefaultStartValue, 1, &status);
if (status == IpcSemIdExist) {
IpcSemaphoreKill(key);
elog(NOTICE,"Destroying old spinlock semaphore");
semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
IpcSemaphoreDefaultStartValue, 1, &status);
}
if (semid >= 0) {
SpinLockId = semid;
return(TRUE);
}
/* cannot create spinlocks */
elog(FATAL,"CreateSpinlocks: cannot create spin locks");
return(FALSE);
}
/*
* Attach to existing spinlock set
*/
bool
AttachSpinLocks(IPCKey key)
{
IpcSemaphoreId id;
id = semget (key, MAX_SPINS, 0);
if (id < 0) {
if (errno == EEXIST) {
/* key is the name of someone else's semaphore */
elog (FATAL,"AttachSpinlocks: SPIN_KEY belongs to someone else");
}
/* cannot create spinlocks */
elog(FATAL,"AttachSpinlocks: cannot create spin locks");
return(FALSE);
}
SpinLockId = id;
return(TRUE);
}
/*
* InitSpinLocks -- Spinlock bootstrapping
*
* We need several spinlocks for bootstrapping:
 * BindingLock (for the shmem binding table),
* ShmemLock (for the shmem allocator), BufMgrLock (for buffer
* pool exclusive access), LockMgrLock (for the lock table), and
* ProcStructLock (a spin lock for the shared process structure).
* If there's a Sony WORM drive attached, we also have a spinlock
* (SJCacheLock) for it. Same story for the main memory storage mgr.
*
*/
bool
InitSpinLocks(int init, IPCKey key)
{
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
extern SPINLOCK BufMgrLock;
extern SPINLOCK LockMgrLock;
extern SPINLOCK ProcStructLock;
extern SPINLOCK SInvalLock;
extern SPINLOCK OidGenLockId;
#ifdef MAIN_MEMORY
extern SPINLOCK MMCacheLock;
#endif /* MAIN_MEMORY */
if (!init || key != IPC_PRIVATE) {
    /* if bootstrapping with key IPC_PRIVATE, we are running a backend
     * by itself, so there are no spinlocks to attach to; otherwise
     * attach to the existing set here
     */
if (! AttachSpinLocks(key)) {
elog(FATAL,"InitSpinLocks: couldnt attach spin locks");
return(FALSE);
}
}
  /* These spinlocks have fixed locations in shmem */
ShmemLock = (SPINLOCK) SHMEMLOCKID;
BindingLock = (SPINLOCK) BINDINGLOCKID;
BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
SInvalLock = (SPINLOCK) SINVALLOCKID;
OidGenLockId = (SPINLOCK) OIDGENLOCKID;
#ifdef MAIN_MEMORY
MMCacheLock = (SPINLOCK) MMCACHELOCKID;
#endif /* MAIN_MEMORY */
return(TRUE);
}
#endif /* HAS_TEST_AND_SET */

View File

@@ -0,0 +1,20 @@
/*-------------------------------------------------------------------------
*
* item.h--
* POSTGRES disk item definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: item.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEM_H
#define ITEM_H
#include "c.h"
typedef Pointer Item;
#endif /* ITEM_H */

View File

@@ -0,0 +1,75 @@
/*-------------------------------------------------------------------------
*
* itemid.h--
* Standard POSTGRES buffer page item identifier definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itemid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMID_H
#define ITEMID_H
typedef uint16 ItemOffset;
typedef uint16 ItemLength;
typedef bits16 ItemIdFlags;
typedef struct ItemIdData { /* line pointers */
unsigned lp_off:13, /* offset to find tup */
				/* can be reduced by 2 if necessary */
lp_flags:6, /* flags on tuple */
lp_len:13; /* length of tuple */
} ItemIdData;
typedef struct ItemIdData *ItemId;
#ifndef LP_USED
#define LP_USED 0x01 /* this line pointer is being used */
#endif
/* ----------------
* support macros
* ----------------
*/
/*
* ItemIdGetLength
*/
#define ItemIdGetLength(itemId) \
((itemId)->lp_len)
/*
* ItemIdGetOffset
*/
#define ItemIdGetOffset(itemId) \
((itemId)->lp_off)
/*
* ItemIdGetFlags
*/
#define ItemIdGetFlags(itemId) \
((itemId)->lp_flags)
/*
* ItemIdIsValid --
* True iff disk item identifier is valid.
*/
#define ItemIdIsValid(itemId) PointerIsValid(itemId)
/*
* ItemIdIsUsed --
* True iff disk item identifier is in use.
*
* Note:
* Assumes disk item identifier is valid.
*/
#define ItemIdIsUsed(itemId) \
(AssertMacro(ItemIdIsValid(itemId)) ? \
(bool) (((itemId)->lp_flags & LP_USED) != 0) : false)
#endif /* ITEMID_H */

View File

@@ -0,0 +1,44 @@
/*-------------------------------------------------------------------------
*
* itempos.h--
* Standard POSTGRES buffer page long item subposition definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itempos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMPOS_H
#define ITEMPOS_H
#include "c.h"
#include "storage/buf.h"
#include "storage/itemid.h"
typedef struct ItemSubpositionData {
Buffer op_db;
ItemId op_lpp;
char *op_cp; /* XXX */
uint32 op_len;
} ItemSubpositionData;
typedef ItemSubpositionData *ItemSubposition;
/*
* PNOBREAK(OBJP, LEN)
* struct objpos *OBJP;
* unsigned LEN;
*/
#define PNOBREAK(OBJP, LEN) ((OBJP)->op_len >= LEN)
/*
* PSKIP(OBJP, LEN)
* struct objpos *OBJP;
* unsigned LEN;
*/
#define PSKIP(OBJP, LEN)\
{ (OBJP)->op_cp += (LEN); (OBJP)->op_len -= (LEN); }
#endif /* ITEMPOS_H */

View File

@@ -0,0 +1,115 @@
/*-------------------------------------------------------------------------
*
* itemptr.h--
* POSTGRES disk item pointer definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itemptr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMPTR_H
#define ITEMPTR_H
#include "c.h"
#include "storage/block.h"
#include "storage/off.h"
#include "storage/itemid.h"
/*
* ItemPointer:
*
* this is a pointer to an item on another disk page in the same file.
* blkid tells us which block, posid tells us which entry in the linp
* (ItemIdData) array we want.
*/
typedef struct ItemPointerData {
BlockIdData ip_blkid;
OffsetNumber ip_posid;
} ItemPointerData;
typedef ItemPointerData *ItemPointer;
/* ----------------
* support macros
* ----------------
*/
/*
* ItemPointerIsValid --
* True iff the disk item pointer is not NULL.
*/
#define ItemPointerIsValid(pointer) \
((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
/*
* ItemPointerGetBlockNumber --
* Returns the block number of a disk item pointer.
*/
#define ItemPointerGetBlockNumber(pointer) \
(AssertMacro(ItemPointerIsValid(pointer)) ? \
BlockIdGetBlockNumber(&(pointer)->ip_blkid) : (BlockNumber) 0)
/*
* ItemPointerGetOffsetNumber --
* Returns the offset number of a disk item pointer.
*/
#define ItemPointerGetOffsetNumber(pointer) \
(AssertMacro(ItemPointerIsValid(pointer)) ? \
(pointer)->ip_posid : \
InvalidOffsetNumber)
/*
* ItemPointerSet --
* Sets a disk item pointer to the specified block and offset.
*/
#define ItemPointerSet(pointer, blockNumber, offNum) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), blockNumber); \
(pointer)->ip_posid = offNum
/*
* ItemPointerSetBlockNumber --
* Sets a disk item pointer to the specified block.
*/
#define ItemPointerSetBlockNumber(pointer, blockNumber) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), blockNumber)
/*
* ItemPointerSetOffsetNumber --
* Sets a disk item pointer to the specified offset.
*/
#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \
AssertMacro(PointerIsValid(pointer)); \
(pointer)->ip_posid = (offsetNumber)
/*
* ItemPointerCopy --
* Copies the contents of one disk item pointer to another.
*/
#define ItemPointerCopy(fromPointer, toPointer) \
Assert(PointerIsValid(toPointer)); \
Assert(PointerIsValid(fromPointer)); \
*(toPointer) = *(fromPointer)
/*
* ItemPointerSetInvalid --
* Sets a disk item pointer to be invalid.
*/
#define ItemPointerSetInvalid(pointer) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber); \
(pointer)->ip_posid = InvalidOffsetNumber
/* ----------------
* externs
* ----------------
*/
extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2);
#endif /* ITEMPTR_H */

View File

@@ -0,0 +1,58 @@
/*-------------------------------------------------------------------------
*
* large_object.h--
* file of info for Postgres large objects. POSTGRES 4.2 supports
* zillions of large objects (internal, external, jaquith, inversion).
* Now we only support inversion.
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: large_object.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LARGE_OBJECT_H
#define LARGE_OBJECT_H
#include "c.h"
#include "utils/rel.h"
#include "access/relscan.h"
/*
* This structure will eventually have lots more stuff associated with it.
*/
typedef struct LargeObjectDesc
{
Relation heap_r; /* heap relation */
Relation index_r; /* index relation on seqno attribute */
IndexScanDesc iscan; /* index scan we're using */
TupleDesc hdesc; /* heap relation tuple desc */
TupleDesc idesc; /* index relation tuple desc */
uint32 lowbyte; /* low byte on the current page */
uint32 highbyte; /* high byte on the current page */
uint32 offset; /* current seek pointer */
ItemPointerData htid; /* tid of current heap tuple */
#define IFS_RDLOCK (1 << 0)
#define IFS_WRLOCK (1 << 1)
#define IFS_ATEOF (1 << 2)
u_long flags; /* locking info, etc */
} LargeObjectDesc;
/*
* Function definitions...
*/
/* inversion stuff in inv_api.c */
extern LargeObjectDesc *inv_create(int flags);
extern LargeObjectDesc *inv_open(Oid lobjId, int flags);
extern void inv_close(LargeObjectDesc *obj_desc);
extern int inv_destroy(Oid lobjId);
extern int inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf);
extern int inv_seek(LargeObjectDesc *obj_desc, int offset, int whence);
extern int inv_tell(LargeObjectDesc *obj_desc);
extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes);
extern int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes);
#endif /* LARGE_OBJECT_H */

View File

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/large_object
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/large_object/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= inv_api.c

File diff suppressed because it is too large

View File

@@ -0,0 +1,84 @@
/*-------------------------------------------------------------------------
*
* lmgr.h--
* POSTGRES lock manager definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: lmgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LMGR_H
#define LMGR_H
#include "postgres.h"
#include "storage/itemptr.h"
#include "storage/lock.h"
#include "utils/rel.h"
/*
* This was moved from pladt.h for the new lock manager. Want to obsolete
* all of the old code.
*/
typedef struct LRelId {
Oid relId; /* a relation identifier */
Oid dbId; /* a database identifier */
} LRelId;
typedef struct LockInfoData {
bool initialized;
LRelId lRelId;
TransactionId transactionIdData;
uint16 flags;
} LockInfoData;
typedef LockInfoData *LockInfo;
#define LockInfoIsValid(linfo) \
((PointerIsValid(linfo)) && ((LockInfo) linfo)->initialized)
extern LRelId RelationGetLRelId(Relation relation);
extern Oid LRelIdGetDatabaseId(LRelId lRelId);
extern Oid LRelIdGetRelationId(LRelId lRelId);
extern bool DatabaseIdIsMyDatabaseId(Oid databaseId);
extern bool LRelIdContainsMyDatabaseId(LRelId lRelId);
extern void RelationInitLockInfo(Relation relation);
extern void RelationDiscardLockInfo(Relation relation);
extern void RelationSetLockForDescriptorOpen(Relation relation);
extern void RelationSetLockForRead(Relation relation);
extern void RelationUnsetLockForRead(Relation relation);
extern void RelationSetLockForWrite(Relation relation);
extern void RelationUnsetLockForWrite(Relation relation);
extern void RelationSetLockForTupleRead(Relation relation,
ItemPointer itemPointer);
/* used in vaccum.c */
extern void RelationSetLockForWritePage(Relation relation,
ItemPointer itemPointer);
/* used in nbtpage.c, hashpage.c */
extern void RelationSetSingleWLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationUnsetSingleWLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationSetSingleRLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationUnsetSingleRLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationSetRIntentLock(Relation relation);
extern void RelationUnsetRIntentLock(Relation relation);
extern void RelationSetWIntentLock(Relation relation);
extern void RelationUnsetWIntentLock(Relation relation);
extern void RelationSetLockForExtend(Relation relation);
extern void RelationUnsetLockForExtend(Relation relation);
extern void LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId);
/* single.c */
extern bool SingleLockReln(LockInfo linfo, LOCKT lockt, int action);
extern bool SingleLockPage(LockInfo linfo, ItemPointer tidPtr,
LOCKT lockt, int action);
#endif /* LMGR_H */

View File

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/lmgr
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= lmgr.c lock.c multi.c proc.c single.c

View File

@@ -0,0 +1,93 @@
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
This file is an attempt to save me (and future code maintainers) some
time and a lot of headaches. The existing lock manager code at the time
of this writing (June 16 1992) can best be described as confusing. The
complexity seems inherent in lock manager functionality, but variable
names chosen in the current implementation really confuse me every time
I have to track down a bug. Also, what gets done where and by whom isn't
always clear....
Starting with the data structures the lock manager relies upon...
(NOTE - these will undoubtedly change over time and it is likely
that this file won't always be updated along with the structs.)
The lock manager's LOCK:
tag -
The key fields that are used for hashing locks in the shared memory
lock hash table. This is kept as a separate struct to ensure that we
always zero out the correct number of bytes. This is a problem as
part of the tag is an itempointer which is 6 bytes and causes 2
additional bytes to be added as padding.
tag.relId -
Uniquely identifies the relation that the lock corresponds to.
tag.dbId -
Uniquely identifies the database in which the relation lives. If
this is a shared system relation (e.g. pg_user) the dbId should be
set to 0.
tag.tupleId -
Uniquely identifies the block/page within the relation and the
tuple within the block. If we are setting a table level lock,
both the blockId and tupleId (in an item pointer this is called
the position) are set to invalid; if it is a page level lock, the
blockId is valid while the tupleId is still invalid. Finally, if
this is a tuple level lock (we currently never do this) then both
the blockId and tupleId are set to valid specifications. This is
how we get the appearance of a multi-level lock table while using
only a single table (see Gray's paper on 2 phase locking if
you are puzzled about how multi-level lock tables work).
mask -
This field indicates what types of locks are currently held in the
given lock. It is used (against the lock table's conflict table)
to determine if a new lock request will conflict with existing
lock types held. Conflicts are determined by bitwise AND operations
between the mask and the conflict table entry for the given lock type
to be set. (A compilable toy version of this test appears at the end
of this file.) The current representation is that each bit (1 through 5)
is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND)
has been acquired for the lock.
waitProcs -
This is a shared memory queue of all process structures corresponding to
a backend that is waiting (sleeping) until another backend releases this
lock. The process structure holds the information needed to determine
if it should be woken up when this lock is released. If, for example,
we are releasing a read lock and the process is sleeping trying to acquire
a read lock then there is no point in waking it since the lock being
released isn't what caused it to sleep in the first place. There will
be more on this below (when I get to releasing locks and waking sleeping
process routines).
nHolding -
Keeps a count of how many acquisition attempts have been made on this
lock. The count includes attempts by processes which were put
to sleep due to conflicts. It also counts the same backend twice
if, for example, a backend process first acquires a read and then
acquires a write.
holders -
Keeps a count of how many locks of each type have been attempted. Only
elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock
type defined constants (WRITE through EXTEND). Summing the values of
holders should come out equal to nHolding.
nActive -
Keeps a count of how many times this lock has been successfully acquired.
This count does not include attempts that were rejected due to conflicts,
but can count the same backend twice (e.g. a read then a write -- since
it's the same transaction this won't cause a conflict)
activeHolders -
Keeps a count of how many locks of each type are currently held. Once again
only elements 1 through MAX_LOCK_TYPES are used (0 is not). Also, like
holders, summing the values of activeHolders should total to the value
of nActive.
This is all I had the stomach for right now..... I will get back to this
someday. -mer 17 June 1992 12:00 am
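
To make the mask test described under "mask" above concrete, here is a
compilable toy version of the conflict check. The bit positions follow the
text; the names are local stand-ins rather than the backend's actual
constants:

#include <stdio.h>

/* bit positions 1..5, mirroring the lock types named above */
#define WRITE_LOCK    1
#define READ_LOCK     2
#define WRITE_INTENT  3
#define READ_INTENT   4
#define EXTEND_LOCK   5

int main(void)
{
    /* conflict table entry for READ: reads conflict with writes */
    int readConflicts = (1 << WRITE_LOCK) | (1 << WRITE_INTENT);

    /* a lock whose mask says a WRITE is currently held */
    int mask = (1 << WRITE_LOCK);

    if (mask & readConflicts)
        printf("new READ request conflicts; requester must sleep\n");
    else
        printf("no conflict; grant the READ\n");
    return 0;
}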

View File

@@ -0,0 +1,933 @@
/*-------------------------------------------------------------------------
*
* lmgr.c--
* POSTGRES lock manager code
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/* #define LOCKDEBUGALL 1 */
/* #define LOCKDEBUG 1 */
#ifdef LOCKDEBUGALL
#define LOCKDEBUG 1
#endif /* LOCKDEBUGALL */
#include "postgres.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/relscan.h"
#include "access/skey.h"
#include "utils/tqual.h"
#include "access/xact.h"
#include "storage/block.h"
#include "storage/buf.h"
#include "storage/itemptr.h"
#include "storage/bufpage.h"
#include "storage/multilev.h"
#include "storage/lmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "catalog/catname.h"
#include "catalog/catalog.h"
#include "catalog/pg_class.h"
#include "nodes/memnodes.h"
#include "storage/bufmgr.h"
#include "access/transam.h" /* for AmiTransactionId */
/* ----------------
*
* ----------------
*/
#define MaxRetries 4 /* XXX about 1/4 minute--a hack */
#define IntentReadRelationLock 0x0100
#define ReadRelationLock 0x0200
#define IntentWriteRelationLock 0x0400
#define WriteRelationLock 0x0800
#define IntentReadPageLock 0x1000
#define ReadTupleLock 0x2000
#define TupleLevelLockCountMask 0x000f
#define TupleLevelLockLimit 10
extern Oid MyDatabaseId;
static LRelId VariableRelationLRelId = {
RelOid_pg_variable,
InvalidOid
};
/* ----------------
* RelationGetLRelId
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_10 \
elog(NOTICE, "RelationGetLRelId(%s) invalid lockInfo", \
RelationGetRelationName(relation));
#else
#define LOCKDEBUG_10
#endif /* LOCKDEBUG */
/*
* RelationGetLRelId --
* Returns "lock" relation identifier for a relation.
*/
LRelId
RelationGetLRelId(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* initialize lock info if necessary
* ----------------
*/
if (! LockInfoIsValid(linfo)) {
LOCKDEBUG_10;
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
}
/* ----------------
* XXX hack to prevent problems during
* VARIABLE relation initialization
* ----------------
*/
if (strcmp(RelationGetRelationName(relation)->data,
VariableRelationName) == 0) {
return (VariableRelationLRelId);
}
return (linfo->lRelId);
}
/*
* LRelIdGetDatabaseId --
* Returns database identifier for a "lock" relation identifier.
*/
/* ----------------
* LRelIdGetDatabaseId
*
* Note: The argument may not be correct, if it is not used soon
* after it is created.
* ----------------
*/
Oid
LRelIdGetDatabaseId(LRelId lRelId)
{
return (lRelId.dbId);
}
/*
* LRelIdGetRelationId --
* Returns relation identifier for a "lock" relation identifier.
*/
Oid
LRelIdGetRelationId(LRelId lRelId)
{
return (lRelId.relId);
}
/*
* DatabaseIdIsMyDatabaseId --
* True iff database object identifier is valid in my present database.
*/
bool
DatabaseIdIsMyDatabaseId(Oid databaseId)
{
return (bool)
(!OidIsValid(databaseId) || databaseId == MyDatabaseId);
}
/*
* LRelIdContainsMyDatabaseId --
* True iff "lock" relation identifier is valid in my present database.
*/
bool
LRelIdContainsMyDatabaseId(LRelId lRelId)
{
return (bool)
(!OidIsValid(lRelId.dbId) || lRelId.dbId == MyDatabaseId);
}
/*
* RelationInitLockInfo --
* Initializes the lock information in a relation descriptor.
*/
/* ----------------
* RelationInitLockInfo
*
* XXX processingVariable is a hack to prevent problems during
* VARIABLE relation initialization.
* ----------------
*/
void
RelationInitLockInfo(Relation relation)
{
LockInfo info;
char *relname;
Oid relationid;
bool processingVariable;
extern Oid MyDatabaseId; /* XXX use include */
extern GlobalMemory CacheCxt;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
Assert(OidIsValid(RelationGetRelationId(relation)));
/* ----------------
* get information from relation descriptor
* ----------------
*/
info = (LockInfo) relation->lockInfo;
relname = (char *) RelationGetRelationName(relation);
relationid = RelationGetRelationId(relation);
processingVariable = (strcmp(relname, VariableRelationName) == 0);
/* ----------------
* create a new lockinfo if not already done
* ----------------
*/
if (! PointerIsValid(info))
{
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo((MemoryContext)CacheCxt);
info = (LockInfo)palloc(sizeof(LockInfoData));
MemoryContextSwitchTo(oldcxt);
}
else if (processingVariable) {
if (IsTransactionState()) {
TransactionIdStore(GetCurrentTransactionId(),
&info->transactionIdData);
}
info->flags = 0x0;
return; /* prevent an infinite loop--still true? */
}
else if (info->initialized)
{
/* ------------
* If we've already initialized we're done.
* ------------
*/
return;
}
/* ----------------
* initialize lockinfo.dbId and .relId appropriately
* ----------------
*/
if (IsSharedSystemRelationName(relname))
LRelIdAssign(&info->lRelId, InvalidOid, relationid);
else
LRelIdAssign(&info->lRelId, MyDatabaseId, relationid);
/* ----------------
* store the transaction id in the lockInfo field
* ----------------
*/
if (processingVariable)
TransactionIdStore(AmiTransactionId,
&info->transactionIdData);
else if (IsTransactionState())
TransactionIdStore(GetCurrentTransactionId(),
&info->transactionIdData);
else
StoreInvalidTransactionId(&(info->transactionIdData));
/* ----------------
* initialize rest of lockinfo
* ----------------
*/
info->flags = 0x0;
info->initialized = (bool)true;
relation->lockInfo = (Pointer) info;
}
/* ----------------
* RelationDiscardLockInfo
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_20 \
elog(DEBUG, "DiscardLockInfo: NULL relation->lockInfo")
#else
#define LOCKDEBUG_20
#endif /* LOCKDEBUG */
/*
* RelationDiscardLockInfo --
* Discards the lock information in a relation descriptor.
*/
void
RelationDiscardLockInfo(Relation relation)
{
if (! LockInfoIsValid(relation->lockInfo)) {
LOCKDEBUG_20;
return;
}
pfree(relation->lockInfo);
relation->lockInfo = NULL;
}
/*
* RelationSetLockForDescriptorOpen --
* Sets read locks for a relation descriptor.
*/
#ifdef LOCKDEBUGALL
#define LOCKDEBUGALL_30 \
elog(DEBUG, "RelationSetLockForDescriptorOpen(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUGALL_30
#endif /* LOCKDEBUGALL*/
void
RelationSetLockForDescriptorOpen(Relation relation)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUGALL_30;
/* ----------------
* read lock catalog tuples which compose the relation descriptor
* XXX race condition? XXX For now, do nothing.
* ----------------
*/
}
/* ----------------
* RelationSetLockForRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_40 \
elog(DEBUG, "RelationSetLockForRead(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_40
#endif /* LOCKDEBUG*/
/*
* RelationSetLockForRead --
* Sets relation level read lock.
*/
void
RelationSetLockForRead(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_40;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* lock it without trying to short circuit the lock manager.
* ----------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |= ReadRelationLock;
MultiLockReln(linfo, READ_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
MultiLockReln(linfo, READ_LOCK);
}
/* ----------------
* RelationUnsetLockForRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_50 \
elog(DEBUG, "RelationUnsetLockForRead(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_50
#endif /* LOCKDEBUG*/
/*
* RelationUnsetLockForRead --
* Unsets relation level read lock.
*/
void
RelationUnsetLockForRead(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity check
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* release it.
* ----------------
*/
if (!LockInfoIsValid(linfo))
{
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
}
MultiReleaseReln(linfo, READ_LOCK);
}
/* ----------------
* RelationSetLockForWrite(relation)
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_60 \
elog(DEBUG, "RelationSetLockForWrite(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_60
#endif /* LOCKDEBUG*/
/*
* RelationSetLockForWrite --
* Sets relation level write lock.
*/
void
RelationSetLockForWrite(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_60;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* lock it without trying to short circuit the lock manager.
* ----------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |= WriteRelationLock;
MultiLockReln(linfo, WRITE_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
MultiLockReln(linfo, WRITE_LOCK);
}
/* ----------------
* RelationUnsetLockForWrite
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_70 \
elog(DEBUG, "RelationUnsetLockForWrite(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId);
#else
#define LOCKDEBUG_70
#endif /* LOCKDEBUG */
/*
* RelationUnsetLockForWrite --
* Unsets relation level write lock.
*/
void
RelationUnsetLockForWrite(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled()) {
return;
}
linfo = (LockInfo) relation->lockInfo;
if (!LockInfoIsValid(linfo))
{
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
}
MultiReleaseReln(linfo, WRITE_LOCK);
}
/* ----------------
* RelationSetLockForTupleRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_80 \
elog(DEBUG, "RelationSetLockForTupleRead(%s[%d,%d], 0x%x) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, \
itemPointer)
#define LOCKDEBUG_81 \
elog(DEBUG, "RelationSetLockForTupleRead() escalating");
#else
#define LOCKDEBUG_80
#define LOCKDEBUG_81
#endif /* LOCKDEBUG */
/*
* RelationSetLockForTupleRead --
* Sets tuple level read lock.
*/
void
RelationSetLockForTupleRead(Relation relation, ItemPointer itemPointer)
{
LockInfo linfo;
TransactionId curXact;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_80;
/* ---------------------
* If our lock info is invalid don't bother trying to short circuit
* the lock manager.
* ---------------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |=
IntentReadRelationLock |
IntentReadPageLock |
ReadTupleLock;
MultiLockTuple(linfo, itemPointer, READ_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* no need to set a lower granularity lock
* ----------------
*/
curXact = GetCurrentTransactionId();
if ((linfo->flags & ReadRelationLock) &&
TransactionIdEquals(curXact, linfo->transactionIdData))
{
return;
}
/* ----------------
* If we don't already have a tuple lock this transaction
* ----------------
*/
if (!( (linfo->flags & ReadTupleLock) &&
TransactionIdEquals(curXact, linfo->transactionIdData) )) {
linfo->flags |=
IntentReadRelationLock |
IntentReadPageLock |
ReadTupleLock;
/* clear count */
linfo->flags &= ~TupleLevelLockCountMask;
} else {
if (TupleLevelLockLimit == (TupleLevelLockCountMask &
linfo->flags)) {
LOCKDEBUG_81;
/* escalate */
MultiLockReln(linfo, READ_LOCK);
/* clear count */
linfo->flags &= ~TupleLevelLockCountMask;
return;
}
/* increment count */
linfo->flags =
(linfo->flags & ~TupleLevelLockCountMask) |
(1 + (TupleLevelLockCountMask & linfo->flags));
}
TransactionIdStore(curXact, &linfo->transactionIdData);
/* ----------------
* Lock the tuple.
* ----------------
*/
MultiLockTuple(linfo, itemPointer, READ_LOCK);
}
/* ----------------
* RelationSetLockForReadPage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_90 \
elog(DEBUG, "RelationSetLockForReadPage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
#else
#define LOCKDEBUG_90
#endif /* LOCKDEBUG*/
/* ----------------
* RelationSetLockForWritePage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_100 \
elog(DEBUG, "RelationSetLockForWritePage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
#else
#define LOCKDEBUG_100
#endif /* LOCKDEBUG */
/*
* RelationSetLockForWritePage --
* Sets write lock on a page.
*/
void
RelationSetLockForWritePage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
/* ---------------
* Make sure linfo is initialized
* ---------------
*/
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
/* ----------------
* attempt to set lock
* ----------------
*/
MultiLockPage((LockInfo) relation->lockInfo, itemPointer, WRITE_LOCK);
}
/* ----------------
* RelationUnsetLockForReadPage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_110 \
elog(DEBUG, "RelationUnsetLockForReadPage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
#else
#define LOCKDEBUG_110
#endif /* LOCKDEBUG */
/* ----------------
* RelationUnsetLockForWritePage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_120 \
elog(DEBUG, "RelationUnsetLockForWritePage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
#else
#define LOCKDEBUG_120
#endif /* LOCKDEBUG */
/*
* Set a single level write page lock. Assumes that you already
* have a write intent lock on the relation.
*/
void
RelationSetSingleWLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, !UNLOCK);
}
/*
* Unset a single level write page lock
*/
void
RelationUnsetSingleWLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, UNLOCK);
}
/*
* Set a single level read page lock. Assumes you already have a read
* intent lock set on the relation.
*/
void
RelationSetSingleRLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, !UNLOCK);
}
/*
* Unset a single level read page lock.
*/
void
RelationUnsetSingleRLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, UNLOCK);
}
/*
* Set a read intent lock on a relation.
*
* Usually these are set in a multi-level table when you are acquiring a
* page level lock, i.e., to acquire a lock on a page you first acquire
* an intent lock on the entire relation. Acquiring an intent lock alone
* allows one to use the single level locking routines later. Good for
* index scans that do a lot of page level locking.
*/
void
RelationSetRIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, !UNLOCK);
}
/*
* Unset a read intent lock on a relation
*/
void
RelationUnsetRIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, UNLOCK);
}
/*
* Set a write intent lock on a relation. For a more complete explanation
* see RelationSetRIntentLock()
*/
void
RelationSetWIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, !UNLOCK);
}
/*
* Unset a write intent lock.
*/
void
RelationUnsetWIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, UNLOCK);
}
/*
* Extend locks are used primarily in tertiary storage devices such as
* a WORM disk jukebox. Sometimes need exclusive access to extend a
* file by a block.
*/
void
RelationSetLockForExtend(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
MultiLockReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
}
void
RelationUnsetLockForExtend(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
MultiReleaseReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
}
/*
* Create an LRelId --- Why not just pass in a pointer to the storage?
*/
void
LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId)
{
lRelId->dbId = dbId;
lRelId->relId = relId;
}
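
The low four bits of linfo->flags serve as the per-transaction tuple-lock
counter that RelationSetLockForTupleRead escalates on. A standalone model
of just that counter arithmetic -- the two constants are copied from this
file, everything else is a stand-in:

#include <stdio.h>

#define TupleLevelLockCountMask 0x000f
#define TupleLevelLockLimit     10

int main(void)
{
    unsigned short flags = 0;   /* stands in for linfo->flags */
    int i;

    for (i = 0; i < 12; i++) {
        if ((TupleLevelLockCountMask & flags) == TupleLevelLockLimit) {
            printf("request %d: escalate to a relation-level read lock\n", i + 1);
            flags &= ~TupleLevelLockCountMask;    /* clear the count */
            break;
        }
        /* increment the packed count without disturbing the other flag bits */
        flags = (flags & ~TupleLevelLockCountMask) |
                (1 + (TupleLevelLockCountMask & flags));
        printf("request %d: tuple lock, count now %d\n",
               i + 1, flags & TupleLevelLockCountMask);
    }
    return 0;
}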

File diff suppressed because it is too large

View File

@@ -0,0 +1,415 @@
/*-------------------------------------------------------------------------
*
* multi.c--
* multi level lock table manager
*
* Standard multi-level lock manager as per the Gray paper
* (at least, that is what it is supposed to be). We implement
* three levels -- RELN, PAGE, TUPLE. Tuple is actually a TID,
* a physical record pointer. It isn't an object id.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/multi.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
*
* NOTES:
* (1) The lock.c module assumes that the caller here is doing
* two phase locking.
*
*-------------------------------------------------------------------------
*/
#include <stdio.h>
#include <string.h>
#include "storage/lmgr.h"
#include "storage/multilev.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "miscadmin.h" /* MyDatabaseId */
/*
* INTENT indicates to higher level that a lower level lock has been
* set. For example, a write lock on a tuple conflicts with a write
* lock on a relation. This conflict is detected as a WRITE_INTENT/
* WRITE conflict between the tuple's intent lock and the relation's
* write lock.
*/
static int MultiConflicts[] = {
(int)NULL,
/* All reads and writes at any level conflict with a write lock */
(1 << WRITE_LOCK)|(1 << WRITE_INTENT)|(1 << READ_LOCK)|(1 << READ_INTENT),
/* read locks conflict with write locks at curr and lower levels */
(1 << WRITE_LOCK)| (1 << WRITE_INTENT),
/* write intent locks */
(1 << READ_LOCK) | (1 << WRITE_LOCK),
/* read intent locks*/
(1 << WRITE_LOCK),
/* extend locks for archive storage manager conflict only w/extend locks */
(1 << EXTEND_LOCK)
};
/*
* write locks have higher priority than read locks and extend locks. May
* want to treat INTENT locks differently.
*/
static int MultiPrios[] = {
(int)NULL,
2,
1,
2,
1,
1
};
/*
* Lock table identifier for this lock table. The multi-level
* lock table is ONE lock table, not three.
*/
LockTableId MultiTableId = (LockTableId)NULL;
LockTableId ShortTermTableId = (LockTableId)NULL;
/*
* Create the lock table described by MultiConflicts and MultiPrios.
*/
LockTableId
InitMultiLevelLockm()
{
int tableId;
/* -----------------------
* If we're already initialized just return the table id.
* -----------------------
*/
if (MultiTableId)
return MultiTableId;
tableId = LockTabInit("LockTable", MultiConflicts, MultiPrios, 5);
MultiTableId = tableId;
if (! (MultiTableId)) {
elog(WARN,"InitMultiLockm: couldnt initialize lock table");
}
/* -----------------------
* No short term lock table for now. -Jeff 15 July 1991
*
* ShortTermTableId = LockTabRename(tableId);
* if (! (ShortTermTableId)) {
* elog(WARN,"InitMultiLockm: couldnt rename lock table");
* }
* -----------------------
*/
return MultiTableId;
}
/*
* MultiLockReln -- lock a relation
*
* Returns: TRUE if the lock can be set, FALSE otherwise.
*/
bool
MultiLockReln(LockInfo linfo, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
return(MultiAcquire(MultiTableId, &tag, lockt, RELN_LEVEL));
}
/*
* MultiLockTuple -- Lock the TID associated with a tuple
*
* Returns: TRUE if lock is set, FALSE otherwise.
*
* Side Effects: causes intention level locks to be set
* at the page and relation level.
*/
bool
MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
/* lock the individual tuple identified by tidPtr */
tag.tupleId = *tidPtr;
return(MultiAcquire(MultiTableId, &tag, lockt, TUPLE_LEVEL));
}
/*
* same as above at page level
*/
bool
MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
/* ----------------------------
* Now we want to set the page offset to be invalid
* and lock the block. There is some confusion here as to what
* a page is. In Postgres a page is an 8k block, however this
* block may be partitioned into many subpages which are sometimes
* also called pages. The term is overloaded, so don't be fooled
* when we say lock the page we mean the 8k block. -Jeff 16 July 1991
* ----------------------------
*/
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
return(MultiAcquire(MultiTableId, &tag, lockt, PAGE_LEVEL));
}
/*
* MultiAcquire -- acquire multi level lock at requested level
*
* Returns: TRUE if lock is set, FALSE if not
* Side Effects:
*/
bool
MultiAcquire(LockTableId tableId,
LOCKTAG *tag,
LOCKT lockt,
LOCK_LEVEL level)
{
LOCKT locks[N_LEVELS];
int i,status;
LOCKTAG xxTag, *tmpTag = &xxTag;
int retStatus = TRUE;
/*
* Three levels implemented. If we set a low level (e.g. Tuple)
* lock, we must set INTENT locks on the higher levels. The
* intent lock detects conflicts between the low level lock
* and an existing high level lock. For example, setting a
* write lock on a tuple in a relation is disallowed if there
* is an existing read lock on the entire relation. The
* write lock would set a WRITE + INTENT lock on the relation
* and that lock would conflict with the read.
*/
switch (level) {
case RELN_LEVEL:
locks[0] = lockt;
locks[1] = NO_LOCK;
locks[2] = NO_LOCK;
break;
case PAGE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt;
locks[2] = NO_LOCK;
break;
case TUPLE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt + INTENT;
locks[2] = lockt;
break;
default:
elog(WARN,"MultiAcquire: bad lock level");
return(FALSE);
}
/*
* construct a new tag as we go. Always loop through all levels,
* but if we aren't setting a low level lock, locks[i] is set to
* NO_LOCK for the lower levels. Always start from the highest
* level and go to the lowest level.
*/
memset(tmpTag,0,sizeof(*tmpTag));
tmpTag->relId = tag->relId;
tmpTag->dbId = tag->dbId;
for (i=0;i<N_LEVELS;i++) {
if (locks[i] != NO_LOCK) {
switch (i) {
case RELN_LEVEL:
/* -------------
* Set the block # and offset to invalid
* -------------
*/
BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case PAGE_LEVEL:
/* -------------
* Copy the block #, set the offset to invalid
* -------------
*/
BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
&(tag->tupleId.ip_blkid));
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case TUPLE_LEVEL:
/* --------------
* Copy the entire tuple id.
* --------------
*/
ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
break;
}
status = LockAcquire(tableId, tmpTag, locks[i]);
if (! status) {
/* failed for some reason. Before returning we have
* to release all of the locks we just acquired.
* MultiRelease(xx,xx,xx, i) means release starting from
* the last level lock we successfully acquired
*/
retStatus = FALSE;
(void) MultiRelease(tableId, tag, lockt, i);
/* now leave the loop. Don't try for any more locks */
break;
}
}
}
return(retStatus);
}
/* ------------------
* Release a page in the multi-level lock table
* ------------------
*/
bool
MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* ------------------
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
* ------------------
*/
memset(&tag, 0,sizeof(LOCKTAG));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
return (MultiRelease(MultiTableId, &tag, lockt, PAGE_LEVEL));
}
/* ------------------
* Release a relation in the multi-level lock table
* ------------------
*/
bool
MultiReleaseReln(LockInfo linfo, LOCKT lockt)
{
LOCKTAG tag;
/* ------------------
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
* ------------------
*/
memset(&tag, 0, sizeof(LOCKTAG));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
return (MultiRelease(MultiTableId, &tag, lockt, RELN_LEVEL));
}
/*
* MultiRelease -- release a multi-level lock
*
* Returns: TRUE if successful, FALSE otherwise.
*/
bool
MultiRelease(LockTableId tableId,
LOCKTAG *tag,
LOCKT lockt,
LOCK_LEVEL level)
{
LOCKT locks[N_LEVELS];
int i,status;
LOCKTAG xxTag, *tmpTag = &xxTag;
/*
* same level scheme as MultiAcquire().
*/
switch (level) {
case RELN_LEVEL:
locks[0] = lockt;
locks[1] = NO_LOCK;
locks[2] = NO_LOCK;
break;
case PAGE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt;
locks[2] = NO_LOCK;
break;
case TUPLE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt + INTENT;
locks[2] = lockt;
break;
default:
elog(WARN,"MultiRelease: bad lockt");
}
/*
* again, construct the tag on the fly. This time, however,
* we release the locks in the REVERSE order -- from lowest
* level to highest level.
*
* Must zero out the tag to set the padding bytes to zero and ensure
* hashing consistency.
*/
memset(tmpTag, 0, sizeof(*tmpTag));
tmpTag->relId = tag->relId;
tmpTag->dbId = tag->dbId;
for (i=(N_LEVELS-1); i>=0; i--) {
if (locks[i] != NO_LOCK) {
switch (i) {
case RELN_LEVEL:
/* -------------
* Set the block # and offset to invalid
* -------------
*/
BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case PAGE_LEVEL:
/* -------------
* Copy the block #, set the offset to invalid
* -------------
*/
BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
&(tag->tupleId.ip_blkid));
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case TUPLE_LEVEL:
ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
break;
}
status = LockRelease(tableId, tmpTag, locks[i]);
if (! status) {
elog(WARN,"MultiRelease: couldn't release after error");
}
}
}
/* all levels successfully released */
return true;
}
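
MultiAcquire's level scheme boils down to filling a three-slot array and
walking it top-down, so a tuple lock drags intent locks onto the page and
relation levels above it. A compilable sketch of just that cascade; the
lock-mode values are stand-ins for the real ones in multilev.h:

#include <stdio.h>

#define NO_LOCK   0
#define READ_LOCK 2    /* stand-in value */
#define INTENT    4    /* stand-in value, used as lockt + INTENT */
#define N_LEVELS  3

enum { RELN_LEVEL, PAGE_LEVEL, TUPLE_LEVEL };

int main(void)
{
    static const char *name[] = { "relation", "page", "tuple" };
    int locks[N_LEVELS];
    int lockt = READ_LOCK;
    int level = TUPLE_LEVEL;    /* try PAGE_LEVEL or RELN_LEVEL as well */
    int i;

    switch (level) {
    case RELN_LEVEL:
        locks[0] = lockt;          locks[1] = NO_LOCK;        locks[2] = NO_LOCK; break;
    case PAGE_LEVEL:
        locks[0] = lockt + INTENT; locks[1] = lockt;          locks[2] = NO_LOCK; break;
    case TUPLE_LEVEL:
        locks[0] = lockt + INTENT; locks[1] = lockt + INTENT; locks[2] = lockt;   break;
    }

    /* acquire from the highest level down, exactly as MultiAcquire does */
    for (i = 0; i < N_LEVELS; i++)
        if (locks[i] != NO_LOCK)
            printf("%s level: acquire mode %d\n", name[i], locks[i]);
    return 0;
}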

View File

@@ -0,0 +1,826 @@
/*-------------------------------------------------------------------------
*
* proc.c--
* routines to manage per-process shared memory data structure
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* Each postgres backend gets one of these. We'll use it to
* clean up after the process should the process suddenly die.
*
*
* Interface (a):
* ProcSleep(), ProcWakeup(), ProcWakeupNext(),
* ProcQueueAlloc() -- create a shm queue for sleeping processes
* ProcQueueInit() -- create a queue without allocing memory
*
* Locking and waiting for buffers can cause the backend to be
* put to sleep. Whoever releases the lock, etc. wakes the
* process up again (and gives it an error code so it knows
* whether it was awoken on an error condition).
*
* Interface (b):
*
* ProcReleaseLocks -- frees the locks associated with this process,
* ProcKill -- destroys the shared memory state (and locks)
* associated with the process.
*
* 5/15/91 -- removed the buffer pool based lock chain in favor
* of a shared memory lock chain. The write-protection is
* more expensive if the lock chain is in the buffer pool.
* The only reason I kept the lock chain in the buffer pool
* in the first place was to allow the lock table to grow larger
* than available shared memory and that isn't going to work
* without a lot of unimplemented support anyway.
*
* 4/7/95 -- instead of allocating a set of 1 semaphore per process, we
* allocate a semaphore from a set of PROC_NSEMS_PER_SET semaphores
* shared among backends (we keep a few sets of semaphores around).
* This is so that we can support more backends. (system-wide semaphore
* sets run out pretty fast.) -ay 4/95
*
*/
#include <sys/time.h>
#ifndef WIN32
#include <unistd.h>
#endif /* WIN32 */
#include <string.h>
#include <sys/types.h>
#include "libpq/pqsignal.h" /* substitute for <signal.h> */
#if defined(PORTNAME_bsdi)
/* hacka, hacka, hacka (XXX) */
union semun {
int val; /* value for SETVAL */
struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
ushort *array; /* array for GETALL & SETALL */
};
#endif
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "storage/buf.h"
#include "storage/lock.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/proc.h"
/*
* timeout (in seconds) for resolving possible deadlock
*/
#ifndef DEADLOCK_TIMEOUT
#define DEADLOCK_TIMEOUT 60
#endif
/* --------------------
* Spin lock for manipulating the shared process data structure:
* ProcGlobal.... Adding an extra spin lock seemed like the smallest
* hack to get around reading and updating this structure in shared
* memory. -mer 17 July 1991
* --------------------
*/
SPINLOCK ProcStructLock;
/*
* For cleanup routines. Don't cleanup if the initialization
* has not happened.
*/
static bool ProcInitialized = FALSE;
static PROC_HDR *ProcGlobal = NULL;
PROC *MyProc = NULL;
static void ProcKill(int exitStatus, int pid);
static void ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum);
static void ProcFreeSem(IpcSemaphoreKey semKey, int semNum);
#if defined(PORTNAME_linux)
extern int HandleDeadLock(int);
#else
extern int HandleDeadLock(void);
#endif
/*
* InitProcGlobal -
* initializes the global process table. We put it here so that
* the postmaster can do this initialization. (ProcFreeAllSemaphores needs
* to read this table on exiting the postmaster. If we have the first
* backend do this, starting up and killing the postmaster without
* starting any backends will be a problem.)
*/
void
InitProcGlobal(IPCKey key)
{
bool found = false;
/* attach to the free list */
ProcGlobal = (PROC_HDR *)
ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
/* --------------------
* We're the first - initialize.
* --------------------
*/
if (! found)
{
int i;
ProcGlobal->numProcs = 0;
ProcGlobal->freeProcs = INVALID_OFFSET;
ProcGlobal->currKey = IPCGetProcessSemaphoreInitKey(key);
for (i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++)
ProcGlobal->freeSemMap[i] = 0;
}
}
/* ------------------------
* InitProc -- create a per-process data structure for this process
* used by the lock manager on semaphore queues.
* ------------------------
*/
void
InitProcess(IPCKey key)
{
bool found = false;
int pid;
int semstat;
unsigned long location, myOffset;
/* ------------------
* Routine called if deadlock timer goes off. See ProcSleep()
* ------------------
*/
#ifndef WIN32
signal(SIGALRM, HandleDeadLock);
#endif /* WIN32 we'll have to figure out how to handle this later */
SpinAcquire(ProcStructLock);
/* attach to the free list */
ProcGlobal = (PROC_HDR *)
ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
if (!found) {
/* this should not happen. InitProcGlobal() is called before this. */
elog(WARN, "InitProcess: Proc Header uninitialized");
}
if (MyProc != NULL)
{
SpinRelease(ProcStructLock);
elog(WARN,"ProcInit: you already exist");
return;
}
/* try to get a proc from the free list first */
myOffset = ProcGlobal->freeProcs;
if (myOffset != INVALID_OFFSET)
{
MyProc = (PROC *) MAKE_PTR(myOffset);
ProcGlobal->freeProcs = MyProc->links.next;
}
else
{
/* have to allocate one. We can't use the normal binding
* table mechanism because the proc structure is stored
* by PID instead of by a global name (need to look it
* up by PID when we cleanup dead processes).
*/
MyProc = (PROC *) ShmemAlloc((unsigned)sizeof(PROC));
if (! MyProc)
{
SpinRelease(ProcStructLock);
elog (FATAL,"cannot create new proc: out of memory");
}
/* this cannot be initialized until after the buffer pool */
SHMQueueInit(&(MyProc->lockQueue));
MyProc->procId = ProcGlobal->numProcs;
ProcGlobal->numProcs++;
}
/*
* zero out the spin lock counts and set the sLocks field for
* ProcStructLock to 1 as we have acquired this spinlock above but
* didn't record it since we didn't have MyProc until now.
*/
memset(MyProc->sLocks, 0, sizeof(MyProc->sLocks));
MyProc->sLocks[ProcStructLock] = 1;
if (IsUnderPostmaster) {
IPCKey semKey;
int semNum;
int semId;
union semun semun;
ProcGetNewSemKeyAndNum(&semKey, &semNum);
semId = IpcSemaphoreCreate(semKey,
PROC_NSEMS_PER_SET,
IPCProtection,
IpcSemaphoreDefaultStartValue,
0,
&semstat);
/*
* we might be reusing a semaphore that belongs to a dead
* backend. So be careful and reinitialize its value here.
*/
semun.val = IpcSemaphoreDefaultStartValue;
semctl(semId, semNum, SETVAL, semun);
IpcSemaphoreLock(semId, semNum, IpcExclusiveLock);
MyProc->sem.semId = semId;
MyProc->sem.semNum = semNum;
MyProc->sem.semKey = semKey;
} else {
MyProc->sem.semId = -1;
}
/* ----------------------
* Release the lock.
* ----------------------
*/
SpinRelease(ProcStructLock);
MyProc->pid = 0;
#if 0
MyProc->pid = MyPid;
#endif
/* ----------------
* Start keeping spin lock stats from here on. Any botch before
* this initialization is forever botched
* ----------------
*/
memset(MyProc->sLocks, 0, MAX_SPINS*sizeof(*MyProc->sLocks));
/* -------------------------
* Install ourselves in the binding table. The name to
* use is determined by the OS-assigned process id. That
* allows the cleanup process to find us after any untimely
* exit.
* -------------------------
*/
pid = getpid();
location = MAKE_OFFSET(MyProc);
if ((! ShmemPIDLookup(pid,&location)) || (location != MAKE_OFFSET(MyProc)))
{
elog(FATAL,"InitProc: ShmemPID table broken");
}
MyProc->errType = NO_ERROR;
SHMQueueElemInit(&(MyProc->links));
on_exitpg(ProcKill, (caddr_t)pid);
ProcInitialized = TRUE;
}
/*
* ProcReleaseLocks() -- release all locks associated with this process
*
*/
void
ProcReleaseLocks()
{
if (!MyProc)
return;
LockReleaseAll(1,&MyProc->lockQueue);
}
/*
* ProcRemove -
* used by the postmaster to clean up the global tables. This also frees
* up the semaphore used for the lmgr of the process. (We have to do
* this in the postmaster instead of doing an IpcSemaphoreKill on exiting
* the process because the semaphore set is shared among backends and
* we don't want to remove others' semaphores on exit.)
*/
bool
ProcRemove(int pid)
{
SHMEM_OFFSET location;
PROC *proc;
location = INVALID_OFFSET;
location = ShmemPIDDestroy(pid);
if (location == INVALID_OFFSET)
return(FALSE);
proc = (PROC *) MAKE_PTR(location);
SpinAcquire(ProcStructLock);
ProcFreeSem(proc->sem.semKey, proc->sem.semNum);
proc->links.next = ProcGlobal->freeProcs;
ProcGlobal->freeProcs = MAKE_OFFSET(proc);
SpinRelease(ProcStructLock);
return(TRUE);
}
/*
* ProcKill() -- Destroy the per-proc data structure for
* this process. Release any of its held spin locks.
*/
static void
ProcKill(int exitStatus, int pid)
{
PROC *proc;
SHMEM_OFFSET location;
/* --------------------
* If this is a FATAL exit the postmaster will have to kill all the
* existing backends and reinitialize shared memory, so we don't
* need to do anything here.
* --------------------
*/
if (exitStatus != 0)
return;
if (! pid)
{
pid = getpid();
}
ShmemPIDLookup(pid,&location);
if (location == INVALID_OFFSET)
return;
proc = (PROC *) MAKE_PTR(location);
if (proc != MyProc) {
Assert( pid != getpid() );
} else
MyProc = NULL;
/* ---------------
* Assume one lock table.
* ---------------
*/
ProcReleaseSpins(proc);
LockReleaseAll(1,&proc->lockQueue);
/* ----------------
* get off the wait queue
* ----------------
*/
LockLockTable();
if (proc->links.next != INVALID_OFFSET) {
Assert(proc->waitLock->waitProcs.size > 0);
SHMQueueDelete(&(proc->links));
--proc->waitLock->waitProcs.size;
}
SHMQueueElemInit(&(proc->links));
UnlockLockTable();
return;
}
/*
* ProcQueue package: routines for putting processes to sleep
* and waking them up
*/
/*
* ProcQueueAlloc -- alloc/attach to a shared memory process queue
*
* Returns: a pointer to the queue or NULL
* Side Effects: Initializes the queue if we allocated one
*/
PROC_QUEUE *
ProcQueueAlloc(char *name)
{
bool found;
PROC_QUEUE *queue = (PROC_QUEUE *)
ShmemInitStruct(name,(unsigned)sizeof(PROC_QUEUE),&found);
if (! queue)
{
return(NULL);
}
if (! found)
{
ProcQueueInit(queue);
}
return(queue);
}
/*
* ProcQueueInit -- initialize a shared memory process queue
*/
void
ProcQueueInit(PROC_QUEUE *queue)
{
SHMQueueInit(&(queue->links));
queue->size = 0;
}
/*
* ProcSleep -- put a process to sleep
*
* P() on the semaphore should put us to sleep. The process
* semaphore is cleared by default, so the first time we try
* to acquire it, we sleep.
*
* ASSUME: that no one will fiddle with the queue until after
* we release the spin lock.
*
* NOTES: The process queue is now a priority queue for locking.
*/
int
ProcSleep(PROC_QUEUE *queue,
SPINLOCK spinlock,
int token,
int prio,
LOCK *lock)
{
int i;
PROC *proc;
#ifndef WIN32 /* figure this out later */
struct itimerval timeval, dummy;
#endif /* WIN32 */
proc = (PROC *) MAKE_PTR(queue->links.prev);
for (i=0;i<queue->size;i++)
{
if (proc->prio < prio)
proc = (PROC *) MAKE_PTR(proc->links.prev);
else
break;
}
MyProc->token = token;
MyProc->waitLock = lock;
/* -------------------
* currently, we only need this for the ProcWakeup routines
* -------------------
*/
TransactionIdStore((TransactionId) GetCurrentTransactionId(), &MyProc->xid);
/* -------------------
* assume that these two operations are atomic (because
* of the spinlock).
* -------------------
*/
SHMQueueInsertTL(&(proc->links),&(MyProc->links));
queue->size++;
SpinRelease(spinlock);
/* --------------
* Postgres does not have any deadlock detection code and for this
* reason we must set a timer to wake up the process in the event of
* a deadlock. For now the timer is set for 1 minute and we assume that
* any process which sleeps for this amount of time is deadlocked and will
* receive a SIGALRM signal. The handler should release the processes
* semaphore and abort the current transaction.
*
* Need to zero out struct to set the interval and the micro seconds fields
* to 0.
* --------------
*/
#ifndef WIN32
memset(&timeval, 0, sizeof(struct itimerval));
timeval.it_value.tv_sec = DEADLOCK_TIMEOUT;
if (setitimer(ITIMER_REAL, &timeval, &dummy))
elog(FATAL, "ProcSleep: Unable to set timer for process wakeup");
#endif /* WIN32 */
/* --------------
* if someone wakes us between SpinRelease and IpcSemaphoreLock,
* IpcSemaphoreLock will not block. The wakeup is "saved" by
* the semaphore implementation.
* --------------
*/
IpcSemaphoreLock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
/* ---------------
* We were awoken before a timeout - now disable the timer
* ---------------
*/
#ifndef WIN32
timeval.it_value.tv_sec = 0;
if (setitimer(ITIMER_REAL, &timeval, &dummy))
elog(FATAL, "ProcSleep: Unable to diable timer for process wakeup");
#endif /* WIN32 */
/* ----------------
* We were assumed to be in a critical section when we went
* to sleep.
* ----------------
*/
SpinAcquire(spinlock);
return(MyProc->errType);
}
/*
* ProcWakeup -- wake up a process by releasing its private semaphore.
*
* remove the process from the wait queue and set its links invalid.
* RETURN: the next process in the wait queue.
*/
PROC *
ProcWakeup(PROC *proc, int errType)
{
PROC *retProc;
/* assume that spinlock has been acquired */
if (proc->links.prev == INVALID_OFFSET ||
proc->links.next == INVALID_OFFSET)
return((PROC *) NULL);
retProc = (PROC *) MAKE_PTR(proc->links.prev);
/* you have to update waitLock->waitProcs.size yourself */
SHMQueueDelete(&(proc->links));
SHMQueueElemInit(&(proc->links));
proc->errType = errType;
IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum, IpcExclusiveLock);
return retProc;
}
/*
* ProcGetId --
*/
int
ProcGetId()
{
return( MyProc->procId );
}
/*
* ProcLockWakeup -- routine for waking up processes when a lock is
* released.
*/
int
ProcLockWakeup(PROC_QUEUE *queue, char *ltable, char *lock)
{
PROC *proc;
int count;
if (! queue->size)
return(STATUS_NOT_FOUND);
proc = (PROC *) MAKE_PTR(queue->links.prev);
count = 0;
while ((LockResolveConflicts ((LOCKTAB *) ltable,
(LOCK *) lock,
proc->token,
proc->xid) == STATUS_OK))
{
/* there was a waiting process, grant it the lock before waking it
* up. This will prevent another process from seizing the lock
* between the time we release the lock master (spinlock) and
* the time that the awoken process begins executing again.
*/
GrantLock((LOCK *) lock, proc->token);
queue->size--;
/*
* ProcWakeup removes proc from the lock waiting process queue and
* returns the next proc in chain. If a writer just dropped
* its lock and there are several waiting readers, wake them all up.
*/
proc = ProcWakeup(proc, NO_ERROR);
count++;
if (!proc || queue->size == 0)
break;
}
if (count)
return(STATUS_OK);
else
/* Something is still blocking us. May have deadlocked. */
return(STATUS_NOT_FOUND);
}
void
ProcAddLock(SHM_QUEUE *elem)
{
SHMQueueInsertTL(&MyProc->lockQueue,elem);
}
/* --------------------
* We only get to this routine if we got SIGALRM after DEADLOCK_TIMEOUT
* while waiting for a lock to be released by some other process. After
* the one minute deadline we assume we have a deadlock and must abort
* this transaction. We must also indicate that I'm no longer waiting
* on a lock so that other processes don't try to wake me up and screw
* up my semaphore.
* --------------------
*/
int
#if defined(PORTNAME_linux)
HandleDeadLock(int i)
#else
HandleDeadLock()
#endif
{
LOCK *lock;
int size;
LockLockTable();
/* ---------------------
* Check to see if we've been awoken by anyone in the interim.
*
* If we have we can return and resume our transaction -- happy day.
* Before we are awoken the process releasing the lock grants it to
* us so we know that we don't have to wait anymore.
*
* Damn these names are LONG! -mer
* ---------------------
*/
if (IpcSemaphoreGetCount(MyProc->sem.semId, MyProc->sem.semNum) ==
IpcSemaphoreDefaultStartValue) {
UnlockLockTable();
return 1;
}
/*
* you would think this would be unnecessary, but...
*
* this also means we've been removed already. in some ports
* (e.g., sparc and aix) the semop(2) implementation is such that
* we can actually end up in this handler after someone has removed
* us from the queue and bopped the semaphore *but the test above
* fails to detect the semaphore update* (presumably something weird
* having to do with the order in which the semaphore wakeup signal
* and SIGALRM get handled).
*/
if (MyProc->links.prev == INVALID_OFFSET ||
MyProc->links.next == INVALID_OFFSET) {
UnlockLockTable();
return(1);
}
lock = MyProc->waitLock;
size = lock->waitProcs.size; /* so we can look at this in the core */
/* ------------------------
* Get this process off the lock's wait queue
* ------------------------
*/
Assert(lock->waitProcs.size > 0);
--lock->waitProcs.size;
SHMQueueDelete(&(MyProc->links));
SHMQueueElemInit(&(MyProc->links));
/* ------------------
* Unlock my semaphore so that the count is right for next time.
* I was awoken by a signal, not by someone unlocking my semaphore.
* ------------------
*/
IpcSemaphoreUnlock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
/* -------------
* Set MyProc->errType to STATUS_ERROR so that we abort after
* returning from this handler.
* -------------
*/
MyProc->errType = STATUS_ERROR;
/*
* if this doesn't follow the IpcSemaphoreUnlock then we get lock
* table corruption ("LockReplace: xid table corrupted") due to
* race conditions. i don't claim to understand this...
*/
UnlockLockTable();
elog(NOTICE, "Timeout -- possible deadlock");
return 0;
}
void
ProcReleaseSpins(PROC *proc)
{
int i;
if (!proc)
proc = MyProc;
if (!proc)
return;
for (i=0; i < (int)MAX_SPINS; i++)
{
if (proc->sLocks[i])
{
Assert(proc->sLocks[i] == 1);
SpinRelease(i);
}
}
}
/*****************************************************************************
*
*****************************************************************************/
/*
* ProcGetNewSemKeyAndNum -
* scan the free semaphore bitmap and allocate a single semaphore from
* a semaphore set. (If the semaphore set doesn't exist yet,
* IpcSemaphoreCreate will create it. Otherwise, we use the existing
* semaphore set.)
*/
static void
ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum)
{
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
unsigned int fullmask;
/*
* we hold ProcStructLock when entering this routine. We scan through
* the bitmap to look for a free semaphore.
*/
fullmask = ~0 >> (32 - PROC_NSEMS_PER_SET);
for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
int mask = 1;
int j;
if (freeSemMap[i] == fullmask)
continue; /* none free for this set */
for(j = 0; j < PROC_NSEMS_PER_SET; j++) {
if ((freeSemMap[i] & mask) == 0) {
/*
* a free semaphore found. Mark it as allocated.
*/
freeSemMap[i] |= mask;
*key = ProcGlobal->currKey + i;
*semNum = j;
return;
}
mask <<= 1;
}
}
/* if we reach here, all the semaphores are in use. */
elog(WARN, "InitProc: cannot allocate a free semaphore");
}
/*
* ProcFreeSem -
* free up our semaphore in the semaphore set. If we're the last one
* in the set, also remove the semaphore set.
*/
static void
ProcFreeSem(IpcSemaphoreKey semKey, int semNum)
{
int mask;
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
i = semKey - ProcGlobal->currKey;
mask = ~(1 << semNum);
freeSemMap[i] &= mask;
if (freeSemMap[i]==0)
IpcSemaphoreKill(semKey);
}
/*
* ProcFreeAllSemaphores -
* on exiting the postmaster, we free up all the semaphores allocated
* to the lmgrs of the backends.
*/
void
ProcFreeAllSemaphores()
{
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
if (freeSemMap[i]!=0)
IpcSemaphoreKill(ProcGlobal->currKey + i);
}
}
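
ProcGetNewSemKeyAndNum and ProcFreeSem above amount to a bitmap allocator.
A self-contained model of the same scan-and-mark logic; the set count and
the 16 semaphores per set are assumptions standing in for MAX_PROC_SEMS
and PROC_NSEMS_PER_SET:

#include <stdio.h>
#include <stdint.h>

#define NSETS          4     /* stand-in for MAX_PROC_SEMS/PROC_NSEMS_PER_SET */
#define NSEMS_PER_SET 16     /* stand-in for PROC_NSEMS_PER_SET */

static int32_t freeSemMap[NSETS];

/* find a clear bit, mark it, and report (set, semaphore-within-set) */
static int alloc_sem(int *set, int *num)
{
    uint32_t fullmask = ~0u >> (32 - NSEMS_PER_SET);
    int i, j;

    for (i = 0; i < NSETS; i++) {
        if ((uint32_t) freeSemMap[i] == fullmask)
            continue;                       /* this set is exhausted */
        for (j = 0; j < NSEMS_PER_SET; j++)
            if (!(freeSemMap[i] & (1 << j))) {
                freeSemMap[i] |= (1 << j);  /* mark it allocated */
                *set = i;
                *num = j;
                return 1;
            }
    }
    return 0;                               /* every semaphore is in use */
}

static void free_sem(int set, int num)
{
    freeSemMap[set] &= ~(1 << num);
    if (freeSemMap[set] == 0)
        printf("set %d now empty: this is where IpcSemaphoreKill would run\n", set);
}

int main(void)
{
    int s, n;

    if (alloc_sem(&s, &n))
        printf("allocated set %d, semaphore %d\n", s, n);
    free_sem(s, n);
    return 0;
}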

View File

@@ -0,0 +1,86 @@
/*-------------------------------------------------------------------------
*
* single.c--
* set single locks in the multi-level lock hierarchy
*
* Sometimes we don't want to set all levels of the multi-level
* lock hierarchy at once. This allows us to set and release
* one level at a time. It's useful in index scans when
* you can set an intent lock at the beginning and thereafter
* only set page locks. Tends to speed things up.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/single.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "storage/lmgr.h" /* where the declarations go */
#include "storage/lock.h"
#include "storage/multilev.h"
#include "utils/rel.h"
/*
* SingleLockReln -- lock a relation
*
* Returns: TRUE if the lock can be set, FALSE otherwise.
*/
bool
SingleLockReln(LockInfo linfo, LOCKT lockt, int action)
{
LOCKTAG tag;
/*
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdSet(&(tag.tupleId.ip_blkid), InvalidBlockNumber);
tag.tupleId.ip_posid = InvalidOffsetNumber;
if (action == UNLOCK)
return(LockRelease(MultiTableId, &tag, lockt));
else
return(LockAcquire(MultiTableId, &tag, lockt));
}
/*
* SingleLockPage -- use multi-level lock table, but lock
* only at the page level.
*
* Assumes that an INTENT lock has already been set in the
* multi-level lock table.
*
*/
bool
SingleLockPage(LockInfo linfo,
ItemPointer tidPtr,
LOCKT lockt,
int action)
{
LOCKTAG tag;
/*
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
tag.tupleId.ip_posid = InvalidOffsetNumber;
if (action == UNLOCK)
return(LockRelease(MultiTableId, &tag, lockt));
else
return(LockAcquire(MultiTableId, &tag, lockt));
}
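
As the header comment says, the point of these routines is one intent lock
up front and bare page locks thereafter. A stand-in sketch of that
index-scan pattern; the printf bodies substitute for the real lock-manager
calls named in the comments:

#include <stdio.h>

static void set_read_intent(const char *rel)       { printf("R-intent lock on %s\n", rel); }
static void unset_read_intent(const char *rel)     { printf("drop R-intent lock on %s\n", rel); }
static void rlock_page(const char *rel, int blk)   { printf("  read-lock %s page %d\n", rel, blk); }
static void runlock_page(const char *rel, int blk) { printf("  unlock %s page %d\n", rel, blk); }

int main(void)
{
    const char *rel = "some_btree_index";   /* hypothetical relation name */
    int blk;

    set_read_intent(rel);                /* RelationSetRIntentLock() */
    for (blk = 0; blk < 3; blk++) {
        rlock_page(rel, blk);            /* RelationSetSingleRLockPage() */
        /* ... examine the page ... */
        runlock_page(rel, blk);          /* RelationUnsetSingleRLockPage() */
    }
    unset_read_intent(rel);              /* RelationUnsetRIntentLock() */
    return 0;
}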

218
src/backend/storage/lock.h Normal file
View File

@@ -0,0 +1,218 @@
/*-------------------------------------------------------------------------
*
* lock.h--
*
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: lock.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LOCK_H_
#define LOCK_H_
#include "postgres.h"
#include "storage/itemptr.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/backendid.h"
#include "utils/hsearch.h"
extern SPINLOCK LockMgrLock;
typedef int MASK;
#define INIT_TABLE_SIZE 100
#define MAX_TABLE_SIZE 1000
/* ----------------------
* The following defines are used to estimate how much shared
* memory the lock manager is going to require.
*
* NBACKENDS - The number of concurrently running backends
* NLOCKS_PER_XACT - The number of unique locks acquired in a transaction
* NLOCKENTS - The maximum number of lock entries in the lock table.
* ----------------------
*/
#define NBACKENDS 50
#define NLOCKS_PER_XACT 40
#define NLOCKENTS (NLOCKS_PER_XACT*NBACKENDS)
typedef int LOCK_TYPE;
typedef int LOCKT;
typedef int LockTableId;
/* MAX_LOCKTYPES cannot be larger than the bits in MASK */
#define MAX_LOCKTYPES 6
/*
* MAX_TABLES corresponds to the number of spin locks allocated in
* CreateSpinLocks() or the number of shared memory locations allocated
* for lock table spin locks in the case of machines with TAS instructions.
*/
#define MAX_TABLES 2
#define INVALID_TABLEID 0
/*typedef struct LOCK LOCK; */
typedef struct ltag {
Oid relId;
Oid dbId;
ItemPointerData tupleId;
} LOCKTAG;
#define TAGSIZE (sizeof(LOCKTAG))
/* This is the control structure for a lock table. It
* lives in shared memory:
*
* tableID -- the handle used by the lock table's clients to
* refer to the table.
*
* nLockTypes -- number of lock types (READ,WRITE,etc) that
* are defined on this lock table
*
* conflictTab -- this is an array of bitmasks showing lock
* type conflicts. conflictTab[i] is a mask with the j-th bit
* turned on if lock types i and j conflict.
*
* prio -- each locktype has a priority, so, for example, waiting
* writers can be given priority over readers (to avoid
* starvation).
*
* masterlock -- synchronizes access to the table
*
*/
typedef struct lockctl {
LockTableId tableId;
int nLockTypes;
int conflictTab[MAX_LOCKTYPES];
int prio[MAX_LOCKTYPES];
SPINLOCK masterLock;
} LOCKCTL;
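/*
 * Illustrative sketch (not part of the original header): one way a
 * two-mode conflict table and priority vector might be filled in
 * before calling LockTabInit().  The mode numbering and bit layout
 * here are hypothetical, shown only to make conflictTab[] concrete.
 */
#ifdef EXAMPLE_ONLY
static MASK example_conflicts[MAX_LOCKTYPES] = {
	0,				/* slot 0 unused */
	(1 << 1) | (1 << 2),		/* mode 1 (write) conflicts with 1, 2 */
	(1 << 1)			/* mode 2 (read) conflicts with 1 only */
};
static int example_prios[MAX_LOCKTYPES] = {0, 2, 1};	/* writers first */
#endif /* EXAMPLE_ONLY */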
/*
* lockHash -- hash table on lock Ids,
* xidHash -- hash on xid and lockId in case
* multiple processes are holding the lock
* ctl - control structure described above.
*/
typedef struct ltable {
HTAB *lockHash;
HTAB *xidHash;
LOCKCTL *ctl;
} LOCKTAB;
/* -----------------------
* A transaction never conflicts with its own locks. Hence, if
* multiple transactions hold non-conflicting locks on the same
* data, private per-transaction information must be stored in the
* XID table. The tag is XID + shared memory lock address so that
* all locks can use the same XID table. The private information
* we store is the number of locks of each type (holders) and the
* total number of locks (nHolding) held by the transaction.
*
* NOTE: --
* There were some problems with the fact that currently TransactionIdData
* is a 5 byte entity and compilers long word aligning of structure fields.
* If the 3 byte padding is put in front of the actual xid data then the
* hash function (which uses XID_TAGSIZE when deciding how many bytes of a
* struct to look at for the key) might only see the last two bytes of the xid.
*
 * Clearly this is not good since it's likely that these bytes will be the
 * same for many transactions and hence they will share the same entry in
 * the hash table, causing the entry to be corrupted. For this long-winded
* reason I have put the tag in a struct of its own to ensure that the
* XID_TAGSIZE is computed correctly. It used to be sizeof (SHMEM_OFFSET) +
* sizeof(TransactionIdData) which != sizeof(XIDTAG).
*
* Finally since the hash function will now look at all 12 bytes of the tag
* the padding bytes MUST be zero'd before use in hash_search() as they
* will have random values otherwise. Jeff 22 July 1991.
* -----------------------
*/
typedef struct XIDTAG {
SHMEM_OFFSET lock;
int pid;
TransactionId xid;
} XIDTAG;
typedef struct XIDLookupEnt {
/* tag */
XIDTAG tag;
/* data */
int holders[MAX_LOCKTYPES];
int nHolding;
SHM_QUEUE queue;
} XIDLookupEnt;
#define XID_TAGSIZE (sizeof(XIDTAG))
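/*
 * Illustrative sketch (not part of the original header): per the note
 * above, the padding bytes of an XIDTAG must be zero'd before it is
 * used as a hash key.  The function name is hypothetical; hash_search()
 * and HASH_FIND come from utils/hsearch.h.
 */
#ifdef EXAMPLE_ONLY
static XIDLookupEnt *
example_xid_lookup(HTAB *xidHash, SHMEM_OFFSET lockOffset,
		   int pid, TransactionId xid, bool *foundPtr)
{
	XIDTAG tag;

	memset(&tag, 0, XID_TAGSIZE);	/* make the padding deterministic */
	tag.lock = lockOffset;
	tag.pid = pid;
	tag.xid = xid;
	return ((XIDLookupEnt *) hash_search(xidHash, (char *) &tag,
					     HASH_FIND, foundPtr));
}
#endif /* EXAMPLE_ONLY */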
/* originally in procq.h */
typedef struct procQueue {
SHM_QUEUE links;
int size;
} PROC_QUEUE;
/*
* lock information:
*
* tag -- uniquely identifies the object being locked
* mask -- union of the conflict masks of all lock types
* currently held on this object.
* waitProcs -- queue of processes waiting for this lock
* holders -- count of each lock type currently held on the
* lock.
* nHolding -- total locks of all types.
*/
typedef struct Lock {
/* hash key */
LOCKTAG tag;
/* data */
int mask;
PROC_QUEUE waitProcs;
int holders[MAX_LOCKTYPES];
int nHolding;
int activeHolders[MAX_LOCKTYPES];
int nActive;
} LOCK;
#define LockGetLock_nHolders(l) ((l)->nHolding)
#define LockDecrWaitHolders(lock, lockt) \
do { \
	(lock)->nHolding--; \
	(lock)->holders[lockt]--; \
} while (0)
#define LockLockTable() SpinAcquire(LockMgrLock);
#define UnlockLockTable() SpinRelease(LockMgrLock);
extern SPINLOCK LockMgrLock;
/*
* function prototypes
*/
extern void InitLocks(void);
extern void LockDisable(int status);
extern LockTableId LockTabInit(char *tabName, MASK *conflictsP, int *prioP,
int ntypes);
extern LockTableId LockTabRename(LockTableId tableId);
extern bool LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
extern int LockResolveConflicts(LOCKTAB *ltable, LOCK *lock, LOCKT lockt,
TransactionId xid);
extern int WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock,
LOCKT lockt);
extern bool LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
extern void GrantLock(LOCK *lock, LOCKT lockt);
extern bool LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue);
extern int LockShmemSize(void);
extern bool LockingDisabled(void);
#endif /* LOCK_H_ */

64
src/backend/storage/multilev.h Normal file
@@ -0,0 +1,64 @@
/*-------------------------------------------------------------------------
*
* multilev.h--
* multi level lock table consts/defs for single.c and multi.c and their
* clients
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: multilev.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef MULTILEV_H
#define MULTILEV_H
#include "storage/lock.h"
#include "storage/lmgr.h"
#define READ_LOCK 2
#define WRITE_LOCK 1
/* Any time a small-granularity READ/WRITE lock is set,
 * higher-granularity READ_INTENT/WRITE_INTENT locks must
 * also be set. A read intent lock has the value READ+INTENT
 * in this implementation.
*/
#define NO_LOCK 0
#define INTENT 2
#define READ_INTENT (READ_LOCK+INTENT)
#define WRITE_INTENT (WRITE_LOCK+INTENT)
#define EXTEND_LOCK 5
#define SHORT_TERM 1
#define LONG_TERM 2
#define UNLOCK 0
#define N_LEVELS 3
#define RELN_LEVEL 0
#define PAGE_LEVEL 1
#define TUPLE_LEVEL 2
typedef int LOCK_LEVEL;
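/*
 * Illustrative sketch (not part of the original header): the intent
 * modes are derived arithmetically from the base modes, so the mode a
 * coarser level needs can be computed rather than looked up.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static LOCKT
example_intent_mode(LOCKT lockt)
{
	/* READ_LOCK -> READ_INTENT, WRITE_LOCK -> WRITE_INTENT */
	return (lockt + INTENT);
}
#endif /* EXAMPLE_ONLY */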
/* multi.c */
extern LockTableId MultiTableId;
extern LockTableId ShortTermTableId;
/*
* function prototypes
*/
extern LockTableId InitMultiLevelLockm(void);
extern bool MultiLockReln(LockInfo linfo, LOCKT lockt);
extern bool MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiAcquire(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
LOCK_LEVEL level);
extern bool MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiReleaseReln(LockInfo linfo, LOCKT lockt);
extern bool MultiRelease(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
LOCK_LEVEL level);
#endif /* MULTILEV_H */

60
src/backend/storage/off.h Normal file
@@ -0,0 +1,60 @@
/*-------------------------------------------------------------------------
*
* off.h--
* POSTGRES disk "offset" definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: off.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef OFF_H
#define OFF_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "storage/itemid.h"
/*
* OffsetNumber:
*
* this is a 1-based index into the linp (ItemIdData) array in the
* header of each disk page.
*/
typedef uint16 OffsetNumber;
#define InvalidOffsetNumber ((OffsetNumber) 0)
#define FirstOffsetNumber ((OffsetNumber) 1)
#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
#define OffsetNumberMask (0xffff) /* valid uint16 bits */
/* ----------------
* support macros
* ----------------
*/
/*
* OffsetNumberIsValid --
* True iff the offset number is valid.
*/
#define OffsetNumberIsValid(offsetNumber) \
((bool) ((offsetNumber != InvalidOffsetNumber) && \
(offsetNumber <= MaxOffsetNumber)))
/*
* OffsetNumberNext --
* OffsetNumberPrev --
* Increments/decrements the argument. These macros look pointless
* but they help us disambiguate the different manipulations on
 * OffsetNumbers (e.g., sometimes we subtract one from an
* OffsetNumber to move back, and sometimes we do so to form a
* real C array index).
*/
#define OffsetNumberNext(offsetNumber) \
((OffsetNumber) (1 + (offsetNumber)))
#define OffsetNumberPrev(offsetNumber) \
((OffsetNumber) (-1 + (offsetNumber)))
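/*
 * Illustrative sketch (not part of the original header): walking a
 * contiguous range of offset numbers with the macros above.  The
 * function name is hypothetical; "maxoff" would normally come from
 * PageGetMaxOffsetNumber() in bufpage.c.
 */
#ifdef EXAMPLE_ONLY
static void
example_walk_offsets(OffsetNumber maxoff)
{
	OffsetNumber offnum;

	for (offnum = FirstOffsetNumber;
	     offnum <= maxoff;
	     offnum = OffsetNumberNext(offnum)) {
		/* ... process the line pointer at "offnum" ... */
	}
}
#endif /* EXAMPLE_ONLY */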
#endif /* OFF_H */

26
src/backend/storage/page.h Normal file
@@ -0,0 +1,26 @@
/*-------------------------------------------------------------------------
*
* page.h--
* POSTGRES buffer page abstraction definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: page.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef PAGE_H
#define PAGE_H
#include "c.h"
typedef Pointer Page;
/*
* PageIsValid --
* True iff page is valid.
*/
#define PageIsValid(page) PointerIsValid(page)
#endif /* PAGE_H */

16
src/backend/storage/page/Makefile.inc Normal file
@@ -0,0 +1,16 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/page
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/page/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= bufpage.c itemptr.c

519
src/backend/storage/page/bufpage.c Normal file
@@ -0,0 +1,519 @@
/*-------------------------------------------------------------------------
*
* bufpage.c--
* POSTGRES standard buffer page code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/page/bufpage.c,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/types.h>
#include <sys/file.h>
#include "c.h"
#include "storage/item.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/memutils.h"
#include "storage/bufpage.h"
#include "lib/qsort.h"
static bool PageManagerShuffle = true; /* default is shuffle mode */
/* ----------------------------------------------------------------
* Buffer support functions
* ----------------------------------------------------------------
*/
/*
* BufferGetPageSize --
* Returns the page size within a buffer.
*
* Notes:
* Assumes buffer is valid.
*
* The buffer can be a raw disk block and need not contain a valid
* (formatted) disk page.
*/
Size
BufferGetPageSize(Buffer buffer)
{
Size pageSize;
Assert(BufferIsValid(buffer));
pageSize = BLCKSZ; /* XXX dig out of buffer descriptor */
Assert(PageSizeIsValid(pageSize));
return (pageSize);
}
/*
* BufferGetPage --
* Returns the page associated with a buffer.
*/
Page
BufferGetPage(Buffer buffer)
{
return (Page) BufferGetBlock(buffer);
}
/* ----------------------------------------------------------------
* Page support functions
* ----------------------------------------------------------------
*/
/*
* PageInit --
* Initializes the contents of a page.
*/
void
PageInit(Page page, Size pageSize, Size specialSize)
{
PageHeader p = (PageHeader) page;
Assert(pageSize == BLCKSZ);
Assert(pageSize >
specialSize + sizeof(PageHeaderData) - sizeof(ItemIdData));
specialSize = DOUBLEALIGN(specialSize);
p->pd_lower = sizeof(PageHeaderData) - sizeof(ItemIdData);
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
PageSetPageSize(page, pageSize);
}
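/*
 * Illustrative sketch (not part of the original file): the state
 * PageInit() leaves behind.  On a fresh page the linp array is empty,
 * so pd_lower points at its (empty) start and pd_upper == pd_special
 * == pageSize - DOUBLEALIGN(specialSize).  The function name is
 * hypothetical.
 */
#ifdef EXAMPLE_ONLY
static void
example_check_fresh_page(Page page)
{
	PageHeader p = (PageHeader) page;

	Assert(p->pd_lower == sizeof(PageHeaderData) - sizeof(ItemIdData));
	Assert(p->pd_upper == p->pd_special);
}
#endif /* EXAMPLE_ONLY */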
/*
* PageGetItem --
* Retrieves an item on the given page.
*
* Note:
 * This does not change the status of any of the resources passed.
* The semantics may change in the future.
*/
Item
PageGetItem(Page page, ItemId itemId)
{
Item item;
Assert(PageIsValid(page));
Assert((*itemId).lp_flags & LP_USED);
item = (Item)(((char *)page) + (*itemId).lp_off);
return (item);
}
/*
* PageAddItem --
* Adds item to the given page.
*
* Note:
* This does not assume that the item resides on a single page.
 * It is the responsibility of the caller to act appropriately
* depending on this fact. The "pskip" routines provide a
* friendlier interface, in this case.
*
 * This does not change the status of any of the resources passed.
* The semantics may change in the future.
*
* This routine should probably be combined with others?
*/
/* ----------------
* PageAddItem
*
* add an item to a page.
*
* Notes on interface:
* If offsetNumber is valid, shuffle ItemId's down to make room
* to use it, if PageManagerShuffle is true. If PageManagerShuffle is
* false, then overwrite the specified ItemId. (PageManagerShuffle is
* true by default, and is modified by calling PageManagerModeSet.)
* If offsetNumber is not valid, then assign one by finding the first
* one that is both unused and deallocated.
*
* NOTE: If offsetNumber is valid, and PageManagerShuffle is true, it
* is assumed that there is room on the page to shuffle the ItemId's
* down by one.
* ----------------
*/
OffsetNumber
PageAddItem(Page page,
Item item,
Size size,
OffsetNumber offsetNumber,
ItemIdFlags flags)
{
	register int i;
Size alignedSize;
Offset lower;
Offset upper;
ItemId itemId;
ItemId fromitemId, toitemId;
OffsetNumber limit;
bool shuffled = false;
/*
* Find first unallocated offsetNumber
*/
limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
/* was offsetNumber passed in? */
if (OffsetNumberIsValid(offsetNumber)) {
if (PageManagerShuffle == true) {
/* shuffle ItemId's (Do the PageManager Shuffle...) */
for (i = (limit - 1); i >= offsetNumber; i--) {
fromitemId = &((PageHeader)page)->pd_linp[i - 1];
toitemId = &((PageHeader)page)->pd_linp[i];
*toitemId = *fromitemId;
}
shuffled = true; /* need to increase "lower" */
} else { /* overwrite mode */
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
if (((*itemId).lp_flags & LP_USED) ||
((*itemId).lp_len != 0)) {
elog(WARN, "PageAddItem: tried overwrite of used ItemId");
return (InvalidOffsetNumber);
}
}
} else { /* offsetNumber was not passed in, so find one */
/* look for "recyclable" (unused & deallocated) ItemId */
for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) {
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
if ((((*itemId).lp_flags & LP_USED) == 0) &&
((*itemId).lp_len == 0))
break;
}
}
if (offsetNumber > limit)
lower = (Offset) (((char *) (&((PageHeader)page)->pd_linp[offsetNumber])) - ((char *) page));
else if (offsetNumber == limit || shuffled == true)
lower = ((PageHeader)page)->pd_lower + sizeof (ItemIdData);
else
lower = ((PageHeader)page)->pd_lower;
alignedSize = DOUBLEALIGN(size);
upper = ((PageHeader)page)->pd_upper - alignedSize;
if (lower > upper) {
return (InvalidOffsetNumber);
}
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
(*itemId).lp_off = upper;
(*itemId).lp_len = size;
(*itemId).lp_flags = flags;
memmove((char *)page + upper, item, size);
((PageHeader)page)->pd_lower = lower;
((PageHeader)page)->pd_upper = upper;
return (offsetNumber);
}
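/*
 * Illustrative sketch (not part of the original file): a minimal
 * PageAddItem() call.  Passing InvalidOffsetNumber lets the routine
 * pick the first recyclable line pointer itself; LP_USED marks the
 * new entry as in use.  The function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static OffsetNumber
example_add_item(Page page, Item item, Size size)
{
	OffsetNumber offnum;

	offnum = PageAddItem(page, item, size, InvalidOffsetNumber, LP_USED);
	if (offnum == InvalidOffsetNumber)
		elog(WARN, "example_add_item: no room left on page");
	return (offnum);
}
#endif /* EXAMPLE_ONLY */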
/*
* PageGetTempPage --
* Get a temporary page in local memory for special processing
*/
Page
PageGetTempPage(Page page, Size specialSize)
{
Size pageSize;
Size size;
Page temp;
PageHeader thdr;
pageSize = PageGetPageSize(page);
if ((temp = (Page) palloc(pageSize)) == (Page) NULL)
elog(FATAL, "Cannot allocate %d bytes for temp page.", pageSize);
thdr = (PageHeader) temp;
/* copy old page in */
memmove(temp, page, pageSize);
/* clear out the middle */
size = (pageSize - sizeof(PageHeaderData)) + sizeof(ItemIdData);
size -= DOUBLEALIGN(specialSize);
memset((char *) &(thdr->pd_linp[0]), 0, size);
/* set high, low water marks */
thdr->pd_lower = sizeof (PageHeaderData) - sizeof (ItemIdData);
thdr->pd_upper = pageSize - DOUBLEALIGN(specialSize);
return (temp);
}
/*
* PageRestoreTempPage --
* Copy temporary page back to permanent page after special processing
* and release the temporary page.
*/
void
PageRestoreTempPage(Page tempPage, Page oldPage)
{
Size pageSize;
pageSize = PageGetPageSize(tempPage);
memmove((char *) oldPage, (char *) tempPage, pageSize);
pfree(tempPage);
}
/*
* PageGetMaxOffsetNumber --
* Returns the maximum offset number used by the given page.
*
 * NOTE: The offset is invalid if the page is empty.
* Test whether PageIsEmpty before calling this routine
* and/or using its return value.
*/
OffsetNumber
PageGetMaxOffsetNumber(Page page)
{
LocationIndex low;
OffsetNumber i;
low = ((PageHeader) page)->pd_lower;
i = (low - (sizeof(PageHeaderData) - sizeof(ItemIdData)))
/ sizeof(ItemIdData);
return(i);
}
/* ----------------
* itemid stuff for PageRepairFragmentation
* ----------------
*/
struct itemIdSortData {
int offsetindex; /* linp array index */
ItemIdData itemiddata;
};
/*
 * Sort in descending lp_off order, so the tuple stored highest on the
 * page is moved first when the page is compacted.
 */
static int
itemidcompare(struct itemIdSortData *itemidp1, struct itemIdSortData *itemidp2)
{
if (itemidp1->itemiddata.lp_off == itemidp2->itemiddata.lp_off)
return(0);
else if (itemidp1->itemiddata.lp_off < itemidp2->itemiddata.lp_off)
return(1);
else
return(-1);
}
/*
* PageRepairFragmentation --
* Frees fragmented space on a page.
*/
void
PageRepairFragmentation(Page page)
{
int i;
struct itemIdSortData *itemidbase, *itemidptr;
ItemId lp;
int nline, nused;
int itemidcompare();
Offset upper;
Size alignedSize;
nline = (int16) PageGetMaxOffsetNumber(page);
nused = 0;
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_flags & LP_USED)
nused++;
}
if (nused == 0) {
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_len > 0) /* unused, but allocated */
(*lp).lp_len = 0; /* indicate unused & deallocated */
}
((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
} else { /* nused != 0 */
itemidbase = (struct itemIdSortData *)
palloc(sizeof(struct itemIdSortData) * nused);
memset((char *) itemidbase, 0, sizeof(struct itemIdSortData) * nused);
itemidptr = itemidbase;
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_flags & LP_USED) {
itemidptr->offsetindex = i;
itemidptr->itemiddata = *lp;
itemidptr++;
} else {
if ((*lp).lp_len > 0) /* unused, but allocated */
(*lp).lp_len = 0; /* indicate unused & deallocated */
}
}
/* sort itemIdSortData array...*/
pg_qsort((char *) itemidbase, nused, sizeof(struct itemIdSortData),
(void*) itemidcompare);
/* compactify page */
((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
for (i=0, itemidptr = itemidbase; i<nused; i++, itemidptr++) {
lp = ((PageHeader)page)->pd_linp + itemidptr->offsetindex;
alignedSize = DOUBLEALIGN((*lp).lp_len);
upper = ((PageHeader)page)->pd_upper - alignedSize;
memmove((char *) page + upper,
(char *)page + (*lp).lp_off,
(*lp).lp_len);
(*lp).lp_off = upper;
((PageHeader)page)->pd_upper = upper;
}
pfree(itemidbase);
}
}
/*
* PageGetFreeSpace --
* Returns the size of the free (allocatable) space on a page.
*/
Size
PageGetFreeSpace(Page page)
{
Size space;
space = ((PageHeader)page)->pd_upper - ((PageHeader)page)->pd_lower;
if (space < sizeof (ItemIdData)) {
return (0);
}
space -= sizeof (ItemIdData); /* XXX not always true */
return (space);
}
/*
* PageManagerModeSet --
*
* Sets mode to either: ShufflePageManagerMode (the default) or
* OverwritePageManagerMode. For use by access methods code
* for determining semantics of PageAddItem when the offsetNumber
* argument is passed in.
*/
void
PageManagerModeSet(PageManagerMode mode)
{
if (mode == ShufflePageManagerMode)
PageManagerShuffle = true;
else if (mode == OverwritePageManagerMode)
PageManagerShuffle = false;
}
/*
*----------------------------------------------------------------
* PageIndexTupleDelete
*----------------------------------------------------------------
*
* This routine does the work of removing a tuple from an index page.
*/
void
PageIndexTupleDelete(Page page, OffsetNumber offnum)
{
PageHeader phdr;
char *addr;
ItemId tup;
Size size;
char *locn;
int nbytes;
int offidx;
phdr = (PageHeader) page;
/* change offset number to offset index */
offidx = offnum - 1;
tup = PageGetItemId(page, offnum);
size = ItemIdGetLength(tup);
size = DOUBLEALIGN(size);
/* location of deleted tuple data */
locn = (char *) (page + ItemIdGetOffset(tup));
/*
* First, we want to get rid of the pd_linp entry for the index
* tuple. We copy all subsequent linp's back one slot in the
* array.
*/
nbytes = phdr->pd_lower -
((char *)&phdr->pd_linp[offidx + 1] - (char *) phdr);
memmove((char *) &(phdr->pd_linp[offidx]),
(char *) &(phdr->pd_linp[offidx + 1]),
nbytes);
/*
* Now move everything between the old upper bound (beginning of tuple
* space) and the beginning of the deleted tuple forward, so that
* space in the middle of the page is left free. If we've just deleted
* the tuple at the beginning of tuple space, then there's no need
* to do the copy (and bcopy on some architectures SEGV's if asked
* to move zero bytes).
*/
/* beginning of tuple space */
addr = (char *) (page + phdr->pd_upper);
if (locn != addr)
memmove(addr + size, addr, (int) (locn - addr));
/* adjust free space boundary pointers */
phdr->pd_upper += size;
phdr->pd_lower -= sizeof (ItemIdData);
/* finally, we need to adjust the linp entries that remain */
if (!PageIsEmpty(page))
PageIndexTupleDeleteAdjustLinePointers(phdr, locn, size);
}
/*
*----------------------------------------------------------------
* PageIndexTupleDeleteAdjustLinePointers
*----------------------------------------------------------------
*
* Once the line pointers and tuple data have been shifted around
* on the page, we need to go down the line pointer vector and
* adjust pointers to reflect new locations. Anything that used
* to be before the deleted tuple's data was moved forward by the
* size of the deleted tuple.
*
* This routine does the work of adjusting the line pointers.
* Location is where the tuple data used to lie; size is how
* much space it occupied. We assume that size has been aligned
* as required by the time we get here.
*
* This routine should never be called on an empty page.
*/
void
PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
char *location,
Size size)
{
int i;
/* location is an index into the page... */
location -= (int) phdr;
for (i = PageGetMaxOffsetNumber((Page) phdr) - 1; i >= 0; i--) {
if (phdr->pd_linp[i].lp_off <= (unsigned) location) {
phdr->pd_linp[i].lp_off += size;
}
}
}

40
src/backend/storage/page/itemptr.c Normal file
@@ -0,0 +1,40 @@
/*-------------------------------------------------------------------------
*
* itemptr.c--
* POSTGRES disk item pointer code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/page/itemptr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "storage/block.h"
#include "storage/off.h"
#include "storage/itemptr.h"
#include "storage/bufpage.h"
/*
* ItemPointerEquals --
* Returns true if both item pointers point to the same item,
* otherwise returns false.
*
* Note:
* Assumes that the disk item pointers are not NULL.
*/
bool
ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
{
if (ItemPointerGetBlockNumber(pointer1) ==
ItemPointerGetBlockNumber(pointer2) &&
ItemPointerGetOffsetNumber(pointer1) ==
ItemPointerGetOffsetNumber(pointer2))
return(true);
else
return(false);
}

33
src/backend/storage/pagenum.h Normal file
@@ -0,0 +1,33 @@
/*-------------------------------------------------------------------------
*
* pagenum.h--
* POSTGRES page number definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: pagenum.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef PAGENUM_H
#define PAGENUM_H
#include "c.h"
#include "storage/page.h"
typedef uint16 PageNumber;
typedef uint32 LogicalPageNumber;
#define InvalidLogicalPageNumber 0
/*
* LogicalPageNumberIsValid --
* True iff the logical page number is valid.
*/
#define LogicalPageNumberIsValid(pageNumber) \
((bool)((pageNumber) != InvalidLogicalPageNumber))
#endif /* PAGENUM_H */

64
src/backend/storage/pos.h Normal file
@@ -0,0 +1,64 @@
/*-------------------------------------------------------------------------
*
* pos.h--
* POSTGRES "position" definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: pos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef POS_H
#define POS_H
#include "c.h"
/*
* a 'position' used to be <pagenumber, offset> in postgres. this has
* been changed to just <offset> as the notion of having multiple pages
* within a block has been removed.
*
* the 'offset' abstraction is somewhat confusing. it is NOT a byte
* offset within the page; instead, it is an offset into the line
 * pointer array contained on every page that stores (heap or index)
* tuples.
*/
typedef bits16 PositionIdData;
typedef PositionIdData *PositionId;
/* ----------------
* support macros
* ----------------
*/
/*
* PositionIdIsValid --
* True iff the position identifier is valid.
*/
#define PositionIdIsValid(positionId) \
PointerIsValid(positionId)
/*
* PositionIdSetInvalid --
* Make an invalid position.
*/
#define PositionIdSetInvalid(positionId) \
*(positionId) = (bits16) 0
/*
* PositionIdSet --
* Sets a position identifier to the specified value.
*/
#define PositionIdSet(positionId, offsetNumber) \
*(positionId) = (offsetNumber)
/*
* PositionIdGetOffsetNumber --
* Retrieve the offset number from a position identifier.
*/
#define PositionIdGetOffsetNumber(positionId) \
((OffsetNumber) *(positionId))
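/*
 * Illustrative sketch (not part of the original header): round-tripping
 * an offset number through a PositionIdData.  The function name is
 * hypothetical; OffsetNumber is declared in storage/off.h.
 */
#ifdef EXAMPLE_ONLY
static OffsetNumber
example_position_roundtrip(OffsetNumber offnum)
{
	PositionIdData pos;

	PositionIdSet(&pos, offnum);
	return (PositionIdGetOffsetNumber(&pos));
}
#endif /* EXAMPLE_ONLY */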
#endif /* POS_H */

127
src/backend/storage/proc.h Normal file
@@ -0,0 +1,127 @@
/*-------------------------------------------------------------------------
*
* proc.h--
*
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: proc.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef _PROC_H_
#define _PROC_H_
#include "storage/ipc.h"
#include "storage/lock.h"
#ifndef WIN32
#include <sys/sem.h>
#else
/* This is because WIN32 already defines PROC */
#define PROC PGL_PROC
#endif /* WIN32 */
#include "storage/shmem.h"
typedef struct {
int sleeplock;
int semNum;
IpcSemaphoreId semId;
IpcSemaphoreKey semKey;
} SEMA;
/*
* Each backend has:
*/
typedef struct proc {
/* proc->links MUST BE THE FIRST ELEMENT OF STRUCT (see ProcWakeup()) */
SHM_QUEUE links; /* proc can be waiting for one event(lock) */
SEMA sem; /* ONE semaphore to sleep on */
int errType; /* error code tells why we woke up */
int procId; /* unique number for this structure
* NOT unique per backend, these things
* are reused after the backend dies.
*/
int critSects; /* If critSects > 0, we are in sensitive
* routines that cannot be recovered when
* the process fails.
*/
int prio; /* priority for sleep queue */
TransactionId xid; /* transaction currently being executed
* by this proc
*/
LOCK * waitLock; /* Lock we're sleeping on */
int token; /* info for proc wakeup routines */
int pid; /* This procs process id */
short sLocks[MAX_SPINS]; /* Spin lock stats */
SHM_QUEUE lockQueue; /* locks associated with current transaction */
} PROC;
/*
* MAX_PROC_SEMS is the maximum number of per-process semaphores (those used
* by the lock mgr) we can keep track of. PROC_NSEMS_PER_SET is the number
* of semaphores in each (sys-V) semaphore set allocated. (Be careful not
 * to set it to greater than 32. Otherwise, the bitmap will overflow.)
*/
#define MAX_PROC_SEMS 128
#define PROC_NSEMS_PER_SET 16
typedef struct procglobal {
SHMEM_OFFSET freeProcs;
int numProcs;
IPCKey currKey;
int32 freeSemMap[MAX_PROC_SEMS/PROC_NSEMS_PER_SET];
} PROC_HDR;
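/*
 * Illustrative sketch (not part of the original header): how the
 * freeSemMap bitmap could be consulted.  One int32 covers one
 * semaphore set of PROC_NSEMS_PER_SET semaphores; the exact bit
 * assignment here is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static bool
example_sem_in_use(PROC_HDR *procHdr, int semNum)
{
	int32 word = procHdr->freeSemMap[semNum / PROC_NSEMS_PER_SET];

	return ((word & (1 << (semNum % PROC_NSEMS_PER_SET))) != 0);
}
#endif /* EXAMPLE_ONLY */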
extern PROC *MyProc;
#define PROC_INCR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])++
#define PROC_DECR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])--
/*
* flags explaining why process woke up
*/
#define NO_ERROR 0
#define ERR_TIMEOUT 1
#define ERR_BUFFER_IO 2
#define MAX_PRIO 50
#define MIN_PRIO (-1)
extern SPINLOCK ProcStructLock;
/*
* Function Prototypes
*/
extern void InitProcess(IPCKey key);
extern void ProcReleaseLocks(void);
extern bool ProcRemove(int pid);
/* extern bool ProcKill(int exitStatus, int pid); */
/* make static in storage/lmgr/proc.c -- jolly */
extern PROC_QUEUE *ProcQueueAlloc(char *name);
extern void ProcQueueInit(PROC_QUEUE *queue);
extern int ProcSleep(PROC_QUEUE *queue, SPINLOCK spinlock, int token,
int prio, LOCK *lock);
extern PROC *ProcWakeup(PROC *proc, int errType);
extern int ProcGetId(void);
extern int ProcLockWakeup(PROC_QUEUE *queue, char * ltable, char * lock);
extern void ProcAddLock(SHM_QUEUE *elem);
#if defined(PORTNAME_linux)
extern int HandleDeadLock(int);
#else
extern int HandleDeadLock(void);
#endif
extern void ProcReleaseSpins(PROC *proc);
extern void ProcFreeAllSemaphores(void);
#endif /* _PROC_H_ */

104
src/backend/storage/shmem.h Normal file
@@ -0,0 +1,104 @@
/*-------------------------------------------------------------------------
*
* shmem.h--
* shared memory management structures
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: shmem.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SHMEM_H
#define SHMEM_H
#include "storage/spin.h" /* for SPINLOCK */
#include "utils/hsearch.h" /* for HTAB */
/* The shared memory region can start at a different address
* in every process. Shared memory "pointers" are actually
* offsets relative to the start of the shared memory region(s).
*/
typedef unsigned long SHMEM_OFFSET;
#define INVALID_OFFSET (-1)
#define BAD_LOCATION (-1)
/* start of the lowest shared memory region. For now, assume that
* there is only one shared memory region
*/
extern SHMEM_OFFSET ShmemBase;
/* coerce an offset into a pointer in this process's address space */
#define MAKE_PTR(xx_offs)\
(ShmemBase+((unsigned long)(xx_offs)))
/* coerce a pointer into a shmem offset */
#define MAKE_OFFSET(xx_ptr)\
(SHMEM_OFFSET) (((unsigned long)(xx_ptr))-ShmemBase)
#define SHM_PTR_VALID(xx_ptr)\
(((unsigned long)xx_ptr) > ShmemBase)
/* cannot have an offset to ShmemFreeStart (offset 0) */
#define SHM_OFFSET_VALID(xx_offs)\
((xx_offs != 0) && (xx_offs != INVALID_OFFSET))
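/*
 * Illustrative sketch (not part of the original header): converting a
 * pointer inside the shared region to an offset and back.  Assumes the
 * pointer really lies within the region, as the macros require.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static long *
example_offset_roundtrip(long *shmPtr)
{
	SHMEM_OFFSET offs = MAKE_OFFSET(shmPtr);

	Assert(SHM_OFFSET_VALID(offs));
	return ((long *) MAKE_PTR(offs));
}
#endif /* EXAMPLE_ONLY */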
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
/* shmemqueue.c */
typedef struct SHM_QUEUE {
SHMEM_OFFSET prev;
SHMEM_OFFSET next;
} SHM_QUEUE;
/* shmem.c */
extern void ShmemBindingTabReset();
extern void ShmemCreate(unsigned int key, unsigned int size);
extern int InitShmem(unsigned int key, unsigned int size);
extern long *ShmemAlloc(unsigned long size);
extern int ShmemIsValid(unsigned long addr);
extern HTAB *ShmemInitHash(char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr);
extern SHMEM_OFFSET ShmemPIDDestroy(int pid);
extern long *ShmemInitStruct(char *name, unsigned long size,
bool *foundPtr);
typedef int TableID;
/* size constants for the binding table */
/* max size of data structure string name */
#define BTABLE_KEYSIZE (50)
/* data in binding table hash bucket */
#define BTABLE_DATASIZE (sizeof(BindingEnt) - BTABLE_KEYSIZE)
/* maximum size of the binding table */
#define BTABLE_SIZE (100)
/* this is a hash bucket in the binding table */
typedef struct {
char key[BTABLE_KEYSIZE]; /* string name */
unsigned long location; /* location in shared mem */
unsigned long size; /* numbytes allocated for the
* structure
*/
} BindingEnt;
/*
* prototypes for functions in shmqueue.c
*/
extern void SHMQueueInit(SHM_QUEUE *queue);
extern bool SHMQueueIsDetached(SHM_QUEUE *queue);
extern void SHMQueueElemInit(SHM_QUEUE *queue);
extern void SHMQueueDelete(SHM_QUEUE *queue);
extern void SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem);
extern void SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem);
extern void SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr,
SHM_QUEUE *nextQueue);
extern bool SHMQueueEmpty(SHM_QUEUE *queue);
#endif /* SHMEM_H */

33
src/backend/storage/sinval.h Normal file
@@ -0,0 +1,33 @@
/*-------------------------------------------------------------------------
*
* sinval.h--
* POSTGRES shared cache invalidation communication definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: sinval.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SINVAL_H
#define SINVAL_H
#include "c.h"
#include "storage/spin.h"
#include "storage/ipc.h"
#include "storage/itemptr.h"
#include "storage/backendid.h"
extern SPINLOCK SInvalLock;
extern void CreateSharedInvalidationState(IPCKey key);
extern void AttachSharedInvalidationState(IPCKey key);
extern void InitSharedInvalidationState();
extern void RegisterSharedInvalid(int cacheId, Index hashIndex,
ItemPointer pointer);
extern void InvalidateSharedInvalid(void (*invalFunction)(),
void (*resetFunction)());
#endif /* SINVAL_H */

126
src/backend/storage/sinvaladt.h Normal file
@@ -0,0 +1,126 @@
/*-------------------------------------------------------------------------
*
* sinvaladt.h--
* POSTGRES shared cache invalidation segment definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: sinvaladt.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SINVALADT_H
#define SINVALADT_H
#include "postgres.h" /* XXX */
#include "storage/ipc.h"
#include "storage/itemptr.h"
#include "storage/sinval.h"
/*
 * The structure of the shared cache invalidation segment
*
*/
/*
A------------- Header info --------------
criticalSectionSemaphoreId
generalSemaphoreId
startEntrySection (offset a)
endEntrySection (offset a + b)
startFreeSpace (offset relative to B)
startEntryChain (offset relative to B)
endEntryChain (offset relative to B)
numEntries
maxNumEntries
procState[MaxBackendId] --> limit
resetState (bool)
a tag (POSTID)
B------------- Start entry section -------
SISegEntry --> entryData --> ... (see SharedInvalidData!)
isfree (bool)
next (offset to next entry in chain )
b .... (dynamically growing down)
C----------------End shared segment -------
*/
/* Parameters (configurable) *******************************************/
#define MaxBackendId 32 /* maximum number of backends */
#define MAXNUMMESSAGES 1000 /* maximum number of messages in seg*/
#define InvalidOffset 1000000000 /* an invalid offset (End of chain) */
typedef struct ProcState {
int limit; /* the number of read messages */
bool resetState; /* true, if backend has to reset its state */
int tag; /* special tag, received from the postmaster */
} ProcState;
typedef struct SISeg {
IpcSemaphoreId criticalSectionSemaphoreId; /* semaphore id */
IpcSemaphoreId generalSemaphoreId; /* semaphore id */
Offset startEntrySection; /* (offset a) */
Offset endEntrySection; /* (offset a + b) */
Offset startFreeSpace; /* (offset relative to B) */
Offset startEntryChain; /* (offset relative to B) */
Offset endEntryChain; /* (offset relative to B) */
int numEntries;
int maxNumEntries;
ProcState procState[MaxBackendId]; /* reflects the invalidation state */
/* here starts the entry section, controlled by offsets */
} SISeg;
#define SizeSISeg sizeof(SISeg)
typedef struct SharedInvalidData {
int cacheId; /* XXX */
Index hashIndex;
ItemPointerData pointerData;
} SharedInvalidData;
typedef SharedInvalidData *SharedInvalid;
typedef struct SISegEntry {
SharedInvalidData entryData; /* the message data */
bool isfree; /* entry free? */
Offset next; /* offset to next entry*/
} SISegEntry;
#define SizeOfOneSISegEntry sizeof(SISegEntry)
typedef struct SISegOffsets {
Offset startSegment; /* always 0 (for now) */
Offset offsetToFirstEntry; /* A + a = B */
Offset offsetToEndOfSegemnt; /* A + a + b */
} SISegOffsets;
/****************************************************************************/
/* synchronization of the shared buffer access */
/* access to the buffer is synchronized by the lock manager !! */
/****************************************************************************/
#define SI_LockStartValue 255
#define SI_SharedLock (-1)
#define SI_ExclusiveLock (-255)
extern SISeg *shmInvalBuffer;
/*
* prototypes for functions in sinvaladt.c
*/
extern int SIBackendInit(SISeg *segInOutP);
extern int SISegmentInit(bool killExistingSegment, IPCKey key);
extern bool SISetDataEntry(SISeg *segP, SharedInvalidData *data);
extern void SISetProcStateInvalid(SISeg *segP);
extern bool SIDelDataEntry(SISeg *segP);
extern void SIReadEntryData(SISeg *segP, int backendId,
void (*invalFunction)(), void (*resetFunction)());
extern void SIDelExpiredDataEntries(SISeg *segP);
#endif /* SINVALADT_H */

84
src/backend/storage/smgr.h Normal file
@@ -0,0 +1,84 @@
/*-------------------------------------------------------------------------
*
* smgr.h--
* storage manager switch public interface declarations.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: smgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SMGR_H
#define SMGR_H
#include "utils/rel.h"
#include "storage/spin.h" /* for SPINLOCK */
#define SM_FAIL 0
#define SM_SUCCESS 1
#define DEFAULT_SMGR 0
extern int smgrinit(void);
extern void smgrshutdown(int dummy);
extern int smgrcreate(int16 which, Relation reln);
extern int smgrunlink(int16 which, Relation reln);
extern int smgrextend(int16 which, Relation reln, char *buffer);
extern int smgropen(int16 which, Relation reln);
extern int smgrclose(int16 which, Relation reln);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
Oid relid, BlockNumber blkno, char *buffer);
extern int smgrnblocks(int16 which, Relation reln);
extern int smgrcommit(void);
extern int smgrabort(void);
extern bool smgriswo(int16 smgrno);
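/*
 * Illustrative sketch (not part of the original header): calling
 * through the storage manager switch.  Every routine reports SM_FAIL
 * or SM_SUCCESS, so callers check the return code.  The function name
 * is hypothetical; "buffer" must hold BLCKSZ bytes.
 */
#ifdef EXAMPLE_ONLY
static int
example_read_block(Relation reln, BlockNumber blocknum, char *buffer)
{
	if (smgrread(DEFAULT_SMGR, reln, blocknum, buffer) == SM_FAIL)
		return (SM_FAIL);
	/* ... the block is now in "buffer" ... */
	return (SM_SUCCESS);
}
#endif /* EXAMPLE_ONLY */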
/* internals: move me elsewhere -- ay 7/94 */
/* in md.c */
extern int mdinit(void);
extern int mdcreate(Relation reln);
extern int mdunlink(Relation reln);
extern int mdextend(Relation reln, char *buffer);
extern int mdopen(Relation reln);
extern int mdclose(Relation reln);
extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdnblocks(Relation reln);
extern int mdcommit(void);
extern int mdabort(void);
/* mm.c */
extern SPINLOCK MMCacheLock;
extern int mminit(void);
extern int mmshutdown(void);
extern int mmcreate(Relation reln);
extern int mmunlink(Relation reln);
extern int mmextend(Relation reln, char *buffer);
extern int mmopen(Relation reln);
extern int mmclose(Relation reln);
extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmnblocks(Relation reln);
extern int mmcommit(void);
extern int mmabort(void);
extern int MMShmemSize(void);
#endif /* SMGR_H */

14
src/backend/storage/smgr/Makefile.inc Normal file
@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/smgr
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= md.c mm.c smgr.c smgrtype.c

40
src/backend/storage/smgr/README Normal file
@@ -0,0 +1,40 @@
# $Header: /cvsroot/pgsql/src/backend/storage/smgr/README,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
This directory contains the code that supports the Postgres storage manager
switch and all of the installed storage managers. In released systems,
the only supported storage manager is the magnetic disk manager. At UC
Berkeley, the Sony WORM optical disk jukebox and persistent main memory are
also supported.
As of Postgres Release 3.0, every relation in the system is tagged with the
storage manager on which it resides. The storage manager switch code turns
what used to be filesystem operations into operations on the correct store
for any given relation.
The files in this directory, and their contents, are
smgrtype.c Storage manager type -- maps string names to storage manager
IDs and provides simple comparison operators. This is the
regproc support for type 'smgr' in the system catalogs.
smgr.c The storage manager switch dispatch code. The routines in
this file call the appropriate storage manager to do hardware
accesses requested by the backend.
md.c The magnetic disk storage manager.
mm.c The persistent main memory storage manager (#undef'ed in
tmp/c.h for all distributed systems).
sj.c The sony jukebox storage manager and cache management code
(#undef'ed in tmp/c.h for all distributed systems). The
routines in this file allocate extents, maintain block
maps, and guarantee the persistence and coherency of a cache
of jukebox blocks on magnetic disk.
pgjb.c The postgres jukebox interface routines. The routines here
handle exclusion on the physical device and translate requests
from the storage manager code (sj.c) into jbaccess calls.
jbaccess.c Access code for the physical Sony jukebox device. This code
was swiped from Andy McFadden's jblib.a code at UC Berkeley.

697
src/backend/storage/smgr/md.c Normal file
@@ -0,0 +1,697 @@
/*-------------------------------------------------------------------------
*
* md.c--
* This code manages relations that reside on magnetic disk.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <stdio.h> /* for sprintf() */
#include <sys/file.h>
#include "postgres.h"
#include "miscadmin.h" /* for DataDir */
#include "machine.h"
#include "storage/smgr.h" /* where the declarations go */
#include "storage/block.h"
#include "storage/fd.h"
#include "utils/mcxt.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "catalog/catalog.h"
#undef DIAGNOSTIC
/*
* The magnetic disk storage manager keeps track of open file descriptors
* in its own descriptor pool. This happens for two reasons. First, at
* transaction boundaries, we walk the list of descriptors and flush
* anything that we've dirtied in the current transaction. Second, we
* have to support relations of > 4GBytes. In order to do this, we break
* relations up into chunks of < 2GBytes and store one chunk in each of
* several files that represent the relation.
*/
typedef struct _MdfdVec {
int mdfd_vfd; /* fd number in vfd pool */
uint16 mdfd_flags; /* clean, dirty */
int mdfd_lstbcnt; /* most recent block count */
struct _MdfdVec *mdfd_chain; /* for large relations */
} MdfdVec;
static int Nfds = 100;
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
static int CurFd = 0;
static MemoryContext MdCxt;
#define MDFD_DIRTY (uint16) 0x01
#define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */
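/*
 * Illustrative sketch (not part of the original file): the segment
 * arithmetic the routines below repeat.  A block's segment file and
 * its byte offset within that file both fall out of RELSEG_SIZE.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static void
example_segment_math(int blkno, int *segno, long *seekpos)
{
	*segno = blkno / RELSEG_SIZE;			/* which segment file */
	*seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); /* offset in it */
}
#endif /* EXAMPLE_ONLY */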
/* routines declared here */
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
static int _fdvec_ext(void);
static BlockNumber _mdnblocks(File file, Size blcksz);
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
*
* We keep a private table of all file descriptors. Whenever we do
* a write to one, we mark it dirty in our table. Whenever we force
* changes to disk, we mark the file descriptor clean. At transaction
* commit, we force changes to disk for all dirty file descriptors.
* This routine allocates and initializes the table.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdinit()
{
MemoryContext oldcxt;
MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
if (MdCxt == (MemoryContext) NULL)
return (SM_FAIL);
oldcxt = MemoryContextSwitchTo(MdCxt);
Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
(void) MemoryContextSwitchTo(oldcxt);
if (Md_fdvec == (MdfdVec *) NULL)
return (SM_FAIL);
memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
return (SM_SUCCESS);
}
int
mdcreate(Relation reln)
{
int fd, vfd;
int tmp;
char *path;
extern bool IsBootstrapProcessingMode();
path = relpath(&(reln->rd_rel->relname.data[0]));
fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
/*
* If the file already exists and is empty, we pretend that the
* create succeeded. During bootstrap processing, we skip that check,
* because pg_time, pg_variable, and pg_log get created before their
* .bki file entries are processed.
*/
if (fd < 0) {
if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) {
if (!IsBootstrapProcessingMode() &&
FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
FileClose(fd);
return (-1);
}
}
}
if (CurFd >= Nfds) {
if (_fdvec_ext() == SM_FAIL)
return (-1);
}
Md_fdvec[CurFd].mdfd_vfd = fd;
Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
Md_fdvec[CurFd].mdfd_lstbcnt = 0;
vfd = CurFd++;
return (vfd);
}
/*
* mdunlink() -- Unlink a relation.
*/
int
mdunlink(Relation reln)
{
int fd;
int i;
MdfdVec *v, *ov;
MemoryContext oldcxt;
char fname[20]; /* XXX should have NAMESIZE defined */
char tname[20];
/* On Windows NT you can't unlink a file if it is open so we have
** to do this.
*/
#ifdef WIN32
(void) mdclose(reln);
#endif /* WIN32 */
memset(fname,0,20);
strncpy(fname, RelationGetRelationName(reln)->data, 16);
if (FileNameUnlink(fname) < 0)
return (SM_FAIL);
/* unlink all the overflow files for large relations */
for (i = 1; ; i++) {
#ifdef WIN32
(void) mdclose(reln);
#endif /* WIN32 */
sprintf(tname, "%s.%d", fname, i);
if (FileNameUnlink(tname) < 0)
break;
}
/* finally, clean out the mdfd vector */
fd = RelationGetFile(reln);
Md_fdvec[fd].mdfd_flags = (uint16) 0;
oldcxt = MemoryContextSwitchTo(MdCxt);
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
ov = v;
v = v->mdfd_chain;
if (ov != &Md_fdvec[fd])
pfree(ov);
}
Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
(void) MemoryContextSwitchTo(oldcxt);
return (SM_SUCCESS);
}
/*
* mdextend() -- Add a block to the specified relation.
*
* This routine returns SM_FAIL or SM_SUCCESS, with errno set as
* appropriate.
*/
int
mdextend(Relation reln, char *buffer)
{
long pos;
int nblocks;
MdfdVec *v;
nblocks = mdnblocks(reln);
v = _mdfd_getseg(reln, nblocks, O_CREAT);
if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
return (SM_FAIL);
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
return (SM_FAIL);
/* remember that we did a write, so we can sync at xact commit */
v->mdfd_flags |= MDFD_DIRTY;
/* try to keep the last block count current, though it's just a hint */
if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
v->mdfd_lstbcnt = RELSEG_SIZE;
#ifdef DIAGNOSTIC
if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
|| v->mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big!");
#endif
return (SM_SUCCESS);
}
/*
* mdopen() -- Open the specified relation.
*/
int
mdopen(Relation reln)
{
char *path;
int fd;
int vfd;
if (CurFd >= Nfds) {
if (_fdvec_ext() == SM_FAIL)
return (-1);
}
path = relpath(&(reln->rd_rel->relname.data[0]));
fd = FileNameOpenFile(path, O_RDWR, 0600);
/* this should only happen during bootstrap processing */
if (fd < 0)
fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
Md_fdvec[CurFd].mdfd_vfd = fd;
Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifdef DIAGNOSTIC
if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on relopen!");
#endif
vfd = CurFd++;
return (vfd);
}
/*
* mdclose() -- Close the specified relation.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdclose(Relation reln)
{
int fd;
MdfdVec *v;
fd = RelationGetFile(reln);
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
/* may be closed already */
if (v->mdfd_vfd < 0)
continue;
/*
* We sync the file descriptor so that we don't need to reopen it at
* transaction commit to force changes to disk.
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
return (SM_SUCCESS);
}
/*
* mdread() -- Read the specified block from a relation.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
int nbytes;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
status = SM_SUCCESS;
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) {
if (nbytes == 0) {
memset(buffer, 0, BLCKSZ);
} else {
status = SM_FAIL;
}
}
return (status);
}
/*
* mdwrite() -- Write the supplied block at the appropriate location.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
status = SM_SUCCESS;
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
v->mdfd_flags |= MDFD_DIRTY;
return (status);
}
/*
* mdflush() -- Synchronously write a block to disk.
*
* This is exactly like mdwrite(), but doesn't return until the file
* system buffer cache has been flushed.
*/
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
/* write and sync the block */
status = SM_SUCCESS;
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
|| FileSync(v->mdfd_vfd) < 0)
status = SM_FAIL;
/*
* By here, the block is written and changes have been forced to stable
* storage. Mark the descriptor as clean until the next write, so we
* don't sync it again unnecessarily at transaction commit.
*/
v->mdfd_flags &= ~MDFD_DIRTY;
return (status);
}
/*
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. This
* is a synchronous write.
*/
int
mdblindwrt(char *dbstr,
char *relstr,
Oid dbid,
Oid relid,
BlockNumber blkno,
char *buffer)
{
int fd;
int segno;
long seekpos;
int status;
char *path;
int nchars;
/* be sure we have enough space for the '.segno', if any */
segno = blkno / RELSEG_SIZE;
if (segno > 0)
nchars = 10;
else
nchars = 0;
/* construct the path to the file and open it */
if (dbid == (Oid) 0) {
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
else
sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
} else {
path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
dbstr, NAMEDATALEN, relstr);
else
sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
NAMEDATALEN, relstr, segno);
}
if ((fd = open(path, O_RDWR, 0600)) < 0)
return (SM_FAIL);
/* seek to the right spot */
seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
(void) close(fd);
return (SM_FAIL);
}
status = SM_SUCCESS;
/* write and sync the block */
if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
status = SM_FAIL;
if (close(fd) < 0)
status = SM_FAIL;
pfree(path);
return (status);
}
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
* Returns # of blocks or -1 on error.
*/
int
mdnblocks(Relation reln)
{
int fd;
MdfdVec *v;
int nblocks;
int segno;
fd = RelationGetFile(reln);
v = &Md_fdvec[fd];
#ifdef DIAGNOSTIC
if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
elog(FATAL, "segment too big in getseg!");
#endif
segno = 0;
for (;;) {
if (v->mdfd_lstbcnt == RELSEG_SIZE
|| (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) {
v->mdfd_lstbcnt = RELSEG_SIZE;
segno++;
if (v->mdfd_chain == (MdfdVec *) NULL) {
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
if (v->mdfd_chain == (MdfdVec *) NULL)
elog(WARN, "cannot count blocks for %.16s -- open failed",
RelationGetRelationName(reln));
}
v = v->mdfd_chain;
} else {
return ((segno * RELSEG_SIZE) + nblocks);
}
}
}
/*
* mdcommit() -- Commit a transaction.
*
* All changes to magnetic disk relations must be forced to stable
* storage. This routine makes a pass over the private table of
* file descriptors. Any descriptors to which we have done writes,
* but not synced, are synced here.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdcommit()
{
int i;
MdfdVec *v;
for (i = 0; i < CurFd; i++) {
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
if (v->mdfd_flags & MDFD_DIRTY) {
if (FileSync(v->mdfd_vfd) < 0)
return (SM_FAIL);
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
}
return (SM_SUCCESS);
}
/*
* mdabort() -- Abort a transaction.
*
* Changes need not be forced to disk at transaction abort. We mark
* all file descriptors as clean here. Always returns SM_SUCCESS.
*/
int
mdabort()
{
int i;
MdfdVec *v;
for (i = 0; i < CurFd; i++) {
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
return (SM_SUCCESS);
}
/*
* _fdvec_ext() -- Extend the md file descriptor vector.
*
* The file descriptor vector must be large enough to hold at least
* 'fd' entries.
*/
static
int _fdvec_ext()
{
MdfdVec *nvec;
MemoryContext oldcxt;
Nfds *= 2;
oldcxt = MemoryContextSwitchTo(MdCxt);
nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
memset(nvec, 0, Nfds * sizeof(MdfdVec));
memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));
pfree(Md_fdvec);
(void) MemoryContextSwitchTo(oldcxt);
Md_fdvec = nvec;
return (SM_SUCCESS);
}
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
MemoryContext oldcxt;
MdfdVec *v;
int fd;
bool dofree;
char *path, *fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(RelationGetRelationName(reln)->data);
dofree = false;
if (segno > 0) {
dofree = true;
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%d", path, segno);
} else
fullpath = path;
/* open the file */
fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600);
if (dofree)
pfree(fullpath);
if (fd < 0)
return ((MdfdVec *) NULL);
/* allocate an mdfdvec entry for it */
oldcxt = MemoryContextSwitchTo(MdCxt);
v = (MdfdVec *) palloc(sizeof(MdfdVec));
(void) MemoryContextSwitchTo(oldcxt);
/* fill the entry */
v->mdfd_vfd = fd;
v->mdfd_flags = (uint16) 0;
v->mdfd_chain = (MdfdVec *) NULL;
v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifdef DIAGNOSTIC
if (v->mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on open!");
#endif
/* all done */
return (v);
}
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
MdfdVec *v;
int segno;
int fd;
int i;
fd = RelationGetFile(reln);
if (fd < 0) {
if ((fd = mdopen(reln)) < 0)
elog(WARN, "cannot open relation %.16s",
RelationGetRelationName(reln));
reln->rd_fd = fd;
}
for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
segno > 0;
i++, segno--) {
if (v->mdfd_chain == (MdfdVec *) NULL) {
v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
if (v->mdfd_chain == (MdfdVec *) NULL)
elog(WARN, "cannot open segment %d of relation %.16s",
i, RelationGetRelationName(reln));
}
v = v->mdfd_chain;
}
return (v);
}
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
long len;
	/* "len" is the offset of the last byte, or -1 if the file is empty */
	len = FileSeek(file, 0L, SEEK_END) - 1;
	return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
}

586
src/backend/storage/smgr/mm.c Normal file
@@ -0,0 +1,586 @@
/*-------------------------------------------------------------------------
*
* mm.c--
* main memory storage manager
*
* This code manages relations that reside in (presumably stable)
* main memory.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#ifdef MAIN_MEMORY
#include <math.h>
#include "machine.h"
#include "storage/ipc.h"
#include "storage/smgr.h" /* where the declarations go */
#include "storage/block.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/memutils.h"
/*
* MMCacheTag -- Unique triplet for blocks stored by the main memory
* storage manager.
*/
typedef struct MMCacheTag {
Oid mmct_dbid;
Oid mmct_relid;
BlockNumber mmct_blkno;
} MMCacheTag;
/*
* Shared-memory hash table for main memory relations contains
* entries of this form.
*/
typedef struct MMHashEntry {
MMCacheTag mmhe_tag;
int mmhe_bufno;
} MMHashEntry;
/*
* MMRelTag -- Unique identifier for each relation that is stored in the
* main-memory storage manager.
*/
typedef struct MMRelTag {
    Oid mmrt_dbid;
    Oid mmrt_relid;
} MMRelTag;
/*
* Shared-memory hash table for # blocks in main memory relations contains
* entries of this form.
*/
typedef struct MMRelHashEntry {
    MMRelTag mmrhe_tag;
    int      mmrhe_nblocks;
} MMRelHashEntry;
#define MMNBUFFERS 10
#define MMNRELATIONS 2
SPINLOCK MMCacheLock;
extern bool IsPostmaster;
extern Oid MyDatabaseId;
static int *MMCurTop;
static int *MMCurRelno;
static MMCacheTag *MMBlockTags;
static char *MMBlockCache;
static HTAB *MMCacheHT;
static HTAB *MMRelCacheHT;
int
mminit()
{
    char *mmcacheblk;
    int mmsize = 0;
    bool found;
    HASHCTL info;

    SpinAcquire(MMCacheLock);

    mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS);
    mmsize += MAXALIGN(sizeof(*MMCurTop));
    mmsize += MAXALIGN(sizeof(*MMCurRelno));
    mmsize += MAXALIGN((MMNBUFFERS * sizeof(MMCacheTag)));
    mmcacheblk = (char *) ShmemInitStruct("Main memory smgr", mmsize, &found);

    if (mmcacheblk == (char *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    info.keysize = sizeof(MMCacheTag);
    info.datasize = sizeof(int);
    info.hash = tag_hash;

    MMCacheHT = (HTAB *) ShmemInitHash("Main memory store HT",
                                       MMNBUFFERS, MMNBUFFERS,
                                       &info, (HASH_ELEM|HASH_FUNCTION));

    if (MMCacheHT == (HTAB *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    info.keysize = sizeof(MMRelTag);
    info.datasize = sizeof(int);
    info.hash = tag_hash;

    MMRelCacheHT = (HTAB *) ShmemInitHash("Main memory rel HT",
                                          MMNRELATIONS, MMNRELATIONS,
                                          &info, (HASH_ELEM|HASH_FUNCTION));

    if (MMRelCacheHT == (HTAB *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    if (IsPostmaster) {
        memset(mmcacheblk, 0, mmsize);
        SpinRelease(MMCacheLock);
        return (SM_SUCCESS);
    }

    SpinRelease(MMCacheLock);

    MMCurTop = (int *) mmcacheblk;
    mmcacheblk += sizeof(int);
    MMCurRelno = (int *) mmcacheblk;
    mmcacheblk += sizeof(int);
    MMBlockTags = (MMCacheTag *) mmcacheblk;
    mmcacheblk += (MMNBUFFERS * sizeof(MMCacheTag));
    MMBlockCache = mmcacheblk;

    return (SM_SUCCESS);
}
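
/*
 * Editor's sketch, not part of the original file: how mminit() carves
 * up the shared block in a backend (the postmaster only zeroes it):
 *
 *	mmcacheblk
 *	|
 *	v
 *	+----------+------------+--------------------+----------------------+
 *	| MMCurTop | MMCurRelno | MMBlockTags:       | MMBlockCache:        |
 *	| (int)    | (int)      | MMNBUFFERS tags    | MMNBUFFERS * BLCKSZ  |
 *	+----------+------------+--------------------+----------------------+
 */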
int
mmshutdown()
{
    return (SM_SUCCESS);
}
int
mmcreate(Relation reln)
{
    MMRelHashEntry *entry;
    bool found;
    MMRelTag tag;

    SpinAcquire(MMCacheLock);

    if (*MMCurRelno == MMNRELATIONS) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    (*MMCurRelno)++;

    tag.mmrt_relid = reln->rd_id;
    if (reln->rd_rel->relisshared)
        tag.mmrt_dbid = (Oid) 0;
    else
        tag.mmrt_dbid = MyDatabaseId;

    entry = (MMRelHashEntry *) hash_search(MMRelCacheHT,
                                           (char *) &tag, HASH_ENTER, &found);

    if (entry == (MMRelHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "main memory storage mgr rel cache hash table corrupt");
    }

    if (found) {
        /* already exists; undo the relation-count bump above */
        (*MMCurRelno)--;
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    entry->mmrhe_nblocks = 0;

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmunlink() -- Unlink a relation.
*/
int
mmunlink(Relation reln)
{
    int i;
    Oid reldbid;
    MMHashEntry *entry;
    MMRelHashEntry *rentry;
    bool found;
    MMRelTag rtag;

    if (reln->rd_rel->relisshared)
        reldbid = (Oid) 0;
    else
        reldbid = MyDatabaseId;

    SpinAcquire(MMCacheLock);

    for (i = 0; i < MMNBUFFERS; i++) {
        if (MMBlockTags[i].mmct_dbid == reldbid
            && MMBlockTags[i].mmct_relid == reln->rd_id) {

            entry = (MMHashEntry *) hash_search(MMCacheHT,
                                                (char *) &MMBlockTags[i],
                                                HASH_REMOVE, &found);
            if (entry == (MMHashEntry *) NULL || !found) {
                SpinRelease(MMCacheLock);
                elog(FATAL, "mmunlink: cache hash table corrupted");
            }
            MMBlockTags[i].mmct_dbid = (Oid) 0;
            MMBlockTags[i].mmct_relid = (Oid) 0;
            MMBlockTags[i].mmct_blkno = (BlockNumber) 0;
        }
    }

    rtag.mmrt_dbid = reldbid;
    rtag.mmrt_relid = reln->rd_id;

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_REMOVE, &found);

    if (rentry == (MMRelHashEntry *) NULL || !found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmunlink: rel cache hash table corrupted");
    }

    (*MMCurRelno)--;

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmextend() -- Add a block to the specified relation.
*
* This routine returns SM_FAIL or SM_SUCCESS, with errno set as
* appropriate.
*/
int
mmextend(Relation reln, char *buffer)
{
    MMRelHashEntry *rentry;
    MMHashEntry *entry;
    int i;
    Oid reldbid;
    int offset;
    bool found;
    MMRelTag rtag;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        reldbid = (Oid) 0;
    else
        reldbid = MyDatabaseId;

    tag.mmct_dbid = rtag.mmrt_dbid = reldbid;
    tag.mmct_relid = rtag.mmrt_relid = reln->rd_id;

    SpinAcquire(MMCacheLock);

    if (*MMCurTop == MMNBUFFERS) {
        for (i = 0; i < MMNBUFFERS; i++) {
            if (MMBlockTags[i].mmct_dbid == 0 &&
                MMBlockTags[i].mmct_relid == 0)
                break;
        }
        if (i == MMNBUFFERS) {
            SpinRelease(MMCacheLock);
            return (SM_FAIL);
        }
    } else {
        i = *MMCurTop;
        (*MMCurTop)++;
    }

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_FIND, &found);
    if (rentry == (MMRelHashEntry *) NULL || !found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmextend: rel cache hash table corrupt");
    }

    tag.mmct_blkno = rentry->mmrhe_nblocks;

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_ENTER, &found);
    if (entry == (MMHashEntry *) NULL || found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmextend: cache hash table corrupt");
    }

    entry->mmhe_bufno = i;
    MMBlockTags[i].mmct_dbid = reldbid;
    MMBlockTags[i].mmct_relid = reln->rd_id;
    MMBlockTags[i].mmct_blkno = rentry->mmrhe_nblocks;

    /* page numbers are zero-based, so we increment this at the end */
    (rentry->mmrhe_nblocks)++;

    /* write the extended page */
    offset = (i * BLCKSZ);
    memmove(&(MMBlockCache[offset]), buffer, BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmopen() -- Open the specified relation.
*/
int
mmopen(Relation reln)
{
    /* automatically successful */
    return (0);
}
/*
* mmclose() -- Close the specified relation.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mmclose(Relation reln)
{
    /* automatically successful */
    return (SM_SUCCESS);
}
/*
* mmread() -- Read the specified block from a relation.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mmread(Relation reln, BlockNumber blocknum, char *buffer)
{
    MMHashEntry *entry;
    bool found;
    int offset;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        tag.mmct_dbid = (Oid) 0;
    else
        tag.mmct_dbid = MyDatabaseId;

    tag.mmct_relid = reln->rd_id;
    tag.mmct_blkno = blocknum;

    SpinAcquire(MMCacheLock);

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_FIND, &found);

    if (entry == (MMHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmread: hash table corrupt");
    }

    if (!found) {
        /* reading nonexistent pages is defined to fill them with zeroes */
        SpinRelease(MMCacheLock);
        memset(buffer, 0, BLCKSZ);
        return (SM_SUCCESS);
    }

    offset = (entry->mmhe_bufno * BLCKSZ);
    memmove(buffer, &MMBlockCache[offset], BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmwrite() -- Write the supplied block at the appropriate location.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
    MMHashEntry *entry;
    bool found;
    int offset;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        tag.mmct_dbid = (Oid) 0;
    else
        tag.mmct_dbid = MyDatabaseId;

    tag.mmct_relid = reln->rd_id;
    tag.mmct_blkno = blocknum;

    SpinAcquire(MMCacheLock);

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_FIND, &found);

    if (entry == (MMHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmwrite: hash table corrupt");
    }

    if (!found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmwrite: hash table missing requested page");
    }

    offset = (entry->mmhe_bufno * BLCKSZ);
    memmove(&MMBlockCache[offset], buffer, BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmflush() -- Synchronously write a block to stable storage.
*
* For main-memory relations, this is exactly equivalent to mmwrite().
*/
int
mmflush(Relation reln, BlockNumber blocknum, char *buffer)
{
    return (mmwrite(reln, blocknum, buffer));
}
/*
* mmblindwrt() -- Write a block to stable storage blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs.
*/
int
mmblindwrt(char *dbstr,
           char *relstr,
           Oid dbid,
           Oid relid,
           BlockNumber blkno,
           char *buffer)
{
    return (SM_FAIL);
}
/*
* mmnblocks() -- Get the number of blocks stored in a relation.
*
* Returns # of blocks or -1 on error.
*/
int
mmnblocks(Relation reln)
{
    MMRelTag rtag;
    MMRelHashEntry *rentry;
    bool found;
    int nblocks;

    if (reln->rd_rel->relisshared)
        rtag.mmrt_dbid = (Oid) 0;
    else
        rtag.mmrt_dbid = MyDatabaseId;

    rtag.mmrt_relid = reln->rd_id;

    SpinAcquire(MMCacheLock);

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_FIND, &found);

    if (rentry == (MMRelHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmnblocks: rel cache hash table corrupt");
    }

    if (found)
        nblocks = rentry->mmrhe_nblocks;
    else
        nblocks = -1;

    SpinRelease(MMCacheLock);

    return (nblocks);
}
/*
* mmcommit() -- Commit a transaction.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mmcommit()
{
    return (SM_SUCCESS);
}
/*
* mmabort() -- Abort a transaction.
*/
int
mmabort()
{
    return (SM_SUCCESS);
}
/*
* MMShmemSize() -- Declare amount of shared memory we require.
*
* The shared memory initialization code creates a block of shared
* memory exactly big enough to hold all the structures it needs to.
* This routine declares how much space the main memory storage
* manager will use.
*/
int
MMShmemSize()
{
    int size = 0;
    int nbuckets;
    int nsegs;
    int tmp;

    /*
     * first compute space occupied by the (dbid,relid,blkno) hash table
     */

    nbuckets = 1 << (int) my_log2((MMNBUFFERS - 1) / DEF_FFACTOR + 1);
    nsegs = 1 << (int) my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);

    size += MAXALIGN(my_log2(MMNBUFFERS) * sizeof(void *));
    size += MAXALIGN(sizeof(HHDR));
    size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
    tmp = (int) ceil((double) MMNBUFFERS / BUCKET_ALLOC_INCR);
    size += tmp * BUCKET_ALLOC_INCR *
            (MAXALIGN(sizeof(BUCKET_INDEX)) +
             MAXALIGN(sizeof(MMHashEntry)));	/* contains hash key */

    /*
     * now do the same for the rel hash table
     */

    size += MAXALIGN(my_log2(MMNRELATIONS) * sizeof(void *));
    size += MAXALIGN(sizeof(HHDR));
    size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
    tmp = (int) ceil((double) MMNRELATIONS / BUCKET_ALLOC_INCR);
    size += tmp * BUCKET_ALLOC_INCR *
            (MAXALIGN(sizeof(BUCKET_INDEX)) +
             MAXALIGN(sizeof(MMRelHashEntry)));	/* contains hash key */

    /*
     * finally, add in the memory block we use directly
     */

    size += MAXALIGN(BLCKSZ * MMNBUFFERS);
    size += MAXALIGN(sizeof(*MMCurTop));
    size += MAXALIGN(sizeof(*MMCurRelno));
    size += MAXALIGN(MMNBUFFERS * sizeof(MMCacheTag));

    return (size);
}
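
/*
 * Editor's sketch, not part of the original file: a typical call
 * sequence into this storage manager, dispatched through the smgr
 * switch (see smgr.c):
 *
 *	int2 mmid = smgrin("main memory");
 *
 *	smgrcreate(mmid, reln);				... register the relation
 *	smgrextend(mmid, reln, buffer);			... becomes block 0
 *	smgrread(mmid, reln, (BlockNumber) 0, buffer);	... reads it back
 *	smgrnblocks(mmid, reln);			... returns 1
 */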
#endif /* MAIN_MEMORY */

View File

@@ -0,0 +1,371 @@
/*-------------------------------------------------------------------------
*
* smgr.c--
* public interface routines to storage manager switch.
*
* All file system operations in POSTGRES dispatch through these
* routines.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "postgres.h"
#include "machine.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "storage/block.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/palloc.h"
typedef struct f_smgr {
    int (*smgr_init)();		/* may be NULL */
    int (*smgr_shutdown)();	/* may be NULL */
    int (*smgr_create)();
    int (*smgr_unlink)();
    int (*smgr_extend)();
    int (*smgr_open)();
    int (*smgr_close)();
    int (*smgr_read)();
    int (*smgr_write)();
    int (*smgr_flush)();
    int (*smgr_blindwrt)();
    int (*smgr_nblocks)();
    int (*smgr_commit)();	/* may be NULL */
    int (*smgr_abort)();	/* may be NULL */
} f_smgr;
/*
* The weird placement of commas in this init block is to keep the compiler
* happy, regardless of what storage managers we have (or don't have).
*/
static f_smgr smgrsw[] = {

    /* magnetic disk */
    { mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
      mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdcommit, mdabort },

#ifdef MAIN_MEMORY
    /* main memory */
    { mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
      mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, mmcommit, mmabort },

#endif /* MAIN_MEMORY */
};
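
/*
 * Editor's sketch, not part of the original file: a hypothetical third
 * storage manager (say a "sony jukebox" guarded by SONY_JUKEBOX) would
 * add one row here, plus matching entries in smgrwo[] below and in
 * StorageManager[] in smgrtype.c:
 *
 *	#ifdef SONY_JUKEBOX
 *	{ sjinit, sjshutdown, sjcreate, sjunlink, sjextend, sjopen, sjclose,
 *	  sjread, sjwrite, sjflush, sjblindwrt, sjnblocks, sjcommit, sjabort },
 *	#endif
 */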
/*
* This array records which storage managers are write-once, and which
* support overwrite. A 'true' entry means that the storage manager is
* write-once. In the best of all possible worlds, there would be no
* write-once storage managers.
*/
static bool smgrwo[] = {
    false,		/* magnetic disk */
#ifdef MAIN_MEMORY
    false,		/* main memory */
#endif /* MAIN_MEMORY */
};
static int NSmgr = lengthof(smgrsw);
/*
* smgrinit(), smgrshutdown() -- Initialize or shut down all storage
* managers.
*
*/
int
smgrinit()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_init) {
            if ((*(smgrsw[i].smgr_init))() == SM_FAIL)
                elog(FATAL, "initialization failed on %s", smgrout(i));
        }
    }

    /* register the shutdown proc */
    on_exitpg(smgrshutdown, 0);

    return (SM_SUCCESS);
}
void
smgrshutdown(int dummy)
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_shutdown) {
            if ((*(smgrsw[i].smgr_shutdown))() == SM_FAIL)
                elog(FATAL, "shutdown failed on %s", smgrout(i));
        }
    }
}
/*
* smgrcreate() -- Create a new relation.
*
* This routine takes a reldesc, creates the relation on the appropriate
* device, and returns a file descriptor for it.
*/
int
smgrcreate(int16 which, Relation reln)
{
    int fd;

    if ((fd = (*(smgrsw[which].smgr_create))(reln)) < 0)
        elog(WARN, "cannot create %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (fd);
}
/*
* smgrunlink() -- Unlink a relation.
*
* The relation is removed from the store.
*/
int
smgrunlink(int16 which, Relation reln)
{
    int status;

    if ((status = (*(smgrsw[which].smgr_unlink))(reln)) == SM_FAIL)
        elog(WARN, "cannot unlink %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrextend() -- Add a new block to a file.
*
* Returns SM_SUCCESS on success; aborts the current transaction on
* failure.
*/
int
smgrextend(int16 which, Relation reln, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_extend))(reln, buffer);

    if (status == SM_FAIL)
        elog(WARN, "%.*s: cannot extend",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgropen() -- Open a relation using a particular storage manager.
*
* Returns the fd for the open relation on success, aborts the
* transaction on failure.
*/
int
smgropen(int16 which, Relation reln)
{
    int fd;

    if ((fd = (*(smgrsw[which].smgr_open))(reln)) < 0)
        elog(WARN, "cannot open %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (fd);
}
/*
* smgrclose() -- Close a relation.
*
* Returns SM_SUCCESS on success, aborts on failure.
*/
int
smgrclose(int16 which, Relation reln)
{
    if ((*(smgrsw[which].smgr_close))(reln) == SM_FAIL)
        elog(WARN, "cannot close %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (SM_SUCCESS);
}
/*
* smgrread() -- read a particular block from a relation into the supplied
* buffer.
*
* This routine is called from the buffer manager in order to
* instantiate pages in the shared buffer cache. All storage managers
* return pages in the format that POSTGRES expects. This routine
* dispatches the read. On success, it returns SM_SUCCESS. On failure,
* the current transaction is aborted.
*/
int
smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_read))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot read block %d of %.*s",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrwrite() -- Write the supplied buffer out.
*
* This is not a synchronous write -- the interface for that is
* smgrflush(). The buffer is written out via the appropriate
* storage manager. This routine returns SM_SUCCESS or aborts
* the current transaction.
*/
int
smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_write))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot write block %d of %.*s",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrflush() -- A synchronous smgrwrite().
*/
int
smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_flush))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot flush block %d of %.*s to stable store",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrblindwrt() -- Write a page out blind.
*
* In some cases, we may find a page in the buffer cache that we
* can't make a reldesc for. This happens, for example, when we
* want to reuse a dirty page that was written by a transaction
* that has not yet committed, which created a new relation. In
* this case, the buffer manager will call smgrblindwrt() with
* the name and OID of the database and the relation to which the
* buffer belongs. Every storage manager must be able to force
* this page down to stable storage in this circumstance.
*/
int
smgrblindwrt(int16 which,
             char *dbname,
             char *relname,
             Oid dbid,
             Oid relid,
             BlockNumber blkno,
             char *buffer)
{
    char *dbstr;
    char *relstr;
    int status;

    dbstr = pstrdup(dbname);
    relstr = pstrdup(relname);

    status = (*(smgrsw[which].smgr_blindwrt))(dbstr, relstr, dbid, relid,
                                              blkno, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot write block %d of %s [%s] blind",
             blkno, relstr, dbstr);

    pfree(dbstr);
    pfree(relstr);

    return (status);
}
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
*
* Returns the number of blocks on success, aborts the current
* transaction on failure.
*/
int
smgrnblocks(int16 which, Relation reln)
{
    int nblocks;

    if ((nblocks = (*(smgrsw[which].smgr_nblocks))(reln)) < 0)
        elog(WARN, "cannot count blocks for %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (nblocks);
}
/*
* smgrcommit(), smgrabort() -- Commit or abort changes made during the
* current transaction.
*/
int
smgrcommit()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_commit) {
            if ((*(smgrsw[i].smgr_commit))() == SM_FAIL)
                elog(FATAL, "transaction commit failed on %s", smgrout(i));
        }
    }

    return (SM_SUCCESS);
}

int
smgrabort()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_abort) {
            if ((*(smgrsw[i].smgr_abort))() == SM_FAIL)
                elog(FATAL, "transaction abort failed on %s", smgrout(i));
        }
    }

    return (SM_SUCCESS);
}
bool
smgriswo(int16 smgrno)
{
    if (smgrno < 0 || smgrno >= NSmgr)
        elog(WARN, "illegal storage manager number %d", smgrno);

    return (smgrwo[smgrno]);
}

View File

@@ -0,0 +1,82 @@
/*-------------------------------------------------------------------------
*
* smgrtype.c--
* storage manager type
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgrtype.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "postgres.h"
#include "utils/builtins.h" /* where the declarations go */
#include "utils/elog.h"
#include "utils/palloc.h"
#include "storage/smgr.h"
typedef struct smgrid {
    char *smgr_name;
} smgrid;
/*
* StorageManager[] -- List of defined storage managers.
*
* The weird comma placement is to keep compilers happy no matter
* which of these is (or is not) defined.
*/
static smgrid StorageManager[] = {
    {"magnetic disk"},
#ifdef MAIN_MEMORY
    {"main memory"}
#endif /* MAIN_MEMORY */
};
static int NStorageManagers = lengthof(StorageManager);
int2
smgrin(char *s)
{
    int i;

    for (i = 0; i < NStorageManagers; i++) {
        if (strcmp(s, StorageManager[i].smgr_name) == 0)
            return ((int2) i);
    }

    elog(WARN, "smgrin: illegal storage manager name %s", s);
    return 0;
}
char *
smgrout(int2 i)
{
    char *s;

    if (i >= NStorageManagers || i < 0)
        elog(WARN, "Illegal storage manager id %d", i);

    s = (char *) palloc(strlen(StorageManager[i].smgr_name) + 1);
    strcpy(s, StorageManager[i].smgr_name);
    return (s);
}
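
/*
 * Editor's sketch, not part of the original file: smgrin() and
 * smgrout() invert one another.  Given the table above:
 *
 *	int2 which;
 *	char *name;
 *
 *	which = smgrin("magnetic disk");	... returns (int2) 0
 *	name = smgrout(which);			... palloc'd copy, "magnetic disk"
 */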
bool
smgreq(int2 a, int2 b)
{
    if (a == b)
        return (true);
    return (false);
}

bool
smgrne(int2 a, int2 b)
{
    if (a == b)
        return (false);
    return (true);
}

View File

@@ -0,0 +1,38 @@
/*-------------------------------------------------------------------------
*
* spin.h--
* synchronization routines
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: spin.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SPIN_H
#define SPIN_H
#include "ipc.h"
/*
* two implementations of spin locks
*
* sequent, sparc, sun3: real spin locks. uses a TAS instruction; see
* src/storage/ipc/s_lock.c for details.
*
* default: fake spin locks using semaphores. see spin.c
*
*/
typedef int SPINLOCK;
extern bool CreateSpinlocks(IPCKey key);
extern bool AttachSpinLocks(IPCKey key);
extern bool InitSpinLocks(int init, IPCKey key);
extern void SpinAcquire(SPINLOCK lock);
extern void SpinRelease(SPINLOCK lock);
extern bool SpinIsLocked(SPINLOCK lock);
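
/*
 * Editor's sketch, not part of the original file: the usage pattern
 * followed throughout the storage managers.  Note that in mm.c every
 * error path between the acquire and the release does its own
 * SpinRelease() before calling elog():
 *
 *	SpinAcquire(MMCacheLock);
 *	... examine or update the shared structures ...
 *	SpinRelease(MMCacheLock);
 */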
#endif /* SPIN_H */