mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-22 14:32:25 +03:00 
			
		
		
		
	Replace the BufMgrLock with separate locks on the lookup hashtable and
the freelist, plus per-buffer spinlocks that protect access to individual shared buffer headers. This requires abandoning a global freelist (since the freelist is a global contention point), which shoots down ARC and 2Q as well as plain LRU management. Adopt a clock sweep algorithm instead. Preliminary results show substantial improvement in multi-backend situations.
This commit is contained in:
		| @@ -1,5 +1,5 @@ | ||||
| <!-- | ||||
| $PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.306 2005/03/02 19:58:54 tgl Exp $ | ||||
| $PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.307 2005/03/04 20:21:05 tgl Exp $ | ||||
| --> | ||||
|  | ||||
| <chapter Id="runtime"> | ||||
| @@ -1379,9 +1379,7 @@ SET ENABLE_SEQSCAN TO OFF; | ||||
|          Specifies the delay between activity rounds for the | ||||
|          background writer.  In each round the writer issues writes | ||||
|          for some number of dirty buffers (controllable by the | ||||
|          following parameters).  The selected buffers will always be | ||||
|          the least recently used ones among the currently dirty | ||||
|          buffers.  It then sleeps for <varname>bgwriter_delay</> | ||||
|          following parameters).  It then sleeps for <varname>bgwriter_delay</> | ||||
|          milliseconds, and repeats.  The default value is 200. Note | ||||
|          that on many systems, the effective resolution of sleep | ||||
|          delays is 10 milliseconds; setting <varname>bgwriter_delay</> | ||||
| @@ -1393,32 +1391,77 @@ SET ENABLE_SEQSCAN TO OFF; | ||||
|        </listitem> | ||||
|       </varlistentry> | ||||
|  | ||||
|       <varlistentry id="guc-bgwriter-percent" xreflabel="bgwriter_percent"> | ||||
|        <term><varname>bgwriter_percent</varname> (<type>integer</type>)</term> | ||||
|       <varlistentry id="guc-bgwriter-lru-percent" xreflabel="bgwriter_lru_percent"> | ||||
|        <term><varname>bgwriter_lru_percent</varname> (<type>floating point</type>)</term> | ||||
|        <indexterm> | ||||
|         <primary><varname>bgwriter_percent</> configuration parameter</primary> | ||||
|         <primary><varname>bgwriter_lru_percent</> configuration parameter</primary> | ||||
|        </indexterm> | ||||
|        <listitem> | ||||
|         <para> | ||||
|          In each round, no more than this percentage of the currently | ||||
|          dirty buffers will be written (rounding up any fraction to | ||||
|          the next whole number of buffers).  The default value is | ||||
|          1. This option can only be set at server start or in the | ||||
|          To reduce the probability that server processes will need to issue | ||||
|          their own writes, the background writer tries to write buffers that | ||||
|          are likely to be recycled soon.  In each round, it examines up to | ||||
|          <varname>bgwriter_lru_percent</> of the buffers that are nearest to | ||||
|          being recycled, and writes any that are dirty. | ||||
|          The default value is 1.0 (this is a percentage of the total number | ||||
|          of shared buffers). | ||||
|          This option can only be set at server start or in the | ||||
|          <filename>postgresql.conf</filename> file. | ||||
|         </para> | ||||
|        </listitem> | ||||
|       </varlistentry> | ||||
|  | ||||
|       <varlistentry id="guc-bgwriter-maxpages" xreflabel="bgwriter_maxpages"> | ||||
|        <term><varname>bgwriter_maxpages</varname> (<type>integer</type>)</term> | ||||
|       <varlistentry id="guc-bgwriter-lru-maxpages" xreflabel="bgwriter_lru_maxpages"> | ||||
|        <term><varname>bgwriter_lru_maxpages</varname> (<type>integer</type>)</term> | ||||
|        <indexterm> | ||||
|         <primary><varname>bgwriter_maxpages</> configuration parameter</primary> | ||||
|         <primary><varname>bgwriter_lru_maxpages</> configuration parameter</primary> | ||||
|        </indexterm> | ||||
|        <listitem> | ||||
|         <para> | ||||
|          In each round, no more than this many dirty buffers will be | ||||
|          written. The default value is 100. This option can only be | ||||
|          set at server start or in the | ||||
|          In each round, no more than this many buffers will be written | ||||
|          as a result of scanning soon-to-be-recycled buffers. | ||||
|          The default value is 5. | ||||
|          This option can only be set at server start or in the | ||||
|          <filename>postgresql.conf</filename> file. | ||||
|         </para> | ||||
|        </listitem> | ||||
|       </varlistentry> | ||||
|  | ||||
|       <varlistentry id="guc-bgwriter-all-percent" xreflabel="bgwriter_all_percent"> | ||||
|        <term><varname>bgwriter_all_percent</varname> (<type>floating point</type>)</term> | ||||
|        <indexterm> | ||||
|         <primary><varname>bgwriter_all_percent</> configuration parameter</primary> | ||||
|        </indexterm> | ||||
|        <listitem> | ||||
|         <para> | ||||
|          To reduce the amount of work that will be needed at checkpoint time, | ||||
|          the background writer also does a circular scan through the entire | ||||
|          buffer pool, writing buffers that are found to be dirty. | ||||
|          In each round, it examines up to | ||||
|          <varname>bgwriter_all_percent</> of the buffers for this purpose. | ||||
|          The default value is 0.333 (this is a percentage of the total number | ||||
|          of shared buffers).  With the default <varname>bgwriter_delay</> | ||||
|          setting, this will allow the entire shared buffer pool to be scanned | ||||
|          about once per minute. | ||||
|          This option can only be set at server start or in the | ||||
|          <filename>postgresql.conf</filename> file. | ||||
|         </para> | ||||
|        </listitem> | ||||
|       </varlistentry> | ||||
|  | ||||
|       <varlistentry id="guc-bgwriter-all-maxpages" xreflabel="bgwriter_all_maxpages"> | ||||
|        <term><varname>bgwriter_all_maxpages</varname> (<type>integer</type>)</term> | ||||
|        <indexterm> | ||||
|         <primary><varname>bgwriter_all_maxpages</> configuration parameter</primary> | ||||
|        </indexterm> | ||||
|        <listitem> | ||||
|         <para> | ||||
|          In each round, no more than this many buffers will be written | ||||
|          as a result of the scan of the entire buffer pool.  (If this | ||||
|          limit is reached, the scan stops, and resumes at the next buffer | ||||
|          during the next round.) | ||||
|          The default value is 5. | ||||
|          This option can only be set at server start or in the | ||||
|          <filename>postgresql.conf</filename> file. | ||||
|         </para> | ||||
|        </listitem> | ||||
| @@ -1426,13 +1469,19 @@ SET ENABLE_SEQSCAN TO OFF; | ||||
|      </variablelist> | ||||
|  | ||||
|      <para> | ||||
|       Smaller values of <varname>bgwriter_percent</varname> and | ||||
|       <varname>bgwriter_maxpages</varname> reduce the extra I/O load | ||||
|       Smaller values of <varname>bgwriter_all_percent</varname> and | ||||
|       <varname>bgwriter_all_maxpages</varname> reduce the extra I/O load | ||||
|       caused by the background writer, but leave more work to be done | ||||
|       at checkpoint time.  To reduce load spikes at checkpoints, | ||||
|       increase the values.  To disable background writing entirely, | ||||
|       set <varname>bgwriter_percent</varname> and/or | ||||
|       <varname>bgwriter_maxpages</varname> to zero. | ||||
|       increase these two values. | ||||
|       Similarly, smaller values of <varname>bgwriter_lru_percent</varname> and | ||||
|       <varname>bgwriter_lru_maxpages</varname> reduce the extra I/O load | ||||
|       caused by the background writer, but make it more likely that server | ||||
|       processes will have to issue writes for themselves, delaying interactive | ||||
|       queries. | ||||
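|       To disable background writing entirely, | ||||
|       set both <varname>bgwriter_lru_maxpages</varname> and | ||||
|       <varname>bgwriter_all_maxpages</varname>, and/or both | ||||
|       <varname>bgwriter_lru_percent</varname> and | ||||
|       <varname>bgwriter_all_percent</varname>, to zero. | ||||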
|      </para> | ||||
|     </sect3> | ||||
|  | ||||
| @@ -3866,20 +3915,6 @@ plruby.bar = true        # generates error, unknown class name | ||||
|       </listitem> | ||||
|      </varlistentry> | ||||
|  | ||||
|      <varlistentry id="guc-debug-shared-buffers" xreflabel="debug_shared_buffers"> | ||||
|       <term><varname>debug_shared_buffers</varname> (<type>integer</type>)</term> | ||||
|       <indexterm> | ||||
|        <primary><varname>debug_shared_buffers</> configuration parameter</primary> | ||||
|       </indexterm> | ||||
|       <listitem> | ||||
|        <para> | ||||
|         Number of seconds between ARC reports. | ||||
|         If set greater than zero, emit ARC statistics to the log every so many | ||||
|         seconds.  Zero (the default) disables reporting. | ||||
|        </para> | ||||
|       </listitem> | ||||
|      </varlistentry> | ||||
|  | ||||
|      <varlistentry id="guc-pre-auth-delay" xreflabel="pre_auth_delay"> | ||||
|       <term><varname>pre_auth_delay</varname> (<type>integer</type>)</term> | ||||
|       <indexterm> | ||||
|   | ||||
| @@ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.244 2005/01/10 20:02:19 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.245 2005/03/04 20:21:05 tgl Exp $ | ||||
|  * | ||||
|  * | ||||
|  * INTERFACE ROUTINES | ||||
| @@ -1060,7 +1060,6 @@ setRelhasindex(Oid relid, bool hasindex, bool isprimary, Oid reltoastidxid) | ||||
| 		/* Send out shared cache inval if necessary */ | ||||
| 		if (!IsBootstrapProcessingMode()) | ||||
| 			CacheInvalidateHeapTuple(pg_class, tuple); | ||||
| 		BufferSync(-1, -1); | ||||
| 	} | ||||
| 	else if (dirty) | ||||
| 	{ | ||||
|   | ||||
| @@ -15,7 +15,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.151 2005/02/26 18:43:33 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.152 2005/03/04 20:21:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -339,7 +339,7 @@ createdb(const CreatedbStmt *stmt) | ||||
| 	 * up-to-date for the copy.  (We really only need to flush buffers for | ||||
| 	 * the source database, but bufmgr.c provides no API for that.) | ||||
| 	 */ | ||||
| 	BufferSync(-1, -1); | ||||
| 	BufferSync(); | ||||
|  | ||||
| 	/* | ||||
| 	 * Close virtual file descriptors so the kernel has more available for | ||||
| @@ -1201,7 +1201,7 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record) | ||||
| 		 * up-to-date for the copy.  (We really only need to flush buffers for | ||||
| 		 * the source database, but bufmgr.c provides no API for that.) | ||||
| 		 */ | ||||
| 		BufferSync(-1, -1); | ||||
| 		BufferSync(); | ||||
|  | ||||
| #ifndef WIN32 | ||||
|  | ||||
|   | ||||
| @@ -13,7 +13,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.302 2005/02/26 18:43:33 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.303 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -36,7 +36,6 @@ | ||||
| #include "commands/vacuum.h" | ||||
| #include "executor/executor.h" | ||||
| #include "miscadmin.h" | ||||
| #include "storage/buf_internals.h" | ||||
| #include "storage/freespace.h" | ||||
| #include "storage/sinval.h" | ||||
| #include "storage/smgr.h" | ||||
|   | ||||
| @@ -37,7 +37,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.14 2005/02/19 23:16:15 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.15 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -116,9 +116,6 @@ static BgWriterShmemStruct *BgWriterShmem; | ||||
|  * GUC parameters | ||||
|  */ | ||||
| int			BgWriterDelay = 200; | ||||
| int			BgWriterPercent = 1; | ||||
| int			BgWriterMaxPages = 100; | ||||
|  | ||||
| int			CheckPointTimeout = 300; | ||||
| int			CheckPointWarning = 30; | ||||
|  | ||||
| @@ -274,7 +271,6 @@ BackgroundWriterMain(void) | ||||
| 		bool		force_checkpoint = false; | ||||
| 		time_t		now; | ||||
| 		int			elapsed_secs; | ||||
| 		int			n; | ||||
| 		long		udelay; | ||||
|  | ||||
| 		/* | ||||
| @@ -365,16 +361,13 @@ BackgroundWriterMain(void) | ||||
| 			 * checkpoints happen at a predictable spacing. | ||||
| 			 */ | ||||
| 			last_checkpoint_time = now; | ||||
|  | ||||
| 			/* Nap for configured time before rechecking */ | ||||
| 			n = 1; | ||||
| 		} | ||||
| 		else | ||||
| 			n = BufferSync(BgWriterPercent, BgWriterMaxPages); | ||||
| 			BgBufferSync(); | ||||
|  | ||||
| 		/* | ||||
| 		 * Nap for the configured time or sleep for 10 seconds if there | ||||
| 		 * was nothing to do at all. | ||||
| 		 * Nap for the configured time, or sleep for 10 seconds if there | ||||
| 		 * is no bgwriter activity configured. | ||||
| 		 * | ||||
| 		 * On some platforms, signals won't interrupt the sleep.  To ensure | ||||
| 		 * we respond reasonably promptly when someone signals us, break | ||||
| @@ -383,7 +376,11 @@ BackgroundWriterMain(void) | ||||
| 		 * | ||||
| 		 * We absorb pending requests after each short sleep. | ||||
| 		 */ | ||||
| 		udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L; | ||||
| 		if ((bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0) || | ||||
| 			(bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)) | ||||
| 			udelay = BgWriterDelay * 1000L; | ||||
| 		else | ||||
| 			udelay = 10000000L; | ||||
| 		while (udelay > 1000000L) | ||||
| 		{ | ||||
| 			if (got_SIGHUP || checkpoint_requested || shutdown_requested) | ||||
|   | ||||
| @@ -1,12 +1,12 @@ | ||||
| $PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $ | ||||
| $PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.8 2005/03/04 20:21:06 tgl Exp $ | ||||
|  | ||||
| Notes about shared buffer access rules | ||||
| -------------------------------------- | ||||
|  | ||||
| There are two separate access control mechanisms for shared disk buffers: | ||||
| reference counts (a/k/a pin counts) and buffer locks.  (Actually, there's | ||||
| a third level of access control: one must hold the appropriate kind of | ||||
| lock on a relation before one can legally access any page belonging to | ||||
| reference counts (a/k/a pin counts) and buffer content locks.  (Actually, | ||||
| there's a third level of access control: one must hold the appropriate kind | ||||
| of lock on a relation before one can legally access any page belonging to | ||||
| the relation.  Relation-level locks are not discussed here.) | ||||
|  | ||||
| Pins: one must "hold a pin on" a buffer (increment its reference count) | ||||
| @@ -26,7 +26,7 @@ handled by waiting to obtain the relation-level lock, which is why you'd | ||||
| better hold one first.)  Pins may not be held across transaction | ||||
| boundaries, however. | ||||
|  | ||||
| Buffer locks: there are two kinds of buffer locks, shared and exclusive, | ||||
| Buffer content locks: there are two kinds of buffer lock, shared and exclusive, | ||||
| which act just as you'd expect: multiple backends can hold shared locks on | ||||
| the same buffer, but an exclusive lock prevents anyone else from holding | ||||
| either shared or exclusive lock.  (These can alternatively be called READ | ||||
| @@ -38,12 +38,12 @@ the same buffer.  One must pin a buffer before trying to lock it. | ||||
| Buffer access rules: | ||||
|  | ||||
| 1. To scan a page for tuples, one must hold a pin and either shared or | ||||
| exclusive lock.  To examine the commit status (XIDs and status bits) of | ||||
| a tuple in a shared buffer, one must likewise hold a pin and either shared | ||||
| exclusive content lock.  To examine the commit status (XIDs and status bits) | ||||
| of a tuple in a shared buffer, one must likewise hold a pin and either shared | ||||
| or exclusive content lock. | ||||
|  | ||||
| 2. Once one has determined that a tuple is interesting (visible to the | ||||
| current transaction) one may drop the buffer lock, yet continue to access | ||||
| current transaction) one may drop the content lock, yet continue to access | ||||
| the tuple's data for as long as one holds the buffer pin.  This is what is | ||||
| typically done by heap scans, since the tuple returned by heap_fetch | ||||
| contains a pointer to tuple data in the shared buffer.  Therefore the | ||||
| @@ -52,9 +52,9 @@ change, but that is assumed not to matter after the initial determination | ||||
| of visibility is made. | ||||
|  | ||||
| 3. To add a tuple or change the xmin/xmax fields of an existing tuple, | ||||
| one must hold a pin and an exclusive lock on the containing buffer. | ||||
| one must hold a pin and an exclusive content lock on the containing buffer. | ||||
| This ensures that no one else might see a partially-updated state of the | ||||
| tuple. | ||||
| tuple while they are doing visibility checks. | ||||
|  | ||||
| 4. It is considered OK to update tuple commit status bits (ie, OR the | ||||
| values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or | ||||
| @@ -76,7 +76,7 @@ no other backend can be holding a reference to an existing tuple that it | ||||
| might expect to examine again.  Note that another backend might pin the | ||||
| buffer (increment the refcount) while one is performing the cleanup, but | ||||
| it won't be able to actually examine the page until it acquires shared | ||||
| or exclusive lock. | ||||
| or exclusive content lock. | ||||
|  | ||||
|  | ||||
| VACUUM FULL ignores rule #5, because it instead acquires exclusive lock at | ||||
| @@ -97,149 +97,142 @@ for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a | ||||
| single relation anyway. | ||||
|  | ||||
|  | ||||
| Buffer replacement strategy interface | ||||
| ------------------------------------- | ||||
| Buffer manager's internal locking | ||||
| --------------------------------- | ||||
|  | ||||
| The file freelist.c contains the buffer cache replacement strategy. | ||||
| The interface to the strategy is: | ||||
| Before PostgreSQL 8.1, all operations of the shared buffer manager itself | ||||
| were protected by a single system-wide lock, the BufMgrLock, which | ||||
| unsurprisingly proved to be a source of contention.  The new locking scheme | ||||
| avoids grabbing system-wide exclusive locks in common code paths.  It works | ||||
| like this: | ||||
|  | ||||
| 	BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck, | ||||
| 	                                 int *cdb_found_index) | ||||
| * There is a system-wide LWLock, the BufMappingLock, that notionally | ||||
| protects the mapping from buffer tags (page identifiers) to buffers. | ||||
| (Physically, it can be thought of as protecting the hash table maintained | ||||
| by buf_table.c.)  To look up whether a buffer exists for a tag, it is | ||||
| sufficient to obtain share lock on the BufMappingLock.  Note that one | ||||
| must pin the found buffer, if any, before releasing the BufMappingLock. | ||||
| To alter the page assignment of any buffer, one must hold exclusive lock | ||||
| on the BufMappingLock.  This lock must be held across adjusting the buffer's | ||||
| header fields and changing the buf_table hash table.  The only common | ||||
| operation that needs exclusive lock is reading in a page that was not | ||||
| in shared buffers already, which will require at least a kernel call | ||||
| and usually a wait for I/O, so it will be slow anyway. | ||||
|  | ||||
| This is always the first call made by the buffer manager to check if a disk | ||||
| page is in memory. If so, the function returns the buffer descriptor and no | ||||
| further action is required. If the page is not in memory, | ||||
| StrategyBufferLookup() returns NULL. | ||||
| * A separate system-wide LWLock, the BufFreelistLock, provides mutual | ||||
| exclusion for operations that access the buffer free list or select | ||||
| buffers for replacement.  This is always taken in exclusive mode since | ||||
| there are no read-only operations on those data structures.  The buffer | ||||
| management policy is designed so that BufFreelistLock need not be taken | ||||
| except in paths that will require I/O, and thus will be slow anyway. | ||||
| (Details appear below.)  It is never necessary to hold the BufMappingLock | ||||
| and the BufFreelistLock at the same time. | ||||
|  | ||||
| The flag recheck tells the strategy that this is a second lookup after | ||||
| flushing a dirty block. If the buffer manager has to evict another buffer, | ||||
| it will release the bufmgr lock while doing the write IO. During this time, | ||||
| another backend could possibly fault in the same page this backend is after, | ||||
| so we have to check again after the IO is done if the page is in memory now. | ||||
| * Each buffer header contains a spinlock that must be taken when examining | ||||
| or changing fields of that buffer header.  This allows operations such as | ||||
| ReleaseBuffer to make local state changes without taking any system-wide | ||||
| lock.  We use a spinlock, not an LWLock, since there are no cases where | ||||
| the lock needs to be held for more than a few instructions. | ||||
|  | ||||
| *cdb_found_index is set to the index of the found CDB, or -1 if none. | ||||
| This is not intended to be used by the caller, except to pass to | ||||
| StrategyReplaceBuffer(). | ||||
| Note that a buffer header's spinlock does not control access to the data | ||||
| held within the buffer.  Each buffer header also contains an LWLock, the | ||||
| "buffer content lock", that *does* represent the right to access the data | ||||
| in the buffer.  It is used per the rules above. | ||||
|  | ||||
| 	BufferDesc *StrategyGetBuffer(int *cdb_replace_index) | ||||
|  | ||||
| The buffer manager calls this function to get an unpinned cache buffer whose | ||||
| content can be evicted. The returned buffer might be empty, clean or dirty. | ||||
|  | ||||
| The returned buffer is only a candidate for replacement.  It is possible that | ||||
| while the buffer is being written, another backend finds and modifies it, so | ||||
| that it is dirty again.  The buffer manager will then have to call | ||||
| StrategyGetBuffer() again to ask for another candidate. | ||||
|  | ||||
| *cdb_replace_index is set to the index of the candidate CDB, or -1 if none | ||||
| (meaning we are using a previously free buffer).  This is not intended to be | ||||
| used by the caller, except to pass to StrategyReplaceBuffer(). | ||||
|  | ||||
| 	void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, | ||||
| 	                           int cdb_found_index, int cdb_replace_index) | ||||
|  | ||||
| Called by the buffer manager at the time it is about to change the association | ||||
| of a buffer with a disk page. | ||||
|  | ||||
| Before this call, StrategyBufferLookup() still has to find the buffer under | ||||
| its old tag, even if it was returned by StrategyGetBuffer() as a candidate | ||||
| for replacement. | ||||
|  | ||||
| After this call, this buffer must be returned for a lookup of the new page | ||||
| identified by *newTag. | ||||
|  | ||||
| cdb_found_index and cdb_replace_index must be the auxiliary values | ||||
| returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. | ||||
|  | ||||
| 	void StrategyInvalidateBuffer(BufferDesc *buf) | ||||
|  | ||||
| Called by the buffer manager to inform the strategy that the content of this | ||||
| buffer is being thrown away. This happens for example in the case of dropping | ||||
| a relation.  The buffer must be clean and unpinned on call. | ||||
|  | ||||
| If the buffer was associated with a disk page, StrategyBufferLookup() | ||||
| must not return it for this page after the call. | ||||
|  | ||||
| 	void StrategyHintVacuum(bool vacuum_active) | ||||
|  | ||||
| Because VACUUM reads all relations of the entire database through the buffer | ||||
| manager, it can greatly disturb the buffer replacement strategy. This function | ||||
| is used by VACUUM to inform the strategy that subsequent buffer lookups are | ||||
| (or are not) caused by VACUUM scanning relations. | ||||
| There is yet another set of per-buffer LWLocks, the io_in_progress locks, | ||||
| that are used to wait for I/O on a buffer to complete.  The process doing | ||||
| a read or write takes exclusive lock for the duration, and processes that | ||||
| need to wait for completion try to take shared locks (which they release | ||||
| immediately upon obtaining).  XXX on systems where an LWLock represents | ||||
| nontrivial resources, it's fairly annoying to need so many locks.  Possibly | ||||
| we could use per-backend LWLocks instead (a buffer header would then contain | ||||
| a field to show which backend is doing its I/O). | ||||
|  | ||||
|  | ||||
| Buffer replacement strategy | ||||
| --------------------------- | ||||
|  | ||||
| The buffer replacement strategy actually used in freelist.c is a version of | ||||
| the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL. | ||||
| There is a "free list" of buffers that are prime candidates for replacement. | ||||
| In particular, buffers that are completely free (contain no valid page) are | ||||
| always in this list.  We may also throw buffers into this list if we | ||||
| consider their pages unlikely to be needed soon.  The list is singly-linked | ||||
| using fields in the buffer headers; we maintain head and tail pointers in | ||||
| global variables.  (Note: although the list links are in the buffer headers, | ||||
| they are considered to be protected by the BufFreelistLock, not the | ||||
| buffer-header spinlocks.)  To choose a victim buffer to recycle when there | ||||
| are no free buffers available, we use a simple clock-sweep algorithm, which | ||||
| avoids the need to take system-wide locks during common operations.  It | ||||
| works like this: | ||||
|  | ||||
| The algorithm works as follows: | ||||
| Each buffer header contains a usage counter, which is incremented (up to a | ||||
| small limit value) whenever the buffer is unpinned.  (This requires only the | ||||
| buffer header spinlock, which would have to be taken anyway to decrement the | ||||
| buffer reference count, so it's nearly free.) | ||||
|  | ||||
| C is the size of the cache in number of pages (a/k/a shared_buffers or | ||||
| NBuffers).  ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block | ||||
| is always associated with one unique file page.  It may point to one shared | ||||
| buffer, or may indicate that the file page is not in a buffer but has been | ||||
| accessed recently. | ||||
| The "clock hand" is a buffer index, NextVictimBuffer, that moves circularly | ||||
| through all the available buffers.  NextVictimBuffer is protected by the | ||||
| BufFreelistLock. | ||||
|  | ||||
| All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and | ||||
| T2 lists are the "real" cache entries, linking a file page to a memory buffer | ||||
| where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2 | ||||
| are ghost cache directories that extend T1 and T2 so that the strategy | ||||
| remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len | ||||
| both at C. T1len and T2len vary over the runtime depending on the lookup | ||||
| pattern and its resulting cache hits. The desired size of T1len is called | ||||
| T1target. | ||||
| The algorithm for a process that needs to obtain a victim buffer is: | ||||
|  | ||||
| Assuming we have a full cache, one of 5 cases happens on a lookup: | ||||
| 1. Obtain BufFreelistLock. | ||||
|  | ||||
| MISS	On a cache miss, depending on T1target and the actual T1len | ||||
| 	the LRU buffer of either T1 or T2 is evicted. Its CDB is removed | ||||
| 	from the T list and added as MRU of the corresponding B list. | ||||
| 	The now free buffer is replaced with the requested page | ||||
| 	and added as MRU of T1. | ||||
| 2. If the buffer free list is nonempty, remove its head buffer.  If the buffer | ||||
| is pinned or has a nonzero usage count, it cannot be used; ignore it and | ||||
| return to the start of step 2.  Otherwise, pin the buffer, release | ||||
| BufFreelistLock, and return the buffer. | ||||
|  | ||||
| T1 hit	The T1 CDB is moved to the MRU position of the T2 list. | ||||
| 3. Otherwise, select the buffer pointed to by NextVictimBuffer, and | ||||
| circularly advance NextVictimBuffer for next time. | ||||
|  | ||||
| T2 hit	The T2 CDB is moved to the MRU position of the T2 list. | ||||
| 4. If the selected buffer is pinned or has a nonzero usage count, it cannot | ||||
| be used.  Decrement its usage count (if nonzero) and return to step 3 to | ||||
| examine the next buffer. | ||||
|  | ||||
| B1 hit	This means that a buffer that was evicted from the T1 | ||||
| 	list is now requested again, indicating that T1target is | ||||
| 	too small (otherwise it would still be in T1 and thus in | ||||
| 	memory). The strategy raises T1target, evicts a buffer | ||||
| 	depending on T1target and T1len and places the CDB at | ||||
| 	MRU of T2. | ||||
| 5. Pin the selected buffer, release BufFreelistLock, and return the buffer. | ||||
|  | ||||
| B2 hit	This means the opposite of B1, the T2 list is probably too | ||||
| 	small. So the strategy lowers T1target, evicts a buffer | ||||
| 	and places the CDB at MRU of T2. | ||||
| (Note that if the selected buffer is dirty, we will have to write it out | ||||
| before we can recycle it; if someone else pins the buffer meanwhile we will | ||||
| have to give up and try another buffer.  This however is not a concern | ||||
| of the basic select-a-victim-buffer algorithm.) | ||||
|  | ||||
| Thus, every page that is found on lookup in any of the four lists | ||||
| ends up as the MRU of the T2 list. The T2 list therefore is the | ||||
| "frequency" cache, holding frequently requested pages. | ||||
| A special provision is that while running VACUUM, a backend does not | ||||
| increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer | ||||
| sees that it is dropping the pin count to zero and the usage count is zero, | ||||
| then it appends the buffer to the tail of the free list.  (This implies that | ||||
| VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer; | ||||
| this shouldn't create much of a contention problem.)  This provision | ||||
| encourages VACUUM to work in a relatively small number of buffers rather | ||||
| than blowing out the entire buffer cache.  It is reasonable since a page | ||||
| that has been touched only by VACUUM is unlikely to be needed again soon. | ||||
|  | ||||
| Every page that is seen for the first time ends up as the MRU of the T1 | ||||
| list. The T1 list is the "recency" cache, holding recent newcomers. | ||||
|  | ||||
| The tailoring done for PostgreSQL has to do with the way the query executor | ||||
| works. A typical UPDATE or DELETE first scans the relation, searching for the | ||||
| tuples and then calls heap_update() or heap_delete(). This causes at least 2 | ||||
| lookups for the block in the same statement. In the case of multiple matches | ||||
| in one block even more often. As a result, every block touched in an UPDATE or | ||||
| DELETE would directly jump into the T2 cache, which is wrong. To prevent this | ||||
| the strategy remembers which transaction added a buffer to the T1 list and | ||||
| will not promote it from there into the T2 cache during the same transaction. | ||||
|  | ||||
| Another specialty is the change of the strategy during VACUUM.  Lookups during | ||||
| VACUUM do not represent application needs, and do not suggest that the page | ||||
| will be hit again soon, so it would be wrong to change the cache balance | ||||
| T1target due to that or to cause massive cache evictions. Therefore, a page | ||||
| read in to satisfy vacuum is placed at the LRU position of the T1 list, for | ||||
| immediate reuse.  Also, if we happen to get a hit on a CDB entry during | ||||
| VACUUM, we do not promote the page above its current position in the list. | ||||
| Since VACUUM usually requests many pages very fast, the effect of this is that | ||||
| it will get back the very buffers it filled and possibly modified on the next | ||||
| call and will therefore do its work in a few shared memory buffers, while | ||||
| being able to use whatever it finds in the cache already.  This also implies | ||||
| that most of the write traffic caused by a VACUUM will be done by the VACUUM | ||||
| itself and not pushed off onto other processes. | ||||
|  | ||||
|  | ||||
| Background writer's processing | ||||
| ------------------------------ | ||||
|  | ||||
| The background writer is designed to write out pages that are likely to be | ||||
| recycled soon, thereby offloading the writing work from active backends. | ||||
| To do this, it scans forward circularly from the current position of | ||||
| NextVictimBuffer (which it does not change!), looking for buffers that are | ||||
| dirty and neither pinned nor marked with a positive usage count.  It pins, | ||||
| writes, and releases any such buffer. | ||||
|  | ||||
| If we can assume that reading NextVictimBuffer is an atomic action, then | ||||
| the writer doesn't even need to take the BufFreelistLock in order to look | ||||
| for buffers to write; it needs only to spinlock each buffer header for long | ||||
| enough to check the dirty bit.  Even without that assumption, the writer | ||||
| only needs to take the lock long enough to read the variable value, not | ||||
| while scanning the buffers.  (This is a very substantial improvement in | ||||
| the contention cost of the writer compared to PG 8.0.) | ||||
|  | ||||
| During a checkpoint, the writer's strategy must be to write every dirty | ||||
| buffer (pinned or not!).  We may as well make it start this scan from  | ||||
| NextVictimBuffer, however, so that the first-to-be-written pages are the | ||||
| ones that backends might otherwise have to write for themselves soon. | ||||
|   | ||||
| @@ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.71 2005/02/03 23:29:11 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.72 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -22,6 +22,8 @@ BufferDesc *BufferDescriptors; | ||||
| Block	   *BufferBlockPointers; | ||||
| int32	   *PrivateRefCount; | ||||
|  | ||||
| static char *BufferBlocks; | ||||
|  | ||||
| /* statistics counters */ | ||||
| long int	ReadBufferCount; | ||||
| long int	ReadLocalBufferCount; | ||||
| @@ -50,16 +52,11 @@ long int	LocalBufferFlushCount; | ||||
|  * | ||||
|  * Synchronization/Locking: | ||||
|  * | ||||
|  * BufMgrLock lock -- must be acquired before manipulating the | ||||
|  *		buffer search datastructures (lookup/freelist, as well as the | ||||
|  *		flag bits of any buffer).  Must be released | ||||
|  *		before exit and before doing any IO. | ||||
|  * | ||||
|  * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. | ||||
|  *		It must be set when an IO is initiated and cleared at | ||||
|  *		the end of the IO.	It is there to make sure that one | ||||
|  *		process doesn't start to use a buffer while another is | ||||
|  *		faulting it in.  see IOWait/IOSignal. | ||||
|  *		faulting it in.  see WaitIO and related routines. | ||||
|  * | ||||
|  * refcount --	Counts the number of processes holding pins on a buffer. | ||||
|  *		A buffer is pinned during IO and immediately after a BufferAlloc(). | ||||
| @@ -85,10 +82,8 @@ long int	LocalBufferFlushCount; | ||||
| void | ||||
| InitBufferPool(void) | ||||
| { | ||||
| 	char	   *BufferBlocks; | ||||
| 	bool		foundBufs, | ||||
| 				foundDescs; | ||||
| 	int			i; | ||||
|  | ||||
| 	BufferDescriptors = (BufferDesc *) | ||||
| 		ShmemInitStruct("Buffer Descriptors", | ||||
| @@ -102,52 +97,42 @@ InitBufferPool(void) | ||||
| 	{ | ||||
| 		/* both should be present or neither */ | ||||
| 		Assert(foundDescs && foundBufs); | ||||
| 		/* note: this path is only taken in EXEC_BACKEND case */ | ||||
| 	} | ||||
| 	else | ||||
| 	{ | ||||
| 		BufferDesc *buf; | ||||
| 		char	   *block; | ||||
|  | ||||
| 		/* | ||||
| 		 * It's probably not really necessary to grab the lock --- if | ||||
| 		 * there's anyone else attached to the shmem at this point, we've | ||||
| 		 * got problems. | ||||
| 		 */ | ||||
| 		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); | ||||
| 		int			i; | ||||
|  | ||||
| 		buf = BufferDescriptors; | ||||
| 		block = BufferBlocks; | ||||
|  | ||||
| 		/* | ||||
| 		 * Initialize all the buffer headers. | ||||
| 		 */ | ||||
| 		for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++) | ||||
| 		for (i = 0; i < NBuffers; buf++, i++) | ||||
| 		{ | ||||
| 			Assert(ShmemIsValid((unsigned long) block)); | ||||
|  | ||||
| 			/* | ||||
| 			 * The bufNext fields link together all totally-unused buffers. | ||||
| 			 * Subsequent management of this list is done by | ||||
| 			 * StrategyGetBuffer(). | ||||
| 			 */ | ||||
| 			buf->bufNext = i + 1; | ||||
|  | ||||
| 			CLEAR_BUFFERTAG(buf->tag); | ||||
| 			buf->flags = 0; | ||||
| 			buf->usage_count = 0; | ||||
| 			buf->refcount = 0; | ||||
| 			buf->wait_backend_id = 0; | ||||
|  | ||||
| 			SpinLockInit(&buf->buf_hdr_lock); | ||||
|  | ||||
| 			buf->buf_id = i; | ||||
|  | ||||
| 			buf->data = MAKE_OFFSET(block); | ||||
| 			buf->flags = 0; | ||||
| 			buf->refcount = 0; | ||||
| 			/* | ||||
| 			 * Initially link all the buffers together as unused. | ||||
| 			 * Subsequent management of this list is done by freelist.c. | ||||
| 			 */ | ||||
| 			buf->freeNext = i + 1; | ||||
|  | ||||
| 			buf->io_in_progress_lock = LWLockAssign(); | ||||
| 			buf->cntx_lock = LWLockAssign(); | ||||
| 			buf->cntxDirty = false; | ||||
| 			buf->wait_backend_id = 0; | ||||
| 			buf->content_lock = LWLockAssign(); | ||||
| 		} | ||||
|  | ||||
| 		/* Correct last entry of linked list */ | ||||
| 		BufferDescriptors[NBuffers - 1].bufNext = -1; | ||||
|  | ||||
| 		LWLockRelease(BufMgrLock); | ||||
| 		BufferDescriptors[NBuffers - 1].freeNext = FREENEXT_END_OF_LIST; | ||||
| 	} | ||||
|  | ||||
| 	/* Init other shared buffer-management stuff */ | ||||
| @@ -162,12 +147,13 @@ InitBufferPool(void) | ||||
|  * buffer pool. | ||||
|  * | ||||
|  * NB: this is called before InitProcess(), so we do not have a PGPROC and | ||||
|  * cannot do LWLockAcquire; hence we can't actually access the bufmgr's | ||||
|  * cannot do LWLockAcquire; hence we can't actually access stuff in | ||||
|  * shared memory yet.  We are only initializing local data here. | ||||
|  */ | ||||
| void | ||||
| InitBufferPoolAccess(void) | ||||
| { | ||||
| 	char	   *block; | ||||
| 	int			i; | ||||
|  | ||||
| 	/* | ||||
| @@ -179,12 +165,18 @@ InitBufferPoolAccess(void) | ||||
| 									   sizeof(*PrivateRefCount)); | ||||
|  | ||||
| 	/* | ||||
| 	 * Convert shmem offsets into addresses as seen by this process. This | ||||
| 	 * is just to speed up the BufferGetBlock() macro.  It is OK to do this | ||||
| 	 * without any lock since the data pointers never change. | ||||
| 	 * Construct addresses for the individual buffer data blocks.  We do | ||||
| 	 * this just to speed up the BufferGetBlock() macro.  (Since the | ||||
| 	 * addresses should be the same in every backend, we could inherit | ||||
| 	 * this data from the postmaster --- but in the EXEC_BACKEND case | ||||
| 	 * that doesn't work.) | ||||
| 	 */ | ||||
| 	block = BufferBlocks; | ||||
| 	for (i = 0; i < NBuffers; i++) | ||||
| 		BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data); | ||||
| 	{ | ||||
| 		BufferBlockPointers[i] = (Block) block; | ||||
| 		block += BLCKSZ; | ||||
| 	} | ||||
| } | ||||
|  | ||||
| /* | ||||
|   | ||||
| @@ -3,12 +3,9 @@ | ||||
|  * buf_table.c | ||||
|  *	  routines for mapping BufferTags to buffer indexes. | ||||
|  * | ||||
|  * NOTE: this module is called only by freelist.c, and the "buffer IDs" | ||||
|  * it deals with are whatever freelist.c needs them to be; they may not be | ||||
|  * directly equivalent to Buffer numbers. | ||||
|  * | ||||
|  * Note: all routines in this file assume that the BufMgrLock is held | ||||
|  * by the caller, so no synchronization is needed. | ||||
|  * Note: the routines in this file do no locking of their own.  The caller | ||||
|  * must hold BufMappingLock in a suitable mode, as specified in the comment | ||||
|  * for each routine. | ||||
|  * | ||||
|  * | ||||
|  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group | ||||
| @@ -16,7 +13,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.39 2005/02/03 23:29:11 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.40 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -74,17 +71,17 @@ InitBufTable(int size) | ||||
| /* | ||||
|  * BufTableLookup | ||||
|  *		Lookup the given BufferTag; return buffer ID, or -1 if not found | ||||
|  * | ||||
|  * Caller must hold at least share lock on BufMappingLock | ||||
|  */ | ||||
| int | ||||
| BufTableLookup(BufferTag *tagPtr) | ||||
| { | ||||
| 	BufferLookupEnt *result; | ||||
|  | ||||
| 	if (tagPtr->blockNum == P_NEW) | ||||
| 		return -1; | ||||
|  | ||||
| 	result = (BufferLookupEnt *) | ||||
| 		hash_search(SharedBufHash, (void *) tagPtr, HASH_FIND, NULL); | ||||
|  | ||||
| 	if (!result) | ||||
| 		return -1; | ||||
|  | ||||
| @@ -93,14 +90,23 @@ BufTableLookup(BufferTag *tagPtr) | ||||
|  | ||||
| /* | ||||
|  * BufTableInsert | ||||
|  *		Insert a hashtable entry for given tag and buffer ID | ||||
|  *		Insert a hashtable entry for given tag and buffer ID, | ||||
|  *		unless an entry already exists for that tag | ||||
|  * | ||||
|  * Returns -1 on successful insertion.  If a conflicting entry exists | ||||
|  * already, returns the buffer ID in that entry. | ||||
|  * | ||||
|  * Caller must hold write lock on BufMappingLock | ||||
|  */ | ||||
| void | ||||
| int | ||||
| BufTableInsert(BufferTag *tagPtr, int buf_id) | ||||
| { | ||||
| 	BufferLookupEnt *result; | ||||
| 	bool		found; | ||||
|  | ||||
| 	Assert(buf_id >= 0);		/* -1 is reserved for not-in-table */ | ||||
| 	Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ | ||||
|  | ||||
| 	result = (BufferLookupEnt *) | ||||
| 		hash_search(SharedBufHash, (void *) tagPtr, HASH_ENTER, &found); | ||||
|  | ||||
| @@ -109,15 +115,19 @@ BufTableInsert(BufferTag *tagPtr, int buf_id) | ||||
| 				(errcode(ERRCODE_OUT_OF_MEMORY), | ||||
| 				 errmsg("out of shared memory"))); | ||||
|  | ||||
| 	if (found)					/* found something already in the table? */ | ||||
| 		elog(ERROR, "shared buffer hash table corrupted"); | ||||
| 	if (found)					/* found something already in the table */ | ||||
| 		return result->id; | ||||
|  | ||||
| 	result->id = buf_id; | ||||
|  | ||||
| 	return -1; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * BufTableDelete | ||||
|  *		Delete the hashtable entry for given tag (which must exist) | ||||
|  * | ||||
|  * Caller must hold write lock on BufMappingLock | ||||
|  */ | ||||
| void | ||||
| BufTableDelete(BufferTag *tagPtr) | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -9,7 +9,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.62 2005/01/10 20:02:21 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.63 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -24,6 +24,10 @@ | ||||
|  | ||||
| /*#define LBDEBUG*/ | ||||
|  | ||||
| /* Note: this macro only works on local buffers, not shared ones! */ | ||||
| #define LocalBufHdrGetBlock(bufHdr)	\ | ||||
| 	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] | ||||
|  | ||||
| /* should be a GUC parameter some day */ | ||||
| int			NLocBuffer = 64; | ||||
|  | ||||
| @@ -39,7 +43,7 @@ static int	nextFreeLocalBuf = 0; | ||||
|  *	  allocate a local buffer. We do round robin allocation for now. | ||||
|  * | ||||
|  * API is similar to bufmgr.c's BufferAlloc, except that we do not need | ||||
|  * to have the BufMgrLock since this is all local.	Also, IO_IN_PROGRESS | ||||
|  * to do any locking since this is all local.	Also, IO_IN_PROGRESS | ||||
|  * does not get set. | ||||
|  */ | ||||
| BufferDesc * | ||||
| @@ -47,11 +51,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| { | ||||
| 	BufferTag	newTag;			/* identity of requested block */ | ||||
| 	int			i; | ||||
| 	int			trycounter; | ||||
| 	BufferDesc *bufHdr; | ||||
|  | ||||
| 	INIT_BUFFERTAG(newTag, reln, blockNum); | ||||
|  | ||||
| 	/* a low tech search for now -- not optimized for scans */ | ||||
| 	/* a low tech search for now -- should use a hashtable */ | ||||
| 	for (i = 0; i < NLocBuffer; i++) | ||||
| 	{ | ||||
| 		bufHdr = &LocalBufferDescriptors[i]; | ||||
| @@ -81,32 +86,44 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| 			RelationGetRelid(reln), blockNum, -nextFreeLocalBuf - 1); | ||||
| #endif | ||||
|  | ||||
| 	/* need to get a new buffer (round robin for now) */ | ||||
| 	bufHdr = NULL; | ||||
| 	for (i = 0; i < NLocBuffer; i++) | ||||
| 	/* | ||||
| 	 * Need to get a new buffer.  We use a clock sweep algorithm | ||||
| 	 * (essentially the same as what freelist.c does now...) | ||||
| 	 */ | ||||
| 	trycounter = NLocBuffer; | ||||
| 	for (;;) | ||||
| 	{ | ||||
| 		int			b = (nextFreeLocalBuf + i) % NLocBuffer; | ||||
| 		int			b = nextFreeLocalBuf; | ||||
|  | ||||
| 		if (LocalRefCount[b] == 0) | ||||
| 		if (++nextFreeLocalBuf >= NLocBuffer) | ||||
| 			nextFreeLocalBuf = 0; | ||||
|  | ||||
| 		bufHdr = &LocalBufferDescriptors[b]; | ||||
|  | ||||
| 		if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0) | ||||
| 		{ | ||||
| 			bufHdr = &LocalBufferDescriptors[b]; | ||||
| 			LocalRefCount[b]++; | ||||
| 			ResourceOwnerRememberBuffer(CurrentResourceOwner, | ||||
| 									  BufferDescriptorGetBuffer(bufHdr)); | ||||
| 			nextFreeLocalBuf = (b + 1) % NLocBuffer; | ||||
| 										BufferDescriptorGetBuffer(bufHdr)); | ||||
| 			break; | ||||
| 		} | ||||
|  | ||||
| 		if (bufHdr->usage_count > 0) | ||||
| 		{ | ||||
| 			bufHdr->usage_count--; | ||||
| 			trycounter = NLocBuffer; | ||||
| 		} | ||||
| 		else if (--trycounter == 0) | ||||
| 			ereport(ERROR, | ||||
| 					(errcode(ERRCODE_INSUFFICIENT_RESOURCES), | ||||
| 					 errmsg("no empty local buffer available"))); | ||||
| 	} | ||||
| 	if (bufHdr == NULL) | ||||
| 		ereport(ERROR, | ||||
| 				(errcode(ERRCODE_INSUFFICIENT_RESOURCES), | ||||
| 				 errmsg("no empty local buffer available"))); | ||||
|  | ||||
| 	/* | ||||
| 	 * This buffer is not referenced, but it might still be dirty.  If | ||||
| 	 * that's the case, write it out before reusing it! | ||||
| 	 */ | ||||
| 	if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) | ||||
| 	if (bufHdr->flags & BM_DIRTY) | ||||
| 	{ | ||||
| 		SMgrRelation oreln; | ||||
|  | ||||
| @@ -116,7 +133,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| 		/* And write... */ | ||||
| 		smgrwrite(oreln, | ||||
| 				  bufHdr->tag.blockNum, | ||||
| 				  (char *) MAKE_PTR(bufHdr->data), | ||||
| 				  (char *) LocalBufHdrGetBlock(bufHdr), | ||||
| 				  true); | ||||
|  | ||||
| 		LocalBufferFlushCount++; | ||||
| @@ -129,7 +146,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| 	 * use, so it's okay to do it (and possibly error out) before marking | ||||
| 	 * the buffer as not dirty. | ||||
| 	 */ | ||||
| 	if (bufHdr->data == (SHMEM_OFFSET) 0) | ||||
| 	if (LocalBufHdrGetBlock(bufHdr) == NULL) | ||||
| 	{ | ||||
| 		char	   *data = (char *) malloc(BLCKSZ); | ||||
|  | ||||
| @@ -138,17 +155,10 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| 					(errcode(ERRCODE_OUT_OF_MEMORY), | ||||
| 					 errmsg("out of memory"))); | ||||
|  | ||||
| 		/* | ||||
| 		 * This is a bit of a hack: bufHdr->data needs to be a shmem | ||||
| 		 * offset for consistency with the shared-buffer case, so make it | ||||
| 		 * one even though it's not really a valid shmem offset. | ||||
| 		 */ | ||||
| 		bufHdr->data = MAKE_OFFSET(data); | ||||
|  | ||||
| 		/* | ||||
| 		 * Set pointer for use by BufferGetBlock() macro. | ||||
| 		 */ | ||||
| 		LocalBufferBlockPointers[-(bufHdr->buf_id + 2)] = (Block) data; | ||||
| 		LocalBufHdrGetBlock(bufHdr) = (Block) data; | ||||
| 	} | ||||
|  | ||||
| 	/* | ||||
| @@ -156,7 +166,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) | ||||
| 	 */ | ||||
| 	bufHdr->tag = newTag; | ||||
| 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); | ||||
| 	bufHdr->cntxDirty = false; | ||||
| 	bufHdr->flags |= BM_TAG_VALID; | ||||
| 	bufHdr->usage_count = 0; | ||||
|  | ||||
| 	*foundPtr = FALSE; | ||||
| 	return bufHdr; | ||||
| @@ -170,6 +181,7 @@ void | ||||
| WriteLocalBuffer(Buffer buffer, bool release) | ||||
| { | ||||
| 	int			bufid; | ||||
| 	BufferDesc *bufHdr; | ||||
|  | ||||
| 	Assert(BufferIsLocal(buffer)); | ||||
|  | ||||
| @@ -178,12 +190,18 @@ WriteLocalBuffer(Buffer buffer, bool release) | ||||
| #endif | ||||
|  | ||||
| 	bufid = -(buffer + 1); | ||||
| 	LocalBufferDescriptors[bufid].flags |= BM_DIRTY; | ||||
|  | ||||
| 	Assert(LocalRefCount[bufid] > 0); | ||||
|  | ||||
| 	bufHdr = &LocalBufferDescriptors[bufid]; | ||||
| 	bufHdr->flags |= BM_DIRTY; | ||||
|  | ||||
| 	if (release) | ||||
| 	{ | ||||
| 		Assert(LocalRefCount[bufid] > 0); | ||||
| 		LocalRefCount[bufid]--; | ||||
| 		if (LocalRefCount[bufid] == 0 && | ||||
| 			bufHdr->usage_count < BM_MAX_USAGE_COUNT) | ||||
| 			bufHdr->usage_count++; | ||||
| 		ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -10,7 +10,7 @@ | ||||
|  * Written by Peter Eisentraut <peter_e@gmx.net>. | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.253 2005/03/01 20:23:34 tgl Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.254 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *-------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -77,7 +77,6 @@ extern bool Log_disconnections; | ||||
| extern DLLIMPORT bool check_function_bodies; | ||||
| extern int	CommitDelay; | ||||
| extern int	CommitSiblings; | ||||
| extern int	DebugSharedBuffers; | ||||
| extern char *default_tablespace; | ||||
|  | ||||
| static const char *assign_log_destination(const char *value, | ||||
| @@ -1230,15 +1229,6 @@ static struct config_int ConfigureNamesInt[] = | ||||
| 		-1, -1, INT_MAX / 1000, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"debug_shared_buffers", PGC_POSTMASTER, STATS_MONITORING, | ||||
| 			gettext_noop("Interval to report shared buffer status in seconds"), | ||||
| 			NULL | ||||
| 		}, | ||||
| 		&DebugSharedBuffers, | ||||
| 		0, 0, 600, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"bgwriter_delay", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer sleep time between rounds in milliseconds"), | ||||
| @@ -1249,21 +1239,21 @@ static struct config_int ConfigureNamesInt[] = | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"bgwriter_percent", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer percentage of dirty buffers to flush per round"), | ||||
| 		{"bgwriter_lru_maxpages", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer maximum number of LRU pages to flush per round"), | ||||
| 			NULL | ||||
| 		}, | ||||
| 		&BgWriterPercent, | ||||
| 		1, 0, 100, NULL, NULL | ||||
| 		&bgwriter_lru_maxpages, | ||||
| 		5, 0, 1000, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"bgwriter_maxpages", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer maximum number of pages to flush per round"), | ||||
| 		{"bgwriter_all_maxpages", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer maximum number of all pages to flush per round"), | ||||
| 			NULL | ||||
| 		}, | ||||
| 		&BgWriterMaxPages, | ||||
| 		100, 0, 1000, NULL, NULL | ||||
| 		&bgwriter_all_maxpages, | ||||
| 		5, 0, 1000, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| @@ -1394,6 +1384,24 @@ static struct config_real ConfigureNamesReal[] = | ||||
| 		MAX_GEQO_SELECTION_BIAS, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer percentage of LRU buffers to flush per round"), | ||||
| 			NULL | ||||
| 		}, | ||||
| 		&bgwriter_lru_percent, | ||||
| 		1.0, 0.0, 100.0, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"bgwriter_all_percent", PGC_SIGHUP, RESOURCES, | ||||
| 			gettext_noop("Background writer percentage of all buffers to flush per round"), | ||||
| 			NULL | ||||
| 		}, | ||||
| 		&bgwriter_all_percent, | ||||
| 		0.333, 0.0, 100.0, NULL, NULL | ||||
| 	}, | ||||
|  | ||||
| 	{ | ||||
| 		{"seed", PGC_USERSET, UNGROUPED, | ||||
| 			gettext_noop("Sets the seed for random-number generation."), | ||||
|   | ||||
| @@ -99,8 +99,10 @@ | ||||
| # - Background writer - | ||||
|  | ||||
| #bgwriter_delay = 200		# 10-10000 milliseconds between rounds | ||||
| #bgwriter_percent = 1		# 0-100% of dirty buffers in each round | ||||
| #bgwriter_maxpages = 100	# 0-1000 buffers max per round | ||||
| #bgwriter_lru_percent = 1.0	# 0-100% of LRU buffers scanned in each round | ||||
| #bgwriter_lru_maxpages = 5	# 0-1000 buffers max written per round | ||||
| #bgwriter_all_percent = 0.333	# 0-100% of all buffers scanned in each round | ||||
| #bgwriter_all_maxpages = 5	# 0-1000 buffers max written per round | ||||
|  | ||||
|  | ||||
| #--------------------------------------------------------------------------- | ||||
|   | ||||
| @@ -14,7 +14,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.9 2004/12/31 22:02:50 pgsql Exp $ | ||||
|  *	  $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.10 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -200,12 +200,7 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, | ||||
| 		 * that would indicate failure to clean up the executor correctly --- | ||||
| 		 * so issue warnings.  In the abort case, just clean up quietly. | ||||
| 		 * | ||||
| 		 * XXX this is fairly inefficient due to multiple BufMgrLock | ||||
| 		 * grabs if there are lots of buffers to be released, but we | ||||
| 		 * don't expect many (indeed none in the success case) so it's | ||||
| 		 * probably not worth optimizing. | ||||
| 		 * | ||||
| 		 * We are however careful to release back-to-front, so as to | ||||
| 		 * We are careful to do the releasing back-to-front, so as to | ||||
| 		 * avoid O(N^2) behavior in ResourceOwnerForgetBuffer(). | ||||
| 		 */ | ||||
| 		while (owner->nbuffers > 0) | ||||
|   | ||||
| @@ -5,7 +5,7 @@ | ||||
|  * | ||||
|  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group | ||||
|  * | ||||
|  * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.4 2004/12/31 22:03:39 pgsql Exp $ | ||||
|  * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.5 2005/03/04 20:21:06 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -18,8 +18,6 @@ | ||||
|  | ||||
| /* GUC options */ | ||||
| extern int	BgWriterDelay; | ||||
| extern int	BgWriterPercent; | ||||
| extern int	BgWriterMaxPages; | ||||
| extern int	CheckPointTimeout; | ||||
| extern int	CheckPointWarning; | ||||
|  | ||||
|   | ||||
| @@ -8,7 +8,7 @@ | ||||
|  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.76 2005/02/03 23:29:19 tgl Exp $ | ||||
|  * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.77 2005/03/04 20:21:07 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -19,24 +19,39 @@ | ||||
| #include "storage/buf.h" | ||||
| #include "storage/lwlock.h" | ||||
| #include "storage/shmem.h" | ||||
| #include "storage/spin.h" | ||||
| #include "utils/rel.h" | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * Flags for buffer descriptors | ||||
|  * | ||||
|  * Note: TAG_VALID essentially means that there is a buffer hashtable | ||||
|  * entry associated with the buffer's tag. | ||||
|  */ | ||||
| #define BM_DIRTY				(1 << 0)		/* data needs writing */ | ||||
| #define BM_VALID				(1 << 1)		/* data is valid */ | ||||
| #define BM_IO_IN_PROGRESS		(1 << 2)		/* read or write in | ||||
| #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */ | ||||
| #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in | ||||
| 												 * progress */ | ||||
| #define BM_IO_ERROR				(1 << 3)		/* previous I/O failed */ | ||||
| #define BM_JUST_DIRTIED			(1 << 4)		/* dirtied since write | ||||
| #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */ | ||||
| #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write | ||||
| 												 * started */ | ||||
| #define BM_PIN_COUNT_WAITER		(1 << 5)		/* have waiter for sole | ||||
| #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole | ||||
| 												 * pin */ | ||||
|  | ||||
| typedef bits16 BufFlags; | ||||
|  | ||||
| /* | ||||
|  * The maximum allowed value of usage_count represents a tradeoff between | ||||
|  * accuracy and speed of the clock-sweep buffer management algorithm.  A | ||||
|  * large value (comparable to NBuffers) would approximate LRU semantics. | ||||
|  * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of | ||||
|  * clock sweeps to find a free buffer, so in practice we don't want the | ||||
|  * value to be very large. | ||||
|  */ | ||||
| #define BM_MAX_USAGE_COUNT	5 | ||||
|  | ||||
| /* | ||||
|  * Buffer tag identifies which disk block the buffer contains. | ||||
|  * | ||||
| @@ -77,45 +92,81 @@ typedef struct buftag | ||||
|  | ||||
| /* | ||||
|  *	BufferDesc -- shared descriptor/state data for a single shared buffer. | ||||
|  * | ||||
|  * Note: buf_hdr_lock must be held to examine or change the tag, flags, | ||||
|  * usage_count, refcount, or wait_backend_id fields.  buf_id field never | ||||
|  * changes after initialization, so does not need locking.  freeNext is | ||||
|  * protected by the BufFreelistLock not buf_hdr_lock.  The LWLocks can take | ||||
|  * care of themselves.  The buf_hdr_lock is *not* used to control access to | ||||
|  * the data in the buffer! | ||||
|  * | ||||
|  * An exception is that if we have the buffer pinned, its tag can't change | ||||
|  * underneath us, so we can examine the tag without locking the spinlock. | ||||
|  * Also, in places we do one-time reads of the flags without bothering to | ||||
|  * lock the spinlock; this is generally for situations where we don't expect | ||||
|  * the flag bit being tested to be changing. | ||||
|  * | ||||
|  * We can't physically remove items from a disk page if another backend has | ||||
|  * the buffer pinned.  Hence, a backend may need to wait for all other pins | ||||
|  * to go away.  This is signaled by storing its own backend ID into | ||||
|  * wait_backend_id and setting flag bit BM_PIN_COUNT_WAITER.  At present, | ||||
|  * there can be only one such waiter per buffer. | ||||
|  * | ||||
|  * We use this same struct for local buffer headers, but the lock fields | ||||
|  * are not used and not all of the flag bits are useful either. | ||||
|  */ | ||||
| typedef struct sbufdesc | ||||
| { | ||||
| 	Buffer		bufNext;		/* link in freelist chain */ | ||||
| 	SHMEM_OFFSET data;			/* pointer to data in buf pool */ | ||||
|  | ||||
| 	/* tag and id must be together for table lookup (still true?) */ | ||||
| 	BufferTag	tag;			/* file/block identifier */ | ||||
| 	int			buf_id;			/* buffer's index number (from 0) */ | ||||
|  | ||||
| 	BufferTag	tag;			/* ID of page contained in buffer */ | ||||
| 	BufFlags	flags;			/* see bit definitions above */ | ||||
| 	uint16		usage_count;	/* usage counter for clock sweep code */ | ||||
| 	unsigned	refcount;		/* # of backends holding pins on buffer */ | ||||
| 	BackendId	wait_backend_id;	/* backend ID of pin-count waiter */ | ||||
|  | ||||
| 	slock_t		buf_hdr_lock;	/* protects the above fields */ | ||||
|  | ||||
| 	int			buf_id;			/* buffer's index number (from 0) */ | ||||
| 	int			freeNext;		/* link in freelist chain */ | ||||
|  | ||||
| 	LWLockId	io_in_progress_lock;	/* to wait for I/O to complete */ | ||||
| 	LWLockId	cntx_lock;		/* to lock access to page context */ | ||||
|  | ||||
| 	bool		cntxDirty;		/* new way to mark block as dirty */ | ||||
|  | ||||
| 	/* | ||||
| 	 * We can't physically remove items from a disk page if another | ||||
| 	 * backend has the buffer pinned.  Hence, a backend may need to wait | ||||
| 	 * for all other pins to go away.  This is signaled by storing its own | ||||
| 	 * backend ID into wait_backend_id and setting flag bit | ||||
| 	 * BM_PIN_COUNT_WAITER. At present, there can be only one such waiter | ||||
| 	 * per buffer. | ||||
| 	 */ | ||||
| 	BackendId	wait_backend_id;	/* backend ID of pin-count waiter */ | ||||
| 	LWLockId	content_lock;	/* to lock access to buffer contents */ | ||||
| } BufferDesc; | ||||
|  | ||||
| #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) | ||||
|  | ||||
| /* | ||||
|  * The freeNext field is either the index of the next freelist entry, | ||||
|  * or one of these special values: | ||||
|  */ | ||||
| #define FREENEXT_END_OF_LIST	(-1) | ||||
| #define FREENEXT_NOT_IN_LIST	(-2) | ||||
|  | ||||
| /* in bufmgr.c */ | ||||
| /* | ||||
|  * Macros for acquiring/releasing a buffer header's spinlock.  The | ||||
|  * NoHoldoff cases may be used when we know that we hold some LWLock | ||||
|  * and therefore interrupts are already held off.  Do not apply these | ||||
|  * to local buffers! | ||||
|  */ | ||||
| #define LockBufHdr(bufHdr)  \ | ||||
| 	SpinLockAcquire(&(bufHdr)->buf_hdr_lock) | ||||
| #define UnlockBufHdr(bufHdr)  \ | ||||
| 	SpinLockRelease(&(bufHdr)->buf_hdr_lock) | ||||
| #define LockBufHdr_NoHoldoff(bufHdr)  \ | ||||
| 	SpinLockAcquire_NoHoldoff(&(bufHdr)->buf_hdr_lock) | ||||
| #define UnlockBufHdr_NoHoldoff(bufHdr)  \ | ||||
| 	SpinLockRelease_NoHoldoff(&(bufHdr)->buf_hdr_lock) | ||||
|  | ||||
|  | ||||
| /* in buf_init.c */ | ||||
| extern BufferDesc *BufferDescriptors; | ||||
|  | ||||
| /* in localbuf.c */ | ||||
| extern BufferDesc *LocalBufferDescriptors; | ||||
|  | ||||
| /* counters in buf_init.c */ | ||||
| /* in freelist.c */ | ||||
| extern bool strategy_hint_vacuum; | ||||
|  | ||||
| /* event counters in buf_init.c */ | ||||
| extern long int ReadBufferCount; | ||||
| extern long int ReadLocalBufferCount; | ||||
| extern long int BufferHitCount; | ||||
| @@ -129,15 +180,9 @@ extern long int LocalBufferFlushCount; | ||||
|  */ | ||||
|  | ||||
| /* freelist.c */ | ||||
| extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck, | ||||
| 					 int *cdb_found_index); | ||||
| extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index); | ||||
| extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, | ||||
| 					  int cdb_found_index, int cdb_replace_index); | ||||
| extern void StrategyInvalidateBuffer(BufferDesc *buf); | ||||
| extern void StrategyHintVacuum(bool vacuum_active); | ||||
| extern int StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags, | ||||
| 						int max_buffers); | ||||
| extern BufferDesc *StrategyGetBuffer(void); | ||||
| extern void StrategyFreeBuffer(BufferDesc *buf, bool at_head); | ||||
| extern int	StrategySyncStart(void); | ||||
| extern int	StrategyShmemSize(void); | ||||
| extern void StrategyInitialize(bool init); | ||||
|  | ||||
| @@ -145,7 +190,7 @@ extern void StrategyInitialize(bool init); | ||||
| extern int	BufTableShmemSize(int size); | ||||
| extern void InitBufTable(int size); | ||||
| extern int	BufTableLookup(BufferTag *tagPtr); | ||||
| extern void BufTableInsert(BufferTag *tagPtr, int buf_id); | ||||
| extern int	BufTableInsert(BufferTag *tagPtr, int buf_id); | ||||
| extern void BufTableDelete(BufferTag *tagPtr); | ||||
|  | ||||
| /* localbuf.c */ | ||||
|   | ||||
| @@ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.89 2004/12/31 22:03:42 pgsql Exp $ | ||||
|  * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.90 2005/03/04 20:21:07 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -27,21 +27,25 @@ extern DLLIMPORT int NBuffers; | ||||
|  | ||||
| /* in bufmgr.c */ | ||||
| extern bool zero_damaged_pages; | ||||
| extern double bgwriter_lru_percent; | ||||
| extern double bgwriter_all_percent; | ||||
| extern int	bgwriter_lru_maxpages; | ||||
| extern int	bgwriter_all_maxpages; | ||||
|  | ||||
| /* in buf_init.c */ | ||||
| extern DLLIMPORT Block *BufferBlockPointers; | ||||
| extern int32 *PrivateRefCount; | ||||
| extern DLLIMPORT int32 *PrivateRefCount; | ||||
|  | ||||
| /* in localbuf.c */ | ||||
| extern DLLIMPORT int NLocBuffer; | ||||
| extern DLLIMPORT Block *LocalBufferBlockPointers; | ||||
| extern int32 *LocalRefCount; | ||||
| extern DLLIMPORT int32 *LocalRefCount; | ||||
|  | ||||
| /* special block number for ReadBuffer() */ | ||||
| #define P_NEW	InvalidBlockNumber		/* grow the file to get a new page */ | ||||
|  | ||||
| /* | ||||
|  * Buffer context lock modes | ||||
|  * Buffer content lock modes (mode argument for LockBuffer()) | ||||
|  */ | ||||
| #define BUFFER_LOCK_UNLOCK		0 | ||||
| #define BUFFER_LOCK_SHARE		1 | ||||
| @@ -150,8 +154,12 @@ extern void LockBufferForCleanup(Buffer buffer); | ||||
| extern void AbortBufferIO(void); | ||||
|  | ||||
| extern void BufmgrCommit(void); | ||||
| extern int	BufferSync(int percent, int maxpages); | ||||
| extern void	BufferSync(void); | ||||
| extern void BgBufferSync(void); | ||||
|  | ||||
| extern void InitLocalBuffer(void); | ||||
|  | ||||
| /* in freelist.c */ | ||||
| extern void StrategyHintVacuum(bool vacuum_active); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.16 2004/12/31 22:03:42 pgsql Exp $ | ||||
|  * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.17 2005/03/04 20:21:07 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @@ -25,7 +25,8 @@ | ||||
|  */ | ||||
| typedef enum LWLockId | ||||
| { | ||||
| 	BufMgrLock, | ||||
| 	BufMappingLock, | ||||
| 	BufFreelistLock, | ||||
| 	LockMgrLock, | ||||
| 	OidGenLock, | ||||
| 	XidGenLock, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user