mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Doc: Hash Indexes.
A new chapter for Hash Indexes, designed to help users understand how they work and when to use them. Backpatch-through 10 where we have made hash indexes durable. Author: Simon Riggs Reviewed-By: Justin Pryzby, Amit Kapila Discussion: https://postgr.es/m/CANbhV-HRjNPYgHo--P1ewBrFJ-GpZPb9_25P7=Wgu7s7hy_sLQ@mail.gmail.com
This commit is contained in:
		@@ -89,6 +89,7 @@
 | 
				
			|||||||
<!ENTITY spgist     SYSTEM "spgist.sgml">
 | 
					<!ENTITY spgist     SYSTEM "spgist.sgml">
 | 
				
			||||||
<!ENTITY gin        SYSTEM "gin.sgml">
 | 
					<!ENTITY gin        SYSTEM "gin.sgml">
 | 
				
			||||||
<!ENTITY brin       SYSTEM "brin.sgml">
 | 
					<!ENTITY brin       SYSTEM "brin.sgml">
 | 
				
			||||||
 | 
					<!ENTITY hash       SYSTEM "hash.sgml">
 | 
				
			||||||
<!ENTITY planstats    SYSTEM "planstats.sgml">
 | 
					<!ENTITY planstats    SYSTEM "planstats.sgml">
 | 
				
			||||||
<!ENTITY tableam    SYSTEM "tableam.sgml">
 | 
					<!ENTITY tableam    SYSTEM "tableam.sgml">
 | 
				
			||||||
<!ENTITY indexam    SYSTEM "indexam.sgml">
 | 
					<!ENTITY indexam    SYSTEM "indexam.sgml">
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										162
									
								
								doc/src/sgml/hash.sgml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										162
									
								
								doc/src/sgml/hash.sgml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,162 @@
 | 
				
			|||||||
 | 
					<!-- doc/src/sgml/hash.sgml -->
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<chapter id="hash-index">
 | 
				
			||||||
 | 
					<title>Hash Indexes</title>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   <indexterm>
 | 
				
			||||||
 | 
					    <primary>index</primary>
 | 
				
			||||||
 | 
					    <secondary>Hash</secondary>
 | 
				
			||||||
 | 
					   </indexterm>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<sect1 id="hash-intro">
 | 
				
			||||||
 | 
					 <title>Overview</title>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  <productname>PostgreSQL</productname>
 | 
				
			||||||
 | 
					  includes an implementation of persistent on-disk hash indexes,
 | 
				
			||||||
 | 
					  which are fully crash recoverable. Any data type can be indexed by a
 | 
				
			||||||
 | 
					  hash index, including data types that do not have a well-defined linear
 | 
				
			||||||
 | 
					  ordering. Hash indexes store only the hash value of the data being
 | 
				
			||||||
 | 
					  indexed, thus there are no restrictions on the size of the data column
 | 
				
			||||||
 | 
					  being indexed.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Hash indexes support only single-column indexes and do not allow
 | 
				
			||||||
 | 
					  uniqueness checking.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Hash indexes support only the <literal>=</literal> operator,
 | 
				
			||||||
 | 
					  so WHERE clauses that specify range operations will not be able to take
 | 
				
			||||||
 | 
					  advantage of hash indexes.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Each hash index tuple stores just the 4-byte hash value, not the actual
 | 
				
			||||||
 | 
					  column value. As a result, hash indexes may be much smaller than B-trees
 | 
				
			||||||
 | 
					  when indexing longer data items such as UUIDs, URLs, etc. The absence of
 | 
				
			||||||
 | 
					  the column value also makes all hash index scans lossy. Hash indexes may
 | 
				
			||||||
 | 
					  take part in bitmap index scans and backward scans.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Hash indexes are best optimized for SELECT and UPDATE-heavy workloads
 | 
				
			||||||
 | 
					  that use equality scans on larger tables. In a B-tree index, searches must
 | 
				
			||||||
 | 
					  descend through the tree until the leaf page is found. In tables with
 | 
				
			||||||
 | 
					  millions of rows, this descent can increase access time to data. The
 | 
				
			||||||
 | 
					  equivalent of a leaf page in a hash index is referred to as a bucket page. In
 | 
				
			||||||
 | 
					  contrast, a hash index allows accessing the bucket pages directly,
 | 
				
			||||||
 | 
					  thereby potentially reducing index access time in larger tables. This
 | 
				
			||||||
 | 
					  reduction in "logical I/O" becomes even more pronounced on indexes/data
 | 
				
			||||||
 | 
					  larger than shared_buffers/RAM. 
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Hash indexes have been designed to cope with uneven distributions of
 | 
				
			||||||
 | 
					  hash values. Direct access to the bucket pages works well if the hash
 | 
				
			||||||
 | 
					  values are evenly distributed. When inserts mean that the bucket page
 | 
				
			||||||
 | 
					  becomes full, additional overflow pages are chained to that specific
 | 
				
			||||||
 | 
					  bucket page, locally expanding the storage for index tuples that match
 | 
				
			||||||
 | 
					  that hash value. When scanning a hash bucket during queries, we need to
 | 
				
			||||||
 | 
					  scan through all of the overflow pages. Thus an unbalanced hash index
 | 
				
			||||||
 | 
					  might actually be worse than a B-tree in terms of number of block
 | 
				
			||||||
 | 
					  accesses required, for some data.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  As a result of the overflow cases, we can say that hash indexes are
 | 
				
			||||||
 | 
					  most suitable for unique, nearly unique data or data with a low number
 | 
				
			||||||
 | 
					  of rows per hash bucket.
 | 
				
			||||||
 | 
					  One possible way to avoid problems is to exclude highly non-unique
 | 
				
			||||||
 | 
					  values from the index using a partial index condition, but this may
 | 
				
			||||||
 | 
					  not be suitable in many cases.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Like B-Trees, hash indexes perform simple index tuple deletion. This
 | 
				
			||||||
 | 
					  is a deferred maintenance operation that deletes index tuples that are
 | 
				
			||||||
 | 
					  known to be safe to delete (those whose item identifier's LP_DEAD bit
 | 
				
			||||||
 | 
					  is already set). If an insert finds no space is available on a page we
 | 
				
			||||||
 | 
					  try to avoid creating a new overflow page by attempting to remove dead
 | 
				
			||||||
 | 
					  index tuples. Removal cannot occur if the page is pinned at that time.
 | 
				
			||||||
 | 
					  Deletion of dead index pointers also occurs during VACUUM.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  If it can, VACUUM will also try to squeeze the index tuples onto as
 | 
				
			||||||
 | 
					  few overflow pages as possible, minimizing the overflow chain. If an
 | 
				
			||||||
 | 
					  overflow page becomes empty, overflow pages can be recycled for reuse
 | 
				
			||||||
 | 
					  in other buckets, though we never return them to the operating system.
 | 
				
			||||||
 | 
					  There is currently no provision to shrink a hash index, other than by
 | 
				
			||||||
 | 
					  rebuilding it with REINDEX.  
 | 
				
			||||||
 | 
					  There is no provision for reducing the number of buckets, either.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Hash indexes may expand the number of bucket pages as the number of
 | 
				
			||||||
 | 
					  rows indexed grows. The hash key-to-bucket-number mapping is chosen so that
 | 
				
			||||||
 | 
					  the index can be incrementally expanded. When a new bucket is to be added to
 | 
				
			||||||
 | 
					  the index, exactly one existing bucket will need to be "split", with some of
 | 
				
			||||||
 | 
					  its tuples being transferred to the new bucket according to the updated
 | 
				
			||||||
 | 
					  key-to-bucket-number mapping.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  The expansion occurs in the foreground, which could increase execution
 | 
				
			||||||
 | 
					  time for user inserts. Thus, hash indexes may not be suitable for tables
 | 
				
			||||||
 | 
					  with rapidly increasing number of rows.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</sect1>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<sect1 id="hash-implementation">
 | 
				
			||||||
 | 
					 <title>Implementation</title>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  There are four kinds of pages in a hash index: the meta page (page zero),
 | 
				
			||||||
 | 
					  which contains statically allocated control information; primary bucket
 | 
				
			||||||
 | 
					  pages; overflow pages; and bitmap pages, which keep track of overflow
 | 
				
			||||||
 | 
					  pages that have been freed and are available for re-use. For addressing
 | 
				
			||||||
 | 
					  purposes, bitmap pages are regarded as a subset of the overflow pages.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Both scanning the index and inserting tuples require locating the bucket
 | 
				
			||||||
 | 
					  where a given tuple ought to be located. To do this, we need the bucket
 | 
				
			||||||
 | 
					  count, highmask, and lowmask from the metapage; however, it's undesirable
 | 
				
			||||||
 | 
					  for performance reasons to have to have to lock and pin the metapage for
 | 
				
			||||||
 | 
					  every such operation. Instead, we retain a cached copy of the metapage
 | 
				
			||||||
 | 
					  in each backend's relcache entry. This will produce the correct bucket
 | 
				
			||||||
 | 
					  mapping as long as the target bucket hasn't been split since the last
 | 
				
			||||||
 | 
					  cache refresh.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Primary bucket pages and overflow pages are allocated independently since
 | 
				
			||||||
 | 
					  any given index might need more or fewer overflow pages relative to its
 | 
				
			||||||
 | 
					  number of buckets. The hash code uses an interesting set of addressing
 | 
				
			||||||
 | 
					  rules to support a variable number of overflow pages while not having to
 | 
				
			||||||
 | 
					  move primary bucket pages around after they are created.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  Each row in the table indexed is represented by a single index tuple in
 | 
				
			||||||
 | 
					  the hash index. Hash index tuples are stored in bucket pages, and if
 | 
				
			||||||
 | 
					  they exist, overflow pages. We speed up searches by keeping the index entries
 | 
				
			||||||
 | 
					  in any one index page sorted by hash code, thus allowing binary search to be
 | 
				
			||||||
 | 
					  used within an index page. Note however that there is *no* assumption about
 | 
				
			||||||
 | 
					  the relative ordering of hash codes across different index pages of a bucket.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 <para>
 | 
				
			||||||
 | 
					  The bucket splitting algorithms to expand the hash index are too complex to
 | 
				
			||||||
 | 
					  be worthy of mention here, though are described in more detail in
 | 
				
			||||||
 | 
					  <filename>src/backend/access/hash/README</filename>.
 | 
				
			||||||
 | 
					  The split algorithm is crash safe and can be restarted if not completed
 | 
				
			||||||
 | 
					  successfully.
 | 
				
			||||||
 | 
					 </para>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</sect1>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</chapter>
 | 
				
			||||||
@@ -266,6 +266,7 @@ break is not needed in a wider output rendering.
 | 
				
			|||||||
  &spgist;
 | 
					  &spgist;
 | 
				
			||||||
  &gin;
 | 
					  &gin;
 | 
				
			||||||
  &brin;
 | 
					  &brin;
 | 
				
			||||||
 | 
					  &hash;
 | 
				
			||||||
  &storage;
 | 
					  &storage;
 | 
				
			||||||
  &bki;
 | 
					  &bki;
 | 
				
			||||||
  &planstats;
 | 
					  &planstats;
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user