diff --git a/ext/lsm1/Makefile b/ext/lsm1/Makefile new file mode 100644 index 0000000000..46ef9e2534 --- /dev/null +++ b/ext/lsm1/Makefile @@ -0,0 +1,33 @@ +#!/usr/bin/make +# +# This is a temporary makefile for use during experimental development. +# Replace with something more portable, if the experiments actually work out. +# +CC = gcc +CFLAGS =-g -fPIC -Wall -I. -I/home/drh/sqlite/bld + +LSMOBJ = \ + lsm_ckpt.o \ + lsm_file.o \ + lsm_log.o \ + lsm_main.o \ + lsm_mem.o \ + lsm_mutex.o \ + lsm_shared.o \ + lsm_sorted.o \ + lsm_str.o \ + lsm_tree.o \ + lsm_unix.o \ + lsm_varint.o + +LSMHDR = \ + lsm.h \ + lsmInt.h + +all: lsm.so + +lsm.so: $(LSMOBJ) + $(CC) $(CFLAGS) -shared -o lsm.so $(LSMOBJ) + +%.o: %.c $(LSMHDR) + $(CC) $(CFLAGS) -c $< diff --git a/ext/lsm1/lsm.h b/ext/lsm1/lsm.h new file mode 100644 index 0000000000..ee0f8f2ac7 --- /dev/null +++ b/ext/lsm1/lsm.h @@ -0,0 +1,684 @@ +/* +** 2011-08-10 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** This file defines the LSM API. +*/ +#ifndef _LSM_H +#define _LSM_H +#include +#ifdef __cplusplus +extern "C" { +#endif + +/* +** Opaque handle types. +*/ +typedef struct lsm_compress lsm_compress; /* Compression library functions */ +typedef struct lsm_compress_factory lsm_compress_factory; +typedef struct lsm_cursor lsm_cursor; /* Database cursor handle */ +typedef struct lsm_db lsm_db; /* Database connection handle */ +typedef struct lsm_env lsm_env; /* Runtime environment */ +typedef struct lsm_file lsm_file; /* OS file handle */ +typedef struct lsm_mutex lsm_mutex; /* Mutex handle */ + +/* 64-bit integer type used for file offsets. 
*/ +typedef long long int lsm_i64; /* 64-bit signed integer type */ + +/* Candidate values for the 3rd argument to lsm_env.xLock() */ +#define LSM_LOCK_UNLOCK 0 +#define LSM_LOCK_SHARED 1 +#define LSM_LOCK_EXCL 2 + +/* Flags for lsm_env.xOpen() */ +#define LSM_OPEN_READONLY 0x0001 + +/* +** CAPI: Database Runtime Environment +** +** Run-time environment used by LSM +*/ +struct lsm_env { + int nByte; /* Size of this structure in bytes */ + int iVersion; /* Version number of this structure (1) */ + /****** file i/o ***********************************************/ + void *pVfsCtx; + int (*xFullpath)(lsm_env*, const char *, char *, int *); + int (*xOpen)(lsm_env*, const char *, int flags, lsm_file **); + int (*xRead)(lsm_file *, lsm_i64, void *, int); + int (*xWrite)(lsm_file *, lsm_i64, void *, int); + int (*xTruncate)(lsm_file *, lsm_i64); + int (*xSync)(lsm_file *); + int (*xSectorSize)(lsm_file *); + int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*); + int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf); + int (*xClose)(lsm_file *); + int (*xUnlink)(lsm_env*, const char *); + int (*xLock)(lsm_file*, int, int); + int (*xTestLock)(lsm_file*, int, int, int); + int (*xShmMap)(lsm_file*, int, int, void **); + void (*xShmBarrier)(void); + int (*xShmUnmap)(lsm_file*, int); + /****** memory allocation ****************************************/ + void *pMemCtx; + void *(*xMalloc)(lsm_env*, size_t); /* malloc(3) function */ + void *(*xRealloc)(lsm_env*, void *, size_t); /* realloc(3) function */ + void (*xFree)(lsm_env*, void *); /* free(3) function */ + size_t (*xSize)(lsm_env*, void *); /* xSize function */ + /****** mutexes ****************************************************/ + void *pMutexCtx; + int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */ + int (*xMutexNew)(lsm_env*, lsm_mutex**); /* Get a new dynamic mutex */ + void (*xMutexDel)(lsm_mutex *); /* Delete an allocated mutex */ + void (*xMutexEnter)(lsm_mutex *); /* Grab a mutex */ + int 
(*xMutexTry)(lsm_mutex *); /* Attempt to obtain a mutex */ + void (*xMutexLeave)(lsm_mutex *); /* Leave a mutex */ + int (*xMutexHeld)(lsm_mutex *); /* Return true if mutex is held */ + int (*xMutexNotHeld)(lsm_mutex *); /* Return true if mutex not held */ + /****** other ****************************************************/ + int (*xSleep)(lsm_env*, int microseconds); + + /* New fields may be added in future releases, in which case the + ** iVersion value will increase. */ +}; + +/* +** Values that may be passed as the second argument to xMutexStatic. +*/ +#define LSM_MUTEX_GLOBAL 1 +#define LSM_MUTEX_HEAP 2 + +/* +** CAPI: LSM Error Codes +*/ +#define LSM_OK 0 +#define LSM_ERROR 1 +#define LSM_BUSY 5 +#define LSM_NOMEM 7 +#define LSM_READONLY 8 +#define LSM_IOERR 10 +#define LSM_CORRUPT 11 +#define LSM_FULL 13 +#define LSM_CANTOPEN 14 +#define LSM_PROTOCOL 15 +#define LSM_MISUSE 21 + +#define LSM_MISMATCH 50 + + +#define LSM_IOERR_NOENT (LSM_IOERR | (1<<8)) + +/* +** CAPI: Creating and Destroying Database Connection Handles +** +** Open and close a database connection handle. +*/ +int lsm_new(lsm_env*, lsm_db **ppDb); +int lsm_close(lsm_db *pDb); + +/* +** CAPI: Connecting to a Database +*/ +int lsm_open(lsm_db *pDb, const char *zFilename); + +/* +** CAPI: Obtaining pointers to database environments +** +** Return a pointer to the environment used by the database connection +** passed as the first argument. Assuming the argument is valid, this +** function always returns a valid environment pointer - it cannot fail. +*/ +lsm_env *lsm_get_env(lsm_db *pDb); + +/* +** The lsm_default_env() function returns a pointer to the default LSM +** environment for the current platform. +*/ +lsm_env *lsm_default_env(void); + + +/* +** CAPI: Configuring a database connection. +** +** The lsm_config() function is used to configure a database connection. +*/ +int lsm_config(lsm_db *, int, ...); + +/* +** The following values may be passed as the second argument to lsm_config(). 
+** +** LSM_CONFIG_AUTOFLUSH: +** A read/write integer parameter. +** +** This value determines the amount of data allowed to accumulate in a +** live in-memory tree before it is marked as old. After committing a +** transaction, a connection checks if the size of the live in-memory tree, +** including data structure overhead, is greater than the value of this +** option in KB. If it is, and there is not already an old in-memory tree, +** the live in-memory tree is marked as old. +** +** The maximum allowable value is 1048576 (1GB). There is no minimum +** value. If this parameter is set to zero, then an attempt is made to +** mark the live in-memory tree as old after each transaction is committed. +** +** The default value is 1024 (1MB). +** +** LSM_CONFIG_PAGE_SIZE: +** A read/write integer parameter. This parameter may only be set before +** lsm_open() has been called. +** +** LSM_CONFIG_BLOCK_SIZE: +** A read/write integer parameter. +** +** This parameter may only be set before lsm_open() has been called. It +** must be set to a power of two between 64 and 65536, inclusive (block +** sizes between 64KB and 64MB). +** +** If the connection creates a new database, the block size of the new +** database is set to the value of this option in KB. After lsm_open() +** has been called, querying this parameter returns the actual block +** size of the opened database. +** +** The default value is 1024 (1MB blocks). +** +** LSM_CONFIG_SAFETY: +** A read/write integer parameter. Valid values are 0, 1 (the default) +** and 2. This parameter determines how robust the database is in the +** face of a system crash (e.g. a power failure or operating system +** crash). As follows: +** +** 0 (off): No robustness. A system crash may corrupt the database. +** +** 1 (normal): Some robustness. A system crash may not corrupt the +** database file, but recently committed transactions may +** be lost following recovery. +** +** 2 (full): Full robustness. 
A system crash may not corrupt the +** database file. Following recovery the database file +** contains all successfully committed transactions. +** +** LSM_CONFIG_AUTOWORK: +** A read/write integer parameter. +** +** LSM_CONFIG_AUTOCHECKPOINT: +** A read/write integer parameter. +** +** If this option is set to non-zero value N, then a checkpoint is +** automatically attempted after each N KB of data have been written to +** the database file. +** +** The amount of uncheckpointed data already written to the database file +** is a global parameter. After performing database work (writing to the +** database file), the process checks if the total amount of uncheckpointed +** data exceeds the value of this parameter. If so, a checkpoint is performed. +** This means that this option may cause the connection to perform a +** checkpoint even if the current connection has itself written very little +** data into the database file. +** +** The default value is 2048 (checkpoint every 2MB). +** +** LSM_CONFIG_MMAP: +** A read/write integer parameter. If this value is set to 0, then the +** database file is accessed using ordinary read/write IO functions. Or, +** if it is set to 1, then the database file is memory mapped and accessed +** that way. If this parameter is set to any value N greater than 1, then +** up to the first N KB of the file are memory mapped, and any remainder +** accessed using read/write IO. +** +** The default value is 1 on 64-bit platforms and 32768 on 32-bit platforms. +** +** +** LSM_CONFIG_USE_LOG: +** A read/write boolean parameter. True (the default) to use the log +** file normally. False otherwise. +** +** LSM_CONFIG_AUTOMERGE: +** A read/write integer parameter. The minimum number of segments to +** merge together at a time. Default value 4. +** +** LSM_CONFIG_MAX_FREELIST: +** A read/write integer parameter. 
The maximum number of free-list +** entries that are stored in a database checkpoint (the others are +** stored elsewhere in the database). +** +** There is no reason for an application to configure or query this +** parameter. It is only present because configuring a small value +** makes certain parts of the lsm code easier to test. +** +** LSM_CONFIG_MULTIPLE_PROCESSES: +** A read/write boolean parameter. This parameter may only be set before +** lsm_open() has been called. If true, the library uses shared-memory +** and posix advisory locks to co-ordinate access by clients from within +** multiple processes. Otherwise, if false, all database clients must be +** located in the same process. The default value is true. +** +** LSM_CONFIG_SET_COMPRESSION: +** Set the compression methods used to compress and decompress database +** content. The argument to this option should be a pointer to a structure +** of type lsm_compress. The lsm_config() method takes a copy of the +** structures contents. +** +** This option may only be used before lsm_open() is called. Invoking it +** after lsm_open() has been called results in an LSM_MISUSE error. +** +** LSM_CONFIG_GET_COMPRESSION: +** Query the compression methods used to compress and decompress database +** content. +** +** LSM_CONFIG_SET_COMPRESSION_FACTORY: +** Configure a factory method to be invoked in case of an LSM_MISMATCH +** error. +** +** LSM_CONFIG_READONLY: +** A read/write boolean parameter. This parameter may only be set before +** lsm_open() is called. 
+*/ +#define LSM_CONFIG_AUTOFLUSH 1 +#define LSM_CONFIG_PAGE_SIZE 2 +#define LSM_CONFIG_SAFETY 3 +#define LSM_CONFIG_BLOCK_SIZE 4 +#define LSM_CONFIG_AUTOWORK 5 +#define LSM_CONFIG_MMAP 7 +#define LSM_CONFIG_USE_LOG 8 +#define LSM_CONFIG_AUTOMERGE 9 +#define LSM_CONFIG_MAX_FREELIST 10 +#define LSM_CONFIG_MULTIPLE_PROCESSES 11 +#define LSM_CONFIG_AUTOCHECKPOINT 12 +#define LSM_CONFIG_SET_COMPRESSION 13 +#define LSM_CONFIG_GET_COMPRESSION 14 +#define LSM_CONFIG_SET_COMPRESSION_FACTORY 15 +#define LSM_CONFIG_READONLY 16 + +#define LSM_SAFETY_OFF 0 +#define LSM_SAFETY_NORMAL 1 +#define LSM_SAFETY_FULL 2 + +/* +** CAPI: Compression and/or Encryption Hooks +*/ +struct lsm_compress { + void *pCtx; + unsigned int iId; + int (*xBound)(void *, int nSrc); + int (*xCompress)(void *, char *, int *, const char *, int); + int (*xUncompress)(void *, char *, int *, const char *, int); + void (*xFree)(void *pCtx); +}; + +struct lsm_compress_factory { + void *pCtx; + int (*xFactory)(void *, lsm_db *, unsigned int); + void (*xFree)(void *pCtx); +}; + +#define LSM_COMPRESSION_EMPTY 0 +#define LSM_COMPRESSION_NONE 1 + +/* +** CAPI: Allocating and Freeing Memory +** +** Invoke the memory allocation functions that belong to environment +** pEnv. Or the system defaults if no memory allocation functions have +** been registered. +*/ +void *lsm_malloc(lsm_env*, size_t); +void *lsm_realloc(lsm_env*, void *, size_t); +void lsm_free(lsm_env*, void *); + +/* +** CAPI: Querying a Connection For Operational Data +** +** Query a database connection for operational statistics or data. +*/ +int lsm_info(lsm_db *, int, ...); + +int lsm_get_user_version(lsm_db *, unsigned int *); +int lsm_set_user_version(lsm_db *, unsigned int); + +/* +** The following values may be passed as the second argument to lsm_info(). +** +** LSM_INFO_NWRITE: +** The third parameter should be of type (int *). 
The location pointed +** to by the third parameter is set to the number of 4KB pages written to +** the database file during the lifetime of this connection. +** +** LSM_INFO_NREAD: +** The third parameter should be of type (int *). The location pointed +** to by the third parameter is set to the number of 4KB pages read from +** the database file during the lifetime of this connection. +** +** LSM_INFO_DB_STRUCTURE: +** The third argument should be of type (char **). The location pointed +** to is populated with a pointer to a nul-terminated string containing +** the string representation of a Tcl data-structure reflecting the +** current structure of the database file. Specifically, the current state +** of the worker snapshot. The returned string should be eventually freed +** by the caller using lsm_free(). +** +** The returned list contains one element for each level in the database, +** in order from most to least recent. Each element contains a +** single element for each segment comprising the corresponding level, +** starting with the lhs segment, then each of the rhs segments (if any) +** in order from most to least recent. +** +** Each segment element is itself a list of 4 integer values, as follows: +** +**
  1. First page of segment +**
  2. Last page of segment +**
  3. Root page of segment (if applicable) +**
  4. Total number of pages in segment +**
+** +** LSM_INFO_ARRAY_STRUCTURE: +** There should be two arguments passed following this option (i.e. a +** total of four arguments passed to lsm_info()). The first argument +** should be the page number of the first page in a database array +** (perhaps obtained from an earlier INFO_DB_STRUCTURE call). The second +** trailing argument should be of type (char **). The location pointed +** to is populated with a pointer to a nul-terminated string that must +** be eventually freed using lsm_free() by the caller. +** +** The output string contains the text representation of a Tcl list of +** integers. Each pair of integers represent a range of pages used by +** the identified array. For example, if the array occupies database +** pages 993 to 1024, then pages 2048 to 2777, then the returned string +** will be "993 1024 2048 2777". +** +** If the specified integer argument does not correspond to the first +** page of any database array, LSM_ERROR is returned and the output +** pointer is set to a NULL value. +** +** LSM_INFO_LOG_STRUCTURE: +** The third argument should be of type (char **). The location pointed +** to is populated with a pointer to a nul-terminated string containing +** the string representation of a Tcl data-structure. The returned +** string should be eventually freed by the caller using lsm_free(). +** +** The Tcl structure returned is a list of six integers that describe +** the current structure of the log file. +** +** LSM_INFO_ARRAY_PAGES: +** +** LSM_INFO_PAGE_ASCII_DUMP: +** As with LSM_INFO_ARRAY_STRUCTURE, there should be two arguments passed +** with calls that specify this option - an integer page number and a +** (char **) used to return a nul-terminated string that must be later +** freed using lsm_free(). In this case the output string is populated +** with a human-readable description of the page content. +** +** If the page cannot be decoded, it is not an error. 
In this case the +** human-readable output message will report the systems failure to +** interpret the page data. +** +** LSM_INFO_PAGE_HEX_DUMP: +** This argument is similar to PAGE_ASCII_DUMP, except that keys and +** values are represented using hexadecimal notation instead of ascii. +** +** LSM_INFO_FREELIST: +** The third argument should be of type (char **). The location pointed +** to is populated with a pointer to a nul-terminated string containing +** the string representation of a Tcl data-structure. The returned +** string should be eventually freed by the caller using lsm_free(). +** +** The Tcl structure returned is a list containing one element for each +** free block in the database. The element itself consists of two +** integers - the block number and the id of the snapshot that freed it. +** +** LSM_INFO_CHECKPOINT_SIZE: +** The third argument should be of type (int *). The location pointed to +** by this argument is populated with the number of KB written to the +** database file since the most recent checkpoint. +** +** LSM_INFO_TREE_SIZE: +** If this value is passed as the second argument to an lsm_info() call, it +** should be followed by two arguments of type (int *) (for a total of four +** arguments). +** +** At any time, there are either one or two tree structures held in shared +** memory that new database clients will access (there may also be additional +** tree structures being used by older clients - this API does not provide +** information on them). One tree structure - the current tree - is used to +** accumulate new data written to the database. The other tree structure - +** the old tree - is a read-only tree holding older data and may be flushed +** to disk at any time. +** +** Assuming no error occurs, the location pointed to by the first of the two +** (int *) arguments is set to the size of the old in-memory tree in KB. +** The second is set to the size of the current, or live in-memory tree. 
+** +** LSM_INFO_COMPRESSION_ID: +** This value should be followed by a single argument of type +** (unsigned int *). If successful, the location pointed to is populated +** with the database compression id before returning. +*/ +#define LSM_INFO_NWRITE 1 +#define LSM_INFO_NREAD 2 +#define LSM_INFO_DB_STRUCTURE 3 +#define LSM_INFO_LOG_STRUCTURE 4 +#define LSM_INFO_ARRAY_STRUCTURE 5 +#define LSM_INFO_PAGE_ASCII_DUMP 6 +#define LSM_INFO_PAGE_HEX_DUMP 7 +#define LSM_INFO_FREELIST 8 +#define LSM_INFO_ARRAY_PAGES 9 +#define LSM_INFO_CHECKPOINT_SIZE 10 +#define LSM_INFO_TREE_SIZE 11 +#define LSM_INFO_FREELIST_SIZE 12 +#define LSM_INFO_COMPRESSION_ID 13 + + +/* +** CAPI: Opening and Closing Write Transactions +** +** These functions are used to open and close transactions and nested +** sub-transactions. +** +** The lsm_begin() function is used to open transactions and sub-transactions. +** A successful call to lsm_begin() ensures that there are at least iLevel +** nested transactions open. To open a top-level transaction, pass iLevel=1. +** To open a sub-transaction within the top-level transaction, iLevel=2. +** Passing iLevel=0 is a no-op. +** +** lsm_commit() is used to commit transactions and sub-transactions. A +** successful call to lsm_commit() ensures that there are at most iLevel +** nested transactions open. To commit a top-level transaction, pass iLevel=0. +** To commit all sub-transactions inside the main transaction, pass iLevel=1. +** +** Function lsm_rollback() is used to roll back transactions and +** sub-transactions. A successful call to lsm_rollback() restores the database +** to the state it was in when the iLevel'th nested sub-transaction (if any) +** was first opened. And then closes transactions to ensure that there are +** at most iLevel nested transactions open. Passing iLevel=0 rolls back and +** closes the top-level transaction. iLevel=1 also rolls back the top-level +** transaction, but leaves it open. 
iLevel=2 rolls back the sub-transaction +** nested directly inside the top-level transaction (and leaves it open). +*/ +int lsm_begin(lsm_db *pDb, int iLevel); +int lsm_commit(lsm_db *pDb, int iLevel); +int lsm_rollback(lsm_db *pDb, int iLevel); + +/* +** CAPI: Writing to a Database +** +** Write a new value into the database. If a value with a duplicate key +** already exists it is replaced. +*/ +int lsm_insert(lsm_db*, const void *pKey, int nKey, const void *pVal, int nVal); + +/* +** Delete a value from the database. No error is returned if the specified +** key value does not exist in the database. +*/ +int lsm_delete(lsm_db *, const void *pKey, int nKey); + +/* +** Delete all database entries with keys that are greater than (pKey1/nKey1) +** and smaller than (pKey2/nKey2). Note that keys (pKey1/nKey1) and +** (pKey2/nKey2) themselves, if they exist in the database, are not deleted. +** +** Return LSM_OK if successful, or an LSM error code otherwise. +*/ +int lsm_delete_range(lsm_db *, + const void *pKey1, int nKey1, const void *pKey2, int nKey2 +); + +/* +** CAPI: Explicit Database Work and Checkpointing +** +** This function is called by a thread to work on the database structure. +*/ +int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite); + +int lsm_flush(lsm_db *pDb); + +/* +** Attempt to checkpoint the current database snapshot. Return an LSM +** error code if an error occurs or LSM_OK otherwise. +** +** If the current snapshot has already been checkpointed, calling this +** function is a no-op. In this case if pnKB is not NULL, *pnKB is +** set to 0. Or, if the current snapshot is successfully checkpointed +** by this function and pnKB is not NULL, *pnKB is set to the number +** of KB written to the database file since the previous checkpoint +** (the same measure as returned by the LSM_INFO_CHECKPOINT_SIZE query). 
+*/ +int lsm_checkpoint(lsm_db *pDb, int *pnKB); + +/* +** CAPI: Opening and Closing Database Cursors +** +** Open and close a database cursor. +*/ +int lsm_csr_open(lsm_db *pDb, lsm_cursor **ppCsr); +int lsm_csr_close(lsm_cursor *pCsr); + +/* +** CAPI: Positioning Database Cursors +** +** If the fourth parameter is LSM_SEEK_EQ, LSM_SEEK_GE or LSM_SEEK_LE, +** this function searches the database for an entry with key (pKey/nKey). +** If an error occurs, an LSM error code is returned. Otherwise, LSM_OK. +** +** If no error occurs and the requested key is present in the database, the +** cursor is left pointing to the entry with the specified key. Or, if the +** specified key is not present in the database the state of the cursor +** depends on the value passed as the final parameter, as follows: +** +** LSM_SEEK_EQ: +** The cursor is left at EOF (invalidated). A call to lsm_csr_valid() +** returns zero (false). +** +** LSM_SEEK_LE: +** The cursor is left pointing to the largest key in the database that +** is smaller than (pKey/nKey). If the database contains no keys smaller +** than (pKey/nKey), the cursor is left at EOF. +** +** LSM_SEEK_GE: +** The cursor is left pointing to the smallest key in the database that +** is larger than (pKey/nKey). If the database contains no keys larger +** than (pKey/nKey), the cursor is left at EOF. +** +** If the fourth parameter is LSM_SEEK_LEFAST, this function searches the +** database in a similar manner to LSM_SEEK_LE, with two differences: +** +**
  1. Even if a key can be found (the cursor is not left at EOF), the +** lsm_csr_value() function may not be used (attempts to do so return +** LSM_MISUSE). +** +**
  2. The key that the cursor is left pointing to may be one that has +** been recently deleted from the database. In this case it is +** guaranteed that the returned key is larger than any key currently +** in the database that is less than or equal to (pKey/nKey). +**
+** +** LSM_SEEK_LEFAST requests are intended to be used to allocate database +** keys. +*/ +int lsm_csr_seek(lsm_cursor *pCsr, const void *pKey, int nKey, int eSeek); + +int lsm_csr_first(lsm_cursor *pCsr); +int lsm_csr_last(lsm_cursor *pCsr); + +/* +** Advance the specified cursor to the next or previous key in the database. +** Return LSM_OK if successful, or an LSM error code otherwise. +** +** Functions lsm_csr_seek(), lsm_csr_first() and lsm_csr_last() are "seek" +** functions. Whether or not lsm_csr_next and lsm_csr_prev may be called +** successfully also depends on the most recent seek function called on +** the cursor. Specifically: +** +** +** +** Otherwise, if the above conditions are not met when lsm_csr_next or +** lsm_csr_prev is called, LSM_MISUSE is returned and the cursor position +** remains unchanged. +*/ +int lsm_csr_next(lsm_cursor *pCsr); +int lsm_csr_prev(lsm_cursor *pCsr); + +/* +** Values that may be passed as the fourth argument to lsm_csr_seek(). +*/ +#define LSM_SEEK_LEFAST -2 +#define LSM_SEEK_LE -1 +#define LSM_SEEK_EQ 0 +#define LSM_SEEK_GE 1 + +/* +** CAPI: Extracting Data From Database Cursors +** +** Retrieve data from a database cursor. +*/ +int lsm_csr_valid(lsm_cursor *pCsr); +int lsm_csr_key(lsm_cursor *pCsr, const void **ppKey, int *pnKey); +int lsm_csr_value(lsm_cursor *pCsr, const void **ppVal, int *pnVal); + +/* +** If no error occurs, this function compares the database key passed via +** the pKey/nKey arguments with the key that the cursor passed as the first +** argument currently points to. If the cursors key is less than, equal to +** or greater than pKey/nKey, *piRes is set to less than, equal to or greater +** than zero before returning. LSM_OK is returned in this case. +** +** Or, if an error occurs, an LSM error code is returned and the final +** value of *piRes is undefined. If the cursor does not point to a valid +** key when this function is called, LSM_MISUSE is returned. 
+*/ +int lsm_csr_cmp(lsm_cursor *pCsr, const void *pKey, int nKey, int *piRes); + +/* +** CAPI: Change these!! +** +** Configure a callback to which debugging and other messages should +** be directed. Only useful for debugging lsm. +*/ +void lsm_config_log(lsm_db *, void (*)(void *, int, const char *), void *); + +/* +** Configure a callback that is invoked if the database connection ever +** writes to the database file. +*/ +void lsm_config_work_hook(lsm_db *, void (*)(lsm_db *, void *), void *); + +/* ENDOFAPI */ +#ifdef __cplusplus +} /* End of the 'extern "C"' block */ +#endif +#endif /* ifndef _LSM_H */ diff --git a/ext/lsm1/lsmInt.h b/ext/lsm1/lsmInt.h new file mode 100644 index 0000000000..ea44dd40fc --- /dev/null +++ b/ext/lsm1/lsmInt.h @@ -0,0 +1,974 @@ +/* +** 2011-08-18 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** Internal structure definitions for the LSM module. +*/ +#ifndef _LSM_INT_H +#define _LSM_INT_H + +#include "lsm.h" +#include +#include + +#include +#include +#include +#include + +#include + +#ifdef NDEBUG +# ifdef LSM_DEBUG_EXPENSIVE +# undef LSM_DEBUG_EXPENSIVE +# endif +# ifdef LSM_DEBUG +# undef LSM_DEBUG +# endif +#else +# ifndef LSM_DEBUG +# define LSM_DEBUG +# endif +#endif + +/* +** Default values for various data structure parameters. These may be +** overridden by calls to lsm_config(). 
+*/ +#define LSM_DFLT_PAGE_SIZE (4 * 1024) +#define LSM_DFLT_BLOCK_SIZE (1 * 1024 * 1024) +#define LSM_DFLT_AUTOFLUSH (1 * 1024 * 1024) +#define LSM_DFLT_AUTOCHECKPOINT (i64)(2 * 1024 * 1024) +#define LSM_DFLT_AUTOWORK 1 +#define LSM_DFLT_LOG_SIZE (128*1024) +#define LSM_DFLT_AUTOMERGE 4 +#define LSM_DFLT_SAFETY LSM_SAFETY_NORMAL +#define LSM_DFLT_MMAP (LSM_IS_64_BIT ? 1 : 32768) +#define LSM_DFLT_MULTIPLE_PROCESSES 1 +#define LSM_DFLT_USE_LOG 1 + +/* Initial values for log file checksums. These are only used if the +** database file does not contain a valid checkpoint. */ +#define LSM_CKSUM0_INIT 42 +#define LSM_CKSUM1_INIT 42 + +#define LSM_META_PAGE_SIZE 4096 + +/* "mmap" mode is currently only used in environments with 64-bit address +** spaces. The following macro is used to test for this. */ +#define LSM_IS_64_BIT (sizeof(void*)==8) + +#define LSM_AUTOWORK_QUANT 32 + +typedef struct Database Database; +typedef struct DbLog DbLog; +typedef struct FileSystem FileSystem; +typedef struct Freelist Freelist; +typedef struct FreelistEntry FreelistEntry; +typedef struct Level Level; +typedef struct LogMark LogMark; +typedef struct LogRegion LogRegion; +typedef struct LogWriter LogWriter; +typedef struct LsmString LsmString; +typedef struct Mempool Mempool; +typedef struct Merge Merge; +typedef struct MergeInput MergeInput; +typedef struct MetaPage MetaPage; +typedef struct MultiCursor MultiCursor; +typedef struct Page Page; +typedef struct Redirect Redirect; +typedef struct Segment Segment; +typedef struct SegmentMerger SegmentMerger; +typedef struct ShmChunk ShmChunk; +typedef struct ShmHeader ShmHeader; +typedef struct ShmReader ShmReader; +typedef struct Snapshot Snapshot; +typedef struct TransMark TransMark; +typedef struct Tree Tree; +typedef struct TreeCursor TreeCursor; +typedef struct TreeHeader TreeHeader; +typedef struct TreeMark TreeMark; +typedef struct TreeRoot TreeRoot; + +#ifndef _SQLITEINT_H_ +typedef unsigned char u8; +typedef unsigned short int u16; 
+typedef unsigned int u32; +typedef lsm_i64 i64; +typedef unsigned long long int u64; +#endif + +/* A page number is a 64-bit integer. */ +typedef i64 Pgno; + +#ifdef LSM_DEBUG +int lsmErrorBkpt(int); +#else +# define lsmErrorBkpt(x) (x) +#endif + +#define LSM_PROTOCOL_BKPT lsmErrorBkpt(LSM_PROTOCOL) +#define LSM_IOERR_BKPT lsmErrorBkpt(LSM_IOERR) +#define LSM_NOMEM_BKPT lsmErrorBkpt(LSM_NOMEM) +#define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT) +#define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE) + +#define unused_parameter(x) (void)(x) +#define array_size(x) (sizeof(x)/sizeof(x[0])) + + +/* The size of each shared-memory chunk */ +#define LSM_SHM_CHUNK_SIZE (32*1024) + +/* The number of bytes reserved at the start of each shm chunk for MM. */ +#define LSM_SHM_CHUNK_HDR (sizeof(ShmChunk)) + +/* The number of available read locks. */ +#define LSM_LOCK_NREADER 6 + +/* The number of available read-write client locks. */ +#define LSM_LOCK_NRWCLIENT 16 + +/* Lock definitions. +*/ +#define LSM_LOCK_DMS1 1 /* Serialize connect/disconnect ops */ +#define LSM_LOCK_DMS2 2 /* Read-write connections */ +#define LSM_LOCK_DMS3 3 /* Read-only connections */ +#define LSM_LOCK_WRITER 4 +#define LSM_LOCK_WORKER 5 +#define LSM_LOCK_CHECKPOINTER 6 +#define LSM_LOCK_ROTRANS 7 +#define LSM_LOCK_READER(i) ((i) + LSM_LOCK_ROTRANS + 1) +#define LSM_LOCK_RWCLIENT(i) ((i) + LSM_LOCK_READER(LSM_LOCK_NREADER)) + +/* +** Hard limit on the number of free-list entries that may be stored in +** a checkpoint (the remainder are stored as a system record in the LSM). +** See also LSM_CONFIG_MAX_FREELIST. +*/ +#define LSM_MAX_FREELIST_ENTRIES 24 + +#define LSM_MAX_BLOCK_REDIRECTS 16 + +#define LSM_ATTEMPTS_BEFORE_PROTOCOL 10000 + + +/* +** Each entry stored in the LSM (or in-memory tree structure) has an +** associated mask of the following flags. 
+*/ +#define LSM_START_DELETE 0x01 /* Start of open-ended delete range */ +#define LSM_END_DELETE 0x02 /* End of open-ended delete range */ +#define LSM_POINT_DELETE 0x04 /* Delete this key */ +#define LSM_INSERT 0x08 /* Insert this key and value */ +#define LSM_SEPARATOR 0x10 /* True if entry is separator key only */ +#define LSM_SYSTEMKEY 0x20 /* True if entry is a system key (FREELIST) */ + +#define LSM_CONTIGUOUS 0x40 /* Used in lsm_tree.c */ + +/* +** A string that can grow by appending. +*/ +struct LsmString { + lsm_env *pEnv; /* Run-time environment */ + int n; /* Size of string. -1 indicates error */ + int nAlloc; /* Space allocated for z[] */ + char *z; /* The string content */ +}; + +typedef struct LsmFile LsmFile; +struct LsmFile { + lsm_file *pFile; + LsmFile *pNext; +}; + +/* +** An instance of the following type is used to store an ordered list of +** u32 values. +** +** Note: This is a place-holder implementation. It should be replaced by +** a version that avoids making a single large allocation when the array +** contains a large number of values. For this reason, the internals of +** this object should only be manipulated by the intArrayXXX() functions in +** lsm_tree.c. +*/ +typedef struct IntArray IntArray; +struct IntArray { + int nAlloc; + int nArray; + u32 *aArray; +}; + +struct Redirect { + int n; /* Number of redirects */ + struct RedirectEntry { + int iFrom; + int iTo; + } *a; +}; + +/* +** An instance of this structure represents a point in the history of the +** tree structure to roll back to. Refer to comments in lsm_tree.c for +** details. 
+*/ +struct TreeMark { + u32 iRoot; /* Offset of root node in shm file */ + u32 nHeight; /* Current height of tree structure */ + u32 iWrite; /* Write offset in shm file */ + u32 nChunk; /* Number of chunks in shared-memory file */ + u32 iFirst; /* First chunk in linked list */ + u32 iNextShmid; /* Next id to allocate */ + int iRollback; /* Index in lsm->rollback to revert to */ +}; + +/* +** An instance of this structure represents a point in the database log. +*/ +struct LogMark { + i64 iOff; /* Offset into log (see lsm_log.c) */ + int nBuf; /* Size of in-memory buffer here */ + u8 aBuf[8]; /* Bytes of content in aBuf[] */ + u32 cksum0; /* Checksum 0 at offset (iOff-nBuf) */ + u32 cksum1; /* Checksum 1 at offset (iOff-nBuf) */ +}; + +struct TransMark { + TreeMark tree; /* Point in the tree history to roll back to */ + LogMark log; /* Matching point in the database log */ +}; + +/* +** A structure that defines the start and end offsets of a region in the +** log file. The size of the region in bytes is (iEnd - iStart), so if +** iEnd==iStart the region is zero bytes in size. +*/ +struct LogRegion { + i64 iStart; /* Start of region in log file */ + i64 iEnd; /* End of region in log file */ +}; + +struct DbLog { + u32 cksum0; /* Checksum 0 at offset iOff */ + u32 cksum1; /* Checksum 1 at offset iOff */ + i64 iSnapshotId; /* Log space has been reclaimed to this ss */ + LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */ +}; + +struct TreeRoot { + u32 iRoot; /* Root of the tree */ + u32 nHeight; /* Height of the tree */ + u32 nByte; /* Total size of this tree in bytes */ + u32 iTransId; +}; + +/* +** Tree header structure. 
+*/ +struct TreeHeader { + u32 iUsedShmid; /* Id of first shm chunk used by this tree */ + u32 iNextShmid; /* Shm-id of next chunk allocated */ + u32 iFirst; /* Chunk number of smallest shm-id */ + u32 nChunk; /* Number of chunks in shared-memory file */ + TreeRoot root; /* Root and height of current tree */ + u32 iWrite; /* Write offset in shm file */ + TreeRoot oldroot; /* Root and height of the previous tree */ + u32 iOldShmid; /* Last shm-id used by previous tree */ + u32 iUsrVersion; /* get/set_user_version() value */ + i64 iOldLog; /* Log offset associated with old tree */ + u32 oldcksum0; + u32 oldcksum1; + DbLog log; /* Current layout of log file */ + u32 aCksum[2]; /* Checksums 1 and 2. */ +}; + +/* +** Database handle structure. +** +** mLock: +** A bitmask representing the locks currently held by the connection. +** An LSM database supports N distinct locks, where N is some number less +** than or equal to 32. Locks are numbered starting from 1 (see the +** definitions for LSM_LOCK_WRITER and co.). +** +** The least significant 32-bits in mLock represent EXCLUSIVE locks. The +** most significant 32-bits are SHARED locks. So, if a connection holds a +** SHARED lock on lock region iLock, then the following is true: +** +** (mLock & ((u64)1 << (iLock+32-1))) +** +** Or for an EXCLUSIVE lock: +** +** (mLock & ((u64)1 << (iLock-1))) +** +** pCsr: +** Points to the head of a linked list that contains all currently open +** cursors. Once this list becomes empty, the user has no outstanding +** cursors and the database handle can be successfully closed. +** +** pCsrCache: +** This list contains cursor objects that have been closed using +** lsm_csr_close(). Each time a cursor is closed, it is shifted from +** the pCsr list to this list. When a new cursor is opened, this list +** is inspected to see if there exists a cursor object that can be +** reused. This is an optimization only. 
+*/ +struct lsm_db { + + /* Database handle configuration */ + lsm_env *pEnv; /* runtime environment */ + int (*xCmp)(void *, int, void *, int); /* Compare function */ + + /* Values configured by calls to lsm_config */ + int eSafety; /* LSM_SAFETY_OFF, NORMAL or FULL */ + int bAutowork; /* Configured by LSM_CONFIG_AUTOWORK */ + int nTreeLimit; /* Configured by LSM_CONFIG_AUTOFLUSH */ + int nMerge; /* Configured by LSM_CONFIG_AUTOMERGE */ + int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ + int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ + int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ + int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ + int iMmap; /* Configured by LSM_CONFIG_MMAP */ + i64 nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */ + int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ + int bReadonly; /* Configured by LSM_CONFIG_READONLY */ + lsm_compress compress; /* Compression callbacks */ + lsm_compress_factory factory; /* Compression callback factory */ + + /* Sub-system handles */ + FileSystem *pFS; /* On-disk portion of database */ + Database *pDatabase; /* Database shared data */ + + int iRwclient; /* Read-write client lock held (-1 == none) */ + + /* Client transaction context */ + Snapshot *pClient; /* Client snapshot */ + int iReader; /* Read lock held (-1 == unlocked) */ + int bRoTrans; /* True if a read-only db trans is open */ + MultiCursor *pCsr; /* List of all open cursors */ + LogWriter *pLogWriter; /* Context for writing to the log file */ + int nTransOpen; /* Number of opened write transactions */ + int nTransAlloc; /* Allocated size of aTrans[] array */ + TransMark *aTrans; /* Array of marks for transaction rollback */ + IntArray rollback; /* List of tree-nodes to roll back */ + int bDiscardOld; /* True if lsmTreeDiscardOld() was called */ + + MultiCursor *pCsrCache; /* List of all closed cursors */ + + /* Worker context */ + Snapshot *pWorker; /* Worker snapshot (or NULL) */ + Freelist 
*pFreelist; /* See sortedNewToplevel() */ + int bUseFreelist; /* True to use pFreelist */ + int bIncrMerge; /* True if currently doing a merge */ + + int bInFactory; /* True if within factory.xFactory() */ + + /* Debugging message callback */ + void (*xLog)(void *, int, const char *); + void *pLogCtx; + + /* Work done notification callback */ + void (*xWork)(lsm_db *, void *); + void *pWorkCtx; + + u64 mLock; /* Mask of current locks. See lsmShmLock(). */ + lsm_db *pNext; /* Next connection to same database */ + + int nShm; /* Size of apShm[] array */ + void **apShm; /* Shared memory chunks */ + ShmHeader *pShmhdr; /* Live shared-memory header */ + TreeHeader treehdr; /* Local copy of tree-header */ + u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; +}; + +struct Segment { + Pgno iFirst; /* First page of this run */ + Pgno iLastPg; /* Last page of this run */ + Pgno iRoot; /* Root page number (if any) */ + int nSize; /* Size of this run in pages */ + + Redirect *pRedirect; /* Block redirects (or NULL) */ +}; + +/* +** iSplitTopic/pSplitKey/nSplitKey: +** If nRight>0, this buffer contains a copy of the largest key that has +** already been written to the left-hand-side of the level. +*/ +struct Level { + Segment lhs; /* Left-hand (main) segment */ + int nRight; /* Size of aRhs[] array */ + Segment *aRhs; /* Old segments being merged into this */ + int iSplitTopic; /* Split key topic (if nRight>0) */ + void *pSplitKey; /* Pointer to split-key (if nRight>0) */ + int nSplitKey; /* Number of bytes in split-key */ + + u16 iAge; /* Number of times data has been written */ + u16 flags; /* Mask of LEVEL_XXX bits */ + Merge *pMerge; /* Merge operation currently underway */ + Level *pNext; /* Next level in tree */ +}; + +/* +** The Level.flags field is set to a combination of the following bits. +** +** LEVEL_FREELIST_ONLY: +** Set if the level consists entirely of free-list entries. +** +** LEVEL_INCOMPLETE: +** This is set while a new toplevel level is being constructed. 
It is +** never set for any level other than a new toplevel. +*/ +#define LEVEL_FREELIST_ONLY 0x0001 +#define LEVEL_INCOMPLETE 0x0002 + + +/* +** A structure describing an ongoing merge. There is an instance of this +** structure for every Level currently undergoing a merge in the worker +** snapshot. +** +** It is assumed that code that uses an instance of this structure has +** access to the associated Level struct. +** +** iOutputOff: +** The byte offset to write to next within the last page of the +** output segment. +*/ +struct MergeInput { + Pgno iPg; /* Page on which next input is stored */ + int iCell; /* Cell containing next input to merge */ +}; +struct Merge { + int nInput; /* Number of input runs being merged */ + MergeInput *aInput; /* Array nInput entries in size */ + MergeInput splitkey; /* Location in file of current splitkey */ + int nSkip; /* Number of separator entries to skip */ + int iOutputOff; /* Write offset on output page */ + Pgno iCurrentPtr; /* Current pointer value */ +}; + +/* +** The first argument to this macro is a pointer to a Segment structure. +** Returns true if the structure instance indicates that the separators +** array is valid. +** +** NOTE(review): struct Segment as declared in this header has no "sep" +** member, so this macro would not compile if expanded - it looks like a +** leftover from an earlier Segment layout. Confirm it is unused. +*/ +#define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0) + +/* +** The values that accompany the lock held by a database reader. +*/ +struct ShmReader { + u32 iTreeId; + i64 iLsmId; +}; + +/* +** An instance of this structure is stored in the first shared-memory +** page. The shared-memory header. +** +** bWriter: +** Immediately after opening a write transaction taking the WRITER lock, +** each writer client sets this flag. It is cleared right before the +** WRITER lock is relinquished. If a subsequent writer finds that this +** flag is already set when a write transaction is opened, this indicates +** that a previous writer failed mid-transaction. +** +** iMetaPage: +** If the database file does not contain a valid, synced, checkpoint, this +** value is set to 0. 
Otherwise, it is set to the meta-page number that +** contains the most recently written checkpoint (either 1 or 2). +** +** hdr1, hdr2: +** The two copies of the in-memory tree header. Two copies are required +** in case a writer fails while updating one of them. +*/ +struct ShmHeader { + u32 aSnap1[LSM_META_PAGE_SIZE / 4]; + u32 aSnap2[LSM_META_PAGE_SIZE / 4]; + u32 bWriter; + u32 iMetaPage; + TreeHeader hdr1; + TreeHeader hdr2; + ShmReader aReader[LSM_LOCK_NREADER]; /* One slot per available read lock */ +}; + +/* +** An instance of this structure is stored at the start of each shared-memory +** chunk except the first (which is the header chunk - see above). +*/ +struct ShmChunk { + u32 iShmid; + u32 iNext; +}; + +/* +** Maximum number of shared-memory chunks allowed in the *-shm file. Since +** each shared-memory chunk is 32KB in size, this is a theoretical limit only. +*/ +#define LSM_MAX_SHMCHUNKS (1<<30) + +/* Return true if shm-sequence "a" is larger than or equal to "b". The test +** is ((a-b) mod 2^32) < LSM_MAX_SHMCHUNKS, computed in u32 arithmetic. The +** arguments are not parenthesized in the expansion, so pass only simple +** expressions (identifiers, member accesses). +*/ +#define shm_sequence_ge(a, b) (((u32)a-(u32)b) < LSM_MAX_SHMCHUNKS) + +#define LSM_APPLIST_SZ 4 /* Entries in the append point list */ + +/* +** An instance of the following structure stores the in-memory part of +** the current free block list. This structure is to the free block list +** as the in-memory tree is to the users database content. The contents +** of the free block list is found by merging the in-memory components +** with those stored in the LSM, just as the contents of the database is +** found by merging the in-memory tree with the user data entries in the +** LSM. +** +** Each FreelistEntry structure in the array represents either an insert +** or delete operation on the free-list. For deletes, the FreelistEntry.iId +** field is set to -1. For inserts, it is set to zero or greater. +** +** The array of FreelistEntry structures is always sorted in order of +** block number (ascending). +** +** When the in-memory free block list is written into the LSM, each insert +** operation is written separately. 
The entry key is the bitwise inverse +** of the block number as a 32-bit big-endian integer. This is done so that +** the entries in the LSM are sorted in descending order of block id. +** The associated value is the snapshot id, formatted as a varint. +*/ +struct Freelist { + FreelistEntry *aEntry; /* Free list entries */ + int nEntry; /* Number of valid slots in aEntry[] */ + int nAlloc; /* Allocated size of aEntry[] */ +}; +struct FreelistEntry { + u32 iBlk; /* Block number */ + i64 iId; /* Largest snapshot id to use this block */ +}; + +/* +** A snapshot of a database. A snapshot contains all the information required +** to read or write a database file on disk. See the description of struct +** Database below for further details. +*/ +struct Snapshot { + Database *pDatabase; /* Database this snapshot belongs to */ + u32 iCmpId; /* Id of compression scheme */ + Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ + i64 iId; /* Snapshot id */ + i64 iLogOff; /* Log file offset */ + Redirect redirect; /* Block redirection array */ + + /* Used by worker snapshots only */ + int nBlock; /* Number of blocks in database file */ + Pgno aiAppend[LSM_APPLIST_SZ]; /* Append point list */ + Freelist freelist; /* Free block list */ + u32 nWrite; /* Total number of pages written to disk */ +}; +#define LSM_INITIAL_SNAPSHOT_ID 11 + +/* +** Functions from file "lsm_ckpt.c". 
+*/ +int lsmCheckpointWrite(lsm_db *, int, u32 *); +int lsmCheckpointLevels(lsm_db *, int, void **, int *); +int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal); + +int lsmCheckpointRecover(lsm_db *); +int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **); + +int lsmCheckpointLoadWorker(lsm_db *pDb); +int lsmCheckpointStore(lsm_db *pDb, int); + +int lsmCheckpointLoad(lsm_db *pDb, int *); +int lsmCheckpointLoadOk(lsm_db *pDb, int); +int lsmCheckpointClientCacheOk(lsm_db *); + +u32 lsmCheckpointNBlock(u32 *); +i64 lsmCheckpointId(u32 *, int); +u32 lsmCheckpointNWrite(u32 *, int); +i64 lsmCheckpointLogOffset(u32 *); +int lsmCheckpointPgsz(u32 *); +int lsmCheckpointBlksz(u32 *); +void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog); +void lsmCheckpointZeroLogoffset(lsm_db *); + +int lsmCheckpointSaveWorker(lsm_db *pDb, int); +int lsmDatabaseFull(lsm_db *pDb); +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite); + +int lsmCheckpointSize(lsm_db *db, int *pnByte); + +int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId); + +/* +** Functions from file "lsm_tree.c". 
+*/ +int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree); +void lsmTreeRelease(lsm_env *, Tree *); +int lsmTreeInit(lsm_db *); +int lsmTreeRepair(lsm_db *); + +void lsmTreeMakeOld(lsm_db *pDb); +void lsmTreeDiscardOld(lsm_db *pDb); +int lsmTreeHasOld(lsm_db *pDb); + +int lsmTreeSize(lsm_db *); +int lsmTreeEndTransaction(lsm_db *pDb, int bCommit); +int lsmTreeLoadHeader(lsm_db *pDb, int *); +int lsmTreeLoadHeaderOk(lsm_db *, int); + +int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal); +int lsmTreeDelete(lsm_db *db, void *pKey1, int nKey1, void *pKey2, int nKey2); +void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark); +void lsmTreeMark(lsm_db *pDb, TreeMark *pMark); + +int lsmTreeCursorNew(lsm_db *pDb, int, TreeCursor **); +void lsmTreeCursorDestroy(TreeCursor *); + +int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes); +int lsmTreeCursorNext(TreeCursor *pCsr); +int lsmTreeCursorPrev(TreeCursor *pCsr); +int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast); +void lsmTreeCursorReset(TreeCursor *pCsr); +int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey); +int lsmTreeCursorFlags(TreeCursor *pCsr); +int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal); +int lsmTreeCursorValid(TreeCursor *pCsr); +int lsmTreeCursorSave(TreeCursor *pCsr); + +void lsmFlagsToString(int flags, char *zFlags); + +/* +** Functions from file "mem.c". +*/ +void *lsmMalloc(lsm_env*, size_t); +void lsmFree(lsm_env*, void *); +void *lsmRealloc(lsm_env*, void *, size_t); +void *lsmReallocOrFree(lsm_env*, void *, size_t); +void *lsmReallocOrFreeRc(lsm_env *, void *, size_t, int *); + +void *lsmMallocZeroRc(lsm_env*, size_t, int *); +void *lsmMallocRc(lsm_env*, size_t, int *); + +void *lsmMallocZero(lsm_env *pEnv, size_t); +char *lsmMallocStrdup(lsm_env *pEnv, const char *); + +/* +** Functions from file "lsm_mutex.c". 
+*/ +int lsmMutexStatic(lsm_env*, int, lsm_mutex **); +int lsmMutexNew(lsm_env*, lsm_mutex **); +void lsmMutexDel(lsm_env*, lsm_mutex *); +void lsmMutexEnter(lsm_env*, lsm_mutex *); +int lsmMutexTry(lsm_env*, lsm_mutex *); +void lsmMutexLeave(lsm_env*, lsm_mutex *); + +#ifndef NDEBUG +int lsmMutexHeld(lsm_env *, lsm_mutex *); +int lsmMutexNotHeld(lsm_env *, lsm_mutex *); +#endif + +/************************************************************************** +** Start of functions from "lsm_file.c". +*/ +int lsmFsOpen(lsm_db *, const char *, int); +int lsmFsOpenLog(lsm_db *, int *); +void lsmFsCloseLog(lsm_db *); +void lsmFsClose(FileSystem *); + +int lsmFsConfigure(lsm_db *db); + +int lsmFsBlockSize(FileSystem *); +void lsmFsSetBlockSize(FileSystem *, int); +int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom); + +int lsmFsPageSize(FileSystem *); +void lsmFsSetPageSize(FileSystem *, int); + +int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId); + +/* Creating, populating, gobbling and deleting sorted runs. */ +void lsmFsGobble(lsm_db *, Segment *, Pgno *, int); +int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *); +int lsmFsSortedFinish(FileSystem *, Segment *); +int lsmFsSortedAppend(FileSystem *, Snapshot *, Level *, int, Page **); +int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *); + +/* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */ +lsm_env *lsmFsEnv(FileSystem *); +lsm_env *lsmPageEnv(Page *); +FileSystem *lsmPageFS(Page *); + +int lsmFsSectorSize(FileSystem *); + +void lsmSortedSplitkey(lsm_db *, Level *, int *); + +/* Reading sorted run content. 
*/ +int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg); +int lsmFsDbPageGet(FileSystem *, Segment *, Pgno, Page **); +int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **); + +u8 *lsmFsPageData(Page *, int *); +int lsmFsPageRelease(Page *); +int lsmFsPagePersist(Page *); +void lsmFsPageRef(Page *); +Pgno lsmFsPageNumber(Page *); + +int lsmFsNRead(FileSystem *); +int lsmFsNWrite(FileSystem *); + +int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **); +int lsmFsMetaPageRelease(MetaPage *); +u8 *lsmFsMetaPageData(MetaPage *, int *); + +#ifdef LSM_DEBUG +int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg); +int lsmFsIntegrityCheck(lsm_db *); +#endif + +Pgno lsmFsRedirectPage(FileSystem *, Redirect *, Pgno); + +int lsmFsPageWritable(Page *); + +/* Functions to read, write and sync the log file. */ +int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr); +int lsmFsSyncLog(FileSystem *pFS); +int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr); +int lsmFsTruncateLog(FileSystem *pFS, i64 nByte); +int lsmFsTruncateDb(FileSystem *pFS, i64 nByte); +int lsmFsCloseAndDeleteLog(FileSystem *pFS); + +LsmFile *lsmFsDeferClose(FileSystem *pFS); + +/* And to sync the db file */ +int lsmFsSyncDb(FileSystem *, int); + +void lsmFsFlushWaiting(FileSystem *, int *); + +/* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */ +int lsmInfoArrayStructure(lsm_db *pDb, int bBlock, Pgno iFirst, char **pzOut); +int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut); +int lsmConfigMmap(lsm_db *pDb, int *piParam); + +int lsmEnvOpen(lsm_env *, const char *, int, lsm_file **); +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile); +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock); +int lsmEnvTestLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int nLock, int); + +int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); +void lsmEnvShmBarrier(lsm_env *); +void lsmEnvShmUnmap(lsm_env *, lsm_file *, int); + +void 
lsmEnvSleep(lsm_env *, int); + +int lsmFsReadSyncedId(lsm_db *db, int, i64 *piVal); + +int lsmFsSegmentContainsPg(FileSystem *pFS, Segment *, Pgno, int *); + +void lsmFsPurgeCache(FileSystem *); + +/* +** End of functions from "lsm_file.c". +**************************************************************************/ + +/* +** Functions from file "lsm_sorted.c". +*/ +int lsmInfoPageDump(lsm_db *, Pgno, int, char **); +void lsmSortedCleanup(lsm_db *); +int lsmSortedAutoWork(lsm_db *, int nUnit); + +int lsmSortedWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *); + +int lsmSaveWorker(lsm_db *, int); + +int lsmFlushTreeToDisk(lsm_db *pDb); + +void lsmSortedRemap(lsm_db *pDb); + +void lsmSortedFreeLevel(lsm_env *pEnv, Level *); + +int lsmSortedAdvanceAll(lsm_db *pDb); + +int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *); +int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *); + +void *lsmSortedSplitKey(Level *pLevel, int *pnByte); + +void lsmSortedSaveTreeCursors(lsm_db *); + +int lsmMCursorNew(lsm_db *, MultiCursor **); +void lsmMCursorClose(MultiCursor *, int); +int lsmMCursorSeek(MultiCursor *, int, void *, int , int); +int lsmMCursorFirst(MultiCursor *); +int lsmMCursorPrev(MultiCursor *); +int lsmMCursorLast(MultiCursor *); +int lsmMCursorValid(MultiCursor *); +int lsmMCursorNext(MultiCursor *); +int lsmMCursorKey(MultiCursor *, void **, int *); +int lsmMCursorValue(MultiCursor *, void **, int *); +int lsmMCursorType(MultiCursor *, int *); +lsm_db *lsmMCursorDb(MultiCursor *); +void lsmMCursorFreeCache(lsm_db *); + +int lsmSaveCursors(lsm_db *pDb); +int lsmRestoreCursors(lsm_db *pDb); + +void lsmSortedDumpStructure(lsm_db *pDb, Snapshot *, int, int, const char *); +void lsmFsDumpBlocklists(lsm_db *); + +void lsmSortedExpandBtreePage(Page *pPg, int nOrig); + +void lsmPutU32(u8 *, u32); +u32 lsmGetU32(u8 *); +u64 lsmGetU64(u8 *); + +/* +** Functions from "lsm_varint.c". 
+*/ +int lsmVarintPut32(u8 *, int); +int lsmVarintGet32(u8 *, int *); +int lsmVarintPut64(u8 *aData, i64 iVal); +int lsmVarintGet64(const u8 *aData, i64 *piVal); + +int lsmVarintLen32(int); +int lsmVarintSize(u8 c); + +/* +** Functions from file "main.c". +*/ +void lsmLogMessage(lsm_db *, int, const char *, ...); +int lsmInfoFreelist(lsm_db *pDb, char **pzOut); + +/* +** Functions from file "lsm_log.c". +*/ +int lsmLogBegin(lsm_db *pDb); +int lsmLogWrite(lsm_db *, void *, int, void *, int); +int lsmLogCommit(lsm_db *); +void lsmLogEnd(lsm_db *pDb, int bCommit); +void lsmLogTell(lsm_db *, LogMark *); +void lsmLogSeek(lsm_db *, LogMark *); +void lsmLogClose(lsm_db *); + +int lsmLogRecover(lsm_db *); +int lsmInfoLogStructure(lsm_db *pDb, char **pzVal); + + +/************************************************************************** +** Functions from file "lsm_shared.c". +*/ + +int lsmDbDatabaseConnect(lsm_db*, const char *); +void lsmDbDatabaseRelease(lsm_db *); + +int lsmBeginReadTrans(lsm_db *); +int lsmBeginWriteTrans(lsm_db *); +int lsmBeginFlush(lsm_db *); + +int lsmDetectRoTrans(lsm_db *db, int *); +int lsmBeginRoTrans(lsm_db *db); + +int lsmBeginWork(lsm_db *); +void lsmFinishWork(lsm_db *, int, int *); + +int lsmFinishRecovery(lsm_db *); +void lsmFinishReadTrans(lsm_db *); +int lsmFinishWriteTrans(lsm_db *, int); +int lsmFinishFlush(lsm_db *, int); + +int lsmSnapshotSetFreelist(lsm_db *, int *, int); + +Snapshot *lsmDbSnapshotClient(lsm_db *); +Snapshot *lsmDbSnapshotWorker(lsm_db *); + +void lsmSnapshotSetCkptid(Snapshot *, i64); + +Level *lsmDbSnapshotLevel(Snapshot *); +void lsmDbSnapshotSetLevel(Snapshot *, Level *); + +void lsmDbRecoveryComplete(lsm_db *, int); + +int lsmBlockAllocate(lsm_db *, int, int *); +int lsmBlockFree(lsm_db *, int); +int lsmBlockRefree(lsm_db *, int); + +void lsmFreelistDeltaBegin(lsm_db *); +void lsmFreelistDeltaEnd(lsm_db *); +int lsmFreelistDelta(lsm_db *pDb); + +DbLog *lsmDatabaseLog(lsm_db *pDb); + +#ifdef LSM_DEBUG + int 
lsmHoldingClientMutex(lsm_db *pDb); + int lsmShmAssertLock(lsm_db *db, int iLock, int eOp); + int lsmShmAssertWorker(lsm_db *db); +#endif + +void lsmFreeSnapshot(lsm_env *, Snapshot *); + + +/* Candidate values for the 3rd argument to lsmShmLock() */ +#define LSM_LOCK_UNLOCK 0 +#define LSM_LOCK_SHARED 1 +#define LSM_LOCK_EXCL 2 + +int lsmShmCacheChunks(lsm_db *db, int nChunk); +int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock); +int lsmShmTestLock(lsm_db *db, int iLock, int nLock, int eOp); +void lsmShmBarrier(lsm_db *db); + +#ifdef LSM_DEBUG +void lsmShmHasLock(lsm_db *db, int iLock, int eOp); +#else +# define lsmShmHasLock(x,y,z) +#endif + +int lsmReadlock(lsm_db *, i64 iLsm, u32 iShmMin, u32 iShmMax); + +int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse); +int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse); +int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId); + +int lsmDbMultiProc(lsm_db *); +void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *); +LsmFile *lsmDbRecycleFd(lsm_db *); + +int lsmWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *); + +int lsmCheckCompressionId(lsm_db *, u32); + + +/************************************************************************** +** functions in lsm_str.c +*/ +void lsmStringInit(LsmString*, lsm_env *pEnv); +int lsmStringExtend(LsmString*, int); +int lsmStringAppend(LsmString*, const char *, int); +void lsmStringVAppendf(LsmString*, const char *zFormat, va_list, va_list); +void lsmStringAppendf(LsmString*, const char *zFormat, ...); +void lsmStringClear(LsmString*); +char *lsmMallocPrintf(lsm_env*, const char*, ...); +int lsmStringBinAppend(LsmString *pStr, const u8 *a, int n); + +int lsmStrlen(const char *zName); + + + +/* +** Round up a number to the next larger multiple of 8. This is used +** to force 8-byte alignment on 64-bit architectures. +*/ +#define ROUND8(x) (((x)+7)&~7) + +#define LSM_MIN(x,y) ((x)>(y) ? (y) : (x)) +#define LSM_MAX(x,y) ((x)>(y) ? 
(x) : (y)) + +#endif diff --git a/ext/lsm1/lsm_ckpt.c b/ext/lsm1/lsm_ckpt.c new file mode 100644 index 0000000000..36f5cd3146 --- /dev/null +++ b/ext/lsm1/lsm_ckpt.c @@ -0,0 +1,1237 @@ +/* +** 2011-09-11 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** This file contains code to read and write checkpoints. +** +** A checkpoint represents the database layout at a single point in time. +** It includes a log offset. When an existing database is opened, the +** current state is determined by reading the newest checkpoint and updating +** it with all committed transactions from the log that follow the specified +** offset. +*/ +#include "lsmInt.h" + +/* +** CHECKPOINT BLOB FORMAT: +** +** A checkpoint blob is a series of unsigned 32-bit integers stored in +** big-endian byte order. As follows: +** +** Checkpoint header (see the CKPT_HDR_XXX #defines): +** +** 1. The checkpoint id MSW. +** 2. The checkpoint id LSW. +** 3. The number of integer values in the entire checkpoint, including +** the two checksum values. +** 4. The compression scheme id. +** 5. The total number of blocks in the database. +** 6. The block size. +** 7. The number of levels. +** 8. The nominal database page size. +** 9. The number of pages (in total) written to the database file. +** +** Log pointer: +** +** 1. The log offset MSW. +** 2. The log offset LSW. +** 3. Log checksum 0. +** 4. Log checksum 1. +** +** Note that the "log offset" is not the literal byte offset. Instead, +** it is the byte offset multiplied by 2, with least significant bit +** toggled each time the log pointer value is changed. 
This is to make +** sure that this field changes each time the log pointer is updated, +** even if the log file itself is disabled. See lsmTreeMakeOld(). +** +** See ckptExportLog() and ckptImportLog(). +** +** Append points: +** +** 8 integers (4 * 64-bit page numbers). See ckptExportAppendlist(). +** +** For each level in the database, a level record. Formatted as follows: +** +** 0. Age of the level (least significant 16-bits). And flags mask (most +** significant 16-bits). +** 1. The number of right-hand segments (nRight, possibly 0), +** 2. Segment record for left-hand segment (8 integers defined below), +** 3. Segment record for each right-hand segment (8 integers defined below), +** 4. If nRight>0, the number of segments involved in the merge +** 5. If nRight>0, current nSkip value (see Merge structure defn.), +** 6. For each segment in the merge: +** 6a. Page number of next cell to read during merge (this field +** is 64-bits - 2 integers) +** 6b. Cell number of next cell to read during merge +** 7. Page containing current split-key (64-bits - 2 integers). +** 8. Cell within page containing current split-key. +** 9. Current pointer value (64-bits - 2 integers). +** +** The block redirect array: +** +** 1. Number of redirections (maximum LSM_MAX_BLOCK_REDIRECTS). +** 2. For each redirection: +** a. "from" block number +** b. "to" block number +** +** The in-memory freelist entries. Each entry is either an insert or a +** delete. The in-memory freelist is to the free-block-list as the +** in-memory tree is to the user's database content. +** +** 1. Number of free-list entries stored in checkpoint header. +** 2. Number of free blocks (in total). +** 3. Total number of blocks freed during database lifetime. +** 4. For each entry: +** 4a. Block number of free block. +** 4b. A 64-bit integer (MSW followed by LSW). -1 for a delete entry, +** or the associated checkpoint id for an insert. +** +** The checksum: +** +** 1. Checksum value 1. +** 2. Checksum value 2. 
+** +** In the above, a segment record consists of the following four 64-bit +** fields (converted to 2 * u32 by storing the MSW followed by LSW): +** +** 1. First page of array, +** 2. Last page of array, +** 3. Root page of array (or 0), +** 4. Size of array in pages. +*/ + +/* +** LARGE NUMBERS OF LEVEL RECORDS: +** +** A limit on the number of rhs segments that may be present in the database +** file. Defining this limit ensures that all level records fit within +** the 4096 byte limit for checkpoint blobs. +** +** The number of right-hand-side segments in a database is counted as +** follows: +** +** * For each level in the database not undergoing a merge, add 1. +** +** * For each level in the database that is undergoing a merge, add +** the number of segments on the rhs of the level. +** +** A level record not undergoing a merge is 10 integers. A level record +** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the +** separators from the next level) is (11*nRhs+20) integers. The maximum +** per right-hand-side level is therefore 21 integers. So the maximum +** size of all level records in a checkpoint is 21*40=840 integers. +** (NOTE(review): the formula above gives 31 integers for nRhs==1, which +** suggests the per-segment figure of 21 is out of date - confirm.) +** +** TODO: Before pointer values were changed from 32 to 64 bits, the above +** used to come to 420 bytes - leaving significant space for a free-list +** prefix. No more. To fix this, reduce the size of the level records in +** a db snapshot, and improve management of the free-list tail in +** lsm_sorted.c. +*/ +#define LSM_MAX_RHS_SEGMENTS 40 + +/* +** LARGE NUMBERS OF FREELIST ENTRIES: +** +** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h) +** on the number of free-list entries stored in a checkpoint. Since each +** free-list entry consists of 3 integers, the maximum free-list size is +** 3*LSM_MAX_FREELIST_ENTRIES = 3*24 = 72 integers. Combined with the limit +** on rhs segments defined above, this ensures that a checkpoint always fits +** within a 4096 byte meta page. 
+** +** If the database contains more free blocks than the limit in use, the +** "overflow" flag in the checkpoint header is set and the remainder are +** stored in the system FREELIST entry in the LSM (along with user data). The +** value accompanying the FREELIST key in the LSM is, like a checkpoint, an +** array of 32-bit big-endian integers. As follows: +** +** For each entry: +** a. Block number of free block. +** b. MSW of associated checkpoint id. +** c. LSW of associated checkpoint id. +** +** The number of entries is not required - it is implied by the size of the +** value blob containing the integer array. +** +** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit. +** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST. +*/ + +/* +** The argument to this macro must be of type u32. On a little-endian +** architecture, it returns the u32 value that results from interpreting +** the 4 bytes as a big-endian value. On a big-endian architecture, it +** returns the value that would be produced by interpreting the 4 bytes +** of the input value as a little-endian integer. +*/ +#define BYTESWAP32(x) ( \ + (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \ + + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \ +) + +static const int one = 1; +#define LSM_LITTLE_ENDIAN (*(u8 *)(&one)) + +/* Sizes, in integers, of various parts of the checkpoint. */ +#define CKPT_HDR_SIZE 9 +#define CKPT_LOGPTR_SIZE 4 +#define CKPT_APPENDLIST_SIZE (LSM_APPLIST_SZ * 2) + +/* A #define to describe each integer in the checkpoint header. 
*/ +#define CKPT_HDR_ID_MSW 0 +#define CKPT_HDR_ID_LSW 1 +#define CKPT_HDR_NCKPT 2 +#define CKPT_HDR_CMPID 3 +#define CKPT_HDR_NBLOCK 4 +#define CKPT_HDR_BLKSZ 5 +#define CKPT_HDR_NLEVEL 6 +#define CKPT_HDR_PGSZ 7 +#define CKPT_HDR_NWRITE 8 + +#define CKPT_HDR_LO_MSW 9 +#define CKPT_HDR_LO_LSW 10 +#define CKPT_HDR_LO_CKSUM1 11 +#define CKPT_HDR_LO_CKSUM2 12 + +typedef struct CkptBuffer CkptBuffer; + +/* +** Dynamic buffer used to accumulate data for a checkpoint. +*/ +struct CkptBuffer { + lsm_env *pEnv; + int nAlloc; + u32 *aCkpt; +}; + +/* +** Calculate the checksum of the checkpoint specified by arguments aCkpt and +** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning. +** +** The value of the nCkpt parameter includes the two checksum values at +** the end of the checkpoint. They are not used as inputs to the checksum +** calculation. The checksum is based on the array of (nCkpt-2) integers +** at aCkpt[]. +*/ +static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){ + int i; + u32 cksum1 = 1; + u32 cksum2 = 2; + + if( nCkpt % 2 ){ + cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF; + cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000; + } + + for(i=0; (i+3)=p->nAlloc ){ + int nNew = LSM_MAX(8, iIdx*2); + p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32)); + if( !p->aCkpt ){ + *pRc = LSM_NOMEM_BKPT; + return; + } + p->nAlloc = nNew; + } + p->aCkpt[iIdx] = iVal; +} + +/* +** Argument aInt points to an array nInt elements in size. Switch the +** endian-ness of each element of the array. 
+*/ +static void ckptChangeEndianness(u32 *aInt, int nInt){ + if( LSM_LITTLE_ENDIAN ){ + int i; + for(i=0; iaCkpt, nCkpt+2, &aCksum[0], &aCksum[1]); + ckptSetValue(p, nCkpt, aCksum[0], pRc); + ckptSetValue(p, nCkpt+1, aCksum[1], pRc); + } +} + +static void ckptAppend64(CkptBuffer *p, int *piOut, i64 iVal, int *pRc){ + int iOut = *piOut; + ckptSetValue(p, iOut++, (iVal >> 32) & 0xFFFFFFFF, pRc); + ckptSetValue(p, iOut++, (iVal & 0xFFFFFFFF), pRc); + *piOut = iOut; +} + +static i64 ckptRead64(u32 *a){ + return (((i64)a[0]) << 32) + (i64)a[1]; +} + +static i64 ckptGobble64(u32 *a, int *piIn){ + int iIn = *piIn; + *piIn += 2; + return ckptRead64(&a[iIn]); +} + + +/* +** Append a segment record corresponding to pSeg to the checkpoint buffer +** passed as the second argument. The record is four 64-bit fields, each +** stored as two u32 values (8 integers in total). +*/ +static void ckptExportSegment( + Segment *pSeg, + CkptBuffer *p, + int *piOut, + int *pRc +){ + ckptAppend64(p, piOut, pSeg->iFirst, pRc); + ckptAppend64(p, piOut, pSeg->iLastPg, pRc); + ckptAppend64(p, piOut, pSeg->iRoot, pRc); + ckptAppend64(p, piOut, pSeg->nSize, pRc); +} + +static void ckptExportLevel( + Level *pLevel, /* Level object to serialize */ + CkptBuffer *p, /* Append new level record to this ckpt */ + int *piOut, /* IN/OUT: Size of checkpoint so far */ + int *pRc /* IN/OUT: Error code */ +){ + int iOut = *piOut; + Merge *pMerge; + + pMerge = pLevel->pMerge; + ckptSetValue(p, iOut++, (u32)pLevel->iAge + (u32)(pLevel->flags<<16), pRc); + ckptSetValue(p, iOut++, pLevel->nRight, pRc); + ckptExportSegment(&pLevel->lhs, p, &iOut, pRc); + + assert( (pLevel->nRight>0)==(pMerge!=0) ); + if( pMerge ){ + int i; + for(i=0; inRight; i++){ + ckptExportSegment(&pLevel->aRhs[i], p, &iOut, pRc); + } + assert( pMerge->nInput==pLevel->nRight + || pMerge->nInput==pLevel->nRight+1 + ); + ckptSetValue(p, iOut++, pMerge->nInput, pRc); + ckptSetValue(p, iOut++, pMerge->nSkip, pRc); + for(i=0; inInput; i++){ + ckptAppend64(p, &iOut, pMerge->aInput[i].iPg, pRc); + ckptSetValue(p, iOut++, 
pMerge->aInput[i].iCell, pRc); + } + ckptAppend64(p, &iOut, pMerge->splitkey.iPg, pRc); + ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc); + ckptAppend64(p, &iOut, pMerge->iCurrentPtr, pRc); + } + + *piOut = iOut; +} + +/* +** Populate the log offset fields of the checkpoint buffer. 4 values. +*/ +static void ckptExportLog( + lsm_db *pDb, + int bFlush, + CkptBuffer *p, + int *piOut, + int *pRc +){ + int iOut = *piOut; + + assert( iOut==CKPT_HDR_LO_MSW ); + + if( bFlush ){ + i64 iOff = pDb->treehdr.iOldLog; + ckptAppend64(p, &iOut, iOff, pRc); + ckptSetValue(p, iOut++, pDb->treehdr.oldcksum0, pRc); + ckptSetValue(p, iOut++, pDb->treehdr.oldcksum1, pRc); + }else{ + for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){ + ckptSetValue(p, iOut, pDb->pShmhdr->aSnap2[iOut], pRc); + } + } + + assert( *pRc || iOut==CKPT_HDR_LO_CKSUM2+1 ); + *piOut = iOut; +} + +static void ckptExportAppendlist( + lsm_db *db, /* Database connection */ + CkptBuffer *p, /* Checkpoint buffer to write to */ + int *piOut, /* IN/OUT: Offset within checkpoint buffer */ + int *pRc /* IN/OUT: Error code */ +){ + int i; + Pgno *aiAppend = db->pWorker->aiAppend; + + for(i=0; ipFS; /* File system object */ + Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */ + int nLevel = 0; /* Number of levels in checkpoint */ + int iLevel; /* Used to count out nLevel levels */ + int iOut = 0; /* Current offset in aCkpt[] */ + Level *pLevel; /* Level iterator */ + int i; /* Iterator used while serializing freelist */ + CkptBuffer ckpt; + + /* Initialize the output buffer */ + memset(&ckpt, 0, sizeof(CkptBuffer)); + ckpt.pEnv = pDb->pEnv; + iOut = CKPT_HDR_SIZE; + + /* Write the log offset into the checkpoint. */ + ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc); + + /* Write the append-point list */ + ckptExportAppendlist(pDb, &ckpt, &iOut, &rc); + + /* Figure out how many levels will be written to the checkpoint. */ + for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++; + + /* Serialize nLevel levels. 
*/ + iLevel = 0; + for(pLevel=lsmDbSnapshotLevel(pSnap); iLevelpNext){ + ckptExportLevel(pLevel, &ckpt, &iOut, &rc); + iLevel++; + } + + /* Write the block-redirect list */ + ckptSetValue(&ckpt, iOut++, pSnap->redirect.n, &rc); + for(i=0; iredirect.n; i++){ + ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iFrom, &rc); + ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iTo, &rc); + } + + /* Write the freelist */ + assert( pSnap->freelist.nEntry<=pDb->nMaxFreelist ); + if( rc==LSM_OK ){ + int nFree = pSnap->freelist.nEntry; + ckptSetValue(&ckpt, iOut++, nFree, &rc); + for(i=0; ifreelist.aEntry[i]; + ckptSetValue(&ckpt, iOut++, p->iBlk, &rc); + ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc); + ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc); + } + } + + /* Write the checkpoint header */ + assert( iId>=0 ); + assert( pSnap->iCmpId==pDb->compress.iId + || pSnap->iCmpId==LSM_COMPRESSION_EMPTY + ); + ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc); + ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc); + ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc); + ckptSetValue(&ckpt, CKPT_HDR_CMPID, pDb->compress.iId, &rc); + ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc); + ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc); + ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc); + ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc); + ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc); + + if( bCksum ){ + ckptAddChecksum(&ckpt, iOut, &rc); + }else{ + ckptSetValue(&ckpt, iOut, 0, &rc); + ckptSetValue(&ckpt, iOut+1, 0, &rc); + } + iOut += 2; + assert( iOut<=1024 ); + +#ifdef LSM_LOG_FREELIST + lsmLogMessage(pDb, rc, + "ckptExportSnapshot(): id=%lld freelist: %d", iId, pSnap->freelist.nEntry + ); + for(i=0; ifreelist.nEntry; i++){ + lsmLogMessage(pDb, rc, + "ckptExportSnapshot(): iBlk=%d id=%lld", + pSnap->freelist.aEntry[i].iBlk, + pSnap->freelist.aEntry[i].iId + ); + } +#endif + + *ppCkpt = (void 
*)ckpt.aCkpt; + if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut; + return rc; +} + + +/* +** Helper function for ckptLoadLevels(). Read one segment record (four +** 64-bit fields) from aIn[], starting at offset *piIn, into *pSegment. +** *piIn is advanced past the record. +*/ +static void ckptNewSegment( + u32 *aIn, + int *piIn, + Segment *pSegment /* Populate this structure */ +){ + assert( pSegment->iFirst==0 && pSegment->iLastPg==0 ); + assert( pSegment->nSize==0 && pSegment->iRoot==0 ); + pSegment->iFirst = ckptGobble64(aIn, piIn); + pSegment->iLastPg = ckptGobble64(aIn, piIn); + pSegment->iRoot = ckptGobble64(aIn, piIn); + pSegment->nSize = ckptGobble64(aIn, piIn); + assert( pSegment->iFirst ); +} + +static int ckptSetupMerge(lsm_db *pDb, u32 *aInt, int *piIn, Level *pLevel){ + Merge *pMerge; /* Allocated Merge object */ + int nInput; /* Number of input segments in merge */ + int iIn = *piIn; /* Next value to read from aInt[] */ + int i; /* Iterator variable */ + int nByte; /* Number of bytes to allocate */ + + /* Allocate the Merge object. If malloc() fails, return LSM_NOMEM. */ + nInput = (int)aInt[iIn++]; + nByte = sizeof(Merge) + sizeof(MergeInput) * nInput; + pMerge = (Merge *)lsmMallocZero(pDb->pEnv, nByte); + if( !pMerge ) return LSM_NOMEM_BKPT; + pLevel->pMerge = pMerge; + + /* Populate the Merge object. */ + pMerge->aInput = (MergeInput *)&pMerge[1]; + pMerge->nInput = nInput; + pMerge->iOutputOff = -1; + pMerge->nSkip = (int)aInt[iIn++]; + for(i=0; iaInput[i].iPg = ckptGobble64(aInt, &iIn); + pMerge->aInput[i].iCell = (int)aInt[iIn++]; + } + pMerge->splitkey.iPg = ckptGobble64(aInt, &iIn); + pMerge->splitkey.iCell = (int)aInt[iIn++]; + pMerge->iCurrentPtr = ckptGobble64(aInt, &iIn); + + /* Set *piIn and return LSM_OK. 
*/ + *piIn = iIn; + return LSM_OK; +} + + +static int ckptLoadLevels( + lsm_db *pDb, + u32 *aIn, + int *piIn, + int nLevel, + Level **ppLevel +){ + int i; + int rc = LSM_OK; + Level *pRet = 0; + Level **ppNext; + int iIn = *piIn; + + ppNext = &pRet; + for(i=0; rc==LSM_OK && ipEnv, sizeof(Level), &rc); + if( rc==LSM_OK ){ + pLevel->iAge = (u16)(aIn[iIn] & 0x0000FFFF); + pLevel->flags = (u16)((aIn[iIn]>>16) & 0x0000FFFF); + iIn++; + pLevel->nRight = aIn[iIn++]; + if( pLevel->nRight ){ + int nByte = sizeof(Segment) * pLevel->nRight; + pLevel->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc); + } + if( rc==LSM_OK ){ + *ppNext = pLevel; + ppNext = &pLevel->pNext; + + /* Allocate the main segment */ + ckptNewSegment(aIn, &iIn, &pLevel->lhs); + + /* Allocate each of the right-hand segments, if any */ + for(iRight=0; iRightnRight; iRight++){ + ckptNewSegment(aIn, &iIn, &pLevel->aRhs[iRight]); + } + + /* Set up the Merge object, if required */ + if( pLevel->nRight>0 ){ + rc = ckptSetupMerge(pDb, aIn, &iIn, pLevel); + } + } + } + } + + if( rc!=LSM_OK ){ + /* An OOM must have occurred. Free any level structures allocated and + ** return the error to the caller. 
*/ + lsmSortedFreeLevel(pDb->pEnv, pRet); + pRet = 0; + } + + *ppLevel = pRet; + *piIn = iIn; + return rc; +} + + +int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){ + int rc = LSM_OK; + if( nVal>0 ){ + u32 *aIn; + + aIn = lsmMallocRc(pDb->pEnv, nVal, &rc); + if( aIn ){ + Level *pLevel = 0; + Level *pParent; + + int nIn; + int nLevel; + int iIn = 1; + memcpy(aIn, pVal, nVal); + nIn = nVal / sizeof(u32); + + ckptChangeEndianness(aIn, nIn); + nLevel = aIn[0]; + rc = ckptLoadLevels(pDb, aIn, &iIn, nLevel, &pLevel); + lsmFree(pDb->pEnv, aIn); + assert( rc==LSM_OK || pLevel==0 ); + if( rc==LSM_OK ){ + pParent = lsmDbSnapshotLevel(pDb->pWorker); + assert( pParent ); + while( pParent->pNext ) pParent = pParent->pNext; + pParent->pNext = pLevel; + } + } + } + + return rc; +} + +/* +** Return the data for the LEVELS record. +** +** The size of the checkpoint that can be stored in the database header +** must not exceed 1024 32-bit integers. Normally, it does not. However, +** if it does, part of the checkpoint must be stored in the LSM. This +** routine returns that part. 
+*/ +int lsmCheckpointLevels( + lsm_db *pDb, /* Database handle */ + int nLevel, /* Number of levels to write to blob */ + void **paVal, /* OUT: Pointer to LEVELS blob */ + int *pnVal /* OUT: Size of LEVELS blob in bytes */ +){ + Level *p; /* Used to iterate through levels */ + int nAll= 0; + int rc; + int i; + int iOut; + CkptBuffer ckpt; + assert( nLevel>0 ); + + for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext) nAll++; + + assert( nAll>nLevel ); + nAll -= nLevel; + for(p=lsmDbSnapshotLevel(pDb->pWorker); p && nAll>0; p=p->pNext) nAll--; + + memset(&ckpt, 0, sizeof(CkptBuffer)); + ckpt.pEnv = pDb->pEnv; + + ckptSetValue(&ckpt, 0, nLevel, &rc); + iOut = 1; + for(i=0; rc==LSM_OK && ipNext; + } + assert( rc!=LSM_OK || p==0 ); + + if( rc==LSM_OK ){ + ckptChangeEndianness(ckpt.aCkpt, iOut); + *paVal = (void *)ckpt.aCkpt; + *pnVal = iOut * sizeof(u32); + }else{ + *pnVal = 0; + *paVal = 0; + } + + return rc; +} + +/* +** Read the checkpoint id from meta-page pPg. +*/ +static i64 ckptLoadId(MetaPage *pPg){ + i64 ret = 0; + if( pPg ){ + int nData; + u8 *aData = lsmFsMetaPageData(pPg, &nData); + ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) + + ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); + } + return ret; +} + +/* +** Return true if the buffer passed as an argument contains a valid +** checkpoint. +*/ +static int ckptChecksumOk(u32 *aCkpt){ + u32 nCkpt = aCkpt[CKPT_HDR_NCKPT]; + u32 cksum1; + u32 cksum2; + + if( nCkpt(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0; + ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2); + return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]); +} + +/* +** Attempt to load a checkpoint from meta page iMeta. +** +** This function is a no-op if *pRc is set to any value other than LSM_OK +** when it is called. If an error occurs, *pRc is set to an LSM error code +** before returning. 
+** +** If no error occurs and the checkpoint is successfully loaded, copy it to +** ShmHeader.aSnap1[] and ShmHeader.aSnap2[], and set ShmHeader.iMetaPage +** to indicate its origin. In this case return 1. Or, if the checkpoint +** cannot be loaded (because the checksum does not compute), return 0. +*/ +static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){ + int bLoaded = 0; /* Return value */ + if( *pRc==LSM_OK ){ + int rc = LSM_OK; /* Error code */ + u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */ + u32 nCkpt; /* Number of elements in aCkpt[] */ + int nData; /* Bytes of data in aData[] */ + u8 *aData; /* Meta page data */ + + aData = lsmFsMetaPageData(pPg, &nData); + nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); + if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){ + aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc); + } + if( aCkpt ){ + memcpy(aCkpt, aData, nCkpt*sizeof(u32)); + ckptChangeEndianness(aCkpt, nCkpt); + if( ckptChecksumOk(aCkpt) ){ + ShmHeader *pShm = pDb->pShmhdr; + memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32)); + memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32)); + memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32)); + pShm->iMetaPage = iMeta; + bLoaded = 1; + } + } + + lsmFree(pDb->pEnv, aCkpt); + *pRc = rc; + } + return bLoaded; +} + +/* +** Initialize the shared-memory header with an empty snapshot. This function +** is called when no valid snapshot can be found in the database header. 
+*/ +static void ckptLoadEmpty(lsm_db *pDb){ + u32 aCkpt[] = { + 0, /* CKPT_HDR_ID_MSW */ + 10, /* CKPT_HDR_ID_LSW */ + 0, /* CKPT_HDR_NCKPT */ + LSM_COMPRESSION_EMPTY, /* CKPT_HDR_CMPID */ + 0, /* CKPT_HDR_NBLOCK */ + 0, /* CKPT_HDR_BLKSZ */ + 0, /* CKPT_HDR_NLEVEL */ + 0, /* CKPT_HDR_PGSZ */ + 0, /* CKPT_HDR_NWRITE */ + 0, 0, 1234, 5678, /* The log pointer and initial checksum */ + 0,0,0,0, 0,0,0,0, /* The append list */ + 0, /* The redirected block list */ + 0, /* The free block list */ + 0, 0 /* Space for checksum values */ + }; + u32 nCkpt = array_size(aCkpt); + ShmHeader *pShm = pDb->pShmhdr; + + aCkpt[CKPT_HDR_NCKPT] = nCkpt; + aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz; + aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz; + ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]); + + memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32)); + memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32)); + memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32)); +} + +/* +** This function is called as part of database recovery to initialize the +** ShmHeader.aSnap1[] and ShmHeader.aSnap2[] snapshots. +*/ +int lsmCheckpointRecover(lsm_db *pDb){ + int rc = LSM_OK; /* Return Code */ + i64 iId1; /* Id of checkpoint on meta-page 1 */ + i64 iId2; /* Id of checkpoint on meta-page 2 */ + int bLoaded = 0; /* True once checkpoint has been loaded */ + int cmp; /* True if (iId2>iId1) */ + MetaPage *apPg[2] = {0, 0}; /* Meta-pages 1 and 2 */ + + rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]); + if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]); + + iId1 = ckptLoadId(apPg[0]); + iId2 = ckptLoadId(apPg[1]); + cmp = (iId2 > iId1); + bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc); + if( bLoaded==0 ){ + bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc); + } + + /* The database does not contain a valid checkpoint. Initialize the shared + ** memory header with an empty checkpoint. 
*/ + if( bLoaded==0 ){ + ckptLoadEmpty(pDb); + } + + lsmFsMetaPageRelease(apPg[0]); + lsmFsMetaPageRelease(apPg[1]); + + return rc; +} + +/* +** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta. +*/ +int lsmCheckpointStore(lsm_db *pDb, int iMeta){ + MetaPage *pPg = 0; + int rc; + + assert( iMeta==1 || iMeta==2 ); + rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg); + if( rc==LSM_OK ){ + u8 *aData; + int nData; + int nCkpt; + + nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT]; + aData = lsmFsMetaPageData(pPg, &nData); + memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32)); + ckptChangeEndianness((u32 *)aData, nCkpt); + rc = lsmFsMetaPageRelease(pPg); + } + + return rc; +} + +/* +** Copy the current client snapshot from shared-memory to pDb->aSnapshot[]. +*/ +int lsmCheckpointLoad(lsm_db *pDb, int *piRead){ + int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL; + ShmHeader *pShm = pDb->pShmhdr; + while( (nRem--)>0 ){ + int nInt; + + nInt = pShm->aSnap1[CKPT_HDR_NCKPT]; + if( nInt<=(LSM_META_PAGE_SIZE / sizeof(u32)) ){ + memcpy(pDb->aSnapshot, pShm->aSnap1, nInt*sizeof(u32)); + if( ckptChecksumOk(pDb->aSnapshot) ){ + if( piRead ) *piRead = 1; + return LSM_OK; + } + } + + nInt = pShm->aSnap2[CKPT_HDR_NCKPT]; + if( nInt<=(LSM_META_PAGE_SIZE / sizeof(u32)) ){ + memcpy(pDb->aSnapshot, pShm->aSnap2, nInt*sizeof(u32)); + if( ckptChecksumOk(pDb->aSnapshot) ){ + if( piRead ) *piRead = 2; + return LSM_OK; + } + } + + lsmShmBarrier(pDb); + } + return LSM_PROTOCOL_BKPT; +} + +int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId){ + int rc; + + assert( db->pClient==0 && db->pWorker==0 ); + rc = lsmCheckpointLoad(db, 0); + if( rc==LSM_OK ){ + *piCmpId = db->aSnapshot[CKPT_HDR_CMPID]; + } + + return rc; +} + +int lsmCheckpointLoadOk(lsm_db *pDb, int iSnap){ + u32 *aShm; + assert( iSnap==1 || iSnap==2 ); + aShm = (iSnap==1) ? 
pDb->pShmhdr->aSnap1 : pDb->pShmhdr->aSnap2; + return (lsmCheckpointId(pDb->aSnapshot, 0)==lsmCheckpointId(aShm, 0) ); +} + +int lsmCheckpointClientCacheOk(lsm_db *pDb){ + return ( pDb->pClient + && pDb->pClient->iId==lsmCheckpointId(pDb->aSnapshot, 0) + && pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap1, 0) + && pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap2, 0) + ); +} + +int lsmCheckpointLoadWorker(lsm_db *pDb){ + int rc; + ShmHeader *pShm = pDb->pShmhdr; + int nInt1; + int nInt2; + + /* Must be holding the WORKER lock to do this. Or DMS1. */ + assert( + lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) + || lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) + ); + + /* Check that the two snapshots match. If not, repair them. */ + nInt1 = pShm->aSnap1[CKPT_HDR_NCKPT]; + nInt2 = pShm->aSnap2[CKPT_HDR_NCKPT]; + if( nInt1!=nInt2 || memcmp(pShm->aSnap1, pShm->aSnap2, nInt2*sizeof(u32)) ){ + if( ckptChecksumOk(pShm->aSnap1) ){ + memcpy(pShm->aSnap2, pShm->aSnap1, sizeof(u32)*nInt1); + }else if( ckptChecksumOk(pShm->aSnap2) ){ + memcpy(pShm->aSnap1, pShm->aSnap2, sizeof(u32)*nInt2); + }else{ + return LSM_PROTOCOL_BKPT; + } + } + + rc = lsmCheckpointDeserialize(pDb, 1, pShm->aSnap1, &pDb->pWorker); + if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase; + + if( rc==LSM_OK ){ + rc = lsmCheckCompressionId(pDb, pDb->pWorker->iCmpId); + } + +#if 0 + assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); +#endif + return rc; +} + +int lsmCheckpointDeserialize( + lsm_db *pDb, + int bInclFreelist, /* If true, deserialize free-list */ + u32 *aCkpt, + Snapshot **ppSnap +){ + int rc = LSM_OK; + Snapshot *pNew; + + pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc); + if( rc==LSM_OK ){ + Level *pLvl; + int nFree; + int i; + int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL]; + int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE; + + pNew->iId = lsmCheckpointId(aCkpt, 0); + pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK]; + pNew->nWrite = 
aCkpt[CKPT_HDR_NWRITE]; + rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel); + pNew->iLogOff = lsmCheckpointLogOffset(aCkpt); + pNew->iCmpId = aCkpt[CKPT_HDR_CMPID]; + + /* Make a copy of the append-list */ + for(i=0; iaiAppend[i] = ckptRead64(a); + } + + /* Read the block-redirect list */ + pNew->redirect.n = aCkpt[iIn++]; + if( pNew->redirect.n ){ + pNew->redirect.a = lsmMallocZeroRc(pDb->pEnv, + (sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS), &rc + ); + if( rc==LSM_OK ){ + for(i=0; iredirect.n; i++){ + pNew->redirect.a[i].iFrom = aCkpt[iIn++]; + pNew->redirect.a[i].iTo = aCkpt[iIn++]; + } + } + for(pLvl=pNew->pLevel; pLvl->pNext; pLvl=pLvl->pNext); + if( pLvl->nRight ){ + pLvl->aRhs[pLvl->nRight-1].pRedirect = &pNew->redirect; + }else{ + pLvl->lhs.pRedirect = &pNew->redirect; + } + } + + /* Copy the free-list */ + if( rc==LSM_OK && bInclFreelist ){ + nFree = aCkpt[iIn++]; + if( nFree ){ + pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc( + pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc + ); + if( rc==LSM_OK ){ + int i; + for(i=0; ifreelist.aEntry[i]; + p->iBlk = aCkpt[iIn++]; + p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1]; + iIn += 2; + } + pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree; + } + } + } + } + + if( rc!=LSM_OK ){ + lsmFreeSnapshot(pDb->pEnv, pNew); + pNew = 0; + } + + *ppSnap = pNew; + return rc; +} + +/* +** Connection pDb must be the worker connection in order to call this +** function. It returns true if the database already contains the maximum +** number of levels or false otherwise. +** +** This is used when flushing the in-memory tree to disk. If the database +** is already full, then the caller should invoke lsm_work() or similar +** until it is not full before creating a new level by flushing the in-memory +** tree to disk. Limiting the number of levels in the database ensures that +** the records describing them always fit within the checkpoint blob. 
+*/ +int lsmDatabaseFull(lsm_db *pDb){ + Level *p; + int nRhs = 0; + + assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ); + assert( pDb->pWorker ); + + for(p=pDb->pWorker->pLevel; p; p=p->pNext){ + nRhs += (p->nRight ? p->nRight : 1); + } + + return (nRhs >= LSM_MAX_RHS_SEGMENTS); +} + +/* +** The connection passed as the only argument is currently the worker +** connection. Some work has been performed on the database by the connection, +** but no new snapshot has been written into shared memory. +** +** This function updates the shared-memory worker and client snapshots with +** the new snapshot produced by the work performed by pDb. +** +** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM +** error code is returned. +*/ +int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush){ + Snapshot *pSnap = pDb->pWorker; + ShmHeader *pShm = pDb->pShmhdr; + void *p = 0; + int n = 0; + int rc; + + pSnap->iId++; + rc = ckptExportSnapshot(pDb, bFlush, pSnap->iId, 1, &p, &n); + if( rc!=LSM_OK ) return rc; + assert( ckptChecksumOk((u32 *)p) ); + + assert( n<=LSM_META_PAGE_SIZE ); + memcpy(pShm->aSnap2, p, n); + lsmShmBarrier(pDb); + memcpy(pShm->aSnap1, p, n); + lsmFree(pDb->pEnv, p); + + assert( lsmFsIntegrityCheck(pDb) ); + return LSM_OK; +} + +/* +** This function is used to determine the snapshot-id of the most recently +** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of +** the two meta-pages said snapshot resides on (if any). +** +** If successful, this function loads the snapshot from the meta-page, +** verifies its checksum and sets *piId to the snapshot-id before returning +** LSM_OK. Or, if the checksum attempt fails, *piId is set to zero and +** LSM_OK returned. If an error occurs, an LSM error code is returned and +** the final value of *piId is undefined. +** +** NOTE(review): the outputs are only zeroed when iMetaPage is not 1 or 2, +** when rc!=LSM_OK, or when ShmHeader.iMetaPage changes during the call. On +** a checksum mismatch they appear to be left unmodified - confirm. 
+*/ +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite){ + int rc = LSM_OK; + MetaPage *pPg; + u32 iMeta; + + iMeta = pDb->pShmhdr->iMetaPage; + if( iMeta==1 || iMeta==2 ){ + rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg); + if( rc==LSM_OK ){ + int nCkpt; + int nData; + u8 *aData; + + aData = lsmFsMetaPageData(pPg, &nData); + assert( nData==LSM_META_PAGE_SIZE ); + nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); + if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){ + u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc); + if( aCopy ){ + memcpy(aCopy, aData, nCkpt*sizeof(u32)); + ckptChangeEndianness(aCopy, nCkpt); + if( ckptChecksumOk(aCopy) ){ + if( piId ) *piId = lsmCheckpointId(aCopy, 0); + if( piLog ) *piLog = (lsmCheckpointLogOffset(aCopy) >> 1); + if( pnWrite ) *pnWrite = aCopy[CKPT_HDR_NWRITE]; + } + lsmFree(pDb->pEnv, aCopy); + } + } + lsmFsMetaPageRelease(pPg); + } + } + + if( (iMeta!=1 && iMeta!=2) || rc!=LSM_OK || pDb->pShmhdr->iMetaPage!=iMeta ){ + if( piId ) *piId = 0; + if( piLog ) *piLog = 0; + if( pnWrite ) *pnWrite = 0; + } + return rc; +} + +/* +** Return the checkpoint-id of the checkpoint array passed as the first +** argument to this function. If the second argument is true, then assume +** that the checkpoint is made up of 32-bit big-endian integers. If it +** is false, assume that the integers are in machine byte order. 
+*/ +i64 lsmCheckpointId(u32 *aCkpt, int bDisk){ + i64 iId; + if( bDisk ){ + u8 *aData = (u8 *)aCkpt; + iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32); + iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); + }else{ + iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW]; + } + return iId; +} + +u32 lsmCheckpointNBlock(u32 *aCkpt){ + return aCkpt[CKPT_HDR_NBLOCK]; +} + +u32 lsmCheckpointNWrite(u32 *aCkpt, int bDisk){ + if( bDisk ){ + return lsmGetU32((u8 *)&aCkpt[CKPT_HDR_NWRITE]); + }else{ + return aCkpt[CKPT_HDR_NWRITE]; + } +} + +i64 lsmCheckpointLogOffset(u32 *aCkpt){ + return ((i64)aCkpt[CKPT_HDR_LO_MSW] << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW]; +} + +int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; } + +int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; } + +void lsmCheckpointLogoffset( + u32 *aCkpt, + DbLog *pLog +){ + pLog->aRegion[2].iStart = (lsmCheckpointLogOffset(aCkpt) >> 1); + + pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1]; + pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2]; + pLog->iSnapshotId = lsmCheckpointId(aCkpt, 0); +} + +void lsmCheckpointZeroLogoffset(lsm_db *pDb){ + u32 nCkpt; + + nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT]; + assert( nCkpt>CKPT_HDR_NCKPT ); + assert( nCkpt==pDb->pShmhdr->aSnap1[CKPT_HDR_NCKPT] ); + assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap1, nCkpt*sizeof(u32)) ); + assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap2, nCkpt*sizeof(u32)) ); + + pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0; + pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0; + ckptChecksum(pDb->aSnapshot, nCkpt, + &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1] + ); + + memcpy(pDb->pShmhdr->aSnap1, pDb->aSnapshot, nCkpt*sizeof(u32)); + memcpy(pDb->pShmhdr->aSnap2, pDb->aSnapshot, nCkpt*sizeof(u32)); +} + +/* +** Set the output variable to the number of KB of data written into the +** database file since the most recent checkpoint. 
+*/ +int lsmCheckpointSize(lsm_db *db, int *pnKB){ + int rc = LSM_OK; + u32 nSynced; + + /* Set nSynced to the number of pages that had been written when the + ** database was last checkpointed. */ + rc = lsmCheckpointSynced(db, 0, 0, &nSynced); + + if( rc==LSM_OK ){ + u32 nPgsz = db->pShmhdr->aSnap1[CKPT_HDR_PGSZ]; + u32 nWrite = db->pShmhdr->aSnap1[CKPT_HDR_NWRITE]; + *pnKB = (int)(( ((i64)(nWrite - nSynced) * nPgsz) + 1023) / 1024); + } + + return rc; +} diff --git a/ext/lsm1/lsm_file.c b/ext/lsm1/lsm_file.c new file mode 100644 index 0000000000..fa106054d7 --- /dev/null +++ b/ext/lsm1/lsm_file.c @@ -0,0 +1,3292 @@ +/* +** 2011-08-26 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** NORMAL DATABASE FILE FORMAT +** +** The following database file format concepts are used by the code in +** this file to read and write the database file. +** +** Pages: +** +** A database file is divided into pages. The first 8KB of the file consists +** of two 4KB meta-pages. The meta-page size is not configurable. The +** remainder of the file is made up of database pages. The default database +** page size is 4KB. Database pages are aligned to page-size boundaries, +** so if the database page size is larger than 8KB there is a gap between +** the end of the meta pages and the start of the database pages. +** +** Database pages are numbered based on their position in the file. Page N +** begins at byte offset ((N-1)*pgsz). This means that page 1 does not +** exist - since it would always overlap with the meta pages. If the +** page-size is (say) 512 bytes, then the first usable page in the database +** is page 33. 
+** +** It is assumed that the first two meta pages and the data that follows +** them are located on different disk sectors, so that if a power failure +** occurs while writing to a meta page there is no risk of damage to the +** other meta page or any other part of the database file. TODO: This may need +** to be revisited. +** +** Blocks: +** +** The database file is also divided into blocks. The default block size is +** 1MB. When writing to the database file, an attempt is made to write data +** in contiguous block-sized chunks. +** +** The first and last page on each block are special in that they are 4 +** bytes smaller than all other pages. This is because the last four bytes +** of space on the first and last pages of each block are reserved for +** pointers to other blocks (i.e. a 32-bit block number). +** +** Runs: +** +** A run is a sequence of pages that the upper layer uses to store a +** sorted array of database keys (and accompanying data - values, FC +** pointers and so on). Given a page within a run, it is possible to +** navigate to the next page in the run as follows: +** +** a) if the current page is not the last in a block, the next page +** in the run is located immediately after the current page, OR +** +** b) if the current page is the last page in a block, the next page +** in the run is the first page on the block identified by the +** block pointer stored in the last 4 bytes of the current block. +** +** It is possible to navigate to the previous page in a similar fashion, +** using the block pointer embedded in the last 4 bytes of the first page +** of each block as required. +** +** The upper layer is responsible for identifying by page number the +** first and last page of any run that it needs to navigate - there are +** no "end-of-run" markers stored or identified by this layer. This is +** necessary as clients reading different database snapshots may access +** different subsets of a run. 
+** +** THE LOG FILE +** +** This file opens and closes the log file. But it does not contain any +** logic related to the log file format. Instead, it exports the following +** functions that are used by the code in lsm_log.c to read and write the +** log file: +** +** lsmFsOpenLog +** lsmFsWriteLog +** lsmFsSyncLog +** lsmFsReadLog +** lsmFsTruncateLog +** lsmFsCloseAndDeleteLog +** +** COMPRESSED DATABASE FILE FORMAT +** +** The compressed database file format is very similar to the normal format. +** The file still begins with two 4KB meta-pages (which are never compressed). +** It is still divided into blocks. +** +** The first and last four bytes of each block are reserved for 32-bit +** pointer values. Similar to the way four bytes are carved from the end of +** the first and last page of each block in uncompressed databases. From +** the point of view of the upper layer, all pages are the same size - this +** is different from the uncompressed format where the first and last pages +** on each block are 4 bytes smaller than the others. +** +** Pages are stored in variable length compressed form, as follows: +** +** * 3-byte size field containing the size of the compressed page image +** in bytes. The most significant bit of each byte of the size field +** is always set. The remaining 7 bits are used to store a 21-bit +** integer value (in big-endian order - the first byte in the field +** contains the most significant 7 bits). Since the maximum allowed +** size of a compressed page image is (2^17 - 1) bytes, there are +** actually 4 unused bits in the size field. +** +** In other words, if the size of the compressed page image is nSz, +** the header can be serialized as follows: +** +** u8 aHdr[3] +** aHdr[0] = 0x80 | (u8)(nSz >> 14); +** aHdr[1] = 0x80 | (u8)(nSz >> 7); +** aHdr[2] = 0x80 | (u8)(nSz >> 0); +** +** * Compressed page image. +** +** * A second copy of the 3-byte record header. +** +** A page number is a byte offset into the database file. 
So the smallest +** possible page number is 8192 (immediately after the two meta-pages). +** The first and root page of a segment are identified by a page number +** corresponding to the byte offset of the first byte in the corresponding +** page record. The last page of a segment is identified by the byte offset +** of the last byte in its record. +** +** Unlike uncompressed pages, compressed page records may span blocks. +** +** Sometimes, in order to avoid touching sectors that contain synced data +** when writing, it is necessary to insert unused space between compressed +** page records. This can be done as follows: +** +** * For less than 6 bytes of empty space, the first and last byte +** of the free space contain the total number of free bytes. For +** example: +** +** Block of 4 free bytes: 0x04 0x?? 0x?? 0x04 +** Block of 2 free bytes: 0x02 0x02 +** A single free byte: 0x01 +** +** * For 6 or more bytes of empty space, a record similar to a +** compressed page record is added to the segment. A padding record +** is distinguished from a compressed page record by the most +** significant bit of the second byte of the size field, which is +** cleared instead of set. +*/ +#include "lsmInt.h" + +#include +#include +#include + +/* +** File-system object. Each database connection allocates a single instance +** of the following structure. It is used for all access to the database and +** log files. +** +** The database file may be accessed via two methods - using mmap() or using +** read() and write() calls. In the general case both methods are used - a +** prefix of the file is mapped into memory and the remainder accessed using +** read() and write(). This is helpful when accessing very large files (or +** files that may grow very large during the lifetime of a database +** connection) on systems with 32-bit address spaces. 
However, it also requires +** that this object manage two distinct types of Page objects simultaneously - +** those that carry pointers to the mapped file and those that carry arrays +** populated by read() calls. +** +** pFree: +** The head of a singly-linked list containing currently unused Page +** structures suitable for use as mmap-page handles. Connected by the +** Page.pFreeNext pointers. +** +** pMapped: +** The head of a singly-linked list that contains all pages that currently +** carry pointers to the mapped region. This is used if the region is +** ever remapped - the pointers carried by existing pages can be adjusted +** to account for the remapping. Connected by the Page.pMappedNext pointers. +** +** pWaiting: +** When the upper layer wishes to append a new b-tree page to a segment, +** it allocates a Page object that carries a malloc'd block of memory - +** regardless of the mmap-related configuration. The page is not assigned +** a page number at first. When the upper layer has finished constructing +** the page contents, it calls lsmFsPagePersist() to assign a page number +** to it. At this point it is likely that N pages have been written to the +** segment, the (N+1)th page is still outstanding and the b-tree page is +** assigned page number (N+2). To avoid writing page (N+2) before page +** (N+1), the recently completed b-tree page is held in the singly linked +** list headed by pWaiting until page (N+1) has been written. +** +** Function lsmFsFlushWaiting() is responsible for eventually writing +** waiting pages to disk. +** +** apHash/nHash: +** Hash table used to store all Page objects that carry malloc'd arrays, +** except those b-tree pages that have not yet been assigned page numbers. +** Once they have been assigned page numbers - they are added to this +** hash table. +** +** Hash table overflow chains are connected using the Page.pHashNext +** pointers. 
**
** pLruFirst, pLruLast:
**   The first and last entries in a doubly-linked list of pages. This
**   list contains all pages with malloc'd data that are present in the
**   hash table and have a ref-count of zero.
*/
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Log file name (zDb with "-log" appended) */
  int nMetasize;                  /* Size of meta pages in bytes */
  int nPagesize;                  /* Database page-size in bytes */
  int nBlocksize;                 /* Database block-size in bytes */

  /* r/w file descriptors for both files. */
  LsmFile *pLsmFile;              /* Used after lsm_close() to link into list */
  lsm_file *fdDb;                 /* Database file */
  lsm_file *fdLog;                /* Log file (0 while the log is not open) */
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer. */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};

/*
** Database page handle.
**
** pSeg:
**   When lsmFsSortedAppend() is called on a compressed database, the new
**   page is not assigned a page number or location in the database file
**   immediately. Instead, these are assigned by the lsmFsPagePersist() call
**   right before it writes the compressed page image to disk.
**
**   The lsmFsSortedAppend() function sets the pSeg pointer to point to the
**   segment that the new page will be a part of. It is unset by
**   lsmFsPagePersist() after the page is written to disk.
*/
struct Page {
  u8 *aData;                      /* Buffer containing page data */
  int nData;                      /* Bytes of usable data at aData[] */
  Pgno iPg;                       /* Page number */
  int nRef;                       /* Number of outstanding references */
  int flags;                      /* Combination of PAGE_XXX flags */
  Page *pHashNext;                /* Next page in hash table slot */
  Page *pLruNext;                 /* Next page in LRU list */
  Page *pLruPrev;                 /* Previous page in LRU list */
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};

/*
** Meta-data page handle. There are two meta-data pages at the start of
** the database file, each FileSystem.nMetasize bytes in size.
+*/ +struct MetaPage { + int iPg; /* Either 1 or 2 */ + int bWrite; /* Write back to db file on release */ + u8 *aData; /* Pointer to buffer */ + FileSystem *pFS; /* FileSystem that owns this page */ +}; + +/* +** Values for LsmPage.flags +*/ +#define PAGE_DIRTY 0x00000001 /* Set if page is dirty */ +#define PAGE_FREE 0x00000002 /* Set if Page.aData requires lsmFree() */ +#define PAGE_HASPREV 0x00000004 /* Set if page is first on uncomp. block */ + +/* +** Number of pgsz byte pages omitted from the start of block 1. The start +** of block 1 contains two 4096 byte meta pages (8192 bytes in total). +*/ +#define BLOCK1_HDR_SIZE(pgsz) LSM_MAX(1, 8192/(pgsz)) + +/* +** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt() +** to catch IO errors (any error returned by a VFS method). +*/ +#ifndef NDEBUG +static void lsmIoerrBkpt(){ + static int nErr = 0; + nErr++; +} +static int IOERR_WRAPPER(int rc){ + if( rc!=LSM_OK ) lsmIoerrBkpt(); + return rc; +} +#else +# define IOERR_WRAPPER(rc) (rc) +#endif + +#ifdef NDEBUG +# define assert_lists_are_ok(x) +#else +static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash); + +static void assert_lists_are_ok(FileSystem *pFS){ +#if 0 + Page *p; + + assert( pFS->nMapLimit>=0 ); + + /* Check that all pages in the LRU list have nRef==0, pointers to buffers + ** in heap memory, and corresponding entries in the hash table. 
*/ + for(p=pFS->pLruFirst; p; p=p->pLruNext){ + assert( p==pFS->pLruFirst || p->pLruPrev!=0 ); + assert( p==pFS->pLruLast || p->pLruNext!=0 ); + assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p ); + assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p ); + assert( p->nRef==0 ); + assert( p->flags & PAGE_FREE ); + assert( p==fsPageFindInHash(pFS, p->iPg, 0) ); + } +#endif +} +#endif + +/* +** Wrappers around the VFS methods of the lsm_env object: +** +** lsmEnvOpen() +** lsmEnvRead() +** lsmEnvWrite() +** lsmEnvSync() +** lsmEnvSectorSize() +** lsmEnvClose() +** lsmEnvTruncate() +** lsmEnvUnlink() +** lsmEnvRemap() +*/ +int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){ + return pEnv->xOpen(pEnv, zFile, flags, ppNew); +} + +static int lsmEnvRead( + lsm_env *pEnv, + lsm_file *pFile, + lsm_i64 iOff, + void *pRead, + int nRead +){ + return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) ); +} + +static int lsmEnvWrite( + lsm_env *pEnv, + lsm_file *pFile, + lsm_i64 iOff, + const void *pWrite, + int nWrite +){ + return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) ); +} + +static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){ + return IOERR_WRAPPER( pEnv->xSync(pFile) ); +} + +static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ + return pEnv->xSectorSize(pFile); +} + +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ + return IOERR_WRAPPER( pEnv->xClose(pFile) ); +} + +static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){ + return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) ); +} + +static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){ + return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) ); +} + +static int lsmEnvRemap( + lsm_env *pEnv, + lsm_file *pFile, + i64 szMin, + void **ppMap, + i64 *pszMap +){ + return pEnv->xRemap(pFile, szMin, ppMap, pszMap); +} + +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){ + if( pFile==0 ) return LSM_OK; + return pEnv->xLock(pFile, 
iLock, eLock); +} + +int lsmEnvTestLock( + lsm_env *pEnv, + lsm_file *pFile, + int iLock, + int nLock, + int eLock +){ + return pEnv->xTestLock(pFile, iLock, nLock, eLock); +} + +int lsmEnvShmMap( + lsm_env *pEnv, + lsm_file *pFile, + int iChunk, + int sz, + void **ppOut +){ + return pEnv->xShmMap(pFile, iChunk, sz, ppOut); +} + +void lsmEnvShmBarrier(lsm_env *pEnv){ + return pEnv->xShmBarrier(); +} + +void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){ + pEnv->xShmUnmap(pFile, bDel); +} + +void lsmEnvSleep(lsm_env *pEnv, int nUs){ + pEnv->xSleep(pEnv, nUs); +} + + +/* +** Write the contents of string buffer pStr into the log file, starting at +** offset iOff. +*/ +int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){ + assert( pFS->fdLog ); + return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n); +} + +/* +** fsync() the log file. +*/ +int lsmFsSyncLog(FileSystem *pFS){ + assert( pFS->fdLog ); + return lsmEnvSync(pFS->pEnv, pFS->fdLog); +} + +/* +** Read nRead bytes of data starting at offset iOff of the log file. Append +** the results to string buffer pStr. +*/ +int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){ + int rc; /* Return code */ + assert( pFS->fdLog ); + rc = lsmStringExtend(pStr, nRead); + if( rc==LSM_OK ){ + rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead); + pStr->n += nRead; + } + return rc; +} + +/* +** Truncate the log file to nByte bytes in size. +*/ +int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){ + if( pFS->fdLog==0 ) return LSM_OK; + return lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte); +} + +/* +** Truncate the db file to nByte bytes in size. +*/ +int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){ + if( pFS->fdDb==0 ) return LSM_OK; + return lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte); +} + +/* +** Close the log file. Then delete it from the file-system. This function +** is called during database shutdown only. 
+*/ +int lsmFsCloseAndDeleteLog(FileSystem *pFS){ + char *zDel; + + if( pFS->fdLog ){ + lsmEnvClose(pFS->pEnv, pFS->fdLog ); + pFS->fdLog = 0; + } + + zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb); + if( zDel ){ + lsmEnvUnlink(pFS->pEnv, zDel); + lsmFree(pFS->pEnv, zDel); + } + return LSM_OK; +} + +/* +** Return true if page iReal of the database should be accessed using mmap. +** False otherwise. +*/ +static int fsMmapPage(FileSystem *pFS, Pgno iReal){ + return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit); +} + +/* +** Given that there are currently nHash slots in the hash table, return +** the hash key for file iFile, page iPg. +*/ +static int fsHashKey(int nHash, int iPg){ + return (iPg % nHash); +} + +/* +** This is a helper function for lsmFsOpen(). It opens a single file on +** disk (either the database or log file). +*/ +static lsm_file *fsOpenFile( + FileSystem *pFS, /* File system object */ + int bReadonly, /* True to open this file read-only */ + int bLog, /* True for log, false for db */ + int *pRc /* IN/OUT: Error code */ +){ + lsm_file *pFile = 0; + if( *pRc==LSM_OK ){ + int flags = (bReadonly ? LSM_OPEN_READONLY : 0); + const char *zPath = (bLog ? pFS->zLog : pFS->zDb); + + *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile); + } + return pFile; +} + +/* +** If it is not already open, this function opens the log file. It returns +** LSM_OK if successful (or if the log file was already open) or an LSM +** error code otherwise. +** +** The log file must be opened before any of the following may be called: +** +** lsmFsWriteLog +** lsmFsSyncLog +** lsmFsReadLog +*/ +int lsmFsOpenLog(lsm_db *db, int *pbOpen){ + int rc = LSM_OK; + FileSystem *pFS = db->pFS; + + if( 0==pFS->fdLog ){ + pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc); + + if( rc==LSM_IOERR_NOENT && db->bReadonly ){ + rc = LSM_OK; + } + } + + if( pbOpen ) *pbOpen = (pFS->fdLog!=0); + return rc; +} + +/* +** Close the log file, if it is open. 
+*/ +void lsmFsCloseLog(lsm_db *db){ + FileSystem *pFS = db->pFS; + if( pFS->fdLog ){ + lsmEnvClose(pFS->pEnv, pFS->fdLog); + pFS->fdLog = 0; + } +} + +/* +** Open a connection to a database stored within the file-system. +** +** If parameter bReadonly is true, then open a read-only file-descriptor +** on the database file. It is possible that bReadonly will be false even +** if the user requested that pDb be opened read-only. This is because the +** file-descriptor may later on be recycled by a read-write connection. +** If the db file can be opened for read-write access, it always is. Parameter +** bReadonly is only ever true if it has already been determined that the +** db can only be opened for read-only access. +** +** Return LSM_OK if successful or an lsm error code otherwise. +*/ +int lsmFsOpen( + lsm_db *pDb, /* Database connection to open fd for */ + const char *zDb, /* Full path to database file */ + int bReadonly /* True to open db file read-only */ +){ + FileSystem *pFS; + int rc = LSM_OK; + int nDb = strlen(zDb); + int nByte; + + assert( pDb->pFS==0 ); + assert( pDb->pWorker==0 && pDb->pClient==0 ); + + nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1; + pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc); + if( pFS ){ + LsmFile *pLsmFile; + pFS->zDb = (char *)&pFS[1]; + pFS->zLog = &pFS->zDb[nDb+1]; + pFS->nPagesize = LSM_DFLT_PAGE_SIZE; + pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE; + pFS->nMetasize = 4 * 1024; + pFS->pDb = pDb; + pFS->pEnv = pDb->pEnv; + + /* Make a copy of the database and log file names. */ + memcpy(pFS->zDb, zDb, nDb+1); + memcpy(pFS->zLog, zDb, nDb); + memcpy(&pFS->zLog[nDb], "-log", 5); + + /* Allocate the hash-table here. At some point, it should be changed + ** so that it can grow dynamicly. 
*/ + pFS->nCacheMax = 2048*1024 / pFS->nPagesize; + pFS->nHash = 4096; + pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc); + + /* Open the database file */ + pLsmFile = lsmDbRecycleFd(pDb); + if( pLsmFile ){ + pFS->pLsmFile = pLsmFile; + pFS->fdDb = pLsmFile->pFile; + memset(pLsmFile, 0, sizeof(LsmFile)); + }else{ + pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc); + if( rc==LSM_OK ){ + pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc); + } + } + + if( rc!=LSM_OK ){ + lsmFsClose(pFS); + pFS = 0; + }else{ + pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb); + } + } + + pDb->pFS = pFS; + return rc; +} + +/* +** Configure the file-system object according to the current values of +** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options. +*/ +int lsmFsConfigure(lsm_db *db){ + FileSystem *pFS = db->pFS; + if( pFS ){ + lsm_env *pEnv = pFS->pEnv; + Page *pPg; + + assert( pFS->nOut==0 ); + assert( pFS->pWaiting==0 ); + assert( pFS->pMapped==0 ); + + /* Reset any compression/decompression buffers already allocated */ + lsmFree(pEnv, pFS->aIBuffer); + lsmFree(pEnv, pFS->aOBuffer); + pFS->nBuffer = 0; + + /* Unmap the file, if it is currently mapped */ + if( pFS->pMap ){ + lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap); + pFS->nMapLimit = 0; + } + + /* Free all allocated page structures */ + pPg = pFS->pLruFirst; + while( pPg ){ + Page *pNext = pPg->pLruNext; + assert( pPg->flags & PAGE_FREE ); + lsmFree(pEnv, pPg->aData); + lsmFree(pEnv, pPg); + pPg = pNext; + } + + pPg = pFS->pFree; + while( pPg ){ + Page *pNext = pPg->pFreeNext; + lsmFree(pEnv, pPg); + pPg = pNext; + } + + /* Zero pointers that point to deleted page objects */ + pFS->nCacheAlloc = 0; + pFS->pLruFirst = 0; + pFS->pLruLast = 0; + pFS->pFree = 0; + if( pFS->apHash ){ + memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0])); + } + + /* Configure the FileSystem object */ + if( db->compress.xCompress ){ + pFS->pCompress = &db->compress; + 
pFS->nMapLimit = 0; + }else{ + pFS->pCompress = 0; + if( db->iMmap==1 ){ + /* Unlimited */ + pFS->nMapLimit = (i64)1 << 60; + }else{ + /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */ + pFS->nMapLimit = (i64)db->iMmap * 1024; + } + } + } + + return LSM_OK; +} + +/* +** Close and destroy a FileSystem object. +*/ +void lsmFsClose(FileSystem *pFS){ + if( pFS ){ + Page *pPg; + lsm_env *pEnv = pFS->pEnv; + + assert( pFS->nOut==0 ); + pPg = pFS->pLruFirst; + while( pPg ){ + Page *pNext = pPg->pLruNext; + if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData); + lsmFree(pEnv, pPg); + pPg = pNext; + } + + pPg = pFS->pFree; + while( pPg ){ + Page *pNext = pPg->pFreeNext; + if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData); + lsmFree(pEnv, pPg); + pPg = pNext; + } + + if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); + if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog ); + lsmFree(pEnv, pFS->pLsmFile); + lsmFree(pEnv, pFS->apHash); + lsmFree(pEnv, pFS->aIBuffer); + lsmFree(pEnv, pFS->aOBuffer); + lsmFree(pEnv, pFS); + } +} + +/* +** This function is called when closing a database handle (i.e. lsm_close()) +** if there exist other connections to the same database within this process. +** In that case the file-descriptor open on the database file is not closed +** when the FileSystem object is destroyed, as this would cause any POSIX +** locks held by the other connections to be silently dropped (see "man close" +** for details). Instead, the file-descriptor is stored in a list by the +** lsm_shared.c module until it is either closed or reused. +** +** This function returns a pointer to an object that can be linked into +** the list described above. The returned object now 'owns' the database +** file descriptr, so that when the FileSystem object is destroyed, it +** will not be closed. +** +** This function may be called at most once in the life-time of a +** FileSystem object. 
The results of any operations involving the database +** file descriptor are undefined once this function has been called. +** +** None of this is necessary on non-POSIX systems. But we do it anyway in +** the name of using as similar code as possible on all platforms. +*/ +LsmFile *lsmFsDeferClose(FileSystem *pFS){ + LsmFile *p = pFS->pLsmFile; + assert( p->pNext==0 ); + p->pFile = pFS->fdDb; + pFS->fdDb = 0; + pFS->pLsmFile = 0; + return p; +} + +/* +** Allocate a buffer and populate it with the output of the xFileid() +** method of the database file handle. If successful, set *ppId to point +** to the buffer and *pnId to the number of bytes in the buffer and return +** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM +** error code. +*/ +int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){ + lsm_env *pEnv = pDb->pEnv; + FileSystem *pFS = pDb->pFS; + int rc; + int nId = 0; + void *pId; + + rc = pEnv->xFileid(pFS->fdDb, 0, &nId); + pId = lsmMallocZeroRc(pEnv, nId, &rc); + if( rc==LSM_OK ) rc = pEnv->xFileid(pFS->fdDb, pId, &nId); + + if( rc!=LSM_OK ){ + lsmFree(pEnv, pId); + pId = 0; + nId = 0; + } + + *ppId = pId; + *pnId = nId; + return rc; +} + +/* +** Return the nominal page-size used by this file-system. Actual pages +** may be smaller or larger than this value. +*/ +int lsmFsPageSize(FileSystem *pFS){ + return pFS->nPagesize; +} + +/* +** Return the block-size used by this file-system. +*/ +int lsmFsBlockSize(FileSystem *pFS){ + return pFS->nBlocksize; +} + +/* +** Configure the nominal page-size used by this file-system. Actual +** pages may be smaller or larger than this value. +*/ +void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){ + pFS->nPagesize = nPgsz; + pFS->nCacheMax = 2048*1024 / pFS->nPagesize; +} + +/* +** Configure the block-size used by this file-system. +*/ +void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){ + pFS->nBlocksize = nBlocksize; +} + +/* +** Return the page number of the first page on block iBlock. 
Blocks are +** numbered starting from 1. +** +** For a compressed database, page numbers are byte offsets. The first +** page on each block is the byte offset immediately following the 4-byte +** "previous block" pointer at the start of each block. +*/ +static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){ + Pgno iPg; + if( pFS->pCompress ){ + if( iBlock==1 ){ + iPg = pFS->nMetasize * 2 + 4; + }else{ + iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4; + } + }else{ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + if( iBlock==1 ){ + iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize); + }else{ + iPg = 1 + (iBlock-1) * nPagePerBlock; + } + } + return iPg; +} + +/* +** Return the page number of the last page on block iBlock. Blocks are +** numbered starting from 1. +** +** For a compressed database, page numbers are byte offsets. The first +** page on each block is the byte offset of the byte immediately before +** the 4-byte "next block" pointer at the end of each block. +*/ +static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){ + if( pFS->pCompress ){ + return pFS->nBlocksize * (Pgno)iBlock - 1 - 4; + }else{ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + return iBlock * nPagePerBlock; + } +} + +/* +** Return the block number of the block that page iPg is located on. +** Blocks are numbered starting from 1. +*/ +static int fsPageToBlock(FileSystem *pFS, Pgno iPg){ + if( pFS->pCompress ){ + return (iPg / pFS->nBlocksize) + 1; + }else{ + return 1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)); + } +} + +/* +** Return true if page iPg is the last page on its block. +** +** This function is only called in non-compressed database mode. +*/ +static int fsIsLast(FileSystem *pFS, Pgno iPg){ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + assert( !pFS->pCompress ); + return ( iPg && (iPg % nPagePerBlock)==0 ); +} + +/* +** Return true if page iPg is the first page on its block. 
+** +** This function is only called in non-compressed database mode. +*/ +static int fsIsFirst(FileSystem *pFS, Pgno iPg){ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + assert( !pFS->pCompress ); + return ( (iPg % nPagePerBlock)==1 + || (iPgnData; + } + return pPage->aData; +} + +/* +** Return the page number of a page. +*/ +Pgno lsmFsPageNumber(Page *pPage){ + /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */ + return pPage ? pPage->iPg : 0; +} + +/* +** Page pPg is currently part of the LRU list belonging to pFS. Remove +** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this +** operation. +*/ +static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){ + assert( pPg->pLruNext || pPg==pFS->pLruLast ); + assert( pPg->pLruPrev || pPg==pFS->pLruFirst ); + if( pPg->pLruNext ){ + pPg->pLruNext->pLruPrev = pPg->pLruPrev; + }else{ + pFS->pLruLast = pPg->pLruPrev; + } + if( pPg->pLruPrev ){ + pPg->pLruPrev->pLruNext = pPg->pLruNext; + }else{ + pFS->pLruFirst = pPg->pLruNext; + } + pPg->pLruPrev = 0; + pPg->pLruNext = 0; +} + +/* +** Page pPg is not currently part of the LRU list belonging to pFS. Add it. +*/ +static void fsPageAddToLru(FileSystem *pFS, Page *pPg){ + assert( pPg->pLruNext==0 && pPg->pLruPrev==0 ); + pPg->pLruPrev = pFS->pLruLast; + if( pPg->pLruPrev ){ + pPg->pLruPrev->pLruNext = pPg; + }else{ + pFS->pLruFirst = pPg; + } + pFS->pLruLast = pPg; +} + +/* +** Page pPg is currently stored in the apHash/nHash hash table. Remove it. +*/ +static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){ + int iHash; + Page **pp; + + iHash = fsHashKey(pFS->nHash, pPg->iPg); + for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext); + *pp = pPg->pHashNext; + pPg->pHashNext = 0; +} + +/* +** Free a Page object allocated by fsPageBuffer(). 
+*/ +static void fsPageBufferFree(Page *pPg){ + pPg->pFS->nCacheAlloc--; + lsmFree(pPg->pFS->pEnv, pPg->aData); + lsmFree(pPg->pFS->pEnv, pPg); +} + + +/* +** Purge the cache of all non-mmap pages with nRef==0. +*/ +void lsmFsPurgeCache(FileSystem *pFS){ + Page *pPg; + + pPg = pFS->pLruFirst; + while( pPg ){ + Page *pNext = pPg->pLruNext; + assert( pPg->flags & PAGE_FREE ); + fsPageRemoveFromHash(pFS, pPg); + fsPageBufferFree(pPg); + pPg = pNext; + } + pFS->pLruFirst = 0; + pFS->pLruLast = 0; + + assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 ); +} + +/* +** Search the hash-table for page iPg. If an entry is round, return a pointer +** to it. Otherwise, return NULL. +** +** Either way, if argument piHash is not NULL set *piHash to the hash slot +** number that page iPg would be stored in before returning. +*/ +static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash){ + Page *p; /* Return value */ + int iHash = fsHashKey(pFS->nHash, iPg); + + if( piHash ) *piHash = iHash; + for(p=pFS->apHash[iHash]; p; p=p->pHashNext){ + if( p->iPg==iPg) break; + } + return p; +} + +/* +** Allocate and return a non-mmap Page object. If there are already +** nCacheMax such Page objects outstanding, try to recycle an existing +** Page instead. 
+*/ +static int fsPageBuffer( + FileSystem *pFS, + Page **ppOut +){ + int rc = LSM_OK; + Page *pPage = 0; + if( pFS->pLruFirst==0 || pFS->nCacheAllocnCacheMax ){ + /* Allocate a new Page object */ + pPage = lsmMallocZero(pFS->pEnv, sizeof(Page)); + if( !pPage ){ + rc = LSM_NOMEM_BKPT; + }else{ + pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize); + if( !pPage->aData ){ + lsmFree(pFS->pEnv, pPage); + rc = LSM_NOMEM_BKPT; + pPage = 0; + }else{ + pFS->nCacheAlloc++; + } + } + }else{ + /* Reuse an existing Page object */ + u8 *aData; + pPage = pFS->pLruFirst; + aData = pPage->aData; + fsPageRemoveFromLru(pFS, pPage); + fsPageRemoveFromHash(pFS, pPage); + + memset(pPage, 0, sizeof(Page)); + pPage->aData = aData; + } + + if( pPage ){ + pPage->flags = PAGE_FREE; + } + *ppOut = pPage; + return rc; +} + +/* +** Assuming *pRc is initially LSM_OK, attempt to ensure that the +** memory-mapped region is at least iSz bytes in size. If it is not already, +** iSz bytes in size, extend it and update the pointers associated with any +** outstanding Page objects. +** +** If *pRc is not LSM_OK when this function is called, it is a no-op. +** Otherwise, *pRc is set to an lsm error code if an error occurs, or +** left unmodified otherwise. +** +** This function is never called in compressed database mode. +*/ +static void fsGrowMapping( + FileSystem *pFS, /* File system object */ + i64 iSz, /* Minimum size to extend mapping to */ + int *pRc /* IN/OUT: Error code */ +){ + assert( pFS->pCompress==0 ); + assert( PAGE_HASPREV==4 ); + + if( *pRc==LSM_OK && iSz>pFS->nMap ){ + int rc; + u8 *aOld = pFS->pMap; + rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); + if( rc==LSM_OK && pFS->pMap!=aOld ){ + Page *pFix; + i64 iOff = (u8 *)pFS->pMap - aOld; + for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){ + pFix->aData += iOff; + } + lsmSortedRemap(pFS->pDb); + } + *pRc = rc; + } +} + +/* +** fsync() the database file. 
+*/ +int lsmFsSyncDb(FileSystem *pFS, int nBlock){ + return lsmEnvSync(pFS->pEnv, pFS->fdDb); +} + +/* +** If block iBlk has been redirected according to the redirections in the +** object passed as the first argument, return the destination block to +** which it is redirected. Otherwise, return a copy of iBlk. +*/ +static int fsRedirectBlock(Redirect *p, int iBlk){ + if( p ){ + int i; + for(i=0; in; i++){ + if( iBlk==p->a[i].iFrom ) return p->a[i].iTo; + } + } + assert( iBlk!=0 ); + return iBlk; +} + +/* +** If page iPg has been redirected according to the redirections in the +** object passed as the second argument, return the destination page to +** which it is redirected. Otherwise, return a copy of iPg. +*/ +Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){ + Pgno iReal = iPg; + + if( pRedir ){ + const int nPagePerBlock = ( + pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize) + ); + int iBlk = fsPageToBlock(pFS, iPg); + int i; + for(i=0; in; i++){ + int iFrom = pRedir->a[i].iFrom; + if( iFrom>iBlk ) break; + if( iFrom==iBlk ){ + int iTo = pRedir->a[i].iTo; + iReal = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock; + if( iTo==1 ){ + iReal += (fsFirstPageOnBlock(pFS, 1)-1); + } + break; + } + } + } + + assert( iReal!=0 ); + return iReal; +} + +/* Required by the circular fsBlockNext<->fsPageGet dependency. */ +static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *); + +/* +** Parameter iBlock is a database file block. This function reads the value +** stored in the blocks "next block" pointer and stores it in *piNext. +** LSM_OK is returned if everything is successful, or an LSM error code +** otherwise. 
+*/ +static int fsBlockNext( + FileSystem *pFS, /* File-system object handle */ + Segment *pSeg, /* Use this segment for block redirects */ + int iBlock, /* Read field from this block */ + int *piNext /* OUT: Next block in linked list */ +){ + int rc; + int iRead; /* Read block from here */ + + if( pSeg ){ + iRead = fsRedirectBlock(pSeg->pRedirect, iBlock); + }else{ + iRead = iBlock; + } + + assert( pFS->nMapLimit==0 || pFS->pCompress==0 ); + if( pFS->pCompress ){ + i64 iOff; /* File offset to read data from */ + u8 aNext[4]; /* 4-byte pointer read from db file */ + + iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext); + rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext)); + if( rc==LSM_OK ){ + *piNext = (int)lsmGetU32(aNext); + } + }else{ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + Page *pLast; + rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0); + if( rc==LSM_OK ){ + *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]); + lsmFsPageRelease(pLast); + } + } + + if( pSeg ){ + *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext); + } + return rc; +} + +/* +** Return the page number of the last page on the same block as page iPg. +*/ +Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){ + return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg)); +} + +/* +** Read nData bytes of data from offset iOff of the database file into +** buffer aData. If this means reading past the end of a block, follow +** the block pointer to the next block and continue reading. +** +** Offset iOff is an absolute offset - not subject to any block redirection. +** However any block pointer followed is. Use pSeg->pRedirect in this case. +** +** This function is only called in compressed database mode. 
+*/
+static int fsReadData(
+  FileSystem *pFS,                /* File-system handle */
+  Segment *pSeg,                  /* Block redirection */
+  i64 iOff,                       /* Read data from this offset */
+  u8 *aData,                      /* Buffer to read data into */
+  int nData                       /* Number of bytes to read */
+){
+  i64 iEob;                       /* End of block */
+  int nRead;                      /* Bytes available on the current block */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  /* Read as much as fits between iOff and the end of its block. */
+  iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1;
+  nRead = LSM_MIN(iEob - iOff, nData);
+
+  rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead);
+  if( rc==LSM_OK && nRead!=nData ){
+    /* Initialized so that fsBlockNext() never sees an indeterminate value
+    ** through its output pointer if the read of the block pointer fails. */
+    int iBlk = 0;
+
+    /* The remainder of the data is at the start of the next block. */
+    rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
+    if( rc==LSM_OK ){
+      i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk);
+      rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Parameter iBlock is a database file block. This function reads the value
+** stored in the blocks "previous block" pointer and stores it in *piPrev.
+** LSM_OK is returned if everything is successful, or an LSM error code
+** otherwise.
+**
+** Only the compressed-database case is implemented; the other branch
+** is assert(0) and leaves *piPrev untouched.
+*/
+static int fsBlockPrev(
+  FileSystem *pFS,                /* File-system object handle */
+  Segment *pSeg,                  /* Use this segment for block redirects */
+  int iBlock,                     /* Read field from this block */
+  int *piPrev                     /* OUT: Previous block in linked list */
+){
+  int rc = LSM_OK;                /* Return code */
+
+  assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
+  assert( iBlock>0 );
+
+  if( pFS->pCompress ){
+    /* The pointer is stored in the 4 bytes immediately before the first
+    ** page offset of the block. */
+    i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
+    u8 aPrev[4];                  /* 4-byte pointer read from db file */
+    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
+    if( rc==LSM_OK ){
+      Redirect *pRedir = (pSeg ? pSeg->pRedirect : 0);
+      *piPrev = fsRedirectBlock(pRedir, (int)lsmGetU32(aPrev));
+    }
+  }else{
+    assert( 0 );
+  }
+  return rc;
+}
+
+/*
+** Encode and decode routines for record size fields.
+**
+** A size field is 3 bytes. Each byte carries 7 bits of the size in its low
+** bits. The top bit of the first and last bytes is always set. The top bit
+** of the middle byte is clear for a free-space record and set otherwise.
+*/
+static void putRecordSize(u8 *aBuf, int nByte, int bFree){
+  aBuf[0] = (u8)(nByte >> 14) | 0x80;
+  aBuf[1] = ((u8)(nByte >> 7) & 0x7F) | (bFree ? 0x00 : 0x80);
+  aBuf[2] = (u8)nByte | 0x80;
+}
+static int getRecordSize(u8 *aBuf, int *pbFree){
+  int nByte;
+  nByte  = (aBuf[0] & 0x7F) << 14;
+  nByte += (aBuf[1] & 0x7F) << 7;
+  nByte += (aBuf[2] & 0x7F);
+  *pbFree = !(aBuf[1] & 0x80);
+  return nByte;
+}
+
+/*
+** Subtract iSub from database file offset iOff and set *piRes to the
+** result. If doing so means passing the start of a block, follow the
+** block pointer stored in the first 4 bytes of the block.
+**
+** Offset iOff is an absolute offset - not subject to any block redirection.
+** However any block pointer followed is. Use pSeg->pRedirect in this case.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+** If an error occurs the value stored in *piRes is not meaningful.
+*/
+static int fsSubtractOffset(
+  FileSystem *pFS,
+  Segment *pSeg,
+  i64 iOff,
+  int iSub,
+  i64 *piRes
+){
+  i64 iStart;                     /* First offset on the block holding iOff */
+  int iBlk = 0;
+  int rc;
+
+  assert( pFS->pCompress );
+
+  /* Fast path: the result is still on the same block. */
+  iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff));
+  if( (iOff-iSub)>=iStart ){
+    *piRes = (iOff-iSub);
+    return LSM_OK;
+  }
+
+  /* Otherwise count the remaining bytes back from the end of the
+  ** previous block. */
+  rc = fsBlockPrev(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
+  *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1);
+  return rc;
+}
+
+/*
+** Add iAdd to database file offset iOff and set *piRes to the
+** result. If doing so means passing the end of a block, follow the
+** block pointer stored in the last 4 bytes of the block.
+**
+** Offset iOff is an absolute offset - not subject to any block redirection.
+** However any block pointer followed is. Use pSeg->pRedirect in this case.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+*/
+static int fsAddOffset(
+  FileSystem *pFS,
+  Segment *pSeg,
+  i64 iOff,
+  int iAdd,
+  i64 *piRes
+){
+  i64 iEob;                       /* Last offset on the block holding iOff */
+  int iBlk = 0;                   /* Next block. 0 if it cannot be read */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  /* Fast path: the result is still on the same block. */
+  iEob = fsLastPageOnPagesBlock(pFS, iOff);
+  if( (iOff+iAdd)<=iEob ){
+    *piRes = (iOff+iAdd);
+    return LSM_OK;
+  }
+
+  /* Otherwise count the remaining bytes forward from the start of the next
+  ** block. iBlk is initialized above (as in fsSubtractOffset()) so that
+  ** this expression never reads an indeterminate value if fsBlockNext()
+  ** fails. In that case the value stored in *piRes is not meaningful. */
+  rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
+  *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
+  return rc;
+}
+
+/*
+** If it is not already allocated, allocate either the FileSystem.aOBuffer (if
+** bWrite is true) or the FileSystem.aIBuffer (if bWrite is false). Return
+** LSM_OK if successful, or LSM_NOMEM if the attempt to allocate memory fails.
+*/
+static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
+  u8 **pp;                        /* Pointer to either aIBuffer or aOBuffer */
+
+  assert( pFS->pCompress );
+
+  /* If neither buffer has been allocated, figure out how large they
+  ** should be. Store this value in FileSystem.nBuffer.  */
+  if( pFS->nBuffer==0 ){
+    assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
+    pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
+    if( pFS->nBuffer<(pFS->szSector+6) ){
+      pFS->nBuffer = pFS->szSector+6;
+    }
+  }
+
+  pp = (bWrite ? &pFS->aOBuffer : &pFS->aIBuffer);
+  if( *pp==0 ){
+    /* The allocation is never smaller than one uncompressed page. */
+    *pp = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
+    if( *pp==0 ) return LSM_NOMEM_BKPT;
+  }
+
+  return LSM_OK;
+}
+
+/*
+** This function is only called in compressed database mode. It reads and
+** uncompresses the compressed data for page pPg from the database and
+** populates the pPg->aData[] buffer and pPg->nCompress field.
+**
+** It is possible that instead of a page record, there is free space
+** at offset pPg->iPgno. In this case no data is read from the file, but
+** output variable *pnSpace is set to the total number of free bytes.
+**
+** LSM_OK is returned if successful, or an LSM error code otherwise.
+*/
+static int fsReadPagedata(
+  FileSystem *pFS,                /* File-system handle */
+  Segment *pSeg,                  /* pPg is part of this segment */
+  Page *pPg,                      /* Page to read and uncompress data for */
+  int *pnSpace                    /* OUT: Total bytes of free space */
+){
+  lsm_compress *p = pFS->pCompress;
+  i64 iOff = pPg->iPg;            /* In this mode the page number is used as a file offset */
+  u8 aSz[3];                      /* Record size field read from offset iOff */
+  int rc;
+
+  assert( p && pPg->nCompress==0 );
+
+  if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;  /* Make sure pFS->aIBuffer exists */
+
+  rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
+
+  if( rc==LSM_OK ){
+    int bFree;
+    if( aSz[0] & 0x80 ){          /* Top bit set: 3-byte field, see getRecordSize() */
+      pPg->nCompress = (int)getRecordSize(aSz, &bFree);
+    }else{                        /* Top bit clear: always treated as free space */
+      pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
+      bFree = 1;
+    }
+    if( bFree ){
+      if( pnSpace ){
+        *pnSpace = pPg->nCompress + sizeof(aSz)*2;  /* Payload plus two 3-byte size fields */
+      }else{
+        rc = LSM_CORRUPT_BKPT;    /* Caller required a page record here */
+      }
+    }else{
+      rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);  /* Step over the size field */
+      if( rc==LSM_OK ){
+        if( pPg->nCompress>pFS->nBuffer ){          /* Record would overflow aIBuffer */
+          rc = LSM_CORRUPT_BKPT;
+        }else{
+          rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
+        }
+        if( rc==LSM_OK ){
+          int n = pFS->nPagesize; /* IN: size of aData[]. OUT: presumably the uncompressed size */
+          rc = p->xUncompress(p->pCtx,
+              (char *)pPg->aData, &n,
+              (const char *)pFS->aIBuffer, pPg->nCompress
+          );
+          if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){  /* Must yield exactly one page */
+            rc = LSM_CORRUPT_BKPT;
+          }
+        }
+      }
+    }
+  }
+  return rc;
+}
+
+/*
+** Return a handle for a database page.
+**
+** If this file-system object is accessing a compressed database it may be
+** that there is no page record at database file offset iPg. Instead, there
+** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
+** to the total number of free bytes before returning.
+**
+** If no error occurs, LSM_OK is returned. Otherwise, an lsm error code.
+*/
+static int fsPageGet(
+  FileSystem *pFS,                /* File-system handle */
+  Segment *pSeg,                  /* Block redirection to use (or NULL) */
+  Pgno iPg,                       /* Page id */
+  int noContent,                  /* True to not load content from disk */
+  Page **ppPg,                    /* OUT: New page handle */
+  int *pnSpace                    /* OUT: Bytes of free space */
+){
+  Page *p;
+  int iHash;                      /* Hash bucket for iReal, set by fsPageFindInHash() */
+  int rc = LSM_OK;
+
+  /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is
+  ** not NULL, and the block containing iPg has been redirected, then iReal
+  ** is the page number after redirection. */
+  Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);
+
+  assert_lists_are_ok(pFS);
+  assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
+  assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
+  *ppPg = 0;
+
+  /* Search the hash-table for the page */
+  p = fsPageFindInHash(pFS, iReal, &iHash);
+
+  if( p ){
+    assert( p->flags & PAGE_FREE );
+    if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);   /* About to be referenced again */
+  }else{
+
+    if( fsMmapPage(pFS, iReal) ){
+      i64 iEnd = (i64)iReal * pFS->nPagesize;       /* Mapping must cover the end of page iReal */
+      fsGrowMapping(pFS, iEnd, &rc);
+      if( rc!=LSM_OK ) return rc;
+
+      if( pFS->pFree ){           /* Reuse a Page object from the free-list if there is one */
+        p = pFS->pFree;
+        pFS->pFree = p->pFreeNext;
+        assert( p->nRef==0 );
+      }else{
+        p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
+        if( rc ) return rc;
+        p->pFS = pFS;
+      }
+      p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];  /* Points into the mapping */
+      p->iPg = iReal;
+
+      /* This page now carries a pointer to the mapping. Link it in to
+      ** the FileSystem.pMapped list. */
+      assert( p->pMappedNext==0 );
+      p->pMappedNext = pFS->pMapped;
+      pFS->pMapped = p;
+
+      assert( pFS->pCompress==0 );
+      assert( (p->flags & PAGE_FREE)==0 );
+    }else{
+      rc = fsPageBuffer(pFS, &p);
+      if( rc==LSM_OK ){
+        int nSpace = 0;           /* Set by fsReadPagedata() if free space is found */
+        p->iPg = iReal;
+        p->nRef = 0;
+        p->pFS = pFS;
+        assert( p->flags==0 || p->flags==PAGE_FREE );
+
+#ifdef LSM_DEBUG
+        memset(p->aData, 0x56, pFS->nPagesize);
+#endif
+        assert( p->pLruNext==0 && p->pLruPrev==0 );
+        if( noContent==0 ){
+          if( pFS->pCompress ){
+            rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
+          }else{
+            int nByte = pFS->nPagesize;
+            i64 iOff = (i64)(iReal-1) * pFS->nPagesize;   /* Page numbers are 1-based here */
+            rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
+          }
+          pFS->nRead++;
+        }
+
+        /* If the xRead() call was successful (or not attempted), link the
+        ** page into the page-cache hash-table. Otherwise, if it failed,
+        ** free the buffer. */
+        if( rc==LSM_OK && nSpace==0 ){
+          p->pHashNext = pFS->apHash[iHash];
+          pFS->apHash[iHash] = p;
+        }else{
+          fsPageBufferFree(p);
+          p = 0;
+          if( pnSpace ) *pnSpace = nSpace;
+        }
+      }
+    }
+
+    assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
+         || (rc!=LSM_OK && p==0)
+    );
+  }
+
+  if( rc==LSM_OK && p ){
+    if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
+      p->nData = pFS->nPagesize - 4;    /* 4 bytes of the page hold a block pointer */
+      if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
+        p->aData += 4;                  /* Pointer is then reachable as aData[-4] */
+        p->flags |= PAGE_HASPREV;
+      }
+    }else{
+      p->nData = pFS->nPagesize;
+    }
+    pFS->nOut += (p->nRef==0);          /* Counts pages going from 0 to 1 references */
+    p->nRef++;
+  }
+  *ppPg = p;
+  return rc;
+}
+
+/*
+** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
+** page iMeta of the database file. If no error occurs, store the id value
+** in *piVal and return LSM_OK. Otherwise, return an LSM error code and leave
+** *piVal unmodified.
+**
+** If a checkpointer connection is currently updating meta-page iMeta, or an
+** earlier checkpointer crashed while doing so, the value read into *piVal
+** may be garbage. It is the callers responsibility to deal with this.
+*/
+int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
+  FileSystem *pFS = db->pFS;
+  int rc = LSM_OK;
+
+  assert( iMeta==1 || iMeta==2 );
+  if( pFS->nMapLimit>0 ){
+    /* Memory-mapped access: make sure the mapping covers meta page iMeta,
+    ** then read the first 8 bytes of that page directly. */
+    fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
+    if( rc==LSM_OK ){
+      *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
+    }
+  }else{
+    /* Otherwise load the meta page read-only and decode its first 8 bytes. */
+    MetaPage *pMeta = 0;
+    rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
+    if( rc==LSM_OK ){
+      *piVal = (i64)lsmGetU64(pMeta->aData);
+      lsmFsMetaPageRelease(pMeta);
+    }
+  }
+
+  return rc;
+}
+
+
+/*
+** Return true if the first or last page of segment pRun falls between iFirst
+** and iLast, inclusive, and pRun is not equal to pIgnore.
+*/
+static int fsRunEndsBetween(
+  Segment *pRun,
+  Segment *pIgnore,
+  Pgno iFirst,
+  Pgno iLast
+){
+  return (pRun!=pIgnore && (
+        (pRun->iFirst>=iFirst && pRun->iFirst<=iLast)
+     || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast)
+  ));
+}
+
+/*
+** Return true if level pLevel contains a segment other than pIgnore for
+** which the first or last page is between iFirst and iLast, inclusive.
+*/
+static int fsLevelEndsBetween(
+  Level *pLevel,
+  Segment *pIgnore,
+  Pgno iFirst,
+  Pgno iLast
+){
+  int i;
+
+  /* Check the left-hand-side segment first ... */
+  if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
+    return 1;
+  }
+  /* ... then each of the pLevel->nRight right-hand-side segments. */
+  for(i=0; i<pLevel->nRight; i++){
+    if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Block iBlk is no longer in use by segment pIgnore. If it is not in use
+** by any other segment, move it to the free block list.
+*/
+static int fsFreeBlock(
+  FileSystem *pFS,                /* File system object */
+  Snapshot *pSnapshot,            /* Worker snapshot */
+  Segment *pIgnore,               /* Ignore this run when searching */
+  int iBlk                        /* Block number of block to free */
+){
+  int rc = LSM_OK;                /* Return code */
+  Pgno iFirst;                    /* First page on block iBlk */
+  Pgno iLast;                     /* Last page on block iBlk */
+  Level *pLevel;                  /* Used to iterate through levels */
+
+  int iIn;                        /* Used to iterate through append points */
+  int iOut = 0;                   /* Used to output append points */
+  Pgno *aApp = pSnapshot->aiAppend;
+
+  /* iFirst and iLast are page numbers, so they are held as Pgno rather
+  ** than int to avoid truncating large values before the comparisons
+  ** below. */
+  iFirst = fsFirstPageOnBlock(pFS, iBlk);
+  iLast = fsLastPageOnBlock(pFS, iBlk);
+
+  /* Check if any other run in the snapshot has a start or end page
+  ** within this block. If there is such a run, return early. */
+  for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
+    if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
+      return LSM_OK;
+    }
+  }
+
+  /* Remove any entries that lie on this block from the append-list. The
+  ** surviving entries are compacted to the start of the array and the
+  ** remaining slots are zeroed. */
+  for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
+    if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
+      aApp[iOut++] = aApp[iIn];
+    }
+  }
+  while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
+
+  /* Finally hand the block back to the free block list. */
+  if( rc==LSM_OK ){
+    rc = lsmBlockFree(pFS->pDb, iBlk);
+  }
+  return rc;
+}
+
+/*
+** Delete or otherwise recycle the blocks currently occupied by run pDel.
+*/
+int lsmFsSortedDelete(
+  FileSystem *pFS,
+  Snapshot *pSnapshot,
+  int bZero,                      /* True to zero the Segment structure */
+  Segment *pDel
+){
+  if( pDel->iFirst ){
+    int rc = LSM_OK;
+
+    int iBlk;
+    int iLastBlk;
+
+    iBlk = fsPageToBlock(pFS, pDel->iFirst);
+    iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
+
+    /* Mark all blocks currently used by this sorted run as free */
+    while( iBlk && rc==LSM_OK ){
+      int iNext = 0;
+      if( iBlk!=iLastBlk ){
+        rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
+      }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
+        /* The final block is only partly used by this run and the segment
+        ** structure is being kept, so the block is left alone. */
+        break;
+      }
+      /* NOTE(review): an error from fsBlockNext() is overwritten here. The
+      ** loop still terminates because iNext remains 0, but blocks after
+      ** iBlk are then never visited and the function returns LSM_OK. */
+      rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);
+      iBlk = iNext;
+    }
+
+    if( pDel->pRedirect ){
+      assert( pDel->pRedirect==&pSnapshot->redirect );
+      pSnapshot->redirect.n = 0;
+    }
+
+    if( bZero ) memset(pDel, 0, sizeof(Segment));
+  }
+  return LSM_OK;
+}
+
+/*
+** aPgno is an array containing nPgno page numbers. Return the smallest page
+** number from the array that falls on block iBlk. Or, if none of the pages
+** in aPgno[] fall on block iBlk, return 0.
+*/
+static Pgno firstOnBlock(FileSystem *pFS, int iBlk, Pgno *aPgno, int nPgno){
+  Pgno iRet = 0;
+  int i;
+  for(i=0; i<nPgno; i++){
+    Pgno iPg = aPgno[i];
+    /* Keep iPg if it is on block iBlk and is smaller than the best so far */
+    if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){
+      iRet = iPg;
+    }
+  }
+  return iRet;
+}
+
+#ifndef NDEBUG
+/*
+** Return true if page iPg, which is a part of segment p, lies on
+** a redirected block. This is used in assert() statements only.
+*/
+static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){
+  return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg));
+}
+
+/*
+** Return true if the second argument is not NULL and any of the first
+** last or root pages lie on a redirected block.
+*/
+static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
+  return (p && (
+      fsPageRedirects(pFS, p, p->iFirst)
+   || fsPageRedirects(pFS, p, p->iRoot)
+   || fsPageRedirects(pFS, p, p->iLastPg)
+  ));
+}
+#endif
+
+/*
+** Argument aPgno is an array of nPgno page numbers. All pages belong to
+** the segment pRun. This function gobbles from the start of the run to the
+** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
+** the new first page of the run).
+*/
+void lsmFsGobble(
+  lsm_db *pDb,
+  Segment *pRun,
+  Pgno *aPgno,
+  int nPgno
+){
+  int rc = LSM_OK;                /* NOTE(review): never reported - the function returns void */
+  FileSystem *pFS = pDb->pFS;
+  Snapshot *pSnapshot = pDb->pWorker;
+  int iBlk;                       /* Block currently being examined */
+
+  assert( pRun->nSize>0 );
+  assert( 0==fsSegmentRedirects(pFS, pRun) );
+  assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
+
+  iBlk = fsPageToBlock(pFS, pRun->iFirst);
+  pRun->nSize += (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));  /* Count from the start of the block for now */
+
+  while( rc==LSM_OK ){
+    int iNext = 0;
+    Pgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
+    if( iFirst ){                 /* One of aPgno[] is on this block: it is the new first page */
+      pRun->iFirst = iFirst;
+      break;
+    }
+    rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
+    if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
+    pRun->nSize -= (
+        1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
+    );                            /* Discount every page on the block just passed */
+    iBlk = iNext;
+  }
+
+  pRun->nSize -= (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));  /* Discount pages before the new first page */
+  assert( pRun->nSize>0 );
+}
+
+/*
+** This function is only used in compressed database mode.
+**
+** Argument iPg is the page number (byte offset) of a page within segment
+** pSeg. The page record, including all headers, is nByte bytes in size.
+** Before returning, set *piNext to the page number of the next page in
+** the segment, or to zero if iPg is the last.
+**
+** In other words, do:
+**
+**     *piNext = iPg + nByte;
+**
+** But take block overflow and redirection into account.
+**
+** The offset is advanced in two steps: first to the last byte of the
+** current record (which is compared against pSeg->iLastPg), and only
+** then one byte further, to the start of the next record.
+*/
+static int fsNextPageOffset(
+  FileSystem *pFS,                /* File system object */
+  Segment *pSeg,                  /* Segment to move within */
+  Pgno iPg,                       /* Offset of current page */
+  int nByte,                      /* Size of current page including headers */
+  Pgno *piNext                    /* OUT: Offset of next page. Or zero (EOF) */
+){
+  Pgno iNext;
+  int rc;
+
+  assert( pFS->pCompress );
+
+  rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iNext);
+  if( pSeg && iNext==pSeg->iLastPg ){
+    iNext = 0;
+  }else if( rc==LSM_OK ){
+    rc = fsAddOffset(pFS, pSeg, iNext, 1, &iNext);
+  }
+
+  *piNext = iNext;
+  return rc;
+}
+
+/*
+** This function is only used in compressed database mode.
+**
+** Argument iPg is the page number of a page that appears in segment pSeg.
+** This function works out the page number of the page that precedes it
+** in the same run and stores that number in *piPrev before returning.
+**
+** It does so by reading the 3 bytes that immediately precede iPg, which
+** give the size of the preceding record, and stepping back by that size.
+**
+** LSM_OK is returned if no error occurs. Otherwise, an lsm error code.
+** If any value other than LSM_OK is returned, then the final value of
+** *piPrev is undefined.
+*/
+static int fsGetPageBefore(
+  FileSystem *pFS,
+  Segment *pSeg,
+  Pgno iPg,
+  Pgno *piPrev
+){
+  u8 aSz[3];                      /* Size field preceding iPg */
+  i64 iField;                     /* Offset that aSz[] is read from */
+  int nRecord;                    /* Total size of the preceding record */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  /* Locate and read the 3-byte size field that ends just before iPg. */
+  rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iField);
+  if( rc!=LSM_OK ) return rc;
+  rc = fsReadData(pFS, pSeg, iField, aSz, sizeof(aSz));
+  if( rc!=LSM_OK ) return rc;
+
+  if( (aSz[2] & 0x80)==0 ){
+    /* Top bit of the final byte clear: its low 7 bits are the whole size. */
+    nRecord = (int)(aSz[2] & 0x7F);
+  }else{
+    /* Otherwise decode the 3-byte field and add the two size fields that
+    ** bracket the record. The free-flag is not needed here. */
+    int bIgnored;
+    nRecord = getRecordSize(aSz, &bIgnored) + sizeof(aSz)*2;
+  }
+
+  return fsSubtractOffset(pFS, pSeg, iPg, nRecord, piPrev);
+}
+
+/*
+** The first argument to this function is a valid reference to a database
+** file page that is part of a sorted run. If parameter eDir is -1, this
+** function attempts to locate and load the previous page in the same run.
+** Or, if eDir is +1, it attempts to find the next page in the same run.
+** The results of passing an eDir value other than positive or negative one
+** are undefined.
+**
+** If parameter pRun is not NULL then it must point to the run that page
+** pPg belongs to. In this case, if pPg is the first or last page of the
+** run, and the request is for the previous or next page, respectively,
+** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
+** is assumed that the next or previous page, as requested, exists.
+**
+** If the previous/next page does exist and is successfully loaded, *ppNext
+** is set to point to it and LSM_OK is returned. Otherwise, if an error
+** occurs, *ppNext is set to NULL and an lsm error code is returned.
+**
+** Page references returned by this function should be released by the
+** caller using lsmFsPageRelease().
+*/
+int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
+  int rc = LSM_OK;
+  FileSystem *pFS = pPg->pFS;
+  Pgno iPg = pPg->iPg;
+
+  assert( 0==fsSegmentRedirects(pFS, pRun) );
+  if( pFS->pCompress ){
+    /* Size of the current record: compressed payload plus two 3-byte
+    ** size fields. */
+    int nSpace = pPg->nCompress + 2*3;
+
+    /* Step from record to record until a page record (rather than a
+    ** free-space record) is found, the end of the run is reached, or an
+    ** error occurs. */
+    do {
+      if( eDir>0 ){
+        rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
+      }else{
+        /* pRun may be NULL (see above), so guard the dereference in the
+        ** same way as the uncompressed branch below does. */
+        if( pRun && iPg==pRun->iFirst ){
+          iPg = 0;
+        }else{
+          rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
+        }
+      }
+
+      nSpace = 0;
+      if( iPg!=0 ){
+        rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
+        assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
+      }else{
+        *ppNext = 0;
+      }
+    }while( nSpace>0 && rc==LSM_OK );
+
+  }else{
+    Redirect *pRedir = pRun ? pRun->pRedirect : 0;
+    assert( eDir==1 || eDir==-1 );
+    if( eDir<0 ){
+      if( pRun && iPg==pRun->iFirst ){
+        *ppNext = 0;
+        return LSM_OK;
+      }else if( fsIsFirst(pFS, iPg) ){
+        /* The "previous block" pointer is stored in the 4 bytes just
+        ** before aData[] on the first page of a block. */
+        assert( pPg->flags & PAGE_HASPREV );
+        iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
+      }else{
+        iPg--;
+      }
+    }else{
+      if( pRun ){
+        if( iPg==pRun->iLastPg ){
+          *ppNext = 0;
+          return LSM_OK;
+        }
+      }
+
+      if( fsIsLast(pFS, iPg) ){
+        /* The "next block" pointer is stored in the final 4 bytes of the
+        ** last page of a block. */
+        int iBlk = fsRedirectBlock(
+            pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
+        );
+        iPg = fsFirstPageOnBlock(pFS, iBlk);
+      }else{
+        iPg++;
+      }
+    }
+    rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
+  }
+
+  return rc;
+}
+
+/*
+** This function is called when creating a new segment to determine if the
+** first part of it can be written following an existing segment on an
+** already allocated block. If it is possible, the page number of the first
+** page to use for the new segment is returned. Otherwise zero.
+**
+** If argument pLvl is not NULL, then this function will not attempt to
+** start the new segment immediately following any segment that is part
+** of the right-hand-side of pLvl.
+*/
+static Pgno findAppendPoint(FileSystem *pFS, Level *pLvl){
+  int i;
+  Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
+  Pgno iRet = 0;
+
+  /* Scan the append-point list from the last slot to the first, stopping
+  ** at the first usable entry. */
+  for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
+    if( (iRet = aiAppend[i]) ){
+      if( pLvl ){
+        /* Reject the candidate if a right-hand-side segment of pLvl ends
+        ** on the same block. */
+        int iBlk = fsPageToBlock(pFS, iRet);
+        int j;
+        for(j=0; iRet && j<pLvl->nRight; j++){
+          if( fsPageToBlock(pFS, pLvl->aRhs[j].iLastPg)==iBlk ){
+            iRet = 0;
+          }
+        }
+      }
+      /* An append point that is handed out is removed from the list. */
+      if( iRet ) aiAppend[i] = 0;
+    }
+  }
+  return iRet;
+}
+
+/*
+** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
+** return a pointer to it. The page is writable until either
+** lsmFsPagePersist() is called on it or the ref-count drops to zero.
+**
+** LSM_OK is returned if successful, or an lsm error code otherwise. If an
+** error occurs *ppOut is set to NULL.
+*/
+int lsmFsSortedAppend(
+  FileSystem *pFS,
+  Snapshot *pSnapshot,
+  Level *pLvl,
+  int bDefer,
+  Page **ppOut
+){
+  int rc = LSM_OK;
+  Page *pPg = 0;
+  int iApp = 0;                   /* Page number of the appended page */
+  int iNext = 0;                  /* First page of a block allocated below */
+  Segment *p = &pLvl->lhs;
+  int iPrev = p->iLastPg;         /* Last page of the segment before the append */
+
+  *ppOut = 0;
+  assert( p->pRedirect==0 );
+
+  if( pFS->pCompress || bDefer ){
+    /* In compressed database mode the page is not assigned a page number
+    ** or location in the database file at this point. This will be done
+    ** by the lsmFsPagePersist() call.  */
+    rc = fsPageBuffer(pFS, &pPg);
+    if( rc==LSM_OK ){
+      pPg->pFS = pFS;
+      pPg->pSeg = p;
+      pPg->iPg = 0;
+      pPg->flags |= PAGE_DIRTY;
+      pPg->nData = pFS->nPagesize;
+      assert( pPg->aData );
+      if( pFS->pCompress==0 ) pPg->nData -= 4;
+
+      pPg->nRef = 1;
+      pFS->nOut++;
+    }
+  }else{
+    if( iPrev==0 ){
+      /* Empty segment: try to start on an existing append point. */
+      iApp = findAppendPoint(pFS, pLvl);
+    }else if( fsIsLast(pFS, iPrev) ){
+      /* The previous page filled its block. Continue on the block that
+      ** was allocated when that page was appended. */
+      int iNextBlk;
+      rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNextBlk);
+      if( rc!=LSM_OK ) return rc;
+      iApp = fsFirstPageOnBlock(pFS, iNextBlk);
+    }else{
+      iApp = iPrev + 1;
+    }
+
+    /* If this is the first page allocated, or if the page allocated is the
+    ** last in the block, also allocate the next block here.  */
+    if( iApp==0 || fsIsLast(pFS, iApp) ){
+      int iNew;                     /* New block number */
+
+      rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
+      if( rc!=LSM_OK ) return rc;
+      if( iApp==0 ){
+        iApp = fsFirstPageOnBlock(pFS, iNew);
+      }else{
+        iNext = fsFirstPageOnBlock(pFS, iNew);
+      }
+    }
+
+    /* Grab the new page. */
+    pPg = 0;
+    rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
+    assert( rc==LSM_OK || pPg==0 );
+
+    /* If this is the first or last page of a block, fill in the pointer
+    ** value at the end of the new page. */
+    if( rc==LSM_OK ){
+      p->nSize++;
+      p->iLastPg = iApp;
+      if( p->iFirst==0 ) p->iFirst = iApp;
+      pPg->flags |= PAGE_DIRTY;
+
+      if( fsIsLast(pFS, iApp) ){
+        lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
+      }else if( fsIsFirst(pFS, iApp) ){
+        lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
+      }
+    }
+  }
+
+  *ppOut = pPg;
+  return rc;
+}
+
+/*
+** Mark the segment passed as the second argument as finished. Once a segment
+** is marked as finished it is not possible to append any further pages to
+** it.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+*/
+int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
+  int rc = LSM_OK;
+  if( p && p->iLastPg ){
+    assert( p->pRedirect==0 );
+
+    /* Check if the last page of this run happens to be the last of a block.
+    ** If it is, then an extra block has already been allocated for this run.
+    ** Shift this extra block back to the free-block list.
+    **
+    ** Otherwise, add the first free page in the last block used by the run
+    ** to the lAppend list.
+    */
+    if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
+      int i;
+      Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
+      /* Store the append point in the first unused slot. If every slot is
+      ** occupied the append point is not recorded. */
+      for(i=0; i<LSM_APPLIST_SZ; i++){
+        if( aiAppend[i]==0 ){
+          aiAppend[i] = p->iLastPg+1;
+          break;
+        }
+      }
+    }else if( pFS->pCompress==0 ){
+      Page *pLast;
+      rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
+      if( rc==LSM_OK ){
+        int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
+        lsmBlockRefree(pFS->pDb, iBlk);
+        lsmFsPageRelease(pLast);
+      }
+    }else{
+      int iBlk = 0;
+      rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
+      if( rc==LSM_OK ){
+        lsmBlockRefree(pFS->pDb, iBlk);
+      }
+    }
+  }
+  return rc;
+}
+
+/*
+** Obtain a reference to page number iPg.
+**
+** Return LSM_OK if successful, or an lsm error code if an error occurs.
+*/
+int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, Pgno iPg, Page **ppPg){
+  return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
+}
+
+/*
+** Obtain a reference to the last page in the segment passed as the
+** second argument.
+**
+** Return LSM_OK if successful, or an lsm error code if an error occurs.
+*/
+int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
+  int rc;
+  Pgno iPg = pSeg->iLastPg;
+  if( pFS->pCompress ){
+    /* In compressed mode iLastPg is the offset of the last byte of the
+    ** segment. Walk backwards from the byte after it, skipping free-space
+    ** records, until a page record is found. */
+    int nSpace;
+    iPg++;
+    do {
+      nSpace = 0;
+      rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
+      if( rc==LSM_OK ){
+        rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
+      }
+    }while( rc==LSM_OK && nSpace>0 );
+
+  }else{
+    rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
+  }
+  return rc;
+}
+
+/*
+** Return a reference to meta-page iPg. If successful, LSM_OK is returned
+** and *ppPg populated with the new page reference. The reference should
+** be released by the caller using lsmFsPageRelease().
+**
+** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error
+** code is returned.
+*/
+int lsmFsMetaPageGet(
+  FileSystem *pFS,                /* File-system connection */
+  int bWrite,                     /* True for write access, false for read */
+  int iPg,                        /* Either 1 or 2 */
+  MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
+){
+  int rc = LSM_OK;
+  MetaPage *pPg;
+  assert( iPg==1 || iPg==2 );
+
+  /* Size the allocation from the object being allocated (a MetaPage),
+  ** not from the unrelated Page type. */
+  pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(*pPg), &rc);
+
+  if( pPg ){
+    i64 iOff = (iPg-1) * pFS->nMetasize;
+    if( pFS->nMapLimit>0 ){
+      /* Memory-mapped access: aData points straight into the mapping.
+      ** Only form the pointer if the mapping is known to be in place. */
+      fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
+      if( rc==LSM_OK ){
+        pPg->aData = (u8 *)(pFS->pMap) + iOff;
+      }
+    }else{
+      /* Otherwise aData is a private buffer. It is populated from the
+      ** file for read access only. */
+      pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
+      if( rc==LSM_OK && bWrite==0 ){
+        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetasize);
+      }
+#ifndef NDEBUG
+      /* pPg->aData causes an uninitialized access via a downstream write().
+         After discussion on this list, this memory should not, for performance
+         reasons, be memset. However, tracking down "real" misuse is more
+         difficult with this "false" positive, so it is set when NDEBUG is
+         not defined.
+      */
+      else if( rc==LSM_OK ){
+        memset( pPg->aData, 0x77, pFS->nMetasize );
+      }
+#endif
+    }
+
+    if( rc!=LSM_OK ){
+      /* aData is only owned by the MetaPage when it is not mapped. */
+      if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
+      lsmFree(pFS->pEnv, pPg);
+      pPg = 0;
+    }else{
+      pPg->iPg = iPg;
+      pPg->bWrite = bWrite;
+      pPg->pFS = pFS;
+    }
+  }
+
+  *ppPg = pPg;
+  return rc;
+}
+
+/*
+** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
+**
+** If the page was opened for writing and the database is not memory-mapped,
+** the buffer is written back to the file first. The result of that write
+** is the return value. Otherwise LSM_OK is returned. pPg may be NULL.
+*/
+int lsmFsMetaPageRelease(MetaPage *pPg){
+  int rc = LSM_OK;
+  if( pPg ){
+    FileSystem *pFS = pPg->pFS;
+
+    if( pFS->nMapLimit==0 ){
+      if( pPg->bWrite ){
+        i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
+        int nWrite = pFS->nMetasize;
+        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
+      }
+      lsmFree(pFS->pEnv, pPg->aData);
+    }
+
+    lsmFree(pFS->pEnv, pPg);
+  }
+  return rc;
+}
+
+/*
+** Return a pointer to a buffer containing the data associated with the
+** meta-page passed as the first argument.
If parameter pnData is not NULL,
+** set *pnData to the size of the meta-page in bytes before returning.
+*/
+u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
+  if( pnData ) *pnData = pPg->pFS->nMetasize;
+  return pPg->aData;
+}
+
+/*
+** Return true if page is currently writable.  This is used in assert()
+** statements only.
+*/
+#ifndef NDEBUG
+int lsmFsPageWritable(Page *pPg){
+  return (pPg->flags & PAGE_DIRTY) ? 1 : 0;
+}
+#endif
+
+/*
+** This is called when block iFrom is being redirected to iTo. If page
+** number (*piPg) lies on block iFrom, then calculate the equivalent
+** page on block iTo and set *piPg to this value before returning.
+*/
+static void fsMovePage(
+  FileSystem *pFS,                /* File system object */
+  int iTo,                        /* Destination block */
+  int iFrom,                      /* Source block */
+  Pgno *piPg                      /* IN/OUT: Page number */
+){
+  Pgno iPg = *piPg;
+  if( iFrom==fsPageToBlock(pFS, iPg) ){
+    /* Number of page numbers spanned by one block. In compressed mode this
+    ** is the block size in bytes, otherwise blocksize/pagesize. */
+    const int nPagePerBlock = (
+        pFS->pCompress ? pFS ->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
+    );
+    *piPg = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
+  }
+}
+
+/*
+** Copy the contents of block iFrom to block iTo.
+**
+** It is safe to assume that there are no outstanding references to pages
+** on block iTo. And that block iFrom is not currently being written. In
+** other words, the data can be read and written directly.
+**
+** After the copy, the worker snapshot's append points and the first, last
+** and root page numbers of pSeg are updated to refer to block iTo.
+*/
+int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
+  Snapshot *p = pFS->pDb->pWorker;
+  int rc = LSM_OK;
+  int i;
+  i64 nMap;
+
+  i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
+  i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
+
+  assert( iTo!=1 );
+  assert( iFrom>iTo );
+
+  /* Grow the mapping as required. */
+  nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
+  fsGrowMapping(pFS, nMap, &rc);
+
+  if( rc==LSM_OK ){
+    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+    int nSz = pFS->nPagesize;
+    u8 *aBuf = 0;                 /* Heap buffer, allocated on first use */
+    u8 *aData = 0;                /* Content of the page being copied */
+
+    for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
+      i64 iOff = iFromOff + i*nSz;
+
+      /* Set aData to point to a buffer containing the from page */
+      if( (iOff+nSz)<=pFS->nMapLimit ){
+        u8 *aMap = (u8 *)(pFS->pMap);
+        aData = &aMap[iOff];
+      }else{
+        if( aBuf==0 ){
+          aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
+          if( aBuf==0 ) break;
+        }
+        aData = aBuf;
+        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
+      }
+
+      /* Copy aData to the to page */
+      if( rc==LSM_OK ){
+        iOff = iToOff + i*nSz;
+        if( (iOff+nSz)<=pFS->nMapLimit ){
+          u8 *aMap = (u8 *)(pFS->pMap);
+          memcpy(&aMap[iOff], aData, nSz);
+        }else{
+          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
+        }
+      }
+    }
+    lsmFree(pFS->pEnv, aBuf);
+    lsmFsPurgeCache(pFS);
+  }
+
+  /* Update append-point list if necessary */
+  for(i=0; i<LSM_APPLIST_SZ; i++){
+    fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
+  }
+
+  /* Update the Segment structure itself */
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
+
+  return rc;
+}
+
+/*
+** Append raw data to a segment. Return the database file offset that the
+** data is written to (this may be used as the page number if the data
+** being appended is a new page record).
+**
+** This function is only used in compressed database mode.
+**
+** *pRc is an IN/OUT error code. If it is not LSM_OK on entry this function
+** is a no-op that returns 0. If an error occurs here it is stored in *pRc.
+*/
+static Pgno fsAppendData(
+  FileSystem *pFS,                /* File-system handle */
+  Segment *pSeg,                  /* Segment to append to */
+  const u8 *aData,                /* Buffer containing data to write */
+  int nData,                      /* Size of buffer aData[] in bytes */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  Pgno iRet = 0;
+  int rc = *pRc;
+  assert( pFS->pCompress );
+  if( rc==LSM_OK ){
+    int nRem = 0;                 /* Bytes left over for the next block */
+    int nWrite = 0;               /* Bytes written to the current block */
+    Pgno iLastOnBlock;
+    Pgno iApp = pSeg->iLastPg+1;
+
+    /* If this is the first data written into the segment, find an append-point
+    ** or allocate a new block.  */
+    if( iApp==1 ){
+      pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
+      if( iApp==0 ){
+        int iBlk = 0;
+        rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
+        if( rc!=LSM_OK ){
+          /* No block could be allocated. Report the error without deriving
+          ** offsets from a block number that was never set. pSeg->iFirst is
+          ** 0 at this point and pSeg->iLastPg has not been modified. */
+          *pRc = rc;
+          return 0;
+        }
+        pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
+      }
+    }
+    iRet = iApp;
+
+    /* Write as much data as is possible at iApp (usually all of it). */
+    iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
+    if( rc==LSM_OK ){
+      int nSpace = iLastOnBlock - iApp + 1;
+      nWrite = LSM_MIN(nData, nSpace);
+      nRem = nData - nWrite;
+      assert( nWrite>=0 );
+      if( nWrite!=0 ){
+        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
+      }
+      iApp += nWrite;
+    }
+
+    /* If required, allocate a new block and write the rest of the data
+    ** into it. Set the next and previous block pointers to link the new
+    ** block to the old.  */
+    assert( nRem<=0 || (iApp-1)==iLastOnBlock );
+    if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
+      u8 aPtr[4];                 /* Space to serialize a u32 */
+      int iBlk = 0;               /* New block number */
+
+      if( nWrite>0 ){
+        /* Allocate a new block. */
+        rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
+
+        /* Set the "next" pointer on the old block */
+        if( rc==LSM_OK ){
+          assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
+          lsmPutU32(aPtr, iBlk);
+          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
+        }
+
+        /* Set the "prev" pointer on the new block */
+        if( rc==LSM_OK ){
+          Pgno iWrite;
+          lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
+          iWrite = fsFirstPageOnBlock(pFS, iBlk);
+          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
+          if( nRem>0 ) iApp = iWrite;
+        }
+      }else{
+        /* The next block is already allocated. */
+        assert( nRem>0 );
+        assert( pSeg->pRedirect==0 );
+        rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
+        /* Only move to the next block if its number was actually read. */
+        if( rc==LSM_OK ){
+          iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
+        }
+      }
+
+      /* Write the remaining data into the new block */
+      if( rc==LSM_OK && nRem>0 ){
+        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
+        iApp += nRem;
+      }
+    }
+
+    pSeg->iLastPg = iApp-1;
+    *pRc = rc;
+  }
+
+  return iRet;
+}
+
+/*
+** This function is only called in compressed database mode. It
+** compresses the contents of page pPg and writes the result to the
+** buffer at pFS->aOBuffer. The size of the compressed data is stored in
+** pPg->nCompress.
+**
+** If buffer pFS->aOBuffer[] has not been allocated then this function
+** allocates it. If this fails, LSM_NOMEM is returned. Otherwise, the value
+** returned by the xCompress() method is returned.
+*/
+static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
+  lsm_compress *p = pFS->pCompress;
+
+  if( fsAllocateBuffer(pFS, 1) ) return LSM_NOMEM;
+  assert( pPg->nData==pFS->nPagesize );
+
+  /* nCompress is an IN/OUT parameter of xCompress(): on entry it holds
+  ** the capacity of the output buffer. */
+  pPg->nCompress = pFS->nBuffer;
+  return p->xCompress(p->pCtx,
+      (char *)pFS->aOBuffer, &pPg->nCompress,
+      (const char *)pPg->aData, pPg->nData
+  );
+}
+
+/*
+** Append a new page to segment pSeg. Set output variable *piNew to the
+** page number of the new page before returning.
+**
+** If the new page is the last on its block, then the 'next' block that
+** will be used by the segment is allocated here too. In this case output
+** variable *piNext is set to the block number of the next block.
+**
+** If the new page is the first on its block but not the first in the
+** entire segment, set output variable *piPrev to the block number of
+** the previous block in the segment.
+**
+** LSM_OK is returned if successful, or an lsm error code otherwise. If
+** any value other than LSM_OK is returned, then the final value of all
+** output variables is undefined.
+*/ +static int fsAppendPage( + FileSystem *pFS, + Segment *pSeg, + Pgno *piNew, + int *piPrev, + int *piNext +){ + Pgno iPrev = pSeg->iLastPg; + int rc; + assert( iPrev!=0 ); + + *piPrev = 0; + *piNext = 0; + + if( fsIsLast(pFS, iPrev) ){ + /* Grab the first page on the next block (which has already be + ** allocated). In this case set *piPrev to tell the caller to set + ** the "previous block" pointer in the first 4 bytes of the page. + */ + int iNext; + int iBlk = fsPageToBlock(pFS, iPrev); + assert( pSeg->pRedirect==0 ); + rc = fsBlockNext(pFS, 0, iBlk, &iNext); + if( rc!=LSM_OK ) return rc; + *piNew = fsFirstPageOnBlock(pFS, iNext); + *piPrev = iBlk; + }else{ + *piNew = iPrev+1; + if( fsIsLast(pFS, *piNew) ){ + /* Allocate the next block here. */ + int iBlk; + rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk); + if( rc!=LSM_OK ) return rc; + *piNext = iBlk; + } + } + + pSeg->nSize++; + pSeg->iLastPg = *piNew; + return LSM_OK; +} + +/* +** Flush all pages in the FileSystem.pWaiting list to disk. +*/ +void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){ + int rc = *pRc; + Page *pPg; + + pPg = pFS->pWaiting; + pFS->pWaiting = 0; + + while( pPg ){ + Page *pNext = pPg->pWaitingNext; + if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg); + assert( pPg->nRef==1 ); + lsmFsPageRelease(pPg); + pPg = pNext; + } + *pRc = rc; +} + +/* +** If there exists a hash-table entry associated with page iPg, remove it. +*/ +static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){ + Page *p; + int iHash = fsHashKey(pFS->nHash, iPg); + + for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext); + + if( p ){ + assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 ); + fsPageRemoveFromHash(pFS, p); + p->iPg = 0; + iHash = fsHashKey(pFS->nHash, 0); + p->pHashNext = pFS->apHash[iHash]; + pFS->apHash[iHash] = p; + } +} + +/* +** If the page passed as an argument is dirty, update the database file +** (or mapping of the database file) with its current contents and mark +** the page as clean. 
+** +** Return LSM_OK if the operation is a success, or an LSM error code +** otherwise. +*/ +int lsmFsPagePersist(Page *pPg){ + int rc = LSM_OK; + if( pPg && (pPg->flags & PAGE_DIRTY) ){ + FileSystem *pFS = pPg->pFS; + + if( pFS->pCompress ){ + int iHash; /* Hash key of assigned page number */ + u8 aSz[3]; /* pPg->nCompress as a 24-bit big-endian */ + assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 ); + + /* Compress the page image. */ + rc = fsCompressIntoBuffer(pFS, pPg); + + /* Serialize the compressed size into buffer aSz[] */ + putRecordSize(aSz, pPg->nCompress, 0); + + /* Write the serialized page record into the database file. */ + pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc); + fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc); + fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc); + + /* Now that it has a page number, insert the page into the hash table */ + iHash = fsHashKey(pFS->nHash, pPg->iPg); + pPg->pHashNext = pFS->apHash[iHash]; + pFS->apHash[iHash] = pPg; + + pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress; + + pPg->flags &= ~PAGE_DIRTY; + pFS->nWrite++; + }else{ + + if( pPg->iPg==0 ){ + /* No page number has been assigned yet. This occurs with pages used + ** in the b-tree hierarchy. They were not assigned page numbers when + ** they were created as doing so would cause this call to + ** lsmFsPagePersist() to write an out-of-order page. Instead a page + ** number is assigned here so that the page data will be appended + ** to the current segment. 
+ */ + Page **pp; + int iPrev = 0; + int iNext = 0; + int iHash; + + assert( pPg->pSeg->iFirst ); + assert( pPg->flags & PAGE_FREE ); + assert( (pPg->flags & PAGE_HASPREV)==0 ); + assert( pPg->nData==pFS->nPagesize-4 ); + + rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext); + if( rc!=LSM_OK ) return rc; + + assert( pPg->flags & PAGE_FREE ); + iHash = fsHashKey(pFS->nHash, pPg->iPg); + fsRemoveHashEntry(pFS, pPg->iPg); + pPg->pHashNext = pFS->apHash[iHash]; + pFS->apHash[iHash] = pPg; + assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg ); + + if( iPrev ){ + assert( iNext==0 ); + memmove(&pPg->aData[4], pPg->aData, pPg->nData); + lsmPutU32(pPg->aData, iPrev); + pPg->flags |= PAGE_HASPREV; + pPg->aData += 4; + }else if( iNext ){ + assert( iPrev==0 ); + lsmPutU32(&pPg->aData[pPg->nData], iNext); + }else{ + int nData = pPg->nData; + pPg->nData += 4; + lsmSortedExpandBtreePage(pPg, nData); + } + + pPg->nRef++; + for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext); + *pp = pPg; + assert( pPg->pWaitingNext==0 ); + + }else{ + i64 iOff; /* Offset to write within database file */ + + iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1); + if( fsMmapPage(pFS, pPg->iPg)==0 ){ + u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV); + rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize); + }else if( pPg->flags & PAGE_FREE ){ + fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc); + if( rc==LSM_OK ){ + u8 *aTo = &((u8 *)(pFS->pMap))[iOff]; + u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV); + memcpy(aTo, aFrom, pFS->nPagesize); + lsmFree(pFS->pEnv, aFrom); + pFS->nCacheAlloc--; + pPg->aData = aTo + (pPg->flags & PAGE_HASPREV); + pPg->flags &= ~PAGE_FREE; + fsPageRemoveFromHash(pFS, pPg); + pPg->pMappedNext = pFS->pMapped; + pFS->pMapped = pPg; + } + } + + lsmFsFlushWaiting(pFS, &rc); + pPg->flags &= ~PAGE_DIRTY; + pFS->nWrite++; + } + } + } + + return rc; +} + +/* +** For non-compressed databases, this function is a no-op. 
For compressed +** databases, it adds a padding record to the segment passed as the third +** argument. +** +** The size of the padding records is selected so that the last byte +** written is the last byte of a disk sector. This means that if a +** snapshot is taken and checkpointed, subsequent worker processes will +** not write to any sector that contains checkpointed data. +*/ +int lsmFsSortedPadding( + FileSystem *pFS, + Snapshot *pSnapshot, + Segment *pSeg +){ + int rc = LSM_OK; + if( pFS->pCompress ){ + Pgno iLast2; + Pgno iLast = pSeg->iLastPg; /* Current last page of segment */ + int nPad; /* Bytes of padding required */ + u8 aSz[3]; + + iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1; + assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) ); + nPad = iLast2 - iLast; + + if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){ + nPad -= 4; + } + assert( nPad>=0 ); + + if( nPad>=6 ){ + pSeg->nSize += nPad; + nPad -= 6; + putRecordSize(aSz, nPad, 1); + fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc); + memset(pFS->aOBuffer, 0, nPad); + fsAppendData(pFS, pSeg, pFS->aOBuffer, nPad, &rc); + fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc); + }else if( nPad>0 ){ + u8 aBuf[5] = {0,0,0,0,0}; + aBuf[0] = (u8)nPad; + aBuf[nPad-1] = (u8)nPad; + fsAppendData(pFS, pSeg, aBuf, nPad, &rc); + } + + assert( rc!=LSM_OK + || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg) + || ((pSeg->iLastPg + 1) % pFS->szSector)==0 + ); + } + + return rc; +} + + +/* +** Increment the reference count on the page object passed as the first +** argument. +*/ +void lsmFsPageRef(Page *pPg){ + if( pPg ){ + pPg->nRef++; + } +} + +/* +** Release a page-reference obtained using fsPageGet(). 
+*/ +int lsmFsPageRelease(Page *pPg){ + int rc = LSM_OK; + if( pPg ){ + assert( pPg->nRef>0 ); + pPg->nRef--; + if( pPg->nRef==0 ){ + FileSystem *pFS = pPg->pFS; + rc = lsmFsPagePersist(pPg); + pFS->nOut--; + + assert( pPg->pFS->pCompress + || fsIsFirst(pPg->pFS, pPg->iPg)==0 + || (pPg->flags & PAGE_HASPREV) + ); + pPg->aData -= (pPg->flags & PAGE_HASPREV); + pPg->flags &= ~PAGE_HASPREV; + + if( (pPg->flags & PAGE_FREE)==0 ){ + /* Removed from mapped list */ + Page **pp; + for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext); + *pp = pPg->pMappedNext; + pPg->pMappedNext = 0; + + /* Add to free list */ + pPg->pFreeNext = pFS->pFree; + pFS->pFree = pPg; + }else{ + fsPageAddToLru(pFS, pPg); + } + } + } + + return rc; +} + +/* +** Return the total number of pages read from the database file. +*/ +int lsmFsNRead(FileSystem *pFS){ return pFS->nRead; } + +/* +** Return the total number of pages written to the database file. +*/ +int lsmFsNWrite(FileSystem *pFS){ return pFS->nWrite; } + +/* +** Return a copy of the environment pointer used by the file-system object. +*/ +lsm_env *lsmFsEnv(FileSystem *pFS){ + return pFS->pEnv; +} + +/* +** Return a copy of the environment pointer used by the file-system object +** to which this page belongs. +*/ +lsm_env *lsmPageEnv(Page *pPg) { + return pPg->pFS->pEnv; +} + +/* +** Return a pointer to the file-system object associated with the Page +** passed as the only argument. +*/ +FileSystem *lsmPageFS(Page *pPg){ + return pPg->pFS; +} + +/* +** Return the sector-size as reported by the log file handle. +*/ +int lsmFsSectorSize(FileSystem *pFS){ + return pFS->szSector; +} + +/* +** Helper function for lsmInfoArrayStructure(). +*/ +static Segment *startsWith(Segment *pRun, Pgno iFirst){ + return (iFirst==pRun->iFirst) ? pRun : 0; +} + +/* +** Return the segment that starts with page iFirst, if any. If no such segment +** can be found, return NULL. 
+*/ +static Segment *findSegment(Snapshot *pWorker, Pgno iFirst){ + Level *pLvl; /* Used to iterate through db levels */ + Segment *pSeg = 0; /* Pointer to segment to return */ + + for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){ + if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){ + int i; + for(i=0; inRight; i++){ + if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break; + } + } + } + + return pSeg; +} + +/* +** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request. +** If successful, *pzOut is set to point to a nul-terminated string +** containing the array structure and LSM_OK is returned. The caller should +** eventually free the string using lsmFree(). +** +** If an error occurs, *pzOut is set to NULL and an LSM error code returned. +*/ +int lsmInfoArrayStructure( + lsm_db *pDb, + int bBlock, /* True for block numbers only */ + Pgno iFirst, + char **pzOut +){ + int rc = LSM_OK; + Snapshot *pWorker; /* Worker snapshot */ + Segment *pArray = 0; /* Array to report on */ + int bUnlock = 0; + + *pzOut = 0; + if( iFirst==0 ) return LSM_ERROR; + + /* Obtain the worker snapshot */ + pWorker = pDb->pWorker; + if( !pWorker ){ + rc = lsmBeginWork(pDb); + if( rc!=LSM_OK ) return rc; + pWorker = pDb->pWorker; + bUnlock = 1; + } + + /* Search for the array that starts on page iFirst */ + pArray = findSegment(pWorker, iFirst); + + if( pArray==0 ){ + /* Could not find the requested array. This is an error. 
*/ + rc = LSM_ERROR; + }else{ + FileSystem *pFS = pDb->pFS; + LsmString str; + int iBlk; + int iLastBlk; + + iBlk = fsPageToBlock(pFS, pArray->iFirst); + iLastBlk = fsPageToBlock(pFS, pArray->iLastPg); + + lsmStringInit(&str, pDb->pEnv); + if( bBlock ){ + lsmStringAppendf(&str, "%d", iBlk); + while( iBlk!=iLastBlk ){ + fsBlockNext(pFS, pArray, iBlk, &iBlk); + lsmStringAppendf(&str, " %d", iBlk); + } + }else{ + lsmStringAppendf(&str, "%d", pArray->iFirst); + while( iBlk!=iLastBlk ){ + lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk)); + fsBlockNext(pFS, pArray, iBlk, &iBlk); + lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk)); + } + lsmStringAppendf(&str, " %d", pArray->iLastPg); + } + + *pzOut = str.z; + } + + if( bUnlock ){ + int rcwork = LSM_BUSY; + lsmFinishWork(pDb, 0, &rcwork); + } + return rc; +} + +int lsmFsSegmentContainsPg( + FileSystem *pFS, + Segment *pSeg, + Pgno iPg, + int *pbRes +){ + Redirect *pRedir = pSeg->pRedirect; + int rc = LSM_OK; + int iBlk; + int iLastBlk; + int iPgBlock; /* Block containing page iPg */ + + iPgBlock = fsPageToBlock(pFS, pSeg->iFirst); + iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst)); + iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg)); + + while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){ + rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk); + } + + *pbRes = (iBlk==iPgBlock); + return rc; +} + +/* +** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request. +** If successful, *pzOut is set to point to a nul-terminated string +** containing the array structure and LSM_OK is returned. The caller should +** eventually free the string using lsmFree(). +** +** If an error occurs, *pzOut is set to NULL and an LSM error code returned. 
+*/ +int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut){ + int rc = LSM_OK; + Snapshot *pWorker; /* Worker snapshot */ + Segment *pSeg = 0; /* Array to report on */ + int bUnlock = 0; + + *pzOut = 0; + if( iFirst==0 ) return LSM_ERROR; + + /* Obtain the worker snapshot */ + pWorker = pDb->pWorker; + if( !pWorker ){ + rc = lsmBeginWork(pDb); + if( rc!=LSM_OK ) return rc; + pWorker = pDb->pWorker; + bUnlock = 1; + } + + /* Search for the array that starts on page iFirst */ + pSeg = findSegment(pWorker, iFirst); + + if( pSeg==0 ){ + /* Could not find the requested array. This is an error. */ + rc = LSM_ERROR; + }else{ + Page *pPg = 0; + FileSystem *pFS = pDb->pFS; + LsmString str; + + lsmStringInit(&str, pDb->pEnv); + rc = lsmFsDbPageGet(pFS, pSeg, iFirst, &pPg); + while( rc==LSM_OK && pPg ){ + Page *pNext = 0; + lsmStringAppendf(&str, " %lld", lsmFsPageNumber(pPg)); + rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext); + lsmFsPageRelease(pPg); + pPg = pNext; + } + + if( rc!=LSM_OK ){ + lsmFree(pDb->pEnv, str.z); + }else{ + *pzOut = str.z; + } + } + + if( bUnlock ){ + int rcwork = LSM_BUSY; + lsmFinishWork(pDb, 0, &rcwork); + } + return rc; +} + +/* +** The following macros are used by the integrity-check code. Associated with +** each block in the database is an 8-bit bit mask (the entry in the aUsed[] +** array). As the integrity-check meanders through the database, it sets the +** following bits to indicate how each block is used. +** +** INTEGRITY_CHECK_FIRST_PG: +** First page of block is in use by sorted run. +** +** INTEGRITY_CHECK_LAST_PG: +** Last page of block is in use by sorted run. +** +** INTEGRITY_CHECK_USED: +** At least one page of the block is in use by a sorted run. +** +** INTEGRITY_CHECK_FREE: +** The free block list contains an entry corresponding to this block. 
+*/ +#define INTEGRITY_CHECK_FIRST_PG 0x01 +#define INTEGRITY_CHECK_LAST_PG 0x02 +#define INTEGRITY_CHECK_USED 0x04 +#define INTEGRITY_CHECK_FREE 0x08 + +/* +** Helper function for lsmFsIntegrityCheck() +*/ +static void checkBlocks( + FileSystem *pFS, + Segment *pSeg, + int bExtra, /* If true, count the "next" block if any */ + int nUsed, + u8 *aUsed +){ + if( pSeg ){ + if( pSeg && pSeg->nSize>0 ){ + int rc; + int iBlk; /* Current block (during iteration) */ + int iLastBlk; /* Last block of segment */ + int iFirstBlk; /* First block of segment */ + int bLastIsLastOnBlock; /* True iLast is the last on its block */ + + assert( 0==fsSegmentRedirects(pFS, pSeg) ); + iBlk = iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst); + iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg); + + bLastIsLastOnBlock = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg); + assert( iBlk>0 ); + + do { + /* iBlk is a part of this sorted run. */ + aUsed[iBlk-1] |= INTEGRITY_CHECK_USED; + + /* If the first page of this block is also part of the segment, + ** set the flag to indicate that the first page of iBlk is in use. + */ + if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){ + assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_FIRST_PG)==0 ); + aUsed[iBlk-1] |= INTEGRITY_CHECK_FIRST_PG; + } + + /* Unless the sorted run finishes before the last page on this block, + ** the last page of this block is also in use. */ + if( iBlk!=iLastBlk || bLastIsLastOnBlock ){ + assert( (aUsed[iBlk-1] & INTEGRITY_CHECK_LAST_PG)==0 ); + aUsed[iBlk-1] |= INTEGRITY_CHECK_LAST_PG; + } + + /* Special case. The sorted run being scanned is the output run of + ** a level currently undergoing an incremental merge. The sorted + ** run ends on the last page of iBlk, but the next block has already + ** been allocated. So mark it as in use as well. 
*/ + if( iBlk==iLastBlk && bLastIsLastOnBlock && bExtra ){ + int iExtra = 0; + rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra); + assert( rc==LSM_OK ); + + assert( aUsed[iExtra-1]==0 ); + aUsed[iExtra-1] |= INTEGRITY_CHECK_USED; + aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG; + aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG; + } + + /* Move on to the next block in the sorted run. Or set iBlk to zero + ** in order to break out of the loop if this was the last block in + ** the run. */ + if( iBlk==iLastBlk ){ + iBlk = 0; + }else{ + rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk); + assert( rc==LSM_OK ); + } + }while( iBlk ); + } + } +} + +typedef struct CheckFreelistCtx CheckFreelistCtx; +struct CheckFreelistCtx { + u8 *aUsed; + int nBlock; +}; +static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){ + CheckFreelistCtx *p = (CheckFreelistCtx *)pCtx; + + assert( iBlk>=1 ); + assert( iBlk<=p->nBlock ); + assert( p->aUsed[iBlk-1]==0 ); + p->aUsed[iBlk-1] = INTEGRITY_CHECK_FREE; + return 0; +} + +/* +** This function checks that all blocks in the database file are accounted +** for. For each block, exactly one of the following must be true: +** +** + the block is part of a sorted run, or +** + the block is on the free-block list +** +** This function also checks that there are no references to blocks with +** out-of-range block numbers. +** +** If no errors are found, non-zero is returned. If an error is found, an +** assert() fails. +*/ +int lsmFsIntegrityCheck(lsm_db *pDb){ + CheckFreelistCtx ctx; + FileSystem *pFS = pDb->pFS; + int i; + int rc; + Freelist freelist = {0, 0, 0}; + u8 *aUsed; + Level *pLevel; + Snapshot *pWorker = pDb->pWorker; + int nBlock = pWorker->nBlock; + +#if 0 + static int nCall = 0; + nCall++; + printf("%d calls\n", nCall); +#endif + + aUsed = lsmMallocZero(pDb->pEnv, nBlock); + if( aUsed==0 ){ + /* Malloc has failed. Since this function is only called within debug + ** builds, this probably means the user is running an OOM injection test. 
+ ** Regardless, it will not be possible to run the integrity-check at this + ** time, so assume the database is Ok and return non-zero. */ + return 1; + } + + for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){ + int i; + checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed); + for(i=0; inRight; i++){ + checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed); + } + } + + /* Mark all blocks in the free-list as used */ + ctx.aUsed = aUsed; + ctx.nBlock = nBlock; + rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx); + + if( rc==LSM_OK ){ + for(i=0; ipEnv, aUsed); + lsmFree(pDb->pEnv, freelist.aEntry); + + return 1; +} + +#ifndef NDEBUG +/* +** Return true if pPg happens to be the last page in segment pSeg. Or false +** otherwise. This function is only invoked as part of assert() conditions. +*/ +int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){ + if( pPg->pFS->pCompress ){ + Pgno iNext = 0; + int rc; + rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext); + return (rc!=LSM_OK || iNext==0); + } + return (pPg->iPg==pSeg->iLastPg); +} +#endif diff --git a/ext/lsm1/lsm_log.c b/ext/lsm1/lsm_log.c new file mode 100644 index 0000000000..a61d72ada3 --- /dev/null +++ b/ext/lsm1/lsm_log.c @@ -0,0 +1,1134 @@ +/* +** 2011-08-13 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** This file contains the implementation of LSM database logging. Logging +** has one purpose in LSM - to make transactions durable. +** +** When data is written to an LSM database, it is initially stored in an +** in-memory tree structure. Since this structure is in volatile memory, +** if a power failure or application crash occurs it may be lost. 
To +** prevent loss of data in this case, each time a record is written to the +** in-memory tree an equivalent record is appended to the log on disk. +** If a power failure or application crash does occur, data can be recovered +** by reading the log. +** +** A log file consists of the following types of records representing data +** written into the database: +** +** LOG_WRITE: A key-value pair written to the database. +** LOG_DELETE: A delete key issued to the database. +** LOG_COMMIT: A transaction commit. +** +** And the following types of records for ancillary purposes.. +** +** LOG_EOF: A record indicating the end of a log file. +** LOG_PAD1: A single byte padding record. +** LOG_PAD2: An N byte padding record (N>1). +** LOG_JUMP: A pointer to another offset within the log file. +** +** Each transaction written to the log contains one or more LOG_WRITE and/or +** LOG_DELETE records, followed by a LOG_COMMIT record. The LOG_COMMIT record +** contains an 8-byte checksum based on all previous data written to the +** log file. +** +** LOG CHECKSUMS & RECOVERY +** +** Checksums are found in two types of log records: LOG_COMMIT and +** LOG_CKSUM records. In order to recover content from a log, a client +** reads each record from the start of the log, calculating a checksum as +** it does. Each time a LOG_COMMIT or LOG_CKSUM is encountered, the +** recovery process verifies that the checksum stored in the log +** matches the calculated checksum. If it does not, the recovery process +** can stop reading the log. +** +** If a recovery process reads records (other than COMMIT or CKSUM) +** consisting of at least LSM_CKSUM_MAXDATA bytes, then the next record in +** the log must be either a LOG_CKSUM or LOG_COMMIT record. If it is +** not, the recovery process also stops reading the log. +** +** To recover the log file, it must be read twice. The first time to +** determine the location of the last valid commit record. 
And the second
**   time to load data into the in-memory tree.
**
**   Todo: Surely there is a better way...
**
** LOG WRAPPING
**
**   If the log file were never deleted or wrapped, it would be possible to
**   read it from start to end each time recovery is required (i.e. each time
**   the number of database clients changes from 0 to 1). Effectively reading
**   the entire history of the database each time. This would quickly become
**   inefficient. Additionally, since the log file would grow without bound,
**   it wastes storage space.
**
**   Instead, part of each checkpoint written into the database file contains
**   a log offset (and other information required to read the log starting
**   at this offset) at which to begin recovery. Offset $O.
**
**   Once a checkpoint has been written and synced into the database file, it
**   is guaranteed that no recovery process will need to read any data before
**   offset $O of the log file. It is therefore safe to begin overwriting
**   any data that occurs before offset $O.
**
**   This implementation separates the log into three regions mapped into
**   the log file - regions 0, 1 and 2. During recovery, regions are read
**   in ascending order (i.e. 0, then 1, then 2). Each region is zero or
**   more bytes in size.
**
**     |---1---|..|--0--|.|--2--|....
**
**   New records are always appended to the end of region 2.
**
**   Initially (when it is empty), all three regions are zero bytes in size.
**   Each of them is located at the beginning of the file. As records are
**   added to the log, region 2 grows, so that the log consists of a zero
**   byte region 1, followed by a zero byte region 0, followed by an N byte
**   region 2. After one or more checkpoints have been written to disk,
**   the start point of region 2 is moved to $O. For example:
**
**     A) ||.........|--2--|....
**
**   (both regions 0 and 1 are 0 bytes in size at offset 0).
**
**   Eventually, the log wraps around to write new records into the start.
**   At this point, region 2 is renamed to region 0. Region 0 is renamed
**   to region 2. After appending a few records to the new region 2, the
**   log file looks like this:
**
**     B) ||--2--|...|--0--|....
**
**   (region 1 is still 0 bytes in size, located at offset 0).
**
**   Any checkpoints made at this point may reduce the size of region 0.
**   However, if they do not, and region 2 expands so that it is about to
**   overwrite the start of region 0, then region 2 is renamed to region 1,
**   and a new region 2 is created at the end of the file following the
**   existing region 0.
**
**     C) |---1---|..|--0--|.|-2-|
**
**   In this state records are appended to region 2 until checkpoints have
**   contracted regions 0 and 1 until they are both zero bytes in size. They
**   are then shifted to the start of the log file, leaving the system in
**   the equivalent of state A above.
**
**   Alternatively, state B may transition directly to state A if the size
**   of region 0 is reduced to zero bytes before region 2 threatens to
**   encroach upon it.
**
** LOG_PAD1 & LOG_PAD2 RECORDS
**
**   PAD1 and PAD2 records may appear in a log file at any point. They allow
**   a process writing the log file to align the beginning of transactions
**   with the beginning of disk sectors, which increases robustness.
**
** RECORD FORMATS:
**
**   LOG_EOF:    * A single 0x00 byte.
**
**   LOG_PAD1:   * A single 0x01 byte.
**
**   LOG_PAD2:   * A single 0x02 byte, followed by
**               * The number of unused bytes (N) as a varint,
**               * An N byte block of unused space.
**
**   LOG_COMMIT: * A single 0x03 byte.
**               * An 8-byte checksum.
**
**   LOG_JUMP:   * A single 0x04 byte.
**               * Absolute file offset to jump to, encoded as a varint.
**
**   LOG_WRITE:  * A single 0x06 or 0x07 byte,
**               * The number of bytes in the key, encoded as a varint,
**               * The number of bytes in the value, encoded as a varint,
**               * If the first byte was 0x07, an 8 byte checksum.
**               * The key data,
**               * The value data.
+** +** LOG_DELETE: * A single 0x08 or 0x09 byte, +** * The number of bytes in the key, encoded as a varint, +** * If the first byte was 0x09, an 8 byte checksum. +** * The key data. +** +** Varints are as described in lsm_varint.c (SQLite 4 format). +** +** CHECKSUMS: +** +** The checksum is calculated using two 32-bit unsigned integers, s0 and +** s1. The initial value for both is 42. It is updated each time a record +** is written into the log file by treating the encoded (binary) record as +** an array of 32-bit little-endian integers. Then, if x[] is the integer +** array, updating the checksum accumulators as follows: +** +** for i from 0 to n-1 step 2: +** s0 += x[i] + s1; +** s1 += x[i+1] + s0; +** endfor +** +** If the record is not an even multiple of 8-bytes in size it is padded +** with zeroes to make it so before the checksum is updated. +** +** The checksum stored in a COMMIT, WRITE or DELETE is based on all bytes +** up to the start of the 8-byte checksum itself, including the COMMIT, +** WRITE or DELETE fields that appear before the checksum in the record. +** +** VARINT FORMAT +** +** See lsm_varint.c. +*/ + +#ifndef _LSM_INT_H +# include "lsmInt.h" +#endif + +/* Log record types */ +#define LSM_LOG_EOF 0x00 +#define LSM_LOG_PAD1 0x01 +#define LSM_LOG_PAD2 0x02 +#define LSM_LOG_COMMIT 0x03 +#define LSM_LOG_JUMP 0x04 + +#define LSM_LOG_WRITE 0x06 +#define LSM_LOG_WRITE_CKSUM 0x07 +#define LSM_LOG_DELETE 0x08 +#define LSM_LOG_DELETE_CKSUM 0x09 + +/* Require a checksum every 32KB. */ +#define LSM_CKSUM_MAXDATA (32*1024) + +/* Do not wrap a log file smaller than this in bytes. */ +#define LSM_MIN_LOGWRAP (128*1024) + +/* +** szSector: +** Commit records must be aligned to end on szSector boundaries. If +** the safety-mode is set to NORMAL or OFF, this value is 1. Otherwise, +** if the safety-mode is set to FULL, it is the size of the file-system +** sectors as reported by lsmFsSectorSize(). 
+*/ +struct LogWriter { + u32 cksum0; /* Checksum 0 at offset iOff */ + u32 cksum1; /* Checksum 1 at offset iOff */ + int iCksumBuf; /* Bytes of buf that have been checksummed */ + i64 iOff; /* Offset at start of buffer buf */ + int szSector; /* Sector size for this transaction */ + LogRegion jump; /* Avoid writing to this region */ + i64 iRegion1End; /* End of first region written by trans */ + i64 iRegion2Start; /* Start of second regions written by trans */ + LsmString buf; /* Buffer containing data not yet written */ +}; + +/* +** Return the result of interpreting the first 4 bytes in buffer aIn as +** a 32-bit unsigned little-endian integer. +*/ +static u32 getU32le(u8 *aIn){ + return ((u32)aIn[3] << 24) + + ((u32)aIn[2] << 16) + + ((u32)aIn[1] << 8) + + ((u32)aIn[0]); +} + + +/* +** This function is the same as logCksum(), except that pointer "a" need +** not be aligned to an 8-byte boundary or padded with zero bytes. This +** version is slower, but sometimes more convenient to use. +*/ +static void logCksumUnaligned( + char *z, /* Input buffer */ + int n, /* Size of input buffer in bytes */ + u32 *pCksum0, /* IN/OUT: Checksum value 1 */ + u32 *pCksum1 /* IN/OUT: Checksum value 2 */ +){ + u8 *a = (u8 *)z; + u32 cksum0 = *pCksum0; + u32 cksum1 = *pCksum1; + int nIn = (n/8) * 8; + int i; + + assert( n>0 ); + for(i=0; inIn ); + memcpy(aBuf, &a[nIn], n-nIn); + cksum0 += getU32le(aBuf) + cksum1; + cksum1 += getU32le(&aBuf[4]) + cksum0; + } + + *pCksum0 = cksum0; + *pCksum1 = cksum1; +} + +/* +** Update pLog->cksum0 and pLog->cksum1 so that the first nBuf bytes in the +** write buffer (pLog->buf) are included in the checksum. 
+*/ +static void logUpdateCksum(LogWriter *pLog, int nBuf){ + assert( (pLog->iCksumBuf % 8)==0 ); + assert( pLog->iCksumBuf<=nBuf ); + assert( (nBuf % 8)==0 || nBuf==pLog->buf.n ); + if( nBuf>pLog->iCksumBuf ){ + logCksumUnaligned( + &pLog->buf.z[pLog->iCksumBuf], nBuf-pLog->iCksumBuf, + &pLog->cksum0, &pLog->cksum1 + ); + } + pLog->iCksumBuf = nBuf; +} + +static i64 firstByteOnSector(LogWriter *pLog, i64 iOff){ + return (iOff / pLog->szSector) * pLog->szSector; +} +static i64 lastByteOnSector(LogWriter *pLog, i64 iOff){ + return firstByteOnSector(pLog, iOff) + pLog->szSector - 1; +} + +/* +** If possible, reclaim log file space. Log file space is reclaimed after +** a snapshot that points to the same data in the database file is synced +** into the db header. +*/ +static int logReclaimSpace(lsm_db *pDb){ + int rc; + int iMeta; + int bRotrans; /* True if there exists some ro-trans */ + + /* Test if there exists some other connection with a read-only transaction + ** open. If there does, then log file space may not be reclaimed. */ + rc = lsmDetectRoTrans(pDb, &bRotrans); + if( rc!=LSM_OK || bRotrans ) return rc; + + iMeta = (int)pDb->pShmhdr->iMetaPage; + if( iMeta==1 || iMeta==2 ){ + DbLog *pLog = &pDb->treehdr.log; + i64 iSyncedId; + + /* Read the snapshot-id of the snapshot stored on meta-page iMeta. Note + ** that in theory, the value read is untrustworthy (due to a race + ** condition - see comments above lsmFsReadSyncedId()). So it is only + ** ever used to conclude that no log space can be reclaimed. If it seems + ** to indicate that it may be possible to reclaim log space, a + ** second call to lsmCheckpointSynced() (which does return trustworthy + ** values) is made below to confirm. 
*/ + rc = lsmFsReadSyncedId(pDb, iMeta, &iSyncedId); + + if( rc==LSM_OK && pLog->iSnapshotId!=iSyncedId ){ + i64 iSnapshotId = 0; + i64 iOff = 0; + rc = lsmCheckpointSynced(pDb, &iSnapshotId, &iOff, 0); + if( rc==LSM_OK && pLog->iSnapshotIdaRegion[iRegion]; + if( iOff>=p->iStart && iOff<=p->iEnd ) break; + p->iStart = 0; + p->iEnd = 0; + } + assert( iRegion<3 ); + pLog->aRegion[iRegion].iStart = iOff; + pLog->iSnapshotId = iSnapshotId; + } + } + } + return rc; +} + +/* +** This function is called when a write-transaction is first opened. It +** is assumed that the caller is holding the client-mutex when it is +** called. +** +** Before returning, this function allocates the LogWriter object that +** will be used to write to the log file during the write transaction. +** LSM_OK is returned if no error occurs, otherwise an LSM error code. +*/ +int lsmLogBegin(lsm_db *pDb){ + int rc = LSM_OK; + LogWriter *pNew; + LogRegion *aReg; + + if( pDb->bUseLog==0 ) return LSM_OK; + + /* If the log file has not yet been opened, open it now. Also allocate + ** the LogWriter structure, if it has not already been allocated. */ + rc = lsmFsOpenLog(pDb, 0); + if( pDb->pLogWriter==0 ){ + pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc); + if( pNew ){ + lsmStringInit(&pNew->buf, pDb->pEnv); + rc = lsmStringExtend(&pNew->buf, 2); + } + }else{ + pNew = pDb->pLogWriter; + assert( (u8 *)(&pNew[1])==(u8 *)(&((&pNew->buf)[1])) ); + memset(pNew, 0, ((u8 *)&pNew->buf) - (u8 *)pNew); + pNew->buf.n = 0; + } + + if( rc==LSM_OK ){ + /* The following call detects whether or not a new snapshot has been + ** synced into the database file. If so, it updates the contents of + ** the pDb->treehdr.log structure to reclaim any space in the log + ** file that is no longer required. + ** + ** TODO: Calling this every transaction is overkill. And since the + ** call has to read and checksum a snapshot from the database file, + ** it is expensive. 
It would be better to figure out a way so that + ** this is only called occasionally - say for every 32KB written to + ** the log file. + */ + rc = logReclaimSpace(pDb); + } + if( rc!=LSM_OK ){ + lsmLogClose(pDb); + return rc; + } + + /* Set the effective sector-size for this transaction. Sectors are assumed + ** to be one byte in size if the safety-mode is OFF or NORMAL, or as + ** reported by lsmFsSectorSize if it is FULL. */ + if( pDb->eSafety==LSM_SAFETY_FULL ){ + pNew->szSector = lsmFsSectorSize(pDb->pFS); + assert( pNew->szSector>0 ); + }else{ + pNew->szSector = 1; + } + + /* There are now three scenarios: + ** + ** 1) Regions 0 and 1 are both zero bytes in size and region 2 begins + ** at a file offset greater than LSM_MIN_LOGWRAP. In this case, wrap + ** around to the start and write data into the start of the log file. + ** + ** 2) Region 1 is zero bytes in size and region 2 occurs earlier in the + ** file than region 0. In this case, append data to region 2, but + ** remember to jump over region 1 if required. + ** + ** 3) Region 2 is the last in the file. Append to it. + */ + aReg = &pDb->treehdr.log.aRegion[0]; + + assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart ); + assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart ); + + pNew->cksum0 = pDb->treehdr.log.cksum0; + pNew->cksum1 = pDb->treehdr.log.cksum1; + + if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=LSM_MIN_LOGWRAP ){ + /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP + ** into the log file in this case. Pad it out to 8 bytes using a PAD2 + ** record so that the checksums can be updated immediately. 
*/ + u8 aJump[] = { + LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 + }; + + lsmStringBinAppend(&pNew->buf, aJump, sizeof(aJump)); + logUpdateCksum(pNew, pNew->buf.n); + rc = lsmFsWriteLog(pDb->pFS, aReg[2].iEnd, &pNew->buf); + pNew->iCksumBuf = pNew->buf.n = 0; + + aReg[2].iEnd += 8; + pNew->jump = aReg[0] = aReg[2]; + aReg[2].iStart = aReg[2].iEnd = 0; + }else if( aReg[1].iEnd==0 && aReg[2].iEndiOff = aReg[2].iEnd; + pNew->jump = aReg[0]; + }else{ + /* Case 3. */ + assert( aReg[2].iStart>=aReg[0].iEnd && aReg[2].iStart>=aReg[1].iEnd ); + pNew->iOff = aReg[2].iEnd; + } + + if( pNew->jump.iStart ){ + i64 iRound; + assert( pNew->jump.iStart>pNew->iOff ); + + iRound = firstByteOnSector(pNew, pNew->jump.iStart); + if( iRound>pNew->iOff ) pNew->jump.iStart = iRound; + pNew->jump.iEnd = lastByteOnSector(pNew, pNew->jump.iEnd); + } + + pDb->pLogWriter = pNew; + return rc; +} + +/* +** This function is called when a write-transaction is being closed. +** Parameter bCommit is true if the transaction is being committed, +** or false otherwise. The caller must hold the client-mutex to call +** this function. +** +** A call to this function deletes the LogWriter object allocated by +** lsmLogBegin(). If the transaction is being committed, the shared state +** in *pLog is updated before returning. +*/ +void lsmLogEnd(lsm_db *pDb, int bCommit){ + DbLog *pLog; + LogWriter *p; + p = pDb->pLogWriter; + + if( p==0 ) return; + pLog = &pDb->treehdr.log; + + if( bCommit ){ + pLog->aRegion[2].iEnd = p->iOff; + pLog->cksum0 = p->cksum0; + pLog->cksum1 = p->cksum1; + if( p->iRegion1End ){ + /* This happens when the transaction had to jump over some other + ** part of the log. 
*/ + assert( pLog->aRegion[1].iEnd==0 ); + assert( pLog->aRegion[2].iStartiRegion1End ); + pLog->aRegion[1].iStart = pLog->aRegion[2].iStart; + pLog->aRegion[1].iEnd = p->iRegion1End; + pLog->aRegion[2].iStart = p->iRegion2Start; + } + } +} + +static int jumpIfRequired( + lsm_db *pDb, + LogWriter *pLog, + int nReq, + int *pbJump +){ + /* Determine if it is necessary to add an LSM_LOG_JUMP to jump over the + ** jump region before writing the LSM_LOG_WRITE or DELETE record. This + ** is necessary if there is insufficient room between the current offset + ** and the jump region to fit the new WRITE/DELETE record and the largest + ** possible JUMP record with up to 7 bytes of padding (a total of 17 + ** bytes). */ + if( (pLog->jump.iStart > (pLog->iOff + pLog->buf.n)) + && (pLog->jump.iStart < (pLog->iOff + pLog->buf.n + (nReq + 17))) + ){ + int rc; /* Return code */ + i64 iJump; /* Offset to jump to */ + u8 aJump[10]; /* Encoded jump record */ + int nJump; /* Valid bytes in aJump[] */ + int nPad; /* Bytes of padding required */ + + /* Serialize the JUMP record */ + iJump = pLog->jump.iEnd+1; + aJump[0] = LSM_LOG_JUMP; + nJump = 1 + lsmVarintPut64(&aJump[1], iJump); + + /* Adding padding to the contents of the buffer so that it will be a + ** multiple of 8 bytes in size after the JUMP record is appended. This + ** is not strictly required, it just makes the keeping the running + ** checksum up to date in this file a little simpler. */ + nPad = (pLog->buf.n + nJump) % 8; + if( nPad ){ + u8 aPad[7] = {0,0,0,0,0,0,0}; + nPad = 8-nPad; + if( nPad==1 ){ + aPad[0] = LSM_LOG_PAD1; + }else{ + aPad[0] = LSM_LOG_PAD2; + aPad[1] = (nPad-2); + } + rc = lsmStringBinAppend(&pLog->buf, aPad, nPad); + if( rc!=LSM_OK ) return rc; + } + + /* Append the JUMP record to the buffer. Then flush the buffer to disk + ** and update the checksums. The next write to the log file (assuming + ** there is no transaction rollback) will be to offset iJump (just past + ** the jump region). 
*/ + rc = lsmStringBinAppend(&pLog->buf, aJump, nJump); + if( rc!=LSM_OK ) return rc; + assert( (pLog->buf.n % 8)==0 ); + rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf); + if( rc!=LSM_OK ) return rc; + logUpdateCksum(pLog, pLog->buf.n); + pLog->iRegion1End = (pLog->iOff + pLog->buf.n); + pLog->iRegion2Start = iJump; + pLog->iOff = iJump; + pLog->iCksumBuf = pLog->buf.n = 0; + if( pbJump ) *pbJump = 1; + } + + return LSM_OK; +} + +static int logCksumAndFlush(lsm_db *pDb){ + int rc; /* Return code */ + LogWriter *pLog = pDb->pLogWriter; + + /* Calculate the checksum value. Append it to the buffer. */ + logUpdateCksum(pLog, pLog->buf.n); + lsmPutU32((u8 *)&pLog->buf.z[pLog->buf.n], pLog->cksum0); + pLog->buf.n += 4; + lsmPutU32((u8 *)&pLog->buf.z[pLog->buf.n], pLog->cksum1); + pLog->buf.n += 4; + + /* Write the contents of the buffer to disk. */ + rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf); + pLog->iOff += pLog->buf.n; + pLog->iCksumBuf = pLog->buf.n = 0; + + return rc; +} + +/* +** Write the contents of the log-buffer to disk. Then write either a CKSUM +** or COMMIT record, depending on the value of parameter eType. +*/ +static int logFlush(lsm_db *pDb, int eType){ + int rc; + int nReq; + LogWriter *pLog = pDb->pLogWriter; + + assert( eType==LSM_LOG_COMMIT ); + assert( pLog ); + + /* Commit record is always 9 bytes in size. */ + nReq = 9; + if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ) nReq += pLog->szSector + 17; + rc = jumpIfRequired(pDb, pLog, nReq, 0); + + /* If this is a COMMIT, add padding to the log so that the COMMIT record + ** is aligned against the end of a disk sector. In other words, add padding + ** so that the first byte following the COMMIT record lies on a different + ** sector. */ + if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ){ + int nPad; /* Bytes of padding to add */ + + /* Determine the value of nPad. 
*/ + nPad = ((pLog->iOff + pLog->buf.n + 9) % pLog->szSector); + if( nPad ) nPad = pLog->szSector - nPad; + rc = lsmStringExtend(&pLog->buf, nPad); + if( rc!=LSM_OK ) return rc; + + while( nPad ){ + if( nPad==1 ){ + pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD1; + nPad = 0; + }else{ + int n = LSM_MIN(200, nPad-2); + pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD2; + pLog->buf.z[pLog->buf.n++] = n; + nPad -= 2; + memset(&pLog->buf.z[pLog->buf.n], 0x2B, n); + pLog->buf.n += n; + nPad -= n; + } + } + } + + /* Make sure there is room in the log-buffer to add the CKSUM or COMMIT + ** record. Then add the first byte of it. */ + rc = lsmStringExtend(&pLog->buf, 9); + if( rc!=LSM_OK ) return rc; + pLog->buf.z[pLog->buf.n++] = eType; + memset(&pLog->buf.z[pLog->buf.n], 0, 8); + + rc = logCksumAndFlush(pDb); + + /* If this is a commit and synchronous=full, sync the log to disk. */ + if( rc==LSM_OK && eType==LSM_LOG_COMMIT && pDb->eSafety==LSM_SAFETY_FULL ){ + rc = lsmFsSyncLog(pDb->pFS); + } + return rc; +} + +/* +** Append an LSM_LOG_WRITE (if nVal>=0) or LSM_LOG_DELETE (if nVal<0) +** record to the database log. +*/ +int lsmLogWrite( + lsm_db *pDb, /* Database handle */ + void *pKey, int nKey, /* Database key to write to log */ + void *pVal, int nVal /* Database value (or nVal<0) to write */ +){ + int rc = LSM_OK; + LogWriter *pLog; /* Log object to write to */ + int nReq; /* Bytes of space required in log */ + int bCksum = 0; /* True to embed a checksum in this record */ + + if( pDb->bUseLog==0 ) return LSM_OK; + pLog = pDb->pLogWriter; + + /* Determine how many bytes of space are required, assuming that a checksum + ** will be embedded in this record (even though it may not be). */ + nReq = 1 + lsmVarintLen32(nKey) + 8 + nKey; + if( nVal>=0 ) nReq += lsmVarintLen32(nVal) + nVal; + + /* Jump over the jump region if required. 
Set bCksum to true to tell the + ** code below to include a checksum in the record if either (a) writing + ** this record would mean that more than LSM_CKSUM_MAXDATA bytes of data + ** have been written to the log since the last checksum, or (b) the jump + ** is taken. */ + rc = jumpIfRequired(pDb, pLog, nReq, &bCksum); + if( (pLog->buf.n+nReq) > LSM_CKSUM_MAXDATA ) bCksum = 1; + + if( rc==LSM_OK ){ + rc = lsmStringExtend(&pLog->buf, nReq); + } + if( rc==LSM_OK ){ + u8 *a = (u8 *)&pLog->buf.z[pLog->buf.n]; + + /* Write the record header - the type byte followed by either 1 (for + ** DELETE) or 2 (for WRITE) varints. */ + assert( LSM_LOG_WRITE_CKSUM == (LSM_LOG_WRITE | 0x0001) ); + assert( LSM_LOG_DELETE_CKSUM == (LSM_LOG_DELETE | 0x0001) ); + *(a++) = (nVal>=0 ? LSM_LOG_WRITE : LSM_LOG_DELETE) | (u8)bCksum; + a += lsmVarintPut32(a, nKey); + if( nVal>=0 ) a += lsmVarintPut32(a, nVal); + + if( bCksum ){ + pLog->buf.n = (a - (u8 *)pLog->buf.z); + rc = logCksumAndFlush(pDb); + a = (u8 *)&pLog->buf.z[pLog->buf.n]; + } + + memcpy(a, pKey, nKey); + a += nKey; + if( nVal>=0 ){ + memcpy(a, pVal, nVal); + a += nVal; + } + pLog->buf.n = a - (u8 *)pLog->buf.z; + assert( pLog->buf.n<=pLog->buf.nAlloc ); + } + + return rc; +} + +/* +** Append an LSM_LOG_COMMIT record to the database log. +*/ +int lsmLogCommit(lsm_db *pDb){ + if( pDb->bUseLog==0 ) return LSM_OK; + return logFlush(pDb, LSM_LOG_COMMIT); +} + +/* +** Store the current offset and other checksum related information in the +** structure *pMark. Later, *pMark can be passed to lsmLogSeek() to "rewind" +** the LogWriter object to the current log file offset. This is used when +** rolling back savepoint transactions. 
+*/ +void lsmLogTell( + lsm_db *pDb, /* Database handle */ + LogMark *pMark /* Populate this object with current offset */ +){ + LogWriter *pLog; + int nCksum; + + if( pDb->bUseLog==0 ) return; + pLog = pDb->pLogWriter; + nCksum = pLog->buf.n & 0xFFFFFFF8; + logUpdateCksum(pLog, nCksum); + assert( pLog->iCksumBuf==nCksum ); + pMark->nBuf = pLog->buf.n - nCksum; + memcpy(pMark->aBuf, &pLog->buf.z[nCksum], pMark->nBuf); + + pMark->iOff = pLog->iOff + pLog->buf.n; + pMark->cksum0 = pLog->cksum0; + pMark->cksum1 = pLog->cksum1; +} + +/* +** Seek (rewind) back to the log file offset stored by an ealier call to +** lsmLogTell() in *pMark. +*/ +void lsmLogSeek( + lsm_db *pDb, /* Database handle */ + LogMark *pMark /* Object containing log offset to seek to */ +){ + LogWriter *pLog; + + if( pDb->bUseLog==0 ) return; + pLog = pDb->pLogWriter; + + assert( pMark->iOff<=pLog->iOff+pLog->buf.n ); + if( (pMark->iOff & 0xFFFFFFF8)>=pLog->iOff ){ + pLog->buf.n = pMark->iOff - pLog->iOff; + pLog->iCksumBuf = (pLog->buf.n & 0xFFFFFFF8); + }else{ + pLog->buf.n = pMark->nBuf; + memcpy(pLog->buf.z, pMark->aBuf, pMark->nBuf); + pLog->iCksumBuf = 0; + pLog->iOff = pMark->iOff - pMark->nBuf; + } + pLog->cksum0 = pMark->cksum0; + pLog->cksum1 = pMark->cksum1; + + if( pMark->iOff > pLog->iRegion1End ) pLog->iRegion1End = 0; + if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0; +} + +/* +** This function does the work for an lsm_info(LOG_STRUCTURE) request. +*/ +int lsmInfoLogStructure(lsm_db *pDb, char **pzVal){ + int rc = LSM_OK; + char *zVal = 0; + + /* If there is no read or write transaction open, read the latest + ** tree-header from shared-memory to report on. If necessary, update + ** it based on the contents of the database header. + ** + ** No locks are taken here - these are passive read operations only. 
+ */ + if( pDb->pCsr==0 && pDb->nTransOpen==0 ){ + rc = lsmTreeLoadHeader(pDb, 0); + if( rc==LSM_OK ) rc = logReclaimSpace(pDb); + } + + if( rc==LSM_OK ){ + DbLog *pLog = &pDb->treehdr.log; + zVal = lsmMallocPrintf(pDb->pEnv, + "%d %d %d %d %d %d", + (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd, + (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd, + (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd + ); + if( !zVal ) rc = LSM_NOMEM_BKPT; + } + + *pzVal = zVal; + return rc; +} + +/************************************************************************* +** Begin code for log recovery. +*/ + +typedef struct LogReader LogReader; +struct LogReader { + FileSystem *pFS; /* File system to read from */ + i64 iOff; /* File offset at end of buf content */ + int iBuf; /* Current read offset in buf */ + LsmString buf; /* Buffer containing file content */ + + int iCksumBuf; /* Offset in buf corresponding to cksum[01] */ + u32 cksum0; /* Checksum 0 at offset iCksumBuf */ + u32 cksum1; /* Checksum 1 at offset iCksumBuf */ +}; + +static void logReaderBlob( + LogReader *p, /* Log reader object */ + LsmString *pBuf, /* Dynamic storage, if required */ + int nBlob, /* Number of bytes to read */ + u8 **ppBlob, /* OUT: Pointer to blob read */ + int *pRc /* IN/OUT: Error code */ +){ + static const int LOG_READ_SIZE = 512; + int rc = *pRc; /* Return code */ + int nReq = nBlob; /* Bytes required */ + + while( rc==LSM_OK && nReq>0 ){ + int nAvail; /* Bytes of data available in p->buf */ + if( p->buf.n==p->iBuf ){ + int nCksum; /* Total bytes requiring checksum */ + int nCarry = 0; /* Total bytes requiring checksum */ + + nCksum = p->iBuf - p->iCksumBuf; + if( nCksum>0 ){ + nCarry = nCksum % 8; + nCksum = ((nCksum / 8) * 8); + if( nCksum>0 ){ + logCksumUnaligned( + &p->buf.z[p->iCksumBuf], nCksum, &p->cksum0, &p->cksum1 + ); + } + } + if( nCarry>0 ) memcpy(p->buf.z, &p->buf.z[p->iBuf-nCarry], nCarry); + p->buf.n = nCarry; + p->iBuf = nCarry; + + rc = 
lsmFsReadLog(p->pFS, p->iOff, LOG_READ_SIZE, &p->buf); + if( rc!=LSM_OK ) break; + p->iCksumBuf = 0; + p->iOff += LOG_READ_SIZE; + } + + nAvail = p->buf.n - p->iBuf; + if( ppBlob && nReq==nBlob && nBlob<=nAvail ){ + *ppBlob = (u8 *)&p->buf.z[p->iBuf]; + p->iBuf += nBlob; + nReq = 0; + }else{ + int nCopy = LSM_MIN(nAvail, nReq); + if( nBlob==nReq ){ + if( ppBlob ) *ppBlob = (u8 *)pBuf->z; + pBuf->n = 0; + } + rc = lsmStringBinAppend(pBuf, (u8 *)&p->buf.z[p->iBuf], nCopy); + nReq -= nCopy; + p->iBuf += nCopy; + } + } + + *pRc = rc; +} + +static void logReaderVarint( + LogReader *p, + LsmString *pBuf, + int *piVal, /* OUT: Value read from log */ + int *pRc /* IN/OUT: Error code */ +){ + if( *pRc==LSM_OK ){ + u8 *aVarint; + if( p->buf.n==p->iBuf ){ + logReaderBlob(p, 0, 10, &aVarint, pRc); + if( LSM_OK==*pRc ) p->iBuf -= (10 - lsmVarintGet32(aVarint, piVal)); + }else{ + logReaderBlob(p, pBuf, lsmVarintSize(p->buf.z[p->iBuf]), &aVarint, pRc); + if( LSM_OK==*pRc ) lsmVarintGet32(aVarint, piVal); + } + } +} + +static void logReaderByte(LogReader *p, u8 *pByte, int *pRc){ + u8 *pPtr = 0; + logReaderBlob(p, 0, 1, &pPtr, pRc); + if( pPtr ) *pByte = *pPtr; +} + +static void logReaderCksum(LogReader *p, LsmString *pBuf, int *pbEof, int *pRc){ + if( *pRc==LSM_OK ){ + u8 *pPtr = 0; + u32 cksum0, cksum1; + int nCksum = p->iBuf - p->iCksumBuf; + + /* Update in-memory (expected) checksums */ + assert( nCksum>=0 ); + logCksumUnaligned(&p->buf.z[p->iCksumBuf], nCksum, &p->cksum0, &p->cksum1); + p->iCksumBuf = p->iBuf + 8; + logReaderBlob(p, pBuf, 8, &pPtr, pRc); + + /* Read the checksums from the log file. Set *pbEof if they do not match. 
*/ + if( pPtr ){ + cksum0 = lsmGetU32(pPtr); + cksum1 = lsmGetU32(&pPtr[4]); + *pbEof = (cksum0!=p->cksum0 || cksum1!=p->cksum1); + p->iCksumBuf = p->iBuf; + } + } +} + +static void logReaderInit( + lsm_db *pDb, /* Database handle */ + DbLog *pLog, /* Log object associated with pDb */ + int bInitBuf, /* True if p->buf is uninitialized */ + LogReader *p /* Initialize this LogReader object */ +){ + p->pFS = pDb->pFS; + p->iOff = pLog->aRegion[2].iStart; + p->cksum0 = pLog->cksum0; + p->cksum1 = pLog->cksum1; + if( bInitBuf ){ lsmStringInit(&p->buf, pDb->pEnv); } + p->buf.n = 0; + p->iCksumBuf = 0; + p->iBuf = 0; +} + +/* +** This function is called after reading the header of a LOG_DELETE or +** LOG_WRITE record. Parameter nByte is the total size of the key and +** value that follow the header just read. Return true if the size and +** position of the record indicate that it should contain a checksum. +*/ +static int logRequireCksum(LogReader *p, int nByte){ + return ((p->iBuf + nByte - p->iCksumBuf) > LSM_CKSUM_MAXDATA); +} + +/* +** Recover the contents of the log file. +*/ +int lsmLogRecover(lsm_db *pDb){ + LsmString buf1; /* Key buffer */ + LsmString buf2; /* Value buffer */ + LogReader reader; /* Log reader object */ + int rc = LSM_OK; /* Return code */ + int nCommit = 0; /* Number of transactions to recover */ + int iPass; + int nJump = 0; /* Number of LSM_LOG_JUMP records in pass 0 */ + DbLog *pLog; + int bOpen; + + rc = lsmFsOpenLog(pDb, &bOpen); + if( rc!=LSM_OK ) return rc; + + rc = lsmTreeInit(pDb); + if( rc!=LSM_OK ) return rc; + + pLog = &pDb->treehdr.log; + lsmCheckpointLogoffset(pDb->pShmhdr->aSnap2, pLog); + + logReaderInit(pDb, pLog, 1, &reader); + lsmStringInit(&buf1, pDb->pEnv); + lsmStringInit(&buf2, pDb->pEnv); + + /* The outer for() loop runs at most twice. The first iteration is to + ** count the number of committed transactions in the log. 
The second + ** iterates through those transactions and updates the in-memory tree + ** structure with their contents. */ + if( bOpen ){ + for(iPass=0; iPass<2 && rc==LSM_OK; iPass++){ + int bEof = 0; + + while( rc==LSM_OK && !bEof ){ + u8 eType = 0; + logReaderByte(&reader, &eType, &rc); + + switch( eType ){ + case LSM_LOG_PAD1: + break; + + case LSM_LOG_PAD2: { + int nPad; + logReaderVarint(&reader, &buf1, &nPad, &rc); + logReaderBlob(&reader, &buf1, nPad, 0, &rc); + break; + } + + case LSM_LOG_WRITE: + case LSM_LOG_WRITE_CKSUM: { + int nKey; + int nVal; + u8 *aVal; + logReaderVarint(&reader, &buf1, &nKey, &rc); + logReaderVarint(&reader, &buf2, &nVal, &rc); + + if( eType==LSM_LOG_WRITE_CKSUM ){ + logReaderCksum(&reader, &buf1, &bEof, &rc); + }else{ + bEof = logRequireCksum(&reader, nKey+nVal); + } + if( bEof ) break; + + logReaderBlob(&reader, &buf1, nKey, 0, &rc); + logReaderBlob(&reader, &buf2, nVal, &aVal, &rc); + if( iPass==1 && rc==LSM_OK ){ + rc = lsmTreeInsert(pDb, (u8 *)buf1.z, nKey, aVal, nVal); + } + break; + } + + case LSM_LOG_DELETE: + case LSM_LOG_DELETE_CKSUM: { + int nKey; u8 *aKey; + logReaderVarint(&reader, &buf1, &nKey, &rc); + + if( eType==LSM_LOG_DELETE_CKSUM ){ + logReaderCksum(&reader, &buf1, &bEof, &rc); + }else{ + bEof = logRequireCksum(&reader, nKey); + } + if( bEof ) break; + + logReaderBlob(&reader, &buf1, nKey, &aKey, &rc); + if( iPass==1 && rc==LSM_OK ){ + rc = lsmTreeInsert(pDb, aKey, nKey, NULL, -1); + } + break; + } + + case LSM_LOG_COMMIT: + logReaderCksum(&reader, &buf1, &bEof, &rc); + if( bEof==0 ){ + nCommit++; + assert( nCommit>0 || iPass==1 ); + if( nCommit==0 ) bEof = 1; + } + break; + + case LSM_LOG_JUMP: { + int iOff = 0; + logReaderVarint(&reader, &buf1, &iOff, &rc); + if( rc==LSM_OK ){ + if( iPass==1 ){ + if( pLog->aRegion[2].iStart==0 ){ + assert( pLog->aRegion[1].iStart==0 ); + pLog->aRegion[1].iEnd = reader.iOff; + }else{ + assert( pLog->aRegion[0].iStart==0 ); + pLog->aRegion[0].iStart = pLog->aRegion[2].iStart; + 
pLog->aRegion[0].iEnd = reader.iOff-reader.buf.n+reader.iBuf; + } + pLog->aRegion[2].iStart = iOff; + }else{ + if( (nJump++)==2 ){ + bEof = 1; + } + } + + reader.iOff = iOff; + reader.buf.n = reader.iBuf; + } + break; + } + + default: + /* Including LSM_LOG_EOF */ + bEof = 1; + break; + } + } + + if( rc==LSM_OK && iPass==0 ){ + if( nCommit==0 ){ + if( pLog->aRegion[2].iStart==0 ){ + iPass = 1; + }else{ + pLog->aRegion[2].iStart = 0; + iPass = -1; + lsmCheckpointZeroLogoffset(pDb); + } + } + logReaderInit(pDb, pLog, 0, &reader); + nCommit = nCommit * -1; + } + } + } + + /* Initialize DbLog object */ + if( rc==LSM_OK ){ + pLog->aRegion[2].iEnd = reader.iOff - reader.buf.n + reader.iBuf; + pLog->cksum0 = reader.cksum0; + pLog->cksum1 = reader.cksum1; + } + + if( rc==LSM_OK ){ + rc = lsmFinishRecovery(pDb); + }else{ + lsmFinishRecovery(pDb); + } + + if( pDb->bRoTrans ){ + lsmFsCloseLog(pDb); + } + + lsmStringClear(&buf1); + lsmStringClear(&buf2); + lsmStringClear(&reader.buf); + return rc; +} + +void lsmLogClose(lsm_db *db){ + if( db->pLogWriter ){ + lsmFree(db->pEnv, db->pLogWriter->buf.z); + lsmFree(db->pEnv, db->pLogWriter); + db->pLogWriter = 0; + } +} diff --git a/ext/lsm1/lsm_main.c b/ext/lsm1/lsm_main.c new file mode 100644 index 0000000000..3146cc6ad0 --- /dev/null +++ b/ext/lsm1/lsm_main.c @@ -0,0 +1,1009 @@ +/* +** 2011-08-18 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** The main interface to the LSM module. +*/ +#include "lsmInt.h" + + +#ifdef LSM_DEBUG +/* +** This function returns a copy of its only argument. 
+** +** When the library is built with LSM_DEBUG defined, this function is called +** whenever an error code is generated (not propagated - generated). So +** if the library is mysteriously returning (say) LSM_IOERR, a breakpoint +** may be set in this function to determine why. +*/ +int lsmErrorBkpt(int rc){ + /* Set breakpoint here! */ + return rc; +} + +/* +** This function contains various assert() statements that test that the +** lsm_db structure passed as an argument is internally consistent. +*/ +static void assert_db_state(lsm_db *pDb){ + + /* If there is at least one cursor or a write transaction open, the database + ** handle must be holding a pointer to a client snapshot. And the reverse + ** - if there are no open cursors and no write transactions then there must + ** not be a client snapshot. */ + + assert( (pDb->pCsr!=0||pDb->nTransOpen>0)==(pDb->iReader>=0||pDb->bRoTrans) ); + + assert( (pDb->iReader<0 && pDb->bRoTrans==0) || pDb->pClient!=0 ); + + assert( pDb->nTransOpen>=0 ); +} +#else +# define assert_db_state(x) +#endif + +/* +** The default key-compare function. +*/ +static int xCmp(void *p1, int n1, void *p2, int n2){ + int res; + res = memcmp(p1, p2, LSM_MIN(n1, n2)); + if( res==0 ) res = (n1-n2); + return res; +} + +static void xLog(void *pCtx, int rc, const char *z){ + (void)(rc); + (void)(pCtx); + fprintf(stderr, "%s\n", z); + fflush(stderr); +} + +/* +** Allocate a new db handle. +*/ +int lsm_new(lsm_env *pEnv, lsm_db **ppDb){ + lsm_db *pDb; + + /* If the user did not provide an environment, use the default. 
*/ + if( pEnv==0 ) pEnv = lsm_default_env(); + assert( pEnv ); + + /* Allocate the new database handle */ + *ppDb = pDb = (lsm_db *)lsmMallocZero(pEnv, sizeof(lsm_db)); + if( pDb==0 ) return LSM_NOMEM_BKPT; + + /* Initialize the new object */ + pDb->pEnv = pEnv; + pDb->nTreeLimit = LSM_DFLT_AUTOFLUSH; + pDb->nAutockpt = LSM_DFLT_AUTOCHECKPOINT; + pDb->bAutowork = LSM_DFLT_AUTOWORK; + pDb->eSafety = LSM_DFLT_SAFETY; + pDb->xCmp = xCmp; + pDb->nDfltPgsz = LSM_DFLT_PAGE_SIZE; + pDb->nDfltBlksz = LSM_DFLT_BLOCK_SIZE; + pDb->nMerge = LSM_DFLT_AUTOMERGE; + pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES; + pDb->bUseLog = LSM_DFLT_USE_LOG; + pDb->iReader = -1; + pDb->iRwclient = -1; + pDb->bMultiProc = LSM_DFLT_MULTIPLE_PROCESSES; + pDb->iMmap = LSM_DFLT_MMAP; + pDb->xLog = xLog; + pDb->compress.iId = LSM_COMPRESSION_NONE; + return LSM_OK; +} + +lsm_env *lsm_get_env(lsm_db *pDb){ + assert( pDb->pEnv ); + return pDb->pEnv; +} + +/* +** If database handle pDb is currently holding a client snapshot, but does +** not have any open cursors or write transactions, release it. +*/ +static void dbReleaseClientSnapshot(lsm_db *pDb){ + if( pDb->nTransOpen==0 && pDb->pCsr==0 ){ + lsmFinishReadTrans(pDb); + } +} + +static int getFullpathname( + lsm_env *pEnv, + const char *zRel, + char **pzAbs +){ + int nAlloc = 0; + char *zAlloc = 0; + int nReq = 0; + int rc; + + do{ + nAlloc = nReq; + rc = pEnv->xFullpath(pEnv, zRel, zAlloc, &nReq); + if( nReq>nAlloc ){ + zAlloc = lsmReallocOrFreeRc(pEnv, zAlloc, nReq, &rc); + } + }while( nReq>nAlloc && rc==LSM_OK ); + + if( rc!=LSM_OK ){ + lsmFree(pEnv, zAlloc); + zAlloc = 0; + } + *pzAbs = zAlloc; + return rc; +} + +/* +** Check that the bits in the db->mLock mask are consistent with the +** value stored in db->iRwclient. An assert shall fail otherwise. 
+*/ +static void assertRwclientLockValue(lsm_db *db){ +#ifndef NDEBUG + u64 msk; /* Mask of mLock bits for RWCLIENT locks */ + u64 rwclient = 0; /* Bit corresponding to db->iRwclient */ + + if( db->iRwclient>=0 ){ + rwclient = ((u64)1 << (LSM_LOCK_RWCLIENT(db->iRwclient)-1)); + } + msk = ((u64)1 << (LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT)-1)) - 1; + msk -= (((u64)1 << (LSM_LOCK_RWCLIENT(0)-1)) - 1); + + assert( (db->mLock & msk)==rwclient ); +#endif +} + +/* +** Open a new connection to database zFilename. +*/ +int lsm_open(lsm_db *pDb, const char *zFilename){ + int rc; + + if( pDb->pDatabase ){ + rc = LSM_MISUSE; + }else{ + char *zFull; + + /* Translate the possibly relative pathname supplied by the user into + ** an absolute pathname. This is required because the supplied path + ** is used (either directly or with "-log" appended to it) for more + ** than one purpose - to open both the database and log files, and + ** perhaps to unlink the log file during disconnection. An absolute + ** path is required to ensure that the correct files are operated + ** on even if the application changes the cwd. */ + rc = getFullpathname(pDb->pEnv, zFilename, &zFull); + assert( rc==LSM_OK || zFull==0 ); + + /* Connect to the database. */ + if( rc==LSM_OK ){ + rc = lsmDbDatabaseConnect(pDb, zFull); + } + + if( pDb->bReadonly==0 ){ + /* Configure the file-system connection with the page-size and block-size + ** of this database. Even if the database file is zero bytes in size + ** on disk, these values have been set in shared-memory by now, and so + ** are guaranteed not to change during the lifetime of this connection. 
+      */
+      if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb, 0)) ){
+        lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot));
+        lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot));
+      }
+    }
+
+    lsmFree(pDb->pEnv, zFull);
+    assertRwclientLockValue(pDb);
+  }
+
+  assert( pDb->bReadonly==0 || pDb->bReadonly==1 );
+  assert( rc!=LSM_OK || (pDb->pShmhdr==0)==(pDb->bReadonly==1) );
+
+  return rc;
+}
+
+/*
+** Close database handle pDb and release everything it owns. Passing a NULL
+** handle is a no-op that returns LSM_OK.
+**
+** If the handle still has an open cursor (pDb->pCsr) or an open transaction
+** (pDb->nTransOpen), nothing is released and LSM_MISUSE_BKPT is returned.
+*/
+int lsm_close(lsm_db *pDb){
+  int rc = LSM_OK;
+  if( pDb ){
+    assert_db_state(pDb);
+    if( pDb->pCsr || pDb->nTransOpen ){
+      rc = LSM_MISUSE_BKPT;
+    }else{
+      lsmMCursorFreeCache(pDb);
+      lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
+      pDb->pClient = 0;
+
+      assertRwclientLockValue(pDb);
+
+      lsmDbDatabaseRelease(pDb);
+      lsmLogClose(pDb);
+      lsmFsClose(pDb->pFS);
+      assert( pDb->mLock==0 );
+
+      /* Invoke any destructors registered for the compression or
+      ** compression factory callbacks.  */
+      if( pDb->factory.xFree ) pDb->factory.xFree(pDb->factory.pCtx);
+      if( pDb->compress.xFree ) pDb->compress.xFree(pDb->compress.pCtx);
+
+      lsmFree(pDb->pEnv, pDb->rollback.aArray);
+      lsmFree(pDb->pEnv, pDb->aTrans);
+      lsmFree(pDb->pEnv, pDb->apShm);
+      lsmFree(pDb->pEnv, pDb);
+    }
+  }
+  return rc;
+}
+
+/*
+** Query or modify configuration parameter eParam of connection pDb.
+**
+** For every integer parameter the single variadic argument is an (int *).
+** The value it points to is applied if it passes that parameter's range
+** check, and in all cases the pointed-to int is overwritten with the value
+** in effect when the call returns. The three compression parameters take a
+** pointer to an lsm_compress or lsm_compress_factory structure instead.
+**
+** Returns LSM_OK on success, LSM_MISUSE for an unrecognised eParam, and
+** LSM_MISUSE_BKPT if LSM_CONFIG_SET_COMPRESSION is attempted while a read
+** transaction is open outside of a factory callback. Errors returned by
+** lsmFsConfigure() are passed through.
+*/
+int lsm_config(lsm_db *pDb, int eParam, ...){
+  int rc = LSM_OK;
+  va_list ap;
+  va_start(ap, eParam);
+
+  switch( eParam ){
+    case LSM_CONFIG_AUTOFLUSH: {
+      /* This parameter is read and written in KB. But all internal
+      ** processing is done in bytes.  */
+      int *piVal = va_arg(ap, int *);
+      int iVal = *piVal;
+      if( iVal>=0 && iVal<=(1024*1024) ){
+        pDb->nTreeLimit = iVal*1024;
+      }
+      *piVal = (pDb->nTreeLimit / 1024);
+      break;
+    }
+
+    case LSM_CONFIG_AUTOWORK: {
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>=0 ){
+        pDb->bAutowork = *piVal;
+      }
+      *piVal = pDb->bAutowork;
+      break;
+    }
+
+    case LSM_CONFIG_AUTOCHECKPOINT: {
+      /* This parameter is read and written in KB. But all internal processing
+      ** (including the lsm_db.nAutockpt variable) is done in bytes.  */
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>=0 ){
+        int iVal = *piVal;
+        pDb->nAutockpt = (i64)iVal * 1024;
+      }
+      *piVal = (int)(pDb->nAutockpt / 1024);
+      break;
+    }
+
+    case LSM_CONFIG_PAGE_SIZE: {
+      int *piVal = va_arg(ap, int *);
+      if( pDb->pDatabase ){
+        /* If lsm_open() has been called, this is a read-only parameter.
+        ** Set the output variable to the page-size according to the
+        ** FileSystem object.  */
+        *piVal = lsmFsPageSize(pDb->pFS);
+      }else{
+        /* Accept only powers of two between 256 and 65536 inclusive */
+        if( *piVal>=256 && *piVal<=65536 && ((*piVal-1) & *piVal)==0 ){
+          pDb->nDfltPgsz = *piVal;
+        }else{
+          *piVal = pDb->nDfltPgsz;
+        }
+      }
+      break;
+    }
+
+    case LSM_CONFIG_BLOCK_SIZE: {
+      /* This parameter is read and written in KB. But all internal
+      ** processing is done in bytes.  */
+      int *piVal = va_arg(ap, int *);
+      if( pDb->pDatabase ){
+        /* If lsm_open() has been called, this is a read-only parameter.
+        ** Set the output variable to the block-size in KB according to the
+        ** FileSystem object.  */
+        *piVal = lsmFsBlockSize(pDb->pFS) / 1024;
+      }else{
+        int iVal = *piVal;
+        /* Accept only powers of two between 64 and 65536 (KB) inclusive */
+        if( iVal>=64 && iVal<=65536 && ((iVal-1) & iVal)==0 ){
+          pDb->nDfltBlksz = iVal * 1024;
+        }else{
+          *piVal = pDb->nDfltBlksz / 1024;
+        }
+      }
+      break;
+    }
+
+    case LSM_CONFIG_SAFETY: {
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>=0 && *piVal<=2 ){
+        pDb->eSafety = *piVal;
+      }
+      *piVal = pDb->eSafety;
+      break;
+    }
+
+    case LSM_CONFIG_MMAP: {
+      int *piVal = va_arg(ap, int *);
+      if( pDb->iReader<0 && *piVal>=0 ){
+        pDb->iMmap = *piVal;
+        rc = lsmFsConfigure(pDb);
+      }
+      *piVal = pDb->iMmap;
+      break;
+    }
+
+    case LSM_CONFIG_USE_LOG: {
+      int *piVal = va_arg(ap, int *);
+      if( pDb->nTransOpen==0 && (*piVal==0 || *piVal==1) ){
+        pDb->bUseLog = *piVal;
+      }
+      *piVal = pDb->bUseLog;
+      break;
+    }
+
+    case LSM_CONFIG_AUTOMERGE: {
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>1 ) pDb->nMerge = *piVal;
+      *piVal = pDb->nMerge;
+      break;
+    }
+
+    case LSM_CONFIG_MAX_FREELIST: {
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){
+        pDb->nMaxFreelist = *piVal;
+      }
+      *piVal = pDb->nMaxFreelist;
+      break;
+    }
+
+    case LSM_CONFIG_MULTIPLE_PROCESSES: {
+      int *piVal = va_arg(ap, int *);
+      if( pDb->pDatabase ){
+        /* If lsm_open() has been called, this is a read-only parameter.
+        ** Set the output variable to true if this connection is currently
+        ** in multi-process mode.  */
+        *piVal = lsmDbMultiProc(pDb);
+      }else{
+        pDb->bMultiProc = *piVal = (*piVal!=0);
+      }
+      break;
+    }
+
+    case LSM_CONFIG_READONLY: {
+      int *piVal = va_arg(ap, int *);
+      /* If lsm_open() has been called, this is a read-only parameter. */
+      if( pDb->pDatabase==0 && *piVal>=0 ){
+        pDb->bReadonly = *piVal = (*piVal!=0);
+      }
+      *piVal = pDb->bReadonly;
+      break;
+    }
+
+    case LSM_CONFIG_SET_COMPRESSION: {
+      lsm_compress *p = va_arg(ap, lsm_compress *);
+      if( pDb->iReader>=0 && pDb->bInFactory==0 ){
+        /* May not change compression schemes with an open transaction */
+        rc = LSM_MISUSE_BKPT;
+      }else{
+        if( pDb->compress.xFree ){
+          /* Invoke any destructor belonging to the current compression. */
+          pDb->compress.xFree(pDb->compress.pCtx);
+        }
+        /* A structure with no xBound method selects "no compression" */
+        if( p->xBound==0 ){
+          memset(&pDb->compress, 0, sizeof(lsm_compress));
+          pDb->compress.iId = LSM_COMPRESSION_NONE;
+        }else{
+          memcpy(&pDb->compress, p, sizeof(lsm_compress));
+        }
+        rc = lsmFsConfigure(pDb);
+      }
+      break;
+    }
+
+    case LSM_CONFIG_SET_COMPRESSION_FACTORY: {
+      lsm_compress_factory *p = va_arg(ap, lsm_compress_factory *);
+      if( pDb->factory.xFree ){
+        /* Invoke any destructor belonging to the current factory. */
+        pDb->factory.xFree(pDb->factory.pCtx);
+      }
+      memcpy(&pDb->factory, p, sizeof(lsm_compress_factory));
+      break;
+    }
+
+    case LSM_CONFIG_GET_COMPRESSION: {
+      lsm_compress *p = va_arg(ap, lsm_compress *);
+      memcpy(p, &pDb->compress, sizeof(lsm_compress));
+      break;
+    }
+
+    default:
+      rc = LSM_MISUSE;
+      break;
+  }
+
+  va_end(ap);
+  return rc;
+}
+
+/*
+** Append the text "<zPre>{iFirst iLastPg iRoot nSize}" describing segment
+** pSeg to string pStr.
+*/
+void lsmAppendSegmentList(LsmString *pStr, char *zPre, Segment *pSeg){
+  lsmStringAppendf(pStr, "%s{%d %d %d %d}", zPre,
+        pSeg->iFirst, pSeg->iLastPg, pSeg->iRoot, pSeg->nSize
+  );
+}
+
+/*
+** Make sure connection pDb has a worker snapshot. If it has none,
+** lsmBeginWork() is called and *pbUnlock is set to 1 so that the caller
+** knows to pass it to infoFreeWorker() afterwards. *pbUnlock must be 0 on
+** entry. If pp is not NULL, *pp is set to the worker snapshot.
+*/
+static int infoGetWorker(lsm_db *pDb, Snapshot **pp, int *pbUnlock){
+  int rc = LSM_OK;
+
+  assert( *pbUnlock==0 );
+  if( !pDb->pWorker ){
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    *pbUnlock = 1;
+  }
+  if( pp ) *pp = pDb->pWorker;
+  return rc;
+}
+
+/*
+** Counterpart to infoGetWorker(). If bUnlock is true, call lsmFinishWork().
+** LSM_BUSY is passed in as the running error code.
+** NOTE(review): presumably so that lsmFinishWork() only cleans up and does
+** not write anything out -- confirm against lsmFinishWork().
+*/
+static void infoFreeWorker(lsm_db *pDb, int bUnlock){
+  if( bUnlock ){
+    int rcdummy = LSM_BUSY;
+    lsmFinishWork(pDb, 0, &rcdummy);
+  }
+}
+
+/*
+** Format the level/segment structure of the worker snapshot as text and
+** return it via *pzOut. There is one "{age lhs rhs...}" element per level.
+** Returns LSM_NOMEM if the string ended up with a negative length.
+*/
+int lsmStructList(
+  lsm_db *pDb,                    /* Database handle */
+  char **pzOut                    /* OUT: Nul-terminated string (tcl list) */
+){
+  Level *pTopLevel = 0;           /* Top level of snapshot to report on */
+  int rc = LSM_OK;
+  Level *p;
+  LsmString s;
+  Snapshot *pWorker;              /* Worker snapshot */
+  int bUnlock = 0;
+
+  /* Obtain the worker snapshot */
+  rc = infoGetWorker(pDb, &pWorker, &bUnlock);
+  if( rc!=LSM_OK ) return rc;
+
+  /* Format the contents of the snapshot as text */
+  pTopLevel = lsmDbSnapshotLevel(pWorker);
+  lsmStringInit(&s, pDb->pEnv);
+  for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){
+    int i;
+    lsmStringAppendf(&s, "%s{%d", (s.n ? " " : ""), (int)p->iAge);
+    lsmAppendSegmentList(&s, " ", &p->lhs);
+    /* NOTE(review): the loop bound was damaged in this copy of the patch
+    ** ("inRight"); restored as i<p->nRight to match the aRhs[] indexing --
+    ** confirm against upstream. */
+    for(i=0; rc==LSM_OK && i<p->nRight; i++){
+      lsmAppendSegmentList(&s, " ", &p->aRhs[i]);
+    }
+    lsmStringAppend(&s, "}", 1);
+  }
+  rc = s.n>=0 ? LSM_OK : LSM_NOMEM;
+
+  /* Release the snapshot and return */
+  infoFreeWorker(pDb, bUnlock);
+  *pzOut = s.z;
+  return rc;
+}
+
+/*
+** lsmWalkFreelist() callback used by lsmInfoFreelist(). Appends one
+** "{iBlk iSnapshot}" element to the LsmString passed as pCtx. Always
+** returns 0.
+*/
+static int infoFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
+  LsmString *pStr = (LsmString *)pCtx;
+  lsmStringAppendf(pStr, "%s{%d %lld}", (pStr->n?" ":""), iBlk, iSnapshot);
+  return 0;
+}
+
+/*
+** Format the free-block list as text. On success *pzOut is set to the
+** resulting string. On error the partial string is freed and *pzOut is
+** left unmodified.
+*/
+int lsmInfoFreelist(lsm_db *pDb, char **pzOut){
+  Snapshot *pWorker;              /* Worker snapshot */
+  int bUnlock = 0;
+  LsmString s;
+  int rc;
+
+  /* Obtain the worker snapshot */
+  rc = infoGetWorker(pDb, &pWorker, &bUnlock);
+  if( rc!=LSM_OK ) return rc;
+
+  lsmStringInit(&s, pDb->pEnv);
+  rc = lsmWalkFreelist(pDb, 0, infoFreelistCb, &s);
+  if( rc!=LSM_OK ){
+    lsmFree(pDb->pEnv, s.z);
+  }else{
+    *pzOut = s.z;
+  }
+
+  /* Release the snapshot and return */
+  infoFreeWorker(pDb, bUnlock);
+  return rc;
+}
+
+/*
+** Report the sizes, in KB rounded up, of the "old" and "new" in-memory
+** trees via *pnOldKB and *pnNewKB. Always returns LSM_OK.
+*/
+static int infoTreeSize(lsm_db *db, int *pnOldKB, int *pnNewKB){
+  ShmHeader *pShm = db->pShmhdr;
+  TreeHeader *p = &pShm->hdr1;
+
+  /* The following code suffers from two race conditions, as it accesses and
+  ** trusts the contents of shared memory without verifying checksums:
+  **
+  **   * The two values read - TreeHeader.root.nByte and oldroot.nByte - are
+  **     32-bit fields. It is assumed that reading from one of these
+  **     is atomic - that it is not possible to read a partially written
+  **     garbage value. However the two values may be mutually inconsistent.
+  **
+  **   * TreeHeader.iLogOff is a 64-bit value. And lsmCheckpointLogOffset()
+  **     reads a 64-bit value from a snapshot stored in shared memory. It
+  **     is assumed that in each case it is possible to read a partially
+  **     written garbage value. If this occurs, then the value returned
+  **     for the size of the "old" tree may reflect the size of an "old"
+  **     tree that was recently flushed to disk.
+  **
+  ** Given the context in which this function is called (as a result of an
+  ** lsm_info(LSM_INFO_TREE_SIZE) request), neither of these are considered to
+  ** be problems.
+  */
+  *pnNewKB = ((int)p->root.nByte + 1023) / 1024;
+  if( p->iOldShmid ){
+    if( p->iOldLog==lsmCheckpointLogOffset(pShm->aSnap1) ){
+      *pnOldKB = 0;
+    }else{
+      *pnOldKB = ((int)p->oldroot.nByte + 1023) / 1024;
+    }
+  }else{
+    *pnOldKB = 0;
+  }
+
+  return LSM_OK;
+}
+
+/*
+** Retrieve diagnostic information selected by eParam from connection pDb.
+** The type and number of variadic arguments depends on eParam (see each
+** case below). Returns LSM_MISUSE for an unrecognised eParam, otherwise
+** the result of the underlying query.
+*/
+int lsm_info(lsm_db *pDb, int eParam, ...){
+  int rc = LSM_OK;
+  va_list ap;
+  va_start(ap, eParam);
+
+  switch( eParam ){
+    case LSM_INFO_NWRITE: {
+      int *piVal = va_arg(ap, int *);
+      *piVal = lsmFsNWrite(pDb->pFS);
+      break;
+    }
+
+    case LSM_INFO_NREAD: {
+      int *piVal = va_arg(ap, int *);
+      *piVal = lsmFsNRead(pDb->pFS);
+      break;
+    }
+
+    case LSM_INFO_DB_STRUCTURE: {
+      char **pzVal = va_arg(ap, char **);
+      rc = lsmStructList(pDb, pzVal);
+      break;
+    }
+
+    case LSM_INFO_ARRAY_STRUCTURE: {
+      Pgno pgno = va_arg(ap, Pgno);
+      char **pzVal = va_arg(ap, char **);
+      rc = lsmInfoArrayStructure(pDb, 0, pgno, pzVal);
+      break;
+    }
+
+    case LSM_INFO_ARRAY_PAGES: {
+      Pgno pgno = va_arg(ap, Pgno);
+      char **pzVal = va_arg(ap, char **);
+      rc = lsmInfoArrayPages(pDb, pgno, pzVal);
+      break;
+    }
+
+    case LSM_INFO_PAGE_HEX_DUMP:
+    case LSM_INFO_PAGE_ASCII_DUMP: {
+      Pgno pgno = va_arg(ap, Pgno);
+      char **pzVal = va_arg(ap, char **);
+      int bUnlock = 0;
+      rc = infoGetWorker(pDb, 0, &bUnlock);
+      if( rc==LSM_OK ){
+        int bHex = (eParam==LSM_INFO_PAGE_HEX_DUMP);
+        rc = lsmInfoPageDump(pDb, pgno, bHex, pzVal);
+      }
+      infoFreeWorker(pDb, bUnlock);
+      break;
+    }
+
+    case LSM_INFO_LOG_STRUCTURE: {
+      char **pzVal = va_arg(ap, char **);
+      rc = lsmInfoLogStructure(pDb, pzVal);
+      break;
+    }
+
+    case LSM_INFO_FREELIST: {
+      char **pzVal = va_arg(ap, char **);
+      rc = lsmInfoFreelist(pDb, pzVal);
+      break;
+    }
+
+    case LSM_INFO_CHECKPOINT_SIZE: {
+      int *pnKB = va_arg(ap, int *);
+      rc = lsmCheckpointSize(pDb, pnKB);
+      break;
+    }
+
+    case LSM_INFO_TREE_SIZE: {
+      int *pnOld = va_arg(ap, int *);
+      int *pnNew = va_arg(ap, int *);
+      rc = infoTreeSize(pDb, pnOld, pnNew);
+      break;
+    }
+
+    case LSM_INFO_COMPRESSION_ID: {
+      unsigned int *piOut = va_arg(ap, unsigned int *);
+      if( pDb->pClient ){
+        *piOut = pDb->pClient->iCmpId;
+      }else{
+        rc = lsmInfoCompressionId(pDb, piOut);
+      }
+      break;
+    }
+
+    default:
+      rc = LSM_MISUSE;
+      break;
+  }
+
+  va_end(ap);
+  return rc;
+}
+
+/*
+** Common implementation of lsm_insert(), lsm_delete() and
+** lsm_delete_range().
+**
+** If bDeleteRange is zero, the key/value pair is written to the log and
+** inserted into the in-memory tree (nVal==-1 denotes a delete). Otherwise
+** pKey/nKey and pVal/nVal are handed to lsmTreeDelete(); writing a log
+** record for that case is still a TODO.
+**
+** If no transaction is open when this function is called, one is opened
+** here and committed (or, on error, rolled back) before returning.
+*/
+static int doWriteOp(
+  lsm_db *pDb,
+  int bDeleteRange,
+  const void *pKey, int nKey,     /* Key to write or delete */
+  const void *pVal, int nVal      /* Value to write. Or nVal==-1 for a delete */
+){
+  int rc = LSM_OK;                /* Return code */
+  int bCommit = 0;                /* True to commit before returning */
+
+  if( pDb->nTransOpen==0 ){
+    bCommit = 1;
+    rc = lsm_begin(pDb, 1);
+  }
+
+  if( rc==LSM_OK ){
+    if( bDeleteRange==0 ){
+      rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
+    }else{
+      /* TODO */
+    }
+  }
+
+  lsmSortedSaveTreeCursors(pDb);
+
+  if( rc==LSM_OK ){
+    int pgsz = lsmFsPageSize(pDb->pFS);
+    int nQuant = LSM_AUTOWORK_QUANT * pgsz;
+    int nBefore;
+    int nAfter;
+    int nDiff;
+
+    if( nQuant>pDb->nTreeLimit ){
+      /* nQuant is used as a divisor below, so it must never become zero.
+      ** lsm_config(LSM_CONFIG_AUTOFLUSH) accepts 0, which makes nTreeLimit
+      ** zero. Clamp the quantum to at least one page in that case. */
+      nQuant = LSM_MAX(pDb->nTreeLimit, pgsz);
+    }
+
+    nBefore = lsmTreeSize(pDb);
+    if( bDeleteRange ){
+      rc = lsmTreeDelete(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
+    }else{
+      rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
+    }
+
+    /* Do auto-work in proportion to the number of nQuant-sized boundaries
+    ** the tree size crossed as a result of this write. */
+    nAfter = lsmTreeSize(pDb);
+    nDiff = (nAfter/nQuant) - (nBefore/nQuant);
+    if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
+      rc = lsmSortedAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
+    }
+  }
+
+  /* If a transaction was opened at the start of this function, commit it.
+  ** Or, if an error has occurred, roll it back.  */
+  if( bCommit ){
+    if( rc==LSM_OK ){
+      rc = lsm_commit(pDb, 0);
+    }else{
+      lsm_rollback(pDb, 0);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Write a new value into the database.
+*/
+int lsm_insert(
+  lsm_db *db,                     /* Database connection */
+  const void *pKey, int nKey,     /* Key to write or delete */
+  const void *pVal, int nVal      /* Value to write. Or nVal==-1 for a delete */
+){
+  return doWriteOp(db, 0, pKey, nKey, pVal, nVal);
+}
+
+/*
+** Delete a value from the database.
+*/
+int lsm_delete(lsm_db *db, const void *pKey, int nKey){
+  /* A delete is written as a record with a NULL value and nVal==-1 */
+  const void *pNoVal = 0;
+  return doWriteOp(db, 0, pKey, nKey, pNoVal, -1);
+}
+
+/*
+** Remove a range of keys, bounded by pKey1 and pKey2, from the database.
+** Nothing is done, and LSM_OK is returned, unless the connection's xCmp()
+** comparison function reports that pKey1 sorts before pKey2.
+*/
+int lsm_delete_range(
+  lsm_db *db,                     /* Database handle */
+  const void *pKey1, int nKey1,   /* Lower bound of range to delete */
+  const void *pKey2, int nKey2    /* Upper bound of range to delete */
+){
+  int bOrdered = db->xCmp((void *)pKey1, nKey1, (void *)pKey2, nKey2)<0;
+  if( !bOrdered ) return LSM_OK;
+  return doWriteOp(db, 1, pKey1, nKey1, pKey2, nKey2);
+}
+
+/*
+** Create a new cursor on connection pDb and return it via *ppCsr.
+**
+** A read transaction is opened first if the connection does not already
+** have one: a read-only transaction when there is no shared-memory header,
+** otherwise an ordinary read transaction when pDb->iReader is negative.
+*/
+int lsm_csr_open(lsm_db *pDb, lsm_cursor **ppCsr){
+  MultiCursor *pNew = 0;          /* Cursor under construction */
+  int rc = LSM_OK;                /* Return code */
+  int bNoShm;                     /* True if no shared-memory header */
+
+  assert_db_state(pDb);
+
+  /* Step 1: make sure some kind of read transaction is open. */
+  bNoShm = (pDb->pShmhdr==0);
+  if( bNoShm ){
+    assert( pDb->bReadonly );
+    rc = lsmBeginRoTrans(pDb);
+  }else if( pDb->iReader<0 ){
+    rc = lsmBeginReadTrans(pDb);
+  }
+
+  /* Step 2: build the multi-cursor object itself. */
+  if( rc==LSM_OK ) rc = lsmMCursorNew(pDb, &pNew);
+
+  /* Step 3: on failure, discard whatever was allocated and give the client
+  ** snapshot a chance to be released.  */
+  if( rc!=LSM_OK ){
+    lsmMCursorClose(pNew, 0);
+    dbReleaseClientSnapshot(pDb);
+  }
+
+  assert_db_state(pDb);
+  *ppCsr = (lsm_cursor *)pNew;
+  return rc;
+}
+
+/*
+** Release a cursor obtained from lsm_csr_open(). A NULL argument is a
+** harmless no-op. Always returns LSM_OK.
+*/
+int lsm_csr_close(lsm_cursor *p){
+  MultiCursor *pCsr = (MultiCursor *)p;
+  lsm_db *pDb;
+
+  if( pCsr==0 ) return LSM_OK;
+
+  pDb = lsmMCursorDb(pCsr);
+  assert_db_state(pDb);
+  lsmMCursorClose(pCsr, 1);
+  dbReleaseClientSnapshot(pDb);
+  assert_db_state(pDb);
+  return LSM_OK;
+}
+
+/*
+** Attempt to seek the cursor to the database entry specified by pKey/nKey.
+** If an error occurs (e.g. an OOM or IO error), return an LSM error code.
+** Otherwise, return LSM_OK.
+*/
+int lsm_csr_seek(lsm_cursor *pCsr, const void *pKey, int nKey, int eSeek){
+  return lsmMCursorSeek((MultiCursor *)pCsr, 0, (void *)pKey, nKey, eSeek);
+}
+
+/*
+** The following cursor methods are thin wrappers: each casts the public
+** lsm_cursor handle to the internal MultiCursor type and forwards the call
+** to the lsmMCursorXXX() function of the same name, returning its result.
+*/
+int lsm_csr_next(lsm_cursor *pCsr){
+  return lsmMCursorNext((MultiCursor *)pCsr);
+}
+
+int lsm_csr_prev(lsm_cursor *pCsr){
+  return lsmMCursorPrev((MultiCursor *)pCsr);
+}
+
+int lsm_csr_first(lsm_cursor *pCsr){
+  return lsmMCursorFirst((MultiCursor *)pCsr);
+}
+
+int lsm_csr_last(lsm_cursor *pCsr){
+  return lsmMCursorLast((MultiCursor *)pCsr);
+}
+
+int lsm_csr_valid(lsm_cursor *pCsr){
+  return lsmMCursorValid((MultiCursor *)pCsr);
+}
+
+int lsm_csr_key(lsm_cursor *pCsr, const void **ppKey, int *pnKey){
+  return lsmMCursorKey((MultiCursor *)pCsr, (void **)ppKey, pnKey);
+}
+
+int lsm_csr_value(lsm_cursor *pCsr, const void **ppVal, int *pnVal){
+  return lsmMCursorValue((MultiCursor *)pCsr, (void **)ppVal, pnVal);
+}
+
+/*
+** Register log callback xLog, and the context pointer passed as its first
+** argument, with connection pDb. Any previous registration is replaced.
+*/
+void lsm_config_log(
+  lsm_db *pDb,
+  void (*xLog)(void *, int, const char *),
+  void *pCtx
+){
+  pDb->xLog = xLog;
+  pDb->pLogCtx = pCtx;
+}
+
+/*
+** Register work-hook callback xWork, and its context pointer, with
+** connection pDb. Any previous registration is replaced.
+*/
+void lsm_config_work_hook(
+  lsm_db *pDb,
+  void (*xWork)(lsm_db *, void *),
+  void *pCtx
+){
+  pDb->xWork = xWork;
+  pDb->pWorkCtx = pCtx;
+}
+
+/*
+** If a log callback is registered with pDb, format the printf-style
+** message and pass it, along with rc, to the callback. Otherwise this
+** function is a no-op.
+*/
+void lsmLogMessage(lsm_db *pDb, int rc, const char *zFormat, ...){
+  if( pDb->xLog ){
+    LsmString s;
+    va_list ap, ap2;
+    lsmStringInit(&s, pDb->pEnv);
+    /* lsmStringVAppendf() takes two independent copies of the argument
+    ** list. NOTE(review): presumably one for sizing and one for the actual
+    ** formatting pass -- confirm against lsmStringVAppendf(). */
+    va_start(ap, zFormat);
+    va_start(ap2, zFormat);
+    lsmStringVAppendf(&s, zFormat, ap, ap2);
+    va_end(ap);
+    va_end(ap2);
+    pDb->xLog(pDb->pLogCtx, rc, s.z);
+    lsmStringClear(&s);
+  }
+}
+
+/*
+** Open nested write transactions until iLevel are open. A negative iLevel
+** means "one more than currently open". If iLevel transactions or more are
+** already open this is a no-op.
+**
+** Returns LSM_READONLY on a read-only connection, LSM_NOMEM if the
+** savepoint array cannot be grown, or any error from lsmBeginWriteTrans().
+**
+** NOTE(review): the array-growth condition, the declarations of aNew/nByte,
+** the lsmRealloc() call and the savepoint loop header were damaged in this
+** copy of the patch and have been restored from the surviving fragments --
+** confirm against upstream.
+*/
+int lsm_begin(lsm_db *pDb, int iLevel){
+  int rc;
+
+  assert_db_state( pDb );
+  rc = (pDb->bReadonly ? LSM_READONLY : LSM_OK);
+
+  /* A value less than zero means open one more transaction. */
+  if( iLevel<0 ) iLevel = pDb->nTransOpen + 1;
+  if( iLevel>pDb->nTransOpen ){
+    int i;
+
+    /* Extend the pDb->aTrans[] array if required. */
+    if( rc==LSM_OK && pDb->nTransAlloc<iLevel ){
+      TransMark *aNew;            /* New allocation */
+      int nByte = sizeof(TransMark) * (iLevel+1);
+      aNew = (TransMark *)lsmRealloc(pDb->pEnv, pDb->aTrans, nByte);
+      if( !aNew ){
+        rc = LSM_NOMEM;
+      }else{
+        /* Zero only the newly added tail of the array */
+        nByte = sizeof(TransMark) * (iLevel+1 - pDb->nTransAlloc);
+        memset(&aNew[pDb->nTransAlloc], 0, nByte);
+        pDb->nTransAlloc = iLevel+1;
+        pDb->aTrans = aNew;
+      }
+    }
+
+    if( rc==LSM_OK && pDb->nTransOpen==0 ){
+      rc = lsmBeginWriteTrans(pDb);
+    }
+
+    if( rc==LSM_OK ){
+      /* Record a tree mark and a log position for each new level so that
+      ** lsm_rollback() can return to it. */
+      for(i=pDb->nTransOpen; i<iLevel; i++){
+        lsmTreeMark(pDb, &pDb->aTrans[i].tree);
+        lsmLogTell(pDb, &pDb->aTrans[i].log);
+      }
+      pDb->nTransOpen = iLevel;
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Commit nested transactions so that only iLevel remain open. A negative
+** iLevel means "close the innermost transaction". When iLevel is 0 the
+** outermost transaction is committed to the log (and the log synced if
+** eSafety is LSM_SAFETY_FULL) before the write transaction is finished.
+**
+** NOTE(review): the "iLevel<pDb->nTransOpen" test was damaged in this copy
+** of the patch and has been restored -- confirm against upstream.
+*/
+int lsm_commit(lsm_db *pDb, int iLevel){
+  int rc = LSM_OK;
+
+  assert_db_state( pDb );
+
+  /* A value less than zero means close the innermost nested transaction. */
+  if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
+
+  if( iLevel<pDb->nTransOpen ){
+    if( iLevel==0 ){
+      /* Commit the transaction to disk. */
+      if( rc==LSM_OK ) rc = lsmLogCommit(pDb);
+      if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
+        rc = lsmFsSyncLog(pDb->pFS);
+      }
+      lsmFinishWriteTrans(pDb, (rc==LSM_OK));
+    }
+    pDb->nTransOpen = iLevel;
+  }
+  dbReleaseClientSnapshot(pDb);
+  return rc;
+}
+
+/*
+** Roll back to nested transaction level iLevel. A negative iLevel means
+** "roll back and close the innermost transaction". The in-memory tree is
+** restored to the mark saved for the level and, unless iLevel is 0, the log
+** is rewound to match. If no transactions remain open afterwards the write
+** transaction is finished. Does nothing if no transaction is open. Always
+** returns LSM_OK.
+*/
+int lsm_rollback(lsm_db *pDb, int iLevel){
+  int rc = LSM_OK;
+  assert_db_state( pDb );
+
+  if( pDb->nTransOpen ){
+    /* A value less than zero means close the innermost nested transaction. */
+    if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
+
+    if( iLevel<=pDb->nTransOpen ){
+      TransMark *pMark = &pDb->aTrans[(iLevel==0 ? 0 : iLevel-1)];
+      lsmTreeRollback(pDb, &pMark->tree);
+      if( iLevel ) lsmLogSeek(pDb, &pMark->log);
+      pDb->nTransOpen = iLevel;
+    }
+
+    if( pDb->nTransOpen==0 ){
+      lsmFinishWriteTrans(pDb, 0);
+    }
+    dbReleaseClientSnapshot(pDb);
+  }
+
+  return rc;
+}
+
+/*
+** Read the user-version field of the database into *piUsr. A read
+** transaction is opened for the purpose if one is not already open.
+** *piUsr is only written if LSM_OK is returned.
+*/
+int lsm_get_user_version(lsm_db *pDb, unsigned int *piUsr){
+  int rc = LSM_OK;                /* Return code */
+
+  /* Open a read transaction if one is not already open. */
+  assert_db_state(pDb);
+  if( pDb->pShmhdr==0 ){
+    assert( pDb->bReadonly );
+    rc = lsmBeginRoTrans(pDb);
+  }else if( pDb->iReader<0 ){
+    rc = lsmBeginReadTrans(pDb);
+  }
+
+  /* Copy the user version out of the tree header. */
+  if( rc==LSM_OK ){
+    *piUsr = pDb->treehdr.iUsrVersion;
+  }
+
+  dbReleaseClientSnapshot(pDb);
+  assert_db_state(pDb);
+  return rc;
+}
+
+/*
+** Set the user-version field of the database to iUsr. If no transaction
+** is open, one is opened here and committed (or rolled back on error)
+** before returning.
+*/
+int lsm_set_user_version(lsm_db *pDb, unsigned int iUsr){
+  int rc = LSM_OK;                /* Return code */
+  int bCommit = 0;                /* True to commit before returning */
+
+  if( pDb->nTransOpen==0 ){
+    bCommit = 1;
+    rc = lsm_begin(pDb, 1);
+  }
+
+  if( rc==LSM_OK ){
+    pDb->treehdr.iUsrVersion = iUsr;
+  }
+
+  /* If a transaction was opened at the start of this function, commit it.
+  ** Or, if an error has occurred, roll it back.  */
+  if( bCommit ){
+    if( rc==LSM_OK ){
+      rc = lsm_commit(pDb, 0);
+    }else{
+      lsm_rollback(pDb, 0);
+    }
+  }
+
+  return rc;
+}
diff --git a/ext/lsm1/lsm_mem.c b/ext/lsm1/lsm_mem.c
new file mode 100644
index 0000000000..13dd9fe312
--- /dev/null
+++ b/ext/lsm1/lsm_mem.c
@@ -0,0 +1,104 @@
+/*
+** 2011-08-18
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Helper routines for memory allocation.
+*/
+#include "lsmInt.h"
+
+/*
+** The following routines are called internally by LSM sub-routines. In
+** this case a valid environment pointer must be supplied. Each forwards to
+** the corresponding method (xMalloc, xFree or xRealloc) of pEnv.
+*/
+void *lsmMalloc(lsm_env *pEnv, size_t N){
+  assert( pEnv );
+  return pEnv->xMalloc(pEnv, N);
+}
+void lsmFree(lsm_env *pEnv, void *p){
+  assert( pEnv );
+  pEnv->xFree(pEnv, p);
+}
+void *lsmRealloc(lsm_env *pEnv, void *p, size_t N){
+  assert( pEnv );
+  return pEnv->xRealloc(pEnv, p, N);
+}
+
+/*
+** Core memory allocation routines for LSM.
+*/
+void *lsm_malloc(lsm_env *pEnv, size_t N){
+  return lsmMalloc(pEnv ? pEnv : lsm_default_env(), N);
+}
+void lsm_free(lsm_env *pEnv, void *p){
+  lsmFree(pEnv ? pEnv : lsm_default_env(), p);
+}
+void *lsm_realloc(lsm_env *pEnv, void *p, size_t N){
+  return lsmRealloc(pEnv ? pEnv : lsm_default_env(), p, N);
+}
+
+/*
+** Allocate N bytes using pEnv and fill them with zeroes. Returns NULL if
+** the allocation fails.
+*/
+void *lsmMallocZero(lsm_env *pEnv, size_t N){
+  void *pRet;
+  assert( pEnv );
+  pRet = lsmMalloc(pEnv, N);
+  if( pRet ) memset(pRet, 0, N);
+  return pRet;
+}
+
+/*
+** Allocate N bytes, but only if *pRc is LSM_OK on entry. If *pRc already
+** holds an error, return NULL without allocating. If the allocation fails,
+** set *pRc to LSM_NOMEM_BKPT and return NULL.
+*/
+void *lsmMallocRc(lsm_env *pEnv, size_t N, int *pRc){
+  void *pRet = 0;
+  if( *pRc==LSM_OK ){
+    pRet = lsmMalloc(pEnv, N);
+    if( pRet==0 ){
+      *pRc = LSM_NOMEM_BKPT;
+    }
+  }
+  return pRet;
+}
+
+/*
+** As lsmMallocRc(), except that the new allocation is zeroed.
+*/
+void *lsmMallocZeroRc(lsm_env *pEnv, size_t N, int *pRc){
+  void *pRet = 0;
+  if( *pRc==LSM_OK ){
+    pRet = lsmMallocZero(pEnv, N);
+    if( pRet==0 ){
+      *pRc = LSM_NOMEM_BKPT;
+    }
+  }
+  return pRet;
+}
+
+/*
+** Resize allocation p to N bytes. If the resize fails, free p and return
+** NULL, so that the caller never has to free the original buffer itself.
+*/
+void *lsmReallocOrFree(lsm_env *pEnv, void *p, size_t N){
+  void *pNew;
+  pNew = lsm_realloc(pEnv, p, N);
+  if( !pNew ) lsm_free(pEnv, p);
+  return pNew;
+}
+
+/*
+** As lsmReallocOrFree(), but driven by the running error code *pRc. If
+** *pRc is already non-zero, p is freed and NULL returned. If the resize
+** fails, *pRc is set to LSM_NOMEM_BKPT.
+*/
+void *lsmReallocOrFreeRc(lsm_env *pEnv, void *p, size_t N, int *pRc){
+  void *pRet = 0;
+  if( *pRc ){
+    lsmFree(pEnv, p);
+  }else{
+    pRet = lsmReallocOrFree(pEnv, p, N);
+    if( !pRet ) *pRc = LSM_NOMEM_BKPT;
+  }
+  return pRet;
+}
+
+/*
+** Return a copy of nul-terminated string zIn in memory obtained from
+** lsmMalloc(), or NULL if the allocation fails. zIn must not be NULL.
+*/
+char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){
+  size_t nByte;                   /* strlen(zIn). size_t avoids narrowing */
+  char *zRet;
+  nByte = strlen(zIn);
+  zRet = lsmMalloc(pEnv, nByte+1);
+  if( zRet ){
+    /* Copy the string including its nul-terminator */
+    memcpy(zRet, zIn, nByte+1);
+  }
+  return zRet;
+}
diff --git a/ext/lsm1/lsm_mutex.c b/ext/lsm1/lsm_mutex.c
new file mode 100644
index 0000000000..cb99b2a61e
--- /dev/null
+++ b/ext/lsm1/lsm_mutex.c
@@ -0,0 +1,88 @@
+/*
+** 2012-01-30
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Mutex functions for LSM.
+*/
+#include "lsmInt.h"
+
+/*
+** Create a new dynamic mutex via the environment's xMutexNew() method and
+** return its result.
+*/
+int lsmMutexNew(lsm_env *pEnv, lsm_mutex **ppNew){
+  int rc = pEnv->xMutexNew(pEnv, ppNew);
+  return rc;
+}
+
+/*
+** Look up static mutex iMutex via the environment's xMutexStatic() method
+** and return its result.
+*/
+int lsmMutexStatic(lsm_env *pEnv, int iMutex, lsm_mutex **ppStatic){
+  int rc = pEnv->xMutexStatic(pEnv, iMutex, ppStatic);
+  return rc;
+}
+
+/*
+** Destroy a mutex obtained from lsmMutexNew(). A NULL mutex is ignored.
+*/
+void lsmMutexDel(lsm_env *pEnv, lsm_mutex *pMutex){
+  if( pMutex==0 ) return;
+  pEnv->xMutexDel(pMutex);
+}
+
+/*
+** Acquire mutex pMutex.
+*/
+void lsmMutexEnter(lsm_env *pEnv, lsm_mutex *pMutex){
+  pEnv->xMutexEnter(pMutex);
+}
+
+/*
+** Non-blocking acquire. Returns zero if the mutex was entered, or non-zero
+** if some other thread already holds it and it was therefore not entered.
+**
+** Every successful call must be paired with a later lsmMutexLeave().
+*/
+int lsmMutexTry(lsm_env *pEnv, lsm_mutex *pMutex){
+  int res = pEnv->xMutexTry(pMutex);
+  return res;
+}
+
+/*
+** Release mutex pMutex.
+*/
+void lsmMutexLeave(lsm_env *pEnv, lsm_mutex *pMutex){
+  pEnv->xMutexLeave(pMutex);
+}
+
+#ifndef NDEBUG
+/*
+** Assert helper: non-zero means the calling thread holds pMutex. If the
+** environment supplies no xMutexHeld() method the answer cannot be
+** determined, in which case non-zero is returned.
+**
+** Only ever called from within assert() statements.
+*/
+int lsmMutexHeld(lsm_env *pEnv, lsm_mutex *pMutex){
+  if( !pEnv->xMutexHeld ) return 1;
+  return pEnv->xMutexHeld(pMutex);
+}
+
+/*
+** Return non-zero if the mutex passed as the second argument is not
+** held by the calling thread, or zero otherwise. If the implementation
+** is not able to tell if the mutex is held by the caller, it should
+** return non-zero.
+**
+** This function is only used as part of assert() statements.
+*/
+int lsmMutexNotHeld(lsm_env *pEnv, lsm_mutex *pMutex){
+  /* With no xMutexNotHeld() method the answer is unknown: report non-zero */
+  if( !pEnv->xMutexNotHeld ) return 1;
+  return pEnv->xMutexNotHeld(pMutex);
+}
+#endif
diff --git a/ext/lsm1/lsm_shared.c b/ext/lsm1/lsm_shared.c
new file mode 100644
index 0000000000..f00338979f
--- /dev/null
+++ b/ext/lsm1/lsm_shared.c
@@ -0,0 +1,1970 @@
+/*
+** 2012-01-23
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Utilities used to help multiple LSM clients to coexist within the
+** same process space.
+*/
+#include "lsmInt.h"
+
+/*
+** Process-wide state. Every global variable used by this file lives in the
+** single gShared instance declared below.
+**
+** pDatabase:
+**   Head of the linked list of all Database objects allocated by this
+**   process. The list may only be traversed while holding the global
+**   mutex (see enterGlobalMutex() and leaveGlobalMutex()).
+*/
+struct SharedData {
+  Database *pDatabase;            /* Linked list of all Database objects */
+};
+static struct SharedData gShared;
+
+/*
+** Database structure. There is one such structure for each distinct
+** database accessed by this process. They are stored in the singly linked
+** list starting at global variable gShared.pDatabase. Database objects are
+** reference counted. Once the number of connections to the associated
+** database drops to zero, they are removed from the linked list and deleted.
+**
+** pFile:
+**   In multi-process mode, this file descriptor is used to obtain locks
+**   and to access shared-memory. In single process mode, its only job is
+**   to hold the exclusive lock on the file.
+**
+*/
+struct Database {
+  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
+  char *zName;                    /* Canonical path to database file */
+  int nName;                      /* strlen(zName) */
+  int nDbRef;                     /* Number of associated lsm_db handles */
+  Database *pDbNext;              /* Next Database structure in global list */
+
+  /* Protected by the local mutex (pClientMutex) */
+  int bReadonly;                  /* True if Database.pFile is read-only */
+  int bMultiProc;                 /* True if running in multi-process mode */
+  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
+  LsmFile *pLsmFile;              /* List of deferred closes */
+  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
+  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
+  void **apShmChunk;              /* Array of "shared" memory regions */
+  lsm_db *pConn;                  /* List of connections to this db. */
+};
+
+/*
+** Functions to enter and leave the global mutex. This mutex is used
+** to protect the global linked-list headed at gShared.pDatabase.
+**
+** enterGlobalMutex() returns the result of looking up the static mutex and
+** only enters it if that lookup succeeds. leaveGlobalMutex() ignores the
+** lookup result.
+** NOTE(review): presumably the lookup cannot fail once enterGlobalMutex()
+** has succeeded -- confirm.
+*/
+static int enterGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  if( rc==LSM_OK ) lsmMutexEnter(pEnv, p);
+  return rc;
+}
+static void leaveGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  lsmMutexLeave(pEnv, p);
+}
+
+#ifdef LSM_DEBUG
+/* Return the result of lsmMutexHeld() for the global mutex. */
+static int holdingGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  return lsmMutexHeld(pEnv, p);
+}
+#endif
+
+#if 0
+static void assertNotInFreelist(Freelist *p, int iBlk){
+  int i;
+  for(i=0; i<p->nEntry; i++){
+    assert( p->aEntry[i].iBlk!=iBlk );
+  }
+}
+#else
+# define assertNotInFreelist(x,y)
+#endif
+
+/*
+** Append an entry to the free-list. If (iId==-1), this is a delete.
+**
+** The list written to is db->pFreelist if db->bUseFreelist is set, or the
+** free-list of the worker snapshot otherwise. Entries are kept sorted by
+** block number. If an entry for iBlk already exists its iId is overwritten,
+** otherwise a new entry is inserted at the sorted position.
+**
+** Returns LSM_OK, or LSM_NOMEM_BKPT if the array cannot be grown.
+**
+** NOTE(review): the "i<p->nEntry" bounds in this function were damaged in
+** this copy of the patch ("inEntry") and have been restored -- confirm
+** against upstream.
+*/
+int freelistAppend(lsm_db *db, int iBlk, i64 iId){
+  lsm_env *pEnv = db->pEnv;
+  Freelist *p;
+  int i;
+
+  assert( iId==-1 || iId>=0 );
+  p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist;
+
+  /* Extend the space allocated for the freelist, if required */
+  assert( p->nAlloc>=p->nEntry );
+  if( p->nAlloc==p->nEntry ){
+    int nNew;
+    int nByte;
+    FreelistEntry *aNew;
+
+    /* Start at 4 entries, then double on each growth */
+    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);
+    nByte = sizeof(FreelistEntry) * nNew;
+    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte);
+    if( !aNew ) return LSM_NOMEM_BKPT;
+    p->nAlloc = nNew;
+    p->aEntry = aNew;
+  }
+
+  /* Find the first entry with a block number >= iBlk */
+  for(i=0; i<p->nEntry; i++){
+    assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk );
+    if( p->aEntry[i].iBlk>=iBlk ) break;
+  }
+
+  if( i<p->nEntry && p->aEntry[i].iBlk==iBlk ){
+    /* Clobber an existing entry */
+    p->aEntry[i].iId = iId;
+  }else{
+    /* Insert a new entry into the list */
+    int nByte = sizeof(FreelistEntry)*(p->nEntry-i);
+    memmove(&p->aEntry[i+1], &p->aEntry[i], nByte);
+    p->aEntry[i].iBlk = iBlk;
+    p->aEntry[i].iId = iId;
+    p->nEntry++;
+  }
+
+  return LSM_OK;
+}
+
+/*
+** This function frees all resources held by the Database structure passed
+** as the only argument.
+*/ +static void freeDatabase(lsm_env *pEnv, Database *p){ + assert( holdingGlobalMutex(pEnv) ); + if( p ){ + /* Free the mutexes */ + lsmMutexDel(pEnv, p->pClientMutex); + + if( p->pFile ){ + lsmEnvClose(pEnv, p->pFile); + } + + /* Free the array of shm pointers */ + lsmFree(pEnv, p->apShmChunk); + + /* Free the memory allocated for the Database struct itself */ + lsmFree(pEnv, p); + } +} + +typedef struct DbTruncateCtx DbTruncateCtx; +struct DbTruncateCtx { + int nBlock; + i64 iInUse; +}; + +static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){ + DbTruncateCtx *p = (DbTruncateCtx *)pCtx; + if( iBlk!=p->nBlock || (p->iInUse>=0 && iSnapshot>=p->iInUse) ) return 1; + p->nBlock--; + return 0; +} + +static int dbTruncate(lsm_db *pDb, i64 iInUse){ + int rc = LSM_OK; +#if 0 + int i; + DbTruncateCtx ctx; + + assert( pDb->pWorker ); + ctx.nBlock = pDb->pWorker->nBlock; + ctx.iInUse = iInUse; + + rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx); + for(i=ctx.nBlock+1; rc==LSM_OK && i<=pDb->pWorker->nBlock; i++){ + rc = freelistAppend(pDb, i, -1); + } + + if( rc==LSM_OK ){ +#ifdef LSM_LOG_FREELIST + if( ctx.nBlock!=pDb->pWorker->nBlock ){ + lsmLogMessage(pDb, 0, + "dbTruncate(): truncated db to %d blocks",ctx.nBlock + ); + } +#endif + pDb->pWorker->nBlock = ctx.nBlock; + } +#endif + return rc; +} + + +/* +** This function is called during database shutdown (when the number of +** connections drops from one to zero). It truncates the database file +** to as small a size as possible without truncating away any blocks that +** contain data. +*/ +static int dbTruncateFile(lsm_db *pDb){ + int rc; + + assert( pDb->pWorker==0 ); + assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) ); + rc = lsmCheckpointLoadWorker(pDb); + + if( rc==LSM_OK ){ + DbTruncateCtx ctx; + + /* Walk the database free-block-list in reverse order. Set ctx.nBlock + ** to the block number of the last block in the database that actually + ** contains data. 
*/ + ctx.nBlock = pDb->pWorker->nBlock; + ctx.iInUse = -1; + rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx); + + /* If the last block that contains data is not already the last block in + ** the database file, truncate the database file so that it is. */ + if( rc==LSM_OK && ctx.nBlock!=pDb->pWorker->nBlock ){ + rc = lsmFsTruncateDb( + pDb->pFS, (i64)ctx.nBlock*lsmFsBlockSize(pDb->pFS) + ); + } + } + + lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); + pDb->pWorker = 0; + return rc; +} + +static void doDbDisconnect(lsm_db *pDb){ + int rc; + + if( pDb->bReadonly ){ + lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0); + }else{ + /* Block for an exclusive lock on DMS1. This lock serializes all calls + ** to doDbConnect() and doDbDisconnect() across all processes. */ + rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); + if( rc==LSM_OK ){ + + /* Try an exclusive lock on DMS2. If successful, this is the last + ** connection to the database. In this case flush the contents of the + ** in-memory tree to disk and write a checkpoint. */ + rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL); + if( rc==LSM_OK ){ + rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL); + } + if( rc==LSM_OK ){ + int bReadonly = 0; /* True if there exist read-only conns. */ + + /* Flush the in-memory tree, if required. If there is data to flush, + ** this will create a new client snapshot in Database.pClient. The + ** checkpoint (serialization) of this snapshot may be written to disk + ** by the following block. + ** + ** There is no need to take a WRITER lock here. That there are no + ** other locks on DMS2 guarantees that there are no other read-write + ** connections at this time (and the lock on DMS1 guarantees that + ** no new ones may appear). + */ + rc = lsmTreeLoadHeader(pDb, 0); + if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){ + rc = lsmFlushTreeToDisk(pDb); + } + + /* Now check if there are any read-only connections. 
If there are, + ** then do not truncate the db file or unlink the shared-memory + ** region. */ + if( rc==LSM_OK ){ + rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL); + if( rc==LSM_BUSY ){ + bReadonly = 1; + rc = LSM_OK; + } + } + + /* Write a checkpoint to disk. */ + if( rc==LSM_OK ){ + rc = lsmCheckpointWrite(pDb, (bReadonly==0), 0); + } + + /* If the checkpoint was written successfully, delete the log file + ** and, if possible, truncate the database file. */ + if( rc==LSM_OK ){ + int bRotrans = 0; + Database *p = pDb->pDatabase; + + /* The log file may only be deleted if there are no clients + ** read-only clients running rotrans transactions. */ + rc = lsmDetectRoTrans(pDb, &bRotrans); + if( rc==LSM_OK && bRotrans==0 ){ + lsmFsCloseAndDeleteLog(pDb->pFS); + } + + /* The database may only be truncated if there exist no read-only + ** clients - either connected or running rotrans transactions. */ + if( bReadonly==0 && bRotrans==0 ){ + dbTruncateFile(pDb); + if( p->pFile && p->bMultiProc ){ + lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1); + } + } + } + } + } + + if( pDb->iRwclient>=0 ){ + lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0); + pDb->iRwclient = -1; + } + + lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0); + lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + } + pDb->pShmhdr = 0; +} + +static int doDbConnect(lsm_db *pDb){ + const int nUsMax = 100000; /* Max value for nUs */ + int nUs = 1000; /* us to wait between DMS1 attempts */ + int rc; + + /* Obtain a pointer to the shared-memory header */ + assert( pDb->pShmhdr==0 ); + assert( pDb->bReadonly==0 ); + rc = lsmShmCacheChunks(pDb, 1); + if( rc!=LSM_OK ) return rc; + pDb->pShmhdr = (ShmHeader *)pDb->apShm[0]; + + /* Block for an exclusive lock on DMS1. This lock serializes all calls + ** to doDbConnect() and doDbDisconnect() across all processes. 
*/ + while( 1 ){ + rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); + if( rc!=LSM_BUSY ) break; + lsmEnvSleep(pDb->pEnv, nUs); + nUs = nUs * 2; + if( nUs>nUsMax ) nUs = nUsMax; + } + if( rc!=LSM_OK ){ + pDb->pShmhdr = 0; + return rc; + } + + /* Try an exclusive lock on DMS2/DMS3. If successful, this is the first + ** and only connection to the database. In this case initialize the + ** shared-memory and run log file recovery. */ + assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 ); + rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL); + if( rc==LSM_OK ){ + memset(pDb->pShmhdr, 0, sizeof(ShmHeader)); + rc = lsmCheckpointRecover(pDb); + if( rc==LSM_OK ){ + rc = lsmLogRecover(pDb); + } + if( rc==LSM_OK ){ + ShmHeader *pShm = pDb->pShmhdr; + pShm->aReader[0].iLsmId = lsmCheckpointId(pShm->aSnap1, 0); + pShm->aReader[0].iTreeId = pDb->treehdr.iUsedShmid; + } + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + + /* Take a shared lock on DMS2. In multi-process mode this lock "cannot" + ** fail, as connections may only hold an exclusive lock on DMS2 if they + ** first hold an exclusive lock on DMS1. And this connection is currently + ** holding the exclusive lock on DSM1. + ** + ** However, if some other connection has the database open in single-process + ** mode, this operation will fail. In this case, return the error to the + ** caller - the attempt to connect to the db has failed. + */ + if( rc==LSM_OK ){ + rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0); + } + + /* If anything went wrong, unlock DMS2. Otherwise, try to take an exclusive + ** lock on one of the LSM_LOCK_RWCLIENT() locks. Unlock DMS1 in any case. 
*/ + if( rc!=LSM_OK ){ + pDb->pShmhdr = 0; + }else{ + int i; + for(i=0; iiRwclient = i; + if( rc2!=LSM_BUSY ){ + rc = rc2; + break; + } + } + } + lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + + return rc; +} + +static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){ + int rc; + + rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile); + if( rc==LSM_IOERR && bRoOk ){ + rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile); + p->bReadonly = 1; + } + + return rc; +} + +/* +** Return a reference to the shared Database handle for the database +** identified by canonical path zName. If this is the first connection to +** the named database, a new Database object is allocated. Otherwise, a +** pointer to an existing object is returned. +** +** If successful, *ppDatabase is set to point to the shared Database +** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL +** and and LSM error code returned. +** +** Each successful call to this function should be (eventually) matched +** by a call to lsmDbDatabaseRelease(). +*/ +int lsmDbDatabaseConnect( + lsm_db *pDb, /* Database handle */ + const char *zName /* Full-path to db file */ +){ + lsm_env *pEnv = pDb->pEnv; + int rc; /* Return code */ + Database *p = 0; /* Pointer returned via *ppDatabase */ + int nName = lsmStrlen(zName); + + assert( pDb->pDatabase==0 ); + rc = enterGlobalMutex(pEnv); + if( rc==LSM_OK ){ + + /* Search the global list for an existing object. TODO: Need something + ** better than the memcmp() below to figure out if a given Database + ** object represents the requested file. */ + for(p=gShared.pDatabase; p; p=p->pDbNext){ + if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break; + } + + /* If no suitable Database object was found, allocate a new one. */ + if( p==0 ){ + p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc); + + /* If the allocation was successful, fill in other fields and + ** allocate the client mutex. 
*/ + if( rc==LSM_OK ){ + p->bMultiProc = pDb->bMultiProc; + p->zName = (char *)&p[1]; + p->nName = nName; + memcpy((void *)p->zName, zName, nName+1); + rc = lsmMutexNew(pEnv, &p->pClientMutex); + } + + /* If nothing has gone wrong so far, open the shared fd. And if that + ** succeeds and this connection requested single-process mode, + ** attempt to take the exclusive lock on DMS2. */ + if( rc==LSM_OK ){ + int bReadonly = (pDb->bReadonly && pDb->bMultiProc); + rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly); + } + + if( rc==LSM_OK && p->bMultiProc==0 ){ + assert( p->bReadonly==0 ); + rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL); + } + + if( rc==LSM_OK ){ + p->pDbNext = gShared.pDatabase; + gShared.pDatabase = p; + }else{ + freeDatabase(pEnv, p); + p = 0; + } + } + + if( p ){ + p->nDbRef++; + } + leaveGlobalMutex(pEnv); + + if( p ){ + lsmMutexEnter(pDb->pEnv, p->pClientMutex); + pDb->pNext = p->pConn; + p->pConn = pDb; + lsmMutexLeave(pDb->pEnv, p->pClientMutex); + } + } + + pDb->pDatabase = p; + if( rc==LSM_OK ){ + assert( p ); + rc = lsmFsOpen(pDb, zName, p->bReadonly); + } + + /* If the db handle is read-write, then connect to the system now. Run + ** recovery as necessary. Or, if this is a read-only database handle, + ** defer attempting to connect to the system until a read-transaction + ** is opened. 
*/ + if( pDb->bReadonly==0 ){ + if( rc==LSM_OK ){ + rc = lsmFsConfigure(pDb); + } + if( rc==LSM_OK ){ + rc = doDbConnect(pDb); + } + } + + return rc; +} + +static void dbDeferClose(lsm_db *pDb){ + if( pDb->pFS ){ + LsmFile *pLsmFile; + Database *p = pDb->pDatabase; + pLsmFile = lsmFsDeferClose(pDb->pFS); + pLsmFile->pNext = p->pLsmFile; + p->pLsmFile = pLsmFile; + } +} + +LsmFile *lsmDbRecycleFd(lsm_db *db){ + LsmFile *pRet; + Database *p = db->pDatabase; + lsmMutexEnter(db->pEnv, p->pClientMutex); + if( (pRet = p->pLsmFile)!=0 ){ + p->pLsmFile = pRet->pNext; + } + lsmMutexLeave(db->pEnv, p->pClientMutex); + return pRet; +} + +/* +** Release a reference to a Database object obtained from +** lsmDbDatabaseConnect(). There should be exactly one call to this function +** for each successful call to Find(). +*/ +void lsmDbDatabaseRelease(lsm_db *pDb){ + Database *p = pDb->pDatabase; + if( p ){ + lsm_db **ppDb; + + if( pDb->pShmhdr ){ + doDbDisconnect(pDb); + } + + lsmMutexEnter(pDb->pEnv, p->pClientMutex); + for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext)); + *ppDb = pDb->pNext; + dbDeferClose(pDb); + lsmMutexLeave(pDb->pEnv, p->pClientMutex); + + enterGlobalMutex(pDb->pEnv); + p->nDbRef--; + if( p->nDbRef==0 ){ + LsmFile *pIter; + LsmFile *pNext; + Database **pp; + + /* Remove the Database structure from the linked list. 
*/ + for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext)); + *pp = p->pDbNext; + + /* If they were allocated from the heap, free the shared memory chunks */ + if( p->bMultiProc==0 ){ + int i; + for(i=0; inShmChunk; i++){ + lsmFree(pDb->pEnv, p->apShmChunk[i]); + } + } + + /* Close any outstanding file descriptors */ + for(pIter=p->pLsmFile; pIter; pIter=pNext){ + pNext = pIter->pNext; + lsmEnvClose(pDb->pEnv, pIter->pFile); + lsmFree(pDb->pEnv, pIter); + } + freeDatabase(pDb->pEnv, p); + } + leaveGlobalMutex(pDb->pEnv); + } +} + +Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){ + return pSnapshot->pLevel; +} + +void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){ + pSnap->pLevel = pLevel; +} + +/* TODO: Shuffle things around to get rid of this */ +static int firstSnapshotInUse(lsm_db *, i64 *); + +/* +** Context object used by the lsmWalkFreelist() utility. +*/ +typedef struct WalkFreelistCtx WalkFreelistCtx; +struct WalkFreelistCtx { + lsm_db *pDb; + int bReverse; + Freelist *pFreelist; + int iFree; + int (*xUsr)(void *, int, i64); /* User callback function */ + void *pUsrctx; /* User callback context */ + int bDone; /* Set to true after xUsr() returns true */ +}; + +/* +** Callback used by lsmWalkFreelist(). +*/ +static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){ + WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx; + const int iDir = (p->bReverse ? 
-1 : 1); + Freelist *pFree = p->pFreelist; + + assert( p->bDone==0 ); + if( pFree ){ + while( (p->iFree < pFree->nEntry) && p->iFree>=0 ){ + FreelistEntry *pEntry = &pFree->aEntry[p->iFree]; + if( (p->bReverse==0 && pEntry->iBlk>iBlk) + || (p->bReverse!=0 && pEntry->iBlkiFree += iDir; + if( pEntry->iId>=0 + && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) + ){ + p->bDone = 1; + return 1; + } + if( pEntry->iBlk==iBlk ) return 0; + } + } + } + + if( p->xUsr(p->pUsrctx, iBlk, iSnapshot) ){ + p->bDone = 1; + return 1; + } + return 0; +} + +/* +** The database handle passed as the first argument must be the worker +** connection. This function iterates through the contents of the current +** free block list, invoking the supplied callback once for each list +** element. +** +** The difference between this function and lsmSortedWalkFreelist() is +** that lsmSortedWalkFreelist() only considers those free-list elements +** stored within the LSM. This function also merges in any in-memory +** elements. +*/ +int lsmWalkFreelist( + lsm_db *pDb, /* Database handle (must be worker) */ + int bReverse, /* True to iterate from largest to smallest */ + int (*x)(void *, int, i64), /* Callback function */ + void *pCtx /* First argument to pass to callback */ +){ + const int iDir = (bReverse ? 
-1 : 1); + int rc; + int iCtx; + + WalkFreelistCtx ctx[2]; + + ctx[0].pDb = pDb; + ctx[0].bReverse = bReverse; + ctx[0].pFreelist = &pDb->pWorker->freelist; + if( ctx[0].pFreelist && bReverse ){ + ctx[0].iFree = ctx[0].pFreelist->nEntry-1; + }else{ + ctx[0].iFree = 0; + } + ctx[0].xUsr = walkFreelistCb; + ctx[0].pUsrctx = (void *)&ctx[1]; + ctx[0].bDone = 0; + + ctx[1].pDb = pDb; + ctx[1].bReverse = bReverse; + ctx[1].pFreelist = pDb->pFreelist; + if( ctx[1].pFreelist && bReverse ){ + ctx[1].iFree = ctx[1].pFreelist->nEntry-1; + }else{ + ctx[1].iFree = 0; + } + ctx[1].xUsr = x; + ctx[1].pUsrctx = pCtx; + ctx[1].bDone = 0; + + rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]); + + if( ctx[0].bDone==0 ){ + for(iCtx=0; iCtx<2; iCtx++){ + int i; + WalkFreelistCtx *p = &ctx[iCtx]; + for(i=p->iFree; + p->pFreelist && rc==LSM_OK && ipFreelist->nEntry && i>=0; + i += iDir + ){ + FreelistEntry *pEntry = &p->pFreelist->aEntry[i]; + if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){ + return LSM_OK; + } + } + } + } + + return rc; +} + + +typedef struct FindFreeblockCtx FindFreeblockCtx; +struct FindFreeblockCtx { + i64 iInUse; + int iRet; + int bNotOne; +}; + +static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){ + FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx; + if( iSnapshotiInUse && (iBlk!=1 || p->bNotOne==0) ){ + p->iRet = iBlk; + return 1; + } + return 0; +} + +static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){ + int rc; /* Return code */ + FindFreeblockCtx ctx; /* Context object */ + + ctx.iInUse = iInUse; + ctx.iRet = 0; + ctx.bNotOne = bNotOne; + rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&ctx); + *piRet = ctx.iRet; + + return rc; +} + +/* +** Allocate a new database file block to write data to, either by extending +** the database file or by recycling a free-list entry. The worker snapshot +** must be held in order to call this function. 
+** +** If successful, *piBlk is set to the block number allocated and LSM_OK is +** returned. Otherwise, *piBlk is zeroed and an lsm error code returned. +*/ +int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){ + Snapshot *p = pDb->pWorker; + int iRet = 0; /* Block number of allocated block */ + int rc = LSM_OK; + i64 iInUse = 0; /* Snapshot id still in use */ + i64 iSynced = 0; /* Snapshot id synced to disk */ + + assert( p ); + +#ifdef LSM_LOG_FREELIST + { + static int nCall = 0; + char *zFree = 0; + nCall++; + rc = lsmInfoFreelist(pDb, &zFree); + if( rc!=LSM_OK ) return rc; + lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree); + lsmFree(pDb->pEnv, zFree); + } +#endif + + /* Set iInUse to the smallest snapshot id that is either: + ** + ** * Currently in use by a database client, + ** * May be used by a database client in the future, or + ** * Is the most recently checkpointed snapshot (i.e. the one that will + ** be used following recovery if a failure occurs at this point). + */ + rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0); + if( rc==LSM_OK && iSynced==0 ) iSynced = p->iId; + iInUse = iSynced; + if( rc==LSM_OK && pDb->iReader>=0 ){ + assert( pDb->pClient ); + iInUse = LSM_MIN(iInUse, pDb->pClient->iId); + } + if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse); + +#ifdef LSM_LOG_FREELIST + { + lsmLogMessage(pDb, 0, "lsmBlockAllocate(): " + "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)", + iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0) + ); + } +#endif + + + /* Unless there exists a read-only transaction (which prevents us from + ** recycling any blocks regardless, query the free block list for a + ** suitable block to reuse. + ** + ** It might seem more natural to check for a read-only transaction at + ** the start of this function. However, it is better do wait until after + ** the call to lsmCheckpointSynced() to do so. 
+ */ + if( rc==LSM_OK ){ + int bRotrans; + rc = lsmDetectRoTrans(pDb, &bRotrans); + + if( rc==LSM_OK && bRotrans==0 ){ + rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet); + } + } + + if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){ + iRet = 0; + + }else if( rc==LSM_OK ){ + /* If a block was found in the free block list, use it and remove it from + ** the list. Otherwise, if no suitable block was found, allocate one from + ** the end of the file. */ + if( iRet>0 ){ +#ifdef LSM_LOG_FREELIST + lsmLogMessage(pDb, 0, + "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse); +#endif + rc = freelistAppend(pDb, iRet, -1); + if( rc==LSM_OK ){ + rc = dbTruncate(pDb, iInUse); + } + }else{ + iRet = ++(p->nBlock); +#ifdef LSM_LOG_FREELIST + lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet); +#endif + } + } + + assert( iBefore>0 || iRet>0 || rc!=LSM_OK ); + *piBlk = iRet; + return rc; +} + +/* +** Free a database block. The worker snapshot must be held in order to call +** this function. +** +** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. +** LSM_NOMEM). +*/ +int lsmBlockFree(lsm_db *pDb, int iBlk){ + Snapshot *p = pDb->pWorker; + assert( lsmShmAssertWorker(pDb) ); + +#ifdef LSM_LOG_FREELIST + lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk); +#endif + + return freelistAppend(pDb, iBlk, p->iId); +} + +/* +** Refree a database block. The worker snapshot must be held in order to call +** this function. +** +** Refreeing is required when a block is allocated using lsmBlockAllocate() +** but then not used. This function is used to push the block back onto +** the freelist. Refreeing a block is different from freeing is, as a refreed +** block may be reused immediately. Whereas a freed block can not be reused +** until (at least) after the next checkpoint. 
+*/ +int lsmBlockRefree(lsm_db *pDb, int iBlk){ + int rc = LSM_OK; /* Return code */ + +#ifdef LSM_LOG_FREELIST + lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk); +#endif + + rc = freelistAppend(pDb, iBlk, 0); + return rc; +} + +/* +** If required, copy a database checkpoint from shared memory into the +** database itself. +** +** The WORKER lock must not be held when this is called. This is because +** this function may indirectly call fsync(). And the WORKER lock should +** not be held that long (in case it is required by a client flushing an +** in-memory tree to disk). +*/ +int lsmCheckpointWrite(lsm_db *pDb, int bTruncate, u32 *pnWrite){ + int rc; /* Return Code */ + u32 nWrite = 0; + + assert( pDb->pWorker==0 ); + assert( 1 || pDb->pClient==0 ); + assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) ); + + rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0); + if( rc!=LSM_OK ) return rc; + + rc = lsmCheckpointLoad(pDb, 0); + if( rc==LSM_OK ){ + int nBlock = lsmCheckpointNBlock(pDb->aSnapshot); + ShmHeader *pShm = pDb->pShmhdr; + int bDone = 0; /* True if checkpoint is already stored */ + + /* Check if this checkpoint has already been written to the database + ** file. If so, set variable bDone to true. 
*/ + if( pShm->iMetaPage ){ + MetaPage *pPg; /* Meta page */ + u8 *aData; /* Meta-page data buffer */ + int nData; /* Size of aData[] in bytes */ + i64 iCkpt; /* Id of checkpoint just loaded */ + i64 iDisk; /* Id of checkpoint already stored in db */ + iCkpt = lsmCheckpointId(pDb->aSnapshot, 0); + rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg); + if( rc==LSM_OK ){ + aData = lsmFsMetaPageData(pPg, &nData); + iDisk = lsmCheckpointId((u32 *)aData, 1); + nWrite = lsmCheckpointNWrite((u32 *)aData, 1); + lsmFsMetaPageRelease(pPg); + } + bDone = (iDisk>=iCkpt); + } + + if( rc==LSM_OK && bDone==0 ){ + int iMeta = (pShm->iMetaPage % 2) + 1; + if( pDb->eSafety!=LSM_SAFETY_OFF ){ + rc = lsmFsSyncDb(pDb->pFS, nBlock); + } + if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta); + if( rc==LSM_OK && pDb->eSafety!=LSM_SAFETY_OFF){ + rc = lsmFsSyncDb(pDb->pFS, 0); + } + if( rc==LSM_OK ){ + pShm->iMetaPage = iMeta; + nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite; + } +#ifdef LSM_LOG_WORK + lsmLogMessage(pDb, 0, "finish checkpoint %d", + (int)lsmCheckpointId(pDb->aSnapshot, 0) + ); +#endif + } + + if( rc==LSM_OK && bTruncate && nBlock>0 ){ + rc = lsmFsTruncateDb(pDb->pFS, (i64)nBlock*lsmFsBlockSize(pDb->pFS)); + } + } + + lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0); + if( pnWrite && rc==LSM_OK ) *pnWrite = nWrite; + return rc; +} + +int lsmBeginWork(lsm_db *pDb){ + int rc; + + /* Attempt to take the WORKER lock */ + rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); + + /* Deserialize the current worker snapshot */ + if( rc==LSM_OK ){ + rc = lsmCheckpointLoadWorker(pDb); + } + return rc; +} + +void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){ + if( p ){ + lsmSortedFreeLevel(pEnv, p->pLevel); + lsmFree(pEnv, p->freelist.aEntry); + lsmFree(pEnv, p->redirect.a); + lsmFree(pEnv, p); + } +} + +/* +** Attempt to populate one of the read-lock slots to contain lock values +** iLsm/iShm. Or, if such a slot exists already, this function is a no-op. 
+** +** It is not an error if no slot can be populated because the write-lock +** cannot be obtained. If any other error occurs, return an LSM error code. +** Otherwise, LSM_OK. +** +** This function is called at various points to try to ensure that there +** always exists at least one read-lock slot that can be used by a read-only +** client. And so that, in the usual case, there is an "exact match" available +** whenever a read transaction is opened by any client. At present this +** function is called when: +** +** * A write transaction that called lsmTreeDiscardOld() is committed, and +** * Whenever the working snapshot is updated (i.e. lsmFinishWork()). +*/ +static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){ + int rc = LSM_OK; + ShmHeader *pShm = db->pShmhdr; + int i; + + /* Check if there is already a slot containing the required values. */ + for(i=0; iaReader[i]; + if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK; + } + + /* Iterate through all read-lock slots, attempting to take a write-lock + ** on each of them. If a write-lock succeeds, populate the locked slot + ** with the required values and break out of the loop. */ + for(i=0; rc==LSM_OK && iaReader[i]; + p->iLsmId = iLsm; + p->iTreeId = iShm; + lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); + break; + } + } + + return rc; +} + +/* +** Release the read-lock currently held by connection db. +*/ +int dbReleaseReadlock(lsm_db *db){ + int rc = LSM_OK; + if( db->iReader>=0 ){ + rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0); + db->iReader = -1; + } + db->bRoTrans = 0; + return rc; +} + + +/* +** Argument bFlush is true if the contents of the in-memory tree has just +** been flushed to disk. The significance of this is that once the snapshot +** created to hold the updated state of the database is synced to disk, log +** file space can be recycled. 
+*/ +void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){ + int rc = *pRc; + assert( rc!=0 || pDb->pWorker ); + if( pDb->pWorker ){ + /* If no error has occurred, serialize the worker snapshot and write + ** it to shared memory. */ + if( rc==LSM_OK ){ + rc = lsmSaveWorker(pDb, bFlush); + } + + /* Assuming no error has occurred, update a read lock slot with the + ** new snapshot id (see comments above function dbSetReadLock()). */ + if( rc==LSM_OK ){ + if( pDb->iReader<0 ){ + rc = lsmTreeLoadHeader(pDb, 0); + } + if( rc==LSM_OK ){ + rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid); + } + } + + /* Free the snapshot object. */ + lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); + pDb->pWorker = 0; + } + + lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); + *pRc = rc; +} + +/* +** Called when recovery is finished. +*/ +int lsmFinishRecovery(lsm_db *pDb){ + lsmTreeEndTransaction(pDb, 1); + return LSM_OK; +} + +/* +** Check if the currently configured compression functions +** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its +** compression id set to iReq. Compression routines are compatible if iReq +** is zero (indicating the database is empty), or if it is equal to the +** compression id of the configured compression routines. +** +** If the check shows that the current compression are incompatible and there +** is a compression factory registered, give it a chance to install new +** compression routines. +** +** If, after any registered factory is invoked, the compression functions +** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK. 
+*/ +int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){ + if( iReq!=LSM_COMPRESSION_EMPTY && pDb->compress.iId!=iReq ){ + if( pDb->factory.xFactory ){ + pDb->bInFactory = 1; + pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq); + pDb->bInFactory = 0; + } + if( pDb->compress.iId!=iReq ){ + /* Incompatible */ + return LSM_MISMATCH; + } + } + /* Compatible */ + return LSM_OK; +} + +/* +** Begin a read transaction. This function is a no-op if the connection +** passed as the only argument already has an open read transaction. +*/ +int lsmBeginReadTrans(lsm_db *pDb){ + const int MAX_READLOCK_ATTEMPTS = 10; + const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS); + + int rc = LSM_OK; /* Return code */ + int iAttempt = 0; + + assert( pDb->pWorker==0 ); + + while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)pCsr==0 && pDb->nTransOpen==0 ); + + /* Load the in-memory tree header. */ + rc = lsmTreeLoadHeader(pDb, &iTreehdr); + + /* Load the database snapshot */ + if( rc==LSM_OK ){ + if( lsmCheckpointClientCacheOk(pDb)==0 ){ + lsmFreeSnapshot(pDb->pEnv, pDb->pClient); + pDb->pClient = 0; + lsmMCursorFreeCache(pDb); + lsmFsPurgeCache(pDb->pFS); + rc = lsmCheckpointLoad(pDb, &iSnap); + }else{ + iSnap = 1; + } + } + + /* Take a read-lock on the tree and snapshot just loaded. Then check + ** that the shared-memory still contains the same values. If so, proceed. + ** Otherwise, relinquish the read-lock and retry the whole procedure + ** (starting with loading the in-memory tree header). */ + if( rc==LSM_OK ){ + u32 iShmMax = pDb->treehdr.iUsedShmid; + u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS; + rc = lsmReadlock( + pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax + ); + if( rc==LSM_OK ){ + if( lsmTreeLoadHeaderOk(pDb, iTreehdr) + && lsmCheckpointLoadOk(pDb, iSnap) + ){ + /* Read lock has been successfully obtained. Deserialize the + ** checkpoint just loaded. 
TODO: This will be removed after + ** lsm_sorted.c is changed to work directly from the serialized + ** version of the snapshot. */ + if( pDb->pClient==0 ){ + rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient); + } + assert( (rc==LSM_OK)==(pDb->pClient!=0) ); + assert( pDb->iReader>=0 ); + + /* Check that the client has the right compression hooks loaded. + ** If not, set rc to LSM_MISMATCH. */ + if( rc==LSM_OK ){ + rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId); + } + }else{ + rc = dbReleaseReadlock(pDb); + } + } + + if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + } +#if 0 +if( rc==LSM_OK && pDb->pClient ){ + fprintf(stderr, + "reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n", + (void *)pDb, + (int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid, + (int)pDb->treehdr.root.iTransId, + (int)pDb->treehdr.iOldShmid + ); +} +#endif + } + + if( rc==LSM_OK ){ + rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk); + } + if( rc!=LSM_OK ){ + dbReleaseReadlock(pDb); + } + if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY; + return rc; +} + +/* +** This function is used by a read-write connection to determine if there +** are currently one or more read-only transactions open on the database +** (in this context a read-only transaction is one opened by a read-only +** connection on a non-live database). +** +** If no error occurs, LSM_OK is returned and *pbExists is set to true if +** some other connection has a read-only transaction open, or false +** otherwise. If an error occurs an LSM error code is returned and the final +** value of *pbExist is undefined. +*/ +int lsmDetectRoTrans(lsm_db *db, int *pbExist){ + int rc; + + /* Only a read-write connection may use this function. */ + assert( db->bReadonly==0 ); + + rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL); + if( rc==LSM_BUSY ){ + *pbExist = 1; + rc = LSM_OK; + }else{ + *pbExist = 0; + } + + return rc; +} + +/* +** db is a read-only database handle in the disconnected state. 
This function +** attempts to open a read-transaction on the database. This may involve +** connecting to the database system (opening shared memory etc.). +*/ +int lsmBeginRoTrans(lsm_db *db){ + int rc = LSM_OK; + + assert( db->bReadonly && db->pShmhdr==0 ); + assert( db->iReader<0 ); + + if( db->bRoTrans==0 ){ + + /* Attempt a shared-lock on DMS1. */ + rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0); + if( rc!=LSM_OK ) return rc; + + rc = lsmShmTestLock( + db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED + ); + if( rc==LSM_OK ){ + /* System is not live. Take a SHARED lock on the ROTRANS byte and + ** release DMS1. Locking ROTRANS tells all read-write clients that they + ** may not recycle any disk space from within the database or log files, + ** as a read-only client may be using it. */ + rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0); + lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + + if( rc==LSM_OK ){ + db->bRoTrans = 1; + rc = lsmShmCacheChunks(db, 1); + if( rc==LSM_OK ){ + db->pShmhdr = (ShmHeader *)db->apShm[0]; + memset(db->pShmhdr, 0, sizeof(ShmHeader)); + rc = lsmCheckpointRecover(db); + if( rc==LSM_OK ){ + rc = lsmLogRecover(db); + } + } + } + }else if( rc==LSM_BUSY ){ + /* System is live! */ + rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0); + lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + if( rc==LSM_OK ){ + rc = lsmShmCacheChunks(db, 1); + if( rc==LSM_OK ){ + db->pShmhdr = (ShmHeader *)db->apShm[0]; + } + } + } + + if( rc==LSM_OK ){ + rc = lsmBeginReadTrans(db); + } + } + + return rc; +} + +/* +** Close the currently open read transaction. +*/ +void lsmFinishReadTrans(lsm_db *pDb){ + + /* Worker connections should not be closing read transactions. And + ** read transactions should only be closed after all cursors and write + ** transactions have been closed. Finally pClient should be non-NULL + ** only iff pDb->iReader>=0. 
*/ + assert( pDb->pWorker==0 ); + assert( pDb->pCsr==0 && pDb->nTransOpen==0 ); + + if( pDb->bRoTrans ){ + int i; + for(i=0; inShm; i++){ + lsmFree(pDb->pEnv, pDb->apShm[i]); + } + lsmFree(pDb->pEnv, pDb->apShm); + pDb->apShm = 0; + pDb->nShm = 0; + pDb->pShmhdr = 0; + + lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0); + } + dbReleaseReadlock(pDb); +} + +/* +** Open a write transaction. +*/ +int lsmBeginWriteTrans(lsm_db *pDb){ + int rc = LSM_OK; /* Return code */ + ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */ + + assert( pDb->nTransOpen==0 ); + assert( pDb->bDiscardOld==0 ); + assert( pDb->bReadonly==0 ); + + /* If there is no read-transaction open, open one now. */ + if( pDb->iReader<0 ){ + rc = lsmBeginReadTrans(pDb); + } + + /* Attempt to take the WRITER lock */ + if( rc==LSM_OK ){ + rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); + } + + /* If the previous writer failed mid-transaction, run emergency rollback. */ + if( rc==LSM_OK && pShm->bWriter ){ + rc = lsmTreeRepair(pDb); + if( rc==LSM_OK ) pShm->bWriter = 0; + } + + /* Check that this connection is currently reading from the most recent + ** version of the database. If not, return LSM_BUSY. */ + if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){ + rc = LSM_BUSY; + } + + if( rc==LSM_OK ){ + rc = lsmLogBegin(pDb); + } + + /* If everything was successful, set the "transaction-in-progress" flag + ** and return LSM_OK. Otherwise, if some error occurred, relinquish the + ** WRITER lock and return an error code. */ + if( rc==LSM_OK ){ + TreeHeader *p = &pDb->treehdr; + pShm->bWriter = 1; + p->root.iTransId++; + if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){ + lsmTreeDiscardOld(pDb); + pDb->bDiscardOld = 1; + } + }else{ + lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb); + } + return rc; +} + +/* +** End the current write transaction. The connection is left with an open +** read transaction. 
It is an error to call this if there is no open write +** transaction. +** +** If the transaction was committed, then a commit record has already been +** written into the log file when this function is called. Or, if the +** transaction was rolled back, both the log file and in-memory tree +** structure have already been restored. In either case, this function +** merely releases locks and other resources held by the write-transaction. +** +** LSM_OK is returned if successful, or an LSM error code otherwise. +*/ +int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){ + int rc = LSM_OK; + int bFlush = 0; + + lsmLogEnd(pDb, bCommit); + if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){ + bFlush = 1; + lsmTreeMakeOld(pDb); + } + lsmTreeEndTransaction(pDb, bCommit); + + if( rc==LSM_OK ){ + if( bFlush && pDb->bAutowork ){ + rc = lsmSortedAutoWork(pDb, 1); + }else if( bCommit && pDb->bDiscardOld ){ + rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid); + } + } + pDb->bDiscardOld = 0; + lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + + if( bFlush && pDb->bAutowork==0 && pDb->xWork ){ + pDb->xWork(pDb, pDb->pWorkCtx); + } + return rc; +} + + +/* +** Return non-zero if the caller is holding the client mutex. +*/ +#ifdef LSM_DEBUG +int lsmHoldingClientMutex(lsm_db *pDb){ + return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex); +} +#endif + +static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){ + return( + p->iLsmId && p->iLsmId<=iLsm + && shm_sequence_ge(iShmMax, p->iTreeId) + && shm_sequence_ge(p->iTreeId, iShmMin) + ); +} + +/* +** Obtain a read-lock on database version identified by the combination +** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or +** an LSM error code otherwise. 
+*/ +int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){ + int rc = LSM_OK; + ShmHeader *pShm = db->pShmhdr; + int i; + + assert( db->iReader<0 ); + assert( shm_sequence_ge(iShmMax, iShmMin) ); + + /* This is a no-op if the read-only transaction flag is set. */ + if( db->bRoTrans ){ + db->iReader = 0; + return LSM_OK; + } + + /* Search for an exact match. */ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){ + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){ + db->iReader = i; + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + } + } + + /* Try to obtain a write-lock on each slot, in order. If successful, set + ** the slot values to iLsm/iTree. */ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + p->iLsmId = iLsm; + p->iTreeId = iShmMax; + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + assert( rc!=LSM_BUSY ); + if( rc==LSM_OK ) db->iReader = i; + } + } + + /* Search for any usable slot */ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){ + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){ + db->iReader = i; + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + } + } + + if( rc==LSM_OK && db->iReader<0 ){ + rc = LSM_BUSY; + } + return rc; +} + +/* +** This is used to check if there exists a read-lock locking a particular +** version of either the in-memory tree or database file. +** +** If iLsmId is non-zero, then it is a snapshot id. If there exists a +** read-lock using this snapshot or newer, set *pbInUse to true. Or, +** if there is no such read-lock, set it to false. +** +** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id. +** Search for a read-lock using this sequence id or newer. etc. 
+*/
+static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+  int rc = LSM_OK;
+
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId)
+       || (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid))
+      ){
+        /* The slot matches. If an EXCLUSIVE lock can be taken on it then
+        ** no connection is reading through it, so clear the slot and move
+        ** on. Otherwise rc is left set to LSM_BUSY (or an error code),
+        ** which terminates the loop. */
+        rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }
+      }
+    }
+  }
+
+  /* LSM_BUSY means a matching slot is locked by a reader. This is reported
+  ** through *pbInUse, not as an error. */
+  if( rc==LSM_BUSY ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  *pbInUse = 0;
+  return rc;
+}
+
+/*
+** This function is called by worker connections to determine the smallest
+** snapshot id that is currently in use by a database client. The worker
+** connection uses this result to determine whether or not it is safe to
+** recycle a database block.
+*/
+static int firstSnapshotInUse(
+  lsm_db *db,                     /* Database handle */
+  i64 *piInUse                    /* IN/OUT: Smallest snapshot id in use */
+){
+  ShmHeader *pShm = db->pShmhdr;
+  i64 iInUse = *piInUse;
+  int i;
+
+  assert( iInUse>0 );
+  for(i=0; i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      i64 iThis = p->iLsmId;
+      if( iThis!=0 && iInUse>iThis ){
+        /* This slot holds a smaller snapshot id than the current candidate.
+        ** If it can be locked exclusively it is stale, so clear it. If the
+        ** lock attempt returns LSM_BUSY the slot is in use and its id
+        ** becomes the new candidate. */
+        int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }else if( rc==LSM_BUSY ){
+          iInUse = iThis;
+        }else{
+          /* Some error other than LSM_BUSY. Return the error code to
+          ** the caller in this case.  */
+          return rc;
+        }
+      }
+    }
+  }
+
+  *piInUse = iInUse;
+  return LSM_OK;
+}
+
+/*
+** Set *pbInUse to true if shared-memory sequence id iShmid is the one
+** recorded in db->treehdr.iUsedShmid, or if isInUse() reports a read-lock
+** on it. Otherwise set *pbInUse to false. Return LSM_OK or an LSM error
+** code.
+*/
+int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
+  if( db->treehdr.iUsedShmid==iShmid ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, 0, iShmid, pbInUse);
+}
+
+/*
+** Set *pbInUse to true if connection db has a client snapshot with an id
+** less than or equal to iLsmId, or if isInUse() reports a read-lock for
+** iLsmId. Otherwise set *pbInUse to false. Return LSM_OK or an LSM error
+** code.
+*/
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
+  if( db->pClient && db->pClient->iId<=iLsmId ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, iLsmId, 0, pbInUse);
+}
+
+/*
+** This function may only be called after a successful call to
+** lsmDbDatabaseConnect(). It returns true if the connection is in
+** multi-process mode, or false otherwise.
+*/
+int lsmDbMultiProc(lsm_db *pDb){
+  return pDb->pDatabase && pDb->pDatabase->bMultiProc;
+}
+
+
+/*************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+*************************************************************************/
+
+/*
+** Ensure that database connection db has cached pointers to at least the
+** first nChunk chunks of shared memory.
+**
+** Return LSM_OK if successful, or an LSM error code otherwise. db->nShm
+** is only ever advanced past entries of db->apShm[] that have been
+** populated successfully.
+*/
+int lsmShmCacheChunks(lsm_db *db, int nChunk){
+  int rc = LSM_OK;
+  if( nChunk>db->nShm ){
+    static const int NINCR = 16;
+    Database *p = db->pDatabase;
+    lsm_env *pEnv = db->pEnv;
+    int nAlloc;
+    int i;
+
+    /* Ensure that the db->apShm[] array is large enough. If an attempt to
+    ** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array
+    ** is always extended in multiples of 16 entries - so the actual allocated
+    ** size can be inferred from nShm.  */
+    nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR;
+    while( nChunk>=nAlloc ){
+      void **apShm;
+      nAlloc += NINCR;
+      apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc);
+      if( !apShm ) return LSM_NOMEM_BKPT;
+      db->apShm = apShm;
+    }
+
+    if( db->bRoTrans ){
+      /* Read-only transaction: each chunk is a zeroed heap allocation that
+      ** belongs to this connection only. Do not count a chunk as cached
+      ** unless the allocation succeeded. */
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        void *pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+        if( rc==LSM_OK ){
+          db->apShm[i] = pChunk;
+          db->nShm++;
+        }
+      }
+
+    }else{
+
+      /* Enter the client mutex */
+      lsmMutexEnter(pEnv, p->pClientMutex);
+
+      /* Extend the Database objects apShmChunk[] array if necessary. Using the
+      ** same pattern as for the lsm_db.apShm[] array above.  */
+      nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR;
+      while( nChunk>=nAlloc ){
+        void **apShm;
+        nAlloc += NINCR;
+        apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc);
+        if( !apShm ){
+          /* The mutex is held, so break out rather than returning. */
+          rc = LSM_NOMEM_BKPT;
+          break;
+        }
+        p->apShmChunk = apShm;
+      }
+
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        if( i>=p->nShmChunk ){
+          /* The Database object does not have this chunk yet. Create it. */
+          void *pChunk = 0;
+          if( p->bMultiProc==0 ){
+            /* Single process mode */
+            pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+          }else{
+            /* Multi-process mode */
+            rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk);
+          }
+          if( rc==LSM_OK ){
+            p->apShmChunk[i] = pChunk;
+            p->nShmChunk++;
+          }
+        }
+        if( rc==LSM_OK ){
+          db->apShm[i] = p->apShmChunk[i];
+          db->nShm++;
+        }
+      }
+
+      /* Release the client mutex */
+      lsmMutexLeave(pEnv, p->pClientMutex);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** If database p is in multi-process mode, pass the lock request (iLock,
+** eOp) through to lsmEnvLock() on the database file and return the result.
+** Otherwise this is a no-op that returns LSM_OK.
+*/
+static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){
+  int rc = LSM_OK;
+  if( p->bMultiProc ){
+    rc = lsmEnvLock(pEnv, p->pFile, iLock, eOp);
+  }
+  return rc;
+}
+
+/*
+** Test if it would be possible for connection db to obtain a lock of type
+** eType on the nLock locks starting at iLock. If so, return LSM_OK. If it
+** would not be possible to obtain the lock due to a lock held by another
+** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the
+** lsm_env.xTestLock function), return some other LSM error code. 
+**
+** Note that this function never actually locks the database - it merely
+** queries the system to see if there exists a lock that would prevent
+** it from doing so.
+*/
+int lsmShmTestLock(
+  lsm_db *db,
+  int iLock,
+  int nLock,
+  int eOp
+){
+  int rc = LSM_OK;
+  lsm_db *pIter;
+  Database *p = db->pDatabase;
+  int i;
+  u64 mask = 0;
+
+  /* Build a mask of the lsm_db.mLock bits that conflict with the requested
+  ** lock, for every lock in the range [iLock, iLock+nLock). Bit (i-1) is
+  ** set in mLock by a connection holding EXCLUSIVE on lock i, so it
+  ** conflicts with any request. Bit (i+32-1) is set by any holder of lock
+  ** i, so it conflicts with an EXCLUSIVE request. */
+  for(i=iLock; i<(iLock+nLock); i++){
+    mask |= ((u64)1 << (i-1));
+    if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (i+32-1));
+  }
+
+  /* First check the other connections in this process, then (in
+  ** multi-process mode only) ask the environment about the file. */
+  lsmMutexEnter(db->pEnv, p->pClientMutex);
+  for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+    if( pIter!=db && (pIter->mLock & mask) ) break;
+  }
+
+  if( pIter ){
+    rc = LSM_BUSY;
+  }else if( p->bMultiProc ){
+    rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp);
+  }
+
+  lsmMutexLeave(db->pEnv, p->pClientMutex);
+  return rc;
+}
+
+/*
+** Attempt to obtain the lock identified by the iLock and bExcl parameters.
+** If successful, return LSM_OK. If the lock cannot be obtained because
+** there exists some other conflicting lock, return LSM_BUSY. If some other
+** error occurs, return an LSM error code.
+**
+** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
+** or else a value returned by the LSM_LOCK_READER macro.
+*/
+int lsmShmLock(
+  lsm_db *db,
+  int iLock,
+  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
+  int bBlock                      /* True for a blocking lock */
+){
+  lsm_db *pIter;
+  const u64 me = ((u64)1 << (iLock-1));      /* Set if EXCLUSIVE is held */
+  const u64 ms = ((u64)1 << (iLock+32-1));   /* Set if any lock is held */
+  int rc = LSM_OK;
+  Database *p = db->pDatabase;
+
+  assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 );
+  assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) );
+  assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  /* Check for a no-op. Proceed only if this is not one of those. That is:
+  **
+  **   UNLOCK - proceed if db holds any lock on iLock.
+  **   SHARED - proceed unless db holds exactly a SHARED lock on iLock.
+  **   EXCL   - proceed unless db already holds EXCLUSIVE on iLock.
+  */
+  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
+   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
+   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
+  ){
+    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
+    int nShared = 0;              /* Number of connections holding SHARED */
+    lsmMutexEnter(db->pEnv, p->pClientMutex);
+
+    /* Figure out the locks currently held by this process on iLock, not
+    ** including any held by connection db.  */
+    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
+      if( pIter!=db ){
+        if( pIter->mLock & me ){
+          nExcl++;
+        }else if( pIter->mLock & ms ){
+          nShared++;
+        }
+      }
+    }
+    assert( nExcl==0 || nExcl==1 );
+    assert( nExcl==0 || nShared==0 );
+    assert( nExcl==0 || (db->mLock & (me|ms))==0 );
+
+    switch( eOp ){
+      case LSM_LOCK_UNLOCK:
+        /* Only release the file lock if no other connection in this
+        ** process still holds SHARED on iLock. */
+        if( nShared==0 ){
+          lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK);
+        }
+        db->mLock &= ~(me|ms);
+        break;
+
+      case LSM_LOCK_SHARED:
+        if( nExcl ){
+          rc = LSM_BUSY;
+        }else{
+          /* The file lock is only requested when no other connection in
+          ** this process holds SHARED on iLock. */
+          if( nShared==0 ){
+            rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED);
+          }
+          if( rc==LSM_OK ){
+            db->mLock |= ms;
+            db->mLock &= ~me;
+          }
+        }
+        break;
+
+      default:
+        assert( eOp==LSM_LOCK_EXCL );
+        if( nExcl || nShared ){
+          rc = LSM_BUSY;
+        }else{
+          rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL);
+          if( rc==LSM_OK ){
+            db->mLock |= (me|ms);
+          }
+        }
+        break;
+    }
+
+    lsmMutexLeave(db->pEnv, p->pClientMutex);
+  }
+
+  return rc;
+}
+
+#ifdef LSM_DEBUG
+
+/*
+** Return the type of lock (LSM_LOCK_EXCL, LSM_LOCK_SHARED or
+** LSM_LOCK_UNLOCK) that the mLock mask of connection db records for lock
+** iLock. iLock must be 1 or greater, as bit (iLock-1) is tested.
+*/
+int shmLockType(lsm_db *db, int iLock){
+  const u64 me = ((u64)1 << (iLock-1));
+  const u64 ms = ((u64)1 << (iLock+32-1));
+
+  if( db->mLock & me ) return LSM_LOCK_EXCL;
+  if( db->mLock & ms ) return LSM_LOCK_SHARED;
+  return LSM_LOCK_UNLOCK;
+}
+
+/*
+** The arguments passed to this function are similar to those passed to
+** the lsmShmLock() function. 
However, instead of obtaining a new lock
+** this function returns true if the specified connection already holds
+** (or does not hold) such a lock, depending on the value of eOp. As
+** follows:
+**
+**   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
+**   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
+**   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
+*/
+int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
+  int ret = 0;                    /* Result. False if eOp is unrecognized */
+  int eHave;
+
+  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
+  assert( iLock<=16 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  eHave = shmLockType(db, iLock);
+
+  switch( eOp ){
+    case LSM_LOCK_UNLOCK:
+      ret = (eHave==LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_SHARED:
+      ret = (eHave!=LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_EXCL:
+      ret = (eHave==LSM_LOCK_EXCL);
+      break;
+    default:
+      assert( !"bad eOp value passed to lsmShmAssertLock()" );
+      break;
+  }
+
+  return ret;
+}
+
+/*
+** Return true if connection db holds an EXCLUSIVE lock on LSM_LOCK_WORKER
+** and db->pWorker is set.
+*/
+int lsmShmAssertWorker(lsm_db *db){
+  return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
+}
+
+/*
+** This function does not contribute to library functionality, and is not
+** included in release builds. It is intended to be called from within
+** an interactive debugger.
+**
+** When called, this function prints a single line of human readable output
+** to stdout describing the locks currently held by the connection. For
+** example:
+**
+**     (gdb) call print_db_locks(pDb)
+**     (shared on dms2) (exclusive on writer)
+**
+** Locks 1 through 15 are examined. Lock numbers start at 1, since
+** shmLockType() tests bit (iLock-1). A lock that has no entry in the name
+** table is printed by number.
+*/
+void print_db_locks(lsm_db *db){
+  static const char *azLock[] = {0, "shared", "exclusive"};
+  static const char *azName[] = {
+    0, "dms1", "dms2", "writer", "worker", "checkpointer",
+    "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
+  };
+  const int nName = (int)(sizeof(azName)/sizeof(azName[0]));
+  int bOne = 0;                   /* True once one lock has been printed */
+  int iLock;
+
+  for(iLock=1; iLock<16; iLock++){
+    int eHave = shmLockType(db, iLock);
+    if( azLock[eHave] ){
+      if( iLock<nName ){
+        printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
+      }else{
+        printf("%s(%s on lock %d)", (bOne?" ":""), azLock[eHave], iLock);
+      }
+      bOne = 1;
+    }
+  }
+  printf("\n");
+}
+
+/*
+** Debugger helper. Call print_db_locks() for every connection in the
+** db->pDatabase->pConn list, marking connection db itself with a "*".
+*/
+void print_all_db_locks(lsm_db *db){
+  lsm_db *p;
+  for(p=db->pDatabase->pConn; p; p=p->pNext){
+    printf("%s connection %p ", ((p==db)?"*":""), (void*)p);
+    print_db_locks(p);
+  }
+}
+#endif
+
+/*
+** Invoke the shared-memory barrier of the environment used by db.
+*/
+void lsmShmBarrier(lsm_db *db){
+  lsmEnvShmBarrier(db->pEnv);
+}
+
+/*
+** Attempt a checkpoint on database connection pDb. If pnKB is not NULL,
+** *pnKB is set to the number of KB of data checkpointed (rounded up), or
+** to zero if an error occurs or nothing was written. Return LSM_OK or an
+** LSM error code.
+*/
+int lsm_checkpoint(lsm_db *pDb, int *pnKB){
+  int rc;                         /* Return code */
+  u32 nWrite = 0;                 /* Number of pages checkpointed */
+
+  /* Attempt the checkpoint. If successful, nWrite is set to the number of
+  ** pages written between this and the previous checkpoint.  */
+  rc = lsmCheckpointWrite(pDb, 0, &nWrite);
+
+  /* If required, calculate the output variable (KB of data checkpointed).
+  ** Set it to zero if an error occurred.  */
+  if( pnKB ){
+    int nKB = 0;
+    if( rc==LSM_OK && nWrite ){
+      nKB = (((i64)nWrite * lsmFsPageSize(pDb->pFS)) + 1023) / 1024;
+    }
+    *pnKB = nKB;
+  }
+
+  return rc;
+}
diff --git a/ext/lsm1/lsm_sorted.c b/ext/lsm1/lsm_sorted.c
new file mode 100644
index 0000000000..1b38e5b505
--- /dev/null
+++ b/ext/lsm1/lsm_sorted.c
@@ -0,0 +1,6149 @@
+/*
+** 2011-08-14
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** PAGE FORMAT:
+**
+** The maximum page size is 65536 bytes.
+**
+** Since all records are equal to or larger than 2 bytes in size, and
+** some space within the page is consumed by the page footer, there must
+** be less than 2^15 records on each page.
+**
+** Each page ends with a footer that describes the pages contents. This
+** footer serves as similar purpose to the page header in an SQLite database. 
+** A footer is used instead of a header because it makes it easier to +** populate a new page based on a sorted list of key/value pairs. +** +** The footer consists of the following values (starting at the end of +** the page and continuing backwards towards the start). All values are +** stored as unsigned big-endian integers. +** +** * Number of records on page (2 bytes). +** * Flags field (2 bytes). +** * Left-hand pointer value (8 bytes). +** * The starting offset of each record (2 bytes per record). +** +** Records may span pages. Unless it happens to be an exact fit, the part +** of the final record that starts on page X that does not fit on page X +** is stored at the start of page (X+1). This means there may be pages where +** (N==0). And on most pages the first record that starts on the page will +** not start at byte offset 0. For example: +** +** aaaaa bbbbb ccc