mirror of
https://github.com/MariaDB/server.git
synced 2025-11-28 17:36:30 +03:00
[t:2892] Merge upgrade logic to main. Merge command was svn merge --accept=postpone -r25293:HEAD ../tokudb.main+2892 .
git-svn-id: file:///svn/toku/tokudb@25303 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
committed by
Yoni Fogel
parent
35800b4c8b
commit
548d03d70e
@@ -232,6 +232,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -234,6 +234,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -234,6 +234,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -234,6 +234,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -235,6 +235,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -72,6 +72,7 @@ enum {
|
||||
TOKUDB_NO_DATA = -100008,
|
||||
TOKUDB_ACCEPT = -100009,
|
||||
TOKUDB_MVCC_DICTIONARY_TOO_NEW = -100010,
|
||||
TOKUDB_UPGRADE_FAILURE = -100011,
|
||||
};
|
||||
|
||||
static void print_defines (void) {
|
||||
@@ -218,6 +219,7 @@ static void print_defines (void) {
|
||||
dodefine(TOKUDB_NO_DATA);
|
||||
dodefine(TOKUDB_ACCEPT);
|
||||
dodefine(TOKUDB_MVCC_DICTIONARY_TOO_NEW);
|
||||
dodefine(TOKUDB_UPGRADE_FAILURE);
|
||||
|
||||
/* LOADER flags */
|
||||
printf("/* LOADER flags */\n");
|
||||
|
||||
@@ -235,6 +235,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -235,6 +235,7 @@ typedef enum {
|
||||
#define TOKUDB_NO_DATA -100008
|
||||
#define TOKUDB_ACCEPT -100009
|
||||
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
|
||||
#define TOKUDB_UPGRADE_FAILURE -100011
|
||||
/* LOADER flags */
|
||||
#define LOADER_USE_PUTS 1
|
||||
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
|
||||
|
||||
@@ -748,8 +748,7 @@ static void
|
||||
translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
|
||||
DISKOFF location_on_disk, //Location of translation_buffer
|
||||
u_int64_t size_on_disk,
|
||||
unsigned char * translation_buffer,
|
||||
BOOL invert_checksum) { // buffer with serialized translation
|
||||
unsigned char * translation_buffer) { // buffer with serialized translation
|
||||
assert(location_on_disk!=0);
|
||||
t->type = TRANSLATION_CHECKPOINTED;
|
||||
{
|
||||
@@ -758,9 +757,6 @@ translation_deserialize_from_buffer(struct translation *t, // destination int
|
||||
u_int64_t offset = size_on_disk - 4;
|
||||
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
|
||||
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
|
||||
if (invert_checksum) {
|
||||
x1764 = ~x1764;
|
||||
}
|
||||
assert(x1764 == stored_x1764);
|
||||
}
|
||||
struct rbuf rt;
|
||||
@@ -808,10 +804,9 @@ void
|
||||
toku_blocktable_create_from_buffer(BLOCK_TABLE *btp,
|
||||
DISKOFF location_on_disk, //Location of translation_buffer
|
||||
DISKOFF size_on_disk,
|
||||
unsigned char *translation_buffer,
|
||||
BOOL invert_checksum) {
|
||||
unsigned char *translation_buffer) {
|
||||
BLOCK_TABLE bt = blocktable_create_internal();
|
||||
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer, invert_checksum);
|
||||
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer);
|
||||
blocktable_note_translation(bt->block_allocator, &bt->checkpointed);
|
||||
// we just filled in checkpointed, now copy it to current.
|
||||
copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT);
|
||||
|
||||
@@ -21,7 +21,7 @@ struct block_translation_pair {
|
||||
};
|
||||
|
||||
void toku_blocktable_create_new(BLOCK_TABLE *btp);
|
||||
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer, BOOL invert_checksum);
|
||||
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
|
||||
void toku_blocktable_destroy(BLOCK_TABLE *btp);
|
||||
|
||||
void toku_brtheader_lock(struct brt_header *h);
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
|
||||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
#include "brt_layout_version.h"
|
||||
#include "toku_assert.h"
|
||||
#include "block_allocator.h"
|
||||
#include "cachetable.h"
|
||||
@@ -44,7 +45,7 @@ enum { BUFFER_HEADER_SIZE = (4 // height//
|
||||
struct subtree_estimates {
|
||||
// estimate number of rows in the tree by counting the number of rows
|
||||
// in the leaves. The stuff in the internal nodes is likely to be off O(1).
|
||||
u_int64_t nkeys; // number of distinct keys.
|
||||
u_int64_t nkeys; // number of distinct keys (obsolete with removal of dupsort, but not worth removing)
|
||||
u_int64_t ndata; // number of key-data pairs (previously leafentry_estimate)
|
||||
u_int64_t dsize; // total size of leafentries
|
||||
BOOL exact; // are the estimates exact?
|
||||
@@ -82,7 +83,6 @@ struct brtnode_nonleaf_childinfo {
|
||||
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
|
||||
};
|
||||
|
||||
typedef struct brtnode *BRTNODE;
|
||||
/* Internal nodes. */
|
||||
struct brtnode {
|
||||
unsigned int nodesize;
|
||||
@@ -121,6 +121,7 @@ struct brtnode {
|
||||
} n;
|
||||
struct leaf {
|
||||
struct subtree_estimates leaf_stats; // actually it is exact.
|
||||
uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade
|
||||
OMT buffer;
|
||||
LEAFLOCK_POOL leaflock_pool;
|
||||
LEAFLOCK leaflock;
|
||||
@@ -166,7 +167,7 @@ struct brt_header {
|
||||
int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging)
|
||||
int layout_version_read_from_disk; // transient, not serialized to disk
|
||||
BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been)
|
||||
uint64_t num_blocks_to_upgrade; // Number of blocks still not newest version. When we release layout 13 we may need to turn this to an array.
|
||||
int64_t num_blocks_to_upgrade; // Number of v12 blocks still not newest version. When we release layout 14 we may need to turn this to an array or add more variables.
|
||||
unsigned int nodesize;
|
||||
BLOCKNUM root; // roots of the dictionary
|
||||
struct remembered_hash root_hash; // hash of the root offset.
|
||||
@@ -269,7 +270,7 @@ struct brtenv {
|
||||
};
|
||||
|
||||
extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, long size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint);
|
||||
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs);
|
||||
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, int*dirty, void*extraargs);
|
||||
extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn);
|
||||
extern int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header **header, BOOL* was_open);
|
||||
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash);
|
||||
@@ -352,21 +353,6 @@ void toku_verify_all_in_mempool(BRTNODE node);
|
||||
|
||||
int toku_verify_brtnode (BRT brt, BLOCKNUM blocknum, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) ;
|
||||
|
||||
enum brt_layout_version_e {
|
||||
BRT_LAYOUT_VERSION_5 = 5,
|
||||
BRT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
|
||||
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatase flags #333
|
||||
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6
|
||||
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
|
||||
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
|
||||
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
|
||||
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
|
||||
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC
|
||||
BRT_NEXT_VERSION, // the version after the current version
|
||||
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
|
||||
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
|
||||
};
|
||||
|
||||
void toku_brtheader_free (struct brt_header *h);
|
||||
int toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **error_string, BOOL oplsn_valid, LSN oplsn);
|
||||
int toku_brtheader_begin_checkpoint (CACHEFILE cachefile, int fd, LSN checkpoint_lsn, void *header_v);
|
||||
@@ -380,9 +366,10 @@ int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p);
|
||||
|
||||
|
||||
typedef struct brt_upgrade_status {
|
||||
u_int64_t header;
|
||||
u_int64_t nonleaf;
|
||||
u_int64_t leaf;
|
||||
u_int64_t header_12; // how many headers upgrade from version 12
|
||||
u_int64_t nonleaf_12;
|
||||
u_int64_t leaf_12;
|
||||
u_int64_t optimized_for_upgrade_12; // how many optimize_for_upgrade messages sent
|
||||
} BRT_UPGRADE_STATUS_S, *BRT_UPGRADE_STATUS;
|
||||
|
||||
void toku_brt_get_upgrade_status(BRT_UPGRADE_STATUS);
|
||||
|
||||
@@ -225,6 +225,7 @@ toku_serialize_brtnode_size_slow (BRTNODE node) {
|
||||
invariant(hsize==node->u.l.n_bytes_in_buffer);
|
||||
hsize += 4; // add n entries in buffer table
|
||||
hsize += 3*8; // add the three leaf stats, but no exact bit
|
||||
hsize += 4; // optimized_for_upgrade
|
||||
size += 4 + 1*stored_sub_block_map_size; // one partition
|
||||
return size+hsize;
|
||||
}
|
||||
@@ -247,6 +248,7 @@ toku_serialize_brtnode_size (BRTNODE node) {
|
||||
} else {
|
||||
result += 4; // n_entries in buffer table
|
||||
result += 3*8; // the three leaf stats
|
||||
result += 4; // optimized_for_upgrade
|
||||
result += node->u.l.n_bytes_in_buffer;
|
||||
result += 4 + 1*stored_sub_block_map_size; // one partition
|
||||
}
|
||||
@@ -372,6 +374,8 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
|
||||
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.ndata);
|
||||
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.dsize);
|
||||
|
||||
wbuf_nocrc_int(wbuf, node->u.l.optimized_for_upgrade);
|
||||
|
||||
// RFP partition the leaf elements. for now, 1 partition
|
||||
const int npartitions = 1;
|
||||
wbuf_nocrc_int(wbuf, npartitions);
|
||||
@@ -732,6 +736,13 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
|
||||
result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb);
|
||||
result->u.l.leaf_stats.exact = TRUE;
|
||||
|
||||
if (result->layout_version >= BRT_LAYOUT_VERSION_13) {
|
||||
result->u.l.optimized_for_upgrade = rbuf_int(rb);
|
||||
}
|
||||
else {
|
||||
result->u.l.optimized_for_upgrade = 0;
|
||||
}
|
||||
|
||||
// deserialize the number of partitions
|
||||
int npartitions = rbuf_int(rb);
|
||||
invariant(npartitions == 1);
|
||||
@@ -759,14 +770,31 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
|
||||
u_int32_t actual_sum = 0;
|
||||
u_int32_t start_of_data = rb->ndone;
|
||||
OMTVALUE *MALLOC_N(n_in_buf, array);
|
||||
for (int i=0; i<n_in_buf; i++) {
|
||||
LEAFENTRY le = (LEAFENTRY)(&rb->buf[rb->ndone]);
|
||||
u_int32_t disksize = leafentry_disksize(le);
|
||||
rb->ndone += disksize;
|
||||
invariant(rb->ndone<=rb->size);
|
||||
if (result->layout_version == BRT_LAYOUT_VERSION) {
|
||||
for (int i=0; i<n_in_buf; i++) {
|
||||
LEAFENTRY le = (LEAFENTRY)(&rb->buf[rb->ndone]);
|
||||
u_int32_t disksize = leafentry_disksize(le);
|
||||
rb->ndone += disksize;
|
||||
invariant(rb->ndone<=rb->size);
|
||||
array[i]=(OMTVALUE)le;
|
||||
actual_sum += x1764_memory(le, disksize);
|
||||
}
|
||||
}
|
||||
else if (result->layout_version == BRT_LAYOUT_VERSION_12) {
|
||||
for (int i=0; i<n_in_buf; i++) {
|
||||
// these two lines and optimized_for_upgrade logic above are only difference in handling
|
||||
// versions 12 and 13 at this layer (more logic at higher layer)
|
||||
LEAFENTRY_12 le = (LEAFENTRY_12)(&rb->buf[rb->ndone]);
|
||||
u_int32_t disksize = leafentry_disksize_12(le);
|
||||
rb->ndone += disksize;
|
||||
invariant(rb->ndone<=rb->size);
|
||||
|
||||
array[i]=(OMTVALUE)le;
|
||||
actual_sum += x1764_memory(le, disksize);
|
||||
array[i]=(OMTVALUE)le;
|
||||
actual_sum += x1764_memory(le, disksize);
|
||||
}
|
||||
}
|
||||
else {
|
||||
invariant(FALSE);
|
||||
}
|
||||
toku_trace("fill array");
|
||||
u_int32_t end_of_data = rb->ndone;
|
||||
@@ -822,7 +850,8 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b
|
||||
bytevec magic;
|
||||
rbuf_literal_bytes(rb, &magic, 8);
|
||||
result->layout_version = rbuf_int(rb);
|
||||
invariant(result->layout_version == BRT_LAYOUT_VERSION);
|
||||
invariant(result->layout_version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
|
||||
invariant(result->layout_version <= BRT_LAYOUT_VERSION);
|
||||
result->layout_version_original = rbuf_int(rb);
|
||||
result->layout_version_read_from_disk = result->layout_version;
|
||||
result->nodesize = rbuf_int(rb);
|
||||
@@ -927,8 +956,10 @@ decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, size_t raw_block_size,
|
||||
|
||||
static int
|
||||
decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
|
||||
// This function exists solely to accommodate future changes in compression.
|
||||
int r;
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION_12:
|
||||
case BRT_LAYOUT_VERSION:
|
||||
r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum);
|
||||
break;
|
||||
@@ -941,27 +972,87 @@ decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_b
|
||||
static int
|
||||
deserialize_brtnode_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb) {
|
||||
int r = 0;
|
||||
BRTNODE brtnode_12 = NULL;
|
||||
BRTNODE node = NULL;
|
||||
r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &node, h, rb); // we just filled the node with contents from rbuf
|
||||
if (r==0) {
|
||||
invariant(node);
|
||||
int upgrade = 0;
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION_12:
|
||||
invariant(node->layout_version == BRT_LAYOUT_VERSION_12);
|
||||
//Any upgrade necessary.
|
||||
if (node->height == 0) {
|
||||
//leaf
|
||||
uint32_t i;
|
||||
OMT omt = node->u.l.buffer;
|
||||
uint32_t num_les = toku_omt_size(omt);
|
||||
LEAFENTRY *XCALLOC_N(num_les, new_les);
|
||||
OMTVALUE v;
|
||||
|
||||
int upgrade = 0;
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION:
|
||||
if (!upgrade)
|
||||
r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &brtnode_12, h, rb);
|
||||
if (r==0) {
|
||||
lazy_assert(brtnode_12);
|
||||
*brtnode = brtnode_12;
|
||||
}
|
||||
if (upgrade && r == 0) {
|
||||
toku_brtheader_lock(h);
|
||||
lazy_assert(h->num_blocks_to_upgrade>0);
|
||||
h->num_blocks_to_upgrade--;
|
||||
toku_brtheader_unlock(h);
|
||||
(*brtnode)->dirty = 1;
|
||||
}
|
||||
break; // this is the only break
|
||||
default:
|
||||
lazy_assert(FALSE);
|
||||
u_int32_t incremental_fingerprint = 0;
|
||||
u_int32_t incremental_size = 0;
|
||||
for (i = 0; i < num_les; i++) {
|
||||
r = toku_omt_fetch(omt, i, &v, NULL);
|
||||
invariant(r==0);
|
||||
size_t new_memsize, new_disksize;
|
||||
// Translate packed version 12 leafentry to packed version 13 leafentry
|
||||
r = toku_le_upgrade_12_13(v, &new_memsize, &new_disksize, &new_les[i]);
|
||||
invariant(r==0);
|
||||
invariant(new_memsize == new_disksize);
|
||||
incremental_size += OMT_ITEM_OVERHEAD + new_memsize;
|
||||
incremental_fingerprint += toku_le_crc(new_les[i]);
|
||||
}
|
||||
//Regenerate fingerprint.
|
||||
node->local_fingerprint = node->rand4fingerprint * incremental_fingerprint;
|
||||
//Set buffer size.
|
||||
node->u.l.n_bytes_in_buffer = incremental_size;
|
||||
|
||||
//Replace mempool (destroy old, create new).
|
||||
uint8_t *p;
|
||||
{
|
||||
void *mpbase = toku_mempool_get_base(&node->u.l.buffer_mempool);
|
||||
toku_mempool_fini(&node->u.l.buffer_mempool);
|
||||
toku_free(mpbase);
|
||||
mpbase = toku_xmalloc(incremental_size);
|
||||
toku_mempool_init(&node->u.l.buffer_mempool, mpbase, incremental_size);
|
||||
node->u.l.buffer_mempool.free_offset = incremental_size;
|
||||
p = mpbase;
|
||||
}
|
||||
//p points to beginning of new mempool
|
||||
for (i = 0; i < num_les; i++) {
|
||||
size_t len = leafentry_memsize(new_les[i]);
|
||||
memcpy(p, new_les[i], len);
|
||||
r = toku_omt_set_at(omt, p, i);
|
||||
invariant(r==0);
|
||||
p += len;
|
||||
toku_free(new_les[i]); //Free malloced version of new leafentry (copy exists in mempool)
|
||||
}
|
||||
toku_free(new_les); // Free array of pointers to new leafentries
|
||||
//Regenerate nkeys, ndata, dsize
|
||||
toku_brt_leaf_reset_calc_leaf_stats(node);
|
||||
toku_sync_fetch_and_increment_uint64(&upgrade_status.leaf_12); // how many leaf nodes upgraded from v12
|
||||
}
|
||||
else {
|
||||
toku_sync_fetch_and_increment_uint64(&upgrade_status.nonleaf_12); // how many nonleaf nodes upgraded from v12
|
||||
}
|
||||
node->flags &= ~TOKU_DB_VALCMP_BUILTIN_12; // delete obsolete flag
|
||||
node->layout_version = BRT_LAYOUT_VERSION;
|
||||
upgrade++;
|
||||
//Fall through on purpose
|
||||
case BRT_LAYOUT_VERSION:
|
||||
invariant(node->layout_version == BRT_LAYOUT_VERSION);
|
||||
if (upgrade) {
|
||||
toku_brtheader_lock(h);
|
||||
invariant(h->num_blocks_to_upgrade>0);
|
||||
h->num_blocks_to_upgrade--;
|
||||
toku_brtheader_unlock(h);
|
||||
node->dirty = 1;
|
||||
}
|
||||
*brtnode = node;
|
||||
break; // this is the only break
|
||||
default:
|
||||
invariant(FALSE);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
@@ -1051,21 +1142,32 @@ cleanup:
|
||||
|
||||
int
|
||||
toku_maybe_upgrade_brt(BRT t) { // possibly do some work to complete the version upgrade of brt
|
||||
// If someday we need to inject a message to upgrade the brt, this is where
|
||||
// it should be done. Whenever an upgrade is done, all nodes will be marked
|
||||
// as dirty, so it makes sense here to always inject an OPTIMIZE message.
|
||||
// (Note, if someday the version number is stored in the translation instead
|
||||
// of in each node, then the upgrade would not necessarily dirty each node.)
|
||||
int r = 0;
|
||||
|
||||
int version = t->h->layout_version_read_from_disk;
|
||||
if (!t->h->upgrade_brt_performed) {
|
||||
int upgrade = 0;
|
||||
if (!t->h->upgrade_brt_performed) { // upgrade may be necessary
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION_11:
|
||||
r = 0;
|
||||
//Fall through on purpose.
|
||||
case BRT_LAYOUT_VERSION:
|
||||
if (r == 0) {
|
||||
t->h->upgrade_brt_performed = TRUE;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
lazy_assert(FALSE);
|
||||
case BRT_LAYOUT_VERSION_12:
|
||||
r = 0;
|
||||
upgrade++;
|
||||
//Fall through on purpose.
|
||||
case BRT_LAYOUT_VERSION:
|
||||
if (r == 0 && upgrade) {
|
||||
r = toku_brt_optimize_for_upgrade(t);
|
||||
toku_sync_fetch_and_increment_uint64(&upgrade_status.optimized_for_upgrade_12);
|
||||
}
|
||||
if (r == 0) {
|
||||
t->h->upgrade_brt_performed = TRUE; // no further upgrade necessary
|
||||
}
|
||||
break;
|
||||
default:
|
||||
invariant(FALSE);
|
||||
}
|
||||
}
|
||||
if (r) {
|
||||
@@ -1147,9 +1249,6 @@ serialize_brt_header_min_size (u_int32_t version) {
|
||||
case BRT_LAYOUT_VERSION_13:
|
||||
size += 8; //TXNID that created
|
||||
case BRT_LAYOUT_VERSION_12:
|
||||
size += 8; // Number of blocks in old version.
|
||||
// fall through to add up bytes in previous version
|
||||
case BRT_LAYOUT_VERSION_11:
|
||||
size += (+8 // "tokudata"
|
||||
+4 // version
|
||||
+4 // original_version
|
||||
@@ -1161,9 +1260,9 @@ serialize_brt_header_min_size (u_int32_t version) {
|
||||
+8 // translation_size_on_disk
|
||||
+8 // translation_address_on_disk
|
||||
+4 // checksum
|
||||
);
|
||||
size+=(+8 // diskoff
|
||||
+4 // flags
|
||||
+8 // Number of blocks in old version.
|
||||
+8 // diskoff
|
||||
+4 // flags
|
||||
);
|
||||
break;
|
||||
default:
|
||||
@@ -1398,7 +1497,9 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
|
||||
//version MUST be in network order on disk regardless of disk order
|
||||
h->layout_version = rbuf_network_int(&rc);
|
||||
//TODO: #1924
|
||||
lazy_assert(h->layout_version==BRT_LAYOUT_VERSION);
|
||||
invariant(h->layout_version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
|
||||
invariant(h->layout_version <= BRT_LAYOUT_VERSION);
|
||||
h->layout_version_read_from_disk = h->layout_version;
|
||||
|
||||
//Size MUST be in network order regardless of disk order.
|
||||
u_int32_t size = rbuf_network_int(&rc);
|
||||
@@ -1432,8 +1533,7 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
|
||||
toku_blocktable_create_from_buffer(&h->blocktable,
|
||||
translation_address_on_disk,
|
||||
translation_size_on_disk,
|
||||
tbuf,
|
||||
FALSE /*not version 11 or older */ );
|
||||
tbuf);
|
||||
toku_free(tbuf);
|
||||
}
|
||||
|
||||
@@ -1443,7 +1543,10 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
|
||||
deserialize_descriptor_from(fd, h, &h->descriptor);
|
||||
h->layout_version_original = rbuf_int(&rc);
|
||||
h->num_blocks_to_upgrade = rbuf_ulonglong(&rc);
|
||||
rbuf_TXNID(&rc, &h->root_xid_that_created);
|
||||
if (h->layout_version >= BRT_LAYOUT_VERSION_13) {
|
||||
// at this layer, this new field is the only difference between versions 12 and 13
|
||||
rbuf_TXNID(&rc, &h->root_xid_that_created);
|
||||
}
|
||||
(void)rbuf_int(&rc); //Read in checksum and ignore (already verified).
|
||||
if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;}
|
||||
toku_free(rc.buf);
|
||||
@@ -1454,36 +1557,46 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
|
||||
|
||||
|
||||
|
||||
//TODO: When version 13 exists, add case for version 12 that looks like version 10 case,
|
||||
// but calls deserialize_brtheader_12() and upgrade_12_13()
|
||||
//TODO: When version 14 exists, add case for version 13 that looks like version 12 case,
|
||||
static int
|
||||
deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **brth, u_int32_t version) {
|
||||
int rval;
|
||||
struct brt_header *brth_12 = NULL;
|
||||
int upgrade = 0;
|
||||
|
||||
switch(version) {
|
||||
case BRT_LAYOUT_VERSION:
|
||||
if (!upgrade)
|
||||
rval = deserialize_brtheader (fd, rb, &brth_12);
|
||||
if (rval == 0) {
|
||||
lazy_assert(brth_12);
|
||||
*brth = brth_12;
|
||||
}
|
||||
if (upgrade && rval == 0) {
|
||||
toku_brtheader_lock(*brth);
|
||||
(*brth)->num_blocks_to_upgrade = toku_block_get_blocks_in_use_unlocked((*brth)->blocktable);
|
||||
(*brth)->dirty = 1;
|
||||
toku_brtheader_unlock(*brth);
|
||||
}
|
||||
break; // this is the only break
|
||||
default:
|
||||
lazy_assert(FALSE);
|
||||
}
|
||||
struct brt_header *h = NULL;
|
||||
rval = deserialize_brtheader (fd, rb, &h); //deserialize from rbuf and fd into header
|
||||
if (rval == 0) {
|
||||
lazy_assert((*brth)->layout_version == BRT_LAYOUT_VERSION);
|
||||
(*brth)->layout_version_read_from_disk = version;
|
||||
(*brth)->upgrade_brt_performed = FALSE;
|
||||
invariant(h);
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION_12:
|
||||
invariant(h->layout_version == BRT_LAYOUT_VERSION_12);
|
||||
{
|
||||
//Upgrade root_xid_that_created
|
||||
//Fake creation during the last checkpoint.
|
||||
h->root_xid_that_created = h->checkpoint_lsn.lsn;
|
||||
}
|
||||
{
|
||||
//Deprecate 'TOKU_DB_VALCMP_BUILTIN'. Just remove the flag
|
||||
h->flags &= ~TOKU_DB_VALCMP_BUILTIN_12;
|
||||
}
|
||||
h->layout_version++;
|
||||
toku_sync_fetch_and_increment_uint64(&upgrade_status.header_12); // how many header nodes upgraded from v12
|
||||
upgrade++;
|
||||
//Fall through on purpose
|
||||
case BRT_LAYOUT_VERSION:
|
||||
invariant(h->layout_version == BRT_LAYOUT_VERSION);
|
||||
h->upgrade_brt_performed = FALSE;
|
||||
if (upgrade) {
|
||||
toku_brtheader_lock(h);
|
||||
h->num_blocks_to_upgrade = toku_block_get_blocks_in_use_unlocked(h->blocktable); //Total number of blocks
|
||||
h->dirty = 1;
|
||||
toku_brtheader_unlock(h);
|
||||
}
|
||||
*brth = h;
|
||||
break; // this is the only break
|
||||
default:
|
||||
invariant(FALSE);
|
||||
}
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
@@ -1494,14 +1607,14 @@ deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **br
|
||||
// If that ever changes, then modify this.
|
||||
//TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the file AND the header is useless
|
||||
static int
|
||||
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *rb, u_int64_t *checkpoint_count, u_int32_t * version_p) {
|
||||
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, struct rbuf *rb, u_int64_t *checkpoint_count, u_int32_t * version_p) {
|
||||
int r = 0;
|
||||
const int64_t prefix_size = 8 + // magic ("tokudata")
|
||||
4 + // version
|
||||
4; // size
|
||||
unsigned char prefix[prefix_size];
|
||||
rb->buf = NULL;
|
||||
int64_t n = pread(fd, prefix, prefix_size, offset);
|
||||
int64_t n = pread(fd, prefix, prefix_size, offset_of_header);
|
||||
if (n==0) r = TOKUDB_DICTIONARY_NO_HEADER;
|
||||
else if (n<0) {r = errno; lazy_assert(r!=0);}
|
||||
else if (n!=prefix_size) r = EINVAL;
|
||||
@@ -1546,7 +1659,7 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *
|
||||
rb->buf = toku_xmalloc(rb->size);
|
||||
}
|
||||
if (r==0) {
|
||||
n = pread(fd, rb->buf, rb->size, offset);
|
||||
n = pread(fd, rb->buf, rb->size, offset_of_header);
|
||||
if (n==-1) {
|
||||
r = errno;
|
||||
lazy_assert(r!=0);
|
||||
@@ -1557,12 +1670,9 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *
|
||||
//We have an rbuf that represents the header.
|
||||
//Size is within acceptable bounds.
|
||||
if (r==0) {
|
||||
//Verify checksum
|
||||
//Verify checksum (BRT_LAYOUT_VERSION_12 or later, when checksum function changed)
|
||||
u_int32_t calculated_x1764 = x1764_memory(rb->buf, rb->size-4);
|
||||
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
|
||||
if (version<=BRT_LAYOUT_VERSION_11) {
|
||||
calculated_x1764 = ~calculated_x1764;
|
||||
}
|
||||
if (calculated_x1764!=stored_x1764) r = TOKUDB_DICTIONARY_NO_HEADER; //Header useless
|
||||
}
|
||||
if (r==0) {
|
||||
@@ -1837,23 +1947,12 @@ static int
|
||||
deserialize_rollback_log_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash,
|
||||
ROLLBACK_LOG_NODE *log,
|
||||
struct brt_header *h, struct rbuf *rb) {
|
||||
//Upgrade is not really necessary here. Rollback log nodes do not survive version changes.
|
||||
int r = 0;
|
||||
ROLLBACK_LOG_NODE rollback_log_node = NULL;
|
||||
|
||||
int upgrade = 0;
|
||||
switch (version) {
|
||||
case BRT_LAYOUT_VERSION:
|
||||
if (!upgrade)
|
||||
r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &rollback_log_node, h, rb);
|
||||
if (r==0) {
|
||||
lazy_assert(rollback_log_node);
|
||||
*log = rollback_log_node;
|
||||
}
|
||||
if (upgrade && r == 0) (*log)->dirty = 1;
|
||||
break; // this is the only break
|
||||
default:
|
||||
lazy_assert(FALSE);
|
||||
invariant(version==BRT_LAYOUT_VERSION); //Rollback log nodes do not survive version changes.
|
||||
r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &rollback_log_node, h, rb);
|
||||
if (r==0) {
|
||||
*log = rollback_log_node;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
61
newbrt/brt.c
61
newbrt/brt.c
@@ -174,6 +174,10 @@ message are not gorged. (But they may be hungry or too fat or too thin.)
|
||||
#include "roll.h"
|
||||
#include "toku_atomic.h"
|
||||
|
||||
|
||||
static const uint32_t this_version = BRT_LAYOUT_VERSION;
|
||||
|
||||
|
||||
void
|
||||
toku_brt_header_suppress_rollbacks(struct brt_header *h, TOKUTXN txn) {
|
||||
TXNID txnid = toku_txn_get_txnid(txn);
|
||||
@@ -296,6 +300,12 @@ calc_leaf_stats (BRTNODE node) {
|
||||
return e;
|
||||
}
|
||||
|
||||
void
|
||||
toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
|
||||
invariant(node->height==0);
|
||||
node->u.l.leaf_stats = calc_leaf_stats(node);
|
||||
}
|
||||
|
||||
static void __attribute__((__unused__))
|
||||
brt_leaf_check_leaf_stats (BRTNODE node)
|
||||
{
|
||||
@@ -483,13 +493,16 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename
|
||||
}
|
||||
|
||||
//fd is protected (must be holding fdlock)
|
||||
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs) {
|
||||
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash,
|
||||
void **brtnode_pv, long *sizep, int *dirtyp, void *extraargs) {
|
||||
lazy_assert(extraargs);
|
||||
struct brt_header *h = extraargs;
|
||||
BRTNODE *result=(BRTNODE*)brtnode_pv;
|
||||
int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h);
|
||||
if (r == 0)
|
||||
if (r == 0) {
|
||||
*sizep = brtnode_memory_size(*result);
|
||||
*dirtyp = (*result)->dirty;
|
||||
}
|
||||
//(*result)->parent_brtnode = 0; /* Don't know it right now. */
|
||||
//printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
|
||||
return r;
|
||||
@@ -656,6 +669,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, size_
|
||||
n->u.n.childkeys=0;
|
||||
} else {
|
||||
n->u.l.leaf_stats = zero_estimates;
|
||||
n->u.l.optimized_for_upgrade = 0;
|
||||
int r;
|
||||
r = toku_omt_create(&n->u.l.buffer);
|
||||
lazy_assert_zero(r);
|
||||
@@ -1646,6 +1660,9 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
|
||||
lazy_assert(toku_omt_size(node->u.l.buffer) == omt_size);
|
||||
|
||||
break;
|
||||
case BRT_OPTIMIZE_FOR_UPGRADE:
|
||||
node->dirty = 1;
|
||||
node->u.l.optimized_for_upgrade = *((uint32_t*)(cmd->u.id.val->data)); // record version of software that sent the optimize_for_upgrade message
|
||||
case BRT_OPTIMIZE:
|
||||
// Apply to all leafentries
|
||||
idx = 0;
|
||||
@@ -1893,6 +1910,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
|
||||
case BRT_COMMIT_BROADCAST_TXN:
|
||||
case BRT_ABORT_BROADCAST_TXN:
|
||||
case BRT_OPTIMIZE:
|
||||
case BRT_OPTIMIZE_FOR_UPGRADE:
|
||||
return brt_nonleaf_cmd_all (t, node, cmd, re_array, did_io); // send message to all children
|
||||
case BRT_NONE:
|
||||
break;
|
||||
@@ -2601,14 +2619,33 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static int brt_optimize (BRT brt, BOOL upgrade);
|
||||
|
||||
// Effect: Optimize the brt.
|
||||
int
|
||||
toku_brt_optimize (BRT brt) {
|
||||
int r = 0;
|
||||
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
|
||||
TXNID oldest = toku_logger_get_oldest_living_xid(logger);
|
||||
int r = brt_optimize(brt, FALSE);
|
||||
return r;
|
||||
}
|
||||
|
||||
XIDS root_xids = xids_get_root_xids();
|
||||
int
|
||||
toku_brt_optimize_for_upgrade (BRT brt) {
|
||||
int r = brt_optimize(brt, TRUE);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int
|
||||
brt_optimize (BRT brt, BOOL upgrade) {
|
||||
int r = 0;
|
||||
|
||||
TXNID oldest = TXNID_NONE_LIVING;
|
||||
if (!upgrade) {
|
||||
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
|
||||
oldest = toku_logger_get_oldest_living_xid(logger);
|
||||
}
|
||||
|
||||
XIDS root_xids = xids_get_root_xids();
|
||||
XIDS message_xids;
|
||||
if (oldest == TXNID_NONE_LIVING) {
|
||||
message_xids = root_xids;
|
||||
@@ -2622,8 +2659,16 @@ toku_brt_optimize (BRT brt) {
|
||||
DBT val;
|
||||
toku_init_dbt(&key);
|
||||
toku_init_dbt(&val);
|
||||
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}};
|
||||
r = toku_brt_root_put_cmd(brt, &brtcmd);
|
||||
if (upgrade) {
|
||||
// maybe there's a better place than the val dbt to put the version, but it seems harmless and is convenient
|
||||
toku_fill_dbt(&val, &this_version, sizeof(this_version));
|
||||
BRT_MSG_S brtcmd = { BRT_OPTIMIZE_FOR_UPGRADE, message_xids, .u.id={&key,&val}};
|
||||
r = toku_brt_root_put_cmd(brt, &brtcmd);
|
||||
}
|
||||
else {
|
||||
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}};
|
||||
r = toku_brt_root_put_cmd(brt, &brtcmd);
|
||||
}
|
||||
xids_destroy(&message_xids);
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -68,6 +68,8 @@ int toku_brt_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn) __attribute__ ((warn
|
||||
|
||||
int toku_brt_optimize (BRT brt) __attribute__ ((warn_unused_result));
|
||||
|
||||
int toku_brt_optimize_for_upgrade (BRT brt) __attribute__ ((warn_unused_result));
|
||||
|
||||
// Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
|
||||
// Returns 0 if successful
|
||||
int toku_brt_maybe_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn, BOOL oplsn_valid, LSN oplsn, int do_logging, enum brt_msg_type type) __attribute__ ((warn_unused_result));
|
||||
@@ -176,7 +178,9 @@ enum brt_header_flags {
|
||||
//TOKU_DB_DUP = (1<<0), //Obsolete #2862
|
||||
//TOKU_DB_DUPSORT = (1<<1), //Obsolete #2862
|
||||
TOKU_DB_KEYCMP_BUILTIN = (1<<2),
|
||||
//TOKU_DB_VALCMP_BUILTIN = (1<<3),
|
||||
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
|
||||
TOKU_DB_VALCMP_BUILTIN_12 = (1<<3),
|
||||
#endif
|
||||
};
|
||||
|
||||
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) __attribute__ ((warn_unused_result));
|
||||
@@ -238,6 +242,8 @@ BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_r
|
||||
#define TOKU_MULTIPLE_MAIN_THREADS 0
|
||||
#endif
|
||||
|
||||
void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node);
|
||||
|
||||
int toku_brt_strerror_r(int error, char *buf, size_t buflen);
|
||||
// Effect: Like the XSI-compliant strerror_r, extended to db_strerror().
|
||||
// If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message.
|
||||
|
||||
25
newbrt/brt_layout_version.h
Normal file
25
newbrt/brt_layout_version.h
Normal file
@@ -0,0 +1,25 @@
|
||||
/* -*- mode: C; c-basic-offset: 4 -*- */
|
||||
#ifndef BRT_LAYOUT_VERSION_H
|
||||
#define BRT_LAYOUT_VERSION_H
|
||||
|
||||
#ident "$Id$"
|
||||
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
|
||||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
//Must be defined before other recursive headers could include logger.h
|
||||
// On-disk layout versions of a dictionary, oldest first.
// To add a version, insert a new BRT_LAYOUT_VERSION_<n> line immediately before
// BRT_NEXT_VERSION; BRT_LAYOUT_VERSION then follows automatically.
// NOTE: these names are enumerators, not macros, so the preprocessor cannot see
// them: inside an #if expression each of them evaluates to 0.
enum brt_layout_version_e {
    BRT_LAYOUT_VERSION_5 = 5,
    BRT_LAYOUT_VERSION_6 = 6,   // Diff from 5 to 6:   Add leafentry_estimate
    BRT_LAYOUT_VERSION_7 = 7,   // Diff from 6 to 7:   Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatabase flags #333
    BRT_LAYOUT_VERSION_8 = 8,   // Diff from 7 to 8:   Use murmur instead of crc32.  We are going to make a simplification and stop supporting version 7 and before.  Current As of Beta 1.0.6
    BRT_LAYOUT_VERSION_9 = 9,   // Diff from 8 to 9:   Variable-sized blocks and compression.
    BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10:  Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
    BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned).  BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
    BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
    BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC, deprecated TOKU_DB_VALCMP_BUILTIN(_12)
    BRT_NEXT_VERSION,           // the version after the current version
    BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
    BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
};
|
||||
|
||||
#endif
|
||||
@@ -120,6 +120,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
|
||||
case BRT_COMMIT_BROADCAST_TXN: printf("COMMIT_BROADCAST_TXN"); goto ok;
|
||||
case BRT_ABORT_BROADCAST_TXN: printf("ABORT_BROADCAST_TXN"); goto ok;
|
||||
case BRT_OPTIMIZE: printf("OPTIMIZE"); goto ok;
|
||||
case BRT_OPTIMIZE_FOR_UPGRADE: printf("OPTIMIZE_FOR_UPGRADE"); goto ok;
|
||||
}
|
||||
printf("HUH?");
|
||||
ok:
|
||||
@@ -139,6 +140,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
|
||||
} else {
|
||||
struct subtree_estimates *est = &n->u.l.leaf_stats;
|
||||
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }\n", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F");
|
||||
printf(" optimized_for_upgrade=%u\n", n->u.l.optimized_for_upgrade);
|
||||
printf(" n_bytes_in_buffer=%u\n", n->u.l.n_bytes_in_buffer);
|
||||
printf(" items_in_buffer =%u\n", toku_omt_size(n->u.l.buffer));
|
||||
if (dump_data) toku_omt_iterate(n->u.l.buffer, print_le, 0);
|
||||
|
||||
@@ -2227,6 +2227,9 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc)
|
||||
lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
|
||||
lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
|
||||
lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
|
||||
|
||||
putbuf_int32(&lbuf->dbuf, 0); // optimized_for_upgrade
|
||||
|
||||
lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map
|
||||
lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4;
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct brt *BRT;
|
||||
typedef struct brtnode *BRTNODE;
|
||||
struct brt_header;
|
||||
struct wbuf;
|
||||
struct dbuf;
|
||||
@@ -96,7 +97,8 @@ enum brt_msg_type {
|
||||
BRT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction).
|
||||
BRT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction).
|
||||
BRT_INSERT_NO_OVERWRITE = 11,
|
||||
BRT_OPTIMIZE = 12,
|
||||
BRT_OPTIMIZE = 12, // Broadcast
|
||||
BRT_OPTIMIZE_FOR_UPGRADE = 13, // same as BRT_OPTIMIZE, but record version number in leafnode
|
||||
};
|
||||
|
||||
typedef struct xids_t *XIDS;
|
||||
|
||||
@@ -1092,6 +1092,8 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
|
||||
|
||||
void *toku_value = 0;
|
||||
long size = 0;
|
||||
|
||||
int dirty = 0;
|
||||
|
||||
WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key));
|
||||
|
||||
@@ -1100,7 +1102,9 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
|
||||
|
||||
int r;
|
||||
if (toku_cachefile_is_dev_null_unlocked(cf)) r = -1;
|
||||
else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, extraargs);
|
||||
else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, &dirty, extraargs);
|
||||
if (dirty)
|
||||
p->dirty = CACHETABLE_DIRTY;
|
||||
|
||||
cachetable_lock(ct);
|
||||
rwlock_read_unlock(&cf->fdlock);
|
||||
|
||||
@@ -122,7 +122,7 @@ typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void
|
||||
// Returns: 0 if success, otherwise an error number. The address and size of the object
|
||||
// associated with the key are returned.
|
||||
// Can access fd (fd is protected by a readlock during call)
|
||||
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs);
|
||||
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs);
|
||||
|
||||
void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata,
|
||||
int (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
|
||||
|
||||
@@ -111,7 +111,7 @@ struct __attribute__ ((__packed__)) leafentry {
|
||||
|
||||
|
||||
typedef struct leafentry *LEAFENTRY;
|
||||
|
||||
typedef struct leafentry_12 *LEAFENTRY_12;
|
||||
|
||||
u_int32_t toku_le_crc(LEAFENTRY v);
|
||||
|
||||
@@ -173,7 +173,6 @@ le_clean(uint8_t *key, uint32_t keylen,
|
||||
struct dbuf *d);
|
||||
|
||||
|
||||
|
||||
//Callback contract:
|
||||
// Function checks to see if id is accepted by context.
|
||||
// Returns:
|
||||
@@ -187,6 +186,15 @@ int le_iterate_is_empty(LEAFENTRY le, LE_ITERATE_CALLBACK f, BOOL *is_empty, TOK
|
||||
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, u_int32_t *vallenp, TOKUTXN context);
|
||||
|
||||
|
||||
size_t
|
||||
leafentry_disksize_12(LEAFENTRY_12 le);
|
||||
int
|
||||
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
size_t *new_leafentry_disksize,
|
||||
LEAFENTRY *new_leafentry_p);
|
||||
|
||||
|
||||
#if defined(__cplusplus) || defined(__cilkplusplus)
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); }
|
||||
|
||||
void toku_set_lsn_increment (uint64_t incr) __attribute__((__visibility__("default")));
|
||||
|
||||
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir);
|
||||
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, BOOL * upgrade_in_progress);
|
||||
uint64_t toku_log_upgrade_get_footprint(void);
|
||||
|
||||
|
||||
|
||||
@@ -8,15 +8,18 @@
|
||||
#include "checkpoint.h"
|
||||
|
||||
static uint64_t footprint = 0; // for debug and accountability
|
||||
static uint64_t footprint_previous_upgrade = 0; // for debug and accountability
|
||||
|
||||
uint64_t
|
||||
toku_log_upgrade_get_footprint(void) {
|
||||
return footprint + (100000 * footprint_previous_upgrade);
|
||||
return footprint;
|
||||
}
|
||||
|
||||
#define FOOTPRINT(x) footprint=footprint_start+(x*footprint_increment)
|
||||
#define FOOTPRINTSETUP(increment) uint64_t footprint_start=footprint; uint64_t footprint_increment=increment;
|
||||
// Footprint concept here is that each function increments a different decimal digit.
|
||||
// The cumulative total shows the path taken for the upgrade.
|
||||
// Each function must have a single return for this to work.
|
||||
#define FOOTPRINT(x) function_footprint=(x*footprint_increment)
|
||||
#define FOOTPRINTSETUP(increment) uint64_t function_footprint = 0; uint64_t footprint_increment=increment;
|
||||
#define FOOTPRINTCAPTURE footprint+=function_footprint;
|
||||
|
||||
// The lock file is used to detect a failed upgrade. It is created at the start
|
||||
// of the upgrade procedure and deleted at the end of the upgrade procedure. If
|
||||
@@ -37,17 +40,17 @@ static const int upgrade_lock_prefix_size = 8 // magic ("tokuupgr")
|
||||
|
||||
static int
|
||||
verify_clean_shutdown_of_log_version_current(const char *log_dir, LSN * last_lsn) {
|
||||
int rval = DB_RUNRECOVERY;
|
||||
TOKULOGCURSOR logcursor = NULL;
|
||||
int rval = TOKUDB_UPGRADE_FAILURE;
|
||||
TOKULOGCURSOR cursor = NULL;
|
||||
int r;
|
||||
FOOTPRINTSETUP(100);
|
||||
|
||||
FOOTPRINT(1);
|
||||
|
||||
r = toku_logcursor_create(&logcursor, log_dir);
|
||||
r = toku_logcursor_create(&cursor, log_dir);
|
||||
assert(r == 0);
|
||||
struct log_entry *le = NULL;
|
||||
r = toku_logcursor_last(logcursor, &le);
|
||||
r = toku_logcursor_last(cursor, &le);
|
||||
if (r == 0) {
|
||||
FOOTPRINT(2);
|
||||
if (le->cmd==LT_shutdown) {
|
||||
@@ -57,276 +60,108 @@ verify_clean_shutdown_of_log_version_current(const char *log_dir, LSN * last_lsn
|
||||
rval = 0;
|
||||
}
|
||||
}
|
||||
r = toku_logcursor_destroy(&logcursor);
|
||||
r = toku_logcursor_destroy(&cursor);
|
||||
assert(r == 0);
|
||||
FOOTPRINTCAPTURE;
|
||||
return rval;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
verify_clean_shutdown_of_log_version_1(const char *log_dir, LSN * last_lsn) {
|
||||
FOOTPRINTSETUP(100);
|
||||
verify_clean_shutdown_of_log_version_old(const char *log_dir, LSN * last_lsn) {
|
||||
int rval = TOKUDB_UPGRADE_FAILURE;
|
||||
int r;
|
||||
FOOTPRINTSETUP(10);
|
||||
|
||||
FOOTPRINT(1);
|
||||
//TODO: Remove this hack:
|
||||
//Base this function on
|
||||
// - (above)verify_clean_shutdown_of_log_version_current
|
||||
// - (3.1)tokudb_needs_recovery
|
||||
// - do breadth/depth first search to find out which functions have to be copied over from 3.1
|
||||
// - Put copied functions in .. backwards_log_1.[ch]
|
||||
LSN lsn = {.lsn = 1LLU << 40};
|
||||
if (last_lsn)
|
||||
*last_lsn = lsn;
|
||||
log_dir = log_dir;
|
||||
|
||||
return 0;
|
||||
|
||||
int n_logfiles;
|
||||
char **logfiles;
|
||||
r = toku_logger_find_logfiles(log_dir, &logfiles, &n_logfiles);
|
||||
if (r!=0) return r;
|
||||
|
||||
char *basename;
|
||||
TOKULOGCURSOR cursor;
|
||||
struct log_entry *entry;
|
||||
//Only look at newest log
|
||||
basename = strrchr(logfiles[n_logfiles-1], '/') + 1;
|
||||
int version;
|
||||
long long index = -1;
|
||||
r = sscanf(basename, "log%lld.tokulog%d", &index, &version);
|
||||
assert(r==2); // found index and version
|
||||
assert(version>=TOKU_LOG_MIN_SUPPORTED_VERSION);
|
||||
assert(version< TOKU_LOG_VERSION); //Must be old
|
||||
// find last LSN
|
||||
r = toku_logcursor_create_for_file(&cursor, log_dir, basename);
|
||||
if (r==0) {
|
||||
r = toku_logcursor_last(cursor, &entry);
|
||||
if (r == 0) {
|
||||
FOOTPRINT(2);
|
||||
if (entry->cmd==LT_shutdown) {
|
||||
LSN lsn = entry->u.shutdown.lsn;
|
||||
if (last_lsn)
|
||||
*last_lsn = lsn;
|
||||
rval = 0;
|
||||
}
|
||||
}
|
||||
r = toku_logcursor_destroy(&cursor);
|
||||
assert(r == 0);
|
||||
}
|
||||
for(int i=0;i<n_logfiles;i++) {
|
||||
toku_free(logfiles[i]);
|
||||
}
|
||||
toku_free(logfiles);
|
||||
FOOTPRINTCAPTURE;
|
||||
return rval;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
verify_clean_shutdown_of_log_version(const char *log_dir, uint32_t version, LSN *last_lsn) {
|
||||
// return 0 if clean shutdown, DB_RUNRECOVERY if not clean shutdown
|
||||
// return 0 if clean shutdown, TOKUDB_UPGRADE_FAILURE if not clean shutdown
|
||||
// examine logfile at logfilenum and possibly logfilenum-1
|
||||
int r = 0;
|
||||
FOOTPRINTSETUP(100);
|
||||
FOOTPRINTSETUP(1000);
|
||||
|
||||
if (version == TOKU_LOG_VERSION_1) {
|
||||
if (version < TOKU_LOG_VERSION) {
|
||||
FOOTPRINT(1);
|
||||
r = verify_clean_shutdown_of_log_version_1(log_dir, last_lsn);
|
||||
r = verify_clean_shutdown_of_log_version_old(log_dir, last_lsn);
|
||||
}
|
||||
else {
|
||||
FOOTPRINT(2);
|
||||
assert(version == TOKU_LOG_VERSION);
|
||||
r = verify_clean_shutdown_of_log_version_current(log_dir, last_lsn);
|
||||
}
|
||||
FOOTPRINTCAPTURE;
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//Cross the Rubicon (POINT OF NO RETURN)
|
||||
static int
|
||||
convert_logs_and_fsync(const char *log_dir, const char *env_dir, uint32_t from_version, uint32_t to_version) {
|
||||
int r;
|
||||
FOOTPRINTSETUP(100);
|
||||
|
||||
r = verify_clean_shutdown_of_log_version(log_dir, to_version, NULL);
|
||||
assert(r==0);
|
||||
r = toku_delete_all_logs_of_version(log_dir, from_version);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
if (to_version==TOKU_LOG_VERSION_1) {
|
||||
//Undo an upgrade from version 1.
|
||||
//Delete rollback cachefile if it exists.
|
||||
FOOTPRINT(1);
|
||||
|
||||
int rollback_len = strlen(log_dir) + sizeof(ROLLBACK_CACHEFILE_NAME) +1; //1 for '/'
|
||||
char rollback_fname[rollback_len];
|
||||
|
||||
{
|
||||
int l = snprintf(rollback_fname, sizeof(rollback_fname),
|
||||
"%s/%s", env_dir, ROLLBACK_CACHEFILE_NAME);
|
||||
assert(l+1 == (signed)(sizeof(rollback_fname)));
|
||||
}
|
||||
r = unlink(rollback_fname);
|
||||
assert(r==0 || errno==ENOENT);
|
||||
if (r==0) {
|
||||
r = toku_fsync_dir_by_name_without_accounting(env_dir);
|
||||
assert(r==0);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
//After this function completes:
|
||||
// If any log files exist they are all of the same version.
|
||||
// There is no lock file.
|
||||
// There is no commit file.
|
||||
static int
|
||||
cleanup_previous_upgrade_attempt(const char *env_dir, const char *log_dir,
|
||||
const char *upgrade_lock_fname,
|
||||
const char *upgrade_commit_fname) {
|
||||
int r = 0;
|
||||
int lock_fd;
|
||||
int commit_fd;
|
||||
unsigned char prefix[upgrade_lock_prefix_size];
|
||||
FOOTPRINTSETUP(1000);
|
||||
|
||||
commit_fd = open(upgrade_commit_fname, O_RDONLY|O_BINARY, S_IRWXU);
|
||||
if (commit_fd<0) {
|
||||
assert(errno==ENOENT);
|
||||
}
|
||||
lock_fd = open(upgrade_lock_fname, O_RDONLY|O_BINARY, S_IRWXU);
|
||||
if (lock_fd<0) {
|
||||
assert(errno == ENOENT);
|
||||
//Nothing to clean up (lock file does not exist).
|
||||
}
|
||||
else { //Lock file exists. Will commit or abort the upgrade.
|
||||
FOOTPRINT(1);
|
||||
int64_t n = pread(lock_fd, prefix, upgrade_lock_prefix_size, 0);
|
||||
assert(n>=0 && n <= upgrade_lock_prefix_size);
|
||||
struct rbuf rb;
|
||||
rb.size = upgrade_lock_prefix_size;
|
||||
rb.buf = prefix;
|
||||
rb.ndone = 0;
|
||||
if (n == upgrade_lock_prefix_size) {
|
||||
FOOTPRINT(2);
|
||||
//Check magic number
|
||||
bytevec magic;
|
||||
rbuf_literal_bytes(&rb, &magic, 8);
|
||||
assert(memcmp(magic,"tokuupgr",8)==0);
|
||||
uint32_t to_version = rbuf_network_int(&rb);
|
||||
uint32_t from_version = rbuf_network_int(&rb);
|
||||
uint32_t suffix_length = rbuf_int(&rb);
|
||||
uint32_t stored_x1764 = rbuf_int(&rb);
|
||||
uint32_t calculated_x1764 = x1764_memory(rb.buf, rb.size-4);
|
||||
assert(calculated_x1764 == stored_x1764);
|
||||
//Now that checksum matches, verify data.
|
||||
|
||||
assert(to_version == TOKU_LOG_VERSION); //Only upgrading directly to newest log version.
|
||||
assert(from_version < TOKU_LOG_VERSION); //Otherwise it isn't an upgrade.
|
||||
assert(from_version >= TOKU_LOG_MIN_SUPPORTED_VERSION); //TODO: make this an error case once we have 3 log versions
|
||||
assert(suffix_length == 0); //TODO: Future versions may change this.
|
||||
if (commit_fd>=0) { //Commit the upgrade
|
||||
footprint_previous_upgrade = 1;
|
||||
FOOTPRINT(3);
|
||||
r = convert_logs_and_fsync(log_dir, env_dir, from_version, to_version);
|
||||
assert(r==0);
|
||||
}
|
||||
else { //Abort the upgrade
|
||||
footprint_previous_upgrade = 2;
|
||||
FOOTPRINT(4);
|
||||
r = convert_logs_and_fsync(log_dir, env_dir, to_version, from_version);
|
||||
assert(r==0);
|
||||
}
|
||||
}
|
||||
else { // We never finished writing lock file: commit file cannot exist yet.
|
||||
// We are aborting the upgrade, but because the previous attempt never got past
|
||||
// writing the lock file, nothing needs to be undone.
|
||||
assert(commit_fd<0);
|
||||
}
|
||||
{ //delete lock file
|
||||
r = close(lock_fd);
|
||||
assert(r==0);
|
||||
r = unlink(upgrade_lock_fname);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
}
|
||||
}
|
||||
if (commit_fd>=0) { //delete commit file
|
||||
r = close(commit_fd);
|
||||
assert(r==0);
|
||||
r = unlink(upgrade_commit_fname);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
write_commit_file_and_fsync(const char *log_dir, const char * upgrade_commit_fname) {
|
||||
int fd;
|
||||
fd = open(upgrade_commit_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU);
|
||||
assert(fd>=0);
|
||||
|
||||
int r;
|
||||
r = toku_file_fsync_without_accounting(fd);
|
||||
assert(r==0);
|
||||
r = close(fd);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int
|
||||
write_lock_file_and_fsync(const char *log_dir, const char * upgrade_lock_fname, uint32_t from_version) {
|
||||
int fd;
|
||||
fd = open(upgrade_lock_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU);
|
||||
assert(fd>=0);
|
||||
|
||||
char buf[upgrade_lock_prefix_size];
|
||||
struct wbuf wb;
|
||||
const int suffix_size = 0;
|
||||
wbuf_init(&wb, buf, upgrade_lock_prefix_size);
|
||||
{ //Serialize to wbuf
|
||||
wbuf_literal_bytes(&wb, "tokuupgr", 8); //magic
|
||||
wbuf_network_int(&wb, TOKU_LOG_VERSION); //to version
|
||||
wbuf_network_int(&wb, from_version); //from version
|
||||
wbuf_int(&wb, suffix_size); //Suffix Length
|
||||
u_int32_t checksum = x1764_finish(&wb.checksum);
|
||||
wbuf_int(&wb, checksum); //checksum
|
||||
assert(wb.ndone == wb.size);
|
||||
}
|
||||
toku_os_full_pwrite(fd, wb.buf, wb.size, 0);
|
||||
{
|
||||
//Serialize suffix to wbuf and then disk (if exist)
|
||||
//There is no suffix as of TOKU_LOG_VERSION_2
|
||||
}
|
||||
int r;
|
||||
r = toku_file_fsync_without_accounting(fd);
|
||||
assert(r==0);
|
||||
r = close(fd);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
return r;
|
||||
}
|
||||
|
||||
// from_version is version of lognumber_newest, which contains last_lsn
|
||||
static int
|
||||
upgrade_log(const char *env_dir, const char *log_dir,
|
||||
const char * upgrade_lock_fname, const char * upgrade_commit_fname,
|
||||
LSN last_lsn,
|
||||
uint32_t from_version) { // the real deal
|
||||
upgrade_log(const char *env_dir, const char *log_dir, LSN last_lsn) { // the real deal
|
||||
int r;
|
||||
FOOTPRINTSETUP(1000);
|
||||
FOOTPRINTSETUP(10000);
|
||||
|
||||
r = write_lock_file_and_fsync(log_dir, upgrade_lock_fname, from_version);
|
||||
assert(r==0);
|
||||
|
||||
LSN initial_lsn = last_lsn;
|
||||
initial_lsn.lsn++;
|
||||
CACHETABLE ct;
|
||||
TOKULOGGER logger;
|
||||
|
||||
FOOTPRINT(1);
|
||||
|
||||
{ //Create temporary environment
|
||||
r = toku_create_cachetable(&ct, 1<<25, initial_lsn, NULL);
|
||||
assert(r == 0);
|
||||
toku_cachetable_set_env_dir(ct, env_dir);
|
||||
r = toku_logger_create(&logger);
|
||||
assert(r == 0);
|
||||
toku_logger_write_log_files(logger, FALSE); //Prevent initial creation of log file
|
||||
toku_logger_set_cachetable(logger, ct);
|
||||
r = toku_logger_open(log_dir, logger);
|
||||
assert(r==0);
|
||||
r = toku_logger_restart(logger, initial_lsn); //Turn log writing on and create first log file with initial lsn
|
||||
assert(r==0);
|
||||
FOOTPRINT(1);
|
||||
}
|
||||
if (from_version == TOKU_LOG_VERSION_1) {
|
||||
{ //Create rollback cachefile
|
||||
r = toku_logger_open_rollback(logger, ct, TRUE);
|
||||
assert(r==0);
|
||||
}
|
||||
{ //Checkpoint
|
||||
r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL);
|
||||
assert(r == 0);
|
||||
}
|
||||
{ //Close rollback cachefile
|
||||
r = toku_logger_close_rollback(logger, FALSE);
|
||||
assert(r==0);
|
||||
}
|
||||
FOOTPRINT(2);
|
||||
}
|
||||
{ //Checkpoint
|
||||
r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL); //fsyncs log dir
|
||||
assert(r == 0);
|
||||
FOOTPRINT(3);
|
||||
}
|
||||
{ //Close cachetable and logger
|
||||
r = toku_logger_shutdown(logger);
|
||||
@@ -335,82 +170,53 @@ upgrade_log(const char *env_dir, const char *log_dir,
|
||||
assert(r==0);
|
||||
r = toku_logger_close(&logger);
|
||||
assert(r==0);
|
||||
FOOTPRINT(4);
|
||||
}
|
||||
{ //Write commit file
|
||||
r = write_commit_file_and_fsync(log_dir, upgrade_commit_fname);
|
||||
{
|
||||
r = verify_clean_shutdown_of_log_version(log_dir, TOKU_LOG_VERSION, NULL);
|
||||
assert(r==0);
|
||||
}
|
||||
{ // Cross the Rubicon here:
|
||||
// Delete all old logs: POINT OF NO RETURN
|
||||
r = convert_logs_and_fsync(log_dir, env_dir, from_version, TOKU_LOG_VERSION);
|
||||
assert(r==0);
|
||||
FOOTPRINT(5);
|
||||
}
|
||||
{ //Delete upgrade lock file and ensure directory is fsynced
|
||||
r = unlink(upgrade_lock_fname);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
}
|
||||
{ //Delete upgrade commit file and ensure directory is fsynced
|
||||
r = unlink(upgrade_commit_fname);
|
||||
assert(r==0);
|
||||
r = toku_fsync_dir_by_name_without_accounting(log_dir);
|
||||
assert(r==0);
|
||||
}
|
||||
FOOTPRINT(6);
|
||||
FOOTPRINTCAPTURE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
toku_maybe_upgrade_log(const char *env_dir, const char *log_dir) {
|
||||
toku_maybe_upgrade_log(const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, BOOL * upgrade_in_progress) {
|
||||
int r;
|
||||
int lockfd = -1;
|
||||
FOOTPRINTSETUP(10000);
|
||||
FOOTPRINTSETUP(100000);
|
||||
|
||||
*upgrade_in_progress = FALSE; // set TRUE only if all criteria are met and we're actually doing an upgrade
|
||||
|
||||
FOOTPRINT(1);
|
||||
r = toku_recover_lock(log_dir, &lockfd);
|
||||
if (r == 0) {
|
||||
FOOTPRINT(2);
|
||||
assert(log_dir);
|
||||
assert(env_dir);
|
||||
char upgrade_lock_fname[strlen(log_dir) + sizeof(upgrade_lock_file_suffix)];
|
||||
{ //Generate full fname
|
||||
int l = snprintf(upgrade_lock_fname, sizeof(upgrade_lock_fname),
|
||||
"%s%s", log_dir, upgrade_lock_file_suffix);
|
||||
assert(l+1 == (ssize_t)(sizeof(upgrade_lock_fname)));
|
||||
}
|
||||
char upgrade_commit_fname[strlen(log_dir) + sizeof(upgrade_commit_file_suffix)];
|
||||
{ //Generate full fname
|
||||
int l = snprintf(upgrade_commit_fname, sizeof(upgrade_commit_fname),
|
||||
"%s%s", log_dir, upgrade_commit_file_suffix);
|
||||
assert(l+1 == (ssize_t)(sizeof(upgrade_commit_fname)));
|
||||
}
|
||||
|
||||
r = cleanup_previous_upgrade_attempt(env_dir, log_dir,
|
||||
upgrade_lock_fname, upgrade_commit_fname);
|
||||
uint32_t version_of_logs_on_disk;
|
||||
BOOL found_any_logs;
|
||||
r = toku_get_version_of_logs_on_disk(log_dir, &found_any_logs, &version_of_logs_on_disk);
|
||||
if (r==0) {
|
||||
uint32_t version_of_logs_on_disk;
|
||||
BOOL found_any_logs;
|
||||
r = toku_get_version_of_logs_on_disk(log_dir, &found_any_logs, &version_of_logs_on_disk);
|
||||
if (r==0) {
|
||||
if (!found_any_logs)
|
||||
r = 0; //No logs means no logs to upgrade.
|
||||
else if (version_of_logs_on_disk > TOKU_LOG_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_NEW;
|
||||
else if (version_of_logs_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_OLD;
|
||||
else if (version_of_logs_on_disk == TOKU_LOG_VERSION)
|
||||
r = 0; //Logs are up to date
|
||||
else {
|
||||
FOOTPRINT(1);
|
||||
LSN last_lsn;
|
||||
r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn);
|
||||
if (r==0) {
|
||||
FOOTPRINT(2);
|
||||
r = upgrade_log(env_dir, log_dir,
|
||||
upgrade_lock_fname, upgrade_commit_fname,
|
||||
last_lsn, version_of_logs_on_disk);
|
||||
}
|
||||
FOOTPRINT(3);
|
||||
if (!found_any_logs)
|
||||
r = 0; //No logs means no logs to upgrade.
|
||||
else if (version_of_logs_on_disk > TOKU_LOG_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_NEW;
|
||||
else if (version_of_logs_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_OLD;
|
||||
else if (version_of_logs_on_disk == TOKU_LOG_VERSION)
|
||||
r = 0; //Logs are up to date
|
||||
else {
|
||||
FOOTPRINT(4);
|
||||
LSN last_lsn;
|
||||
r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn);
|
||||
if (r==0) {
|
||||
FOOTPRINT(5);
|
||||
*lsn_of_clean_shutdown = last_lsn;
|
||||
*upgrade_in_progress = TRUE;
|
||||
r = upgrade_log(env_dir, log_dir, last_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -421,6 +227,7 @@ toku_maybe_upgrade_log(const char *env_dir, const char *log_dir) {
|
||||
if (r==0) r = rc;
|
||||
}
|
||||
}
|
||||
FOOTPRINTCAPTURE;
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
@@ -89,7 +89,7 @@ static int lc_open_logfile(TOKULOGCURSOR lc, int index) {
|
||||
r = toku_read_logmagic(lc->cur_fp, &version);
|
||||
if (r!=0)
|
||||
return DB_BADFORMAT;
|
||||
if (version != TOKU_LOG_VERSION)
|
||||
if (version < TOKU_LOG_MIN_SUPPORTED_VERSION || version > TOKU_LOG_VERSION)
|
||||
return DB_BADFORMAT;
|
||||
}
|
||||
// mark as open
|
||||
@@ -379,6 +379,7 @@ int toku_logcursor_first(TOKULOGCURSOR lc, struct log_entry **le) {
|
||||
return r;
|
||||
}
|
||||
|
||||
//get last entry in the logfile specified by logcursor
|
||||
int toku_logcursor_last(TOKULOGCURSOR lc, struct log_entry **le) {
|
||||
int r=0;
|
||||
if ( lc->entry_valid ) {
|
||||
@@ -462,6 +463,7 @@ static int lc_fix_bad_logfile(TOKULOGCURSOR lc) {
|
||||
|
||||
r = fseek(lc->cur_fp, 0, SEEK_SET); if ( r!=0 ) return r;
|
||||
r = toku_read_logmagic(lc->cur_fp, &version); if ( r!=0 ) return r;
|
||||
if (version != TOKU_LOG_VERSION) return -1;
|
||||
|
||||
toku_off_t last_good_pos;
|
||||
last_good_pos = ftello(lc->cur_fp);
|
||||
|
||||
@@ -79,17 +79,20 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir) {
|
||||
return ENOMEM;
|
||||
}
|
||||
// find the index
|
||||
// basename is the filename of the i-th logfile
|
||||
basename = strrchr(logfiles[i], '/') + 1;
|
||||
int version;
|
||||
r = sscanf(basename, "log%lld.tokulog%d", &index, &version);
|
||||
assert(r==2); // found index and version
|
||||
assert(version==TOKU_LOG_VERSION);
|
||||
assert(version>=TOKU_LOG_MIN_SUPPORTED_VERSION);
|
||||
assert(version<=TOKU_LOG_VERSION);
|
||||
lf_info->index = index;
|
||||
// find last LSN
|
||||
lf_info->version = version;
|
||||
// find last LSN in logfile
|
||||
r = toku_logcursor_create_for_file(&cursor, log_dir, basename);
|
||||
if (r!=0)
|
||||
return r;
|
||||
r = toku_logcursor_last(cursor, &entry);
|
||||
r = toku_logcursor_last(cursor, &entry); // set "entry" to last log entry in logfile
|
||||
if ( r == 0 ) {
|
||||
lf_info->maxlsn = toku_log_entry_get_lsn(entry);
|
||||
tmp_lsn = lf_info->maxlsn;
|
||||
|
||||
@@ -15,6 +15,7 @@ extern "C" {
|
||||
struct toku_logfile_info {
|
||||
int64_t index;
|
||||
LSN maxlsn;
|
||||
uint32_t version;
|
||||
};
|
||||
typedef struct toku_logfile_info *TOKULOGFILEINFO;
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ static const int log_format_version=TOKU_LOG_VERSION;
|
||||
|
||||
static int open_logfile (TOKULOGGER logger);
|
||||
static int toku_logger_write_buffer (TOKULOGGER logger, LSN *fsynced_lsn);
|
||||
static int delete_logfile(TOKULOGGER logger, long long index);
|
||||
static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version);
|
||||
static void grab_output(TOKULOGGER logger, LSN *fsynced_lsn);
|
||||
static void release_output(TOKULOGGER logger, LSN fsynced_lsn);
|
||||
|
||||
@@ -573,10 +573,40 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu
|
||||
return r;
|
||||
}
|
||||
|
||||
// TODO: Put this in portability layer when ready
// Return the leaf (final component) of a path name.
//   in:     pathname, which may or may not have a dirname prefix ending in '/'
//   return: pointer INTO pathname, just past the last '/'; pathname itself
//           when it contains no '/'.  Nothing is allocated or modified.
static char * fileleafname(char *pathname) {
    const char delimiter = '/';
    char *leafname = strrchr(pathname, delimiter);
    if (leafname) {
        leafname++;            // step over the delimiter itself
    }
    else {
        leafname = pathname;   // no directory prefix present
    }
    return leafname;
}
|
||||
|
||||
static int logfilenamecompare (const void *ap, const void *bp) {
|
||||
char *a=*(char**)ap;
|
||||
char *a_leafname = fileleafname(a);
|
||||
char *b=*(char**)bp;
|
||||
return strcmp(a,b);
|
||||
char * b_leafname = fileleafname(b);
|
||||
int rval;
|
||||
BOOL valid;
|
||||
uint64_t num_a = 0; // placate compiler
|
||||
uint64_t num_b = 0;
|
||||
uint32_t ver_a = 0;
|
||||
uint32_t ver_b = 0;
|
||||
valid = is_a_logfile_any_version(a_leafname, &num_a, &ver_a);
|
||||
invariant(valid);
|
||||
valid = is_a_logfile_any_version(b_leafname, &num_b, &ver_b);
|
||||
invariant(valid);
|
||||
if (ver_a < ver_b) rval = -1;
|
||||
else if (ver_a > ver_b) rval = +1;
|
||||
else if (num_a < num_b) rval = -1;
|
||||
else if (num_a > num_b) rval = +1;
|
||||
else rval = 0;
|
||||
return rval;
|
||||
}
|
||||
|
||||
// Return the log files in sorted order
|
||||
@@ -596,8 +626,9 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
|
||||
}
|
||||
int dirnamelen = strlen(directory);
|
||||
while ((de=readdir(d))) {
|
||||
long long thisl;
|
||||
if ( !(is_a_logfile(de->d_name, &thisl)) ) continue; //#2424: Skip over files that don't match the exact logfile template
|
||||
uint64_t thisl;
|
||||
uint32_t version_ignore;
|
||||
if ( !(is_a_logfile_any_version(de->d_name, &thisl, &version_ignore)) ) continue; //#2424: Skip over files that don't match the exact logfile template
|
||||
if (n_results+1>=result_limit) {
|
||||
result_limit*=2;
|
||||
result = toku_realloc(result, result_limit*sizeof(*result));
|
||||
@@ -610,8 +641,12 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
|
||||
snprintf(fname, fnamelen, "%s/%s", directory, de->d_name);
|
||||
result[n_results++] = fname;
|
||||
}
|
||||
// Return them in increasing order.
|
||||
qsort(result, n_results, sizeof(result[0]), logfilenamecompare);
|
||||
// Return them in increasing order. Set width to allow for newer log file names ("xxx.tokulog13")
|
||||
// which are one character longer than old log file names ("xxx.tokulog2"). The comparison function
|
||||
// won't look beyond the terminating NUL, so an extra character in the comparison string doesn't matter.
|
||||
// Allow room for terminating NUL after "xxx.tokulog13" even if result[0] is of form "xxx.tokulog2."
|
||||
int width = sizeof(result[0]+2);
|
||||
qsort(result, n_results, width, logfilenamecompare);
|
||||
*resultp = result;
|
||||
*n_logfiles = n_results;
|
||||
result[n_results]=0; // make a trailing null
|
||||
@@ -644,6 +679,7 @@ static int open_logfile (TOKULOGGER logger)
|
||||
return ENOMEM;
|
||||
lf_info->index = index;
|
||||
lf_info->maxlsn = logger->written_lsn;
|
||||
lf_info->version = TOKU_LOG_VERSION;
|
||||
toku_logfilemgr_add_logfile_info(logger->logfilemgr, lf_info);
|
||||
}
|
||||
logger->fsynced_lsn = logger->written_lsn;
|
||||
@@ -651,12 +687,12 @@ static int open_logfile (TOKULOGGER logger)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Remove the log file "<logger->directory>/log<index>.tokulog<version>".
//   index:   log file number (printed zero-padded to 12 digits)
//   version: log format version that is part of the file name
//   return:  the result of remove() (0 on success)
static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version)
// Entry and Exit: This thread has permission to modify the output.
{
    int fnamelen = strlen(logger->directory)+50;  // room for "/log", 12+ digits, ".tokulog", version, NUL
    char fname[fnamelen];
    // version is cast so the argument type matches the %d conversion.
    snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, index, (int) version);
    int r = remove(fname);
    return r;
}
|
||||
@@ -675,7 +711,9 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
|
||||
|
||||
if ( logger->write_log_files && logger->trim_log_files) {
|
||||
while ( n_logfiles > 1 ) { // don't delete current logfile
|
||||
uint32_t log_version;
|
||||
lf_info = toku_logfilemgr_get_oldest_logfile_info(lfm);
|
||||
log_version = lf_info->version;
|
||||
if ( lf_info->maxlsn.lsn > trim_lsn.lsn ) {
|
||||
// file contains an open LSN, can't delete this or any newer log files
|
||||
break;
|
||||
@@ -684,7 +722,7 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
|
||||
long index = lf_info->index;
|
||||
toku_logfilemgr_delete_oldest_logfile_info(lfm);
|
||||
n_logfiles--;
|
||||
r = delete_logfile(logger, index);
|
||||
r = delete_logfile(logger, index, log_version);
|
||||
if (r!=0) {
|
||||
break;
|
||||
}
|
||||
@@ -1329,7 +1367,7 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s) {
|
||||
int
|
||||
toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found) {
|
||||
BOOL found = FALSE;
|
||||
uint32_t single_version = 0;
|
||||
uint32_t highest_version = 0;
|
||||
int r = 0;
|
||||
|
||||
struct dirent *de;
|
||||
@@ -1338,16 +1376,17 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
|
||||
r = errno;
|
||||
}
|
||||
else {
|
||||
// Examine every file in the directory and assert that all log files are of the same version (single_version).
|
||||
// Examine every file in the directory and find highest version
|
||||
while ((de=readdir(d))) {
|
||||
uint32_t this_log_version;
|
||||
uint64_t this_log_number;
|
||||
BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version);
|
||||
if (is_log) {
|
||||
if (found)
|
||||
assert(single_version == this_log_version);
|
||||
if (found) {
|
||||
highest_version = highest_version > this_log_version ? highest_version : this_log_version;
|
||||
}
|
||||
found = TRUE;
|
||||
single_version = this_log_version;
|
||||
highest_version = this_log_version;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1358,7 +1397,7 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
|
||||
if (r==0) {
|
||||
*found_any_logs = found;
|
||||
if (found)
|
||||
*version_found = single_version;
|
||||
*version_found = highest_version;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -9,12 +9,19 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "brt_layout_version.h"
|
||||
|
||||
enum {
|
||||
TOKU_LOG_VERSION_1 = 1,
|
||||
TOKU_LOG_VERSION_2 = 2,
|
||||
TOKU_LOG_NEXT_VERSION, // the version after the current version
|
||||
TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line.
|
||||
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2
|
||||
//After 2 we linked the log version to the BRT_LAYOUT VERSION.
|
||||
//So it went from 2 to 13 (3-12 do not exist)
|
||||
TOKU_LOG_VERSION = BRT_LAYOUT_VERSION, //Linked
|
||||
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION > BRT_LAYOUT_VERSION_12 //linked once we remove support for 12
|
||||
TOKU_LOG_MIN_SUPPORTED_VERSION = BRT_LAYOUT_MIN_SUPPORTED_VERSION,
|
||||
#else
|
||||
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2,
|
||||
#endif
|
||||
};
|
||||
#define ROLLBACK_CACHEFILE_NAME "tokudb.rollback"
|
||||
|
||||
|
||||
@@ -474,7 +474,7 @@ static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM
|
||||
}
|
||||
|
||||
static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash,
|
||||
void **rollback_pv, long *sizep, void *extraargs) {
|
||||
void **rollback_pv, long *sizep, int * UU(dirtyp), void *extraargs) {
|
||||
int r;
|
||||
struct brt_header *h = extraargs;
|
||||
assert(h->cf == cachefile);
|
||||
|
||||
@@ -46,7 +46,7 @@ flush (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), void *value, void *UU
|
||||
}
|
||||
|
||||
static int
|
||||
fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), void *UU(extraargs))
|
||||
fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), int *UU(dirtyp), void *UU(extraargs))
|
||||
{
|
||||
assert(0); // should not be called
|
||||
return 0;
|
||||
|
||||
@@ -19,12 +19,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
|
||||
if (keep_me) n_keep_me++;
|
||||
}
|
||||
|
||||
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) {
|
||||
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
|
||||
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
|
||||
assert(0); // should not be called
|
||||
n_fetch++;
|
||||
*value = 0;
|
||||
*sizep = item_size;
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
return 0;
|
||||
|
||||
@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
return 0;
|
||||
|
||||
@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
return 0;
|
||||
|
||||
@@ -17,10 +17,11 @@ flush (CACHEFILE cf __attribute__((__unused__)),
|
||||
}
|
||||
|
||||
static int
|
||||
fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, void *extra) {
|
||||
fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, int *dirtyp, void *extra) {
|
||||
cf = cf; hash = hash; extra = extra;
|
||||
*sizep = (long) key.b;
|
||||
*vptr = toku_malloc(*sizep);
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -31,6 +32,7 @@ fetch_error (CACHEFILE cf __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void*extraargs __attribute__((__unused__))
|
||||
) {
|
||||
return -1;
|
||||
|
||||
@@ -22,12 +22,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
|
||||
if (keep_me) n_keep_me++;
|
||||
}
|
||||
|
||||
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) {
|
||||
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
|
||||
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
|
||||
n_fetch++;
|
||||
sleep(10);
|
||||
*value = 0;
|
||||
*sizep = item_size;
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return -42;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = toku_malloc(1);
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value,
|
||||
long *sizep,
|
||||
int *dirtyp,
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -47,6 +48,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -32,7 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
|
||||
*dirtyp = 0;
|
||||
return -42;
|
||||
}
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
|
||||
@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
|
||||
*value = 0;
|
||||
*sizep = 1;
|
||||
*dirtyp = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,6 +75,7 @@ static int r_fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void**value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void*extraargs __attribute__((__unused__))) {
|
||||
// fprintf(stderr, "Whoops, this should never be called");
|
||||
return -42;
|
||||
|
||||
@@ -33,6 +33,7 @@ static int f_fetch (CACHEFILE f,
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void**value,
|
||||
long *sizep,
|
||||
int *dirtyp,
|
||||
void*extraargs __attribute__((__unused__))) {
|
||||
void *buf = toku_malloc(BLOCKSIZE);
|
||||
int r = pread(toku_cachefile_get_and_pin_fd(f), buf, BLOCKSIZE, key.b);
|
||||
@@ -40,6 +41,7 @@ static int f_fetch (CACHEFILE f,
|
||||
assert(r==BLOCKSIZE);
|
||||
*value = buf;
|
||||
*sizep = BLOCKSIZE;
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -144,12 +144,13 @@ static struct item *make_item (u_int64_t key) {
|
||||
}
|
||||
|
||||
static CACHEKEY did_fetch={-1};
|
||||
static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
|
||||
static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) {
|
||||
if (verbose) printf("Fetch %" PRId64 "\n", key.b);
|
||||
assert (expect_f==f);
|
||||
assert((long)extraargs==23);
|
||||
*value = make_item(key.b);
|
||||
*sizep = test_object_size;
|
||||
*dirtyp = 0;
|
||||
did_fetch=key;
|
||||
return 0;
|
||||
}
|
||||
@@ -308,9 +309,11 @@ static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEK
|
||||
}
|
||||
static int fetch_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
|
||||
void**value, long *sizep __attribute__((__unused__)),
|
||||
int * dirtyp, void*extraargs) {
|
||||
assert((long)extraargs==42);
|
||||
*value=0;
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -369,17 +372,19 @@ static void null_flush (CACHEFILE cf __attribute__((__unused__)),
|
||||
BOOL for_checkpoint __attribute__((__unused__))) {
|
||||
}
|
||||
|
||||
static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
|
||||
static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
|
||||
assert(fullhash==toku_cachetable_hash(cf,key));
|
||||
assert((long)extraargs==123);
|
||||
*value = (void*)((unsigned long)key.b+123L);
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
|
||||
static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
|
||||
assert(fullhash==toku_cachetable_hash(cf,key));
|
||||
assert((long)extraargs==222);
|
||||
*value = (void*)((unsigned long)key.b+222L);
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -443,8 +448,9 @@ static void test_dirty_flush(CACHEFILE f,
|
||||
if (verbose) printf("test_dirty_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size, (unsigned)do_write, (unsigned)keep);
|
||||
}
|
||||
|
||||
static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, void *arg) {
|
||||
static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, int * dirtyp, void *arg) {
|
||||
*value_ptr = arg;
|
||||
*dirtyp = 0;
|
||||
assert(fullhash==toku_cachetable_hash(f,key));
|
||||
if (verbose) printf("test_dirty_fetch %p %" PRId64 " %p %ld %p\n", f, key.b, *value_ptr, *size_ptr, arg);
|
||||
return 0;
|
||||
|
||||
@@ -112,10 +112,11 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)),
|
||||
//print_ints();
|
||||
}
|
||||
|
||||
static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
|
||||
static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
|
||||
assert(toku_cachetable_hash(f, key)==fullhash);
|
||||
assert((long)extraargs==(long)key.b);
|
||||
*value = (void*)(long)key.b;
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp __attribute__((__unused__)),
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
|
||||
u_int32_t fullhash __attribute__((__unused__)),
|
||||
void **value __attribute__((__unused__)),
|
||||
long *sizep __attribute__((__unused__)),
|
||||
int *dirtyp,
|
||||
void *extraargs __attribute__((__unused__))
|
||||
) {
|
||||
*dirtyp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
204
newbrt/ule.c
204
newbrt/ule.c
@@ -313,7 +313,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
|
||||
XIDS xids = brt_msg_get_xids(msg);
|
||||
invariant(xids_get_num_xids(xids) < MAX_TRANSACTION_RECORDS);
|
||||
enum brt_msg_type type = brt_msg_get_type(msg);
|
||||
if (type != BRT_OPTIMIZE) {
|
||||
if (type != BRT_OPTIMIZE && type != BRT_OPTIMIZE_FOR_UPGRADE) {
|
||||
ule_do_implicit_promotions(ule, xids);
|
||||
}
|
||||
switch (type) {
|
||||
@@ -342,6 +342,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
|
||||
ule_apply_commit(ule, xids);
|
||||
break;
|
||||
case BRT_OPTIMIZE:
|
||||
case BRT_OPTIMIZE_FOR_UPGRADE:
|
||||
ule_optimize(ule, xids);
|
||||
break;
|
||||
default:
|
||||
@@ -358,7 +359,7 @@ test_msg_modify_ule(ULE ule, BRT_MSG msg){
|
||||
|
||||
static void ule_optimize(ULE ule, XIDS xids) {
|
||||
if (ule->num_puxrs) {
|
||||
TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid;
|
||||
TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid; // outermost uncommitted
|
||||
TXNID oldest_living_xid = TXNID_NONE;
|
||||
uint32_t num_xids = xids_get_num_xids(xids);
|
||||
if (num_xids > 0) {
|
||||
@@ -2018,3 +2019,202 @@ bool transaction_open(TXNID xid) {
|
||||
|
||||
#endif
|
||||
|
||||
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
|
||||
#if TOKU_WINDOWS
|
||||
#pragma pack(push, 1)
|
||||
#endif
|
||||
struct __attribute__ ((__packed__)) leafentry_12 {
|
||||
u_int8_t num_xrs;
|
||||
u_int32_t keylen;
|
||||
u_int32_t innermost_inserted_vallen;
|
||||
union {
|
||||
struct __attribute__ ((__packed__)) leafentry_committed_12 {
|
||||
u_int8_t key_val[0]; //Actual key, then actual val
|
||||
} comm;
|
||||
struct __attribute__ ((__packed__)) leafentry_provisional_12 {
|
||||
u_int8_t innermost_type;
|
||||
TXNID xid_outermost_uncommitted;
|
||||
u_int8_t key_val_xrs[]; //Actual key,
|
||||
//then actual innermost inserted val,
|
||||
//then transaction records.
|
||||
} prov;
|
||||
} u;
|
||||
};
|
||||
#if TOKU_WINDOWS
|
||||
#pragma pack(pop)
|
||||
#endif
|
||||
|
||||
//Requires:
|
||||
// Leafentry that ule represents should not be destroyed (is not just all deletes)
|
||||
static size_t
|
||||
le_memsize_from_ule_12 (ULE ule) {
|
||||
uint32_t num_uxrs = ule->num_cuxrs + ule->num_puxrs;
|
||||
assert(num_uxrs);
|
||||
size_t rval;
|
||||
if (num_uxrs == 1) {
|
||||
assert(uxr_is_insert(&ule->uxrs[0]));
|
||||
rval = 1 //num_uxrs
|
||||
+4 //keylen
|
||||
+4 //vallen
|
||||
+ule->keylen //actual key
|
||||
+ule->uxrs[0].vallen; //actual val
|
||||
}
|
||||
else {
|
||||
rval = 1 //num_uxrs
|
||||
+4 //keylen
|
||||
+ule->keylen //actual key
|
||||
+1*num_uxrs //types
|
||||
+8*(num_uxrs-1); //txnids
|
||||
u_int8_t i;
|
||||
for (i = 0; i < num_uxrs; i++) {
|
||||
UXR uxr = &ule->uxrs[i];
|
||||
if (uxr_is_insert(uxr)) {
|
||||
rval += 4; //vallen
|
||||
rval += uxr->vallen; //actual val
|
||||
}
|
||||
}
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
//This function is mostly copied from 4.1.1
|
||||
// Note, number of transaction records in version 12 has been replaced by separate counters in version 13 (MVCC),
|
||||
// one counter for committed transaction records and one counter for provisional transaction records. When
|
||||
// upgrading a version 12 le to version 13, the number of committed transaction records is always set to one (1)
|
||||
// and the number of provisional transaction records is set to the original number of transaction records
|
||||
// minus one. The bottom transaction record is assumed to be a committed value. (If there is no committed
|
||||
// value then the bottom transaction record of version 12 is a committed delete.)
|
||||
// This is the only change from the 4.1.1 code. The rest of the leafentry is read as is.
|
||||
static void
|
||||
le_unpack_12(ULE ule, LEAFENTRY_12 le) {
|
||||
//Read num_uxrs
|
||||
uint8_t num_xrs = le->num_xrs;
|
||||
assert(num_xrs > 0);
|
||||
ule->uxrs = ule->uxrs_static; //Static version is always enough.
|
||||
ule->num_cuxrs = 1;
|
||||
ule->num_puxrs = num_xrs - 1;
|
||||
|
||||
//Read the keylen
|
||||
ule->keylen = toku_dtoh32(le->keylen);
|
||||
|
||||
//Read the vallen of innermost insert
|
||||
u_int32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen);
|
||||
|
||||
u_int8_t *p;
|
||||
if (num_xrs == 1) {
|
||||
//Unpack a 'committed leafentry' (No uncommitted transactions exist)
|
||||
ule->keyp = le->u.comm.key_val;
|
||||
ule->uxrs[0].type = XR_INSERT; //Must be or the leafentry would not exist
|
||||
ule->uxrs[0].vallen = vallen_of_innermost_insert;
|
||||
ule->uxrs[0].valp = &le->u.comm.key_val[ule->keylen];
|
||||
ule->uxrs[0].xid = 0; //Required.
|
||||
|
||||
//Set p to immediately after leafentry
|
||||
p = &le->u.comm.key_val[ule->keylen + vallen_of_innermost_insert];
|
||||
}
|
||||
else {
|
||||
//Unpack a 'provisional leafentry' (Uncommitted transactions exist)
|
||||
|
||||
//Read in type.
|
||||
u_int8_t innermost_type = le->u.prov.innermost_type;
|
||||
assert(!uxr_type_is_placeholder(innermost_type));
|
||||
|
||||
//Read in xid
|
||||
TXNID xid_outermost_uncommitted = toku_dtoh64(le->u.prov.xid_outermost_uncommitted);
|
||||
|
||||
//Read pointer to key
|
||||
ule->keyp = le->u.prov.key_val_xrs;
|
||||
|
||||
//Read pointer to innermost inserted val (immediately after key)
|
||||
u_int8_t *valp_of_innermost_insert = &le->u.prov.key_val_xrs[ule->keylen];
|
||||
|
||||
//Point p to immediately after 'header'
|
||||
p = &le->u.prov.key_val_xrs[ule->keylen + vallen_of_innermost_insert];
|
||||
|
||||
BOOL found_innermost_insert = FALSE;
|
||||
int i; //Index in ULE.uxrs[]
|
||||
//Loop inner to outer
|
||||
for (i = num_xrs - 1; i >= 0; i--) {
|
||||
UXR uxr = &ule->uxrs[i];
|
||||
|
||||
//Innermost's type is in header.
|
||||
if (i < num_xrs - 1) {
|
||||
//Not innermost, so load the type.
|
||||
uxr->type = *p;
|
||||
p += 1;
|
||||
}
|
||||
else {
|
||||
//Innermost, load the type previously read from header
|
||||
uxr->type = innermost_type;
|
||||
}
|
||||
|
||||
//Committed txn id is implicit (0). (i==0)
|
||||
//Outermost uncommitted txnid is stored in header. (i==1)
|
||||
if (i > 1) {
|
||||
//Not committed nor outermost uncommitted, so load the xid.
|
||||
uxr->xid = toku_dtoh64(*(TXNID*)p);
|
||||
p += 8;
|
||||
}
|
||||
else if (i == 1) {
|
||||
//Outermost uncommitted, load the xid previously read from header
|
||||
uxr->xid = xid_outermost_uncommitted;
|
||||
}
|
||||
else {
|
||||
// i == 0, committed entry
|
||||
uxr->xid = 0;
|
||||
}
|
||||
|
||||
if (uxr_is_insert(uxr)) {
|
||||
if (found_innermost_insert) {
|
||||
//Not the innermost insert. Load vallen/valp
|
||||
uxr->vallen = toku_dtoh32(*(u_int32_t*)p);
|
||||
p += 4;
|
||||
|
||||
uxr->valp = p;
|
||||
p += uxr->vallen;
|
||||
}
|
||||
else {
|
||||
//Innermost insert, load the vallen/valp previously read from header
|
||||
uxr->vallen = vallen_of_innermost_insert;
|
||||
uxr->valp = valp_of_innermost_insert;
|
||||
found_innermost_insert = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(found_innermost_insert);
|
||||
}
|
||||
#if ULE_DEBUG
|
||||
size_t memsize = le_memsize_from_ule_12(ule);
|
||||
assert(p == ((u_int8_t*)le) + memsize);
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t
|
||||
leafentry_disksize_12(LEAFENTRY_12 le) {
|
||||
ULE_S ule;
|
||||
le_unpack_12(&ule, le);
|
||||
size_t memsize = le_memsize_from_ule_12(&ule);
|
||||
ule_cleanup(&ule);
|
||||
return memsize;
|
||||
}
|
||||
|
||||
int
|
||||
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry,
|
||||
size_t *new_leafentry_memorysize,
|
||||
size_t *new_leafentry_disksize,
|
||||
LEAFENTRY *new_leafentry_p) {
|
||||
ULE_S ule;
|
||||
int rval;
|
||||
invariant(old_leafentry);
|
||||
le_unpack_12(&ule, old_leafentry);
|
||||
rval = le_pack(&ule, // create packed leafentry
|
||||
new_leafentry_memorysize,
|
||||
new_leafentry_disksize,
|
||||
new_leafentry_p,
|
||||
NULL, NULL, NULL); //NULL for omt means that we use malloc instead of mempool
|
||||
ule_cleanup(&ule);
|
||||
return rval;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
219
newbrt/upgrade_12_13_overview
Normal file
219
newbrt/upgrade_12_13_overview
Normal file
@@ -0,0 +1,219 @@
|
||||
The essential idea of auto-upgrade from BRT_LAYOUT_VERSION 12 to 13 is to
|
||||
take advantage of the similarities between the two versions, and not to
|
||||
try to create an infrastructure for all future upgrades.
|
||||
|
||||
As future layouts are created, upgrade paths, if any, will be crafted to
|
||||
each particular change.
|
||||
|
||||
On startup, the version number of the recovery log is checked. If an
|
||||
upgrade is needed, then the log is tested for a clean shutdown. If
|
||||
there is no clean shutdown, then an error is returned. If the log does
|
||||
end in a clean shutdown, then a new log file is created with the current
|
||||
version number, starting with an LSN that is one greater than the clean
|
||||
shutdown.
|
||||
|
||||
Once the new log is in place, the persistent environment dictionary is
|
||||
upgraded, and then normal operation begins.
|
||||
|
||||
The startup of a new version of the storage engine might not be crash
|
||||
safe.
|
||||
|
||||
Dictionaries, including the persistent environment and the fileops
|
||||
directory, are upgraded as they are read into memory from disk.
|
||||
|
||||
|
||||
The brt header is upgraded by
|
||||
- removing an unused flag
|
||||
- setting the transaction id to the xid of the clean shutdown
|
||||
- marking the header as dirty
|
||||
|
||||
Each non-leaf node is upgraded by:
|
||||
- removing an unused flag
|
||||
- upgrading the version numbers in the node
|
||||
- marking the node as dirty.
|
||||
This works because all of the version 12 messages are unchanged
|
||||
in version 13. The version 12 messages will be applied to the
|
||||
leafentries using version 13 code.
|
||||
|
||||
Each leaf node is upgraded by
|
||||
- removing an unused flag
|
||||
- using modified version 12 code to unpack the version 12 packed
|
||||
leaf entries into version 13 unpacked leaf entries
|
||||
- repacking the leafentries into a new mempool
|
||||
- destroying the original mempool (that holds the version 12
|
||||
node read from disk)
|
||||
The node is marked as dirty.
|
||||
|
||||
Once the brt is open, a BRT_OPTIMIZE broadcast message is inserted to
|
||||
optimize the dictionary.
|
||||
|
||||
|
||||
|
||||
A schematic overview of how a brt node is deserialized:
|
||||
|
||||
toku_deserialize_brtnode_from() { // accepts fd, fills in BRTNODE, brt_header
|
||||
|
||||
deserialize_brtnode_from_rbuf_versioned() {
|
||||
deserialize_brtnode_from_rbuf() // accepts rbuf fills in BRTNODE
|
||||
|
||||
if nonleaf deserialize_brtnode_nonleaf_from_rbuf(){ // rbuf -> BRTNODE (no version sensitivity)
|
||||
if leaf deserialize_brtnode_leaf_from_rbuf() { // calculates node size from leafentry sizes
|
||||
// leafentry sizes vary with version
|
||||
if version 12 {
|
||||
if leaf {
|
||||
unpack each leafentry into a version 13 ule
|
||||
pack each version 13 ule into version 13 le
|
||||
allocate new mempool for version 13 les
|
||||
destroy old mempool
|
||||
}
|
||||
remove unused flag
|
||||
increment version number
|
||||
mark dirty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
Open issues:
|
||||
- The brt layer makes some callbacks to the handlerton layer. If
|
||||
any of the functions change from one version to another, then
|
||||
the result may not be correct. A version number could be
|
||||
included in all the function signatures so the callback function
|
||||
could be aware of what version the caller is expecting.
|
||||
The callbacks are:
|
||||
- comparator
|
||||
- hot index generator
|
||||
- hot column mutator
|
||||
|
||||
|
||||
|
||||
Note, brt-internal.h defines struct subtree_estimates which contains field nkeys.
|
||||
This field is obsolete with the removal of dupsort databases (since it will always
|
||||
be the same as ndata), but removing it is not worth the trouble.
|
||||
|
||||
|
||||
|
||||
|
||||
==========
|
||||
|
||||
|
||||
|
||||
The changes from version 12 to 13 include (may not be complete list):
|
||||
- Persistent environment dictionary
|
||||
- version number
|
||||
- timestamp of environment creation (database installation)
|
||||
- history of previous versions
|
||||
- timestamps for upgrades
|
||||
- Recovery log
|
||||
- version number
|
||||
- new log entries (hotindex, maybe others)
|
||||
- brt header
|
||||
- version number
|
||||
- added field (root_xid_that_created), set to last checkpoint lsn
|
||||
- deleted flag (built-in comparison function for values)
|
||||
- brt internal node
|
||||
- version number
|
||||
- additional message(s) possible, no upgrade needed beyond changing version number
|
||||
- brt leafnode
|
||||
- version number
|
||||
- new leafentry format
|
||||
- version 12 leafentry unpack code is preserved
|
||||
- rollback log
|
||||
- version number is only change, no upgrade is needed because
|
||||
rollback logs are not preserved through clean shutdown
|
||||
|
||||
|
||||
Because version 12 and version 13 leafentries are significantly
|
||||
different, leafentries are handled as follows:
|
||||
- deserialize_brtnode_leaf_from_rbuf()
|
||||
- sets up array of pointers to leafentries (to be unpacked later),
|
||||
these pointers are put into an OMT
|
||||
- calculates checksum (x1764)
|
||||
- adjusts ndone byte counter to verify that entire rbuf is read
|
||||
- deserialize_brtnode_from_rbuf_versioned() calls
|
||||
deserialize_brtnode_leaf_from_rbuf()
|
||||
- loop through all leafentries, one at a time:
|
||||
- unpack version 12 le and repack as version 13 le, each in its own malloc'ed memory
|
||||
- calculate new fingerprint
|
||||
- create new block
|
||||
- allocate new mempool
|
||||
- copy individual les into new mempool
|
||||
- destroy individual les
|
||||
- destroy original mempool
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Open issues:
|
||||
|
||||
- We need to verify clean shutdown before upgrade.
|
||||
If shutdown was not clean then we would run recovery, and the
|
||||
code does not support recovering from an old format version.
|
||||
- One way to do this is to increase the log version number (either
|
||||
increment or synchronize with BRT_LAYOUT_VERSION).
|
||||
- Can we just look at the log? needs_recovery(env);
|
||||
If this mechanism is specific
|
||||
to the version 12 to 13 upgrade, then that is adequate.
|
||||
Once the recovery log format changes, then we need a
|
||||
different mechanism, similar to the 3.x->4.x upgrade
|
||||
logic in log_upgrade.c.
|
||||
|
||||
|
||||
- How to decide that an upgrade is necessary?
|
||||
Needed for logic that says:
|
||||
- If upgrade is necessary, then verify clean shutdown:
|
||||
If upgrade is necessary (recorded version is old)
|
||||
and clean shutdown was not done, then exit with
|
||||
error code.
|
||||
|
||||
- tokudb_needs_recovery() is not separate from verification of
|
||||
clean shutdown. This function indicates if a recovery is
|
||||
necessary, but it does not verify simple clean shutdown
|
||||
with just the shutdown log entry. Instead, it looks for
|
||||
checkpoint begin/checkpoint end. (Also, comment at end
|
||||
is permitted.)
|
||||
|
||||
|
||||
Proposed solution:
|
||||
- Decision on whether to perform upgrade is done by examining log version.
|
||||
- If we need an upgrade:
|
||||
- If not clean shutdown, then exit with error message, change nothing
|
||||
on disk.
|
||||
- If clean shutdown, then create new log by simply creating new log file
|
||||
(empty, or perhaps with initial comment that says "start of new log").
|
||||
- Normal log-trimming code will delete old logs. (None of the
|
||||
locking logic in log_upgrade.c is needed.)
|
||||
- Log-opening logic needs to be modified to do this. See log file
|
||||
manager initialization function (and maybe functions it calls),
|
||||
maybe the log cursor:
|
||||
- logfilemgr.c: toku_logfilemgr_init()
|
||||
- Log-trimming logic loops over pairs of file names and LSNs,
|
||||
deleting old files based on LSN.
|
||||
|
||||
- Question: would it help any if the "clean shutdown" log entry
|
||||
was required to be in a new log file of its own? It would
|
||||
prevent the creation of an empty log file after "clean shutdown."
|
||||
It might, but it's probably not worth doing.
|
||||
|
||||
|
||||
|
||||
Issue of optimize message (to be sent into each dictionary on upgrade)
|
||||
- BRT_COMMIT_BROADCAST_ALL (should be faster executing, always commits everything, was needed for an earlier upgrade attempt)
|
||||
- BRT_OPTIMIZE (better tested, has been used, tests to see if transactions are still live)
|
||||
After upgrade (after clean shutdown, no running transactions, trees
|
||||
fully flattened), there is no difference in what these two messages do.
|
||||
Note, BRT_OPTIMIZE requires a clean shutdown if used on upgrade. If used before recovery (which an upgrade
|
||||
without clean shutdown would do), then it would be wrong because it would appear that all transactions were
|
||||
completed.
|
||||
|
||||
|
||||
|
||||
TODO:
|
||||
- update brt header fields
|
||||
- original layout version
|
||||
- version read from disk
|
||||
- add accountability counters
|
||||
- capture LSN of clean shutdown, use instead of checkpoint lsn
|
||||
|
||||
@@ -182,9 +182,14 @@ xids_get_serialize_size(XIDS xids){
|
||||
return rval;
|
||||
}
|
||||
|
||||
|
||||
// Include TXNID zero in checksum to maintain compatibility
|
||||
// with previously released version.
|
||||
void
|
||||
toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids) {
|
||||
x1764_add(mm, &xids->num_xids, 1);
|
||||
TXNID zero = 0;
|
||||
x1764_add(mm, &zero, 8);
|
||||
u_int8_t index;
|
||||
u_int8_t num_xids = xids_get_num_xids(xids);
|
||||
for (index = 0; index < num_xids; index++) {
|
||||
|
||||
@@ -37,7 +37,6 @@ TRANSPARENT_UPGRADE_SRCS = $(wildcard upgrade-*.c)
|
||||
NONSTANDARD_SRCS= \
|
||||
$(RECOVER_SRCS) \
|
||||
$(LOADER_SRCS) \
|
||||
$(TRANSPARENT_UPGRADE_SRCS) \
|
||||
#end
|
||||
|
||||
#Tests that don't compile in windows. SHould
|
||||
@@ -179,6 +178,7 @@ BDB_DONTRUN_TESTS = \
|
||||
update-multiple-nochange \
|
||||
update-multiple-key0 \
|
||||
update-multiple-data-diagonal \
|
||||
upgrade_simple \
|
||||
upgrade-test-1 \
|
||||
upgrade-test-2 \
|
||||
upgrade-test-3 \
|
||||
|
||||
@@ -21,6 +21,7 @@ enum {ROWS_PER_TRANSACTION=10000};
|
||||
int NUM_DBS=5;
|
||||
int NUM_ROWS=100000;
|
||||
int CHECK_RESULTS=0;
|
||||
int littlenode = 0;
|
||||
enum { old_default_cachesize=1024 }; // MB
|
||||
int CACHESIZE=old_default_cachesize;
|
||||
int ALLOW_DUPS=0;
|
||||
@@ -112,7 +113,7 @@ static void run_test(void)
|
||||
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
|
||||
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
r = env->checkpointing_set_period(env, 60); CKERR(r);
|
||||
r = env->checkpointing_set_period(env, 0); CKERR(r);
|
||||
|
||||
DBT desc;
|
||||
dbt_init(&desc, "foo", sizeof("foo"));
|
||||
@@ -124,6 +125,10 @@ static void run_test(void)
|
||||
for(int i=0;i<NUM_DBS;i++) {
|
||||
idx[i] = i;
|
||||
r = db_create(&dbs[i], env, 0); CKERR(r);
|
||||
if (littlenode) {
    // -n was given: request 4096-byte pages for this dictionary.
    // Check the real return code; CKERR(0) would always pass and hide a failure.
    r = dbs[i]->set_pagesize(dbs[i], 4096);
    CKERR(r);
}
|
||||
r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r);
|
||||
dbs[i]->app_private = &idx[i];
|
||||
snprintf(name, sizeof(name), "db_%04x", i);
|
||||
@@ -176,7 +181,7 @@ static void do_args(int argc, char * const argv[]) {
|
||||
} else if (strcmp(argv[0], "-h")==0) {
|
||||
resultcode=0;
|
||||
do_usage:
|
||||
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd);
|
||||
fprintf(stderr, "Usage: -h -c -n -d <num_dbs> -r <num_rows> %s\n", cmd);
|
||||
exit(resultcode);
|
||||
} else if (strcmp(argv[0], "-d")==0) {
|
||||
argc--; argv++;
|
||||
@@ -191,6 +196,8 @@ static void do_args(int argc, char * const argv[]) {
|
||||
NUM_ROWS = atoi(argv[0]);
|
||||
} else if (strcmp(argv[0], "-c")==0) {
|
||||
CHECK_RESULTS = 1;
|
||||
} else if (strcmp(argv[0], "-n")==0) {
|
||||
littlenode = 1;
|
||||
} else {
|
||||
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
|
||||
resultcode=1;
|
||||
@@ -40,7 +40,7 @@ test_main (int argc, char *const argv[]) {
|
||||
dbt_init(&data, there, strlen(there)+1),
|
||||
0);
|
||||
r=txn->commit(txn, 0); CKERR(r);
|
||||
r=env->txn_checkpoint(env, 0, 0, 0);
|
||||
r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -48,7 +48,7 @@ test_main (int argc, char *const argv[]) {
|
||||
dbt_init(&data, there, strlen(there)+1),
|
||||
0);
|
||||
r=txn->commit(txn, 0); CKERR(r);
|
||||
r=env->txn_checkpoint(env, 0, 0, 0);
|
||||
r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -20,15 +20,17 @@ DB_ENV *env;
|
||||
enum {MAX_NAME=128};
|
||||
int NUM_DBS=5;
|
||||
int NUM_ROWS=100000;
|
||||
int CHECK_RESULTS=0;
|
||||
enum { old_default_cachesize=1024 }; // MB
|
||||
int CACHESIZE=old_default_cachesize;
|
||||
|
||||
char *db_v3_dir = "../../utils/preload-3.1-db";
|
||||
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
|
||||
int SRC_VERSION = 4;
|
||||
int littlenode = 0;
|
||||
int flat = 0;
|
||||
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
char *db_v5_dir = "dir.preload-db.c.tdb";
|
||||
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
|
||||
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
|
||||
char *db_v4_dir_flat = "env_preload.4.1.1.flat.cleanshutdown";
|
||||
|
||||
|
||||
|
||||
static void upgrade_test_1(DB **dbs) {
|
||||
int r;
|
||||
@@ -64,39 +66,52 @@ static void upgrade_test_1(DB **dbs) {
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
static void setup(void) {
|
||||
int r;
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
char * src_db_dir;
|
||||
|
||||
char *src_db_dir;
|
||||
if ( SRC_VERSION == 3 )
|
||||
src_db_dir = db_v3_dir;
|
||||
else if ( SRC_VERSION == 4 )
|
||||
src_db_dir = db_v4_dir;
|
||||
if ( SRC_VERSION == 4 ) {
|
||||
if (flat)
|
||||
src_db_dir = db_v4_dir_flat;
|
||||
else if (littlenode)
|
||||
src_db_dir = db_v4_dir_node4k;
|
||||
else
|
||||
src_db_dir = db_v4_dir;
|
||||
}
|
||||
else if ( SRC_VERSION == 5 ) {
|
||||
src_db_dir = db_v5_dir;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
|
||||
{
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
generate_permute_tables();
|
||||
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
int r;
|
||||
r = db_env_create(&env, 0); CKERR(r);
|
||||
if (littlenode) {
|
||||
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
|
||||
}
|
||||
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
|
||||
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
r = env->checkpointing_set_period(env, 60); CKERR(r);
|
||||
r = env->checkpointing_set_period(env, 1); CKERR(r);
|
||||
|
||||
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
|
||||
assert(dbs != NULL);
|
||||
@@ -117,7 +132,12 @@ static void do_args(int argc, char * const argv[]);
|
||||
|
||||
int test_main(int argc, char * const *argv) {
|
||||
do_args(argc, argv);
|
||||
run_test();
|
||||
if (SRC_VERSION == 4) {
|
||||
littlenode = 1; // 4k nodes, small cache
|
||||
}
|
||||
setup();
|
||||
run_test(); // read, upgrade, write back to disk
|
||||
run_test(); // read and verify
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -135,7 +155,7 @@ static void do_args(int argc, char * const argv[]) {
|
||||
} else if (strcmp(argv[0], "-h")==0) {
|
||||
resultcode=0;
|
||||
do_usage:
|
||||
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd);
|
||||
fprintf(stderr, "Usage: -h -d <num_dbs> -r <num_rows> %s\n", cmd);
|
||||
exit(resultcode);
|
||||
} else if (strcmp(argv[0], "-d")==0) {
|
||||
argc--; argv++;
|
||||
@@ -148,11 +168,11 @@ static void do_args(int argc, char * const argv[]) {
|
||||
} else if (strcmp(argv[0], "-r")==0) {
|
||||
argc--; argv++;
|
||||
NUM_ROWS = atoi(argv[0]);
|
||||
} else if (strcmp(argv[0], "-c")==0) {
|
||||
CHECK_RESULTS = 1;
|
||||
} else if (strcmp(argv[0], "-V")==0) {
|
||||
argc--; argv++;
|
||||
SRC_VERSION = atoi(argv[0]);
|
||||
} else if (strcmp(argv[0], "-f")==0) {
|
||||
flat = 1;
|
||||
} else {
|
||||
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
|
||||
resultcode=1;
|
||||
|
||||
@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
|
||||
int NUM_DBS=5;
|
||||
int NUM_ROWS=100000;
|
||||
int CHECK_RESULTS=0;
|
||||
enum { old_default_cachesize=1024 }; // MB
|
||||
int CACHESIZE=old_default_cachesize;
|
||||
|
||||
char *db_v3_dir = "../../utils/preload-3.1-db";
|
||||
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
|
||||
int SRC_VERSION = 4;
|
||||
int littlenode = 0;
|
||||
|
||||
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
char *db_v5_dir = "dir.preload-db.c.tdb";
|
||||
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
|
||||
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
|
||||
|
||||
|
||||
static void upgrade_test_2(DB **dbs) {
|
||||
int r = 0;
|
||||
@@ -85,39 +86,52 @@ static void upgrade_test_2(DB **dbs) {
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
int r;
|
||||
|
||||
char *src_db_dir;
|
||||
if ( SRC_VERSION == 3 )
|
||||
src_db_dir = db_v3_dir;
|
||||
else if ( SRC_VERSION == 4 )
|
||||
src_db_dir = db_v4_dir;
|
||||
static void setup(void) {
|
||||
int r;
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
char * src_db_dir;
|
||||
|
||||
if ( SRC_VERSION == 4 ) {
|
||||
if (littlenode)
|
||||
src_db_dir = db_v4_dir_node4k;
|
||||
else
|
||||
src_db_dir = db_v4_dir;
|
||||
}
|
||||
else if ( SRC_VERSION == 5 ) {
|
||||
src_db_dir = db_v5_dir;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
|
||||
{
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
generate_permute_tables();
|
||||
|
||||
}
|
||||
|
||||
static void run_test(int checkpoint_period)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = db_env_create(&env, 0); CKERR(r);
|
||||
if (littlenode) {
|
||||
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
|
||||
}
|
||||
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
|
||||
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
r = env->checkpointing_set_period(env, 60); CKERR(r);
|
||||
r = env->checkpointing_set_period(env, checkpoint_period); CKERR(r);
|
||||
|
||||
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
|
||||
assert(dbs != NULL);
|
||||
@@ -136,9 +150,15 @@ static void run_test(void)
|
||||
// ------------ infrastructure ----------
|
||||
static void do_args(int argc, char * const argv[]);
|
||||
|
||||
|
||||
|
||||
int test_main(int argc, char * const *argv) {
|
||||
do_args(argc, argv);
|
||||
run_test();
|
||||
if (SRC_VERSION == 4) {
|
||||
littlenode = 1; // 4k nodes, small cache
|
||||
}
|
||||
setup();
|
||||
run_test(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
|
||||
int NUM_DBS=5;
|
||||
int NUM_ROWS=100000;
|
||||
int CHECK_RESULTS=0;
|
||||
enum { old_default_cachesize=1024 }; // MB
|
||||
int CACHESIZE=old_default_cachesize;
|
||||
|
||||
char *db_v3_dir = "../../utils/preload-3.1-db";
|
||||
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
|
||||
int SRC_VERSION = 4;
|
||||
int littlenode = 0;
|
||||
|
||||
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
char *db_v5_dir = "dir.preload-db.c.tdb";
|
||||
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
|
||||
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
|
||||
|
||||
|
||||
static void upgrade_test_3(DB **dbs) {
|
||||
int r;
|
||||
@@ -87,35 +88,47 @@ static void upgrade_test_3(DB **dbs) {
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
static void setup(void) {
|
||||
int r;
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
char * src_db_dir;
|
||||
|
||||
char *src_db_dir;
|
||||
if ( SRC_VERSION == 3 )
|
||||
src_db_dir = db_v3_dir;
|
||||
else if ( SRC_VERSION == 4 )
|
||||
src_db_dir = db_v4_dir;
|
||||
if ( SRC_VERSION == 4 ) {
|
||||
if (littlenode)
|
||||
src_db_dir = db_v4_dir_node4k;
|
||||
else
|
||||
src_db_dir = db_v4_dir;
|
||||
}
|
||||
else if ( SRC_VERSION == 5 ) {
|
||||
src_db_dir = db_v5_dir;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
|
||||
{
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
generate_permute_tables();
|
||||
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = db_env_create(&env, 0); CKERR(r);
|
||||
if (littlenode) {
|
||||
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
|
||||
}
|
||||
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
|
||||
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
@@ -140,7 +153,16 @@ static void do_args(int argc, char * const argv[]);
|
||||
|
||||
int test_main(int argc, char * const *argv) {
|
||||
do_args(argc, argv);
|
||||
littlenode = 0;
|
||||
setup();
|
||||
run_test();
|
||||
if (SRC_VERSION == 4) {
|
||||
if (verbose)
|
||||
printf("Now repeat test with small nodes and small cache.\n");
|
||||
littlenode = 1; // 4k nodes, small cache
|
||||
setup();
|
||||
run_test();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,15 +19,18 @@ enum {MAX_NAME=128};
|
||||
int NUM_DBS=5;
|
||||
int NUM_ROWS=100000;
|
||||
int CHECK_RESULTS=0;
|
||||
enum { old_default_cachesize=1024 }; // MB
|
||||
int CACHESIZE=old_default_cachesize;
|
||||
int SRC_VERSION = 4;
|
||||
int littlenode = 0;
|
||||
|
||||
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
char *db_v5_dir = "dir.preload-db.c.tdb";
|
||||
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
|
||||
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
|
||||
|
||||
|
||||
enum {ROWS_PER_TRANSACTION=10000};
|
||||
|
||||
char *db_v3_dir = "../../utils/preload-3.1-db";
|
||||
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
|
||||
char *env_dir = ENVDIR; // the default env_dir.
|
||||
|
||||
int SRC_VERSION = 4;
|
||||
|
||||
static void upgrade_test_4(DB **dbs) {
|
||||
int r;
|
||||
@@ -122,35 +125,47 @@ static void upgrade_test_4(DB **dbs) {
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
static void setup(void) {
|
||||
int r;
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
char * src_db_dir;
|
||||
|
||||
char *src_db_dir;
|
||||
if ( SRC_VERSION == 3 )
|
||||
src_db_dir = db_v3_dir;
|
||||
else if ( SRC_VERSION == 4 )
|
||||
src_db_dir = db_v4_dir;
|
||||
if ( SRC_VERSION == 4 ) {
|
||||
if (littlenode)
|
||||
src_db_dir = db_v4_dir_node4k;
|
||||
else
|
||||
src_db_dir = db_v4_dir;
|
||||
}
|
||||
else if ( SRC_VERSION == 5 ) {
|
||||
src_db_dir = db_v5_dir;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
|
||||
{
|
||||
int len = 256;
|
||||
char syscmd[len];
|
||||
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd); CKERR(r);
|
||||
}
|
||||
|
||||
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
|
||||
assert(r<len);
|
||||
r = system(syscmd);
|
||||
CKERR(r);
|
||||
generate_permute_tables();
|
||||
|
||||
}
|
||||
|
||||
static void run_test(void)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = db_env_create(&env, 0); CKERR(r);
|
||||
if (littlenode) {
|
||||
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
|
||||
}
|
||||
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
|
||||
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
@@ -175,7 +190,17 @@ static void do_args(int argc, char * const argv[]);
|
||||
|
||||
int test_main(int argc, char * const *argv) {
|
||||
do_args(argc, argv);
|
||||
do_args(argc, argv);
|
||||
littlenode = 0;
|
||||
setup();
|
||||
run_test();
|
||||
if (SRC_VERSION == 4) {
|
||||
if (verbose)
|
||||
printf("Now repeat test with small nodes and small cache.\n");
|
||||
littlenode = 1; // 4k nodes, small cache
|
||||
setup();
|
||||
run_test();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
88
src/tests/upgrade_simple.c
Normal file
88
src/tests/upgrade_simple.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/* -*- mode: C; c-basic-offset: 4 -*- */
|
||||
|
||||
#ident "Copyright (c) 2009 Tokutek Inc. All rights reserved."
|
||||
#ident "$Id: env_startup.c 20778 2010-05-28 20:38:42Z yfogel $"
|
||||
|
||||
/* Purpose of this test is to verify simplest part of upgrade logic.
|
||||
* Start by creating two very simple 4.x environments,
|
||||
* one in each of two states:
|
||||
* - after a clean shutdown
|
||||
* - without a clean shutdown
|
||||
*
|
||||
* The two different environments will be used to exercise upgrade logic
|
||||
* for 5.x.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "test.h"
|
||||
#include <db.h>
|
||||
|
||||
static DB_ENV *env;
|
||||
|
||||
// Environment open flags.  The expansions are parenthesized so the macros
// behave as a single value wherever they are used (e.g. FLAGS_LOG & x).
#define FLAGS_NOLOG (DB_INIT_LOCK|DB_INIT_MPOOL|DB_CREATE|DB_PRIVATE)
#define FLAGS_LOG   (FLAGS_NOLOG|DB_INIT_TXN|DB_INIT_LOG)
|
||||
|
||||
static int mode = S_IRWXU+S_IRWXG+S_IRWXO;
|
||||
|
||||
static void test_shutdown(void);
|
||||
|
||||
static void
|
||||
setup (u_int32_t flags, BOOL clean) {
|
||||
int r;
|
||||
if (env)
|
||||
test_shutdown();
|
||||
r = system("rm -rf " ENVDIR);
|
||||
CKERR(r);
|
||||
r=toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
|
||||
CKERR(r);
|
||||
if (clean) {
|
||||
r = system("cp env_simple.4.1.1.cleanshutdown/* " ENVDIR);
|
||||
}
|
||||
else {
|
||||
r = system("cp env_simple.4.1.1.dirtyshutdown/* " ENVDIR);
|
||||
}
|
||||
CKERR(r);
|
||||
r=db_env_create(&env, 0);
|
||||
CKERR(r);
|
||||
env->set_errfile(env, stderr);
|
||||
r=env->open(env, ENVDIR, flags, mode);
|
||||
if (clean)
|
||||
CKERR(r);
|
||||
else
|
||||
CKERR2(r, TOKUDB_UPGRADE_FAILURE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
test_shutdown(void) {
|
||||
int r;
|
||||
r=env->close(env, 0); CKERR(r);
|
||||
env = NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_env_startup(void) {
|
||||
u_int32_t flags;
|
||||
|
||||
flags = FLAGS_LOG;
|
||||
setup(flags, TRUE);
|
||||
print_engine_status(env);
|
||||
test_shutdown();
|
||||
setup(flags, FALSE);
|
||||
if (verbose) {
|
||||
printf("\n\nEngine status after aborted env->open() will have some garbage values:\n");
|
||||
}
|
||||
print_engine_status(env);
|
||||
test_shutdown();
|
||||
}
|
||||
|
||||
|
||||
// Test entry point: parse the command-line arguments with parse_args(), then
// run the environment start-up checks.  Always returns 0.
int
test_main (int argc, char * const argv[]) {
    parse_args(argc, argv);
    test_env_startup();
    return 0;
}
|
||||
156
src/ydb.c
156
src/ydb.c
@@ -423,37 +423,116 @@ db_use_builtin_key_cmp(DB *db) {
|
||||
return r;
|
||||
}
|
||||
|
||||
static const char * curr_env_ver_key = "current_version";
|
||||
// Keys used in persistent environment dictionary:
|
||||
// Following keys added in version 12
|
||||
static const char * orig_env_ver_key = "original_version";
|
||||
static const char * curr_env_ver_key = "current_version";
|
||||
// Following keys added in version 13
|
||||
static const char * creation_time_key = "creation_time";
|
||||
static const char * last_lsn_of_v12_key = "last_lsn_of_v12";
|
||||
static const char * upgrade_13_time_key = "upgrade_13_time"; // Add more keys for future upgrades
|
||||
|
||||
// Values read from (or written into) persistent environment,
|
||||
// kept here for read-only access from engine status.
|
||||
static uint32_t persistent_original_env_version;
|
||||
static uint32_t persistent_stored_env_version_at_startup; // read from curr_env_ver_key, prev version as of this startup
|
||||
static time_t persistent_creation_time;
|
||||
static uint64_t persistent_last_lsn_of_v12;
|
||||
static time_t persistent_upgrade_13_time;
|
||||
|
||||
// requires: persistent environment dictionary is already open
|
||||
// Requires: persistent environment dictionary is already open.
|
||||
// Input arg is lsn of clean shutdown of previous version,
|
||||
// or ZERO_LSN if no upgrade or if crash between log upgrade and here.
|
||||
static int
|
||||
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn) {
|
||||
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn, LSN last_lsn_of_clean_shutdown_read_from_log) {
|
||||
int r;
|
||||
uint32_t stored_env_version;
|
||||
DBT key, val;
|
||||
DB *persistent_environment = env->i->persistent_environment;
|
||||
|
||||
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(env->i->persistent_environment, txn, &key, &val, 0);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
|
||||
uint32_t stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
|
||||
persistent_stored_env_version_at_startup = stored_env_version;
|
||||
if (stored_env_version > BRT_LAYOUT_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_NEW;
|
||||
else if (stored_env_version < BRT_LAYOUT_MIN_SUPPORTED_VERSION)
|
||||
r = TOKUDB_DICTIONARY_TOO_OLD;
|
||||
else if (stored_env_version < BRT_LAYOUT_VERSION) {
|
||||
const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION);
|
||||
const uint32_t curr_env_ver_d = toku_htod32(BRT_LAYOUT_VERSION);
|
||||
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
|
||||
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
|
||||
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
|
||||
toku_fill_dbt(&val, &curr_env_ver_d, sizeof(curr_env_ver_d));
|
||||
r = toku_db_put(persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
|
||||
assert(r==0);
|
||||
|
||||
uint64_t last_lsn_of_v12_d = toku_htod64(last_lsn_of_clean_shutdown_read_from_log.lsn);
|
||||
toku_fill_dbt(&key, last_lsn_of_v12_key, strlen(last_lsn_of_v12_key));
|
||||
toku_fill_dbt(&val, &last_lsn_of_v12_d, sizeof(last_lsn_of_v12_d));
|
||||
r = toku_db_put(persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
|
||||
assert(r==0);
|
||||
|
||||
time_t upgrade_13_time_d = toku_htod64(time(NULL));
|
||||
toku_fill_dbt(&key, upgrade_13_time_key, strlen(upgrade_13_time_key));
|
||||
toku_fill_dbt(&val, &upgrade_13_time_d, sizeof(upgrade_13_time_d));
|
||||
r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE);
|
||||
assert(r==0);
|
||||
}
|
||||
// TODO: add key/val for timestamp of VERSION_12_CREATION (could be upgrade)
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
// Capture persistent env contents to be read by engine status
|
||||
static void
|
||||
capture_persistent_env (DB_ENV * env, DB_TXN * txn) {
|
||||
int r;
|
||||
DBT key, val;
|
||||
DB *persistent_environment = env->i->persistent_environment;
|
||||
|
||||
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
uint32_t curr_env_version = toku_dtoh32(*(uint32_t*)val.data);
|
||||
assert(curr_env_version == BRT_LAYOUT_VERSION);
|
||||
|
||||
toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
persistent_original_env_version = toku_dtoh32(*(uint32_t*)val.data);
|
||||
assert(persistent_original_env_version <= curr_env_version);
|
||||
|
||||
// make no assertions about timestamps, clock may have been reset
|
||||
if (persistent_original_env_version >= BRT_LAYOUT_VERSION_13) {
|
||||
toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
persistent_creation_time = toku_dtoh64((*(time_t*)val.data));
|
||||
}
|
||||
|
||||
if (persistent_original_env_version != curr_env_version) {
|
||||
// an upgrade was performed at some time, capture info about the upgrade
|
||||
|
||||
toku_fill_dbt(&key, last_lsn_of_v12_key, strlen(last_lsn_of_v12_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
persistent_last_lsn_of_v12 = toku_dtoh64(*(uint32_t*)val.data);
|
||||
|
||||
toku_fill_dbt(&key, upgrade_13_time_key, strlen(upgrade_13_time_key));
|
||||
toku_init_dbt(&val);
|
||||
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
|
||||
assert(r == 0);
|
||||
persistent_upgrade_13_time = toku_dtoh64((*(time_t*)val.data));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// return 0 if log exists or ENOENT if log does not exist
|
||||
static int
|
||||
ydb_recover_log_exists(DB_ENV *env) {
|
||||
@@ -492,7 +571,7 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
|
||||
assert(r);
|
||||
}
|
||||
|
||||
// Test for rollback cachefile
|
||||
// Test for existence of rollback cachefile if it is expected to exist
|
||||
if (r == 0 && need_rollback_cachefile) {
|
||||
path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
|
||||
assert(path);
|
||||
@@ -558,11 +637,11 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
|
||||
}
|
||||
|
||||
static int
|
||||
ydb_maybe_upgrade_env (DB_ENV *env) {
|
||||
ydb_maybe_upgrade_env (DB_ENV *env, LSN * last_lsn_of_clean_shutdown_read_from_log, BOOL * upgrade_in_progress) {
|
||||
int r = 0;
|
||||
if (env->i->open_flags & DB_INIT_TXN && env->i->open_flags & DB_INIT_LOG) {
|
||||
toku_ydb_unlock();
|
||||
r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir);
|
||||
r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir, last_lsn_of_clean_shutdown_read_from_log, upgrade_in_progress);
|
||||
toku_ydb_lock();
|
||||
}
|
||||
return r;
|
||||
@@ -598,6 +677,8 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
assert(sizeof(time_t) == sizeof(uint64_t));
|
||||
|
||||
HANDLE_EXTRA_FLAGS(env, flags,
|
||||
DB_CREATE|DB_PRIVATE|DB_INIT_LOG|DB_INIT_TXN|DB_RECOVER|DB_INIT_MPOOL|DB_INIT_LOCK|DB_THREAD);
|
||||
|
||||
@@ -678,9 +759,22 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
|
||||
need_rollback_cachefile = TRUE;
|
||||
}
|
||||
|
||||
r = ydb_maybe_upgrade_env(env);
|
||||
LSN last_lsn_of_clean_shutdown_read_from_log = ZERO_LSN;
|
||||
BOOL upgrade_in_progress = FALSE;
|
||||
r = ydb_maybe_upgrade_env(env, &last_lsn_of_clean_shutdown_read_from_log, &upgrade_in_progress);
|
||||
if (r!=0) goto cleanup;
|
||||
|
||||
if (upgrade_in_progress) {
|
||||
// Delete old rollback file. There was a clean shutdown, so it has nothing useful,
|
||||
// and there is no value in upgrading it. It is simpler to just create a new one.
|
||||
char* rollback_filename = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
|
||||
assert(rollback_filename);
|
||||
r = unlink(rollback_filename);
|
||||
toku_free(rollback_filename);
|
||||
assert(r==0 || errno==ENOENT);
|
||||
need_rollback_cachefile = FALSE; // we're not expecting it to exist now
|
||||
}
|
||||
|
||||
r = validate_env(env, &newenv, need_rollback_cachefile); // make sure that environment is either new or complete
|
||||
if (r != 0) goto cleanup;
|
||||
|
||||
@@ -743,10 +837,12 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
|
||||
|
||||
int using_txns = env->i->open_flags & DB_INIT_TXN;
|
||||
if (env->i->logger) {
|
||||
// if this is a newborn env or if this is an upgrade, then create a brand new rollback file
|
||||
BOOL create_new_rollback_file = newenv | upgrade_in_progress;
|
||||
assert (using_txns);
|
||||
toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
|
||||
toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm);
|
||||
r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, newenv);
|
||||
r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, create_new_rollback_file);
|
||||
assert(r==0);
|
||||
}
|
||||
|
||||
@@ -766,20 +862,30 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
|
||||
if (newenv) {
|
||||
// create new persistent_environment
|
||||
DBT key, val;
|
||||
const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION);
|
||||
persistent_original_env_version = BRT_LAYOUT_VERSION;
|
||||
const uint32_t environment_version = toku_htod32(persistent_original_env_version);
|
||||
|
||||
toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
|
||||
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
|
||||
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
|
||||
assert(r==0);
|
||||
|
||||
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
|
||||
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
|
||||
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
|
||||
assert(r==0);
|
||||
}
|
||||
else {
|
||||
r = maybe_upgrade_persistent_environment_dictionary(env, txn);
|
||||
|
||||
time_t creation_time_d = toku_htod64(time(NULL));
|
||||
toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
|
||||
toku_fill_dbt(&val, &creation_time_d, sizeof(creation_time_d));
|
||||
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
|
||||
assert(r==0);
|
||||
}
|
||||
else {
|
||||
r = maybe_upgrade_persistent_environment_dictionary(env, txn, last_lsn_of_clean_shutdown_read_from_log);
|
||||
assert(r==0);
|
||||
}
|
||||
capture_persistent_env(env, txn);
|
||||
}
|
||||
{
|
||||
r = toku_db_create(&env->i->directory, env, 0);
|
||||
@@ -805,6 +911,8 @@ cleanup:
|
||||
unlock_single_process(env);
|
||||
}
|
||||
}
|
||||
if (r == 0)
|
||||
errno = 0; // tabula rasa
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -1509,8 +1617,8 @@ format_time(const time_t *timer, char *buf) {
|
||||
}
|
||||
}
|
||||
|
||||
// Do not take ydb lock around or in this function.
|
||||
// If the engine is blocked because some thread is holding the ydb lock, this function
|
||||
// Do not take ydb lock or any other lock around or in this function.
|
||||
// If the engine is blocked because some thread is holding a lock, this function
|
||||
// can help diagnose the problem.
|
||||
// This function only collects information, and it does not matter if something gets garbled
|
||||
// because of a race condition.
|
||||
@@ -1671,9 +1779,9 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat) {
|
||||
toku_brt_get_upgrade_status(&brt_upgrade_stat);
|
||||
|
||||
engstat->upgrade_env_status = toku_log_upgrade_get_footprint();
|
||||
engstat->upgrade_header = brt_upgrade_stat.header;
|
||||
engstat->upgrade_nonleaf = brt_upgrade_stat.nonleaf;
|
||||
engstat->upgrade_leaf = brt_upgrade_stat.leaf;
|
||||
engstat->upgrade_header = brt_upgrade_stat.header_12;
|
||||
engstat->upgrade_nonleaf = brt_upgrade_stat.nonleaf_12;
|
||||
engstat->upgrade_leaf = brt_upgrade_stat.leaf_12;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
|
||||
Reference in New Issue
Block a user