Mirror of https://github.com/MariaDB/server.git
Sergei Golubchik 2014-09-23 23:37:35 +02:00
commit 53a44915c5
1171 changed files with 27745 additions and 36109 deletions


@ -1148,289 +1148,24 @@ extra/jemalloc/jemalloc-*
extra/jemalloc/build
*.tdb
storage/tokudb/ft-index/CTestCustom.cmake
storage/tokudb/ft-index/DartConfiguration.tcl
storage/tokudb/ft-index/ctags-stamp
storage/tokudb/ft-index/valgrind.suppressions
storage/tokudb/ft-index/xz
storage/tokudb/ft-index/buildheader/db.h
storage/tokudb/ft-index/buildheader/make_tdb
storage/tokudb/ft-index/buildheader/runcat.sh
storage/tokudb/ft-index/ft/ftverify
storage/tokudb/ft-index/ft/log_code.cc
storage/tokudb/ft-index/ft/log_header.h
storage/tokudb/ft-index/ft/log_print.cc
storage/tokudb/ft-index/ft/logformat
storage/tokudb/ft-index/ft/tdb-recover
storage/tokudb/ft-index/ft/tdb_logprint
storage/tokudb/ft-index/ft/tokuftdump
storage/tokudb/ft-index/ft/tests/benchmark-test
storage/tokudb/ft-index/ft/tests/block_allocator_test
storage/tokudb/ft-index/ft/tests/bnc-insert-benchmark
storage/tokudb/ft-index/ft/tests/cachetable-4357
storage/tokudb/ft-index/ft/tests/cachetable-4365
storage/tokudb/ft-index/ft/tests/cachetable-5097
storage/tokudb/ft-index/ft/tests/cachetable-5978
storage/tokudb/ft-index/ft/tests/cachetable-5978-2
storage/tokudb/ft-index/ft/tests/cachetable-all-write
storage/tokudb/ft-index/ft/tests/cachetable-checkpoint-pending
storage/tokudb/ft-index/ft/tests/cachetable-checkpoint-pinned-nodes
storage/tokudb/ft-index/ft/tests/cachetable-checkpoint-prefetched-nodes
storage/tokudb/ft-index/ft/tests/cachetable-checkpoint-test
storage/tokudb/ft-index/ft/tests/cachetable-checkpointer-class
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-checkpoint2
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-attrs-accumulate
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-empty-cachetable
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-everything-pinned
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-nothing-needs-flushing
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-same-fullhash
storage/tokudb/ft-index/ft/tests/cachetable-cleaner-thread-simple
storage/tokudb/ft-index/ft/tests/cachetable-clock-all-pinned
storage/tokudb/ft-index/ft/tests/cachetable-clock-eviction
storage/tokudb/ft-index/ft/tests/cachetable-clock-eviction2
storage/tokudb/ft-index/ft/tests/cachetable-clock-eviction3
storage/tokudb/ft-index/ft/tests/cachetable-clock-eviction4
storage/tokudb/ft-index/ft/tests/cachetable-clone-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-clone-partial-fetch
storage/tokudb/ft-index/ft/tests/cachetable-clone-partial-fetch-pinned-node
storage/tokudb/ft-index/ft/tests/cachetable-clone-pin-nonblocking
storage/tokudb/ft-index/ft/tests/cachetable-clone-unpin-remove
storage/tokudb/ft-index/ft/tests/cachetable-count-pinned-test
storage/tokudb/ft-index/ft/tests/cachetable-debug-test
storage/tokudb/ft-index/ft/tests/cachetable-eviction-close-test
storage/tokudb/ft-index/ft/tests/cachetable-eviction-close-test2
storage/tokudb/ft-index/ft/tests/cachetable-eviction-getandpin-test
storage/tokudb/ft-index/ft/tests/cachetable-eviction-getandpin-test2
storage/tokudb/ft-index/ft/tests/cachetable-evictor-class
storage/tokudb/ft-index/ft/tests/cachetable-fd-test
storage/tokudb/ft-index/ft/tests/cachetable-fetch-inducing-evictor
storage/tokudb/ft-index/ft/tests/cachetable-flush-during-cleaner
storage/tokudb/ft-index/ft/tests/cachetable-flush-test
storage/tokudb/ft-index/ft/tests/cachetable-getandpin-test
storage/tokudb/ft-index/ft/tests/cachetable-kibbutz_and_flush_cachefile
storage/tokudb/ft-index/ft/tests/cachetable-partial-fetch
storage/tokudb/ft-index/ft/tests/cachetable-pin-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-pin-nonblocking-checkpoint-clean
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-checkpoint-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-close-leak-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-close-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-flowcontrol-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-getandpin-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch-maybegetandpin-test
storage/tokudb/ft-index/ft/tests/cachetable-prefetch2-test
storage/tokudb/ft-index/ft/tests/cachetable-put-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-put-test
storage/tokudb/ft-index/ft/tests/cachetable-rwlock-test
storage/tokudb/ft-index/ft/tests/cachetable-simple-clone
storage/tokudb/ft-index/ft/tests/cachetable-simple-clone2
storage/tokudb/ft-index/ft/tests/cachetable-simple-maybe-get-pin
storage/tokudb/ft-index/ft/tests/cachetable-simple-pin
storage/tokudb/ft-index/ft/tests/cachetable-simple-pin-cheap
storage/tokudb/ft-index/ft/tests/cachetable-simple-pin-dep-nodes
storage/tokudb/ft-index/ft/tests/cachetable-simple-pin-nonblocking
storage/tokudb/ft-index/ft/tests/cachetable-simple-pin-nonblocking-cheap
storage/tokudb/ft-index/ft/tests/cachetable-simple-put-dep-nodes
storage/tokudb/ft-index/ft/tests/cachetable-simple-read-pin
storage/tokudb/ft-index/ft/tests/cachetable-simple-read-pin-nonblocking
storage/tokudb/ft-index/ft/tests/cachetable-simple-unpin-remove-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-simple-verify
storage/tokudb/ft-index/ft/tests/cachetable-test
storage/tokudb/ft-index/ft/tests/cachetable-unpin-and-remove-test
storage/tokudb/ft-index/ft/tests/cachetable-unpin-remove-and-checkpoint
storage/tokudb/ft-index/ft/tests/cachetable-unpin-test
storage/tokudb/ft-index/ft/tests/cachetable-writer-thread-limit
storage/tokudb/ft-index/ft/tests/comparator-test
storage/tokudb/ft-index/ft/tests/compress-test
storage/tokudb/ft-index/ft/tests/dbufio-test
storage/tokudb/ft-index/ft/tests/dbufio-test-destroy
storage/tokudb/ft-index/ft/tests/fifo-test
storage/tokudb/ft-index/ft/tests/ft-bfe-query
storage/tokudb/ft-index/ft/tests/ft-clock-test
storage/tokudb/ft-index/ft/tests/ft-serialize-benchmark
storage/tokudb/ft-index/ft/tests/ft-serialize-sub-block-test
storage/tokudb/ft-index/ft/tests/ft-serialize-test
storage/tokudb/ft-index/ft/tests/ft-test
storage/tokudb/ft-index/ft/tests/ft-test-cursor
storage/tokudb/ft-index/ft/tests/ft-test-cursor-2
storage/tokudb/ft-index/ft/tests/ft-test-header
storage/tokudb/ft-index/ft/tests/ft-test0
storage/tokudb/ft-index/ft/tests/ft-test1
storage/tokudb/ft-index/ft/tests/ft-test2
storage/tokudb/ft-index/ft/tests/ft-test3
storage/tokudb/ft-index/ft/tests/ft-test4
storage/tokudb/ft-index/ft/tests/ft-test5
storage/tokudb/ft-index/ft/tests/ftloader-test
storage/tokudb/ft-index/ft/tests/ftloader-test-bad-generate
storage/tokudb/ft-index/ft/tests/ftloader-test-extractor
storage/tokudb/ft-index/ft/tests/ftloader-test-extractor-errors
storage/tokudb/ft-index/ft/tests/ftloader-test-merge-files-dbufio
storage/tokudb/ft-index/ft/tests/ftloader-test-open
storage/tokudb/ft-index/ft/tests/ftloader-test-vm
storage/tokudb/ft-index/ft/tests/ftloader-test-writer
storage/tokudb/ft-index/ft/tests/ftloader-test-writer-errors
storage/tokudb/ft-index/ft/tests/is_empty
storage/tokudb/ft-index/ft/tests/keyrange
storage/tokudb/ft-index/ft/tests/keytest
storage/tokudb/ft-index/ft/tests/le-cursor-provdel
storage/tokudb/ft-index/ft/tests/le-cursor-right
storage/tokudb/ft-index/ft/tests/le-cursor-walk
storage/tokudb/ft-index/ft/tests/list-test
storage/tokudb/ft-index/ft/tests/log-test
storage/tokudb/ft-index/ft/tests/log-test-maybe-trim
storage/tokudb/ft-index/ft/tests/log-test2
storage/tokudb/ft-index/ft/tests/log-test3
storage/tokudb/ft-index/ft/tests/log-test4
storage/tokudb/ft-index/ft/tests/log-test5
storage/tokudb/ft-index/ft/tests/log-test6
storage/tokudb/ft-index/ft/tests/log-test7
storage/tokudb/ft-index/ft/tests/logcursor-bad-checksum
storage/tokudb/ft-index/ft/tests/logcursor-empty-logdir
storage/tokudb/ft-index/ft/tests/logcursor-empty-logfile
storage/tokudb/ft-index/ft/tests/logcursor-empty-logfile-2
storage/tokudb/ft-index/ft/tests/logcursor-empty-logfile-3
storage/tokudb/ft-index/ft/tests/logcursor-print
storage/tokudb/ft-index/ft/tests/logcursor-timestamp
storage/tokudb/ft-index/ft/tests/logfilemgr-create-destroy
storage/tokudb/ft-index/ft/tests/logfilemgr-print
storage/tokudb/ft-index/ft/tests/make-tree
storage/tokudb/ft-index/ft/tests/minicron-test
storage/tokudb/ft-index/ft/tests/msnfilter
storage/tokudb/ft-index/ft/tests/omt-test
storage/tokudb/ft-index/ft/tests/orthopush-flush
storage/tokudb/ft-index/ft/tests/pqueue-test
storage/tokudb/ft-index/ft/tests/queue-test
storage/tokudb/ft-index/ft/tests/quicklz-test
storage/tokudb/ft-index/ft/tests/recovery-bad-last-entry
storage/tokudb/ft-index/ft/tests/recovery-cbegin
storage/tokudb/ft-index/ft/tests/recovery-cbegin-cend
storage/tokudb/ft-index/ft/tests/recovery-cbegin-cend-hello
storage/tokudb/ft-index/ft/tests/recovery-cend-cbegin
storage/tokudb/ft-index/ft/tests/recovery-datadir-is-file
storage/tokudb/ft-index/ft/tests/recovery-empty
storage/tokudb/ft-index/ft/tests/recovery-fopen-missing-file
storage/tokudb/ft-index/ft/tests/recovery-hello
storage/tokudb/ft-index/ft/tests/recovery-lsn-error-during-forward-scan
storage/tokudb/ft-index/ft/tests/recovery-no-datadir
storage/tokudb/ft-index/ft/tests/recovery-no-log
storage/tokudb/ft-index/ft/tests/recovery-no-logdir
storage/tokudb/ft-index/ft/tests/recovery-test5123
storage/tokudb/ft-index/ft/tests/shortcut
storage/tokudb/ft-index/ft/tests/subblock-test-checksum
storage/tokudb/ft-index/ft/tests/subblock-test-compression
storage/tokudb/ft-index/ft/tests/subblock-test-index
storage/tokudb/ft-index/ft/tests/subblock-test-size
storage/tokudb/ft-index/ft/tests/test-assert
storage/tokudb/ft-index/ft/tests/test-bjm
storage/tokudb/ft-index/ft/tests/test-checkpoint-during-flush
storage/tokudb/ft-index/ft/tests/test-checkpoint-during-merge
storage/tokudb/ft-index/ft/tests/test-checkpoint-during-rebalance
storage/tokudb/ft-index/ft/tests/test-checkpoint-during-split
storage/tokudb/ft-index/ft/tests/test-del-inorder
storage/tokudb/ft-index/ft/tests/test-dirty-flushes-on-cleaner
storage/tokudb/ft-index/ft/tests/test-dump-ft
storage/tokudb/ft-index/ft/tests/test-flushes-on-cleaner
storage/tokudb/ft-index/ft/tests/test-ft-overflow
storage/tokudb/ft-index/ft/tests/test-hot-with-bounds
storage/tokudb/ft-index/ft/tests/test-inc-split
storage/tokudb/ft-index/ft/tests/test-leafentry-child-txn
storage/tokudb/ft-index/ft/tests/test-leafentry-nested
storage/tokudb/ft-index/ft/tests/test-merges-on-cleaner
storage/tokudb/ft-index/ft/tests/test-oldest-referenced-xid-flush
storage/tokudb/ft-index/ft/tests/test-pick-child-to-flush
storage/tokudb/ft-index/ft/tests/test-txn-child-manager
storage/tokudb/ft-index/ft/tests/test1308a
storage/tokudb/ft-index/ft/tests/test3681
storage/tokudb/ft-index/ft/tests/test3856
storage/tokudb/ft-index/ft/tests/test3884
storage/tokudb/ft-index/ft/tests/test4115
storage/tokudb/ft-index/ft/tests/test4244
storage/tokudb/ft-index/ft/tests/test_block_allocator_merge
storage/tokudb/ft-index/ft/tests/test_logcursor
storage/tokudb/ft-index/ft/tests/test_oexcl
storage/tokudb/ft-index/ft/tests/test_toku_malloc_plain_free
storage/tokudb/ft-index/ft/tests/upgrade_test_simple
storage/tokudb/ft-index/ft/tests/verify-bad-msn
storage/tokudb/ft-index/ft/tests/verify-bad-pivots
storage/tokudb/ft-index/ft/tests/verify-dup-in-leaf
storage/tokudb/ft-index/ft/tests/verify-dup-pivots
storage/tokudb/ft-index/ft/tests/verify-misrouted-msgs
storage/tokudb/ft-index/ft/tests/verify-unsorted-leaf
storage/tokudb/ft-index/ft/tests/verify-unsorted-pivots
storage/tokudb/ft-index/ft/tests/x1764-test
storage/tokudb/ft-index/ft/tests/xid_lsn_independent
storage/tokudb/ft-index/ft/tests/ybt-test
storage/tokudb/ft-index/locktree/tests/concurrent_tree_create_destroy
storage/tokudb/ft-index/locktree/tests/concurrent_tree_lkr_acquire_release
storage/tokudb/ft-index/locktree/tests/concurrent_tree_lkr_insert_remove
storage/tokudb/ft-index/locktree/tests/concurrent_tree_lkr_insert_serial_large
storage/tokudb/ft-index/locktree/tests/concurrent_tree_lkr_remove_all
storage/tokudb/ft-index/locktree/tests/lock_request_create_set
storage/tokudb/ft-index/locktree/tests/lock_request_get_set_keys
storage/tokudb/ft-index/locktree/tests/lock_request_start_deadlock
storage/tokudb/ft-index/locktree/tests/lock_request_start_pending
storage/tokudb/ft-index/locktree/tests/locktree_conflicts
storage/tokudb/ft-index/locktree/tests/locktree_create_destroy
storage/tokudb/ft-index/locktree/tests/locktree_infinity
storage/tokudb/ft-index/locktree/tests/locktree_misc
storage/tokudb/ft-index/locktree/tests/locktree_overlapping_relock
storage/tokudb/ft-index/locktree/tests/locktree_simple_lock
storage/tokudb/ft-index/locktree/tests/locktree_single_txnid_optimization
storage/tokudb/ft-index/locktree/tests/manager_create_destroy
storage/tokudb/ft-index/locktree/tests/manager_locktree_map
storage/tokudb/ft-index/locktree/tests/manager_params
storage/tokudb/ft-index/locktree/tests/manager_reference_release_lt
storage/tokudb/ft-index/locktree/tests/manager_status
storage/tokudb/ft-index/locktree/tests/range_buffer_test
storage/tokudb/ft-index/locktree/tests/txnid_set_test
storage/tokudb/ft-index/locktree/tests/wfg_test
storage/tokudb/ft-index/portability/merge_archives_tokuportability_static.cmake
storage/tokudb/ft-index/portability/toku_config.h
storage/tokudb/ft-index/portability/tokuportability_static_depends.cc
storage/tokudb/ft-index/portability/tests/test-active-cpus
storage/tokudb/ft-index/portability/tests/test-cache-line-boundary-fails
storage/tokudb/ft-index/portability/tests/test-cpu-freq
storage/tokudb/ft-index/portability/tests/test-cpu-freq-openlimit17
storage/tokudb/ft-index/portability/tests/test-fair-rwlock
storage/tokudb/ft-index/portability/tests/test-filesystem-sizes
storage/tokudb/ft-index/portability/tests/test-flock
storage/tokudb/ft-index/portability/tests/test-fsync
storage/tokudb/ft-index/portability/tests/test-fsync-directory
storage/tokudb/ft-index/portability/tests/test-gettime
storage/tokudb/ft-index/portability/tests/test-gettimeofday
storage/tokudb/ft-index/portability/tests/test-hugepage
storage/tokudb/ft-index/portability/tests/test-max-data
storage/tokudb/ft-index/portability/tests/test-memory-status
storage/tokudb/ft-index/portability/tests/test-pagesize
storage/tokudb/ft-index/portability/tests/test-pthread-rwlock-rdlock
storage/tokudb/ft-index/portability/tests/test-pthread-rwlock-rwr
storage/tokudb/ft-index/portability/tests/test-pwrite4g
storage/tokudb/ft-index/portability/tests/test-snprintf
storage/tokudb/ft-index/portability/tests/test-stat
storage/tokudb/ft-index/portability/tests/test-toku-malloc
storage/tokudb/ft-index/portability/tests/test-xid
storage/tokudb/ft-index/portability/tests/try-assert-zero
storage/tokudb/ft-index/portability/tests/try-assert0
storage/tokudb/ft-index/portability/tests/try-leak-lost
storage/tokudb/ft-index/portability/tests/try-leak-reachable
storage/tokudb/ft-index/portability/tests/try-uninit
storage/tokudb/ft-index/src/merge_archives_tokufractaltree_static.cmake
storage/tokudb/ft-index/src/tokufractaltree_static_depends.cc
storage/tokudb/ft-index/src/tests/recovery_fileops_unit_dir
storage/tokudb/ft-index/toku_include/toku_config.h
storage/tokudb/ft-index/util/tests/marked-omt-test
storage/tokudb/ft-index/util/tests/omt-tmpl-test
storage/tokudb/ft-index/util/tests/sort-tmpl-test
storage/tokudb/ft-index/util/tests/test-kibbutz
storage/tokudb/ft-index/util/tests/test-kibbutz2
storage/tokudb/ft-index/util/tests/test-rwlock
storage/tokudb/ft-index/util/tests/test-rwlock-cheapness
storage/tokudb/ft-index/util/tests/test_circular_buffer
storage/tokudb/ft-index/util/tests/test_doubly_linked_list
storage/tokudb/ft-index/util/tests/test_partitioned_counter
storage/tokudb/ft-index/util/tests/test_partitioned_counter_5833
storage/tokudb/ft-index/util/tests/threadpool-test
storage/tokudb/ft-index/util/tests/threadpool-testrunf
storage/tokudb/ft-index/utils/tokudb_dump
storage/tokudb/ft-index/utils/tokudb_gen
storage/tokudb/ft-index/utils/tokudb_load
storage/tokudb/ft-index/tools/ba_replay
storage/tokudb/ft-index/tools/ftverify
storage/tokudb/ft-index/tools/tdb-recover
storage/tokudb/ft-index/tools/tdb_logprint
storage/tokudb/ft-index/tools/tokudb_dump
storage/tokudb/ft-index/tools/tokuftdump
libmysql/libmysql_versions.ld
scripts/mysql_config.pl


@ -230,22 +230,16 @@ pthread_handler_t handle_slave_sql(void *arg);
bool net_request_file(NET* net, const char* fname);
extern bool volatile abort_loop;
extern Master_info main_mi, *active_mi; /* active_mi for multi-master */
extern LIST master_list;
extern Master_info *active_mi; /* active_mi for multi-master */
extern my_bool replicate_same_server_id;
extern int disconnect_slave_event_count, abort_slave_event_count ;
/* the master variables are defaults read from my.cnf or command line */
extern uint master_port, master_connect_retry, report_port;
extern char * master_user, *master_password, *master_host;
extern uint report_port;
extern char *master_info_file, *report_user;
extern char *report_host, *report_password;
extern my_bool master_ssl;
extern char *master_ssl_ca, *master_ssl_capath, *master_ssl_cert;
extern char *master_ssl_cipher, *master_ssl_key;
extern I_List<THD> threads;
#else


@ -75,15 +75,12 @@ set_cflags_if_supported(-Wno-missing-field-initializers)
ADD_SUBDIRECTORY(ft-index)
# TODO: clean up includes in ft-index
INCLUDE_DIRECTORIES(ft-index)
INCLUDE_DIRECTORIES(ft-index/include)
INCLUDE_DIRECTORIES(ft-index/portability)
INCLUDE_DIRECTORIES(ft-index/toku_include)
INCLUDE_DIRECTORIES(ft-index/util)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ft-index)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ft-index/buildheader)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ft-index/toku_include)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ft-index/portability)
SET(TOKUDB_PLUGIN_DYNAMIC "ha_tokudb")
SET(TOKUDB_SOURCES ha_tokudb.cc)


@ -1,17 +1,17 @@
TokuDB
======
TokuDB is a high-performance, transactional storage engine for MySQL and
TokuDB is a high-performance, write optimized, transactional storage engine for MySQL and
MariaDB. For more details, see our [product page][products].
This repository contains the MySQL plugin that uses the [TokuKV][tokukv]
This repository contains the MySQL plugin that uses the [TokuFT][tokuft]
core.
There are also patches to the MySQL and MariaDB kernels, available in our
forks of [mysql][mysql] and [mariadb][mariadb].
[products]: http://www.tokutek.com/products/tokudb-for-mysql/
[tokukv]: http://github.com/Tokutek/ft-index
[tokuft]: http://github.com/Tokutek/ft-index
[mysql]: http://github.com/Tokutek/mysql
[mariadb]: http://github.com/Tokutek/mariadb


@ -51,14 +51,12 @@ if (USE_VALGRIND AND NOT VALGRIND_INCLUDE_DIR MATCHES NOTFOUND)
)
endif()
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/toku_include
${CMAKE_CURRENT_SOURCE_DIR}/portability
${CMAKE_CURRENT_SOURCE_DIR} ## so you can include <ft/ft-ops.h> from inside src/
${CMAKE_CURRENT_BINARY_DIR} ## for logging code
)
## include where config.h will be generated
include_directories(${CMAKE_CURRENT_BINARY_DIR}/toku_include)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/portability)
## build db.h and include where it will be generated
add_subdirectory(buildheader)
@ -76,12 +74,7 @@ add_subdirectory(portability)
add_subdirectory(ft)
add_subdirectory(locktree)
add_subdirectory(src)
add_subdirectory(utils)
## subdirectories that just install things
#add_subdirectory(include)
add_subdirectory(toku_include)
#add_subdirectory(examples)
add_subdirectory(tools)
INSTALL_DOCUMENTATION(README.md README-TOKUDB COMPONENT Server)


@ -0,0 +1,241 @@
cmake_policy(SET CMP0012 NEW)
## these tests shouldn't run with valgrind
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE
ft/bnc-insert-benchmark
ft/ft_loader-test-extractor-1
ft/ft_loader-test-extractor-2
ft/ft_loader-test-extractor-3
ft/upgrade_test_simple
portability/test-cache-line-boundary-fails
portability/try-leak-lost
portability/try-leak-reachable
portability/try-leak-uninit
util/helgrind_test_circular_buffer
util/helgrind_test_partitioned_counter
util/helgrind_test_partitioned_counter_5833
ydb/diskfull.tdb
ydb/drd_test_4015.tdb
ydb/drd_test_groupcommit_count.tdb
ydb/filesize.tdb
ydb/helgrind_helgrind1.tdb
ydb/helgrind_helgrind2.tdb
ydb/helgrind_helgrind3.tdb
ydb/helgrind_test_groupcommit_count.tdb
ydb/hot-optimize-table-tests.tdb
ydb/insert-dup-prelock.tdb
ydb/loader-cleanup-test2.tdb
ydb/loader-cleanup-test3.tdb
ydb/loader-stress-test4.tdb
ydb/maxsize-for-loader-B.tdb
ydb/openlimit17.tdb
ydb/openlimit17-locktree.tdb
ydb/preload-db-nested.tdb
ydb/stress-gc.tdb
ydb/stress-gc2.tdb
ydb/stress-test.tdb
ydb/test-5138.tdb
ydb/test-prepare.tdb
ydb/test-prepare2.tdb
ydb/test-prepare3.tdb
ydb/test-recover1.tdb
ydb/test-recover2.tdb
ydb/test-recover3.tdb
ydb/test-xa-prepare.tdb
ydb/test4573-logtrim.tdb
ydb/test_3645.tdb
ydb/test_groupcommit_perf.tdb
ydb/test_large_update_broadcast_small_cachetable.tdb
ydb/test_update_broadcast_stress.tdb
ydb/test_update_stress.tdb
ydb/upgrade-test-4.tdb
)
if (NOT @RUN_HELGRIND_TESTS@)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE
util/helgrind_test_circular_buffer
util/helgrind_test_partitioned_counter
util/helgrind_test_partitioned_counter_5833
ydb/helgrind_helgrind1.tdb
ydb/helgrind_helgrind2.tdb
ydb/helgrind_helgrind3.tdb
ydb/helgrind_test_groupcommit_count.tdb
)
endif ()
if (NOT @RUN_DRD_TESTS@)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE
ydb/drd_test_groupcommit_count.tdb
ydb/drd_test_4015.tdb
)
endif ()
## osx's pthreads prefer writers, so this test will deadlock
if (@CMAKE_SYSTEM_NAME@ STREQUAL Darwin)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE portability/test-pthread-rwlock-rwr)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE portability/test-pthread-rwlock-rwr)
endif ()
## tests that are supposed to crash will generate memcheck failures
set(tests_that_should_fail
ft/test-assertA
ft/test-assertB
portability/try-assert-zero
portability/try-assert0
ydb/recover-missing-dbfile-2.abortrecover
ydb/recover-missing-dbfile.abortrecover
ydb/test_db_no_env.tdb
ydb/test_truncate_txn_abort.tdb
)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${tests_that_should_fail})
## don't run drd stress tests with valgrind either (because that would do valgrind twice)
set(stress_tests
test_stress0.tdb
test_stress1.tdb
test_stress2.tdb
test_stress3.tdb
test_stress4.tdb
test_stress5.tdb
test_stress6.tdb
test_stress7.tdb
test_stress_hot_indexing.tdb
test_stress_openclose.tdb
test_stress_with_verify.tdb
)
foreach(test ${stress_tests})
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE
ydb/drd_tiny_${test}
ydb/drd_mid_${test}
ydb/drd_large_${test}
)
if(NOT @RUN_LONG_TESTS@)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE
ydb/drd_large_${test}
)
endif()
if (NOT @RUN_DRD_TESTS@)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE
ydb/drd_tiny_${test}
ydb/drd_mid_${test}
ydb/drd_large_${test}
)
endif ()
endforeach(test)
## upgrade stress tests are 5 minutes long, don't need to run them always
if(NOT @RUN_LONG_TESTS@)
foreach(test ${stress_tests})
if (NOT ${test} MATCHES test_stress_openclose)
foreach(oldver 4.2.0 5.0.8 5.2.7 6.0.0 6.1.0 6.5.1 6.6.3)
foreach(p_or_s pristine stressed)
if (NOT (${test} MATCHES test_stress4 AND ${p_or_s} MATCHES stressed))
foreach(size 2000)
list(APPEND CTEST_CUSTOM_TESTS_IGNORE ydb/${test}/upgrade/${oldver}/${p_or_s}/${size})
endforeach(size)
endif ()
endforeach(p_or_s)
endforeach(oldver)
endif ()
endforeach(test)
endif()
set(tdb_tests_that_should_fail "ydb/${stress_tests}")
string(REGEX REPLACE ";" ";ydb/" stress_tests "${stress_tests}")
set(recover_stress_tests
ydb/recover-test_stress1.abortrecover
ydb/recover-test_stress2.abortrecover
ydb/recover-test_stress3.abortrecover
ydb/recover-test_stress_openclose.abortrecover
)
## we run stress tests separately, only run them if asked to
if(NOT @RUN_STRESS_TESTS@)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${stress_tests} ${recover_stress_tests})
list(APPEND CTEST_CUSTOM_TESTS_IGNORE ${stress_tests} ${recover_stress_tests})
endif()
set(perf_tests
ydb/perf_checkpoint_var.tdb
ydb/perf_cursor_nop.tdb
ydb/perf_malloc_free.tdb
ydb/perf_nop.tdb
ydb/perf_ptquery.tdb
ydb/perf_ptquery2.tdb
ydb/perf_read_write.tdb
ydb/perf_xmalloc_free.tdb
)
## we also don't need to run perf tests every time
if(NOT @RUN_PERF_TESTS@)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${perf_tests})
list(APPEND CTEST_CUSTOM_TESTS_IGNORE ${perf_tests})
endif()
## don't run perf tests with valgrind (that's slow)
file(GLOB perf_test_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/src/tests" perf_*.cc)
string(REGEX REPLACE "\\.cc(;|$)" ".tdb\\1" perf_tests "${perf_test_srcs}")
set(tdb_tests_that_should_fail "ydb/${perf_tests}")
string(REGEX REPLACE ";" ";ydb/" perf_tests "${perf_tests}")
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${perf_tests})
## these tests fail often and aren't helpful
set(known_failing_tests
ydb/diskfull.tdb
)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${known_failing_tests})
list(APPEND CTEST_CUSTOM_TESTS_IGNORE ${known_failing_tests})
## these tests take a long time, only run them if asked to
set(long_running_tests
ft/is_empty
ft/upgrade_test_simple
ydb/checkpoint_1.tdb
ydb/checkpoint_stress.tdb
ydb/hotindexer-with-queries.tdb
ydb/hot-optimize-table-tests.tdb
ydb/loader-cleanup-test0.tdb
ydb/loader-cleanup-test0z.tdb
ydb/loader-cleanup-test2.tdb
ydb/loader-cleanup-test2z.tdb
ydb/loader-stress-test4.tdb
ydb/loader-stress-test4z.tdb
ydb/manyfiles.tdb
ydb/preload-db-nested.tdb
ydb/recover_stress.tdb
ydb/root_fifo_1.tdb
ydb/root_fifo_2.tdb
ydb/root_fifo_31.tdb
ydb/root_fifo_32.tdb
ydb/stress-gc.tdb
ydb/stress-test.tdb
ydb/test3529.tdb
ydb/test_logmax.tdb
ydb/test_txn_nested2.tdb
ydb/test_update_broadcast_stress.tdb
ydb/test_update_stress.tdb
)
if(NOT @RUN_LONG_TESTS@)
list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE ${long_running_tests})
list(APPEND CTEST_CUSTOM_TESTS_IGNORE ${long_running_tests})
endif()
## ignore log_print.cc in coverage report
list(APPEND CTEST_CUSTOM_COVERAGE_EXCLUDE "log_print.cc")
list(APPEND CTEST_CUSTOM_WARNING_EXCEPTION
# don't complain about warnings in xz source
"xz-4.999.9beta/src/liblzma"
# don't complain about clang missing warnings from xz code
"clang: warning: unknown warning option"
# don't complain about warnings in jemalloc source
"jemalloc/src"
"jemalloc/internal"
# don't complain about valgrind headers leaving things unused
"valgrind/valgrind.h"
"valgrind/memcheck.h"
# don't complain about ranlib or libtool on empty archive
"has no symbols"
"the table of contents is empty"
)


@ -25,7 +25,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:


@ -1,16 +1,16 @@
TokuKV
TokuFT
======
TokuKV is a high-performance, transactional key-value store, used in the
TokuFT is a high-performance, transactional key-value store, used in the
TokuDB storage engine for MySQL and MariaDB and in TokuMX, the
high-performance MongoDB distribution.
TokuKV is provided as a shared library with an interface similar to
TokuFT is provided as a shared library with an interface similar to
Berkeley DB.
To build the full MySQL product, see the instructions for
[Tokutek/ft-engine][ft-engine]. To build TokuMX, see the instructions
for [Tokutek/mongo][mongo]. This document covers TokuKV only.
for [Tokutek/mongo][mongo]. This document covers TokuFT only.
[ft-engine]: https://github.com/Tokutek/ft-engine
[mongo]: https://github.com/Tokutek/mongo
@ -19,7 +19,7 @@ for [Tokutek/mongo][mongo]. This document covers TokuKV only.
Building
--------
TokuKV is built using CMake >= 2.8.9. Out-of-source builds are
TokuFT is built using CMake >= 2.8.9. Out-of-source builds are
recommended. You need a C++11 compiler, though only GCC >= 4.7 and
Apple's Clang are tested. You also need zlib development packages
(`yum install zlib-devel` or `apt-get install zlib1g-dev`).
@ -35,7 +35,6 @@ mkdir build
cd build
CC=gcc47 CXX=g++47 cmake \
-D CMAKE_BUILD_TYPE=Debug \
-D USE_BDB=OFF \
-D BUILD_TESTING=OFF \
-D USE_VALGRIND=OFF \
-D CMAKE_INSTALL_PREFIX=../prefix/ \
@ -50,14 +49,14 @@ to that if you are planning to run benchmarks or in production.
### Platforms
TokuKV is supported on 64-bit Centos, should work on other 64-bit linux
distributions, and may work on OSX 10.8 and FreeBSD. TokuKV is not
TokuFT is supported on 64-bit Centos, should work on other 64-bit linux
distributions, and may work on OSX 10.8 and FreeBSD. TokuFT is not
supported on 32-bit systems.
[Transparent hugepages][transparent-hugepages] is a feature in newer linux
kernel versions that causes problems for the memory usage tracking
calculations in TokuKV and can lead to memory overcommit. If you have
this feature enabled, TokuKV will not start, and you should turn it off.
calculations in TokuFT and can lead to memory overcommit. If you have
this feature enabled, TokuFT will not start, and you should turn it off.
If you want to run with transparent hugepages on, you can set an
environment variable `TOKU_HUGE_PAGES_OK=1`, but only do this for testing,
and only with a small cache size.
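A minimal sketch of this kind of startup check (illustrative only, not taken
from this tree): it assumes the standard Linux sysfs path
`/sys/kernel/mm/transparent_hugepage/enabled` and the `TOKU_HUGE_PAGES_OK`
override described above; the exact test TokuFT performs may differ.

```c
/* Illustrative only: refuse to start when transparent hugepages appear
 * enabled, unless TOKU_HUGE_PAGES_OK=1 is set (testing only, small cache). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    char buf[256] = "";
    FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");
    if (f != NULL) {
        if (fgets(buf, sizeof buf, f) == NULL)
            buf[0] = '\0';
        fclose(f);
    }
    /* The kernel reports e.g. "always madvise [never]"; the bracketed word
     * is the active mode.  An unreadable file is treated as "off". */
    int thp_enabled = (buf[0] != '\0' && strstr(buf, "[never]") == NULL);
    if (thp_enabled && getenv("TOKU_HUGE_PAGES_OK") == NULL) {
        fprintf(stderr, "transparent hugepages are enabled; disable them "
                        "or set TOKU_HUGE_PAGES_OK=1 (testing only)\n");
        return 1;
    }
    return 0;
}
```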
@ -68,31 +67,26 @@ and only with a small cache size.
Examples
--------
There are some sample programs that can use either TokuKV or Berkeley DB
There are some sample programs that can use either TokuFT or Berkeley DB
in the `examples/` directory. Follow the above instructions to build and
install TokuKV, and then look in the installed `examples/` directory for
install TokuFT, and then look in the installed `examples/` directory for
instructions on building and running them.
Testing
-------
TokuKV uses CTest for testing. The CDash testing dashboard is not
TokuFT uses CTest for testing. The CDash testing dashboard is not
currently public, but you can run the tests without submitting them.
There are some large data files not stored in the git repository, that
will be made available soon. For now, the tests that use these files will
not run.
Many of the tests are linked with both TokuKV and Berkeley DB, as a sanity
check on the tests themselves. To build these tests, you will need
Berkeley DB and its header files installed. If you do not have Berkeley
DB installed, just don't pass `USE_BDB=ON`.
In the build directory from above:
```sh
cmake -D BUILD_TESTING=ON [-D USE_BDB=ON] ..
cmake -D BUILD_TESTING=ON ..
ctest -D ExperimentalStart \
-D ExperimentalConfigure \
-D ExperimentalBuild \
@ -103,7 +97,7 @@ ctest -D ExperimentalStart \
Contributing
------------
Please report bugs in TokuKV here on github.
Please report bugs in TokuFT to the [issue tracker][jira].
We have two publicly accessible mailing lists for TokuDB:
@ -121,11 +115,13 @@ and two for TokuMX:
We are also available on IRC on freenode.net, in the #tokutek channel.
[jira]: https://tokutek.atlassian.net/browse/FT/
License
-------
TokuKV is available under the GPL version 2, with slight modifications.
TokuFT is available under the GPL version 2, with slight modifications.
See [README-TOKUDB][license].
[license]: http://github.com/Tokutek/ft-index/blob/master/README-TOKUDB


@ -28,7 +28,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -341,8 +341,8 @@ static void print_defines (void) {
dodefine_from_track(txn_flags, DB_TXN_READ_ONLY);
}
/* TOKUDB specific error codes*/
printf("/* TOKUDB specific error codes */\n");
/* TokuFT specific error codes*/
printf("/* TokuFT specific error codes */\n");
dodefine(TOKUDB_OUT_OF_LOCKS);
dodefine(TOKUDB_SUCCEEDED_EARLY);
dodefine(TOKUDB_FOUND_BUT_REJECTED);
@ -422,7 +422,7 @@ static void print_db_env_struct (void) {
"int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */",
"int (*checkpointing_resume) (DB_ENV*) /* Alert tokudb 'postpone' is no longer necessary */",
"int (*checkpointing_resume) (DB_ENV*) /* Alert tokuft that 'postpone' is no longer necessary */",
"int (*checkpointing_begin_atomic_operation) (DB_ENV*) /* Begin a set of operations (that must be atomic as far as checkpoints are concerned). i.e. inserting into every index in one table */",
"int (*checkpointing_end_atomic_operation) (DB_ENV*) /* End a set of operations (that must be atomic as far as checkpoints are concerned). */",
"int (*set_default_bt_compare) (DB_ENV*,int (*bt_compare) (DB *, const DBT *, const DBT *)) /* Set default (key) comparison function for all DBs in this environment. Required for RECOVERY since you cannot open the DBs manually. */",
@ -465,6 +465,7 @@ static void print_db_env_struct (void) {
"void (*set_loader_memory_size)(DB_ENV *env, uint64_t (*get_loader_memory_size_callback)(void))",
"uint64_t (*get_loader_memory_size)(DB_ENV *env)",
"void (*set_killed_callback)(DB_ENV *env, uint64_t default_killed_time_msec, uint64_t (*get_killed_time_callback)(uint64_t default_killed_time_msec), int (*killed_callback)(void))",
"void (*do_backtrace) (DB_ENV *env)",
NULL};
sort_and_dump_fields("db_env", true, extra);
@ -545,6 +546,7 @@ static void print_db_struct (void) {
"int (*change_fanout)(DB *db, uint32_t fanout)",
"int (*get_fanout)(DB *db, uint32_t *fanout)",
"int (*set_fanout)(DB *db, uint32_t fanout)",
"int (*set_memcmp_magic)(DB *db, uint8_t magic)",
"int (*set_indexer)(DB*, DB_INDEXER*)",
"void (*get_indexer)(DB*, DB_INDEXER**)",
"int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going)",
@ -573,11 +575,10 @@ static void print_db_txn_struct (void) {
STRUCT_SETUP(DB_TXN, prepare, "int (*%s) (DB_TXN*, uint8_t gid[DB_GID_SIZE])");
STRUCT_SETUP(DB_TXN, discard, "int (*%s) (DB_TXN*, uint32_t)");
STRUCT_SETUP(DB_TXN, id, "uint32_t (*%s) (DB_TXN *)");
STRUCT_SETUP(DB_TXN, mgrp, "DB_ENV *%s /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/");
STRUCT_SETUP(DB_TXN, mgrp, "DB_ENV *%s /* In TokuFT, mgrp is a DB_ENV, not a DB_TXNMGR */");
STRUCT_SETUP(DB_TXN, parent, "DB_TXN *%s");
const char *extra[] = {
"int (*txn_stat)(DB_TXN *, struct txn_stat **)",
"struct toku_list open_txns",
"int (*commit_with_progress)(DB_TXN*, uint32_t, TXN_PROGRESS_POLL_FUNCTION, void*)",
"int (*abort_with_progress)(DB_TXN*, TXN_PROGRESS_POLL_FUNCTION, void*)",
"int (*xa_prepare) (DB_TXN*, TOKU_XA_XID *)",
@ -614,6 +615,7 @@ static void print_dbc_struct (void) {
"int (*c_set_bounds)(DBC*, const DBT*, const DBT*, bool pre_acquire, int out_of_range_error)",
"void (*c_set_check_interrupt_callback)(DBC*, bool (*)(void*), void *)",
"void (*c_remove_restriction)(DBC*)",
"char _internal[512]",
NULL};
sort_and_dump_fields("dbc", false, extra);
}
@ -635,12 +637,11 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
//printf("#include <inttypes.h>\n");
printf("#if defined(__cplusplus) || defined(__cilkplusplus)\nextern \"C\" {\n#endif\n");
printf("#define TOKUDB 1\n");
printf("#define DB_VERSION_MAJOR %d\n", DB_VERSION_MAJOR);
printf("#define DB_VERSION_MINOR %d\n", DB_VERSION_MINOR);
printf("/* As of r40364 (post TokuDB 5.2.7), the patch version number is 100+ the BDB header patch version number.*/\n");
printf("/* As of r40364 (post TokuFT 5.2.7), the patch version number is 100+ the BDB header patch version number.*/\n");
printf("#define DB_VERSION_PATCH %d\n", 100+DB_VERSION_PATCH);
printf("#define DB_VERSION_STRING \"Tokutek: TokuDB %d.%d.%d\"\n", DB_VERSION_MAJOR, DB_VERSION_MINOR, 100+DB_VERSION_PATCH);
printf("#define DB_VERSION_STRING \"Tokutek: TokuFT %d.%d.%d\"\n", DB_VERSION_MAJOR, DB_VERSION_MINOR, 100+DB_VERSION_PATCH);
#ifndef DB_GID_SIZE
#define DB_GID_SIZE DB_XIDDATASIZE
@ -654,7 +655,6 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
" char data[DB_GID_SIZE];\n"
"} TOKU_XA_XID;\n");
//Typedef toku_off_t
printf("#ifndef TOKU_OFF_T_DEFINED\n"
"#define TOKU_OFF_T_DEFINED\n"
"typedef int64_t toku_off_t;\n"
@ -673,7 +673,10 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
printf("typedef uint32_t db_recno_t;\n");
printf("typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);\n");
printf("#include <tdb-internal.h>\n");
printf("struct simple_dbt {\n");
printf(" uint32_t len;\n");
printf(" void *data;\n");
printf("};\n");
//stat64
printf("typedef struct __toku_db_btree_stat64 {\n");


@ -26,15 +26,54 @@ SET(CMAKE_RANLIB "@CMAKE_RANLIB@")
SET(TEMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/merge_archives_${TARGET})
MAKE_DIRECTORY(${TEMP_DIR})
# Extract each archive to its own subdirectory(avoid object filename clashes)
# Extract each archive to its own subdirectory(avoid object filename
# clashes) Since the lib may contain objects with the same name, we first
# list the archive contents, then uniquify the object names as we extract
# them.
FOREACH(LIB ${STATIC_LIBS})
GET_FILENAME_COMPONENT(NAME_NO_EXT ${LIB} NAME_WE)
SET(TEMP_SUBDIR ${TEMP_DIR}/${NAME_NO_EXT})
MAKE_DIRECTORY(${TEMP_SUBDIR})
EXECUTE_PROCESS(
COMMAND ${CMAKE_AR} -x ${LIB}
WORKING_DIRECTORY ${TEMP_SUBDIR}
COMMAND ${CMAKE_AR} -t ${LIB}
OUTPUT_VARIABLE LIB_OBJS
)
STRING(REGEX REPLACE "\n" ";" LIB_OBJ_LIST "${LIB_OBJS}")
STRING(REGEX REPLACE ";$" "" LIB_OBJ_LIST "${LIB_OBJ_LIST}")
LIST(LENGTH LIB_OBJ_LIST LENGTH_WITH_DUPS)
SET(LIB_OBJ_LIST_NO_DUPS ${LIB_OBJ_LIST})
IF (LENGTH_WITH_DUPS GREATER 0)
LIST(REMOVE_DUPLICATES LIB_OBJ_LIST_NO_DUPS)
ENDIF ()
LIST(LENGTH LIB_OBJ_LIST_NO_DUPS LENGTH_WITHOUT_DUPS)
IF(LENGTH_WITH_DUPS EQUAL LENGTH_WITHOUT_DUPS)
# Optimization for when lib doesn't actually have duplicate object
# names, we can just extract everything.
EXECUTE_PROCESS(
COMMAND ${CMAKE_AR} -x ${LIB}
WORKING_DIRECTORY ${TEMP_SUBDIR}
)
ELSE()
LIST(SORT LIB_OBJ_LIST)
SET(SAME_OBJ_COUNT 1)
SET(LAST_OBJ_NAME)
FOREACH(OBJ ${LIB_OBJ_LIST})
IF(OBJ STREQUAL LAST_OBJ_NAME)
GET_FILENAME_COMPONENT(OBJ_NO_EXT ${OBJ} NAME_WE)
FILE(RENAME "${TEMP_SUBDIR}/${OBJ}" "${TEMP_SUBDIR}/${OBJ_NO_EXT}.${SAME_OBJ_COUNT}.o")
MATH(EXPR SAME_OBJ_COUNT "${SAME_OBJ_COUNT}+1")
ELSE()
SET(SAME_OBJ_COUNT 1)
ENDIF()
SET(LAST_OBJ_NAME "${OBJ}")
EXECUTE_PROCESS(
COMMAND ${CMAKE_AR} -xN ${SAME_OBJ_COUNT} ${LIB} ${OBJ}
WORKING_DIRECTORY ${TEMP_SUBDIR}
)
ENDFOREACH()
ENDIF()
FILE(GLOB_RECURSE LIB_OBJECTS "${TEMP_SUBDIR}/*.o")
SET(OBJECTS ${OBJECTS} ${LIB_OBJECTS})
@ -51,11 +90,7 @@ ENDFOREACH()
FILE(TO_NATIVE_PATH ${TARGET_LOCATION} ${TARGET_LOCATION})
# Now pack the objects into library with ar.
EXECUTE_PROCESS(
COMMAND ${CMAKE_AR} -r ${TARGET_LOCATION} ${ALL_OBJECTS}
WORKING_DIRECTORY ${TEMP_DIR}
)
EXECUTE_PROCESS(
COMMAND ${CMAKE_RANLIB} ${TARGET_LOCATION}
COMMAND ${CMAKE_AR} rcs ${TARGET_LOCATION} ${ALL_OBJECTS}
WORKING_DIRECTORY ${TEMP_DIR}
)


@ -1,27 +0,0 @@
# - Try to find BDB
# Once done this will define
# BDB_FOUND - System has BDB
# BDB_INCLUDE_DIRS - The BDB include directories
# BDB_LIBRARIES - The libraries needed to use BDB
# BDB_DEFINITIONS - Compiler switches required for using BDB
find_path(BDB_INCLUDE_DIR db.h)
find_library(BDB_LIBRARY NAMES db libdb)
include(CheckSymbolExists)
## check if the found bdb has DB_TXN_SNAPSHOT
set(CMAKE_REQUIRED_INCLUDES ${BDB_INCLUDE_DIR})
check_symbol_exists(DB_TXN_SNAPSHOT "db.h" HAVE_DB_TXN_SNAPSHOT)
if(HAVE_DB_TXN_SNAPSHOT)
set(BDB_INCLUDE_DIRS ${BDB_INCLUDE_DIR})
set(BDB_LIBRARIES ${BDB_LIBRARY})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set BDB_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(BDB DEFAULT_MSG
BDB_LIBRARY BDB_INCLUDE_DIR)
mark_as_advanced(BDB_INCLUDE_DIR BDB_LIBRARY)
endif()


@ -1,128 +0,0 @@
## set up lists of sources and headers for tags
file(GLOB_RECURSE all_srcs
buildheader/*.cc
db-benchmark-test/*.cc
ft/*.cc
include/*.cc
locktree/*.cc
portability/*.cc
src/*.cc
toku_include/*.cc
utils/*.cc
util/*.cc
db-benchmark-test/*.cc
)
list(APPEND all_srcs
${CMAKE_CURRENT_BINARY_DIR}/ft/log_code.cc
${CMAKE_CURRENT_BINARY_DIR}/ft/log_print.cc
)
file(GLOB_RECURSE all_hdrs
buildheader/*.h
db-benchmark-test/*.h
ft/*.h
include/*.h
locktree/*.h
portability/*.h
src/*.h
toku_include/*.h
utils/*.h
util/*.h
db-benchmark-test/*.h
)
list(APPEND all_hdrs
${CMAKE_CURRENT_BINARY_DIR}/toku_include/toku_config.h
${CMAKE_CURRENT_BINARY_DIR}/buildheader/db.h
${CMAKE_CURRENT_BINARY_DIR}/ft/log_header.h
)
option(USE_CTAGS "Build the ctags database." ON)
if (USE_CTAGS AND
# Macs by default are not case-sensitive, so tags and TAGS clobber each other. Do etags and not ctags in that case, because Emacs is superior. :P
(NOT APPLE OR NOT USE_ETAGS))
find_program(CTAGS "ctags")
if (NOT CTAGS MATCHES NOTFOUND)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/tags"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ctags-stamp"
COMMAND ${CTAGS} -o tags ${all_srcs} ${all_hdrs}
COMMAND touch "${CMAKE_CURRENT_BINARY_DIR}/ctags-stamp"
DEPENDS ${all_srcs} ${all_hdrs} install_tdb_h generate_config_h generate_log_code
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_target(build_ctags ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/tags" ctags-stamp)
endif ()
endif ()
option(USE_ETAGS "Build the etags database." ON)
if (USE_ETAGS)
find_program(ETAGS "etags")
if (NOT ETAGS MATCHES NOTFOUND)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/TAGS"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/etags-stamp"
COMMAND ${ETAGS} -o TAGS ${all_srcs} ${all_hdrs}
COMMAND touch "${CMAKE_CURRENT_BINARY_DIR}/etags-stamp"
DEPENDS ${all_srcs} ${all_hdrs} install_tdb_h generate_config_h generate_log_code
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_target(build_etags ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/TAGS" etags-stamp)
endif ()
endif ()
option(USE_CSCOPE "Build the cscope database." ON)
if (USE_CSCOPE)
find_program(CSCOPE "cscope")
if (NOT CSCOPE MATCHES NOTFOUND)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/cscope.files" "")
foreach(file ${all_srcs} ${all_hdrs})
file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/cscope.files" "${file}\n")
endforeach(file)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/cscope.out"
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/cscope.in.out"
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/cscope.po.out"
COMMAND ${CSCOPE} -b -q -R -i"${CMAKE_CURRENT_BINARY_DIR}/cscope.files" -I"${CMAKE_CURRENT_SOURCE_DIR}" -I"${CMAKE_CURRENT_SOURCE_DIR}/include" -I"${CMAKE_CURRENT_SOURCE_DIR}/toku_include" -I"${CMAKE_CURRENT_SOURCE_DIR}/portability" -I"${CMAKE_CURRENT_SOURCE_DIR}/ft" -I"${CMAKE_CURRENT_SOURCE_DIR}/src" -I"${CMAKE_CURRENT_SOURCE_DIR}/locktree" -I"${CMAKE_CURRENT_SOURCE_DIR}/utils" -I"${CMAKE_CURRENT_SOURCE_DIR}/db-benchmark-test" -I"${CMAKE_CURRENT_BINARY_DIR}" -I"${CMAKE_CURRENT_BINARY_DIR}/toku_include" -I"${CMAKE_CURRENT_BINARY_DIR}/buildheader"
DEPENDS ${all_srcs} ${all_hdrs} install_tdb_h generate_config_h generate_log_code
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_target(build_cscope.out ALL DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/cscope.out"
"${CMAKE_CURRENT_SOURCE_DIR}/cscope.in.out"
"${CMAKE_CURRENT_SOURCE_DIR}/cscope.po.out")
endif ()
endif ()
option(USE_GTAGS "Build the gtags database." ON)
if (USE_GTAGS)
find_program(GTAGS "gtags")
if (NOT GTAGS MATCHES NOTFOUND)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/gtags.files" "")
foreach(file ${all_srcs} ${all_hdrs})
file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/gtags.files" "${file}\n")
endforeach(file)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/GTAGS"
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/GRTAGS"
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/GPATH"
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/GSYMS"
COMMAND ${GTAGS} -f "${CMAKE_CURRENT_BINARY_DIR}/gtags.files"
DEPENDS ${all_srcs} ${all_hdrs} install_tdb_h generate_config_h generate_log_code
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_target(build_GTAGS ALL DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/GTAGS"
"${CMAKE_CURRENT_SOURCE_DIR}/GRTAGS"
"${CMAKE_CURRENT_SOURCE_DIR}/GPATH"
"${CMAKE_CURRENT_SOURCE_DIR}/GSYMS")
endif ()
endif ()
option(USE_MKID "Build the idutils database." ON)
if (USE_MKID)
find_program(MKID "mkid")
if (NOT MKID MATCHES NOTFOUND)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/ID"
COMMAND ${MKID} ${all_srcs} ${all_hdrs}
DEPENDS ${all_srcs} ${all_hdrs} install_tdb_h generate_config_h generate_log_code
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_target(build_MKID ALL DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/ID")
endif ()
endif ()


@ -2,11 +2,6 @@
find_package(Threads)
find_package(ZLIB REQUIRED)
option(USE_BDB "Build some tools and tests with bdb (requires a proper BerkeleyDB include directory and library)." ON)
if(USE_BDB)
find_package(BDB REQUIRED)
endif()
option(USE_VALGRIND "Build to run safely under valgrind (often slower)." ON)
if(USE_VALGRIND)
find_package(Valgrind REQUIRED)


@ -94,8 +94,6 @@ if (BUILD_TESTING OR BUILD_FT_TESTS OR BUILD_SRC_TESTS)
## set up full valgrind suppressions file (concatenate the suppressions files)
file(READ ft/valgrind.suppressions valgrind_suppressions)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/valgrind.suppressions" "${valgrind_suppressions}")
file(READ src/tests/bdb.suppressions bdb_suppressions)
file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/valgrind.suppressions" "${bdb_suppressions}")
file(READ bash.suppressions bash_suppressions)
file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/valgrind.suppressions" "${bash_suppressions}")


@ -1,16 +0,0 @@
# detect when we are being built as a subproject
if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING)
install(
FILES
db-insert.c
db-insert-multiple.c
db-scan.c
db-update.c
Makefile
README.examples
DESTINATION
examples
COMPONENT
tokukv_examples
)
endif ()


@ -1,29 +0,0 @@
SRCS = $(wildcard *.c)
TARGETS = $(patsubst %.c,%,$(SRCS)) $(patsubst %.c,%-bdb,$(SRCS))
CPPFLAGS = -I../include -D_GNU_SOURCE
CFLAGS = -g -std=c99 -Wall -Wextra -Werror -Wno-missing-field-initializers
ifeq ($(USE_STATIC_LIBS),1)
LIBTOKUDB = tokufractaltree_static
LIBTOKUPORTABILITY = tokuportability_static
else
LIBTOKUDB = tokufractaltree
LIBTOKUPORTABILITY = tokuportability
endif
LDFLAGS = -L../lib -l$(LIBTOKUDB) -l$(LIBTOKUPORTABILITY) -Wl,-rpath,../lib -lpthread -lz -ldl
default local: $(TARGETS)
%: %.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)
%-bdb: %.c
$(CC) -D_GNU_SOURCE -DBDB $(CFLAGS) $^ -o $@ -ldb
check: $(TARGETS)
./db-insert -x && ./db-scan --lwc --prelock --prelockflag
checknox: $(TARGETS)
./db-insert && ./db-scan --nox --lwc --prelock --prelockflag
clean:
rm -rf $(TARGETS) bench.* update.env.* insertm.env.*


@ -1,85 +0,0 @@
The examples include programs that can be compiled to use either the
Berkeley DB library or the Tokutek Fractal Tree index library.
Note: The file formats are different between TokuDB and Berkeley DB. Thus
you cannot access a database created by Berkeley DB using TokuDB, or
vice versa.
db-insert is a program that inserts random key-value pairs into a database.
db-scan is a program that scans through the key-value pairs in a database, reading every row.
db-update is a program that upserts key-value pairs into a database. If the key already exists, it increments a count in the value.
db-insert-multiple is a program that inserts key-value pairs into multiple databases. This is how TokuDB maintains consistent
secondary databases.
To build it and run it (it's been tested on Fedora 10):
$ make (Makes the binaries)
Run the insertion workload under TokuDB:
$ ./db-insert
Run the insertion workload under BDB:
$ ./db-insert-bdb
Here is what the output looks like (this is on a Thinkpad X61s laptop
running Fedora 10). BDB is a little faster for sequential insertions
(the first three columns), but much, much slower for random insertions
(the next three columns), so TokuDB is faster on the combined workload.
$ ./db-insert
serial and random insertions of 1048576 per batch
serial 2.609965s 401759/s random 10.983798s 95466/s cumulative 13.593869s 154272/s
serial 3.053433s 343409/s random 12.008670s 87318/s cumulative 28.656115s 146367/s
serial 5.198312s 201715/s random 15.087426s 69500/s cumulative 48.954605s 128516/s
serial 6.096396s 171999/s random 13.550688s 77382/s cumulative 68.638321s 122215/s
Shutdown 4.025110s
Total time 72.677498s for 8388608 insertions = 115422/s
$ ./db-insert-bdb
serial and random insertions of 1048576 per batch
serial 2.623888s 399627/s random 8.770850s 119552/s cumulative 11.394805s 184045/s
serial 3.081946s 340232/s random 21.046589s 49822/s cumulative 35.523434s 118071/s
serial 14.160498s 74049/s random 497.117523s 2109/s cumulative 546.804504s 11506/s
serial 1.534212s 683462/s random 1128.525146s 929/s cumulative 1676.863892s 5003/s
Shutdown 195.879242s
Total time 1872.746582s for 8388608 insertions = 4479/s
The files are smaller for TokuDB than BDB.
$ ls -lh bench.tokudb/
total 39M
-rwxrwxr-x 1 bradley bradley 39M 2009-07-28 15:36 bench.db
$ ls -lh bench.bdb/
total 322M
-rw-r--r-- 1 bradley bradley 322M 2009-07-28 16:14 bench.db
When scanning the table, one can run out of locks with BDB. There are ways around it (increase the lock table size).
$ ./db-scan-bdb --nox
Lock table is out of available object entries
db-scan-bdb: db-scan.c:177: scanscan_hwc: Assertion `r==(-30988)' failed.
Aborted
TokuDB is fine on a big table scan.
$ ./db-scan --nox
Scan 33162304 bytes (2072644 rows) in 7.924463s at 4.184801MB/s
Scan 33162304 bytes (2072644 rows) in 3.062239s at 10.829431MB/s
0:3 1:53 2:56
miss=3 hit=53 wait_reading=0 wait=0
VmPeak: 244668 kB
VmHWM: 68096 kB
VmRSS: 1232 kB
The update-bdb program upserts 1B rows into a BDB database. When the database gets larger than memory, the throughput
should tank since every update needs to read a block from the storage system. The storage system becomes the performance
bottleneck. The program uses a 1GB cache in front of the kernel's file system buffer cache. The program should hit the wall
at about 300M rows on a machine with 16GB of memory since keys are 8 bytes and values are 8 bytes in size.
$ ./db-update-bdb
The update program upserts 1B rows into a TokuDB database. Throughput should not degrade significantly since the cost
of the storage system reads is amortized over thousands of update operations. One should expect TokuDB to be at least 50 times
faster than BDB.
$ ./db-update
There isn't much documentation for the Tokutek Fractal Tree index library, but most of the API is like Berkeley DB's.
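Since the interface mirrors Berkeley DB's, a basic transactional insert looks
like classic BDB code. The sketch below is illustrative and not taken from this
repository: the environment directory name `bench.env`, the flag combination,
and the assert-style error handling are assumptions, while the calls themselves
(`db_env_create`, `env->open`, `db_create`, `db->open`, `db->put`,
`txn->commit`) are part of the shared BDB-style API. Link it against `-ldb` or
against the Tokutek libraries exactly as in the Makefile shown earlier.

```c
/* Illustrative sketch of the BDB-style API shared by Berkeley DB and the
 * Tokutek Fractal Tree library. */
#include <assert.h>
#include <string.h>
#include <sys/stat.h>
#include <db.h>  /* TokuFT's generated db.h or the system Berkeley DB header */

int main(void) {
    DB_ENV *env;
    DB *db;
    DB_TXN *txn;
    int r;

    mkdir("bench.env", 0755);    /* the environment home must already exist */
    r = db_env_create(&env, 0);                                 assert(r == 0);
    r = env->open(env, "bench.env",
                  DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL |
                  DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN, 0755);
    assert(r == 0);

    r = db_create(&db, env, 0);                                 assert(r == 0);
    r = db->open(db, NULL, "bench.db", NULL, DB_BTREE,
                 DB_CREATE | DB_AUTO_COMMIT, 0644);
    assert(r == 0);

    /* One transaction, one 8-byte key and 8-byte value, as in the examples. */
    r = env->txn_begin(env, NULL, &txn, 0);                     assert(r == 0);
    long k = 42, v = 1;
    DBT key, val;
    memset(&key, 0, sizeof key); key.data = &k; key.size = sizeof k;
    memset(&val, 0, sizeof val); val.data = &v; val.size = sizeof v;
    r = db->put(db, txn, &key, &val, 0);                        assert(r == 0);
    r = txn->commit(txn, 0);                                    assert(r == 0);

    r = db->close(db, 0);                                       assert(r == 0);
    r = env->close(env, 0);                                     assert(r == 0);
    return 0;
}
```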


@ -1,510 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// measure the performance of insertions into multiple dictionaries using ENV->put_multiple
// the table schema is t(a bigint, b bigint, c bigint, d bigint, primary key(a), key(b), key(c,d), clustering key(d))
// the primary key(a) is represented with key=a and value=b,c,d
// the key(b) index is represented with key=b,a and no value
// the key(c,d) index is represented with key=c,d,a and no value
// the clustering key(d) is represented with key=d,a and value=b,c
// a is auto increment
// b, c and d are random
#include "../include/toku_config.h"
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#if defined(HAVE_BYTESWAP_H)
# include <byteswap.h>
#elif defined(HAVE_LIBKERN_OSBYTEORDER_H)
# include <libkern/OSByteOrder.h>
# define bswap_64 OSSwapInt64
#endif
#include <arpa/inet.h>
#include "db.h"
static int force_multiple = 1;
struct table {
int ndbs;
DB **dbs;
#if defined(TOKUDB)
DBT *mult_keys;
DBT *mult_vals;
uint32_t *mult_flags;
#endif
};
#if defined(TOKUDB)
static void table_init_dbt(DBT *dbt, size_t length) {
dbt->flags = DB_DBT_USERMEM;
dbt->data = malloc(length);
dbt->ulen = length;
dbt->size = 0;
}
static void table_destroy_dbt(DBT *dbt) {
free(dbt->data);
}
#endif
static void table_init(struct table *t, int ndbs, DB **dbs, size_t key_length __attribute__((unused)), size_t val_length __attribute__((unused))) {
t->ndbs = ndbs;
t->dbs = dbs;
#if defined(TOKUDB)
t->mult_keys = calloc(ndbs, sizeof (DBT));
int i;
for (i = 0; i < ndbs; i++)
table_init_dbt(&t->mult_keys[i], key_length);
t->mult_vals = calloc(ndbs, sizeof (DBT));
for (i = 0; i < ndbs; i++)
table_init_dbt(&t->mult_vals[i], val_length);
t->mult_flags = calloc(ndbs, sizeof (uint32_t));
for (i = 0; i < ndbs; i++)
t->mult_flags[i] = 0;
#endif
}
static void table_destroy(struct table *t) {
#if defined(TOKUDB)
int i;
for (i = 0; i < t->ndbs; i++)
table_destroy_dbt(&t->mult_keys[i]);
free(t->mult_keys);
for (i = 0; i < t->ndbs; i++)
table_destroy_dbt(&t->mult_vals[i]);
free(t->mult_vals);
free(t->mult_flags);
#else
assert(t);
#endif
}
static int verbose = 0;
static long random64(void) {
return ((long)random() << 32LL) + (long)random();
}
static long htonl64(long x) {
#if BYTE_ORDER == LITTLE_ENDIAN
return bswap_64(x);
#elif BYTE_ORDER == BIG_ENDIAN
return x;
#else
#error "htonl64: unknown BYTE_ORDER"
#endif
}
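// note: storing the 8-byte integers in network (big-endian) order makes the
// byte-wise key comparison used by the storage engine agree with numeric
// order, e.g. the serialized form of htonl64(1) sorts before htonl64(2)
// under memcmp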
#if defined(TOKUDB)
static int my_generate_row_for_put(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val) {
assert(src_db);
assert(dest_key->flags == DB_DBT_USERMEM && dest_key->ulen >= 4 * 8);
assert(dest_val->flags == DB_DBT_USERMEM && dest_val->ulen >= 4 * 8);
int index_num;
assert(dest_db->descriptor->dbt.size == sizeof index_num);
memcpy(&index_num, dest_db->descriptor->dbt.data, sizeof index_num);
switch (htonl(index_num) % 4) {
case 0:
// dest_key = src_key
dest_key->size = src_key->size;
memcpy(dest_key->data, src_key->data, src_key->size);
// dest_val = src_val
dest_val->size = src_val->size;
memcpy(dest_val->data, src_val->data, src_val->size);
break;
case 1:
// dest_key = b,a
dest_key->size = 2 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 0, 8);
memcpy((char *)dest_key->data + 8, (char *)src_key->data + 0, 8);
// dest_val = null
dest_val->size = 0;
break;
case 2:
// dest_key = c,d,a
dest_key->size = 3 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 8, 8);
memcpy((char *)dest_key->data + 8, (char *)src_val->data + 16, 8);
memcpy((char *)dest_key->data + 16, (char *)src_key->data + 0, 8);
// dest_val = null
dest_val->size = 0;
break;
case 3:
// dest_key = d,a
dest_key->size = 2 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 16, 8);
memcpy((char *)dest_key->data + 8, (char *)src_key->data + 0, 8);
// dest_val = b,c
dest_val->size = 2 * 8;
memcpy((char *)dest_val->data + 0, (char *)src_val->data + 0, 8);
memcpy((char *)dest_val->data + 8, (char *)src_val->data + 8, 8);
break;
default:
assert(0);
}
return 0;
}
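// Sketch of the control flow, as understood from the TokuDB API used here
// (not spelled out in the original file): env->put_multiple() in insert_row()
// below calls this callback once per destination DB, including the primary;
// the descriptor attached to each DB in main() carries the index number that
// selects which of the four encodings to emit into the preallocated
// DB_DBT_USERMEM buffers set up by table_init().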
#else
static int my_secondary_key(DB *db, const DBT *src_key, const DBT *src_val, DBT *dest_key) {
assert(dest_key->flags == 0 && dest_key->data == NULL);
dest_key->flags = DB_DBT_APPMALLOC;
dest_key->data = malloc(4 * 8); assert(dest_key->data);
switch ((intptr_t)db->app_private % 4) {
case 0:
// dest_key = src_key
dest_key->size = src_key->size;
memcpy(dest_key->data, src_key->data, src_key->size);
break;
case 1:
// dest_key = b,a
dest_key->size = 2 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 0, 8);
memcpy((char *)dest_key->data + 8, (char *)src_key->data + 0, 8);
break;
case 2:
// dest_key = c,d,a
dest_key->size = 3 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 8, 8);
memcpy((char *)dest_key->data + 8, (char *)src_val->data + 16, 8);
memcpy((char *)dest_key->data + 16, (char *)src_key->data + 0, 8);
break;
case 3:
// dest_key = d,a,b,c
dest_key->size = 4 * 8;
memcpy((char *)dest_key->data + 0, (char *)src_val->data + 16, 8);
memcpy((char *)dest_key->data + 8, (char *)src_key->data + 0, 8);
memcpy((char *)dest_key->data + 16, (char *)src_val->data + 0, 8);
memcpy((char *)dest_key->data + 24, (char *)src_val->data + 8, 8);
break;
default:
assert(0);
}
return 0;
}
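// Note the difference from the TokuDB layout above: BDB's associate()
// callback can only produce a secondary key, not a secondary value, so the
// clustering key(d) index is emulated here by packing all remaining columns
// into the key as d,a,b,c instead of key d,a with value b,c.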
#endif
static void insert_row(DB_ENV *db_env, struct table *t, DB_TXN *txn, long a, long b, long c, long d) {
int r;
// generate the primary key
char key_buffer[8];
a = htonl64(a);
memcpy(key_buffer, &a, sizeof a);
// generate the primary value
char val_buffer[3*8];
b = htonl64(b);
memcpy(val_buffer+0, &b, sizeof b);
c = htonl64(c);
memcpy(val_buffer+8, &c, sizeof c);
d = htonl64(d);
memcpy(val_buffer+16, &d, sizeof d);
DBT key = { .data = key_buffer, .size = sizeof key_buffer };
DBT value = { .data = val_buffer, .size = sizeof val_buffer };
#if defined(TOKUDB)
if (!force_multiple && t->ndbs == 1) {
r = t->dbs[0]->put(t->dbs[0], txn, &key, &value, t->mult_flags[0]); assert(r == 0);
} else {
r = db_env->put_multiple(db_env, t->dbs[0], txn, &key, &value, t->ndbs, &t->dbs[0], t->mult_keys, t->mult_vals, t->mult_flags); assert(r == 0);
}
#else
assert(db_env);
r = t->dbs[0]->put(t->dbs[0], txn, &key, &value, 0); assert(r == 0);
#endif
}
static inline float tdiff (struct timeval *a, struct timeval *b) {
return (a->tv_sec - b->tv_sec) +1e-6*(a->tv_usec - b->tv_usec);
}
static void insert_all(DB_ENV *db_env, struct table *t, long nrows, long max_rows_per_txn, long key_range, long rows_per_report, bool do_txn) {
int r;
struct timeval tstart;
r = gettimeofday(&tstart, NULL); assert(r == 0);
struct timeval tlast = tstart;
DB_TXN *txn = NULL;
if (do_txn) {
r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0);
}
long n_rows_per_txn = 0;
long rowi;
for (rowi = 0; rowi < nrows; rowi++) {
long a = rowi;
long b = random64() % key_range;
long c = random64() % key_range;
long d = random64() % key_range;
insert_row(db_env, t, txn, a, b, c, d);
n_rows_per_txn++;
// maybe commit
if (do_txn && n_rows_per_txn == max_rows_per_txn) {
r = txn->commit(txn, 0); assert(r == 0);
r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0);
n_rows_per_txn = 0;
}
// maybe report performance
if (((rowi + 1) % rows_per_report) == 0) {
struct timeval tnow;
r = gettimeofday(&tnow, NULL); assert(r == 0);
float last_time = tdiff(&tnow, &tlast);
float total_time = tdiff(&tnow, &tstart);
printf("%ld %.3f %.0f/s %.0f/s\n", rowi + 1, last_time, rows_per_report/last_time, rowi/total_time); fflush(stdout);
tlast = tnow;
}
}
if (do_txn) {
r = txn->commit(txn, 0); assert(r == 0);
}
struct timeval tnow;
r = gettimeofday(&tnow, NULL); assert(r == 0);
printf("total %ld %.3f %.0f/s\n", nrows, tdiff(&tnow, &tstart), nrows/tdiff(&tnow, &tstart)); fflush(stdout);
}
int main(int argc, char *argv[]) {
#if defined(TOKUDB)
char *db_env_dir = "insertm.env.tokudb";
#else
char *db_env_dir = "insertm.env.bdb";
#endif
int db_env_open_flags = DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG;
long rows = 100000000;
long rows_per_txn = 1000;
long rows_per_report = 100000;
long key_range = 100000;
bool do_txn = true;
u_int32_t pagesize = 0;
u_int64_t cachesize = 1000000000;
int ndbs = 4;
#if defined(TOKUDB)
u_int32_t checkpoint_period = 60;
#endif
int i;
for (i = 1; i < argc; i++) {
char *arg = argv[i];
if (strcmp(arg, "--verbose") == 0) {
verbose++;
continue;
}
if (strcmp(arg, "--ndbs") == 0 && i+1 < argc) {
ndbs = atoi(argv[++i]);
continue;
}
if (strcmp(arg, "--rows") == 0 && i+1 < argc) {
rows = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--rows_per_txn") == 0 && i+1 < argc) {
rows_per_txn = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--rows_per_report") == 0 && i+1 < argc) {
rows_per_report = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--key_range") == 0 && i+1 < argc) {
key_range = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--txn") == 0 && i+1 < argc) {
do_txn = atoi(argv[++i]);
continue;
}
if (strcmp(arg, "--pagesize") == 0 && i+1 < argc) {
pagesize = atoi(argv[++i]);
continue;
}
if (strcmp(arg, "--cachesize") == 0 && i+1 < argc) {
cachesize = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--force_multiple") == 0 && i+1 < argc) {
force_multiple = atoi(argv[++i]);
continue;
}
#if defined(TOKUDB)
if (strcmp(arg, "--checkpoint_period") == 0 && i+1 < argc) {
checkpoint_period = atoi(argv[++i]);
continue;
}
#endif
assert(0);
}
int r;
char rm_cmd[strlen(db_env_dir) + strlen("rm -rf ") + 1];
snprintf(rm_cmd, sizeof(rm_cmd), "rm -rf %s", db_env_dir);
r = system(rm_cmd); assert(r == 0);
r = mkdir(db_env_dir, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); assert(r == 0);
// create and open the env
DB_ENV *db_env = NULL;
r = db_env_create(&db_env, 0); assert(r == 0);
if (!do_txn)
db_env_open_flags &= ~(DB_INIT_TXN | DB_INIT_LOG);
if (cachesize) {
const u_int64_t gig = 1 << 30;
r = db_env->set_cachesize(db_env, cachesize / gig, cachesize % gig, 1); assert(r == 0);
}
#if defined(TOKUDB)
r = db_env->set_generate_row_callback_for_put(db_env, my_generate_row_for_put); assert(r == 0);
#endif
r = db_env->open(db_env, db_env_dir, db_env_open_flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0);
#if defined(TOKUDB)
if (checkpoint_period) {
r = db_env->checkpointing_set_period(db_env, checkpoint_period); assert(r == 0);
u_int32_t period;
r = db_env->checkpointing_get_period(db_env, &period); assert(r == 0 && period == checkpoint_period);
}
#endif
// create the db
DB *dbs[ndbs];
for (i = 0; i < ndbs; i++) {
DB *db = NULL;
r = db_create(&db, db_env, 0); assert(r == 0);
DB_TXN *create_txn = NULL;
if (do_txn) {
r = db_env->txn_begin(db_env, NULL, &create_txn, 0); assert(r == 0);
}
if (pagesize) {
r = db->set_pagesize(db, pagesize); assert(r == 0);
}
char db_filename[32]; snprintf(db_filename, sizeof db_filename, "test%d", i);
r = db->open(db, create_txn, db_filename, NULL, DB_BTREE, DB_CREATE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0);
#if defined(TOKUDB)
DESCRIPTOR_S new_descriptor;
int index_num = htonl(i);
new_descriptor.dbt.data = &index_num;
new_descriptor.dbt.size = sizeof i;
r = db->change_descriptor(db, create_txn, &new_descriptor.dbt, 0); assert(r == 0);
#else
db->app_private = (void *) (intptr_t) i;
if (i > 0) {
r = dbs[0]->associate(dbs[0], create_txn, db, my_secondary_key, 0); assert(r == 0);
}
#endif
if (do_txn) {
r = create_txn->commit(create_txn, 0); assert(r == 0);
}
dbs[i] = db;
}
// insert all rows
struct table table;
table_init(&table, ndbs, dbs, 4 * 8, 4 * 8);
insert_all(db_env, &table, rows, rows_per_txn, key_range, rows_per_report, do_txn);
table_destroy(&table);
// shutdown
for (i = 0; i < ndbs; i++) {
DB *db = dbs[i];
r = db->close(db, 0); assert(r == 0); db = NULL;
}
r = db_env->close(db_env, 0); assert(r == 0); db_env = NULL;
return 0;
}

View File

@ -1,610 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// Define BDB if you want to compile this to use Berkeley DB
#include <stdint.h>
#include <inttypes.h>
#ifdef BDB
#include <sys/types.h>
#include <db.h>
#define DIRSUF bdb
#else
#include <tokudb.h>
#define DIRSUF tokudb
#endif
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
return (a->tv_sec - b->tv_sec) +1e-6*(a->tv_usec - b->tv_usec);
}
#if !defined(DB_PRELOCKED_WRITE)
#define NO_DB_PRELOCKED
#define DB_PRELOCKED_WRITE 0
#endif
int verbose=1;
enum { SERIAL_SPACING = 1<<6 };
enum { DEFAULT_ITEMS_TO_INSERT_PER_ITERATION = 1<<20 };
enum { DEFAULT_ITEMS_PER_TRANSACTION = 1<<14 };
static void insert (long long v);
#define CKERR(r) ({ int __r = r; if (__r!=0) fprintf(stderr, "%s:%d error %d %s\n", __FILE__, __LINE__, __r, db_strerror(r)); assert(__r==0); })
#define CKERR2(r,rexpect) do { if (r!=rexpect) fprintf(stderr, "%s:%d error %d %s\n", __FILE__, __LINE__, r, db_strerror(r)); assert(r==rexpect); } while (0)
/* default test parameters */
int keysize = sizeof (long long);
int valsize = sizeof (long long);
int pagesize = 0;
long long cachesize = 1000000000; // 1GB
int dupflags = 0;
int noserial = 0; // Don't do the serial stuff
int norandom = 0; // Don't do the random stuff
int prelock = 0;
int prelockflag = 0;
int items_per_transaction = DEFAULT_ITEMS_PER_TRANSACTION;
int items_per_iteration = DEFAULT_ITEMS_TO_INSERT_PER_ITERATION;
int finish_child_first = 0; // Commit or abort child first (before doing so to the parent). No effect if child does not exist.
int singlex_child = 0; // Do a single transaction, but do all work with a child
int singlex = 0; // Do a single transaction
int singlex_create = 0; // Create the db using the single transaction (only valid if singlex)
int insert1first = 0; // insert 1 before doing the rest
int do_transactions = 0;
int if_transactions_do_logging = DB_INIT_LOG; // set this to zero if we want no logging when transactions are used
int do_abort = 0;
int n_insertions_since_txn_began=0;
int env_open_flags = DB_CREATE|DB_PRIVATE|DB_INIT_MPOOL;
u_int32_t put_flags = 0;
double compressibility = -1; // -1 means make it very compressible. 1 means use random bits everywhere. 2 means half the bits are random.
int do_append = 0;
u_int32_t checkpoint_period = 60;
static void do_prelock(DB* db, DB_TXN* txn) {
if (prelock) {
#if !defined(NO_DB_PRELOCKED)
int r = db->pre_acquire_table_lock(db, txn);
assert(r==0);
#else
(void) db; (void) txn;
#endif
}
}
#define STRINGIFY2(s) #s
#define STRINGIFY(s) STRINGIFY2(s)
const char *dbdir = "./bench." STRINGIFY(DIRSUF);
char *dbfilename = "bench.db";
char *dbname;
DB_ENV *dbenv;
DB *db;
DB_TXN *parenttid=0;
DB_TXN *tid=0;
static void benchmark_setup (void) {
int r;
if (!do_append) {
char unlink_cmd[strlen(dbdir) + strlen("rm -rf ") + 1];
snprintf(unlink_cmd, sizeof(unlink_cmd), "rm -rf %s", dbdir);
//printf("unlink_cmd=%s\n", unlink_cmd);
r = system(unlink_cmd); assert(r == 0);
if (strcmp(dbdir, ".") != 0) {
r = mkdir(dbdir,S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
assert(r == 0);
}
}
r = db_env_create(&dbenv, 0);
assert(r == 0);
#if !defined(TOKUDB)
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
if (dbenv->set_lk_max) {
r = dbenv->set_lk_max(dbenv, items_per_transaction*2);
assert(r==0);
}
#elif (DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 7) || DB_VERSION_MAJOR >= 5
if (dbenv->set_lk_max_locks) {
r = dbenv->set_lk_max_locks(dbenv, items_per_transaction*2);
assert(r==0);
}
if (dbenv->set_lk_max_lockers) {
r = dbenv->set_lk_max_lockers(dbenv, items_per_transaction*2);
assert(r==0);
}
if (dbenv->set_lk_max_objects) {
r = dbenv->set_lk_max_objects(dbenv, items_per_transaction*2);
assert(r==0);
}
#else
#error
#endif
#endif
if (dbenv->set_cachesize) {
r = dbenv->set_cachesize(dbenv, cachesize / (1024*1024*1024), cachesize % (1024*1024*1024), 1);
if (r != 0)
printf("WARNING: set_cachesize %d\n", r);
}
{
r = dbenv->open(dbenv, dbdir, env_open_flags, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
assert(r == 0);
}
#if defined(TOKUDB)
if (checkpoint_period) {
printf("set checkpoint_period %u\n", checkpoint_period);
r = dbenv->checkpointing_set_period(dbenv, checkpoint_period); assert(r == 0);
u_int32_t period;
r = dbenv->checkpointing_get_period(dbenv, &period); assert(r == 0 && period == checkpoint_period);
}
#endif
r = db_create(&db, dbenv, 0);
assert(r == 0);
if (do_transactions) {
r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
}
if (pagesize && db->set_pagesize) {
r = db->set_pagesize(db, pagesize);
assert(r == 0);
}
if (dupflags) {
r = db->set_flags(db, dupflags);
assert(r == 0);
}
r = db->open(db, tid, dbfilename, NULL, DB_BTREE, DB_CREATE, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (r!=0) fprintf(stderr, "errno=%d, %s\n", errno, strerror(errno));
assert(r == 0);
if (insert1first) {
if (do_transactions) {
r=tid->commit(tid, 0);
assert(r==0);
tid = NULL;
r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
}
insert(-1);
if (singlex) {
r=tid->commit(tid, 0);
assert(r==0);
tid = NULL;
r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
}
}
else if (singlex && !singlex_create) {
r=tid->commit(tid, 0);
assert(r==0);
tid = NULL;
r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
}
if (do_transactions) {
if (singlex)
do_prelock(db, tid);
else {
r=tid->commit(tid, 0);
assert(r==0);
tid = NULL;
}
}
if (singlex_child) {
parenttid = tid;
tid = NULL;
r=dbenv->txn_begin(dbenv, parenttid, &tid, 0); CKERR(r);
}
}
static void benchmark_shutdown (void) {
int r;
if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) {
#if defined(TOKUDB)
//There should be a single 'truncate' in the rollback instead of many 'insert' entries.
struct txn_stat *s;
r = tid->txn_stat(tid, &s);
assert(r==0);
//TODO: #1125 Always do the test after performance testing is done.
if (singlex_child) fprintf(stderr, "SKIPPED 'small rollback' test for child txn\n");
else
assert(s->rollback_raw_count < 100); // gross test, not worth investigating details
free(s);
//system("ls -l bench.tokudb");
#endif
}
if (do_transactions && singlex) {
if (!singlex_child || finish_child_first) {
assert(tid);
r = (do_abort ? tid->abort(tid) : tid->commit(tid, 0)); assert(r==0);
tid = NULL;
}
if (singlex_child) {
assert(parenttid);
r = (do_abort ? parenttid->abort(parenttid) : parenttid->commit(parenttid, 0)); assert(r==0);
parenttid = NULL;
}
else
assert(!parenttid);
}
assert(!tid);
assert(!parenttid);
r = db->close(db, 0);
assert(r == 0);
r = dbenv->close(dbenv, 0);
assert(r == 0);
}
static void long_long_to_array (unsigned char *a, int array_size, unsigned long long l) {
int i;
for (i=0; i<8 && i<array_size; i++)
a[i] = (l>>(56-8*i))&0xff;
}
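// Illustrative example: long_long_to_array(a, 8, 0x0102030405060708LL) fills
// a[] with {0x01, 0x02, ..., 0x08}, most significant byte first, so that
// memcmp() order on the serialized keys matches numeric order of the values.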
static DBT *fill_dbt(DBT *dbt, const void *data, int size) {
memset(dbt, 0, sizeof *dbt);
dbt->size = size;
dbt->data = (void *) data;
return dbt;
}
// Fill array with 0's if compressibilty==-1, otherwise fill array with data that is likely to compress by a factor of compressibility.
static void fill_array (unsigned char *data, int size) {
memset(data, 0, size);
if (compressibility>0) {
int i;
for (i=0; i<size/compressibility; i++) {
data[i] = (unsigned char) random();
}
}
}
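// Illustrative example: with --compressibility 2 and an 8-byte value, the
// first size/compressibility = 4 bytes are random and the remaining 4 bytes
// stay zero, so roughly half of each value is incompressible; with the
// default compressibility of -1 the loop is skipped and the data is all zeros.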
static void insert (long long v) {
unsigned char kc[keysize], vc[valsize];
DBT kt, vt;
fill_array(kc, sizeof kc);
long_long_to_array(kc, keysize, v); // Fill in the array first, then write the long long in.
fill_array(vc, sizeof vc);
long_long_to_array(vc, valsize, v);
int r = db->put(db, tid, fill_dbt(&kt, kc, keysize), fill_dbt(&vt, vc, valsize), put_flags);
CKERR(r);
if (do_transactions) {
if (n_insertions_since_txn_began>=items_per_transaction && !singlex) {
n_insertions_since_txn_began=0;
r = tid->commit(tid, 0); assert(r==0);
tid = NULL;
r=dbenv->txn_begin(dbenv, 0, &tid, 0); assert(r==0);
do_prelock(db, tid);
n_insertions_since_txn_began=0;
}
n_insertions_since_txn_began++;
}
}
static void serial_insert_from (long long from) {
long long i;
if (do_transactions && !singlex) {
int r = dbenv->txn_begin(dbenv, 0, &tid, 0); assert(r==0);
do_prelock(db, tid);
{
DBT k,v;
r=db->put(db, tid, fill_dbt(&k, "a", 1), fill_dbt(&v, "b", 1), put_flags);
CKERR(r);
}
}
for (i=0; i<items_per_iteration; i++) {
insert((from+i)*SERIAL_SPACING);
}
if (do_transactions && !singlex) {
int r= tid->commit(tid, 0); assert(r==0);
tid=NULL;
}
}
static long long llrandom (void) {
return (((long long)(random()))<<32) + random();
}
static void random_insert_below (long long below) {
long long i;
if (do_transactions && !singlex) {
int r = dbenv->txn_begin(dbenv, 0, &tid, 0); assert(r==0);
do_prelock(db, tid);
}
for (i=0; i<items_per_iteration; i++) {
insert(llrandom()%below);
}
if (do_transactions && !singlex) {
int r= tid->commit(tid, 0); assert(r==0);
tid=NULL;
}
}
static void biginsert (long long n_elements, struct timeval *starttime) {
long long i;
struct timeval t1,t2;
int iteration;
for (i=0, iteration=0; i<n_elements; i+=items_per_iteration, iteration++) {
if (verbose) {
printf("%d ", iteration);
fflush(stdout);
}
if (!noserial) {
gettimeofday(&t1,0);
serial_insert_from(i);
gettimeofday(&t2,0);
if (verbose) {
printf("serial %9.6fs %8.0f/s ", toku_tdiff(&t2, &t1), items_per_iteration/toku_tdiff(&t2, &t1));
fflush(stdout);
}
}
if (!norandom) {
gettimeofday(&t1,0);
random_insert_below((i+items_per_iteration)*SERIAL_SPACING);
gettimeofday(&t2,0);
if (verbose) {
printf("random %9.6fs %8.0f/s ", toku_tdiff(&t2, &t1), items_per_iteration/toku_tdiff(&t2, &t1));
fflush(stdout);
}
}
if (verbose) {
printf("cumulative %9.6fs %8.0f/s\n", toku_tdiff(&t2, starttime), (((float)items_per_iteration*(!noserial+!norandom))/toku_tdiff(&t2, starttime))*(iteration+1));
fflush(stdout);
}
}
}
const long long default_n_items = 1LL<<22;
static int print_usage (const char *argv0) {
fprintf(stderr, "Usage:\n");
fprintf(stderr, " %s [-x] [--keysize KEYSIZE] [--valsize VALSIZE] [--noserial] [--norandom] [ n_iterations ]\n", argv0);
fprintf(stderr, " where\n");
fprintf(stderr, " -x do transactions (XCOUNT transactions per iteration) (default: no transactions at all)\n");
fprintf(stderr, " --keysize KEYSIZE sets the key size (default 8)\n");
fprintf(stderr, " --valsize VALSIZE sets the value size (default 8)\n");
fprintf(stderr, " --noserial causes the serial insertions to be skipped\n");
fprintf(stderr, " --norandom causes the random insertions to be skipped\n");
fprintf(stderr, " --cachesize CACHESIZE set the database cache size\n");
fprintf(stderr, " --pagesize PAGESIZE sets the database page size\n");
fprintf(stderr, " --compressibility C creates data that should compress by about a factor C. Default C is large. C is an float.\n");
fprintf(stderr, " --xcount N how many insertions per transaction (default=%d)\n", DEFAULT_ITEMS_PER_TRANSACTION);
fprintf(stderr, " --singlex (implies -x) Run the whole job as a single transaction. (Default don't run as a single transaction.)\n");
fprintf(stderr, " --singlex-child (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n");
fprintf(stderr, " --finish-child-first Commit/abort child before doing so to parent (no effect if no child).\n");
fprintf(stderr, " --singlex-create (implies --singlex) Create the file using the single transaction (Default is to use a different transaction to create.)\n");
fprintf(stderr, " --prelock Prelock the database.\n");
fprintf(stderr, " --prelockflag Prelock the database and send the DB_PRELOCKED_WRITE flag.\n");
fprintf(stderr, " --abort Abort the singlex after the transaction is over. (Requires --singlex.)\n");
fprintf(stderr, " --nolog If transactions are used, then don't write the recovery log\n");
fprintf(stderr, " --periter N how many insertions per iteration (default=%d)\n", DEFAULT_ITEMS_TO_INSERT_PER_ITERATION);
fprintf(stderr, " --env DIR\n");
fprintf(stderr, " --append append to an existing file\n");
fprintf(stderr, " --checkpoint-period %" PRIu32 " checkpoint period\n", checkpoint_period);
fprintf(stderr, " n_iterations how many iterations (default %lld)\n", default_n_items/DEFAULT_ITEMS_TO_INSERT_PER_ITERATION);
return 1;
}
#define UU(x) x __attribute__((__unused__))
int main (int argc, const char *argv[]) {
struct timeval t1,t2,t3;
long long total_n_items = default_n_items;
char *endptr;
int i;
for (i=1; i<argc; i++) {
const char *arg = argv[i];
if (arg[0] != '-')
break;
if (strcmp(arg, "-q") == 0) {
verbose--; if (verbose<0) verbose=0;
} else if (strcmp(arg, "-x") == 0) {
do_transactions = 1;
} else if (strcmp(arg, "--noserial") == 0) {
noserial=1;
} else if (strcmp(arg, "--norandom") == 0) {
norandom=1;
} else if (strcmp(arg, "--compressibility") == 0) {
compressibility = atof(argv[++i]);
} else if (strcmp(arg, "--nolog") == 0) {
if_transactions_do_logging = 0;
} else if (strcmp(arg, "--singlex-create") == 0) {
do_transactions = 1;
singlex = 1;
singlex_create = 1;
} else if (strcmp(arg, "--finish-child-first") == 0) {
finish_child_first = 1;
} else if (strcmp(arg, "--singlex-child") == 0) {
do_transactions = 1;
singlex = 1;
singlex_child = 1;
} else if (strcmp(arg, "--singlex") == 0) {
do_transactions = 1;
singlex = 1;
} else if (strcmp(arg, "--insert1first") == 0) {
insert1first = 1;
} else if (strcmp(arg, "--xcount") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
items_per_transaction = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0);
} else if (strcmp(arg, "--abort") == 0) {
do_abort = 1;
} else if (strcmp(arg, "--periter") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
items_per_iteration = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0);
} else if (strcmp(arg, "--cachesize") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
cachesize = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0);
} else if (strcmp(arg, "--keysize") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
keysize = atoi(argv[++i]);
} else if (strcmp(arg, "--valsize") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
valsize = atoi(argv[++i]);
} else if (strcmp(arg, "--pagesize") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
pagesize = atoi(argv[++i]);
} else if (strcmp(arg, "--env") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
dbdir = argv[++i];
} else if (strcmp(arg, "--prelock") == 0) {
prelock=1;
} else if (strcmp(arg, "--prelockflag") == 0) {
prelock=1;
prelockflag=1;
} else if (strcmp(arg, "--srandom") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
srandom(atoi(argv[++i]));
} else if (strcmp(arg, "--append") == 0) {
do_append = 1;
} else if (strcmp(arg, "--checkpoint-period") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
checkpoint_period = (u_int32_t) atoi(argv[++i]);
} else if (strcmp(arg, "--unique_checks") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
int unique_checks = atoi(argv[++i]);
if (unique_checks)
put_flags = DB_NOOVERWRITE;
else
put_flags = 0;
} else {
return print_usage(argv[0]);
}
}
if (do_transactions) {
env_open_flags |= DB_INIT_TXN | if_transactions_do_logging | DB_INIT_LOCK;
}
if (do_transactions && prelockflag) {
put_flags |= DB_PRELOCKED_WRITE;
}
if (i<argc) {
/* if it looks like a number */
char *end;
errno=0;
long n_iterations = strtol(argv[i], &end, 10);
if (errno!=0 || *end!=0 || end==argv[i]) {
print_usage(argv[0]);
return 1;
}
total_n_items = items_per_iteration * (long long)n_iterations;
}
if (verbose) {
if (!noserial) printf("serial ");
if (!noserial && !norandom) printf("and ");
if (!norandom) printf("random ");
printf("insertions of %d per batch%s\n", items_per_iteration, do_transactions ? " (with transactions)" : "");
}
benchmark_setup();
gettimeofday(&t1,0);
biginsert(total_n_items, &t1);
gettimeofday(&t2,0);
benchmark_shutdown();
gettimeofday(&t3,0);
if (verbose) {
printf("Shutdown %9.6fs\n", toku_tdiff(&t3, &t2));
printf("Total time %9.6fs for %lld insertions = %8.0f/s\n", toku_tdiff(&t3, &t1),
(!noserial+!norandom)*total_n_items, (!noserial+!norandom)*total_n_items/toku_tdiff(&t3, &t1));
}
return 0;
}

View File

@ -1,461 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Scan the bench.tokudb/bench.db over and over. */
#define DONT_DEPRECATE_MALLOC
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <inttypes.h>
#ifdef BDB
#include <db.h>
#define DIRSUF bdb
#else
#include <tokudb.h>
#define DIRSUF tokudb
#endif
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
static const char *pname;
static enum run_mode { RUN_HWC, RUN_LWC, RUN_VERIFY, RUN_RANGE} run_mode = RUN_HWC;
static int do_txns=1, prelock=0, prelockflag=0;
static u_int32_t lock_flag = 0;
static long limitcount=-1;
static u_int32_t cachesize = 127*1024*1024;
static u_int64_t start_range = 0, end_range = 0;
static int n_experiments = 2;
static int bulk_fetch = 1;
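// Summary of the run modes handled below (inferred from the scan functions in
// this file): RUN_HWC walks the table with a conventional cursor
// (c_get/DB_NEXT); RUN_LWC uses TokuDB's callback cursor (c_getf_next);
// RUN_VERIFY runs both side by side and checks that they return identical
// rows; RUN_RANGE positions a cursor with DB_SET_RANGE at random keys and
// scans forward up to --count rows from each position.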
static int print_usage (const char *argv0) {
fprintf(stderr, "Usage:\n%s [--verify-lwc | --lwc | --nohwc] [--prelock] [--prelockflag] [--prelockwriteflag] [--env DIR]\n", argv0);
fprintf(stderr, " --verify-lwc means to run the light weight cursor and the heavyweight cursor to verify that they get the same answer.\n");
fprintf(stderr, " --lwc run light weight cursors instead of heavy weight cursors\n");
fprintf(stderr, " --prelock acquire a read lock on the entire table before running\n");
fprintf(stderr, " --prelockflag pass DB_PRELOCKED to the the cursor get operation whenever the locks have been acquired\n");
fprintf(stderr, " --prelockwriteflag pass DB_PRELOCKED_WRITE to the cursor get operation\n");
fprintf(stderr, " --nox no transactions (no locking)\n");
fprintf(stderr, " --count COUNT read the first COUNT rows and then stop.\n");
fprintf(stderr, " --cachesize N set the env cachesize to N bytes\n");
fprintf(stderr, " --srandom N srandom(N)\n");
fprintf(stderr, " --env DIR put db files in DIR instead of default\n");
fprintf(stderr, " --bulk_fetch 0|1 do bulk fetch on lwc operations (default: 1)\n");
return 1;
}
static DB_ENV *env;
static DB *db;
static DB_TXN *tid=0;
#define STRINGIFY2(s) #s
#define STRINGIFY(s) STRINGIFY2(s)
static const char *dbdir = "./bench." STRINGIFY(DIRSUF); /* DIRSUF is passed in as a -D argument to the compiler. */
static int env_open_flags_yesx = DB_CREATE|DB_PRIVATE|DB_INIT_MPOOL|DB_INIT_TXN|DB_INIT_LOG|DB_INIT_LOCK;
static int env_open_flags_nox = DB_CREATE|DB_PRIVATE|DB_INIT_MPOOL;
static char *dbfilename = "bench.db";
static void parse_args (int argc, const char *argv[]) {
pname=argv[0];
argc--; argv++;
int specified_run_mode=0;
while (argc>0) {
if (strcmp(*argv,"--verify-lwc")==0) {
if (specified_run_mode && run_mode!=RUN_VERIFY) { two_modes: fprintf(stderr, "You specified two run modes\n"); exit(1); }
run_mode = RUN_VERIFY;
} else if (strcmp(*argv, "--lwc")==0) {
if (specified_run_mode && run_mode!=RUN_LWC) goto two_modes;
run_mode = RUN_LWC;
} else if (strcmp(*argv, "--hwc")==0) {
if (specified_run_mode && run_mode!=RUN_VERIFY) goto two_modes;
run_mode = RUN_HWC;
} else if (strcmp(*argv, "--prelock")==0) prelock=1;
#ifdef TOKUDB
else if (strcmp(*argv, "--prelockflag")==0) { prelockflag=1; lock_flag = DB_PRELOCKED; }
else if (strcmp(*argv, "--prelockwriteflag")==0) { prelockflag=1; lock_flag = DB_PRELOCKED_WRITE; }
#endif
else if (strcmp(*argv, "--nox")==0) { do_txns=0; }
else if (strcmp(*argv, "--count")==0) {
char *end;
argc--; argv++;
errno=0; limitcount=strtol(*argv, &end, 10); assert(errno==0);
printf("Limiting count to %ld\n", limitcount);
} else if (strcmp(*argv, "--cachesize")==0 && argc>0) {
char *end;
argc--; argv++;
cachesize=(u_int32_t)strtol(*argv, &end, 10);
} else if (strcmp(*argv, "--env") == 0) {
argc--; argv++;
if (argc==0) exit(print_usage(pname));
dbdir = *argv;
} else if (strcmp(*argv, "--range") == 0 && argc > 2) {
run_mode = RUN_RANGE;
argc--; argv++;
start_range = strtoll(*argv, NULL, 10);
argc--; argv++;
end_range = strtoll(*argv, NULL, 10);
} else if (strcmp(*argv, "--experiments") == 0 && argc > 1) {
argc--; argv++;
n_experiments = strtol(*argv, NULL, 10);
} else if (strcmp(*argv, "--srandom") == 0 && argc > 1) {
argc--; argv++;
srandom(atoi(*argv));
} else if (strcmp(*argv, "--bulk_fetch") == 0 && argc > 1) {
argc--; argv++;
bulk_fetch = atoi(*argv);
} else {
exit(print_usage(pname));
}
argc--; argv++;
}
//Prelocking is meaningless without transactions
if (do_txns==0) {
prelockflag=0;
lock_flag=0;
prelock=0;
}
}
static void scanscan_setup (void) {
int r;
r = db_env_create(&env, 0); assert(r==0);
r = env->set_cachesize(env, 0, cachesize, 1); assert(r==0);
r = env->open(env, dbdir, do_txns? env_open_flags_yesx : env_open_flags_nox, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); assert(r==0);
r = db_create(&db, env, 0); assert(r==0);
if (do_txns) {
r = env->txn_begin(env, 0, &tid, 0); assert(r==0);
}
r = db->open(db, tid, dbfilename, NULL, DB_BTREE, 0, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); assert(r==0);
#ifdef TOKUDB
if (prelock) {
r = db->pre_acquire_table_lock(db, tid);
assert(r==0);
}
#endif
}
static void scanscan_shutdown (void) {
int r;
r = db->close(db, 0); assert(r==0);
if (do_txns) {
r = tid->commit(tid, 0); assert(r==0);
}
r = env->close(env, 0); assert(r==0);
}
static double gettime (void) {
struct timeval tv;
int r = gettimeofday(&tv, 0);
assert(r==0);
return tv.tv_sec + 1e-6*tv.tv_usec;
}
static void scanscan_hwc (void) {
int r;
int counter=0;
for (counter=0; counter<n_experiments; counter++) {
long long totalbytes=0;
int rowcounter=0;
double prevtime = gettime();
DBT k,v;
DBC *dbc;
r = db->cursor(db, tid, &dbc, 0); assert(r==0);
memset(&k, 0, sizeof(k));
memset(&v, 0, sizeof(v));
u_int32_t c_get_flags = DB_NEXT;
if (prelockflag && (counter || prelock)) {
c_get_flags |= lock_flag;
}
while (0 == (r = dbc->c_get(dbc, &k, &v, c_get_flags))) {
//printf("r=%d\n", r);
totalbytes += k.size + v.size;
rowcounter++;
if (limitcount>0 && rowcounter>=limitcount) break;
}
assert(r==DB_NOTFOUND);
r = dbc->c_close(dbc); assert(r==0);
double thistime = gettime();
double tdiff = thistime-prevtime;
printf("Scan %lld bytes (%d rows) in %9.6fs at %9fMB/s\n", totalbytes, rowcounter, tdiff, 1e-6*totalbytes/tdiff);
}
}
#ifdef TOKUDB
struct extra_count {
long long totalbytes;
int rowcounter;
};
static int counttotalbytes (DBT const *key, DBT const *data, void *extrav) {
struct extra_count *e=extrav;
e->totalbytes += key->size + data->size;
e->rowcounter++;
return bulk_fetch ? TOKUDB_CURSOR_CONTINUE : 0;
}
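// Returning TOKUDB_CURSOR_CONTINUE from the c_getf_next callback asks the
// cursor to keep handing over rows from its current buffer (bulk fetch), so a
// single c_getf_next call can deliver many rows; returning 0 stops after the
// current row, which is what --bulk_fetch 0 measures. (Behavior as understood
// from the TokuDB cursor API; it is not documented in this file.)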
static void scanscan_lwc (void) {
int r;
int counter=0;
for (counter=0; counter<n_experiments; counter++) {
struct extra_count e = {0,0};
double prevtime = gettime();
DBC *dbc;
r = db->cursor(db, tid, &dbc, 0); assert(r==0);
u_int32_t f_flags = 0;
if (prelockflag && (counter || prelock)) {
f_flags |= lock_flag;
}
long rowcounter=0;
while (0 == (r = dbc->c_getf_next(dbc, f_flags, counttotalbytes, &e))) {
rowcounter++;
if (limitcount>0 && rowcounter>=limitcount) break;
}
r = dbc->c_close(dbc); assert(r==0);
double thistime = gettime();
double tdiff = thistime-prevtime;
printf("LWC Scan %lld bytes (%d rows) in %9.6fs at %9fMB/s\n", e.totalbytes, e.rowcounter, tdiff, 1e-6*e.totalbytes/tdiff);
}
}
#endif
static void scanscan_range (void) {
int r;
double texperiments[n_experiments];
u_int64_t k = 0;
char kv[8];
DBT key, val;
int counter;
for (counter = 0; counter < n_experiments; counter++) {
if (1) { //if ((counter&1) == 0) {
makekey:
// generate a random key in the key range
k = (start_range + (random() % (end_range - start_range))) * (1<<6);
int i;
for (i = 0; i < 8; i++)
kv[i] = k >> (56-8*i);
}
memset(&key, 0, sizeof key); key.data = kv; key.size = sizeof kv;
memset(&val, 0, sizeof val);
double tstart = gettime();
DBC *dbc;
r = db->cursor(db, tid, &dbc, 0); assert(r==0);
// set the cursor to the random key
r = dbc->c_get(dbc, &key, &val, DB_SET_RANGE | lock_flag);
if (r != 0) {
assert(r == DB_NOTFOUND);
printf("%s:%d %" PRIu64 "\n", __FUNCTION__, __LINE__, k);
goto makekey;
}
#ifdef TOKUDB
// do the range scan
long rowcounter = 0;
struct extra_count e = {0,0};
while (limitcount > 0 && rowcounter < limitcount) {
r = dbc->c_getf_next(dbc, prelockflag ? lock_flag : 0, counttotalbytes, &e);
if (r != 0)
break;
rowcounter++;
}
#endif
r = dbc->c_close(dbc);
assert(r==0);
texperiments[counter] = gettime() - tstart;
printf("%" PRIu64 " %f\n", k, texperiments[counter]); fflush(stdout);
}
// print the times
double tsum = 0.0, tmin = 0.0, tmax = 0.0;
for (counter = 0; counter < n_experiments; counter++) {
if (counter==0 || texperiments[counter] < tmin)
tmin = texperiments[counter];
if (counter==0 || texperiments[counter] > tmax)
tmax = texperiments[counter];
tsum += texperiments[counter];
}
printf("%f %f %f/%d = %f\n", tmin, tmax, tsum, n_experiments, tsum / n_experiments);
}
#ifdef TOKUDB
struct extra_verify {
long long totalbytes;
int rowcounter;
DBT k,v; // the k and v are gotten using the old cursor
};
static int
checkbytes (DBT const *key, DBT const *data, void *extrav) {
struct extra_verify *e=extrav;
e->totalbytes += key->size + data->size;
e->rowcounter++;
assert(e->k.size == key->size);
assert(e->v.size == data->size);
assert(memcmp(e->k.data, key->data, key->size)==0);
assert(memcmp(e->v.data, data->data, data->size)==0);
assert(e->k.data != key->data);
assert(e->v.data != data->data);
return 0;
}
static void scanscan_verify (void) {
int r;
int counter=0;
for (counter=0; counter<n_experiments; counter++) {
struct extra_verify v;
v.totalbytes=0;
v.rowcounter=0;
double prevtime = gettime();
DBC *dbc1, *dbc2;
r = db->cursor(db, tid, &dbc1, 0); assert(r==0);
r = db->cursor(db, tid, &dbc2, 0); assert(r==0);
memset(&v.k, 0, sizeof(v.k));
memset(&v.v, 0, sizeof(v.v));
u_int32_t f_flags = 0;
u_int32_t c_get_flags = DB_NEXT;
if (prelockflag && (counter || prelock)) {
f_flags |= lock_flag;
c_get_flags |= lock_flag;
}
while (1) {
int r1,r2;
r2 = dbc1->c_get(dbc1, &v.k, &v.v, c_get_flags);
r1 = dbc2->c_getf_next(dbc2, f_flags, checkbytes, &v);
assert(r1==r2);
if (r1) break;
}
r = dbc1->c_close(dbc1); assert(r==0);
r = dbc2->c_close(dbc2); assert(r==0);
double thistime = gettime();
double tdiff = thistime-prevtime;
printf("verify %lld bytes (%d rows) in %9.6fs at %9fMB/s\n", v.totalbytes, v.rowcounter, tdiff, 1e-6*v.totalbytes/tdiff);
}
}
#endif
int main (int argc, const char *argv[]) {
parse_args(argc,argv);
scanscan_setup();
switch (run_mode) {
case RUN_HWC: scanscan_hwc(); break;
#ifdef TOKUDB
case RUN_LWC: scanscan_lwc(); break;
case RUN_VERIFY: scanscan_verify(); break;
#endif
case RUN_RANGE: scanscan_range(); break;
default: assert(0); break;
}
scanscan_shutdown();
return 0;
}

View File

@ -1,379 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// measure the performance of a simulated "insert on duplicate key update" operation
// the table schema is t(a int, b int, c int, d int, primary key(a, b))
// a and b are random
// c is the sum of the observations
// d is the first observation
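// Illustrative example (not part of the original comment): with a key_range
// small enough to force collisions, the first observation for (a=1, b=2)
// inserts the row (a=1, b=2, c=1, d=0); a later observation for the same
// (a, b) leaves d alone and accumulates c, giving (a=1, b=2, c=2, d=0).
// In SQL terms this mimics INSERT ... ON DUPLICATE KEY UPDATE c = c + 1.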
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <arpa/inet.h>
#include "db.h"
static size_t key_size = 8;
static size_t val_size = 8;
static int verbose = 0;
static void db_error(const DB_ENV *env, const char *prefix, const char *msg) {
printf("%s: %p %s %s\n", __FUNCTION__, env, prefix, msg);
}
static int get_int(void *p) {
int v;
memcpy(&v, p, sizeof v);
return htonl(v);
}
#if defined(TOKUDB)
static int my_update_callback(DB *db, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra) {
assert(db);
assert(key);
if (old_val == NULL) {
// insert new_val = extra
set_val(extra, set_extra);
} else {
if (verbose) printf("u");
// update new_val = old_val + extra
assert(old_val->size == val_size && extra->size == val_size);
char new_val_buffer[val_size];
memcpy(new_val_buffer, old_val->data, sizeof new_val_buffer);
int newc = htonl(get_int(old_val->data) + get_int(extra->data)); // new c = old c + the c delta carried in extra
memcpy(new_val_buffer, &newc, sizeof newc);
DBT new_val = { .data = new_val_buffer, .size = sizeof new_val_buffer };
set_val(&new_val, set_extra);
}
return 0;
}
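// A minimal, unused sketch of what a single upsert looks like when driven
// directly through db->update() with my_update_callback installed on the
// environment. The helper name and the fixed 8-byte key/value sizes are
// assumptions for illustration only; insert_and_update() below is the path
// the benchmark actually exercises.
static void example_upsert_once(DB *db, DB_TXN *txn, int a, int b, int c_delta) __attribute__((unused));
static void example_upsert_once(DB *db, DB_TXN *txn, int a, int b, int c_delta) {
    char key_buffer[8], extra_buffer[8];
    int na = htonl(a), nb = htonl(b), nc = htonl(c_delta), nd = htonl(0);
    memcpy(key_buffer + 0, &na, sizeof na);   // key = a,b
    memcpy(key_buffer + 4, &nb, sizeof nb);
    memcpy(extra_buffer + 0, &nc, sizeof nc); // extra = delta for c, initial d
    memcpy(extra_buffer + 4, &nd, sizeof nd);
    DBT key = { .data = key_buffer, .size = sizeof key_buffer };
    DBT extra = { .data = extra_buffer, .size = sizeof extra_buffer };
    int r = db->update(db, txn, &key, &extra, 0); assert(r == 0); // insert or combine via the callback
}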
#endif
static void insert_and_update(DB *db, DB_TXN *txn, int a, int b, int c, int d, bool do_update_callback) {
#if !defined(TOKUDB)
assert(!do_update_callback);
#endif
int r;
// generate the key
assert(key_size >= 8);
char key_buffer[key_size];
int newa = htonl(a);
memcpy(key_buffer, &newa, sizeof newa);
int newb = htonl(b);
memcpy(key_buffer+4, &newb, sizeof newb);
// generate the value
assert(val_size >= 8);
char val_buffer[val_size];
int newc = htonl(c);
memcpy(val_buffer, &newc, sizeof newc);
int newd = htonl(d);
memcpy(val_buffer+4, &newd, sizeof newd);
#if defined(TOKUDB)
if (do_update_callback) {
// extra = value_buffer, implicit combine column c update function
DBT key = { .data = key_buffer, .size = sizeof key_buffer };
DBT extra = { .data = val_buffer, .size = sizeof val_buffer };
r = db->update(db, txn, &key, &extra, 0); assert(r == 0);
} else
#endif
{
DBT key = { .data = key_buffer, .size = sizeof key_buffer };
DBT value = { .data = val_buffer, .size = sizeof val_buffer };
DBT oldvalue = { 0 };
r = db->get(db, txn, &key, &oldvalue, 0);
assert(r == 0 || r == DB_NOTFOUND);
if (r == 0) {
// update it: accumulate c, but keep the stored d (the first observation), matching the update-callback path
if (verbose) printf("U");
int oldc = get_int(oldvalue.data);
newc = htonl(oldc + c); // new c = old c + this observation's c
memcpy(val_buffer, &newc, sizeof newc);
memcpy(val_buffer+4, (char *)oldvalue.data + 4, sizeof newd); // preserve the original d
r = db->put(db, txn, &key, &value, 0);
assert(r == 0);
} else if (r == DB_NOTFOUND) {
r = db->put(db, txn, &key, &value, 0);
assert(r == 0);
}
}
}
static inline float tdiff (struct timeval *a, struct timeval *b) {
return (a->tv_sec - b->tv_sec) +1e-6*(a->tv_usec - b->tv_usec);
}
static void insert_and_update_all(DB_ENV *db_env, DB *db, long nrows, long max_rows_per_txn, int key_range, long rows_per_report, bool do_update_callback, bool do_txn) {
int r;
struct timeval tstart;
r = gettimeofday(&tstart, NULL); assert(r == 0);
struct timeval tlast = tstart;
DB_TXN *txn = NULL;
if (do_txn) {
r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0);
}
long n_rows_per_txn = 0;
long rowi;
for (rowi = 0; rowi < nrows; rowi++) {
int a = random() % key_range;
int b = random() % key_range;
int c = 1;
int d = 0; // timestamp
insert_and_update(db, txn, a, b, c, d, do_update_callback);
n_rows_per_txn++;
// maybe commit
if (do_txn && n_rows_per_txn == max_rows_per_txn) {
r = txn->commit(txn, 0); assert(r == 0);
r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0);
n_rows_per_txn = 0;
}
// maybe report performance
if (((rowi + 1) % rows_per_report) == 0) {
struct timeval tnow;
r = gettimeofday(&tnow, NULL); assert(r == 0);
float last_time = tdiff(&tnow, &tlast);
float total_time = tdiff(&tnow, &tstart);
printf("%ld %.3f %.0f/s %.0f/s\n", rowi + 1, last_time, rows_per_report/last_time, rowi/total_time); fflush(stdout);
tlast = tnow;
}
}
if (do_txn) {
r = txn->commit(txn, 0); assert(r == 0);
}
struct timeval tnow;
r = gettimeofday(&tnow, NULL); assert(r == 0);
printf("total %ld %.3f %.0f/s\n", nrows, tdiff(&tnow, &tstart), nrows/tdiff(&tnow, &tstart)); fflush(stdout);
}
int main(int argc, char *argv[]) {
#if defined(TOKUDB)
char *db_env_dir = "update.env.tokudb";
#else
char *db_env_dir = "update.env.bdb";
#endif
int db_env_open_flags = DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG;
char *db_filename = "update.db";
long rows = 1000000000;
long rows_per_txn = 100;
long rows_per_report = 100000;
int key_range = 1000000;
#if defined(TOKUDB)
bool do_update_callback = true;
#else
bool do_update_callback = false;
#endif
bool do_txn = false;
u_int64_t cachesize = 1000000000;
u_int32_t pagesize = 0;
#if defined(TOKUDB)
u_int32_t checkpoint_period = 60;
#endif
int i;
for (i = 1; i < argc; i++) {
char *arg = argv[i];
if (strcmp(arg, "--verbose") == 0) {
verbose++;
continue;
}
if (strcmp(arg, "--rows") == 0 && i+1 < argc) {
rows = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--rows_per_txn") == 0 && i+1 < argc) {
rows_per_txn = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--rows_per_report") == 0 && i+1 < argc) {
rows_per_report = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--key_range") == 0 && i+1 < argc) {
key_range = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--txn") == 0 && i+1 < argc) {
do_txn = atoi(argv[++i]) != 0;
continue;
}
if (strcmp(arg, "--pagesize") == 0 && i+1 < argc) {
pagesize = atoi(argv[++i]);
continue;
}
if (strcmp(arg, "--cachesize") == 0 && i+1 < argc) {
cachesize = atol(argv[++i]);
continue;
}
if (strcmp(arg, "--update_callback") == 0 && i+1 < argc) {
do_update_callback = atoi(argv[++i]) != 0;
continue;
}
if (strcmp(arg, "--key_size") == 0 && i+1 < argc) {
key_size = atoi(argv[++i]);
continue;
}
if (strcmp(arg, "--val_size") == 0 && i+1 < argc) {
val_size = atoi(argv[++i]);
continue;
}
#if defined(TOKUDB)
if (strcmp(arg, "--checkpoint_period") == 0 && i+1 < argc) {
checkpoint_period = atoi(argv[++i]);
continue;
}
#endif
assert(0);
}
int r;
char rm_cmd[strlen(db_env_dir) + strlen("rm -rf ") + 1];
snprintf(rm_cmd, sizeof(rm_cmd), "rm -rf %s", db_env_dir);
r = system(rm_cmd); assert(r == 0);
r = mkdir(db_env_dir, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); assert(r == 0);
// create and open the env
DB_ENV *db_env = NULL;
r = db_env_create(&db_env, 0); assert(r == 0);
#if defined(TOKUDB)
db_env->set_update(db_env, my_update_callback);
#endif
if (cachesize) {
if (verbose) printf("cachesize %llu\n", (unsigned long long)cachesize);
const u_int64_t gig = 1 << 30;
r = db_env->set_cachesize(db_env, cachesize / gig, cachesize % gig, 1); assert(r == 0);
}
if (!do_txn)
db_env_open_flags &= ~(DB_INIT_TXN | DB_INIT_LOG);
db_env->set_errcall(db_env, db_error);
if (verbose) printf("env %s\n", db_env_dir);
r = db_env->open(db_env, db_env_dir, db_env_open_flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0);
#if defined(TOKUDB)
if (checkpoint_period) {
r = db_env->checkpointing_set_period(db_env, checkpoint_period); assert(r == 0);
u_int32_t period;
r = db_env->checkpointing_get_period(db_env, &period); assert(r == 0 && period == checkpoint_period);
}
#endif
// create the db
DB *db = NULL;
r = db_create(&db, db_env, 0); assert(r == 0);
DB_TXN *create_txn = NULL;
if (do_txn) {
r = db_env->txn_begin(db_env, NULL, &create_txn, 0); assert(r == 0);
}
if (pagesize) {
r = db->set_pagesize(db, pagesize); assert(r == 0);
}
r = db->open(db, create_txn, db_filename, NULL, DB_BTREE, DB_CREATE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0);
if (do_txn) {
r = create_txn->commit(create_txn, 0); assert(r == 0);
}
// insert on duplicate key update
insert_and_update_all(db_env, db, rows, rows_per_txn, key_range, rows_per_report, do_update_callback, do_txn);
// shutdown
r = db->close(db, 0); assert(r == 0); db = NULL;
r = db_env->close(db_env, 0); assert(r == 0); db_env = NULL;
return 0;
}

View File

@ -7,15 +7,17 @@ set_source_files_properties(
"${CMAKE_CURRENT_BINARY_DIR}/log_header.h"
PROPERTIES GENERATED TRUE)
add_executable(logformat logformat.cc)
add_executable(logformat logger/logformat.cc)
target_link_libraries(logformat ${LIBTOKUPORTABILITY}_static)
add_space_separated_property(TARGET logformat LINK_FLAGS --coverage)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/log_code.cc"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/log_print.cc"
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/log_header.h"
COMMAND $<TARGET_FILE:logformat> .
DEPENDS logformat
DEPENDS logger/logformat
)
add_custom_target(
generate_log_code
@@ -23,55 +25,52 @@ add_custom_target(
)
set(FT_SOURCES
background_job_manager
block_allocator
block_table
bndata
cachetable
checkpoint
compress
dbufio
fifo
cachetable/background_job_manager
cachetable/cachetable
cachetable/checkpoint
cursor
ft
ft-cachetable-wrappers
ft-flusher
ft-hot-flusher
ftloader
ftloader-callback
ft_msg
ft_node-serialize
ft-node-deserialize
ft-ops
ft-serialize
ft-test-helpers
ft-verify
key
loader/callbacks
loader/dbufio
loader/loader
loader/pqueue
leafentry
le-cursor
logcursor
logfilemgr
logger
log_upgrade
memarena
minicron
omt
pqueue
queue
quicklz
recover
rollback
rollback-apply
rollback-ct-callbacks
rollback_log_node_cache
roll
sub_block
txn
txn_child_manager
txn_manager
logger/logcursor
logger/logfilemgr
logger/logger
logger/log_upgrade
logger/recover
msg
msg_buffer
node
pivotkeys
serialize/block_allocator
serialize/block_allocator_strategy
serialize/block_table
serialize/compress
serialize/ft_node-serialize
serialize/ft-node-deserialize
serialize/ft-serialize
serialize/quicklz
serialize/sub_block
txn/rollback
txn/rollback-apply
txn/rollback-ct-callbacks
txn/rollback_log_node_cache
txn/roll
txn/txn
txn/txn_child_manager
txn/txn_manager
txn/xids
ule
x1764
xids
ybt
"${CMAKE_CURRENT_BINARY_DIR}/log_code"
"${CMAKE_CURRENT_BINARY_DIR}/log_print"
)
@@ -88,24 +87,7 @@ add_dependencies(ft_static install_tdb_h generate_log_code build_lzma)
## link with lzma (which should be static) and link dependers with zlib
target_link_libraries(ft LINK_PRIVATE util_static lzma ${LIBTOKUPORTABILITY})
target_link_libraries(ft LINK_PUBLIC ${ZLIB_LIBRARY} )
target_link_libraries(ft LINK_PUBLIC z)
target_link_libraries(ft_static LINK_PRIVATE lzma)
## build the bins in this directory
foreach(tool tokuftdump tdb_logprint tdb-recover ftverify)
add_executable(${tool} ${tool}.cc)
add_dependencies(${tool} install_tdb_h)
target_link_libraries(${tool} ft_static util_static ${ZLIB_LIBRARY} lzma ${LIBTOKUPORTABILITY}_static ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
add_space_separated_property(TARGET ${tool} COMPILE_FLAGS -fvisibility=hidden)
endforeach(tool)
# link in math.h library just for this tool.
target_link_libraries(ftverify m)
install(
TARGETS tokuftdump
COMPONENT Server
DESTINATION ${INSTALL_BINDIR}
)
add_subdirectory(tests)

View File

@@ -1,473 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2009-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#ident "$Id$"
#include "block_allocator.h"
#include <memory.h>
#include <toku_assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Here's a very simple implementation.
// It's not very fast at allocating or freeing.
// The previous implementation used next_fit; we now use first_fit, since we are moving blocks around to reduce file size.
struct block_allocator {
uint64_t reserve_at_beginning; // How much to reserve at the beginning
uint64_t alignment; // Block alignment
uint64_t n_blocks; // How many blocks
uint64_t blocks_array_size; // How big is the blocks_array. Must be >= n_blocks.
struct block_allocator_blockpair *blocks_array; // These blocks are sorted by address.
uint64_t n_bytes_in_use; // including the reserve_at_beginning
};
void
block_allocator_validate (BLOCK_ALLOCATOR ba) {
uint64_t i;
uint64_t n_bytes_in_use = ba->reserve_at_beginning;
for (i=0; i<ba->n_blocks; i++) {
n_bytes_in_use += ba->blocks_array[i].size;
if (i>0) {
assert(ba->blocks_array[i].offset > ba->blocks_array[i-1].offset);
assert(ba->blocks_array[i].offset >= ba->blocks_array[i-1].offset + ba->blocks_array[i-1].size );
}
}
assert(n_bytes_in_use == ba->n_bytes_in_use);
}
#if 0
#define VALIDATE(b) block_allocator_validate(b)
#else
#define VALIDATE(b) ((void)0)
#endif
#if 0
void
block_allocator_print (BLOCK_ALLOCATOR ba) {
uint64_t i;
for (i=0; i<ba->n_blocks; i++) {
printf("%" PRId64 ":%" PRId64 " ", ba->blocks_array[i].offset, ba->blocks_array[i].size);
}
printf("\n");
VALIDATE(ba);
}
#endif
void
create_block_allocator (BLOCK_ALLOCATOR *ba, uint64_t reserve_at_beginning, uint64_t alignment) {
assert(alignment>=512 && 0==(alignment%512)); // the alignment must be at least 512 and aligned with 512 to make DIRECT_IO happy.
BLOCK_ALLOCATOR XMALLOC(result);
result->reserve_at_beginning = reserve_at_beginning;
result->alignment = alignment;
result->n_blocks = 0;
result->blocks_array_size = 1;
XMALLOC_N(result->blocks_array_size, result->blocks_array);
result->n_bytes_in_use = reserve_at_beginning;
*ba = result;
VALIDATE(result);
}
void
destroy_block_allocator (BLOCK_ALLOCATOR *bap) {
BLOCK_ALLOCATOR ba = *bap;
*bap = 0;
toku_free(ba->blocks_array);
toku_free(ba);
}
static void
grow_blocks_array_by (BLOCK_ALLOCATOR ba, uint64_t n_to_add) {
if (ba->n_blocks + n_to_add > ba->blocks_array_size) {
uint64_t new_size = ba->n_blocks + n_to_add;
uint64_t at_least = ba->blocks_array_size * 2;
if (at_least > new_size) {
new_size = at_least;
}
ba->blocks_array_size = new_size;
XREALLOC_N(ba->blocks_array_size, ba->blocks_array);
}
}
static void
grow_blocks_array (BLOCK_ALLOCATOR ba) {
grow_blocks_array_by(ba, 1);
}
void
block_allocator_merge_blockpairs_into (uint64_t d, struct block_allocator_blockpair dst[/*d*/],
uint64_t s, const struct block_allocator_blockpair src[/*s*/])
{
uint64_t tail = d+s;
while (d>0 && s>0) {
struct block_allocator_blockpair *dp = &dst[d-1];
struct block_allocator_blockpair const *sp = &src[s-1];
struct block_allocator_blockpair *tp = &dst[tail-1];
assert(tail>0);
if (dp->offset > sp->offset) {
*tp = *dp;
d--;
tail--;
} else {
*tp = *sp;
s--;
tail--;
}
}
while (d>0) {
struct block_allocator_blockpair *dp = &dst[d-1];
struct block_allocator_blockpair *tp = &dst[tail-1];
*tp = *dp;
d--;
tail--;
}
while (s>0) {
struct block_allocator_blockpair const *sp = &src[s-1];
struct block_allocator_blockpair *tp = &dst[tail-1];
*tp = *sp;
s--;
tail--;
}
}
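// Editor's illustrative example, not part of the original file: how the
// backwards, in-place merge above behaves on concrete data. dst must already
// have room for d+s pairs; walking from the tail guarantees that no dst
// element is overwritten before it has been copied to its final slot.
static void merge_blockpairs_example (void) {
    struct block_allocator_blockpair dst[4] = { {10, 1}, {40, 1} };  // first d=2 entries in use, capacity d+s=4
    struct block_allocator_blockpair src[2] = { {20, 1}, {30, 1} };  // s=2 entries, sorted by offset
    block_allocator_merge_blockpairs_into(2, dst, 2, src);
    // dst now holds offsets 10, 20, 30, 40 in sorted order.
}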
static int
compare_blockpairs (const void *av, const void *bv) {
const struct block_allocator_blockpair *a = (const struct block_allocator_blockpair *) av;
const struct block_allocator_blockpair *b = (const struct block_allocator_blockpair *) bv;
if (a->offset < b->offset) return -1;
if (a->offset > b->offset) return +1;
return 0;
}
void
block_allocator_alloc_blocks_at (BLOCK_ALLOCATOR ba, uint64_t n_blocks, struct block_allocator_blockpair pairs[/*n_blocks*/])
// See the documentation in block_allocator.h
{
VALIDATE(ba);
qsort(pairs, n_blocks, sizeof(*pairs), compare_blockpairs);
for (uint64_t i=0; i<n_blocks; i++) {
assert(pairs[i].offset >= ba->reserve_at_beginning);
assert(pairs[i].offset%ba->alignment == 0);
ba->n_bytes_in_use += pairs[i].size;
invariant(pairs[i].size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block.
}
grow_blocks_array_by(ba, n_blocks);
block_allocator_merge_blockpairs_into(ba->n_blocks, ba->blocks_array,
n_blocks, pairs);
ba->n_blocks += n_blocks;
VALIDATE(ba);
}
void
block_allocator_alloc_block_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset) {
struct block_allocator_blockpair p = {.offset = offset, .size=size};
// Just do a linear search for the block.
// This data structure is a sorted array (no gaps or anything), so the search isn't really making this any slower than the insertion.
// To speed up the insertion when opening a file, we provide the block_allocator_alloc_blocks_at function.
block_allocator_alloc_blocks_at(ba, 1, &p);
}
static inline uint64_t
align (uint64_t value, BLOCK_ALLOCATOR ba)
// Effect: align a value by rounding up.
{
return ((value+ba->alignment-1)/ba->alignment)*ba->alignment;
}
void block_allocator_alloc_block(BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset)
// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
{
invariant(size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block.
grow_blocks_array(ba);
ba->n_bytes_in_use += size;
if (ba->n_blocks==0) {
assert(ba->n_bytes_in_use == ba->reserve_at_beginning + size); // we know exactly how many are in use
ba->blocks_array[0].offset = align(ba->reserve_at_beginning, ba);
ba->blocks_array[0].size = size;
*offset = ba->blocks_array[0].offset;
ba->n_blocks++;
return;
}
// Implement first fit.
{
uint64_t end_of_reserve = align(ba->reserve_at_beginning, ba);
if (end_of_reserve + size <= ba->blocks_array[0].offset ) {
// Check to see if the space immediately after the reserve is big enough to hold the new block.
struct block_allocator_blockpair *bp = &ba->blocks_array[0];
memmove(bp+1, bp, (ba->n_blocks)*sizeof(*bp));
bp[0].offset = end_of_reserve;
bp[0].size = size;
ba->n_blocks++;
*offset = end_of_reserve;
VALIDATE(ba);
return;
}
}
for (uint64_t blocknum = 0; blocknum +1 < ba->n_blocks; blocknum ++) {
// Consider the space after blocknum
struct block_allocator_blockpair *bp = &ba->blocks_array[blocknum];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t answer_offset = align(this_offset + this_size, ba);
if (answer_offset + size > bp[1].offset) continue; // The block we want doesn't fit after this block.
// It fits, so allocate it here.
memmove(bp+2, bp+1, (ba->n_blocks - blocknum -1)*sizeof(*bp));
bp[1].offset = answer_offset;
bp[1].size = size;
ba->n_blocks++;
*offset = answer_offset;
VALIDATE(ba);
return;
}
// It didn't fit anywhere, so fit it on the end.
assert(ba->n_blocks < ba->blocks_array_size);
struct block_allocator_blockpair *bp = &ba->blocks_array[ba->n_blocks];
uint64_t answer_offset = align(bp[-1].offset+bp[-1].size, ba);
bp->offset = answer_offset;
bp->size = size;
ba->n_blocks++;
*offset = answer_offset;
VALIDATE(ba);
}
static int64_t
find_block (BLOCK_ALLOCATOR ba, uint64_t offset)
// Find the index in the blocks array that has a particular offset. Requires that the block exist.
// Use binary search so it runs fast.
{
VALIDATE(ba);
if (ba->n_blocks==1) {
assert(ba->blocks_array[0].offset == offset);
return 0;
}
uint64_t lo = 0;
uint64_t hi = ba->n_blocks;
while (1) {
assert(lo<hi); // otherwise no such block exists.
uint64_t mid = (lo+hi)/2;
uint64_t thisoff = ba->blocks_array[mid].offset;
//printf("lo=%" PRId64 " hi=%" PRId64 " mid=%" PRId64 " thisoff=%" PRId64 " offset=%" PRId64 "\n", lo, hi, mid, thisoff, offset);
if (thisoff < offset) {
lo = mid+1;
} else if (thisoff > offset) {
hi = mid;
} else {
return mid;
}
}
}
// To support 0-sized blocks, we need to include size as an input to this function.
// All 0-sized blocks at the same offset can be considered identical, but
// a 0-sized block can share offset with a non-zero sized block.
// The non-zero sized block is not exchangeable with a zero sized block (or vice versa),
// so inserting 0-sized blocks can cause corruption here.
void
block_allocator_free_block (BLOCK_ALLOCATOR ba, uint64_t offset) {
VALIDATE(ba);
int64_t bn = find_block(ba, offset);
assert(bn>=0); // we require that there is a block with that offset. Might as well abort if no such block exists.
ba->n_bytes_in_use -= ba->blocks_array[bn].size;
memmove(&ba->blocks_array[bn], &ba->blocks_array[bn+1], (ba->n_blocks-bn-1) * sizeof(struct block_allocator_blockpair));
ba->n_blocks--;
VALIDATE(ba);
}
uint64_t
block_allocator_block_size (BLOCK_ALLOCATOR ba, uint64_t offset) {
int64_t bn = find_block(ba, offset);
assert(bn>=0); // we require that there is a block with that offset. Might as well abort if no such block exists.
return ba->blocks_array[bn].size;
}
uint64_t
block_allocator_allocated_limit (BLOCK_ALLOCATOR ba) {
if (ba->n_blocks==0) return ba->reserve_at_beginning;
else {
struct block_allocator_blockpair *last = &ba->blocks_array[ba->n_blocks-1];
return last->offset + last->size;
}
}
int
block_allocator_get_nth_block_in_layout_order (BLOCK_ALLOCATOR ba, uint64_t b, uint64_t *offset, uint64_t *size)
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number; return nonzero if b is too big.
{
if (b==0) {
*offset=0;
*size =ba->reserve_at_beginning;
return 0;
} else if (b > ba->n_blocks) {
return -1;
} else {
*offset=ba->blocks_array[b-1].offset;
*size =ba->blocks_array[b-1].size;
return 0;
}
}
void
block_allocator_get_unused_statistics(BLOCK_ALLOCATOR ba, TOKU_DB_FRAGMENTATION report) {
//Requires: report->file_size_bytes is filled in
//Requires: report->data_bytes is filled in
//Requires: report->checkpoint_bytes_additional is filled in
assert(ba->n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
report->unused_bytes = 0;
report->unused_blocks = 0;
report->largest_unused_block = 0;
if (ba->n_blocks > 0) {
//Deal with space before block 0 and after reserve:
{
struct block_allocator_blockpair *bp = &ba->blocks_array[0];
assert(bp->offset >= align(ba->reserve_at_beginning, ba));
uint64_t free_space = bp->offset - align(ba->reserve_at_beginning, ba);
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space between blocks:
for (uint64_t blocknum = 0; blocknum +1 < ba->n_blocks; blocknum ++) {
// Consider the space after blocknum
struct block_allocator_blockpair *bp = &ba->blocks_array[blocknum];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, ba);
uint64_t next_offset = bp[1].offset;
uint64_t free_space = next_offset - end_of_this_block;
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space after last block
{
struct block_allocator_blockpair *bp = &ba->blocks_array[ba->n_blocks-1];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, ba);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
}
else {
//No blocks. Just the reserve.
uint64_t end_of_this_block = align(ba->reserve_at_beginning, ba);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
}
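The file removed above keeps its blocks in an array sorted by offset and places each new block with a first-fit scan over the gaps (block_allocator_alloc_block). The following is a minimal, self-contained sketch of that strategy; it is not the real allocator and not the new serialize/block_allocator either. The SimpleFirstFit name and the std::vector storage are simplifications that drop the mempool, XMALLOC, and validation machinery.

#include <cassert>
#include <cstdint>
#include <vector>

struct SimpleFirstFit {
    struct Pair { uint64_t offset, size; };
    uint64_t reserve, alignment;
    std::vector<Pair> blocks;  // sorted by offset, non-overlapping

    SimpleFirstFit(uint64_t r, uint64_t a) : reserve(r), alignment(a) {}
    uint64_t align_up(uint64_t v) const { return ((v + alignment - 1) / alignment) * alignment; }

    // First fit: take the first gap (after the reserve or between two blocks) that is big enough.
    uint64_t alloc(uint64_t size) {
        uint64_t candidate = align_up(reserve);
        for (size_t i = 0; i < blocks.size(); i++) {
            if (candidate + size <= blocks[i].offset) {
                blocks.insert(blocks.begin() + i, Pair{candidate, size});
                return candidate;
            }
            candidate = align_up(blocks[i].offset + blocks[i].size);
        }
        blocks.push_back(Pair{candidate, size});  // no gap fit; append past the last block
        return candidate;
    }

    void free_at(uint64_t offset) {
        for (size_t i = 0; i < blocks.size(); i++) {
            if (blocks[i].offset == offset) { blocks.erase(blocks.begin() + i); return; }
        }
        assert(false);  // as in the file above, freeing a block that was never allocated is a bug
    }
};

With reserve 4096 and alignment 512, three alloc(100) calls return 4096, 4608, and 5120; after free_at(4608), the next alloc(100) reuses offset 4608, which is the reuse behavior the comment at the top of the removed file gives as the reason for switching from next-fit to first-fit.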

View File

@@ -1,230 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef BLOCK_ALLOCATOR_H
#define BLOCK_ALLOCATOR_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fttypes.h"
#define BLOCK_ALLOCATOR_ALIGNMENT 4096
// How much must be reserved at the beginning for the block?
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
// So 4096 should be enough.
#define BLOCK_ALLOCATOR_HEADER_RESERVE 4096
#if (BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT) != 0
#error
#endif
// Block allocator.
// Overview: A block allocator manages the allocation of variable-sized blocks.
// The translation of block numbers to addresses is handled elsewhere.
// The allocation of block numbers is handled elsewhere.
// We can create a block allocator.
// When creating a block allocator we also specify a certain-sized
// block at the beginning that is preallocated (and cannot be allocated
// or freed)
// We can allocate blocks of a particular size at a particular location.
// We can allocate blocks of a particular size at a location chosen by the allocator.
// We can free blocks.
// We can determine the size of a block.
#define BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE (2*BLOCK_ALLOCATOR_HEADER_RESERVE)
typedef struct block_allocator *BLOCK_ALLOCATOR;
void create_block_allocator (BLOCK_ALLOCATOR * ba, uint64_t reserve_at_beginning, uint64_t alignment);
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// All blocks start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// ba (OUT): Result stored here.
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// alignment (IN) Block alignment.
void destroy_block_allocator (BLOCK_ALLOCATOR *ba);
// Effect: Destroy a block allocator at *ba.
// Also, set *ba=NULL.
// Rationale: If there was only one copy of the pointer, this kills that copy too.
// Parameters:
// ba (IN/OUT):
void block_allocator_alloc_block_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset);
// Effect: Allocate a block of the specified size at a particular offset.
// Aborts if anything goes wrong.
// The performance of this function may be as bad as Theta(N), where N is the number of blocks currently in use.
// Usage note: To allocate several blocks (e.g., when opening a BRT), use block_allocator_alloc_blocks_at().
// Requires: The resulting block may not overlap any other allocated block.
// And the offset must be a multiple of the block alignment.
// Parameters:
// ba (IN/OUT): The block allocator. (Modifies ba.)
// size (IN): The size of the block.
// offset (IN): The location of the block.
struct block_allocator_blockpair {
uint64_t offset;
uint64_t size;
};
void block_allocator_alloc_blocks_at (BLOCK_ALLOCATOR ba, uint64_t n_blocks, struct block_allocator_blockpair *pairs);
// Effect: Take pairs in any order, and add them all, as if we did block_allocator_alloc_block() on each pair.
// This should run in time O(N + M log M) where N is the number of blocks in ba, and M is the number of new blocks.
// Modifies: pairs (sorts them).
void block_allocator_alloc_block (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset);
// Effect: Allocate a block of the specified size at an address chosen by the allocator.
// Aborts if anything goes wrong.
// The block address will be a multiple of the alignment.
// Parameters:
// ba (IN/OUT): The block allocator. (Modifies ba.)
// size (IN): The size of the block. (The size does not have to be aligned.)
// offset (OUT): The location of the block.
void block_allocator_free_block (BLOCK_ALLOCATOR ba, uint64_t offset);
// Effect: Free the block at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// ba (IN/OUT): The block allocator. (Modifies ba.)
// offset (IN): The offset of the block.
uint64_t block_allocator_block_size (BLOCK_ALLOCATOR ba, uint64_t offset);
// Effect: Return the size of the block that starts at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// ba (IN/OUT): The block allocator. (Modifies ba.)
// offset (IN): The offset of the block.
void block_allocator_validate (BLOCK_ALLOCATOR ba);
// Effect: Check to see if the block allocator is OK. This may take a long time.
// Usage Hints: Probably only use this for unit tests.
void block_allocator_print (BLOCK_ALLOCATOR ba);
// Effect: Print information about the block allocator.
// Rationale: This is probably useful only for debugging.
uint64_t block_allocator_allocated_limit (BLOCK_ALLOCATOR ba);
// Effect: Return the unallocated block address of "infinite" size.
// That is, return the smallest address that is above all the allocated blocks.
// Rationale: When writing the root FIFO we don't know how big the block is.
// So we start at the "infinite" block, write the fifo, and then
// allocate_block_at of the correct size and offset to account for the root FIFO.
int block_allocator_get_nth_block_in_layout_order (BLOCK_ALLOCATOR ba, uint64_t b, uint64_t *offset, uint64_t *size);
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number; return nonzero if b is too big.
// Rationale: This is probably useful only for tests.
void block_allocator_get_unused_statistics(BLOCK_ALLOCATOR ba, TOKU_DB_FRAGMENTATION report);
// Effect: Fill in report to indicate how the file is used.
// Requires:
// report->file_size_bytes is filled in
// report->data_bytes is filled in
// report->checkpoint_bytes_additional is filled in
void block_allocator_merge_blockpairs_into (uint64_t d, struct block_allocator_blockpair dst[/*d*/],
uint64_t s, const struct block_allocator_blockpair src[/*s*/]);
// Effect: Merge dst[d] and src[s] into dst[d+s], merging in place.
// Initially dst and src hold sorted arrays (sorted by increasing offset).
// Finally dst contains all d+s elements sorted in order.
// Requires:
// dst and src are sorted.
// dst must be large enough.
// No blocks may overlap.
// Rationale: This is exposed so it can be tested by a glass box tester. Otherwise it would be a static (file-scope) function inside block_allocator.cc
#endif
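Because the header above is removed wholesale in this commit, a short usage sketch of its C-style interface may be useful when comparing it with the replacement under serialize/. Everything called below is declared above with these signatures; only the wrapper function and the printf are illustrative, and the sketch targets the old API, not the new one.

#include <cassert>
#include <cinttypes>
#include <cstdio>
#include "block_allocator.h"  // the (old) header shown above

static void block_allocator_usage_sketch(void) {
    BLOCK_ALLOCATOR ba;
    // Reserve the header area; every block will start on a 4096-byte boundary.
    create_block_allocator(&ba, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, BLOCK_ALLOCATOR_ALIGNMENT);

    uint64_t off_a, off_b;
    block_allocator_alloc_block(ba, 1000, &off_a);  // allocator chooses the offsets
    block_allocator_alloc_block(ba, 2000, &off_b);
    assert(block_allocator_block_size(ba, off_a) == 1000);

    block_allocator_free_block(ba, off_a);          // the hole before off_b becomes reusable
    printf("allocated limit: %" PRIu64 "\n", block_allocator_allocated_limit(ba));

    destroy_block_allocator(&ba);                   // also sets ba back to NULL
}

The #error check above ties the two constants together: the reserved region stays a multiple of the alignment, so the first allocatable offset is already aligned.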

File diff suppressed because it is too large

View File

@@ -1,176 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef BLOCKTABLE_H
#define BLOCKTABLE_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fttypes.h"
typedef struct block_table *BLOCK_TABLE;
//Needed by tests, ftdump
struct block_translation_pair {
union { // If in the freelist, use next_free_blocknum, otherwise diskoff.
DISKOFF diskoff;
BLOCKNUM next_free_blocknum;
} u;
DISKOFF size; // set to 0xFFFFFFFFFFFFFFFF for free
};
void toku_blocktable_create_new(BLOCK_TABLE *btp);
int toku_blocktable_create_from_buffer(int fd, BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
void toku_blocktable_destroy(BLOCK_TABLE *btp);
void toku_ft_lock(FT h);
void toku_ft_unlock(FT h);
void toku_block_translation_note_start_checkpoint_unlocked(BLOCK_TABLE bt);
void toku_block_translation_note_end_checkpoint(BLOCK_TABLE bt, int fd);
void toku_block_translation_note_skipped_checkpoint(BLOCK_TABLE bt);
void toku_maybe_truncate_file_on_open(BLOCK_TABLE bt, int fd);
//Blocknums
void toku_allocate_blocknum(BLOCK_TABLE bt, BLOCKNUM *res, FT h);
void toku_allocate_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *res, FT h);
void toku_free_blocknum(BLOCK_TABLE bt, BLOCKNUM *b, FT h, bool for_checkpoint);
void toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b);
void toku_block_verify_no_data_blocks_except_root(BLOCK_TABLE bt, BLOCKNUM root);
void toku_free_unused_blocknums(BLOCK_TABLE bt, BLOCKNUM root);
void toku_block_verify_no_free_blocknums(BLOCK_TABLE bt);
void toku_realloc_descriptor_on_disk(BLOCK_TABLE bt, DISKOFF size, DISKOFF *offset, FT h, int fd);
void toku_realloc_descriptor_on_disk_unlocked(BLOCK_TABLE bt, DISKOFF size, DISKOFF *offset, FT h);
void toku_get_descriptor_offset_size(BLOCK_TABLE bt, DISKOFF *offset, DISKOFF *size);
//Blocks and Blocknums
void toku_blocknum_realloc_on_disk(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, int fd, bool for_checkpoint);
void toku_translate_blocknum_to_offset_size(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
//Serialization
void toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w, int64_t *address, int64_t *size);
void toku_block_table_swap_for_redirect(BLOCK_TABLE old_bt, BLOCK_TABLE new_bt);
//DEBUG ONLY (ftdump included), tests included
void toku_blocknum_dump_translation(BLOCK_TABLE bt, BLOCKNUM b);
void toku_dump_translation_table_pretty(FILE *f, BLOCK_TABLE bt);
void toku_dump_translation_table(FILE *f, BLOCK_TABLE bt);
void toku_block_free(BLOCK_TABLE bt, uint64_t offset);
typedef int(*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra);
enum translation_type {TRANSLATION_NONE=0,
TRANSLATION_CURRENT,
TRANSLATION_INPROGRESS,
TRANSLATION_CHECKPOINTED,
TRANSLATION_DEBUG};
int toku_blocktable_iterate(BLOCK_TABLE bt, enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only);
void toku_blocktable_internal_fragmentation(BLOCK_TABLE bt, int64_t *total_sizep, int64_t *used_sizep);
void toku_block_table_get_fragmentation_unlocked(BLOCK_TABLE bt, TOKU_DB_FRAGMENTATION report);
//Requires: blocktable lock is held.
//Requires: report->file_size_bytes is already filled in.
int64_t toku_block_get_blocks_in_use_unlocked(BLOCK_TABLE bt);
void toku_blocktable_get_info64(BLOCK_TABLE, struct ftinfo64 *);
int toku_blocktable_iterate_translation_tables(BLOCK_TABLE, uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *);
//Unmovable reserved first, then reallocable.
// We reserve one blocknum for the translation table itself.
enum {RESERVED_BLOCKNUM_NULL =0,
RESERVED_BLOCKNUM_TRANSLATION=1,
RESERVED_BLOCKNUM_DESCRIPTOR =2,
RESERVED_BLOCKNUMS};
#endif
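The implementation behind this header is not shown inline here, so the following is only a sketch of the freelist idea that block_translation_pair documents: a free entry reuses the union to point at the next free blocknum and marks its size with a sentinel, while an in-use entry stores its disk offset. The simplified typedefs, the sentinel value, and the translation_table container below are assumptions made for illustration; they are not the real fttypes.h definitions or the block table code.

#include <cstdint>
#include <vector>

typedef int64_t blocknum_t;                      // stand-ins for BLOCKNUM / DISKOFF
typedef int64_t diskoff_t;
static const diskoff_t SIZE_IS_FREE = -1;        // plays the role of 0xFFFFFFFFFFFFFFFF above
static const blocknum_t FREELIST_NULL = -1;

struct translation_pair {
    union { diskoff_t diskoff; blocknum_t next_free_blocknum; } u;
    diskoff_t size;                              // SIZE_IS_FREE marks a free entry
};

struct translation_table {
    std::vector<translation_pair> pairs;
    blocknum_t free_head = FREELIST_NULL;

    blocknum_t allocate_blocknum(void) {
        if (free_head != FREELIST_NULL) {        // reuse a previously freed blocknum
            blocknum_t b = free_head;
            free_head = pairs[b].u.next_free_blocknum;
            pairs[b].u.diskoff = 0;
            pairs[b].size = 0;
            return b;
        }
        translation_pair fresh = {};             // otherwise grow the table
        pairs.push_back(fresh);
        return (blocknum_t)(pairs.size() - 1);
    }

    void free_blocknum(blocknum_t b) {           // push onto the freelist
        pairs[b].size = SIZE_IS_FREE;
        pairs[b].u.next_free_blocknum = free_head;
        free_head = b;
    }
};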

View File

@@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@@ -89,47 +89,198 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <bndata.h>
#include <ft/bndata.h>
#include <ft/ft-internal.h>
static uint32_t klpair_size(KLPAIR klpair){
return sizeof(*klpair) + klpair->keylen + leafentry_memsize(get_le_from_klpair(klpair));
}
static uint32_t klpair_disksize(KLPAIR klpair){
return sizeof(*klpair) + klpair->keylen + leafentry_disksize(get_le_from_klpair(klpair));
using namespace toku;
uint32_t bn_data::klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const {
return sizeof(*klpair) + keylen_from_klpair_len(klpair_len) + leafentry_disksize(get_le_from_klpair(klpair));
}
void bn_data::init_zero() {
toku_mempool_zero(&m_buffer_mempool);
m_disksize_of_keys = 0;
}
void bn_data::initialize_empty() {
toku_mempool_zero(&m_buffer_mempool);
m_buffer.create_no_array();
init_zero();
m_buffer.create();
}
void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size) {
void bn_data::add_key(uint32_t keylen) {
m_disksize_of_keys += sizeof(keylen) + keylen;
}
void bn_data::add_keys(uint32_t n_keys, uint32_t combined_klpair_len) {
invariant(n_keys * sizeof(uint32_t) <= combined_klpair_len);
m_disksize_of_keys += combined_klpair_len;
}
void bn_data::remove_key(uint32_t keylen) {
m_disksize_of_keys -= sizeof(keylen) + keylen;
}
// Deserialize from format optimized for keys being inlined.
// Currently only supports fixed-length keys.
void bn_data::initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version UU(),
uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
uint32_t fixed_klpair_length) {
paranoid_invariant(version >= FT_LAYOUT_VERSION_26); // Support was added @26
uint32_t ndone_before = rb->ndone;
init_zero();
invariant(all_keys_same_length); // Until otherwise supported.
const void *keys_src;
rbuf_literal_bytes(rb, &keys_src, key_data_size);
//Generate dmt
this->m_buffer.create_from_sorted_memory_of_fixed_size_elements(
keys_src, num_entries, key_data_size, fixed_klpair_length);
toku_mempool_construct(&this->m_buffer_mempool, val_data_size);
const void *vals_src;
rbuf_literal_bytes(rb, &vals_src, val_data_size);
if (num_entries > 0) {
void *vals_dest = toku_mempool_malloc(&this->m_buffer_mempool, val_data_size);
paranoid_invariant_notnull(vals_dest);
memcpy(vals_dest, vals_src, val_data_size);
}
add_keys(num_entries, num_entries * fixed_klpair_length);
toku_note_deserialized_basement_node(all_keys_same_length);
invariant(rb->ndone - ndone_before == data_size);
}
static int
wbufwriteleafentry(const void* key, const uint32_t keylen, const LEAFENTRY &le, const uint32_t UU(idx), struct wbuf * const wb) {
// need to pack the leafentry as it was in versions
// where the key was integrated into it (< 26)
uint32_t begin_spot UU() = wb->ndone;
uint32_t le_disk_size = leafentry_disksize(le);
wbuf_nocrc_uint8_t(wb, le->type);
wbuf_nocrc_uint32_t(wb, keylen);
if (le->type == LE_CLEAN) {
wbuf_nocrc_uint32_t(wb, le->u.clean.vallen);
wbuf_nocrc_literal_bytes(wb, key, keylen);
wbuf_nocrc_literal_bytes(wb, le->u.clean.val, le->u.clean.vallen);
}
else {
paranoid_invariant(le->type == LE_MVCC);
wbuf_nocrc_uint32_t(wb, le->u.mvcc.num_cxrs);
wbuf_nocrc_uint8_t(wb, le->u.mvcc.num_pxrs);
wbuf_nocrc_literal_bytes(wb, key, keylen);
wbuf_nocrc_literal_bytes(wb, le->u.mvcc.xrs, le_disk_size - (1 + 4 + 1));
}
uint32_t end_spot UU() = wb->ndone;
paranoid_invariant((end_spot - begin_spot) == keylen + sizeof(keylen) + le_disk_size);
return 0;
}
void bn_data::serialize_to_wbuf(struct wbuf *const wb) {
prepare_to_serialize();
serialize_header(wb);
if (m_buffer.value_length_is_fixed()) {
serialize_rest(wb);
} else {
//
// iterate over leafentries and place them into the buffer
//
iterate<struct wbuf, wbufwriteleafentry>(wb);
}
}
// If we have fixed-length keys, we prepare the dmt and mempool.
// The mempool is prepared by removing any fragmented space and ordering leafentries in the same order as their keys.
void bn_data::prepare_to_serialize(void) {
if (m_buffer.value_length_is_fixed()) {
m_buffer.prepare_for_serialize();
dmt_compress_kvspace(0, nullptr, true); // Gets it ready for easy serialization.
}
}
void bn_data::serialize_header(struct wbuf *wb) const {
bool fixed = m_buffer.value_length_is_fixed();
//key_data_size
wbuf_nocrc_uint(wb, m_disksize_of_keys);
//val_data_size
wbuf_nocrc_uint(wb, toku_mempool_get_used_size(&m_buffer_mempool));
//fixed_klpair_length
wbuf_nocrc_uint(wb, m_buffer.get_fixed_length());
// all_keys_same_length
wbuf_nocrc_uint8_t(wb, fixed);
// keys_vals_separate
wbuf_nocrc_uint8_t(wb, fixed);
}
void bn_data::serialize_rest(struct wbuf *wb) const {
//Write keys
invariant(m_buffer.value_length_is_fixed()); //Assumes prepare_to_serialize was called
m_buffer.serialize_values(m_disksize_of_keys, wb);
//Write leafentries
//Just ran dmt_compress_kvspace so there is no fragmentation and also leafentries are in sorted order.
paranoid_invariant(toku_mempool_get_frag_size(&m_buffer_mempool) == 0);
uint32_t val_data_size = toku_mempool_get_used_size(&m_buffer_mempool);
wbuf_nocrc_literal_bytes(wb, toku_mempool_get_base(&m_buffer_mempool), val_data_size);
}
// Deserialize from rbuf
void bn_data::deserialize_from_rbuf(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version) {
uint32_t key_data_size = data_size; // overallocate if < version 26 (best guess that is guaranteed not too small)
uint32_t val_data_size = data_size; // overallocate if < version 26 (best guess that is guaranteed not too small)
bool all_keys_same_length = false;
bool keys_vals_separate = false;
uint32_t fixed_klpair_length = 0;
// In version 25 and older there is no header. Skip reading header for old version.
if (version >= FT_LAYOUT_VERSION_26) {
uint32_t ndone_before = rb->ndone;
key_data_size = rbuf_int(rb);
val_data_size = rbuf_int(rb);
fixed_klpair_length = rbuf_int(rb); // 0 if !all_keys_same_length
all_keys_same_length = rbuf_char(rb);
keys_vals_separate = rbuf_char(rb);
invariant(all_keys_same_length == keys_vals_separate); // Until we support otherwise
uint32_t header_size = rb->ndone - ndone_before;
data_size -= header_size;
invariant(header_size == HEADER_LENGTH);
if (keys_vals_separate) {
invariant(fixed_klpair_length >= sizeof(klpair_struct) || num_entries == 0);
initialize_from_separate_keys_and_vals(num_entries, rb, data_size, version,
key_data_size, val_data_size, all_keys_same_length,
fixed_klpair_length);
return;
}
}
// Version >= 26 and version 25 deserialization are now identical except that <= 25 might allocate too much memory.
const void *bytes;
rbuf_literal_bytes(rb, &bytes, data_size);
const unsigned char *CAST_FROM_VOIDP(buf, bytes);
if (data_size == 0) {
invariant_zero(num_entries);
}
KLPAIR *XMALLOC_N(num_entries, array); // create array of pointers to leafentries
unsigned char *newmem = NULL;
// add same wiggle room that toku_mempool_construct would, 25% extra
uint32_t allocated_bytes = data_size + data_size/4;
CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes));
unsigned char* curr_src_pos = buf;
init_zero();
klpair_dmt_t::builder dmt_builder;
dmt_builder.create(num_entries, key_data_size);
// TODO(leif): clean this up (#149)
unsigned char *newmem = nullptr;
// add 25% extra wiggle room
uint32_t allocated_bytes_vals = val_data_size + (val_data_size / 4);
CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes_vals));
const unsigned char* curr_src_pos = buf;
unsigned char* curr_dest_pos = newmem;
for (uint32_t i = 0; i < num_entries; i++) {
KLPAIR curr_kl = (KLPAIR)curr_dest_pos;
array[i] = curr_kl;
uint8_t curr_type = curr_src_pos[0];
curr_src_pos++;
// first thing we do is lay out the key,
// to do so, we must extract it from the leafentry
// and write it in
uint32_t keylen = 0;
void* keyp = NULL;
const void* keyp = nullptr;
keylen = *(uint32_t *)curr_src_pos;
curr_src_pos += sizeof(uint32_t);
uint32_t clean_vallen = 0;
@@ -150,12 +301,10 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
keyp = curr_src_pos;
curr_src_pos += keylen;
}
// now that we have the keylen and the key, we can copy it
// into the destination
*(uint32_t *)curr_dest_pos = keylen;
curr_dest_pos += sizeof(keylen);
memcpy(curr_dest_pos, keyp, keylen);
curr_dest_pos += keylen;
uint32_t le_offset = curr_dest_pos - newmem;
dmt_builder.append(klpair_dmtwriter(keylen, le_offset, keyp));
add_key(keylen);
// now curr_dest_pos is pointing to where the leafentry should be packed
curr_dest_pos[0] = curr_type;
curr_dest_pos++;
@@ -173,31 +322,44 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
*(uint8_t *)curr_dest_pos = num_pxrs;
curr_dest_pos += sizeof(num_pxrs);
// now we need to pack the rest of the data
uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, curr_src_pos);
uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, const_cast<uint8_t*>(curr_src_pos));
memcpy(curr_dest_pos, curr_src_pos, num_rest_bytes);
curr_dest_pos += num_rest_bytes;
curr_src_pos += num_rest_bytes;
}
}
uint32_t num_bytes_read UU() = (uint32_t)(curr_src_pos - buf);
paranoid_invariant( num_bytes_read == data_size);
uint32_t num_bytes_written = curr_dest_pos - newmem;
paranoid_invariant( num_bytes_written == data_size);
toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(num_bytes_written), allocated_bytes);
dmt_builder.build(&this->m_buffer);
toku_note_deserialized_basement_node(m_buffer.value_length_is_fixed());
// destroy old omt that was created by toku_create_empty_bn(), so we can create a new one
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&array, num_entries, num_entries);
uint32_t num_bytes_read = (uint32_t)(curr_src_pos - buf);
invariant(num_bytes_read == data_size);
uint32_t num_bytes_written = curr_dest_pos - newmem + m_disksize_of_keys;
invariant(num_bytes_written == data_size);
toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(curr_dest_pos - newmem), allocated_bytes_vals);
invariant(get_disk_size() == data_size);
// Versions older than 26 might have allocated too much memory. Try to shrink the mempool now that we
// know how much memory we need.
if (version < FT_LAYOUT_VERSION_26) {
// Unnecessary after version 26
// Reallocate smaller mempool to save memory
invariant_zero(toku_mempool_get_frag_size(&m_buffer_mempool));
toku_mempool_realloc_larger(&m_buffer_mempool, toku_mempool_get_used_size(&m_buffer_mempool));
}
}
uint64_t bn_data::get_memory_size() {
uint64_t retval = 0;
//TODO: Maybe ask for memory_size instead of mempool_footprint (either this todo or the next)
// include fragmentation overhead but do not include space in the
// mempool that has not yet been allocated for leaf entries
size_t poolsize = toku_mempool_footprint(&m_buffer_mempool);
invariant(poolsize >= get_disk_size());
retval += poolsize;
// This one includes not-yet-allocated for nodes (just like old constant-key omt)
//TODO: Maybe ask for mempool_footprint instead of memory_size.
retval += m_buffer.memory_size();
invariant(retval >= get_disk_size());
return retval;
}
@@ -205,169 +367,264 @@ void bn_data::delete_leafentry (
uint32_t idx,
uint32_t keylen,
uint32_t old_le_size
)
)
{
remove_key(keylen);
m_buffer.delete_at(idx);
toku_mempool_mfree(&m_buffer_mempool, 0, old_le_size + keylen + sizeof(keylen)); // Must pass 0, since le is no good any more.
toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size);
}
/* mempool support */
struct omt_compressor_state {
struct dmt_compressor_state {
struct mempool *new_kvspace;
KLPAIR *newvals;
class bn_data *bd;
};
static int move_it (const KLPAIR &klpair, const uint32_t idx, struct omt_compressor_state * const oc) {
uint32_t size = klpair_size(klpair);
KLPAIR CAST_FROM_VOIDP(newdata, toku_mempool_malloc(oc->new_kvspace, size, 1));
static int move_it (const uint32_t, klpair_struct *klpair, const uint32_t idx UU(), struct dmt_compressor_state * const oc) {
LEAFENTRY old_le = oc->bd->get_le_from_klpair(klpair);
uint32_t size = leafentry_memsize(old_le);
void* newdata = toku_mempool_malloc(oc->new_kvspace, size);
paranoid_invariant_notnull(newdata); // we do this on a fresh mempool, so nothing bad should happen
memcpy(newdata, klpair, size);
oc->newvals[idx] = newdata;
memcpy(newdata, old_le, size);
klpair->le_offset = toku_mempool_get_offset_from_pointer_and_base(oc->new_kvspace, newdata);
return 0;
}
// Compress things, and grow the mempool if needed.
void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free) {
uint32_t total_size_needed = toku_mempool_get_used_space(&m_buffer_mempool) + added_size;
// set the new mempool size to be twice of the space we actually need.
// On top of the 25% that is padded within toku_mempool_construct (which we
// should consider getting rid of), that should be good enough.
// Compress things, and grow or shrink the mempool if needed.
// May (always if force_compress) have a side effect of putting contents of mempool in sorted order.
void bn_data::dmt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress) {
uint32_t total_size_needed = toku_mempool_get_used_size(&m_buffer_mempool) + added_size;
// If there is no fragmentation, e.g. in serial inserts, we can just increase the size
// of the mempool and move things over with a cheap memcpy. If force_compress is true,
// the caller needs the side effect that all contents are put in sorted order.
bool do_compress = toku_mempool_get_frag_size(&m_buffer_mempool) > 0 || force_compress;
void *old_mempool_base = toku_mempool_get_base(&m_buffer_mempool);
struct mempool new_kvspace;
toku_mempool_construct(&new_kvspace, 2*total_size_needed);
uint32_t numvals = omt_size();
KLPAIR *XMALLOC_N(numvals, newvals);
struct omt_compressor_state oc = { &new_kvspace, newvals };
m_buffer.iterate_on_range< decltype(oc), move_it >(0, omt_size(), &oc);
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&newvals, numvals, numvals);
if (do_compress) {
size_t requested_size = force_compress ? total_size_needed : ((total_size_needed * 3) / 2);
toku_mempool_construct(&new_kvspace, requested_size);
struct dmt_compressor_state oc = { &new_kvspace, this };
m_buffer.iterate_ptr< decltype(oc), move_it >(&oc);
} else {
toku_mempool_construct(&new_kvspace, total_size_needed);
size_t old_offset_limit = toku_mempool_get_offset_limit(&m_buffer_mempool);
void *new_mempool_base = toku_mempool_malloc(&new_kvspace, old_offset_limit);
memcpy(new_mempool_base, old_mempool_base, old_offset_limit);
}
if (maybe_free) {
*maybe_free = m_buffer_mempool.base;
*maybe_free = old_mempool_base;
} else {
toku_free(m_buffer_mempool.base);
toku_free(old_mempool_base);
}
m_buffer_mempool = new_kvspace;
}
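// Editor's illustrative sketch, not TokuFT code: the compression path above
// boils down to "copy each live leafentry into a fresh pool and rewrite its
// stored offset"; freed holes disappear simply because they are never copied.
// The toy types below stand in for toku_mempool and the klpair dmt, and the
// sketch assumes <vector>, <cstring>, and <cstdint> are available.
struct toy_pool {
    std::vector<uint8_t> bytes;
    size_t alloc(size_t n) { size_t off = bytes.size(); bytes.resize(off + n); return off; }
};
struct toy_entry { size_t offset; size_t size; };    // like klpair.le_offset plus the leafentry size
static toy_pool toy_compress(const toy_pool &old_pool, std::vector<toy_entry> &entries) {
    toy_pool fresh;
    for (toy_entry &e : entries) {
        size_t new_off = fresh.alloc(e.size);
        memcpy(fresh.bytes.data() + new_off, old_pool.bytes.data() + e.offset, e.size);
        e.offset = new_off;                          // analogous to updating klpair->le_offset
    }
    return fresh;
}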
// Effect: Allocate a new object of size SIZE in MP. If MP runs out of space, allocate a new mempool and copy all the items
// from the OMT (which refer to items in the old mempool) into the new mempool.
// If MAYBE_FREE is NULL then free the old mempool's space.
// If MAYBE_FREE is nullptr then free the old mempool's space.
// Otherwise, store the old mempool's space in maybe_free.
KLPAIR bn_data::mempool_malloc_from_omt(size_t size, void **maybe_free) {
void *v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
if (v == NULL) {
omt_compress_kvspace(size, maybe_free);
v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
LEAFENTRY bn_data::mempool_malloc_and_update_dmt(size_t size, void **maybe_free) {
void *v = toku_mempool_malloc(&m_buffer_mempool, size);
if (v == nullptr) {
dmt_compress_kvspace(size, maybe_free, false);
v = toku_mempool_malloc(&m_buffer_mempool, size);
paranoid_invariant_notnull(v);
}
return (KLPAIR)v;
return (LEAFENTRY)v;
}
//TODO: probably not free the "maybe_free" right away?
void bn_data::get_space_for_overwrite(
uint32_t idx,
const void* keyp,
uint32_t keylen,
const void* keyp UU(),
uint32_t keylen UU(),
uint32_t old_keylen,
uint32_t old_le_size,
uint32_t new_size,
LEAFENTRY* new_le_space
LEAFENTRY* new_le_space,
void **const maybe_free
)
{
void* maybe_free = nullptr;
uint32_t size_alloc = new_size + keylen + sizeof(keylen);
KLPAIR new_kl = mempool_malloc_from_omt(
size_alloc,
&maybe_free
);
uint32_t size_freed = old_le_size + keylen + sizeof(keylen);
toku_mempool_mfree(&m_buffer_mempool, nullptr, size_freed); // Must pass nullptr, since le is no good any more.
new_kl->keylen = keylen;
memcpy(new_kl->key_le, keyp, keylen);
m_buffer.set_at(new_kl, idx);
*new_le_space = get_le_from_klpair(new_kl);
// free at end, so that the keyp and keylen
// passed in is still valid
if (maybe_free) {
toku_free(maybe_free);
}
*maybe_free = nullptr;
LEAFENTRY new_le = mempool_malloc_and_update_dmt(new_size, maybe_free);
toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size);
klpair_struct* klp = nullptr;
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klp);
invariant_zero(r);
paranoid_invariant(klp!=nullptr);
// Old key length should be consistent with what is stored in the DMT
invariant(keylen_from_klpair_len(klpair_len) == old_keylen);
size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);
paranoid_invariant(new_le_offset <= UINT32_MAX - new_size); // Not using > 4GB
klp->le_offset = new_le_offset;
paranoid_invariant(new_le == get_le_from_klpair(klp));
*new_le_space = new_le;
}
//TODO: probably not free the "maybe_free" right away?
void bn_data::get_space_for_insert(
uint32_t idx,
const void* keyp,
uint32_t keylen,
size_t size,
LEAFENTRY* new_le_space
LEAFENTRY* new_le_space,
void **const maybe_free
)
{
void* maybe_free = nullptr;
uint32_t size_alloc = size + keylen + sizeof(keylen);
KLPAIR new_kl = mempool_malloc_from_omt(
size_alloc,
&maybe_free
);
new_kl->keylen = keylen;
memcpy(new_kl->key_le, keyp, keylen);
m_buffer.insert_at(new_kl, idx);
*new_le_space = get_le_from_klpair(new_kl);
// free at end, so that the keyp and keylen
// passed in is still valid (you never know if
// it was part of the old mempool, this is just
// safer).
if (maybe_free) {
toku_free(maybe_free);
}
add_key(keylen);
*maybe_free = nullptr;
LEAFENTRY new_le = mempool_malloc_and_update_dmt(size, maybe_free);
size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);
klpair_dmtwriter kl(keylen, new_le_offset, keyp);
m_buffer.insert_at(kl, idx);
*new_le_space = new_le;
}
void bn_data::move_leafentries_to(
BN_DATA dest_bd,
uint32_t lbi, //lower bound inclusive
uint32_t ube //upper bound exclusive
class split_klpairs_extra {
bn_data *const m_left_bn;
bn_data *const m_right_bn;
klpair_dmt_t::builder *const m_left_builder;
klpair_dmt_t::builder *const m_right_builder;
struct mempool *const m_left_dest_mp;
uint32_t m_split_at;
struct mempool *left_dest_mp(void) const { return m_left_dest_mp; }
struct mempool *right_dest_mp(void) const { return &m_right_bn->m_buffer_mempool; }
void copy_klpair(const uint32_t klpair_len, const klpair_struct &klpair,
klpair_dmt_t::builder *const builder,
struct mempool *const dest_mp,
bn_data *const bn) {
LEAFENTRY old_le = m_left_bn->get_le_from_klpair(&klpair);
size_t le_size = leafentry_memsize(old_le);
void *new_le = toku_mempool_malloc(dest_mp, le_size);
paranoid_invariant_notnull(new_le);
memcpy(new_le, old_le, le_size);
size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(dest_mp, new_le);
size_t keylen = keylen_from_klpair_len(klpair_len);
builder->append(klpair_dmtwriter(keylen, le_offset, klpair.key));
bn->add_key(keylen);
}
int move_leafentry(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx) {
m_left_bn->remove_key(keylen_from_klpair_len(klpair_len));
if (idx < m_split_at) {
copy_klpair(klpair_len, klpair, m_left_builder, left_dest_mp(), m_left_bn);
} else {
copy_klpair(klpair_len, klpair, m_right_builder, right_dest_mp(), m_right_bn);
}
return 0;
}
public:
split_klpairs_extra(bn_data *const left_bn, bn_data *const right_bn,
klpair_dmt_t::builder *const left_builder,
klpair_dmt_t::builder *const right_builder,
struct mempool *const left_new_mp,
uint32_t split_at)
: m_left_bn(left_bn),
m_right_bn(right_bn),
m_left_builder(left_builder),
m_right_builder(right_builder),
m_left_dest_mp(left_new_mp),
m_split_at(split_at) {}
static int cb(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, split_klpairs_extra *const thisp) {
return thisp->move_leafentry(klpair_len, klpair, idx);
}
};
void bn_data::split_klpairs(
bn_data* right_bd,
uint32_t split_at //lower bound inclusive for right_bd
)
//Effect: move leafentries in the range [lbi, ube) from this (the source omt) to the newly created dest_omt
{
paranoid_invariant(lbi < ube);
paranoid_invariant(ube <= omt_size());
KLPAIR *XMALLOC_N(ube-lbi, newklpointers); // create new omt
// We use move_leafentries_to during a split, and the split algorithm should never call this
// if it's splitting on a boundary, so there must be some leafentries in the range to move.
paranoid_invariant(split_at < num_klpairs());
size_t mpsize = toku_mempool_get_used_space(&m_buffer_mempool); // overkill, but safe
struct mempool *dest_mp = &dest_bd->m_buffer_mempool;
struct mempool *src_mp = &m_buffer_mempool;
toku_mempool_construct(dest_mp, mpsize);
right_bd->init_zero();
uint32_t i = 0;
for (i = lbi; i < ube; i++) {
KLPAIR curr_kl= 0;
m_buffer.fetch(i, &curr_kl);
size_t mpsize = toku_mempool_get_used_size(&m_buffer_mempool); // overkill, but safe
size_t kl_size = klpair_size(curr_kl);
KLPAIR new_kl = NULL;
CAST_FROM_VOIDP(new_kl, toku_mempool_malloc(dest_mp, kl_size, 1));
memcpy(new_kl, curr_kl, kl_size);
newklpointers[i-lbi] = new_kl;
toku_mempool_mfree(src_mp, curr_kl, kl_size);
}
struct mempool new_left_mp;
toku_mempool_construct(&new_left_mp, mpsize);
dest_bd->m_buffer.create_steal_sorted_array(&newklpointers, ube-lbi, ube-lbi);
// now remove the elements from src_omt
for (i=ube-1; i >= lbi; i--) {
m_buffer.delete_at(i);
}
struct mempool *right_mp = &right_bd->m_buffer_mempool;
toku_mempool_construct(right_mp, mpsize);
klpair_dmt_t::builder left_dmt_builder;
left_dmt_builder.create(split_at, m_disksize_of_keys); // overkill, but safe (builder will realloc at the end)
klpair_dmt_t::builder right_dmt_builder;
right_dmt_builder.create(num_klpairs() - split_at, m_disksize_of_keys); // overkill, but safe (builder will realloc at the end)
split_klpairs_extra extra(this, right_bd, &left_dmt_builder, &right_dmt_builder, &new_left_mp, split_at);
int r = m_buffer.iterate<split_klpairs_extra, split_klpairs_extra::cb>(&extra);
invariant_zero(r);
m_buffer.destroy();
toku_mempool_destroy(&m_buffer_mempool);
m_buffer_mempool = new_left_mp;
left_dmt_builder.build(&m_buffer);
right_dmt_builder.build(&right_bd->m_buffer);
// Potentially shrink memory pool for destination.
// We overallocated ("overkill") above
struct mempool *const left_mp = &m_buffer_mempool;
paranoid_invariant_zero(toku_mempool_get_frag_size(left_mp));
toku_mempool_realloc_larger(left_mp, toku_mempool_get_used_size(left_mp));
paranoid_invariant_zero(toku_mempool_get_frag_size(right_mp));
toku_mempool_realloc_larger(right_mp, toku_mempool_get_used_size(right_mp));
}
uint64_t bn_data::get_disk_size() {
return toku_mempool_get_used_space(&m_buffer_mempool);
return m_disksize_of_keys +
toku_mempool_get_used_size(&m_buffer_mempool);
}
struct verify_le_in_mempool_state {
size_t offset_limit;
class bn_data *bd;
};
static int verify_le_in_mempool (const uint32_t, klpair_struct *klpair, const uint32_t idx UU(), struct verify_le_in_mempool_state * const state) {
invariant(klpair->le_offset < state->offset_limit);
LEAFENTRY le = state->bd->get_le_from_klpair(klpair);
uint32_t size = leafentry_memsize(le);
size_t end_offset = klpair->le_offset+size;
invariant(end_offset <= state->offset_limit);
return 0;
}
//This is a debug-only (paranoid) verification.
//Verifies the dmt is valid, and all leafentries are entirely in the mempool's memory.
void bn_data::verify_mempool(void) {
// TODO: implement something
//Verify the dmt itself <- paranoid and slow
m_buffer.verify();
verify_le_in_mempool_state state = { .offset_limit = toku_mempool_get_offset_limit(&m_buffer_mempool), .bd = this };
//Verify every leafentry pointed to by the keys in the dmt are fully inside the mempool
m_buffer.iterate_ptr< decltype(state), verify_le_in_mempool >(&state);
}
uint32_t bn_data::omt_size(void) const {
uint32_t bn_data::num_klpairs(void) const {
return m_buffer.size();
}
@ -375,40 +632,54 @@ void bn_data::destroy(void) {
// The buffer may have been freed already, in some cases.
m_buffer.destroy();
toku_mempool_destroy(&m_buffer_mempool);
m_disksize_of_keys = 0;
}
//TODO: Splitting key/val requires changing this
void bn_data::replace_contents_with_clone_of_sorted_array(
void bn_data::set_contents_as_clone_of_sorted_array(
uint32_t num_les,
const void** old_key_ptrs,
uint32_t* old_keylens,
LEAFENTRY* old_les,
size_t *le_sizes,
size_t mempool_size
)
LEAFENTRY* old_les,
size_t *le_sizes,
size_t total_key_size,
size_t total_le_size
)
{
toku_mempool_construct(&m_buffer_mempool, mempool_size);
KLPAIR *XMALLOC_N(num_les, le_array);
for (uint32_t idx = 0; idx < num_les; idx++) {
KLPAIR new_kl = (KLPAIR)toku_mempool_malloc(
&m_buffer_mempool,
le_sizes[idx] + old_keylens[idx] + sizeof(uint32_t),
1); // point to new location
new_kl->keylen = old_keylens[idx];
memcpy(new_kl->key_le, old_key_ptrs[idx], new_kl->keylen);
memcpy(get_le_from_klpair(new_kl), old_les[idx], le_sizes[idx]);
CAST_FROM_VOIDP(le_array[idx], new_kl);
}
//TODO: Splitting key/val requires changing this; keys are stored in old omt.. cannot delete it yet?
//Enforce "just created" invariant.
paranoid_invariant_zero(m_disksize_of_keys);
paranoid_invariant_zero(num_klpairs());
paranoid_invariant_null(toku_mempool_get_base(&m_buffer_mempool));
paranoid_invariant_zero(toku_mempool_get_size(&m_buffer_mempool));
toku_mempool_construct(&m_buffer_mempool, total_le_size);
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&le_array, num_les, num_les);
m_disksize_of_keys = 0;
klpair_dmt_t::builder dmt_builder;
dmt_builder.create(num_les, total_key_size);
for (uint32_t idx = 0; idx < num_les; idx++) {
void* new_le = toku_mempool_malloc(&m_buffer_mempool, le_sizes[idx]);
paranoid_invariant_notnull(new_le);
memcpy(new_le, old_les[idx], le_sizes[idx]);
size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(&m_buffer_mempool, new_le);
dmt_builder.append(klpair_dmtwriter(old_keylens[idx], le_offset, old_key_ptrs[idx]));
add_key(old_keylens[idx]);
}
dmt_builder.build(&this->m_buffer);
}
LEAFENTRY bn_data::get_le_from_klpair(const klpair_struct *klpair) const {
void * ptr = toku_mempool_get_pointer_from_base_and_offset(&this->m_buffer_mempool, klpair->le_offset);
LEAFENTRY CAST_FROM_VOIDP(le, ptr);
return le;
}
// get info about a single leafentry by index
int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
klpair_struct* klpair = nullptr;
int r = m_buffer.fetch(idx, nullptr, &klpair);
if (r == 0) {
*le = get_le_from_klpair(klpair);
}
@ -416,59 +687,41 @@ int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
}
int bn_data::fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
klpair_struct* klpair = nullptr;
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*len = klpair->keylen;
*key = klpair->key_le;
*len = keylen_from_klpair_len(klpair_len);
*key = klpair->key;
*le = get_le_from_klpair(klpair);
}
return r;
}
int bn_data::fetch_klpair_disksize(uint32_t idx, size_t *size) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
klpair_struct* klpair = nullptr;
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*size = klpair_disksize(klpair);
*size = klpair_disksize(klpair_len, klpair);
}
return r;
}
int bn_data::fetch_le_key_and_len(uint32_t idx, uint32_t *len, void** key) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
int bn_data::fetch_key_and_len(uint32_t idx, uint32_t *len, void** key) {
klpair_struct* klpair = nullptr;
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*len = klpair->keylen;
*key = klpair->key_le;
*len = keylen_from_klpair_len(klpair_len);
*key = klpair->key;
}
return r;
}
struct mp_pair {
void* orig_base;
void* new_base;
klpair_omt_t* omt;
};
static int fix_mp_offset(const KLPAIR &klpair, const uint32_t idx, struct mp_pair * const p) {
char* old_value = (char *) klpair;
char *new_value = old_value - (char *)p->orig_base + (char *)p->new_base;
p->omt->set_at((KLPAIR)new_value, idx);
return 0;
}
void bn_data::clone(bn_data* orig_bn_data) {
toku_mempool_clone(&orig_bn_data->m_buffer_mempool, &m_buffer_mempool);
m_buffer.clone(orig_bn_data->m_buffer);
struct mp_pair p;
p.orig_base = toku_mempool_get_base(&orig_bn_data->m_buffer_mempool);
p.new_base = toku_mempool_get_base(&m_buffer_mempool);
p.omt = &m_buffer;
int r = m_buffer.iterate_on_range<decltype(p), fix_mp_offset>(0, omt_size(), &p);
invariant_zero(r);
this->m_disksize_of_keys = orig_bn_data->m_disksize_of_keys;
}
@ -28,7 +28,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,169 +88,299 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#pragma once
#include <util/omt.h>
#include "leafentry.h"
#include <util/mempool.h>
#include "util/dmt.h"
#include "util/mempool.h"
#if 0 //for implementation
static int
UU() verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
{
LEAFENTRY CAST_FROM_VOIDP(le, lev);
struct mempool *CAST_FROM_VOIDP(mp, mpv);
int r = toku_mempool_inrange(mp, le, leafentry_memsize(le));
lazy_assert(r);
return 0;
}
toku_omt_iterate(bn->buffer, verify_in_mempool, &bn->buffer_mempool);
#endif
#include "ft/leafentry.h"
#include "ft/serialize/wbuf.h"
// Key/leafentry pair stored in a dmt. The key is inlined, the offset (in leafentry mempool) is stored for the leafentry.
struct klpair_struct {
uint32_t keylen;
uint8_t key_le[0]; // key, followed by le
uint32_t le_offset; //Offset of leafentry (in leafentry mempool)
uint8_t key[0]; // key, followed by le
};
typedef struct klpair_struct *KLPAIR;
static inline LEAFENTRY get_le_from_klpair(KLPAIR klpair){
uint32_t keylen = klpair->keylen;
LEAFENTRY le = (LEAFENTRY)(klpair->key_le + keylen);
return le;
static constexpr uint32_t keylen_from_klpair_len(const uint32_t klpair_len) {
return klpair_len - __builtin_offsetof(klpair_struct, key);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
static int wrappy_fun_find(const KLPAIR &klpair, const omtcmp_t &extra) {
//TODO: kill this function when we split, and/or use toku_fill_dbt
static_assert(__builtin_offsetof(klpair_struct, key) == 1*sizeof(uint32_t), "klpair alignment issues");
static_assert(__builtin_offsetof(klpair_struct, key) == sizeof(klpair_struct), "klpair size issues");
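For example, under this layout a klpair holding a 10-byte key occupies sizeof(klpair_struct) + 10 = 4 + 10 = 14 bytes; keylen_from_klpair_len(14) recovers the 10-byte key length, and the leafentry itself lives in the mempool at le_offset rather than inline.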
// A wrapper for the heaviside function provided to dmt->find*.
// Needed because the heaviside functions provided to bndata do not know about the internal types.
// An alternative to this wrapper is to expose accessor functions and rewrite all the external heaviside functions.
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
static int klpair_find_wrapper(const uint32_t klpair_len, const klpair_struct &klpair, const dmtcmp_t &extra) {
DBT kdbt;
kdbt.data = klpair->key_le;
kdbt.size = klpair->keylen;
kdbt.data = const_cast<void*>(reinterpret_cast<const void*>(klpair.key));
kdbt.size = keylen_from_klpair_len(klpair_len);
return h(kdbt, extra);
}
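To illustrate the shape of heaviside function this wrapper forwards to, here is a hedged sketch (not part of this patch; hypothetical_search_key and hypothetical_key_heaviside are invented names, and it assumes the usual DBT with data/size fields, as used by the wrapper above):
#include <string.h>  // memcmp
struct hypothetical_search_key {  // stands in for the dmtcmp_t "extra" argument
    const void *data;
    uint32_t size;
};
// Three-way comparison suitable as the `h` template argument of klpair_find_wrapper:
// negative if the stored key sorts before the target, zero on a match, positive otherwise.
static int hypothetical_key_heaviside(const DBT &stored, const hypothetical_search_key &target) {
    uint32_t common = stored.size < target.size ? stored.size : target.size;
    int c = memcmp(stored.data, target.data, common);
    if (c != 0) return c;
    return (int)stored.size - (int)target.size;  // shorter key sorts first
}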
template<typename inner_iterate_extra_t>
struct klpair_iterate_extra {
public:
inner_iterate_extra_t *inner;
const class bn_data * bd;
};
// A wrapper for the high-order function provided to dmt->iterate*
// Needed because the iterate callbacks provided to bndata do not know about the internal types.
// An alternative to this wrapper is to expose accessor functions and rewrite all the external iterate callbacks.
template<typename iterate_extra_t,
int (*h)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
static int wrappy_fun_iterate(const KLPAIR &klpair, const uint32_t idx, iterate_extra_t *const extra) {
uint32_t keylen = klpair->keylen;
void* key = klpair->key_le;
LEAFENTRY le = get_le_from_klpair(klpair);
return h(key, keylen, le, idx, extra);
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
static int klpair_iterate_wrapper(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, klpair_iterate_extra<iterate_extra_t> *const extra) {
const void* key = &klpair.key;
LEAFENTRY le = extra->bd->get_le_from_klpair(&klpair);
return f(key, keylen_from_klpair_len(klpair_len), le, idx, extra->inner);
}
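For comparison, a callback with the `f` signature expected by this wrapper might look like the following hedged sketch (invented names; it only tallies key bytes and never dereferences the leafentry, so it does not depend on the leafentry layout):
struct key_bytes_acc {
    uint64_t total_key_bytes;
};
// Sums the key lengths seen during iteration; a nonzero return would stop the walk early.
static int count_key_bytes(const void *, const uint32_t keylen,
                           const LEAFENTRY &, const uint32_t,
                           key_bytes_acc *const acc) {
    acc->total_key_bytes += keylen;
    return 0;
}
Such a callback would be supplied through the bn_data::iterate / iterate_on_range templates declared further down, e.g. bd->iterate<key_bytes_acc, count_key_bytes>(&acc).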
typedef toku::omt<KLPAIR> klpair_omt_t;
namespace toku {
// dmt writer for klpair_struct
class klpair_dmtwriter {
public:
// Return the size needed for the klpair_struct that this dmtwriter represents
size_t get_size(void) const {
return sizeof(klpair_struct) + this->keylen;
}
// Write the klpair_struct this dmtwriter represents to a destination
void write_to(klpair_struct *const dest) const {
dest->le_offset = this->le_offset;
memcpy(dest->key, this->keyp, this->keylen);
}
klpair_dmtwriter(uint32_t _keylen, uint32_t _le_offset, const void* _keyp)
: keylen(_keylen), le_offset(_le_offset), keyp(_keyp) {}
klpair_dmtwriter(const uint32_t klpair_len, klpair_struct *const src)
: keylen(keylen_from_klpair_len(klpair_len)), le_offset(src->le_offset), keyp(src->key) {}
private:
const uint32_t keylen;
const uint32_t le_offset;
const void* keyp;
};
}
typedef toku::dmt<klpair_struct, klpair_struct*, toku::klpair_dmtwriter> klpair_dmt_t;
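To make the writer contract concrete, the following hedged sketch (illustrative only; it assumes the definitions above are in scope, and klpair_dmtwriter_roundtrip_example is an invented name) serializes one klpair into a heap buffer with get_size()/write_to() and reads the pieces back:
#include <assert.h>
#include <stdlib.h>
#include <string.h>
static void klpair_dmtwriter_roundtrip_example(void) {
    const char key[] = "somekey";
    const uint32_t keylen = sizeof(key);          // 8 bytes, including the NUL
    const uint32_t le_offset = 64;                // arbitrary mempool offset for the example
    toku::klpair_dmtwriter writer(keylen, le_offset, key);
    const uint32_t klpair_len = (uint32_t) writer.get_size();   // sizeof(klpair_struct) + keylen
    klpair_struct *kl = static_cast<klpair_struct *>(malloc(klpair_len));
    writer.write_to(kl);
    assert(keylen_from_klpair_len(klpair_len) == keylen);
    assert(kl->le_offset == le_offset);
    assert(memcmp(kl->key, key, keylen) == 0);
    free(kl);
}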
// This class stores the data associated with a basement node
class bn_data {
public:
// Initialize an empty bn_data _without_ a dmt backing.
// Externally only used for deserialization.
void init_zero(void);
// Initialize an empty bn_data _with_ a dmt
void initialize_empty(void);
void initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size);
// globals
// Deserialize a bn_data from rbuf.
// This is the entry point for deserialization.
void deserialize_from_rbuf(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version);
// Retrieve the memory footprint of this basement node.
// May over or under count: see Tokutek/ft-index#136
// Also see dmt's implementation.
uint64_t get_memory_size(void);
// Get the serialized size of this basement node.
uint64_t get_disk_size(void);
// Perform (paranoid) verification that all leafentries are fully contained within the mempool
void verify_mempool(void);
// Interact with "omt"
uint32_t omt_size(void) const;
// size() of key dmt
uint32_t num_klpairs(void) const;
// iterate() on key dmt (and associated leafentries)
template<typename iterate_extra_t,
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
int omt_iterate(iterate_extra_t *const iterate_extra) const {
return omt_iterate_on_range<iterate_extra_t, f>(0, omt_size(), iterate_extra);
int iterate(iterate_extra_t *const iterate_extra) const {
return iterate_on_range<iterate_extra_t, f>(0, num_klpairs(), iterate_extra);
}
// iterate_on_range() on key dmt (and associated leafentries)
template<typename iterate_extra_t,
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
int omt_iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
return m_buffer.iterate_on_range< iterate_extra_t, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, iterate_extra);
int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
klpair_iterate_extra<iterate_extra_t> klpair_extra = { iterate_extra, this };
return m_buffer.iterate_on_range< klpair_iterate_extra<iterate_extra_t>, klpair_iterate_wrapper<iterate_extra_t, f> >(left, right, &klpair_extra);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find_zero(const omtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find_zero< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, &klpair, idxp);
// find_zero() on key dmt
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find_zero(const dmtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
klpair_struct* klpair = nullptr;
uint32_t klpair_len;
int r = m_buffer.find_zero< dmtcmp_t, klpair_find_wrapper<dmtcmp_t, h> >(extra, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
}
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
paranoid_invariant_notnull(keylen);
*key = klpair->key;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
paranoid_invariant_null(keylen);
}
}
return r;
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find(const omtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, direction, &klpair, idxp);
// find() on key dmt (and associated leafentries)
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find(const dmtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
klpair_struct* klpair = nullptr;
uint32_t klpair_len;
int r = m_buffer.find< dmtcmp_t, klpair_find_wrapper<dmtcmp_t, h> >(extra, direction, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
}
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
paranoid_invariant_notnull(keylen);
*key = klpair->key;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
paranoid_invariant_null(keylen);
}
}
return r;
}
// get info about a single leafentry by index
// Fetch leafentry by index
__attribute__((__nonnull__))
int fetch_le(uint32_t idx, LEAFENTRY *le);
// Fetch (leafentry, key, keylen) by index
__attribute__((__nonnull__))
int fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key);
// Fetch (serialized size of leafentry, key, and keylen) by index
__attribute__((__nonnull__))
int fetch_klpair_disksize(uint32_t idx, size_t *size);
int fetch_le_key_and_len(uint32_t idx, uint32_t *len, void** key);
// Fetch (key, keylen) by index
__attribute__((__nonnull__))
int fetch_key_and_len(uint32_t idx, uint32_t *len, void** key);
// Interact with another bn_data
void move_leafentries_to(BN_DATA dest_bd,
uint32_t lbi, //lower bound inclusive
uint32_t ube //upper bound exclusive
);
// Move leafentries (and associated key/keylens) from this basement node to dest_bd
// Moves indexes [lbi-ube)
__attribute__((__nonnull__))
void split_klpairs(bn_data* dest_bd, uint32_t first_index_for_dest);
// Destroy this basement node and free memory.
void destroy(void);
// Replaces contents, into brand new mempool.
// Returns old mempool base, expects caller to free it.
void replace_contents_with_clone_of_sorted_array(
// Uses sorted array as input for this basement node.
// Expects this to be a basement node just initialized with initialize_empty()
void set_contents_as_clone_of_sorted_array(
uint32_t num_les,
const void** old_key_ptrs,
uint32_t* old_keylens,
LEAFENTRY* old_les,
size_t *le_sizes,
size_t mempool_size
size_t total_key_size,
size_t total_le_size
);
// Make this basement node a clone of orig_bn_data.
// orig_bn_data still owns all its memory (dmt, mempool)
// this basement node will have a new dmt, mempool containing same data.
void clone(bn_data* orig_bn_data);
// Delete klpair index idx with provided keylen and old leafentry with size old_le_size
void delete_leafentry (
uint32_t idx,
uint32_t keylen,
uint32_t old_le_size
);
void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_size, uint32_t new_size, LEAFENTRY* new_le_space);
void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space);
private:
// Private functions
KLPAIR mempool_malloc_from_omt(size_t size, void **maybe_free);
void omt_compress_kvspace(size_t added_size, void **maybe_free);
klpair_omt_t m_buffer; // pointers to individual leaf entries
// Allocates space in the mempool to store a new leafentry.
// This may require reorganizing the mempool and updating the dmt.
__attribute__((__nonnull__))
void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_keylen, uint32_t old_size,
uint32_t new_size, LEAFENTRY* new_le_space, void **const maybe_free);
// Allocates space in the mempool to store a new leafentry
// and inserts a new key into the dmt
// This may require reorganizing the mempool and updating the dmt.
__attribute__((__nonnull__))
void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space, void **const maybe_free);
// Gets a leafentry given a klpair from this basement node.
LEAFENTRY get_le_from_klpair(const klpair_struct *klpair) const;
void serialize_to_wbuf(struct wbuf *const wb);
// Prepares this basement node for serialization.
// Must be called before serializing this basement node.
// Between calling prepare_to_serialize and actually serializing, the basement node may not be modified
void prepare_to_serialize(void);
// Serialize the basement node header to a wbuf
// Requires prepare_to_serialize() to have been called first.
void serialize_header(struct wbuf *wb) const;
// Serialize all keys and leafentries to a wbuf
// Requires prepare_to_serialize() (and serialize_header()) has been called first.
// Currently only supported when all keys are fixed-length.
void serialize_rest(struct wbuf *wb) const;
static const uint32_t HEADER_LENGTH = 0
+ sizeof(uint32_t) // key_data_size
+ sizeof(uint32_t) // val_data_size
+ sizeof(uint32_t) // fixed_key_length
+ sizeof(uint8_t) // all_keys_same_length
+ sizeof(uint8_t) // keys_vals_separate
+ 0;
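In other words, the serialized basement-node header works out to 4 + 4 + 4 + 1 + 1 = 14 bytes.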
private:
// split_klpairs_extra should be a local class in split_klpairs, but
// the dmt template parameter for iterate needs linkage, so it has to be a
// separate class; we still want it to be able to call private helpers such as
// add_key, hence the friend declaration below.
friend class split_klpairs_extra;
// Allocates space in the mempool.
// If there is insufficient space, the mempool is enlarged and leafentries may be shuffled to reduce fragmentation.
// If shuffling happens, the offsets stored in the dmt are updated.
LEAFENTRY mempool_malloc_and_update_dmt(size_t size, void **maybe_free);
// Change the size of the mempool to support what is already in it, plus added_size.
// possibly "compress" by shuffling leafentries around to reduce fragmentation to 0.
// If fragmentation is already 0 and force_compress is not true, shuffling may be skipped.
// If shuffling happens, leafentries will be stored in the mempool in sorted order.
void dmt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress);
// Note that a key was added (for maintaining disk-size of this basement node)
void add_key(uint32_t keylen);
// Note that multiple keys were added (for maintaining disk-size of this basement node)
void add_keys(uint32_t n_keys, uint32_t combined_klpair_len);
// Note that a key was removed (for maintaining disk-size of this basement node)
void remove_key(uint32_t keylen);
klpair_dmt_t m_buffer; // pointers to individual leaf entries
struct mempool m_buffer_mempool; // storage for all leaf entries
friend class bndata_bugfix_test;
};
// Get the serialized size of a klpair.
// As of Jan 14, 2014, serialized size of a klpair is independent of whether this basement node has fixed-length keys.
uint32_t klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const;
// The disk/memory size of all keys. (Note that the size of memory for the leafentries is maintained by m_buffer_mempool)
size_t m_disksize_of_keys;
// Deserialize this basement node from rbuf
// all keys come first, followed by all leafentries (both in sorted order)
void initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version,
uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
uint32_t fixed_klpair_length);
};
@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,11 +89,12 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2011-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "background_job_manager.h"
#include "toku_config.h"
#include <portability/toku_config.h>
#include <memory.h>
#include <toku_pthread.h>
#include "cachetable/background_job_manager.h"
struct background_job_manager_struct {
bool accepting_jobs;
uint32_t num_jobs;
@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef BACKGROUND_JOB_MANAGER_H
#define BACKGROUND_JOB_MANAGER_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,6 +86,8 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
@ -130,5 +130,3 @@ void bjm_remove_background_job(BACKGROUND_JOB_MANAGER bjm);
// has completed, bjm_add_background_job returns an error.
//
void bjm_wait_for_jobs_to_finish(BACKGROUND_JOB_MANAGER bjm);
#endif

View File

@ -1,9 +1,6 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TokuDB_cachetable_internal_h
#define TokuDB_cachetable_internal_h
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -33,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -90,10 +87,12 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "background_job_manager.h"
#include "cachetable/background_job_manager.h"
#include <portability/toku_random.h>
#include <util/frwlock.h>
#include <util/kibbutz.h>
@ -179,8 +178,6 @@ class pair_list;
// Maps to a file on disk.
//
struct cachefile {
CACHEFILE next;
CACHEFILE prev;
// these next two fields are protected by cachetable's list lock
// they are managed whenever we add or remove a pair from
// the cachetable. As of Riddler, this linked list is only used to
@ -440,14 +437,12 @@ public:
bool evict_some_stale_pair(evictor* ev);
void free_stale_data(evictor* ev);
// access to these fields are protected by the lock
CACHEFILE m_active_head; // head of CACHEFILEs that are active
CACHEFILE m_stale_head; // head of CACHEFILEs that are stale
CACHEFILE m_stale_tail; // tail of CACHEFILEs that are stale
FILENUM m_next_filenum_to_use;
uint32_t m_next_hash_id_to_use;
toku_pthread_rwlock_t m_lock; // this field is public so we are still POD
toku::omt<CACHEFILE> m_active_filenum;
toku::omt<CACHEFILE> m_active_fileid;
toku::omt<CACHEFILE> m_stale_fileid;
private:
CACHEFILE find_cachefile_in_list_unlocked(CACHEFILE start, struct fileid* fileid);
};
@ -521,8 +516,8 @@ public:
void add_pair_attr(PAIR_ATTR attr);
void remove_pair_attr(PAIR_ATTR attr);
void change_pair_attr(PAIR_ATTR old_attr, PAIR_ATTR new_attr);
void add_to_size_current(long size);
void remove_from_size_current(long size);
void add_cloned_data_size(long size);
void remove_cloned_data_size(long size);
uint64_t reserve_memory(double fraction, uint64_t upper_bound);
void release_reserved_memory(uint64_t reserved_memory);
void run_eviction_thread();
@ -536,6 +531,8 @@ public:
void get_state(long *size_current_ptr, long *size_limit_ptr);
void fill_engine_status();
private:
void add_to_size_current(long size);
void remove_from_size_current(long size);
void run_eviction();
bool run_eviction_on_pair(PAIR p);
void try_evict_pair(PAIR p);
@ -551,6 +548,7 @@ private:
pair_list* m_pl;
cachefile_list* m_cf_list;
int64_t m_size_current; // the sum of the sizes of the pairs in the cachetable
int64_t m_size_cloned_data; // stores amount of cloned data we have, only used for engine status
// changes to these two values are protected
// by ev_thread_lock
int64_t m_size_reserved; // How much memory is reserved (e.g., by the loader)
@ -654,5 +652,3 @@ struct cachetable {
char *env_dir;
};
#endif // End of header guardian.
@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,24 +89,26 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdarg.h>
#include "cachetable.h"
#include <ft/log_header.h>
#include "checkpoint.h"
#include "log-internal.h"
#include "cachetable-internal.h"
#include <memory.h>
#include <toku_race_tools.h>
#include <portability/memory.h>
#include <portability/toku_race_tools.h>
#include <portability/toku_atomic.h>
#include <portability/toku_pthread.h>
#include <portability/toku_portability.h>
#include <portability/toku_stdlib.h>
#include <portability/toku_time.h>
#include <util/rwlock.h>
#include <util/status.h>
#include <util/context.h>
#include "ft/cachetable/cachetable.h"
#include "ft/cachetable/cachetable-internal.h"
#include "ft/cachetable/checkpoint.h"
#include "ft/logger/log-internal.h"
#include "util/rwlock.h"
#include "util/scoped_malloc.h"
#include "util/status.h"
#include "util/context.h"
///////////////////////////////////////////////////////////////////////////////////
// Engine status
@ -127,7 +129,7 @@ static CACHETABLE_STATUS_S ct_status;
// Note, toku_cachetable_get_status() is below, after declaration of cachetable.
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(ct_status, k, c, t, "cachetable: " l, inc)
#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(ct_status, k, c, t, "cachetable: " l, inc)
static void
status_init(void) {
@ -144,6 +146,7 @@ status_init(void) {
STATUS_INIT(CT_SIZE_LEAF, CACHETABLE_SIZE_LEAF, UINT64, "size leaf", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_SIZE_ROLLBACK, CACHETABLE_SIZE_ROLLBACK, UINT64, "size rollback", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_SIZE_CACHEPRESSURE, CACHETABLE_SIZE_CACHEPRESSURE, UINT64, "size cachepressure", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_SIZE_CLONED, CACHETABLE_SIZE_CLONED, UINT64, "size currently cloned data for checkpoint", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_EVICTIONS, CACHETABLE_EVICTIONS, UINT64, "evictions", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_CLEANER_EXECUTIONS, CACHETABLE_CLEANER_EXECUTIONS, UINT64, "cleaner executions", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(CT_CLEANER_PERIOD, CACHETABLE_CLEANER_PERIOD, UINT64, "cleaner period", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
@ -370,7 +373,7 @@ toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir) {
// What cachefile goes with particular iname (iname relative to env)?
// The transaction that is adding the reference might not have a reference
// to the brt, therefore the cachefile might be closing.
// to the ft, therefore the cachefile might be closing.
// If closing, we want to return that it is not there, but must wait till after
// the close has finished.
// Once the close has finished, there must not be a cachefile with that name
@ -380,7 +383,7 @@ int toku_cachefile_of_iname_in_env (CACHETABLE ct, const char *iname_in_env, CAC
}
// What cachefile goes with particular fd?
// This function can only be called if the brt is still open, so file must
// This function can only be called if the ft is still open, so file must
// still be open
int toku_cachefile_of_filenum (CACHETABLE ct, FILENUM filenum, CACHEFILE *cf) {
return ct->cf_list.cachefile_of_filenum(filenum, cf);
@ -642,7 +645,7 @@ static void cachetable_free_pair(PAIR p) {
cachetable_evictions++;
PAIR_ATTR new_attr = p->attr;
// Note that flush_callback is called with write_me false, so the only purpose of this
// call is to tell the brt layer to evict the node (keep_me is false).
// call is to tell the ft layer to evict the node (keep_me is false).
// Also, because we have already removed the PAIR from the cachetable in
// cachetable_remove_pair, we cannot pass in p->cachefile and p->cachefile->fd
// for the first two parameters, as these may be invalid (#5171), so, we
@ -704,7 +707,7 @@ static void cachetable_only_write_locked_data(
p->disk_data = disk_data;
if (is_clone) {
p->cloned_value_data = NULL;
ev->remove_from_size_current(p->cloned_value_size);
ev->remove_cloned_data_size(p->cloned_value_size);
p->cloned_value_size = 0;
}
}
@ -949,7 +952,7 @@ clone_pair(evictor* ev, PAIR p) {
ev->change_pair_attr(old_attr, new_attr);
}
p->cloned_value_size = clone_size;
ev->add_to_size_current(p->cloned_value_size);
ev->add_cloned_data_size(p->cloned_value_size);
}
static void checkpoint_cloned_pair(void* extra) {
@ -1302,8 +1305,6 @@ void toku_cachetable_pf_pinned_pair(
pair_unlock(p);
}
// NOW A TEST ONLY FUNCTION!!!
int toku_cachetable_get_and_pin (
CACHEFILE cachefile,
CACHEKEY key,
@ -1573,7 +1574,7 @@ exit:
return try_again;
}
int toku_cachetable_get_and_pin_with_dep_pairs_batched (
int toku_cachetable_get_and_pin_with_dep_pairs (
CACHEFILE cachefile,
CACHEKEY key,
uint32_t fullhash,
@ -1589,7 +1590,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
PAIR* dependent_pairs,
enum cachetable_dirty* dependent_dirty // array stating dirty/cleanness of dependent pairs
)
// See cachetable.h
// See cachetable/cachetable.h
{
CACHETABLE ct = cachefile->cachetable;
bool wait = false;
@ -1766,43 +1767,6 @@ got_value:
return 0;
}
int toku_cachetable_get_and_pin_with_dep_pairs (
CACHEFILE cachefile,
CACHEKEY key,
uint32_t fullhash,
void**value,
long *sizep,
CACHETABLE_WRITE_CALLBACK write_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
pair_lock_type lock_type,
void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
PAIR* dependent_pairs,
enum cachetable_dirty* dependent_dirty // array stating dirty/cleanness of dependent pairs
)
// See cachetable.h
{
int r = toku_cachetable_get_and_pin_with_dep_pairs_batched(
cachefile,
key,
fullhash,
value,
sizep,
write_callback,
fetch_callback,
pf_req_callback,
pf_callback,
lock_type,
read_extraargs,
num_dependent_pairs,
dependent_pairs,
dependent_dirty
);
return r;
}
// Lookup a key in the cachetable. If it is found and it is not being written, then
// acquire a read lock on the pair, update the LRU list, and return success.
//
@ -2048,7 +2012,7 @@ maybe_pin_pair(
return retval;
}
int toku_cachetable_get_and_pin_nonblocking_batched(
int toku_cachetable_get_and_pin_nonblocking(
CACHEFILE cf,
CACHEKEY key,
uint32_t fullhash,
@ -2062,7 +2026,7 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
void *read_extraargs,
UNLOCKERS unlockers
)
// See cachetable.h.
// See cachetable/cachetable.h.
{
CACHETABLE ct = cf->cachetable;
assert(lock_type == PL_READ ||
@ -2200,40 +2164,6 @@ try_again:
abort();
}
int toku_cachetable_get_and_pin_nonblocking (
CACHEFILE cf,
CACHEKEY key,
uint32_t fullhash,
void**value,
long* sizep,
CACHETABLE_WRITE_CALLBACK write_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
pair_lock_type lock_type,
void *read_extraargs,
UNLOCKERS unlockers
)
// See cachetable.h.
{
int r = 0;
r = toku_cachetable_get_and_pin_nonblocking_batched(
cf,
key,
fullhash,
value,
sizep,
write_callback,
fetch_callback,
pf_req_callback,
pf_callback,
lock_type,
read_extraargs,
unlockers
);
return r;
}
struct cachefile_prefetch_args {
PAIR p;
CACHETABLE_FETCH_CALLBACK fetch_callback;
@ -2279,7 +2209,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
void *read_extraargs,
bool *doing_prefetch)
// Effect: See the documentation for this function in cachetable.h
// Effect: See the documentation for this function in cachetable/cachetable.h
{
int r = 0;
PAIR p = NULL;
@ -2582,6 +2512,11 @@ toku_cachetable_minicron_shutdown(CACHETABLE ct) {
ct->cl.destroy();
}
void toku_cachetable_prepare_close(CACHETABLE ct UU()) {
extern bool toku_serialize_in_parallel;
toku_serialize_in_parallel = true;
}
/* Requires that it all be flushed. */
void toku_cachetable_close (CACHETABLE *ctp) {
CACHETABLE ct = *ctp;
@ -3708,6 +3643,7 @@ int evictor::init(long _size_limit, pair_list* _pl, cachefile_list* _cf_list, KI
m_size_reserved = unreservable_memory(_size_limit);
m_size_current = 0;
m_size_cloned_data = 0;
m_size_evicting = 0;
m_size_nonleaf = create_partitioned_counter();
@ -3842,6 +3778,22 @@ void evictor::remove_from_size_current(long size) {
(void) toku_sync_fetch_and_sub(&m_size_current, size);
}
//
// Adds the size of cloned data to necessary variables in the evictor
//
void evictor::add_cloned_data_size(long size) {
(void) toku_sync_fetch_and_add(&m_size_cloned_data, size);
add_to_size_current(size);
}
//
// Removes the size of cloned data to necessary variables in the evictor
//
void evictor::remove_cloned_data_size(long size) {
(void) toku_sync_fetch_and_sub(&m_size_cloned_data, size);
remove_from_size_current(size);
}
//
// TODO: (Zardosht) comment this function
//
@ -4406,6 +4358,7 @@ void evictor::fill_engine_status() {
STATUS_VALUE(CT_SIZE_LEAF) = read_partitioned_counter(m_size_leaf);
STATUS_VALUE(CT_SIZE_ROLLBACK) = read_partitioned_counter(m_size_rollback);
STATUS_VALUE(CT_SIZE_CACHEPRESSURE) = read_partitioned_counter(m_size_cachepressure);
STATUS_VALUE(CT_SIZE_CLONED) = m_size_cloned_data;
STATUS_VALUE(CT_WAIT_PRESSURE_COUNT) = read_partitioned_counter(m_wait_pressure_count);
STATUS_VALUE(CT_WAIT_PRESSURE_TIME) = read_partitioned_counter(m_wait_pressure_time);
STATUS_VALUE(CT_LONG_WAIT_PRESSURE_COUNT) = read_partitioned_counter(m_long_wait_pressure_count);
@ -4487,43 +4440,48 @@ void checkpointer::increment_num_txns() {
m_checkpoint_num_txns++;
}
struct iterate_begin_checkpoint {
LSN lsn_of_checkpoint_in_progress;
iterate_begin_checkpoint(LSN lsn) : lsn_of_checkpoint_in_progress(lsn) { }
static int fn(const CACHEFILE &cf, const uint32_t UU(idx), struct iterate_begin_checkpoint *info) {
assert(cf->begin_checkpoint_userdata);
if (cf->for_checkpoint) {
cf->begin_checkpoint_userdata(info->lsn_of_checkpoint_in_progress, cf->userdata);
}
return 0;
}
};
//
// Update the user data in any cachefiles in our checkpoint list.
//
void checkpointer::update_cachefiles() {
CACHEFILE cf;
for(cf = m_cf_list->m_active_head; cf; cf=cf->next) {
assert(cf->begin_checkpoint_userdata);
if (cf->for_checkpoint) {
cf->begin_checkpoint_userdata(m_lsn_of_checkpoint_in_progress,
cf->userdata);
}
}
struct iterate_begin_checkpoint iterate(m_lsn_of_checkpoint_in_progress);
int r = m_cf_list->m_active_fileid.iterate<struct iterate_begin_checkpoint,
iterate_begin_checkpoint::fn>(&iterate);
assert_zero(r);
}
struct iterate_note_pin {
static int fn(const CACHEFILE &cf, uint32_t UU(idx), void **UU(extra)) {
assert(cf->note_pin_by_checkpoint);
cf->note_pin_by_checkpoint(cf, cf->userdata);
cf->for_checkpoint = true;
return 0;
}
};
//
// Sets up and kicks off a checkpoint.
//
void checkpointer::begin_checkpoint() {
// 1. Initialize the accountability counters.
m_checkpoint_num_files = 0;
m_checkpoint_num_txns = 0;
// 2. Make list of cachefiles to be included in the checkpoint.
// TODO: <CER> How do we remove the non-lock cachetable reference here?
m_cf_list->read_lock();
for (CACHEFILE cf = m_cf_list->m_active_head; cf; cf = cf->next) {
// The caller must serialize open, close, and begin checkpoint.
// So we should never see a closing cachefile here.
// <CER> Is there an assert we can add here?
// Putting this check here so that this method may be called
// by cachetable tests.
assert(cf->note_pin_by_checkpoint);
cf->note_pin_by_checkpoint(cf, cf->userdata);
cf->for_checkpoint = true;
m_checkpoint_num_files++;
}
m_cf_list->m_active_fileid.iterate<void *, iterate_note_pin::fn>(nullptr);
m_checkpoint_num_files = m_cf_list->m_active_fileid.size();
m_cf_list->read_unlock();
// 3. Create log entries for this checkpoint.
@ -4548,6 +4506,14 @@ void checkpointer::begin_checkpoint() {
m_list->write_pending_exp_unlock();
}
struct iterate_log_fassociate {
static int fn(const CACHEFILE &cf, uint32_t UU(idx), void **UU(extra)) {
assert(cf->log_fassociate_during_checkpoint);
cf->log_fassociate_during_checkpoint(cf, cf->userdata);
return 0;
}
};
//
// Assuming the logger exists, this will write out the following
// information to the log.
@ -4571,10 +4537,7 @@ void checkpointer::log_begin_checkpoint() {
m_lsn_of_checkpoint_in_progress = begin_lsn;
// Log the list of open dictionaries.
for (CACHEFILE cf = m_cf_list->m_active_head; cf; cf = cf->next) {
assert(cf->log_fassociate_during_checkpoint);
cf->log_fassociate_during_checkpoint(cf, cf->userdata);
}
m_cf_list->m_active_fileid.iterate<void *, iterate_log_fassociate::fn>(nullptr);
// Write open transactions to the log.
r = toku_txn_manager_iter_over_live_txns(
@ -4632,7 +4595,8 @@ void checkpointer::remove_background_job() {
}
void checkpointer::end_checkpoint(void (*testcallback_f)(void*), void* testextra) {
CACHEFILE *XMALLOC_N(m_checkpoint_num_files, checkpoint_cfs);
toku::scoped_malloc checkpoint_cfs_buf(m_checkpoint_num_files * sizeof(CACHEFILE));
CACHEFILE *checkpoint_cfs = reinterpret_cast<CACHEFILE *>(checkpoint_cfs_buf.get());
this->fill_checkpoint_cfs(checkpoint_cfs);
this->checkpoint_pending_pairs();
@ -4644,22 +4608,33 @@ void checkpointer::end_checkpoint(void (*testcallback_f)(void*), void* testextr
this->log_end_checkpoint();
this->end_checkpoint_userdata(checkpoint_cfs);
//Delete list of cachefiles in the checkpoint,
// Delete list of cachefiles in the checkpoint,
this->remove_cachefiles(checkpoint_cfs);
toku_free(checkpoint_cfs);
}
void checkpointer::fill_checkpoint_cfs(CACHEFILE* checkpoint_cfs) {
m_cf_list->read_lock();
uint32_t curr_index = 0;
for (CACHEFILE cf = m_cf_list->m_active_head; cf; cf = cf->next) {
if (cf->for_checkpoint) {
assert(curr_index < m_checkpoint_num_files);
checkpoint_cfs[curr_index] = cf;
curr_index++;
}
struct iterate_checkpoint_cfs {
CACHEFILE *checkpoint_cfs;
uint32_t checkpoint_num_files;
uint32_t curr_index;
iterate_checkpoint_cfs(CACHEFILE *cfs, uint32_t num_files) :
checkpoint_cfs(cfs), checkpoint_num_files(num_files), curr_index(0) {
}
assert(curr_index == m_checkpoint_num_files);
static int fn(const CACHEFILE &cf, uint32_t UU(idx), struct iterate_checkpoint_cfs *info) {
if (cf->for_checkpoint) {
assert(info->curr_index < info->checkpoint_num_files);
info->checkpoint_cfs[info->curr_index] = cf;
info->curr_index++;
}
return 0;
}
};
void checkpointer::fill_checkpoint_cfs(CACHEFILE* checkpoint_cfs) {
struct iterate_checkpoint_cfs iterate(checkpoint_cfs, m_checkpoint_num_files);
m_cf_list->read_lock();
m_cf_list->m_active_fileid.iterate<struct iterate_checkpoint_cfs, iterate_checkpoint_cfs::fn>(&iterate);
assert(iterate.curr_index == m_checkpoint_num_files);
m_cf_list->read_unlock();
}
@ -4744,19 +4719,18 @@ void checkpointer::remove_cachefiles(CACHEFILE* checkpoint_cfs) {
static_assert(std::is_pod<cachefile_list>::value, "cachefile_list isn't POD");
void cachefile_list::init() {
m_active_head = NULL;
m_stale_head = NULL;
m_stale_tail = NULL;
m_next_filenum_to_use.fileid = 0;
m_next_hash_id_to_use = 0;
toku_pthread_rwlock_init(&m_lock, NULL);
m_active_filenum.create();
m_active_fileid.create();
m_stale_fileid.create();
}
void cachefile_list::destroy() {
m_active_filenum.destroy();
m_active_fileid.destroy();
m_stale_fileid.destroy();
toku_pthread_rwlock_destroy(&m_lock);
}
@ -4775,34 +4749,31 @@ void cachefile_list::write_lock() {
void cachefile_list::write_unlock() {
toku_pthread_rwlock_wrunlock(&m_lock);
}
int cachefile_list::cachefile_of_iname_in_env(const char *iname_in_env, CACHEFILE *cf) {
read_lock();
CACHEFILE extant;
int r;
r = ENOENT;
for (extant = m_active_head; extant; extant = extant->next) {
if (extant->fname_in_env &&
!strcmp(extant->fname_in_env, iname_in_env)) {
*cf = extant;
r = 0;
break;
}
}
read_unlock();
return r;
}
int cachefile_list::cachefile_of_filenum(FILENUM filenum, CACHEFILE *cf) {
read_lock();
CACHEFILE extant;
int r = ENOENT;
*cf = NULL;
for (extant = m_active_head; extant; extant = extant->next) {
if (extant->filenum.fileid==filenum.fileid) {
*cf = extant;
r = 0;
break;
struct iterate_find_iname {
const char *iname_in_env;
CACHEFILE found_cf;
iterate_find_iname(const char *iname) : iname_in_env(iname), found_cf(nullptr) { }
static int fn(const CACHEFILE &cf, uint32_t UU(idx), struct iterate_find_iname *info) {
if (cf->fname_in_env && strcmp(cf->fname_in_env, info->iname_in_env) == 0) {
info->found_cf = cf;
return -1;
}
return 0;
}
};
int cachefile_list::cachefile_of_iname_in_env(const char *iname_in_env, CACHEFILE *cf) {
struct iterate_find_iname iterate(iname_in_env);
read_lock();
int r = m_active_fileid.iterate<iterate_find_iname, iterate_find_iname::fn>(&iterate);
if (iterate.found_cf != nullptr) {
assert(strcmp(iterate.found_cf->fname_in_env, iname_in_env) == 0);
*cf = iterate.found_cf;
r = 0;
} else {
r = ENOENT;
}
read_unlock();
return r;
@ -4819,20 +4790,23 @@ static int cachefile_find_by_filenum(const CACHEFILE &a_cf, const FILENUM &b) {
}
}
int cachefile_list::cachefile_of_filenum(FILENUM filenum, CACHEFILE *cf) {
read_lock();
int r = m_active_filenum.find_zero<FILENUM, cachefile_find_by_filenum>(filenum, cf, nullptr);
if (r == DB_NOTFOUND) {
r = ENOENT;
} else {
invariant_zero(r);
}
read_unlock();
return r;
}
static int cachefile_find_by_fileid(const CACHEFILE &a_cf, const struct fileid &b) {
return toku_fileid_cmp(a_cf->fileid, b);
}
void cachefile_list::add_cf_unlocked(CACHEFILE cf) {
invariant(cf->next == NULL);
invariant(cf->prev == NULL);
cf->next = m_active_head;
cf->prev = NULL;
if (m_active_head) {
m_active_head->prev = cf;
}
m_active_head = cf;
int r;
r = m_active_filenum.insert<FILENUM, cachefile_find_by_filenum>(cf, cf->filenum, nullptr);
assert_zero(r);
@ -4842,36 +4816,13 @@ void cachefile_list::add_cf_unlocked(CACHEFILE cf) {
void cachefile_list::add_stale_cf(CACHEFILE cf) {
write_lock();
invariant(cf->next == NULL);
invariant(cf->prev == NULL);
cf->next = m_stale_head;
cf->prev = NULL;
if (m_stale_head) {
m_stale_head->prev = cf;
}
m_stale_head = cf;
if (m_stale_tail == NULL) {
m_stale_tail = cf;
}
int r = m_stale_fileid.insert<struct fileid, cachefile_find_by_fileid>(cf, cf->fileid, nullptr);
assert_zero(r);
write_unlock();
}
void cachefile_list::remove_cf(CACHEFILE cf) {
write_lock();
invariant(m_active_head != NULL);
if (cf->next) {
cf->next->prev = cf->prev;
}
if (cf->prev) {
cf->prev->next = cf->next;
}
if (cf == m_active_head) {
invariant(cf->prev == NULL);
m_active_head = cf->next;
}
cf->prev = NULL;
cf->next = NULL;
uint32_t idx;
int r;
@ -4889,24 +4840,12 @@ void cachefile_list::remove_cf(CACHEFILE cf) {
}
void cachefile_list::remove_stale_cf_unlocked(CACHEFILE cf) {
invariant(m_stale_head != NULL);
invariant(m_stale_tail != NULL);
if (cf->next) {
cf->next->prev = cf->prev;
}
if (cf->prev) {
cf->prev->next = cf->next;
}
if (cf == m_stale_head) {
invariant(cf->prev == NULL);
m_stale_head = cf->next;
}
if (cf == m_stale_tail) {
invariant(cf->next == NULL);
m_stale_tail = cf->prev;
}
cf->prev = NULL;
cf->next = NULL;
uint32_t idx;
int r;
r = m_stale_fileid.find_zero<struct fileid, cachefile_find_by_fileid>(cf->fileid, nullptr, &idx);
assert_zero(r);
r = m_stale_fileid.delete_at(idx);
assert_zero(r);
}
FILENUM cachefile_list::reserve_filenum() {
@ -4922,11 +4861,6 @@ FILENUM cachefile_list::reserve_filenum() {
break;
}
FILENUM filenum = m_next_filenum_to_use;
#if TOKU_DEBUG_PARANOID
for (CACHEFILE extant = m_active_head; extant; extant = extant->next) {
assert(filenum.fileid != extant->filenum.fileid);
}
#endif
m_next_filenum_to_use.fileid++;
write_unlock();
return filenum;
@ -4938,91 +4872,77 @@ uint32_t cachefile_list::get_new_hash_id_unlocked() {
return retval;
}
CACHEFILE cachefile_list::find_cachefile_in_list_unlocked(
CACHEFILE start,
struct fileid* fileid
)
{
CACHEFILE retval = NULL;
for (CACHEFILE extant = start; extant; extant = extant->next) {
if (toku_fileids_are_equal(&extant->fileid, fileid)) {
// Clients must serialize cachefile open, close, and unlink
// So, during open, we should never see a closing cachefile
// or one that has been marked as unlink on close.
assert(!extant->unlink_on_close);
retval = extant;
goto exit;
}
}
exit:
return retval;
}
CACHEFILE cachefile_list::find_cachefile_unlocked(struct fileid* fileid) {
CACHEFILE cf = nullptr;
int r = m_active_fileid.find_zero<struct fileid, cachefile_find_by_fileid>(*fileid, &cf, nullptr);
if (r == 0) {
assert(!cf->unlink_on_close);
}
#if TOKU_DEBUG_PARANOID
assert(cf == find_cachefile_in_list_unlocked(m_active_head, fileid));
#endif
return cf;
}
CACHEFILE cachefile_list::find_stale_cachefile_unlocked(struct fileid* fileid) {
return find_cachefile_in_list_unlocked(m_stale_head, fileid);
CACHEFILE cf = nullptr;
int r = m_stale_fileid.find_zero<struct fileid, cachefile_find_by_fileid>(*fileid, &cf, nullptr);
if (r == 0) {
assert(!cf->unlink_on_close);
}
return cf;
}
void cachefile_list::verify_unused_filenum(FILENUM filenum) {
int r = m_active_filenum.find_zero<FILENUM, cachefile_find_by_filenum>(filenum, nullptr, nullptr);
assert(r == DB_NOTFOUND);
#if TOKU_DEBUG_PARANOID
for (CACHEFILE extant = m_active_head; extant; extant = extant->next) {
invariant(extant->filenum.fileid != filenum.fileid);
}
#endif
}
// returns true if some eviction ran, false otherwise
bool cachefile_list::evict_some_stale_pair(evictor* ev) {
PAIR p = NULL;
CACHEFILE cf_to_destroy = NULL;
write_lock();
if (m_stale_tail == NULL) {
if (m_stale_fileid.size() == 0) {
write_unlock();
return false;
}
p = m_stale_tail->cf_head;
CACHEFILE stale_cf = nullptr;
int r = m_stale_fileid.fetch(0, &stale_cf);
assert_zero(r);
// we should not have a cf in the stale list
// that does not have any pairs
PAIR p = stale_cf->cf_head;
paranoid_invariant(p != NULL);
evict_pair_from_cachefile(p);
// now that we have evicted something,
// let's check if the cachefile is needed anymore
if (m_stale_tail->cf_head == NULL) {
cf_to_destroy = m_stale_tail;
remove_stale_cf_unlocked(m_stale_tail);
//
// it is not needed if the latest eviction caused
// the cf_head for that cf to become null
bool destroy_cf = stale_cf->cf_head == nullptr;
if (destroy_cf) {
remove_stale_cf_unlocked(stale_cf);
}
write_unlock();
ev->remove_pair_attr(p->attr);
cachetable_free_pair(p);
if (cf_to_destroy) {
cachefile_destroy(cf_to_destroy);
if (destroy_cf) {
cachefile_destroy(stale_cf);
}
return true;
}
void cachefile_list::free_stale_data(evictor* ev) {
write_lock();
while (m_stale_tail != NULL) {
PAIR p = m_stale_tail->cf_head;
while (m_stale_fileid.size() != 0) {
CACHEFILE stale_cf = nullptr;
int r = m_stale_fileid.fetch(0, &stale_cf);
assert_zero(r);
// we should not have a cf in the stale list
// that does not have any pairs
PAIR p = stale_cf->cf_head;
paranoid_invariant(p != NULL);
evict_pair_from_cachefile(p);
@ -5031,10 +4951,9 @@ void cachefile_list::free_stale_data(evictor* ev) {
// now that we have evicted something,
// let's check if the cachefile is needed anymore
if (m_stale_tail->cf_head == NULL) {
CACHEFILE cf_to_destroy = m_stale_tail;
remove_stale_cf_unlocked(m_stale_tail);
cachefile_destroy(cf_to_destroy);
if (stale_cf->cf_head == NULL) {
remove_stale_cf_unlocked(stale_cf);
cachefile_destroy(stale_cf);
}
}
write_unlock();
@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef CACHETABLE_H
#define CACHETABLE_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,12 +87,17 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <fcntl.h>
#include "fttypes.h"
#include "minicron.h"
#include "ft/logger/logger.h"
#include "ft/serialize/block_table.h"
#include "ft/txn/txn.h"
#include "util/minicron.h"
// Maintain a cache mapping from cachekeys to values (void*)
// Some of the keys can be pinned. Don't pin too many or for too long.
@ -111,6 +114,42 @@ PATENT RIGHTS GRANT:
typedef BLOCKNUM CACHEKEY;
class checkpointer;
typedef class checkpointer *CHECKPOINTER;
typedef struct cachetable *CACHETABLE;
typedef struct cachefile *CACHEFILE;
typedef struct ctpair *PAIR;
// This struct holds information about values stored in the cachetable.
// As one can tell from the field names, it arguably violates an
// abstraction layer by naming specific node types.
//
// The purpose of having this struct is to have a way for the
// cachetable to accumulate some totals we are interested in.
// Breaking this abstraction layer by having these names was the
// easiest way.
//
typedef struct pair_attr_s {
long size; // size PAIR's value takes in memory
long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status
long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status
long rollback_size; // size if PAIR is a rollback node, 0 otherwise, used only for engine status
long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts
bool is_valid;
} PAIR_ATTR;
static inline PAIR_ATTR make_pair_attr(long size) {
PAIR_ATTR result={
.size = size,
.nonleaf_size = 0,
.leaf_size = 0,
.rollback_size = 0,
.cache_pressure_size = 0,
.is_valid = true
};
return result;
}
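As a small illustration of the struct and helper above (a sketch, not from the source): make_pair_attr() records only the overall size and marks the attribute valid, so callers that know the node type are expected to fill the per-type fields themselves. The 4 KiB figure and the leaf assignment below are hypothetical.

    // Hypothetical example: attribute for a freshly loaded 4 KiB leaf value.
    PAIR_ATTR attr = make_pair_attr(4096);  // size = 4096, other fields zeroed, is_valid = true
    attr.leaf_size = 4096;                  // assumption: the value is a leaf node
    attr.cache_pressure_size = 0;           // no buffered work contributes to cache pressure yet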
void toku_set_cleaner_period (CACHETABLE ct, uint32_t new_period);
uint32_t toku_get_cleaner_period_unlocked (CACHETABLE ct);
void toku_set_cleaner_iterations (CACHETABLE ct, uint32_t new_iterations);
@ -122,7 +161,7 @@ uint32_t toku_get_cleaner_iterations_unlocked (CACHETABLE ct);
// create and initialize a cache table
// size_limit is the upper limit on the size of the values in the table
// pass 0 if you want the default
int toku_cachetable_create(CACHETABLE *result, long size_limit, LSN initial_lsn, TOKULOGGER);
int toku_cachetable_create(CACHETABLE *result, long size_limit, LSN initial_lsn, struct tokulogger *logger);
// Create a new cachetable.
// Effects: a new cachetable is created and initialized.
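A minimal usage sketch based on the declaration above; ZERO_LSN, the NULL logger, and the error handling are assumptions, not code from this tree.

    CACHETABLE ct = NULL;
    int r = toku_cachetable_create(&ct, 0 /* 0 = default size limit */, ZERO_LSN, NULL /* no logger */);
    assert_zero(r);
    // ... use the cachetable ...
    toku_cachetable_close(&ct);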
@ -147,15 +186,20 @@ int toku_cachefile_of_iname_in_env (CACHETABLE ct, const char *iname_in_env, CAC
// Return the filename
char *toku_cachefile_fname_in_cwd (CACHEFILE cf);
void toku_cachetable_begin_checkpoint (CHECKPOINTER cp, TOKULOGGER);
void toku_cachetable_begin_checkpoint (CHECKPOINTER cp, struct tokulogger *logger);
void toku_cachetable_end_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
void toku_cachetable_end_checkpoint(CHECKPOINTER cp, struct tokulogger *logger,
void (*testcallback_f)(void*), void * testextra);
// Shuts down checkpoint thread
// Requires no locks be held that are taken by the checkpoint function
void toku_cachetable_minicron_shutdown(CACHETABLE ct);
// Prepare to close the cachetable. This informs the cachetable that it is about to be closed
// so that it can tune its checkpoint resource use.
void toku_cachetable_prepare_close(CACHETABLE ct);
// Close the cachetable.
// Effects: All of the memory objects are flushed to disk, and the cachetable is destroyed.
void toku_cachetable_close(CACHETABLE *ct);
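One plausible shutdown ordering implied by the comments above (a sketch under the assumption that ct was created earlier and that the final checkpoint is taken between prepare_close and close):

    toku_cachetable_minicron_shutdown(ct);  // stop the checkpoint/cleaner threads first
    toku_cachetable_prepare_close(ct);      // let the cachetable tune its checkpoint resource use
    // ... take the final checkpoint here ...
    toku_cachetable_close(&ct);             // flush everything and destroy the cachetable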
@ -344,28 +388,6 @@ void toku_cachetable_put(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
// then the required PAIRs are written to disk for checkpoint.
// KEY PROPERTY OF DEPENDENT PAIRS: They are already locked by the client
// Returns: 0 if the memory object is in memory, otherwise an error number.
// Rationale:
// begin_batched_pin and end_batched_pin take and release a read lock on the pair list.
// Normally, that would be done within this get_and_pin, but we want to pin multiple nodes with a single acquisition of the read lock.
int toku_cachetable_get_and_pin_with_dep_pairs_batched (
CACHEFILE cachefile,
CACHEKEY key,
uint32_t fullhash,
void**value,
long *sizep,
CACHETABLE_WRITE_CALLBACK write_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
pair_lock_type lock_type,
void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
PAIR* dependent_pairs,
enum cachetable_dirty* dependent_dirty // array stating dirty/cleanness of dependent pairs
);
// Effect: call toku_cachetable_get_and_pin_with_dep_pairs_batched once,
// wrapped in begin_batched_pin and end_batched_pin calls.
int toku_cachetable_get_and_pin_with_dep_pairs (
CACHEFILE cachefile,
CACHEKEY key,
@ -383,7 +405,6 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
enum cachetable_dirty* dependent_dirty // array stating dirty/cleanness of dependent pairs
);
// Get and pin a memory object.
// Effects: If the memory object is in the cachetable acquire the PAIR lock on it.
// Otherwise, fetch it from storage by calling the fetch callback. If the fetch
@ -417,15 +438,13 @@ struct unlockers {
bool locked;
void (*f)(void* extra);
void *extra;
UNLOCKERS next;
struct unlockers *next;
};
typedef struct unlockers *UNLOCKERS;
// Effect: If the block is in the cachetable, then return it.
// Otherwise call the functions in unlockers, fetch the data (but don't pin it, since we'll just end up pinning it again later), and return TOKUDB_TRY_AGAIN.
// Rationale:
// begin_batched_pin and end_batched_pin take and release a read lock on the pair list.
// Normally, that would be done within this get_and_pin, but we want to pin multiple nodes with a single acquisition of the read lock.
int toku_cachetable_get_and_pin_nonblocking_batched (
int toku_cachetable_get_and_pin_nonblocking (
CACHEFILE cf,
CACHEKEY key,
uint32_t fullhash,
@ -440,23 +459,6 @@ int toku_cachetable_get_and_pin_nonblocking_batched (
UNLOCKERS unlockers
);
// Effect: call toku_cachetable_get_and_pin_nonblocking_batched once,
// wrapped in begin_batched_pin and end_batched_pin calls.
int toku_cachetable_get_and_pin_nonblocking (
CACHEFILE cf,
CACHEKEY key,
uint32_t fullhash,
void**value,
long *sizep,
CACHETABLE_WRITE_CALLBACK write_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)),
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)),
pair_lock_type lock_type,
void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
UNLOCKERS unlockers
);
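A sketch of the retry idiom described above; every name here (cf, key, fullhash, the callbacks, read_extraargs, unlockers, value, size) is a placeholder assumed to be set up by the caller, and PL_READ is assumed to be one of the pair_lock_type values.

    int r;
    do {
        r = toku_cachetable_get_and_pin_nonblocking(cf, key, fullhash,
                                                    &value, &size,
                                                    write_callback, fetch_callback,
                                                    pf_req_callback, pf_callback,
                                                    PL_READ,         // assumed pair_lock_type value
                                                    read_extraargs,
                                                    unlockers);
        // On TOKUDB_TRY_AGAIN the unlockers have already released the caller's
        // locks, so they must be reacquired before retrying.
    } while (r == TOKUDB_TRY_AGAIN);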
int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, pair_lock_type, void**);
// Effect: Maybe get and pin a memory object.
// This function is similar to the get_and_pin function except that it
@ -549,15 +551,15 @@ void toku_cachefile_unlink_on_close(CACHEFILE cf);
bool toku_cachefile_is_unlink_on_close(CACHEFILE cf);
// Return the logger associated with the cachefile
TOKULOGGER toku_cachefile_logger (CACHEFILE);
struct tokulogger *toku_cachefile_logger(CACHEFILE cf);
// Return the filenum associated with the cachefile
FILENUM toku_cachefile_filenum (CACHEFILE);
FILENUM toku_cachefile_filenum(CACHEFILE cf);
// Effect: Return a 32-bit hash key. The hash key shall be suitable for using with bitmasking for a table of size power-of-two.
uint32_t toku_cachetable_hash (CACHEFILE cachefile, CACHEKEY key);
uint32_t toku_cachetable_hash(CACHEFILE cf, CACHEKEY key);
uint32_t toku_cachefile_fullhash_of_header (CACHEFILE cachefile);
uint32_t toku_cachefile_fullhash_of_header(CACHEFILE cf);
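A one-line illustration of the power-of-two property mentioned above; num_buckets is a hypothetical table size.

    uint32_t fullhash = toku_cachetable_hash(cf, key);
    uint32_t bucket = fullhash & (num_buckets - 1);  // valid index only when num_buckets is a power of two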
// debug functions
@ -599,6 +601,7 @@ typedef enum {
CT_SIZE_LEAF, // number of bytes in cachetable belonging to leaf nodes
CT_SIZE_ROLLBACK, // number of bytes in cachetable belonging to rollback nodes
CT_SIZE_CACHEPRESSURE, // number of bytes causing cache pressure (sum of buffers and workdone counters)
CT_SIZE_CLONED, // number of bytes of cloned data in the system
CT_EVICTIONS,
CT_CLEANER_EXECUTIONS, // number of times the cleaner thread's loop has executed
CT_CLEANER_PERIOD,
@ -644,5 +647,3 @@ void toku_pair_list_set_lock_size(uint32_t num_locks);
// layer.
__attribute__((const,nonnull))
bool toku_ctpair_is_write_locked(PAIR pair);
#endif /* CACHETABLE_H */

View File

@ -28,7 +28,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -126,17 +126,18 @@ PATENT RIGHTS GRANT:
*
*****/
#include <toku_portability.h>
#include <time.h>
#include "fttypes.h"
#include "cachetable.h"
#include "log-internal.h"
#include "logger.h"
#include "checkpoint.h"
#include <portability/toku_atomic.h>
#include <util/status.h>
#include <util/frwlock.h>
#include "portability/toku_portability.h"
#include "portability/toku_atomic.h"
#include "ft/cachetable/cachetable.h"
#include "ft/cachetable/checkpoint.h"
#include "ft/ft.h"
#include "ft/logger/log-internal.h"
#include "ft/logger/recover.h"
#include "util/frwlock.h"
#include "util/status.h"
///////////////////////////////////////////////////////////////////////////////////
// Engine status
@ -146,7 +147,7 @@ PATENT RIGHTS GRANT:
static CHECKPOINT_STATUS_S cp_status;
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(cp_status, k, c, t, "checkpoint: " l, inc)
#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(cp_status, k, c, t, "checkpoint: " l, inc)
static void
status_init(void) {

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_CHECKPOINT_H
#define TOKU_CHECKPOINT_H
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,17 +86,19 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2009-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#ident "$Id$"
#include "cachetable.h"
#include <stdint.h>
void toku_set_checkpoint_period(CACHETABLE ct, uint32_t new_period);
#include "ft/cachetable/cachetable.h"
//Effect: Change [end checkpoint (n) - begin checkpoint (n+1)] delay to
// new_period seconds. 0 means disable.
void toku_set_checkpoint_period(CACHETABLE ct, uint32_t new_period);
uint32_t toku_get_checkpoint_period_unlocked(CACHETABLE ct);
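For example (a sketch; the 60-second period is arbitrary):

    toku_set_checkpoint_period(ct, 60);                     // checkpoint every 60 seconds
    assert(toku_get_checkpoint_period_unlocked(ct) == 60);
    toku_set_checkpoint_period(ct, 0);                      // disable periodic checkpoints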
@ -160,13 +160,11 @@ typedef enum {SCHEDULED_CHECKPOINT = 0, // "normal" checkpoint taken on check
// Callbacks are called during checkpoint procedure while checkpoint_safe lock is still held.
// Callbacks are primarily intended for use in testing.
// caller_id identifies why the checkpoint is being taken.
int toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
void (*callback_f)(void*), void * extra,
void (*callback2_f)(void*), void * extra2,
int toku_checkpoint(CHECKPOINTER cp, struct tokulogger *logger,
void (*callback_f)(void *extra), void *extra,
void (*callback2_f)(void *extra2), void *extra2,
checkpoint_caller_t caller_id);
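A hedged example of a direct call: both test callbacks are left NULL, and CLIENT_CHECKPOINT is assumed to be one of the checkpoint_caller_t values from the (truncated) enum above.

    int r = toku_checkpoint(cp, logger,
                            NULL, NULL,   // no callback_f
                            NULL, NULL,   // no callback2_f
                            CLIENT_CHECKPOINT);
    assert_zero(r);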
/******
* These functions are called from the ydb level.
* They return status information and have no side effects.
@ -200,6 +198,3 @@ typedef struct {
} CHECKPOINT_STATUS_S, *CHECKPOINT_STATUS;
void toku_checkpoint_get_status(CACHETABLE ct, CHECKPOINT_STATUS stat);
#endif

View File

@ -1,236 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <zlib.h>
#include <openssl/md2.h>
#include <openssl/md4.h>
#include <openssl/md5.h>
const unsigned int prime = 2000000011;
unsigned int karprabin (unsigned char *datac, int N) {
assert(N%4==0);
unsigned int *data=(unsigned int*)datac;
N=N/4;
int i;
unsigned int result=0;
for (i=0; i<N; i++) {
result=(result*prime)+data[i];
}
return result;
}
// According to
// P. L'Ecuyer, "Tables of Linear Congruential Generators of
// Different Sizes and Good Lattice Structure", Mathematics of
// Computation 68:225, 249--260 (1999).
// m=2^{32}-5 a=1588635695 is good.
const unsigned int mkr = 4294967291U;
const unsigned int akr = 1588635695U;
// But this is slower
unsigned int karprabinP (unsigned char *datac, int N) {
assert(N%4==0);
unsigned int *data=(unsigned int*)datac;
N=N/4;
int i;
unsigned long long result=0;
for (i=0; i<N; i++) {
result=((result*akr)+data[i])%mkr;
}
return result;
}
float tdiff (struct timeval *start, struct timeval *end) {
return (end->tv_sec-start->tv_sec) +1e-6*(end->tv_usec - start->tv_usec);
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
struct timeval start, end;
const int N=2<<20;
unsigned char *data=malloc(N);
int i;
assert(data);
for (i=0; i<N; i++) data[i]=random();
// adler32
{
uLong a32 = adler32(0L, Z_NULL, 0);
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
a32 = adler32(a32, data, N);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("adler32=%lu, time=%9.6fs %9.6fns/b\n", a32, tm, 1e9*tm/N);
}
}
// crc32
{
uLong c32 = crc32(0L, Z_NULL, 0);
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
c32 = crc32(c32, data, N);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("crc32=%lu, time=%9.6fs %9.6fns/b\n", c32, tm, 1e9*tm/N);
}
}
// MD2
{
unsigned char buf[MD2_DIGEST_LENGTH];
int j;
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
MD2(data, N, buf);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("md2=");
for (j=0; j<MD2_DIGEST_LENGTH; j++) {
printf("%02x", buf[j]);
}
printf(" time=%9.6fs %9.6fns/b\n", tm, 1e9*tm/N);
}
}
// MD4
{
unsigned char buf[MD4_DIGEST_LENGTH];
int j;
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
MD4(data, N, buf);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("md4=");
for (j=0; j<MD4_DIGEST_LENGTH; j++) {
printf("%02x", buf[j]);
}
printf(" time=%9.6fs %9.6fns/b\n", tm, 1e9*tm/N);
}
}
// MD5
{
unsigned char buf[MD5_DIGEST_LENGTH];
int j;
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
MD5(data, N, buf);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("md5=");
for (j=0; j<MD5_DIGEST_LENGTH; j++) {
printf("%02x", buf[j]);
}
printf(" time=%9.6fs %9.6fns/b\n", tm, 1e9*tm/N);
}
}
// karp rabin
{
for (i=0; i<3; i++) {
gettimeofday(&start, 0);
unsigned int kr = karprabin(data, N);
gettimeofday(&end, 0);
float tm = tdiff(&start, &end);
printf("kr=%ud time=%9.6fs %9.6fns/b\n", kr, tm, 1e9*tm/N);
}
}
free(data);
return 0;
}

View File

@ -28,7 +28,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -85,47 +85,105 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#pragma once
#include <db.h>
#include <string.h>
#include <ft/ybt.h>
#include <ft/fttypes.h>
#include "portability/memory.h"
#include "util/dbt.h"
typedef int (*ft_compare_func)(DB *db, const DBT *a, const DBT *b);
int toku_keycompare(const void *key1, uint32_t key1len, const void *key2, uint32_t key2len);
int toku_builtin_compare_fun (DB *, const DBT *, const DBT*) __attribute__((__visibility__("default")));
namespace toku {
// a comparator object encapsulates the data necessary for
// comparing two keys in a fractal tree. it further understands
// that points may be positive or negative infinity.
// a comparator object encapsulates the data necessary for
// comparing two keys in a fractal tree. it further understands
// that points may be positive or negative infinity.
class comparator {
public:
void set_descriptor(DESCRIPTOR desc) {
m_fake_db.cmp_descriptor = desc;
}
void create(ft_compare_func cmp, DESCRIPTOR desc) {
m_cmp = cmp;
memset(&m_fake_db, 0, sizeof(m_fake_db));
m_fake_db.cmp_descriptor = desc;
}
int compare(const DBT *a, const DBT *b) {
if (toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b)) {
return toku_dbt_infinite_compare(a, b);
} else {
return m_cmp(&m_fake_db, a, b);
class comparator {
void init(ft_compare_func cmp, DESCRIPTOR desc, uint8_t memcmp_magic) {
_cmp = cmp;
_fake_db->cmp_descriptor = desc;
_memcmp_magic = memcmp_magic;
}
}
private:
struct __toku_db m_fake_db;
ft_compare_func m_cmp;
};
public:
// This magic value is reserved to mean that the magic has not been set.
static const uint8_t MEMCMP_MAGIC_NONE = 0;
void create(ft_compare_func cmp, DESCRIPTOR desc, uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) {
XCALLOC(_fake_db);
init(cmp, desc, memcmp_magic);
}
// inherit the attributes of another comparator, but keep our own
// copy of fake_db that is owned separately from the one given.
void inherit(const comparator &cmp) {
invariant_notnull(_fake_db);
invariant_notnull(cmp._cmp);
invariant_notnull(cmp._fake_db);
init(cmp._cmp, cmp._fake_db->cmp_descriptor, cmp._memcmp_magic);
}
// like inherit, but doesn't require that this comparator
// was already created
void create_from(const comparator &cmp) {
XCALLOC(_fake_db);
inherit(cmp);
}
void destroy() {
toku_free(_fake_db);
}
const DESCRIPTOR_S *get_descriptor() const {
return _fake_db->cmp_descriptor;
}
ft_compare_func get_compare_func() const {
return _cmp;
}
uint8_t get_memcmp_magic() const {
return _memcmp_magic;
}
bool valid() const {
return _cmp != nullptr;
}
inline bool dbt_has_memcmp_magic(const DBT *dbt) const {
return *reinterpret_cast<const char *>(dbt->data) == _memcmp_magic;
}
int operator()(const DBT *a, const DBT *b) const {
if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b), 0)) {
return toku_dbt_infinite_compare(a, b);
} else if (_memcmp_magic != MEMCMP_MAGIC_NONE
// If `a' has the memcmp magic..
&& dbt_has_memcmp_magic(a)
// ..then we expect `b' to also have the memcmp magic
&& __builtin_expect(dbt_has_memcmp_magic(b), 1)) {
return toku_builtin_compare_fun(nullptr, a, b);
} else {
// yikes, const sadness here
return _cmp(const_cast<DB *>(_fake_db), a, b);
}
}
private:
DB *_fake_db;
ft_compare_func _cmp;
uint8_t _memcmp_magic;
};
} /* namespace toku */
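A self-contained sketch of the class above, using the built-in compare function declared earlier in this header and no descriptor; the keys are arbitrary.

    toku::comparator cmp;
    cmp.create(toku_builtin_compare_fun, nullptr /* no descriptor */);  // memcmp_magic defaults to MEMCMP_MAGIC_NONE

    DBT a, b;
    toku_fill_dbt(&a, "abc", 3);
    toku_fill_dbt(&b, "abd", 3);
    int c = cmp(&a, &b);  // negative: "abc" sorts before "abd" under the built-in ordering

    cmp.destroy();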

View File

@ -0,0 +1,505 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#include "ft/ft-internal.h"
#include "ft/cursor.h"
#include "ft/leafentry.h"
#include "ft/txn/txn.h"
#include "util/dbt.h"
int toku_ft_cursor_create(FT_HANDLE ft_handle, FT_CURSOR cursor, TOKUTXN ttxn,
bool is_snapshot_read,
bool disable_prefetching,
bool is_temporary) {
if (is_snapshot_read) {
invariant(ttxn != NULL);
int accepted = toku_txn_reads_txnid(ft_handle->ft->h->root_xid_that_created, ttxn);
if (accepted != TOKUDB_ACCEPT) {
invariant(accepted == 0);
return TOKUDB_MVCC_DICTIONARY_TOO_NEW;
}
}
memset(cursor, 0, sizeof(*cursor));
cursor->ft_handle = ft_handle;
cursor->ttxn = ttxn;
cursor->is_snapshot_read = is_snapshot_read;
cursor->disable_prefetching = disable_prefetching;
cursor->is_temporary = is_temporary;
return 0;
}
void toku_ft_cursor_destroy(FT_CURSOR cursor) {
toku_destroy_dbt(&cursor->key);
toku_destroy_dbt(&cursor->val);
toku_destroy_dbt(&cursor->range_lock_left_key);
toku_destroy_dbt(&cursor->range_lock_right_key);
}
// deprecated, should only be used by tests
int toku_ft_cursor(FT_HANDLE ft_handle, FT_CURSOR *cursorptr, TOKUTXN ttxn,
bool is_snapshot_read, bool disable_prefetching) {
FT_CURSOR XCALLOC(cursor);
int r = toku_ft_cursor_create(ft_handle, cursor, ttxn, is_snapshot_read, disable_prefetching, false);
if (r == 0) {
*cursorptr = cursor;
} else {
toku_free(cursor);
}
return r;
}
// deprecated, should only be used by tests
void toku_ft_cursor_close(FT_CURSOR cursor) {
toku_ft_cursor_destroy(cursor);
toku_free(cursor);
}
void toku_ft_cursor_remove_restriction(FT_CURSOR cursor) {
cursor->out_of_range_error = 0;
cursor->direction = 0;
}
void toku_ft_cursor_set_check_interrupt_cb(FT_CURSOR cursor, FT_CHECK_INTERRUPT_CALLBACK cb, void *extra) {
cursor->interrupt_cb = cb;
cursor->interrupt_cb_extra = extra;
}
void toku_ft_cursor_set_leaf_mode(FT_CURSOR cursor) {
cursor->is_leaf_mode = true;
}
int toku_ft_cursor_is_leaf_mode(FT_CURSOR cursor) {
return cursor->is_leaf_mode;
}
// TODO: Rename / cleanup - this has nothing to do with locking
void toku_ft_cursor_set_range_lock(FT_CURSOR cursor,
const DBT *left, const DBT *right,
bool left_is_neg_infty, bool right_is_pos_infty,
int out_of_range_error) {
// Destroy any existing keys and then clone the given left, right keys
toku_destroy_dbt(&cursor->range_lock_left_key);
if (left_is_neg_infty) {
cursor->left_is_neg_infty = true;
} else {
toku_clone_dbt(&cursor->range_lock_left_key, *left);
}
toku_destroy_dbt(&cursor->range_lock_right_key);
if (right_is_pos_infty) {
cursor->right_is_pos_infty = true;
} else {
toku_clone_dbt(&cursor->range_lock_right_key, *right);
}
// TOKUDB_FOUND_BUT_REJECTED is a DB_NOTFOUND with instructions to stop looking. (Faster)
cursor->out_of_range_error = out_of_range_error == DB_NOTFOUND ? TOKUDB_FOUND_BUT_REJECTED : out_of_range_error;
cursor->direction = 0;
}
void toku_ft_cursor_set_prefetching(FT_CURSOR cursor) {
cursor->prefetching = true;
}
bool toku_ft_cursor_prefetching(FT_CURSOR cursor) {
return cursor->prefetching;
}
// Return true if the cursor is uninitialized, false otherwise.
bool toku_ft_cursor_not_set(FT_CURSOR cursor) {
assert((cursor->key.data==NULL) == (cursor->val.data==NULL));
return (bool)(cursor->key.data == NULL);
}
struct ft_cursor_search_struct {
FT_GET_CALLBACK_FUNCTION getf;
void *getf_v;
FT_CURSOR cursor;
ft_search *search;
};
/* search for the first kv pair that matches the search object */
static int ft_cursor_search(FT_CURSOR cursor, ft_search *search,
FT_GET_CALLBACK_FUNCTION getf, void *getf_v, bool can_bulk_fetch) {
int r = toku_ft_search(cursor->ft_handle, search, getf, getf_v, cursor, can_bulk_fetch);
return r;
}
static inline int compare_k_x(FT_HANDLE ft_handle, const DBT *k, const DBT *x) {
return ft_handle->ft->cmp(k, x);
}
int toku_ft_cursor_compare_one(const ft_search &UU(search), const DBT *UU(x)) {
return 1;
}
static int ft_cursor_compare_set(const ft_search &search, const DBT *x) {
FT_HANDLE CAST_FROM_VOIDP(ft_handle, search.context);
return compare_k_x(ft_handle, search.k, x) <= 0; /* return min xy: kv <= xy */
}
static int
ft_cursor_current_getf(uint32_t keylen, const void *key,
uint32_t vallen, const void *val,
void *v, bool lock_only) {
struct ft_cursor_search_struct *CAST_FROM_VOIDP(bcss, v);
int r;
if (key==NULL) {
r = bcss->getf(0, NULL, 0, NULL, bcss->getf_v, lock_only);
} else {
FT_CURSOR cursor = bcss->cursor;
DBT newkey;
toku_fill_dbt(&newkey, key, keylen);
if (compare_k_x(cursor->ft_handle, &cursor->key, &newkey) != 0) {
r = bcss->getf(0, NULL, 0, NULL, bcss->getf_v, lock_only); // This was once DB_KEYEMPTY
if (r==0) r = TOKUDB_FOUND_BUT_REJECTED;
}
else
r = bcss->getf(keylen, key, vallen, val, bcss->getf_v, lock_only);
}
return r;
}
static int ft_cursor_compare_next(const ft_search &search, const DBT *x) {
FT_HANDLE CAST_FROM_VOIDP(ft_handle, search.context);
return compare_k_x(ft_handle, search.k, x) < 0; /* return min xy: kv < xy */
}
int toku_ft_cursor_current(FT_CURSOR cursor, int op, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
if (toku_ft_cursor_not_set(cursor)) {
return EINVAL;
}
cursor->direction = 0;
if (op == DB_CURRENT) {
struct ft_cursor_search_struct bcss = {getf, getf_v, cursor, 0};
ft_search search;
ft_search_init(&search, ft_cursor_compare_set, FT_SEARCH_LEFT, &cursor->key, nullptr, cursor->ft_handle);
int r = toku_ft_search(cursor->ft_handle, &search, ft_cursor_current_getf, &bcss, cursor, false);
ft_search_finish(&search);
return r;
}
return getf(cursor->key.size, cursor->key.data, cursor->val.size, cursor->val.data, getf_v, false); // ft_cursor_copyout(cursor, outkey, outval);
}
int toku_ft_cursor_first(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = 0;
ft_search search;
ft_search_init(&search, toku_ft_cursor_compare_one, FT_SEARCH_LEFT, nullptr, nullptr, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, false);
ft_search_finish(&search);
return r;
}
int toku_ft_cursor_last(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = 0;
ft_search search;
ft_search_init(&search, toku_ft_cursor_compare_one, FT_SEARCH_RIGHT, nullptr, nullptr, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, false);
ft_search_finish(&search);
return r;
}
int toku_ft_cursor_check_restricted_range(FT_CURSOR c, const void *key, uint32_t keylen) {
if (c->out_of_range_error) {
FT ft = c->ft_handle->ft;
DBT found_key;
toku_fill_dbt(&found_key, key, keylen);
if ((!c->left_is_neg_infty && c->direction <= 0 && ft->cmp(&found_key, &c->range_lock_left_key) < 0) ||
(!c->right_is_pos_infty && c->direction >= 0 && ft->cmp(&found_key, &c->range_lock_right_key) > 0)) {
invariant(c->out_of_range_error);
return c->out_of_range_error;
}
}
// Reset cursor direction to mitigate risk if some query type doesn't set the direction.
// It is always correct to check both bounds (which happens when direction==0) but it can be slower.
c->direction = 0;
return 0;
}
int toku_ft_cursor_shortcut(FT_CURSOR cursor, int direction, uint32_t index, bn_data *bd,
FT_GET_CALLBACK_FUNCTION getf, void *getf_v,
uint32_t *keylen, void **key, uint32_t *vallen, void **val) {
int r = 0;
// if we are searching towards the end, limit is last element
// if we are searching towards the beginning, limit is the first element
uint32_t limit = (direction > 0) ? (bd->num_klpairs() - 1) : 0;
//Starting with the prev, find the first real (non-provdel) leafentry.
while (index != limit) {
index += direction;
LEAFENTRY le;
void* foundkey = NULL;
uint32_t foundkeylen = 0;
r = bd->fetch_klpair(index, &le, &foundkeylen, &foundkey);
invariant_zero(r);
if (toku_ft_cursor_is_leaf_mode(cursor) || !le_val_is_del(le, cursor->is_snapshot_read, cursor->ttxn)) {
le_extract_val(
le,
toku_ft_cursor_is_leaf_mode(cursor),
cursor->is_snapshot_read,
cursor->ttxn,
vallen,
val
);
*key = foundkey;
*keylen = foundkeylen;
cursor->direction = direction;
r = toku_ft_cursor_check_restricted_range(cursor, *key, *keylen);
if (r!=0) {
paranoid_invariant(r == cursor->out_of_range_error);
// We already got at least one entry from the bulk fetch.
// Return 0 (instead of out of range error).
r = 0;
break;
}
r = getf(*keylen, *key, *vallen, *val, getf_v, false);
if (r == TOKUDB_CURSOR_CONTINUE) {
continue;
}
else {
break;
}
}
}
return r;
}
int toku_ft_cursor_next(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = +1;
ft_search search;
ft_search_init(&search, ft_cursor_compare_next, FT_SEARCH_LEFT, &cursor->key, nullptr, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, true);
ft_search_finish(&search);
if (r == 0) {
toku_ft_cursor_set_prefetching(cursor);
}
return r;
}
static int ft_cursor_search_eq_k_x_getf(uint32_t keylen, const void *key,
uint32_t vallen, const void *val,
void *v, bool lock_only) {
struct ft_cursor_search_struct *CAST_FROM_VOIDP(bcss, v);
int r;
if (key==NULL) {
r = bcss->getf(0, NULL, 0, NULL, bcss->getf_v, false);
} else {
FT_CURSOR cursor = bcss->cursor;
DBT newkey;
toku_fill_dbt(&newkey, key, keylen);
if (compare_k_x(cursor->ft_handle, bcss->search->k, &newkey) == 0) {
r = bcss->getf(keylen, key, vallen, val, bcss->getf_v, lock_only);
} else {
r = bcss->getf(0, NULL, 0, NULL, bcss->getf_v, lock_only);
if (r==0) r = TOKUDB_FOUND_BUT_REJECTED;
}
}
return r;
}
/* search for the kv pair that matches the search object and is equal to k */
static int ft_cursor_search_eq_k_x(FT_CURSOR cursor, ft_search *search, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
struct ft_cursor_search_struct bcss = {getf, getf_v, cursor, search};
int r = toku_ft_search(cursor->ft_handle, search, ft_cursor_search_eq_k_x_getf, &bcss, cursor, false);
return r;
}
static int ft_cursor_compare_prev(const ft_search &search, const DBT *x) {
FT_HANDLE CAST_FROM_VOIDP(ft_handle, search.context);
return compare_k_x(ft_handle, search.k, x) > 0; /* return max xy: kv > xy */
}
int toku_ft_cursor_prev(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = -1;
ft_search search;
ft_search_init(&search, ft_cursor_compare_prev, FT_SEARCH_RIGHT, &cursor->key, nullptr, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, true);
ft_search_finish(&search);
return r;
}
int toku_ft_cursor_compare_set_range(const ft_search &search, const DBT *x) {
FT_HANDLE CAST_FROM_VOIDP(ft_handle, search.context);
return compare_k_x(ft_handle, search.k, x) <= 0; /* return kv <= xy */
}
int toku_ft_cursor_set(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = 0;
ft_search search;
ft_search_init(&search, toku_ft_cursor_compare_set_range, FT_SEARCH_LEFT, key, nullptr, cursor->ft_handle);
int r = ft_cursor_search_eq_k_x(cursor, &search, getf, getf_v);
ft_search_finish(&search);
return r;
}
int toku_ft_cursor_set_range(FT_CURSOR cursor, DBT *key, DBT *key_bound, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = 0;
ft_search search;
ft_search_init(&search, toku_ft_cursor_compare_set_range, FT_SEARCH_LEFT, key, key_bound, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, false);
ft_search_finish(&search);
return r;
}
static int ft_cursor_compare_set_range_reverse(const ft_search &search, const DBT *x) {
FT_HANDLE CAST_FROM_VOIDP(ft_handle, search.context);
return compare_k_x(ft_handle, search.k, x) >= 0; /* return kv >= xy */
}
int toku_ft_cursor_set_range_reverse(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
cursor->direction = 0;
ft_search search;
ft_search_init(&search, ft_cursor_compare_set_range_reverse, FT_SEARCH_RIGHT, key, nullptr, cursor->ft_handle);
int r = ft_cursor_search(cursor, &search, getf, getf_v, false);
ft_search_finish(&search);
return r;
}
//TODO: When tests have been rewritten, get rid of this function.
//Only used by tests.
int toku_ft_cursor_get (FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v, int get_flags) {
int op = get_flags & DB_OPFLAGS_MASK;
if (get_flags & ~DB_OPFLAGS_MASK)
return EINVAL;
switch (op) {
case DB_CURRENT:
case DB_CURRENT_BINDING:
return toku_ft_cursor_current(cursor, op, getf, getf_v);
case DB_FIRST:
return toku_ft_cursor_first(cursor, getf, getf_v);
case DB_LAST:
return toku_ft_cursor_last(cursor, getf, getf_v);
case DB_NEXT:
if (toku_ft_cursor_not_set(cursor)) {
return toku_ft_cursor_first(cursor, getf, getf_v);
} else {
return toku_ft_cursor_next(cursor, getf, getf_v);
}
case DB_PREV:
if (toku_ft_cursor_not_set(cursor)) {
return toku_ft_cursor_last(cursor, getf, getf_v);
} else {
return toku_ft_cursor_prev(cursor, getf, getf_v);
}
case DB_SET:
return toku_ft_cursor_set(cursor, key, getf, getf_v);
case DB_SET_RANGE:
return toku_ft_cursor_set_range(cursor, key, nullptr, getf, getf_v);
default: ;// Fall through
}
return EINVAL;
}
void toku_ft_cursor_peek(FT_CURSOR cursor, const DBT **pkey, const DBT **pval) {
*pkey = &cursor->key;
*pval = &cursor->val;
}
bool toku_ft_cursor_uninitialized(FT_CURSOR c) {
return toku_ft_cursor_not_set(c);
}
int toku_ft_lookup(FT_HANDLE ft_handle, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) {
FT_CURSOR cursor;
int r = toku_ft_cursor(ft_handle, &cursor, NULL, false, false);
if (r != 0) {
return r;
}
r = toku_ft_cursor_set(cursor, k, getf, getf_v);
toku_ft_cursor_close(cursor);
return r;
}

View File

@ -1,6 +1,6 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -29,8 +29,8 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2014 Tokutek, Inc.
DISCLAIMER:
@ -86,12 +86,30 @@ PATENT RIGHTS GRANT:
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#pragma once
#ifndef FT_SEARCH_H
#define FT_SEARCH_H
#include <db.h>
#include "ft/ft-internal.h"
/* an ft cursor is represented as a kv pair in a tree */
struct ft_cursor {
FT_HANDLE ft_handle;
DBT key, val; // The key-value pair that the cursor currently points to
DBT range_lock_left_key, range_lock_right_key;
bool prefetching;
bool left_is_neg_infty, right_is_pos_infty;
bool is_snapshot_read; // true if query is read_committed, false otherwise
bool is_leaf_mode;
bool disable_prefetching;
bool is_temporary;
int out_of_range_error;
int direction;
TOKUTXN ttxn;
FT_CHECK_INTERRUPT_CALLBACK interrupt_cb;
void *interrupt_cb_extra;
};
typedef struct ft_cursor *FT_CURSOR;
enum ft_search_direction_e {
FT_SEARCH_LEFT = 1, /* search left -> right, finds min xy as defined by the compare function */
@ -109,7 +127,7 @@ typedef int (*ft_search_compare_func_t)(const struct ft_search &, const DBT *);
/* the search object contains the compare function, search direction, and the kv pair that
is used in the compare function. the context is the user's private data */
typedef struct ft_search {
struct ft_search {
ft_search_compare_func_t compare;
enum ft_search_direction_e direction;
const DBT *k;
@ -137,22 +155,83 @@ typedef struct ft_search {
// way out with a DB_NOTFOUND we ought to unpin those nodes. See #3528.
DBT pivot_bound;
const DBT *k_bound;
} ft_search_t;
};
/* initialize the search compare object */
static inline ft_search_t *ft_search_init(ft_search_t *so, ft_search_compare_func_t compare, enum ft_search_direction_e direction,
const DBT *k, const DBT *k_bound, void *context) {
so->compare = compare;
so->direction = direction;
so->k = k;
so->context = context;
toku_init_dbt(&so->pivot_bound);
so->k_bound = k_bound;
return so;
static inline ft_search *ft_search_init(ft_search *search, ft_search_compare_func_t compare,
enum ft_search_direction_e direction,
const DBT *k, const DBT *k_bound, void *context) {
search->compare = compare;
search->direction = direction;
search->k = k;
search->context = context;
toku_init_dbt(&search->pivot_bound);
search->k_bound = k_bound;
return search;
}
static inline void ft_search_finish(ft_search_t *so) {
toku_destroy_dbt(&so->pivot_bound);
static inline void ft_search_finish(ft_search *search) {
toku_destroy_dbt(&search->pivot_bound);
}
#endif
int toku_ft_cursor_create(FT_HANDLE ft_handle, FT_CURSOR cursor, TOKUTXN txn,
bool is_snapshot_read,
bool disable_prefetching,
bool is_temporary);
void toku_ft_cursor_destroy(FT_CURSOR cursor);
int toku_ft_lookup(FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
void toku_ft_cursor_set_prefetching(FT_CURSOR cursor);
bool toku_ft_cursor_prefetching(FT_CURSOR cursor);
bool toku_ft_cursor_not_set(FT_CURSOR cursor);
void toku_ft_cursor_set_leaf_mode(FT_CURSOR cursor);
void toku_ft_cursor_remove_restriction(FT_CURSOR cursor);
void toku_ft_cursor_set_check_interrupt_cb(FT_CURSOR cursor, FT_CHECK_INTERRUPT_CALLBACK cb, void *extra);
int toku_ft_cursor_is_leaf_mode(FT_CURSOR cursor);
void toku_ft_cursor_set_range_lock(FT_CURSOR, const DBT *, const DBT *, bool, bool, int);
int toku_ft_cursor_first(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_last(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_next(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_prev(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_current(FT_CURSOR cursor, int op, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set_range(FT_CURSOR cursor, DBT *key, DBT *key_bound, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set_range_reverse(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
bool toku_ft_cursor_uninitialized(FT_CURSOR cursor) __attribute__ ((warn_unused_result));
void toku_ft_cursor_peek(FT_CURSOR cursor, const DBT **pkey, const DBT **pval);
int toku_ft_cursor_check_restricted_range(FT_CURSOR cursor, const void *key, uint32_t keylen);
int toku_ft_cursor_shortcut(FT_CURSOR cursor, int direction, uint32_t index, bn_data *bd,
FT_GET_CALLBACK_FUNCTION getf, void *getf_v,
uint32_t *keylen, void **key, uint32_t *vallen, void **val);
// used by get_key_after_bytes
int toku_ft_cursor_compare_one(const ft_search &search, const DBT *x);
int toku_ft_cursor_compare_set_range(const ft_search &search, const DBT *x);
// deprecated, should only be used by tests, and eventually removed
int toku_ft_cursor(FT_HANDLE ft_handle, FT_CURSOR *ftcursor_p, TOKUTXN txn, bool, bool) __attribute__ ((warn_unused_result));
void toku_ft_cursor_close(FT_CURSOR cursor);
int toku_ft_cursor_get(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v, int get_flags);
int toku_ft_cursor_delete(FT_CURSOR cursor, int flags, TOKUTXN txn);
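A sketch of scanning an entire dictionary with the functions declared above; the getf signature mirrors the callbacks in cursor.cc, while count_pair, count_all, and ft_handle are hypothetical names.

    // Hypothetical getf callback: count one pair per non-NULL key.
    static int count_pair(uint32_t keylen, const void *key,
                          uint32_t vallen, const void *val,
                          void *extra, bool lock_only) {
        (void) keylen; (void) vallen; (void) val;
        if (!lock_only && key != NULL) {
            (*(uint64_t *) extra)++;
        }
        return 0;
    }

    // Hypothetical helper: walk every pair, front to back.
    static uint64_t count_all(FT_HANDLE ft_handle) {
        struct ft_cursor c;
        uint64_t n = 0;
        int r = toku_ft_cursor_create(ft_handle, &c, NULL /* no txn */,
                                      false /* is_snapshot_read */,
                                      false /* disable_prefetching */,
                                      false /* is_temporary */);
        assert_zero(r);
        r = toku_ft_cursor_first(&c, count_pair, &n);
        while (r == 0) {
            r = toku_ft_cursor_next(&c, count_pair, &n);  // DB_NOTFOUND ends the scan
        }
        toku_ft_cursor_destroy(&c);
        return n;
    }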

View File

@ -1,254 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fifo.h"
#include "xids.h"
#include "ybt.h"
#include <memory.h>
#include <toku_assert.h>
struct fifo {
int n_items_in_fifo;
char *memory; // An array of bytes into which fifo_entries are embedded.
int memory_size; // How big is fifo_memory
int memory_used; // How many bytes are in use?
};
const int fifo_initial_size = 4096;
static void fifo_init(struct fifo *fifo) {
fifo->n_items_in_fifo = 0;
fifo->memory = 0;
fifo->memory_size = 0;
fifo->memory_used = 0;
}
__attribute__((const,nonnull))
static int fifo_entry_size(struct fifo_entry *entry) {
return sizeof (struct fifo_entry) + entry->keylen + entry->vallen
+ xids_get_size(&entry->xids_s)
- sizeof(XIDS_S); //Prevent double counting from fifo_entry+xids_get_size
}
__attribute__((const,nonnull))
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd) {
// This must stay in sync with fifo_entry_size because that's what we
// really trust. But sometimes we only have an in-memory FT_MSG, not
// a serialized fifo_entry so we have to fake it.
return sizeof (struct fifo_entry) + cmd->u.id.key->size + cmd->u.id.val->size
+ xids_get_size(cmd->xids)
- sizeof(XIDS_S);
}
int toku_fifo_create(FIFO *ptr) {
struct fifo *XMALLOC(fifo);
if (fifo == 0) return ENOMEM;
fifo_init(fifo);
*ptr = fifo;
return 0;
}
void toku_fifo_resize(FIFO fifo, size_t new_size) {
XREALLOC_N(new_size, fifo->memory);
fifo->memory_size = new_size;
}
void toku_fifo_free(FIFO *ptr) {
FIFO fifo = *ptr;
if (fifo->memory) toku_free(fifo->memory);
fifo->memory=0;
toku_free(fifo);
*ptr = 0;
}
int toku_fifo_n_entries(FIFO fifo) {
return fifo->n_items_in_fifo;
}
static int next_power_of_two (int n) {
int r = 4096;
while (r < n) {
r*=2;
assert(r>0);
}
return r;
}
int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, int32_t *dest) {
int need_space_here = sizeof(struct fifo_entry)
+ keylen + datalen
+ xids_get_size(xids)
- sizeof(XIDS_S); //Prevent double counting
int need_space_total = fifo->memory_used+need_space_here;
if (fifo->memory == NULL || need_space_total > fifo->memory_size) {
// resize the fifo to the next power of 2 greater than the needed space
int next_2 = next_power_of_two(need_space_total);
toku_fifo_resize(fifo, next_2);
}
struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_used);
fifo_entry_set_msg_type(entry, type);
entry->msn = msn;
xids_cpy(&entry->xids_s, xids);
entry->is_fresh = is_fresh;
entry->keylen = keylen;
unsigned char *e_key = xids_get_end_of_array(&entry->xids_s);
memcpy(e_key, key, keylen);
entry->vallen = datalen;
memcpy(e_key + keylen, data, datalen);
if (dest) {
*dest = fifo->memory_used;
}
fifo->n_items_in_fifo++;
fifo->memory_used += need_space_here;
return 0;
}
int toku_fifo_iterate_internal_start(FIFO UU(fifo)) { return 0; }
int toku_fifo_iterate_internal_has_more(FIFO fifo, int off) { return off < fifo->memory_used; }
int toku_fifo_iterate_internal_next(FIFO fifo, int off) {
struct fifo_entry *e = (struct fifo_entry *)(fifo->memory + off);
return off + fifo_entry_size(e);
}
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off) {
return (struct fifo_entry *)(fifo->memory + off);
}
size_t toku_fifo_internal_entry_memsize(struct fifo_entry *e) {
return fifo_entry_size(e);
}
void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, void*), void *arg) {
FIFO_ITERATE(fifo,
key, keylen, data, datalen, type, msn, xids, is_fresh,
f(key,keylen,data,datalen,type,msn,xids,is_fresh, arg));
}
unsigned int toku_fifo_buffer_size_in_use (FIFO fifo) {
return fifo->memory_used;
}
unsigned long toku_fifo_memory_size_in_use(FIFO fifo) {
return sizeof(*fifo)+fifo->memory_used;
}
unsigned long toku_fifo_memory_footprint(FIFO fifo) {
size_t size_used = toku_memory_footprint(fifo->memory, fifo->memory_used);
long rval = sizeof(*fifo) + size_used;
return rval;
}
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry) {
return toku_fill_dbt(dbt, xids_get_end_of_array((XIDS) &entry->xids_s), entry->keylen);
}
struct fifo_entry *toku_fifo_get_entry(FIFO fifo, int off) {
return toku_fifo_iterate_internal_get_entry(fifo, off);
}
void toku_fifo_clone(FIFO orig_fifo, FIFO* cloned_fifo) {
struct fifo *XMALLOC(new_fifo);
assert(new_fifo);
new_fifo->n_items_in_fifo = orig_fifo->n_items_in_fifo;
new_fifo->memory_used = orig_fifo->memory_used;
new_fifo->memory_size = new_fifo->memory_used;
XMALLOC_N(new_fifo->memory_size, new_fifo->memory);
memcpy(
new_fifo->memory,
orig_fifo->memory,
new_fifo->memory_size
);
*cloned_fifo = new_fifo;
}
bool toku_are_fifos_same(FIFO fifo1, FIFO fifo2) {
return (
fifo1->memory_used == fifo2->memory_used &&
memcmp(fifo1->memory, fifo2->memory, fifo1->memory_used) == 0
);
}

View File

@ -1,193 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FIFO_H
#define FIFO_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fttypes.h"
#include "xids-internal.h"
#include "xids.h"
// If the fifo_entry is unpacked, the compiler aligns the xids array and we waste a lot of space
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
struct __attribute__((__packed__)) fifo_entry {
unsigned int keylen;
unsigned int vallen;
unsigned char type;
bool is_fresh;
MSN msn;
XIDS_S xids_s;
};
// get and set the brt message type for a fifo entry.
// it is internally stored as a single unsigned char.
static inline enum ft_msg_type
fifo_entry_get_msg_type(const struct fifo_entry * entry)
{
enum ft_msg_type msg_type;
msg_type = (enum ft_msg_type) entry->type;
return msg_type;
}
static inline void
fifo_entry_set_msg_type(struct fifo_entry * entry,
enum ft_msg_type msg_type)
{
unsigned char type = (unsigned char) msg_type;
entry->type = type;
}
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
typedef struct fifo *FIFO;
int toku_fifo_create(FIFO *);
void toku_fifo_resize(FIFO fifo, size_t new_size);
void toku_fifo_free(FIFO *);
int toku_fifo_n_entries(FIFO);
int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, int32_t *dest);
unsigned int toku_fifo_buffer_size_in_use (FIFO fifo);
unsigned long toku_fifo_memory_size_in_use(FIFO fifo); // return how much memory in the fifo holds useful data
unsigned long toku_fifo_memory_footprint(FIFO fifo); // return how much memory the fifo occupies
//These two are problematic, since I don't want to malloc() the bytevecs, but dequeueing the fifo frees the memory.
//int toku_fifo_peek_deq (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, uint32_t *type, TXNID *xid);
//int toku_fifo_peek_deq_cmdstruct (FIFO, FT_MSG, DBT*, DBT*); // fill in the FT_MSG, using the two DBTs for the DBT part.
void toku_fifo_iterate(FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, void*), void*);
#define FIFO_ITERATE(fifo,keyvar,keylenvar,datavar,datalenvar,typevar,msnvar,xidsvar,is_freshvar,body) ({ \
for (int fifo_iterate_off = toku_fifo_iterate_internal_start(fifo); \
toku_fifo_iterate_internal_has_more(fifo, fifo_iterate_off); \
fifo_iterate_off = toku_fifo_iterate_internal_next(fifo, fifo_iterate_off)) { \
struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \
ITEMLEN keylenvar = e->keylen; \
ITEMLEN datalenvar = e->vallen; \
enum ft_msg_type typevar = fifo_entry_get_msg_type(e); \
MSN msnvar = e->msn; \
XIDS xidsvar = &e->xids_s; \
bytevec keyvar = xids_get_end_of_array(xidsvar); \
bytevec datavar = (const uint8_t*)keyvar + e->keylen; \
bool is_freshvar = e->is_fresh; \
body; \
} })
#define FIFO_CURRENT_ENTRY_MEMSIZE toku_fifo_internal_entry_memsize(e)
// Internal functions for the iterator.
int toku_fifo_iterate_internal_start(FIFO fifo);
int toku_fifo_iterate_internal_has_more(FIFO fifo, int off);
int toku_fifo_iterate_internal_next(FIFO fifo, int off);
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off);
size_t toku_fifo_internal_entry_memsize(struct fifo_entry *e) __attribute__((const,nonnull));
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd) __attribute__((const,nonnull));
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry);
struct fifo_entry *toku_fifo_get_entry(FIFO fifo, int off);
void toku_fifo_clone(FIFO orig_fifo, FIFO* cloned_fifo);
bool toku_are_fifos_same(FIFO fifo1, FIFO fifo2);
#endif
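Aside: the packed fifo_entry layout above exists to avoid alignment padding before the 8-byte MSN field, as the comment in the deleted header notes. A minimal, self-contained C++ sketch (not TokuFT code; msn_t and xids_t below are stand-ins for MSN and XIDS_S, and the packed attribute assumes GCC/Clang) showing the size difference on a typical 64-bit target:

#include <cstdint>
#include <cstdio>

struct msn_t  { uint64_t msn; };      // stand-in for MSN (8-byte alignment)
struct xids_t { uint8_t num_xids; };  // stand-in for the variable-length XIDS_S

struct entry_default {                // same fields as fifo_entry, default alignment
    unsigned int keylen;
    unsigned int vallen;
    unsigned char type;
    bool is_fresh;
    msn_t msn;
    xids_t xids_s;
};

struct __attribute__((__packed__)) entry_packed {  // packed, as in fifo.h above
    unsigned int keylen;
    unsigned int vallen;
    unsigned char type;
    bool is_fresh;
    msn_t msn;
    xids_t xids_s;
};

int main() {
    // Typically prints "default: 32, packed: 19" on x86-64: the two 1-byte
    // fields force 6 bytes of padding before msn and 7 more at the end.
    std::printf("default: %zu, packed: %zu\n",
                sizeof(entry_default), sizeof(entry_packed));
    return 0;
}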

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,12 +89,13 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <ft-cachetable-wrappers.h>
#include "ft/serialize/block_table.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-flusher.h"
#include "ft/ft-internal.h"
#include "ft/ft.h"
#include "ft/node.h"
#include <fttypes.h>
#include <ft-flusher.h>
#include <ft-internal.h>
#include <ft.h>
#include <util/context.h>
static void
@ -103,23 +104,23 @@ ftnode_get_key_and_fullhash(
uint32_t* fullhash,
void* extra)
{
FT h = (FT) extra;
BLOCKNUM name;
toku_allocate_blocknum(h->blocktable, &name, h);
*cachekey = name;
*fullhash = toku_cachetable_hash(h->cf, name);
FT ft = (FT) extra;
BLOCKNUM blocknum;
ft->blocktable.allocate_blocknum(&blocknum, ft);
*cachekey = blocknum;
*fullhash = toku_cachetable_hash(ft->cf, blocknum);
}
void
cachetable_put_empty_node_with_dep_nodes(
FT h,
FT ft,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
BLOCKNUM* name, //output
BLOCKNUM* blocknum, //output
uint32_t* fullhash, //output
FTNODE* result)
{
FTNODE XMALLOC(new_node);
FTNODE XCALLOC(new_node);
PAIR dependent_pairs[num_dependent_nodes];
enum cachetable_dirty dependent_dirty_bits[num_dependent_nodes];
for (uint32_t i = 0; i < num_dependent_nodes; i++) {
@ -128,18 +129,18 @@ cachetable_put_empty_node_with_dep_nodes(
}
toku_cachetable_put_with_dep_pairs(
h->cf,
ft->cf,
ftnode_get_key_and_fullhash,
new_node,
make_pair_attr(sizeof(FTNODE)),
get_write_callbacks_for_node(h),
h,
get_write_callbacks_for_node(ft),
ft,
num_dependent_nodes,
dependent_pairs,
dependent_dirty_bits,
name,
blocknum,
fullhash,
toku_node_save_ct_pair);
toku_ftnode_save_ct_pair);
*result = new_node;
}
@ -153,13 +154,13 @@ create_new_ftnode_with_dep_nodes(
FTNODE* dependent_nodes)
{
uint32_t fullhash = 0;
BLOCKNUM name;
BLOCKNUM blocknum;
cachetable_put_empty_node_with_dep_nodes(
ft,
num_dependent_nodes,
dependent_nodes,
&name,
&blocknum,
&fullhash,
result);
@ -170,7 +171,7 @@ create_new_ftnode_with_dep_nodes(
toku_initialize_empty_ftnode(
*result,
name,
blocknum,
height,
n_children,
ft->h->layout_version,
@ -201,14 +202,14 @@ toku_create_new_ftnode (
// then a PL_WRITE_CHEAP lock is grabbed
//
int
toku_pin_ftnode_batched(
FT_HANDLE brt,
toku_pin_ftnode_for_query(
FT_HANDLE ft_handle,
BLOCKNUM blocknum,
uint32_t fullhash,
UNLOCKERS unlockers,
ANCESTORS ancestors,
const PIVOT_BOUNDS bounds,
FTNODE_FETCH_EXTRA bfe,
const pivot_bounds &bounds,
ftnode_fetch_extra *bfe,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p,
bool* msgs_applied)
@ -225,13 +226,13 @@ toku_pin_ftnode_batched(
paranoid_invariant(bfe->type == ftnode_fetch_subset);
}
int r = toku_cachetable_get_and_pin_nonblocking_batched(
brt->ft->cf,
int r = toku_cachetable_get_and_pin_nonblocking(
ft_handle->ft->cf,
blocknum,
fullhash,
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
get_write_callbacks_for_node(ft_handle->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
@ -245,7 +246,7 @@ toku_pin_ftnode_batched(
node = static_cast<FTNODE>(node_v);
if (apply_ancestor_messages && node->height == 0) {
needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
brt->ft,
ft_handle->ft,
node,
ancestors,
bounds,
@ -255,20 +256,20 @@ toku_pin_ftnode_batched(
if (needs_ancestors_messages) {
toku::context apply_messages_ctx(CTX_MESSAGE_APPLICATION);
toku_unpin_ftnode_read_only(brt->ft, node);
int rr = toku_cachetable_get_and_pin_nonblocking_batched(
brt->ft->cf,
blocknum,
fullhash,
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
PL_WRITE_CHEAP,
bfe, //read_extraargs
unlockers);
toku_unpin_ftnode_read_only(ft_handle->ft, node);
int rr = toku_cachetable_get_and_pin_nonblocking(
ft_handle->ft->cf,
blocknum,
fullhash,
&node_v,
NULL,
get_write_callbacks_for_node(ft_handle->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
PL_WRITE_CHEAP,
bfe, //read_extraargs
unlockers);
if (rr != 0) {
assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
r = TOKUDB_TRY_AGAIN;
@ -276,7 +277,7 @@ toku_pin_ftnode_batched(
}
node = static_cast<FTNODE>(node_v);
toku_apply_ancestors_messages_to_node(
brt,
ft_handle,
node,
ancestors,
bounds,
@ -317,54 +318,14 @@ exit:
}
void
toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
FT h,
toku_pin_ftnode_with_dep_nodes(
FT ft,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
ftnode_fetch_extra *bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p,
bool move_messages)
{
toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
h,
blocknum,
fullhash,
bfe,
lock_type,
num_dependent_nodes,
dependent_nodes,
node_p,
move_messages
);
}
void
toku_pin_ftnode_off_client_thread(
FT h,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p)
{
toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
h, blocknum, fullhash, bfe, lock_type, num_dependent_nodes, dependent_nodes, node_p, true);
}
void
toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
FT h,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *dependent_nodes,
FTNODE *node_p,
bool move_messages)
{
@ -376,13 +337,13 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
dependent_dirty_bits[i] = (enum cachetable_dirty) dependent_nodes[i]->dirty;
}
int r = toku_cachetable_get_and_pin_with_dep_pairs_batched(
h->cf,
int r = toku_cachetable_get_and_pin_with_dep_pairs(
ft->cf,
blocknum,
fullhash,
&node_v,
NULL,
get_write_callbacks_for_node(h),
get_write_callbacks_for_node(ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
@ -392,27 +353,22 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
dependent_pairs,
dependent_dirty_bits
);
assert(r==0);
invariant_zero(r);
FTNODE node = (FTNODE) node_v;
if ((lock_type != PL_READ) && node->height > 0 && move_messages) {
toku_move_ftnode_messages_to_stale(h, node);
if (lock_type != PL_READ && node->height > 0 && move_messages) {
toku_move_ftnode_messages_to_stale(ft, node);
}
*node_p = node;
}
void
toku_pin_ftnode_off_client_thread_batched(
FT h,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p)
{
toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
h, blocknum, fullhash, bfe, lock_type, num_dependent_nodes, dependent_nodes, node_p, true);
void toku_pin_ftnode(FT ft,
BLOCKNUM blocknum,
uint32_t fullhash,
ftnode_fetch_extra *bfe,
pair_lock_type lock_type,
FTNODE *node_p,
bool move_messages) {
toku_pin_ftnode_with_dep_nodes(ft, blocknum, fullhash, bfe, lock_type, 0, nullptr, node_p, move_messages);
}
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep) {
@ -429,24 +385,12 @@ cleanup:
return r;
}
void
toku_unpin_ftnode_off_client_thread(FT ft, FTNODE node)
{
int r = toku_cachetable_unpin(
ft->cf,
node->ct_pair,
(enum cachetable_dirty) node->dirty,
make_ftnode_pair_attr(node)
);
assert(r==0);
}
void
toku_unpin_ftnode(FT ft, FTNODE node)
{
// printf("%*sUnpin %ld\n", 8-node->height, "", node->thisnodename.b);
//VERIFY_NODE(brt,node);
toku_unpin_ftnode_off_client_thread(ft, node);
void toku_unpin_ftnode(FT ft, FTNODE node) {
int r = toku_cachetable_unpin(ft->cf,
node->ct_pair,
static_cast<enum cachetable_dirty>(node->dirty),
make_ftnode_pair_attr(node));
invariant_zero(r);
}
void
@ -460,3 +404,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node)
);
assert(r==0);
}
void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b)
// Effect: Swap the blocknum, fullhash, and PAIR for a and b
// Requires: Both nodes are pinned
{
BLOCKNUM tmp_blocknum = a->blocknum;
uint32_t tmp_fullhash = a->fullhash;
PAIR tmp_pair = a->ct_pair;
a->blocknum = b->blocknum;
a->fullhash = b->fullhash;
a->ct_pair = b->ct_pair;
b->blocknum = tmp_blocknum;
b->fullhash = tmp_fullhash;
b->ct_pair = tmp_pair;
// A and B swapped pair pointers, but we still have to swap
// the actual pair values (ie: the FTNODEs they represent)
// in the cachetable.
toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair);
}
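Aside: a standalone analogue (not TokuFT code) of the two-level swap performed by toku_ftnode_swap_pair_values above. The identity fields held by the nodes are swapped first, and then the cache's own mapping is fixed up so each key still resolves to the node it described; the Node struct and the unordered_map "cachetable" below are simplified stand-ins:

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <utility>

struct Node {
    int64_t  blocknum;   // stand-in for BLOCKNUM
    uint32_t fullhash;
    // payload omitted
};

int main() {
    Node a{1, 111}, b{2, 222};
    // stand-in "cachetable": blocknum -> node currently cached under that block
    std::unordered_map<int64_t, Node*> cache{{a.blocknum, &a}, {b.blocknum, &b}};

    // Step 1: swap the identifying fields stored on the nodes themselves.
    std::swap(a.blocknum, b.blocknum);
    std::swap(a.fullhash, b.fullhash);

    // Step 2: swap what the cache maps those identities to; without this the
    // cache would hand back the wrong node for each blocknum.
    std::swap(cache[a.blocknum], cache[b.blocknum]);

    assert(cache[a.blocknum] == &a && cache[b.blocknum] == &b);
    return 0;
}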

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_CACHETABLE_WRAPPERS_H
#define FT_CACHETABLE_WRAPPERS_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,11 +87,14 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <fttypes.h>
#include "cachetable.h"
#include "ft/cachetable/cachetable.h"
#include "ft/ft-internal.h"
#include "ft/node.h"
/**
* Put an empty node (that is, no fields filled) into the cachetable.
@ -102,7 +103,7 @@ PATENT RIGHTS GRANT:
*/
void
cachetable_put_empty_node_with_dep_nodes(
FT h,
FT ft,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
BLOCKNUM* name, //output
@ -117,7 +118,7 @@ cachetable_put_empty_node_with_dep_nodes(
*/
void
create_new_ftnode_with_dep_nodes(
FT h,
FT ft,
FTNODE *result,
int height,
int n_children,
@ -138,52 +139,42 @@ toku_create_new_ftnode (
int n_children
);
/**
* Batched version of toku_pin_ftnode, see cachetable batched API for more
* details.
*/
// This function returns a pinned ftnode to the caller.
int
toku_pin_ftnode_batched(
FT_HANDLE brt,
toku_pin_ftnode_for_query(
FT_HANDLE ft_h,
BLOCKNUM blocknum,
uint32_t fullhash,
UNLOCKERS unlockers,
ANCESTORS ancestors,
const PIVOT_BOUNDS pbounds,
FTNODE_FETCH_EXTRA bfe,
const pivot_bounds &bounds,
ftnode_fetch_extra *bfe,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p,
bool* msgs_applied
);
/**
* Unfortunately, this function is poorly named
* as over time, client threads have also started
* calling this function.
* This function returns a pinned ftnode to the caller.
* Unlike toku_pin_ftnode, this function blocks until the node is pinned.
*/
void
toku_pin_ftnode_off_client_thread(
FT h,
// Pins an ftnode without dependent pairs
void toku_pin_ftnode(
FT ft,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
ftnode_fetch_extra *bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p
FTNODE *node_p,
bool move_messages
);
void
toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
FT h,
// Pins an ftnode with dependent pairs
// Unlike toku_pin_ftnode_for_query, this function blocks until the node is pinned.
void toku_pin_ftnode_with_dep_nodes(
FT ft,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
ftnode_fetch_extra *bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *dependent_nodes,
FTNODE *node_p,
bool move_messages
);
@ -195,53 +186,10 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep);
/**
* Batched version of toku_pin_ftnode_off_client_thread, see cachetable
* batched API for more details.
* Effect: Unpin an ftnode.
*/
void
toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
FT h,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p,
bool move_messages
);
void toku_unpin_ftnode(FT ft, FTNODE node);
void toku_unpin_ftnode_read_only(FT ft, FTNODE node);
/**
* Batched version of toku_pin_ftnode_off_client_thread, see cachetable
* batched API for more details.
*/
void
toku_pin_ftnode_off_client_thread_batched(
FT h,
BLOCKNUM blocknum,
uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe,
pair_lock_type lock_type,
uint32_t num_dependent_nodes,
FTNODE* dependent_nodes,
FTNODE *node_p
);
/**
* Effect: Unpin a brt node. Used for
* nodes that were pinned off client thread.
*/
void
toku_unpin_ftnode_off_client_thread(FT h, FTNODE node);
/**
* Effect: Unpin a brt node.
* Used for nodes pinned on a client thread
*/
void
toku_unpin_ftnode(FT h, FTNODE node);
void
toku_unpin_ftnode_read_only(FT ft, FTNODE node);
#endif
// Effect: Swaps pair values of two pinned nodes
void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb);

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_FLUSHER_INTERNAL_H
#define FT_FLUSHER_INTERNAL_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,11 +86,11 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <fttypes.h>
#define flt_flush_before_applying_inbox 1
#define flt_flush_before_child_pin 2
#define ft_flush_aflter_child_pin 3
@ -115,7 +113,7 @@ typedef struct flusher_advice FLUSHER_ADVICE;
* Cleaner thread merging leaf nodes: follow down to a key
* Hot optimize table: follow down to the right of a key
*/
typedef int (*FA_PICK_CHILD)(FT h, FTNODE parent, void* extra);
typedef int (*FA_PICK_CHILD)(FT ft, FTNODE parent, void* extra);
/**
* Decide whether to call `toku_ft_flush_some_child` on the child if it is
@ -139,7 +137,7 @@ typedef bool (*FA_SHOULD_RECURSIVELY_FLUSH)(FTNODE child, void* extra);
* Hot optimize table: just do the merge
*/
typedef void (*FA_MAYBE_MERGE_CHILD)(struct flusher_advice *fa,
FT h,
FT ft,
FTNODE parent,
int childnum,
FTNODE child,
@ -172,7 +170,7 @@ typedef void (*FA_UPDATE_STATUS)(FTNODE child, int dirtied, void* extra);
* by `ft_split_child`. If -1 is returned, `ft_split_child` defaults to
* the old behavior.
*/
typedef int (*FA_PICK_CHILD_AFTER_SPLIT)(FT h,
typedef int (*FA_PICK_CHILD_AFTER_SPLIT)(FT ft,
FTNODE node,
int childnuma,
int childnumb,
@ -223,18 +221,16 @@ dont_destroy_basement_nodes(void* extra);
void
default_merge_child(struct flusher_advice *fa,
FT h,
FT ft,
FTNODE parent,
int childnum,
FTNODE child,
void* extra);
int
default_pick_child_after_split(FT h,
default_pick_child_after_split(FT ft,
FTNODE parent,
int childnuma,
int childnumb,
void *extra);
#endif // End of header guardian.
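Aside: the FA_* typedefs above form a strategy table that the flusher consults while walking down the tree. A minimal, self-contained sketch of the same callback-table pattern (the types, names, and logic below are simplified stand-ins, not TokuFT's):

#include <cstdio>

struct node { int height; int n_children; };  // stand-in for FTNODE internals

// Simplified analogues of FA_PICK_CHILD and FA_SHOULD_RECURSIVELY_FLUSH.
typedef int  (*pick_child_fn)(const node *parent, void *extra);
typedef bool (*recurse_fn)(const node *child, void *extra);

struct flusher_advice {       // analogous to struct flusher_advice
    pick_child_fn pick_child;
    recurse_fn    should_recurse;
    void         *extra;
};

static int pick_leftmost(const node *, void *) { return 0; }
static bool recurse_if_nonleaf(const node *child, void *) { return child->height > 0; }

static void flush_some_child(const node *parent, const flusher_advice *fa) {
    int childnum = fa->pick_child(parent, fa->extra);
    node child = {parent->height - 1, 4};            // pretend we pinned that child
    std::printf("flushing child %d\n", childnum);
    if (fa->should_recurse(&child, fa->extra)) {
        flush_some_child(&child, fa);                // keep flushing down the path
    }
}

int main() {
    flusher_advice fa = {pick_leftmost, recurse_if_nonleaf, nullptr};
    node root = {2, 16};
    flush_some_child(&root, &fa);
    return 0;
}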

File diff suppressed because it is too large

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_FLUSHER_H
#define FT_FLUSHER_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,11 +86,12 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// This must be first to make the 64-bit file mode work right in Linux
#include "fttypes.h"
#include "ft/ft-internal.h"
typedef enum {
FT_FLUSHER_CLEANER_TOTAL_NODES = 0, // total number of nodes whose buffers are potentially flushed by cleaner thread
@ -124,7 +123,7 @@ typedef enum {
FT_FLUSHER_SPLIT_NONLEAF, // number of nonleaf nodes split
FT_FLUSHER_MERGE_LEAF, // number of times leaf nodes are merged
FT_FLUSHER_MERGE_NONLEAF, // number of times nonleaf nodes are merged
FT_FLUSHER_BALANCE_LEAF, // number of times a leaf node is balanced inside brt
FT_FLUSHER_BALANCE_LEAF, // number of times a leaf node is balanced
FT_FLUSHER_STATUS_NUM_ROWS
} ft_flusher_status_entry;
@ -152,10 +151,31 @@ toku_flusher_thread_set_callback(
* Puts a workitem on the flusher thread queue, scheduling the node to be
* flushed by toku_ft_flush_some_child.
*/
void
toku_ft_flush_node_on_background_thread(
void toku_ft_flush_node_on_background_thread(FT ft, FTNODE parent);
enum split_mode {
SPLIT_EVENLY,
SPLIT_LEFT_HEAVY,
SPLIT_RIGHT_HEAVY
};
// Given pinned node and pinned child, split child into two
// and update node with information about its new child.
void toku_ft_split_child(
FT ft,
FTNODE parent
FTNODE node,
int childnum,
FTNODE child,
enum split_mode split_mode
);
// Given pinned node, merge childnum with a neighbor and update node with
// information about the change
void toku_ft_merge_child(
FT ft,
FTNODE node,
int childnum
);
/**
@ -166,9 +186,10 @@ toku_ft_flush_node_on_background_thread(
* nodea is the left node that results from the split
* splitk is the right-most key of nodea
*/
// TODO: Rename toku_ft_leaf_split
void
ftleaf_split(
FT h,
FT ft,
FTNODE node,
FTNODE *nodea,
FTNODE *nodeb,
@ -189,8 +210,9 @@ ftleaf_split(
* but it does not guarantee that the resulting nodes are smaller than nodesize.
*/
void
// TODO: Rename toku_ft_nonleaf_split
ft_nonleaf_split(
FT h,
FT ft,
FTNODE node,
FTNODE *nodea,
FTNODE *nodeb,
@ -199,8 +221,6 @@ ft_nonleaf_split(
FTNODE* dependent_nodes
);
/************************************************************************
* HOT optimize, should perhaps be factored out to its own header file *
************************************************************************
@ -230,8 +250,6 @@ void toku_ft_hot_get_status(FT_HOT_STATUS);
* we go until the end of the FT.
*/
int
toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
int (*progress_callback)(void *extra, float progress),
void *progress_extra, uint64_t* loops_run);
#endif // End of header guardian.
toku_ft_hot_optimize(FT_HANDLE ft_h, DBT* left, DBT* right,
int (*progress_callback)(void *extra, float progress),
void *progress_extra, uint64_t* loops_run);

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,14 +89,15 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <ft-flusher.h>
#include <ft-flusher-internal.h>
#include <ft-cachetable-wrappers.h>
#include <ft-internal.h>
#include <ft.h>
#include <portability/toku_atomic.h>
#include <util/status.h>
#include <util/context.h>
#include "ft/ft.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-flusher.h"
#include "ft/ft-flusher-internal.h"
#include "ft/ft-internal.h"
#include "ft/node.h"
#include "portability/toku_atomic.h"
#include "util/context.h"
#include "util/status.h"
// Member Description:
// 1. highest_pivot_key - this is the key that corresponds to the
@ -119,7 +120,7 @@ struct hot_flusher_extra {
static FT_HOT_STATUS_S hot_status;
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(hot_status, k, c, t, "hot: " l, inc)
#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(hot_status, k, c, t, "hot: " l, inc)
#define STATUS_VALUE(x) hot_status.status[x].value.num
@ -168,7 +169,7 @@ hot_set_start_key(struct hot_flusher_extra *flusher, const DBT* start)
}
static int
hot_just_pick_child(FT h,
hot_just_pick_child(FT ft,
FTNODE parent,
struct hot_flusher_extra *flusher)
{
@ -183,10 +184,7 @@ hot_just_pick_child(FT h,
childnum = 0;
} else {
// Find the pivot boundary.
childnum = toku_ftnode_hot_next_child(parent,
&flusher->highest_pivot_key,
&h->cmp_descriptor,
h->compare_fun);
childnum = toku_ftnode_hot_next_child(parent, &flusher->highest_pivot_key, ft->cmp);
}
return childnum;
@ -201,19 +199,19 @@ hot_update_flusher_keys(FTNODE parent,
// child node.
if (childnum < (parent->n_children - 1)) {
toku_destroy_dbt(&flusher->max_current_key);
toku_clone_dbt(&flusher->max_current_key, parent->childkeys[childnum]);
toku_clone_dbt(&flusher->max_current_key, parent->pivotkeys.get_pivot(childnum));
}
}
// Picks which child toku_ft_flush_some_child will use for flushing and
// recursion.
static int
hot_pick_child(FT h,
hot_pick_child(FT ft,
FTNODE parent,
void *extra)
{
struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
int childnum = hot_just_pick_child(h, parent, flusher);
int childnum = hot_just_pick_child(ft, parent, flusher);
// Now we determine the percentage of the tree flushed so far.
@ -243,14 +241,14 @@ hot_update_status(FTNODE UU(child),
// one to flush into. This gives it a chance to do that, and update the
// keys it maintains.
static int
hot_pick_child_after_split(FT h,
hot_pick_child_after_split(FT ft,
FTNODE parent,
int childnuma,
int childnumb,
void *extra)
{
struct hot_flusher_extra *flusher = (struct hot_flusher_extra *) extra;
int childnum = hot_just_pick_child(h, parent, flusher);
int childnum = hot_just_pick_child(ft, parent, flusher);
assert(childnum == childnuma || childnum == childnumb);
hot_update_flusher_keys(parent, childnum, flusher);
if (parent->height == 1) {
@ -298,9 +296,9 @@ hot_flusher_destroy(struct hot_flusher_extra *flusher)
// Entry point for Hot Optimize Table (HOT). Note, this function is
// not recursive. It iterates over root-to-leaf paths.
int
toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
int (*progress_callback)(void *extra, float progress),
void *progress_extra, uint64_t* loops_run)
toku_ft_hot_optimize(FT_HANDLE ft_handle, DBT* left, DBT* right,
int (*progress_callback)(void *extra, float progress),
void *progress_extra, uint64_t* loops_run)
{
toku::context flush_ctx(CTX_FLUSH);
@ -316,7 +314,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
// start of HOT operation
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_STARTED), 1);
toku_ft_note_hot_begin(brt);
toku_ft_note_hot_begin(ft_handle);
// Higher level logic prevents a dictionary from being deleted or
// truncated during a hot optimize operation. Doing so would violate
@ -329,18 +327,17 @@ toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
{
// Get root node (the first parent of each successive HOT
// call.)
toku_calculate_root_offset_pointer(brt->ft, &root_key, &fullhash);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft);
toku_pin_ftnode_off_client_thread(brt->ft,
(BLOCKNUM) root_key,
fullhash,
&bfe,
PL_WRITE_EXPENSIVE,
0,
NULL,
&root);
toku_assert_entire_node_in_memory(root);
toku_calculate_root_offset_pointer(ft_handle->ft, &root_key, &fullhash);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_handle->ft);
toku_pin_ftnode(ft_handle->ft,
(BLOCKNUM) root_key,
fullhash,
&bfe,
PL_WRITE_EXPENSIVE,
&root,
true);
toku_ftnode_assert_fully_in_memory(root);
}
// Prepare HOT diagnostics.
@ -365,12 +362,12 @@ toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
// This should recurse to the bottom of the tree and then
// return.
if (root->height > 0) {
toku_ft_flush_some_child(brt->ft, root, &advice);
toku_ft_flush_some_child(ft_handle->ft, root, &advice);
} else {
// Since there are no children to flush, we should abort
// the HOT call.
flusher.rightmost_leaf_seen = 1;
toku_unpin_ftnode_off_client_thread(brt->ft, root);
toku_unpin_ftnode(ft_handle->ft, root);
}
// Set the highest pivot key seen here, since the parent may
@ -386,8 +383,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
else if (right) {
// if we have flushed past the bounds set for us,
// set rightmost_leaf_seen so we exit
FAKE_DB(db, &brt->ft->cmp_descriptor);
int cmp = brt->ft->compare_fun(&db, &flusher.max_current_key, right);
int cmp = ft_handle->ft->cmp(&flusher.max_current_key, right);
if (cmp > 0) {
flusher.rightmost_leaf_seen = 1;
}
@ -417,7 +413,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, DBT* left, DBT* right,
if (r == 0) { success = true; }
{
toku_ft_note_hot_complete(brt, success, msn_at_start_of_hot);
toku_ft_note_hot_complete(ft_handle, success, msn_at_start_of_hot);
}
if (success) {

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_OPS_H
#define FT_OPS_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,33 +86,22 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// This must be first to make the 64-bit file mode work right in Linux
#define _FILE_OFFSET_BITS 64
#include "fttypes.h"
#include "ybt.h"
#include <db.h>
#include "cachetable.h"
#include "log.h"
#include "ft-search.h"
#include "compress.h"
// A callback function is invoked with the key, and the data.
// The pointers (to the bytevecs) must not be modified. The data must be copied out before the callback function returns.
// Note: In the thread-safe version, the brt node remains locked while the callback function runs. So return soon, and don't call the BRT code from the callback function.
// If the callback function returns a nonzero value (an error code), then that error code is returned from the get function itself.
// The cursor object will have been updated (so that if result==0 the current value is the value being passed)
// (If r!=0 then the cursor won't have been updated.)
// If r!=0, it's up to the callback function to return that value of r.
// A 'key' bytevec of NULL means that element is not found (effectively infinity or
// -infinity depending on direction)
// When lock_only is false, the callback does optional lock tree locking and then processes the key and val.
// When lock_only is true, the callback only does optional lock tree locking.
typedef int(*FT_GET_CALLBACK_FUNCTION)(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only);
#include "ft/cachetable/cachetable.h"
#include "ft/comparator.h"
#include "ft/msg.h"
#include "util/dbt.h"
typedef bool(*FT_CHECK_INTERRUPT_CALLBACK)(void* extra);
typedef struct ft_handle *FT_HANDLE;
int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *, int nodesize, int basementnodesize, enum toku_compression_method compression_method, CACHETABLE, TOKUTXN, int(*)(DB *,const DBT*,const DBT*)) __attribute__ ((warn_unused_result));
@ -125,7 +112,7 @@ int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *, int node
// ANY operations. to update the cmp descriptor after any operations have already happened, all handles
// and transactions must close and reopen before the change, then you can update the cmp descriptor
void toku_ft_change_descriptor(FT_HANDLE t, const DBT* old_descriptor, const DBT* new_descriptor, bool do_log, TOKUTXN txn, bool update_cmp_descriptor);
uint32_t toku_serialize_descriptor_size(const DESCRIPTOR desc);
uint32_t toku_serialize_descriptor_size(DESCRIPTOR desc);
void toku_ft_handle_create(FT_HANDLE *ft);
void toku_ft_set_flags(FT_HANDLE, unsigned int flags);
@ -139,11 +126,13 @@ void toku_ft_handle_set_compression_method(FT_HANDLE, enum toku_compression_meth
void toku_ft_handle_get_compression_method(FT_HANDLE, enum toku_compression_method *);
void toku_ft_handle_set_fanout(FT_HANDLE, unsigned int fanout);
void toku_ft_handle_get_fanout(FT_HANDLE, unsigned int *fanout);
int toku_ft_handle_set_memcmp_magic(FT_HANDLE, uint8_t magic);
void toku_ft_set_bt_compare(FT_HANDLE, ft_compare_func);
ft_compare_func toku_ft_get_bt_compare (FT_HANDLE brt);
void toku_ft_set_bt_compare(FT_HANDLE ft_handle, ft_compare_func cmp_func);
const toku::comparator &toku_ft_get_comparator(FT_HANDLE ft_handle);
void toku_ft_set_redirect_callback(FT_HANDLE brt, on_redirect_callback redir_cb, void* extra);
typedef void (*on_redirect_callback)(FT_HANDLE ft_handle, void *extra);
void toku_ft_set_redirect_callback(FT_HANDLE ft_handle, on_redirect_callback cb, void *extra);
// How updates (update/insert/deletes) work:
// There are two flavors of upsertdels: Singleton and broadcast.
@ -181,7 +170,10 @@ void toku_ft_set_redirect_callback(FT_HANDLE brt, on_redirect_callback redir_cb,
// Implementation note: Acquires a write lock on the entire database.
// This function works by sending a BROADCAST-UPDATE message containing
// the key and the extra.
void toku_ft_set_update(FT_HANDLE brt, ft_update_func update_fun);
typedef int (*ft_update_func)(DB *db, const DBT *key, const DBT *old_val, const DBT *extra,
void (*set_val)(const DBT *new_val, void *set_extra),
void *set_extra);
void toku_ft_set_update(FT_HANDLE ft_h, ft_update_func update_fun);
int toku_ft_handle_open(FT_HANDLE, const char *fname_in_env,
int is_create, int only_create, CACHETABLE ct, TOKUTXN txn) __attribute__ ((warn_unused_result));
@ -197,9 +189,17 @@ void toku_ft_handle_close(FT_HANDLE ft_handle);
// close an ft handle during recovery. the underlying ft must close, and will use the given lsn.
void toku_ft_handle_close_recovery(FT_HANDLE ft_handle, LSN oplsn);
// At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary.
// With the introduction of the loader (ticket 2216), it is possible for the file that holds
// an open dictionary to change, so these are now separate and independent unique identifiers (see FILENUM)
struct DICTIONARY_ID {
uint64_t dictid;
};
static const DICTIONARY_ID DICTIONARY_ID_NONE = { .dictid = 0 };
int
toku_ft_handle_open_with_dict_id(
FT_HANDLE t,
FT_HANDLE ft_h,
const char *fname_in_env,
int is_create,
int only_create,
@ -208,86 +208,57 @@ toku_ft_handle_open_with_dict_id(
DICTIONARY_ID use_dictionary_id
) __attribute__ ((warn_unused_result));
int toku_ft_lookup (FT_HANDLE brt, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
// Effect: Insert a key and data pair into an ft
void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn);
// Effect: Insert a key and data pair into a brt
void toku_ft_insert (FT_HANDLE brt, DBT *k, DBT *v, TOKUTXN txn);
// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists
int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging);
// Effect: Optimize the ft
void toku_ft_optimize (FT_HANDLE brt);
void toku_ft_optimize (FT_HANDLE ft_h);
// Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
void toku_ft_maybe_insert (FT_HANDLE brt, DBT *k, DBT *v, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type);
// Effect: Insert a key and data pair into an ft if the oplsn is newer than the ft's lsn. This function is called during recovery.
void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type);
// Effect: Send an update message into a brt. This function is called
// Effect: Send an update message into an ft. This function is called
// during recovery.
void toku_ft_maybe_update(FT_HANDLE brt, const DBT *key, const DBT *update_function_extra, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging);
void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_function_extra, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging);
// Effect: Send a broadcasting update message into a brt. This function
// Effect: Send a broadcasting update message into an ft. This function
// is called during recovery.
void toku_ft_maybe_update_broadcast(FT_HANDLE brt, const DBT *update_function_extra, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, bool is_resetting_op);
void toku_ft_maybe_update_broadcast(FT_HANDLE ft_h, const DBT *update_function_extra, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, bool is_resetting_op);
void toku_ft_load_recovery(TOKUTXN txn, FILENUM old_filenum, char const * new_iname, int do_fsync, int do_log, LSN *load_lsn);
void toku_ft_load(FT_HANDLE brt, TOKUTXN txn, char const * new_iname, int do_fsync, LSN *get_lsn);
void toku_ft_load(FT_HANDLE ft_h, TOKUTXN txn, char const * new_iname, int do_fsync, LSN *get_lsn);
void toku_ft_hot_index_recovery(TOKUTXN txn, FILENUMS filenums, int do_fsync, int do_log, LSN *hot_index_lsn);
void toku_ft_hot_index(FT_HANDLE brt, TOKUTXN txn, FILENUMS filenums, int do_fsync, LSN *lsn);
void toku_ft_hot_index(FT_HANDLE ft_h, TOKUTXN txn, FILENUMS filenums, int do_fsync, LSN *lsn);
void toku_ft_log_put_multiple (TOKUTXN txn, FT_HANDLE src_ft, FT_HANDLE *brts, uint32_t num_fts, const DBT *key, const DBT *val);
void toku_ft_log_put (TOKUTXN txn, FT_HANDLE brt, const DBT *key, const DBT *val);
void toku_ft_log_del_multiple (TOKUTXN txn, FT_HANDLE src_ft, FT_HANDLE *brts, uint32_t num_fts, const DBT *key, const DBT *val);
void toku_ft_log_del (TOKUTXN txn, FT_HANDLE brt, const DBT *key);
void toku_ft_log_put_multiple (TOKUTXN txn, FT_HANDLE src_ft, FT_HANDLE *fts, uint32_t num_fts, const DBT *key, const DBT *val);
void toku_ft_log_put (TOKUTXN txn, FT_HANDLE ft_h, const DBT *key, const DBT *val);
void toku_ft_log_del_multiple (TOKUTXN txn, FT_HANDLE src_ft, FT_HANDLE *fts, uint32_t num_fts, const DBT *key, const DBT *val);
void toku_ft_log_del (TOKUTXN txn, FT_HANDLE ft_h, const DBT *key);
// Effect: Delete a key from a brt
void toku_ft_delete (FT_HANDLE brt, DBT *k, TOKUTXN txn);
// Effect: Delete a key from an ft
void toku_ft_delete (FT_HANDLE ft_h, DBT *k, TOKUTXN txn);
// Effect: Delete a key from a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
void toku_ft_maybe_delete (FT_HANDLE brt, DBT *k, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging);
// Effect: Delete a key from an ft if the oplsn is newer than the ft lsn. This function is called during recovery.
void toku_ft_maybe_delete (FT_HANDLE ft_h, DBT *k, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging);
TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h);
TXN_MANAGER toku_ft_get_txn_manager(FT_HANDLE ft_h);
struct txn_manager *toku_ft_get_txn_manager(FT_HANDLE ft_h);
void toku_ft_send_insert(FT_HANDLE brt, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type, txn_gc_info *gc_info);
void toku_ft_send_delete(FT_HANDLE brt, DBT *key, XIDS xids, txn_gc_info *gc_info);
void toku_ft_send_commit_any(FT_HANDLE brt, DBT *key, XIDS xids, txn_gc_info *gc_info);
struct txn_gc_info;
void toku_ft_send_insert(FT_HANDLE ft_h, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type, txn_gc_info *gc_info);
void toku_ft_send_delete(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info);
void toku_ft_send_commit_any(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info);
int toku_close_ft_handle_nolsn (FT_HANDLE, char **error_string) __attribute__ ((warn_unused_result));
int toku_dump_ft (FILE *,FT_HANDLE brt) __attribute__ ((warn_unused_result));
int toku_dump_ft (FILE *,FT_HANDLE ft_h) __attribute__ ((warn_unused_result));
extern int toku_ft_debug_mode;
int toku_verify_ft (FT_HANDLE brt) __attribute__ ((warn_unused_result));
int toku_verify_ft_with_progress (FT_HANDLE brt, int (*progress_callback)(void *extra, float progress), void *extra, int verbose, int keep_going) __attribute__ ((warn_unused_result));
typedef struct ft_cursor *FT_CURSOR;
int toku_ft_cursor (FT_HANDLE, FT_CURSOR*, TOKUTXN, bool, bool) __attribute__ ((warn_unused_result));
void toku_ft_cursor_set_leaf_mode(FT_CURSOR);
// Sets a boolean on the brt cursor that prevents unnecessary copying of
// the cursor during a query.
void toku_ft_cursor_set_temporary(FT_CURSOR);
void toku_ft_cursor_remove_restriction(FT_CURSOR);
void toku_ft_cursor_set_check_interrupt_cb(FT_CURSOR ftcursor, FT_CHECK_INTERRUPT_CALLBACK cb, void *extra);
int toku_ft_cursor_is_leaf_mode(FT_CURSOR);
void toku_ft_cursor_set_range_lock(FT_CURSOR, const DBT *, const DBT *, bool, bool, int);
// get is deprecated in favor of the individual functions below
int toku_ft_cursor_get (FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v, int get_flags) __attribute__ ((warn_unused_result));
int toku_ft_cursor_first(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_last(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_next(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_prev(FT_CURSOR cursor, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_current(FT_CURSOR cursor, int op, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set_range(FT_CURSOR cursor, DBT *key, DBT *key_bound, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_set_range_reverse(FT_CURSOR cursor, DBT *key, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_get_both_range(FT_CURSOR cursor, DBT *key, DBT *val, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_get_both_range_reverse(FT_CURSOR cursor, DBT *key, DBT *val, FT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result));
int toku_ft_cursor_delete(FT_CURSOR cursor, int flags, TOKUTXN) __attribute__ ((warn_unused_result));
void toku_ft_cursor_close (FT_CURSOR curs);
bool toku_ft_cursor_uninitialized(FT_CURSOR c) __attribute__ ((warn_unused_result));
void toku_ft_cursor_peek(FT_CURSOR cursor, const DBT **pkey, const DBT **pval);
int toku_verify_ft (FT_HANDLE ft_h) __attribute__ ((warn_unused_result));
int toku_verify_ft_with_progress (FT_HANDLE ft_h, int (*progress_callback)(void *extra, float progress), void *extra, int verbose, int keep_going) __attribute__ ((warn_unused_result));
DICTIONARY_ID toku_ft_get_dictionary_id(FT_HANDLE);
@ -298,8 +269,8 @@ enum ft_flags {
TOKU_DB_VALCMP_BUILTIN_13 = (1<<3),
};
void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater);
void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p);
void toku_ft_keyrange(FT_HANDLE ft_h, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater);
void toku_ft_keysrange(FT_HANDLE ft_h, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p);
int toku_ft_get_key_after_bytes(FT_HANDLE ft_h, const DBT *start_key, uint64_t skip_len, void (*callback)(const DBT *end_key, uint64_t actually_skipped, void *extra), void *cb_extra);
@ -341,16 +312,16 @@ void toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size
// Effect: make the file bigger by either doubling it or growing by 16MiB whichever is less, until it is at least size
// Return 0 on success, otherwise an error number.
int toku_ft_get_fragmentation(FT_HANDLE brt, TOKU_DB_FRAGMENTATION report) __attribute__ ((warn_unused_result));
int toku_ft_get_fragmentation(FT_HANDLE ft_h, TOKU_DB_FRAGMENTATION report) __attribute__ ((warn_unused_result));
bool toku_ft_is_empty_fast (FT_HANDLE brt) __attribute__ ((warn_unused_result));
bool toku_ft_is_empty_fast (FT_HANDLE ft_h) __attribute__ ((warn_unused_result));
// Effect: Return true if there are no messages or leaf entries in the tree. If so, it's empty. If there are messages or leaf entries, we say it's not empty
// even though if we were to optimize the tree it might turn out that they are empty.
int toku_ft_strerror_r(int error, char *buf, size_t buflen);
// Effect: Like the XSI-compliant strerror_r, extended to db_strerror().
// If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message.
// If error<0 then return a TokuDB-specific error code. For unknown cases, we return -1 and set errno=EINVAL, even for cases that *should* be known. (Not all DB errors are known by this function which is a bug.)
// If error<0 then return a TokuFT-specific error code. For unknown cases, we return -1 and set errno=EINVAL, even for cases that *should* be known. (Not all DB errors are known by this function which is a bug.)
extern bool garbage_collection_debug;
@ -358,4 +329,4 @@ extern bool garbage_collection_debug;
void toku_ft_set_direct_io(bool direct_io_on);
void toku_ft_set_compress_buffers_before_eviction(bool compress_buffers);
#endif
void toku_note_deserialized_basement_node(bool fixed_key_size);
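Aside: the toku_maybe_preallocate_in_file comment above describes the file-growth rule: grow by the smaller of the current size and 16MiB, repeated until the file is at least the requested size. A self-contained C++ sketch of just that rule (the helper name, signature, and iterative application below are hypothetical, not TokuFT's implementation):

#include <cstdint>
#include <cstdio>

// Hypothetical helper: the size the file would be grown to under the rule
// "double or add 16MiB, whichever is less, until it is at least `wanted`".
static int64_t grown_size(int64_t current, int64_t wanted) {
    const int64_t kMaxStep = int64_t{16} << 20;   // 16 MiB
    while (current < wanted) {
        int64_t step = current < kMaxStep ? current : kMaxStep;
        if (step == 0) step = 1;                  // guard for an empty file
        current += step;
    }
    return current;
}

int main() {
    // A 1 MiB file asked to hold 100 MiB grows 1->2->4->8->16->32->48->...->112 MiB.
    std::printf("%lld\n",
                static_cast<long long>(grown_size(int64_t{1} << 20, int64_t{100} << 20)));
    return 0;
}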

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,14 +89,15 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "ft-cachetable-wrappers.h"
#include "ft-flusher.h"
#include "ft-internal.h"
#include "ft.h"
#include "fttypes.h"
#include "ule.h"
#include "ft/ft.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-internal.h"
#include "ft/ft-flusher.h"
#include "ft/serialize/ft_node-serialize.h"
#include "ft/node.h"
#include "ft/ule.h"
// dummymsn needed to simulate msn because messages are injected at a lower level than toku_ft_root_put_cmd()
// dummymsn needed to simulate msn because messages are injected at a lower level than toku_ft_root_put_msg()
#define MIN_DUMMYMSN ((MSN) {(uint64_t)1 << 62})
static MSN dummymsn;
static int testsetup_initialized = 0;
@ -119,62 +120,71 @@ next_dummymsn(void) {
bool ignore_if_was_already_open;
int toku_testsetup_leaf(FT_HANDLE brt, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens) {
int toku_testsetup_leaf(FT_HANDLE ft_handle, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens) {
FTNODE node;
assert(testsetup_initialized);
toku_create_new_ftnode(brt, &node, 0, n_children);
int i;
for (i=0; i<n_children; i++) {
BP_STATE(node,i) = PT_AVAIL;
toku_create_new_ftnode(ft_handle, &node, 0, n_children);
for (int i = 0; i < n_children; i++) {
BP_STATE(node, i) = PT_AVAIL;
}
for (i=0; i+1<n_children; i++) {
toku_memdup_dbt(&node->childkeys[i], keys[i], keylens[i]);
node->totalchildkeylens += keylens[i];
DBT *XMALLOC_N(n_children - 1, pivotkeys);
for (int i = 0; i + 1 < n_children; i++) {
toku_memdup_dbt(&pivotkeys[i], keys[i], keylens[i]);
}
node->pivotkeys.create_from_dbts(pivotkeys, n_children - 1);
for (int i = 0; i + 1 < n_children; i++) {
toku_destroy_dbt(&pivotkeys[i]);
}
toku_free(pivotkeys);
*blocknum = node->thisnodename;
toku_unpin_ftnode(brt->ft, node);
*blocknum = node->blocknum;
toku_unpin_ftnode(ft_handle->ft, node);
return 0;
}
// Don't bother to clean up carefully if something goes wrong. (E.g., it's OK to have malloced stuff that hasn't been freed.)
int toku_testsetup_nonleaf (FT_HANDLE brt, int height, BLOCKNUM *blocknum, int n_children, BLOCKNUM *children, char **keys, int *keylens) {
int toku_testsetup_nonleaf (FT_HANDLE ft_handle, int height, BLOCKNUM *blocknum, int n_children, BLOCKNUM *children, char **keys, int *keylens) {
FTNODE node;
assert(testsetup_initialized);
toku_create_new_ftnode(brt, &node, height, n_children);
int i;
for (i=0; i<n_children; i++) {
toku_create_new_ftnode(ft_handle, &node, height, n_children);
for (int i = 0; i < n_children; i++) {
BP_BLOCKNUM(node, i) = children[i];
BP_STATE(node,i) = PT_AVAIL;
}
for (i=0; i+1<n_children; i++) {
toku_memdup_dbt(&node->childkeys[i], keys[i], keylens[i]);
node->totalchildkeylens += keylens[i];
DBT *XMALLOC_N(n_children - 1, pivotkeys);
for (int i = 0; i + 1 < n_children; i++) {
toku_memdup_dbt(&pivotkeys[i], keys[i], keylens[i]);
}
*blocknum = node->thisnodename;
toku_unpin_ftnode(brt->ft, node);
node->pivotkeys.create_from_dbts(pivotkeys, n_children - 1);
for (int i = 0; i + 1 < n_children; i++) {
toku_destroy_dbt(&pivotkeys[i]);
}
toku_free(pivotkeys);
*blocknum = node->blocknum;
toku_unpin_ftnode(ft_handle->ft, node);
return 0;
}
int toku_testsetup_root(FT_HANDLE brt, BLOCKNUM blocknum) {
int toku_testsetup_root(FT_HANDLE ft_handle, BLOCKNUM blocknum) {
assert(testsetup_initialized);
brt->ft->h->root_blocknum = blocknum;
ft_handle->ft->h->root_blocknum = blocknum;
return 0;
}
int toku_testsetup_get_sersize(FT_HANDLE brt, BLOCKNUM diskoff) // Return the size on disk
int toku_testsetup_get_sersize(FT_HANDLE ft_handle, BLOCKNUM diskoff) // Return the size on disk
{
assert(testsetup_initialized);
void *node_v;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_handle->ft);
int r = toku_cachetable_get_and_pin(
brt->ft->cf, diskoff,
toku_cachetable_hash(brt->ft->cf, diskoff),
ft_handle->ft->cf, diskoff,
toku_cachetable_hash(ft_handle->ft->cf, diskoff),
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
get_write_callbacks_for_node(ft_handle->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
@ -184,25 +194,25 @@ int toku_testsetup_get_sersize(FT_HANDLE brt, BLOCKNUM diskoff) // Return the si
assert(r==0);
FTNODE CAST_FROM_VOIDP(node, node_v);
int size = toku_serialize_ftnode_size(node);
toku_unpin_ftnode(brt->ft, node);
toku_unpin_ftnode(ft_handle->ft, node);
return size;
}
int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char *key, int keylen, const char *val, int vallen) {
int toku_testsetup_insert_to_leaf (FT_HANDLE ft_handle, BLOCKNUM blocknum, const char *key, int keylen, const char *val, int vallen) {
void *node_v;
int r;
assert(testsetup_initialized);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_handle->ft);
r = toku_cachetable_get_and_pin(
brt->ft->cf,
ft_handle->ft->cf,
blocknum,
toku_cachetable_hash(brt->ft->cf, blocknum),
toku_cachetable_hash(ft_handle->ft->cf, blocknum),
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
get_write_callbacks_for_node(ft_handle->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
@ -214,30 +224,26 @@ int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char
toku_verify_or_set_counts(node);
assert(node->height==0);
DBT keydbt,valdbt;
MSN msn = next_dummymsn();
FT_MSG_S cmd = { FT_INSERT, msn, xids_get_root_xids(),
.u = { .id = { toku_fill_dbt(&keydbt, key, keylen),
toku_fill_dbt(&valdbt, val, vallen) } } };
DBT kdbt, vdbt;
ft_msg msg(toku_fill_dbt(&kdbt, key, keylen), toku_fill_dbt(&vdbt, val, vallen),
FT_INSERT, next_dummymsn(), toku_xids_get_root_xids());
static size_t zero_flow_deltas[] = { 0, 0 };
txn_gc_info gc_info(nullptr, TXNID_NONE, TXNID_NONE, true);
toku_ft_node_put_cmd (
brt->ft->compare_fun,
brt->ft->update_fun,
&brt->ft->cmp_descriptor,
node,
-1,
&cmd,
true,
&gc_info,
zero_flow_deltas,
NULL
);
toku_ftnode_put_msg(ft_handle->ft->cmp,
ft_handle->ft->update_fun,
node,
-1,
msg,
true,
&gc_info,
zero_flow_deltas,
NULL
);
toku_verify_or_set_counts(node);
toku_unpin_ftnode(brt->ft, node);
toku_unpin_ftnode(ft_handle->ft, node);
return 0;
}
@ -252,35 +258,34 @@ testhelper_string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
void
toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t)
{
struct ftnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, t->ft);
toku_pin_ftnode_off_client_thread(
ftnode_fetch_extra bfe;
bfe.create_for_min_read(t->ft);
toku_pin_ftnode(
t->ft,
b,
toku_cachetable_hash(t->ft->cf, b),
&bfe,
PL_WRITE_EXPENSIVE,
0,
NULL,
node
node,
true
);
}
int toku_testsetup_insert_to_nonleaf (FT_HANDLE brt, BLOCKNUM blocknum, enum ft_msg_type cmdtype, const char *key, int keylen, const char *val, int vallen) {
int toku_testsetup_insert_to_nonleaf (FT_HANDLE ft_handle, BLOCKNUM blocknum, enum ft_msg_type msgtype, const char *key, int keylen, const char *val, int vallen) {
void *node_v;
int r;
assert(testsetup_initialized);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_handle->ft);
r = toku_cachetable_get_and_pin(
brt->ft->cf,
ft_handle->ft->cf,
blocknum,
toku_cachetable_hash(brt->ft->cf, blocknum),
toku_cachetable_hash(ft_handle->ft->cf, blocknum),
&node_v,
NULL,
get_write_callbacks_for_node(brt->ft),
get_write_callbacks_for_node(ft_handle->ft),
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
@ -292,21 +297,22 @@ int toku_testsetup_insert_to_nonleaf (FT_HANDLE brt, BLOCKNUM blocknum, enum ft_
assert(node->height>0);
DBT k;
int childnum = toku_ftnode_which_child(node,
toku_fill_dbt(&k, key, keylen),
&brt->ft->cmp_descriptor, brt->ft->compare_fun);
int childnum = toku_ftnode_which_child(node, toku_fill_dbt(&k, key, keylen), ft_handle->ft->cmp);
XIDS xids_0 = xids_get_root_xids();
XIDS xids_0 = toku_xids_get_root_xids();
MSN msn = next_dummymsn();
toku_bnc_insert_msg(BNC(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0, true, NULL, testhelper_string_key_cmp);
toku::comparator cmp;
cmp.create(testhelper_string_key_cmp, nullptr);
toku_bnc_insert_msg(BNC(node, childnum), key, keylen, val, vallen, msgtype, msn, xids_0, true, cmp);
cmp.destroy();
// Hack to get the test working. The problem is that this test
// is directly queueing something in a FIFO instead of
// using brt APIs.
// using ft APIs.
node->max_msn_applied_to_node_on_disk = msn;
node->dirty = 1;
// Also hack max_msn_in_ft
brt->ft->h->max_msn_in_ft = msn;
ft_handle->ft->h->max_msn_in_ft = msn;
toku_unpin_ftnode(brt->ft, node);
toku_unpin_ftnode(ft_handle->ft, node);
return 0;
}

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,7 +89,7 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Verify a BRT. */
/* Verify an FT. */
/* Check:
* The tree is of uniform depth (and the height is correct at every node)
* For each pivot key: the max of the stuff to the left is <= the pivot key < the min of the stuff to the right.
@ -97,31 +97,30 @@ PATENT RIGHTS GRANT:
* For each nonleaf node: All the messages have keys that are between the associated pivot keys ( left_pivot_key < message <= right_pivot_key)
*/
#include "ft-cachetable-wrappers.h"
#include "ft-internal.h"
#include "ft.h"
#include "ft/serialize/block_table.h"
#include "ft/ft.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-internal.h"
#include "ft/node.h"
static int
compare_pairs (FT_HANDLE brt, const DBT *a, const DBT *b) {
FAKE_DB(db, &brt->ft->cmp_descriptor);
int cmp = brt->ft->compare_fun(&db, a, b);
return cmp;
compare_pairs (FT_HANDLE ft_handle, const DBT *a, const DBT *b) {
return ft_handle->ft->cmp(a, b);
}
static int
compare_pair_to_key (FT_HANDLE brt, const DBT *a, bytevec key, ITEMLEN keylen) {
compare_pair_to_key (FT_HANDLE ft_handle, const DBT *a, const void *key, uint32_t keylen) {
DBT y;
FAKE_DB(db, &brt->ft->cmp_descriptor);
int cmp = brt->ft->compare_fun(&db, a, toku_fill_dbt(&y, key, keylen));
return cmp;
return ft_handle->ft->cmp(a, toku_fill_dbt(&y, key, keylen));
}
static int
verify_msg_in_child_buffer(FT_HANDLE brt, enum ft_msg_type type, MSN msn, bytevec key, ITEMLEN keylen, bytevec UU(data), ITEMLEN UU(datalen), XIDS UU(xids), const DBT *lesser_pivot, const DBT *greatereq_pivot)
verify_msg_in_child_buffer(FT_HANDLE ft_handle, enum ft_msg_type type, MSN msn, const void *key, uint32_t keylen, const void *UU(data), uint32_t UU(datalen), XIDS UU(xids), const DBT *lesser_pivot, const DBT *greatereq_pivot)
__attribute__((warn_unused_result));
UU()
static int
verify_msg_in_child_buffer(FT_HANDLE brt, enum ft_msg_type type, MSN msn, bytevec key, ITEMLEN keylen, bytevec UU(data), ITEMLEN UU(datalen), XIDS UU(xids), const DBT *lesser_pivot, const DBT *greatereq_pivot) {
verify_msg_in_child_buffer(FT_HANDLE ft_handle, enum ft_msg_type type, MSN msn, const void *key, uint32_t keylen, const void *UU(data), uint32_t UU(datalen), XIDS UU(xids), const DBT *lesser_pivot, const DBT *greatereq_pivot) {
int result = 0;
if (msn.msn == ZERO_MSN.msn)
result = EINVAL;
@ -135,12 +134,12 @@ verify_msg_in_child_buffer(FT_HANDLE brt, enum ft_msg_type type, MSN msn, byteve
case FT_COMMIT_ANY:
// verify key in bounds
if (lesser_pivot) {
int compare = compare_pair_to_key(brt, lesser_pivot, key, keylen);
int compare = compare_pair_to_key(ft_handle, lesser_pivot, key, keylen);
if (compare >= 0)
result = EINVAL;
}
if (result == 0 && greatereq_pivot) {
int compare = compare_pair_to_key(brt, greatereq_pivot, key, keylen);
int compare = compare_pair_to_key(ft_handle, greatereq_pivot, key, keylen);
if (compare < 0)
result = EINVAL;
}
@ -152,14 +151,15 @@ verify_msg_in_child_buffer(FT_HANDLE brt, enum ft_msg_type type, MSN msn, byteve
static DBT
get_ith_key_dbt (BASEMENTNODE bn, int i) {
DBT kdbt;
int r = bn->data_buffer.fetch_le_key_and_len(i, &kdbt.size, &kdbt.data);
int r = bn->data_buffer.fetch_key_and_len(i, &kdbt.size, &kdbt.data);
invariant_zero(r); // this is a bad failure if it happens.
return kdbt;
}
#define VERIFY_ASSERTION(predicate, i, string) ({ \
if(!(predicate)) { \
if (verbose) { \
(void) verbose; \
if (true) { \
fprintf(stderr, "%s:%d: Looking at child %d of block %" PRId64 ": %s\n", __FILE__, __LINE__, i, blocknum.b, string); \
} \
result = TOKUDB_NEEDS_REPAIR; \
@ -169,7 +169,7 @@ get_ith_key_dbt (BASEMENTNODE bn, int i) {
struct count_msgs_extra {
int count;
MSN msn;
FIFO fifo;
message_buffer *msg_buffer;
};
// template-only function, but must be extern
@ -177,15 +177,16 @@ int count_msgs(const int32_t &offset, const uint32_t UU(idx), struct count_msgs_
__attribute__((nonnull(3)));
int count_msgs(const int32_t &offset, const uint32_t UU(idx), struct count_msgs_extra *const e)
{
const struct fifo_entry *entry = toku_fifo_get_entry(e->fifo, offset);
if (entry->msn.msn == e->msn.msn) {
MSN msn;
e->msg_buffer->get_message_key_msn(offset, nullptr, &msn);
if (msn.msn == e->msn.msn) {
e->count++;
}
return 0;
}
struct verify_message_tree_extra {
FIFO fifo;
message_buffer *msg_buffer;
bool broadcast;
bool is_fresh;
int i;
@ -202,20 +203,22 @@ int verify_message_tree(const int32_t &offset, const uint32_t UU(idx), struct ve
BLOCKNUM blocknum = e->blocknum;
int keep_going_on_failure = e->keep_going_on_failure;
int result = 0;
const struct fifo_entry *entry = toku_fifo_get_entry(e->fifo, offset);
DBT k, v;
ft_msg msg = e->msg_buffer->get_message(offset, &k, &v);
bool is_fresh = e->msg_buffer->get_freshness(offset);
if (e->broadcast) {
VERIFY_ASSERTION(ft_msg_type_applies_all((enum ft_msg_type) entry->type) || ft_msg_type_does_nothing((enum ft_msg_type) entry->type),
VERIFY_ASSERTION(ft_msg_type_applies_all((enum ft_msg_type) msg.type()) || ft_msg_type_does_nothing((enum ft_msg_type) msg.type()),
e->i, "message found in broadcast list that is not a broadcast");
} else {
VERIFY_ASSERTION(ft_msg_type_applies_once((enum ft_msg_type) entry->type),
VERIFY_ASSERTION(ft_msg_type_applies_once((enum ft_msg_type) msg.type()),
e->i, "message found in fresh or stale message tree that does not apply once");
if (e->is_fresh) {
if (e->messages_have_been_moved) {
VERIFY_ASSERTION(entry->is_fresh,
VERIFY_ASSERTION(is_fresh,
e->i, "message found in fresh message tree that is not fresh");
}
} else {
VERIFY_ASSERTION(!entry->is_fresh,
VERIFY_ASSERTION(!is_fresh,
e->i, "message found in stale message tree that is fresh");
}
}
@ -235,15 +238,15 @@ int verify_marked_messages(const int32_t &offset, const uint32_t UU(idx), struct
BLOCKNUM blocknum = e->blocknum;
int keep_going_on_failure = e->keep_going_on_failure;
int result = 0;
const struct fifo_entry *entry = toku_fifo_get_entry(e->fifo, offset);
VERIFY_ASSERTION(!entry->is_fresh, e->i, "marked message found in the fresh message tree that is fresh");
bool is_fresh = e->msg_buffer->get_freshness(offset);
VERIFY_ASSERTION(!is_fresh, e->i, "marked message found in the fresh message tree that is fresh");
done:
return result;
}
template<typename verify_omt_t>
static int
verify_sorted_by_key_msn(FT_HANDLE brt, FIFO fifo, const verify_omt_t &mt) {
verify_sorted_by_key_msn(FT_HANDLE ft_handle, message_buffer *msg_buffer, const verify_omt_t &mt) {
int result = 0;
size_t last_offset = 0;
for (uint32_t i = 0; i < mt.size(); i++) {
@ -251,12 +254,8 @@ verify_sorted_by_key_msn(FT_HANDLE brt, FIFO fifo, const verify_omt_t &mt) {
int r = mt.fetch(i, &offset);
assert_zero(r);
if (i > 0) {
struct toku_fifo_entry_key_msn_cmp_extra extra;
ZERO_STRUCT(extra);
extra.desc = &brt->ft->cmp_descriptor;
extra.cmp = brt->ft->compare_fun;
extra.fifo = fifo;
if (toku_fifo_entry_key_msn_cmp(extra, last_offset, offset) >= 0) {
struct toku_msg_buffer_key_msn_cmp_extra extra(ft_handle->ft->cmp, msg_buffer);
if (toku_msg_buffer_key_msn_cmp(extra, last_offset, offset) >= 0) {
result = TOKUDB_NEEDS_REPAIR;
break;
}
@ -268,15 +267,9 @@ verify_sorted_by_key_msn(FT_HANDLE brt, FIFO fifo, const verify_omt_t &mt) {
template<typename count_omt_t>
static int
count_eq_key_msn(FT_HANDLE brt, FIFO fifo, const count_omt_t &mt, const DBT *key, MSN msn) {
struct toku_fifo_entry_key_msn_heaviside_extra extra;
ZERO_STRUCT(extra);
extra.desc = &brt->ft->cmp_descriptor;
extra.cmp = brt->ft->compare_fun;
extra.fifo = fifo;
extra.key = key;
extra.msn = msn;
int r = mt.template find_zero<struct toku_fifo_entry_key_msn_heaviside_extra, toku_fifo_entry_key_msn_heaviside>(extra, nullptr, nullptr);
count_eq_key_msn(FT_HANDLE ft_handle, message_buffer *msg_buffer, const count_omt_t &mt, const DBT *key, MSN msn) {
struct toku_msg_buffer_key_msn_heaviside_extra extra(ft_handle->ft->cmp, msg_buffer, key, msn);
int r = mt.template find_zero<struct toku_msg_buffer_key_msn_heaviside_extra, toku_msg_buffer_key_msn_heaviside>(extra, nullptr, nullptr);
int count;
if (r == 0) {
count = 1;
@ -290,28 +283,100 @@ count_eq_key_msn(FT_HANDLE brt, FIFO fifo, const count_omt_t &mt, const DBT *key
void
toku_get_node_for_verify(
BLOCKNUM blocknum,
FT_HANDLE brt,
FT_HANDLE ft_handle,
FTNODE* nodep
)
{
uint32_t fullhash = toku_cachetable_hash(brt->ft->cf, blocknum);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft);
toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
brt->ft,
uint32_t fullhash = toku_cachetable_hash(ft_handle->ft->cf, blocknum);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_handle->ft);
toku_pin_ftnode(
ft_handle->ft,
blocknum,
fullhash,
&bfe,
PL_WRITE_EXPENSIVE, // may_modify_node
0,
NULL,
nodep,
false
);
}
struct verify_msg_fn {
FT_HANDLE ft_handle;
NONLEAF_CHILDINFO bnc;
const DBT *curr_less_pivot;
const DBT *curr_geq_pivot;
BLOCKNUM blocknum;
MSN this_msn;
int verbose;
int keep_going_on_failure;
bool messages_have_been_moved;
MSN last_msn;
int msg_i;
int result = 0; // needed by VERIFY_ASSERTION
verify_msg_fn(FT_HANDLE handle, NONLEAF_CHILDINFO nl, const DBT *less, const DBT *geq,
BLOCKNUM b, MSN tmsn, int v, int k, bool m) :
ft_handle(handle), bnc(nl), curr_less_pivot(less), curr_geq_pivot(geq),
blocknum(b), this_msn(tmsn), verbose(v), keep_going_on_failure(k), messages_have_been_moved(m), last_msn(ZERO_MSN), msg_i(0) {
}
int operator()(const ft_msg &msg, bool is_fresh) {
enum ft_msg_type type = (enum ft_msg_type) msg.type();
MSN msn = msg.msn();
XIDS xid = msg.xids();
const void *key = msg.kdbt()->data;
const void *data = msg.vdbt()->data;
uint32_t keylen = msg.kdbt()->size;
uint32_t datalen = msg.vdbt()->size;
int r = verify_msg_in_child_buffer(ft_handle, type, msn, key, keylen, data, datalen, xid,
curr_less_pivot,
curr_geq_pivot);
VERIFY_ASSERTION(r == 0, msg_i, "A message in the buffer is out of place");
VERIFY_ASSERTION((msn.msn > last_msn.msn), msg_i, "msn per msg must be monotonically increasing toward newer messages in buffer");
VERIFY_ASSERTION((msn.msn <= this_msn.msn), msg_i, "all messages must have msn within limit of this node's max_msn_applied_to_node_in_memory");
if (ft_msg_type_applies_once(type)) {
int count;
DBT keydbt;
toku_fill_dbt(&keydbt, key, keylen);
int total_count = 0;
count = count_eq_key_msn(ft_handle, &bnc->msg_buffer, bnc->fresh_message_tree, toku_fill_dbt(&keydbt, key, keylen), msn);
total_count += count;
if (is_fresh) {
VERIFY_ASSERTION(count == 1, msg_i, "a fresh message was not found in the fresh message tree");
} else if (messages_have_been_moved) {
VERIFY_ASSERTION(count == 0, msg_i, "a stale message was found in the fresh message tree");
}
VERIFY_ASSERTION(count <= 1, msg_i, "a message was found multiple times in the fresh message tree");
count = count_eq_key_msn(ft_handle, &bnc->msg_buffer, bnc->stale_message_tree, &keydbt, msn);
total_count += count;
if (is_fresh) {
VERIFY_ASSERTION(count == 0, msg_i, "a fresh message was found in the stale message tree");
} else if (messages_have_been_moved) {
VERIFY_ASSERTION(count == 1, msg_i, "a stale message was not found in the stale message tree");
}
VERIFY_ASSERTION(count <= 1, msg_i, "a message was found multiple times in the stale message tree");
VERIFY_ASSERTION(total_count <= 1, msg_i, "a message was found in both message trees (or more than once in a single tree)");
VERIFY_ASSERTION(total_count >= 1, msg_i, "a message was not found in either message tree");
} else {
VERIFY_ASSERTION(ft_msg_type_applies_all(type) || ft_msg_type_does_nothing(type), msg_i, "a message was found that does not apply either to all or to only one key");
struct count_msgs_extra extra = { .count = 0, .msn = msn, .msg_buffer = &bnc->msg_buffer };
bnc->broadcast_list.iterate<struct count_msgs_extra, count_msgs>(&extra);
VERIFY_ASSERTION(extra.count == 1, msg_i, "a broadcast message was not found in the broadcast list");
}
last_msn = msn;
msg_i++;
done:
return result;
}
};
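// Illustrative sketch only -- not TokuFT's message_buffer. It shows the
// iteration pattern used by verify_msg_fn above: the buffer owns the messages
// and invokes a user-supplied functor per message, stopping as soon as the
// functor returns nonzero, which is how the verification result is threaded
// back out. All names here (toy_msg, toy_message_buffer) are hypothetical.
#include <cstdint>
#include <string>
#include <vector>
struct toy_msg {
    uint64_t msn;        // message sequence number
    std::string key;
    bool is_fresh;       // not yet moved to the stale tree
};
class toy_message_buffer {
    std::vector<toy_msg> _msgs;
public:
    void enqueue(toy_msg m) { _msgs.push_back(std::move(m)); }
    // Call fn(msg, is_fresh) for each message in insertion order.
    // A nonzero return aborts the walk and is propagated to the caller.
    template <typename F>
    int iterate(F &fn) const {
        for (const toy_msg &m : _msgs) {
            int r = fn(m, m.is_fresh);
            if (r != 0) {
                return r;
            }
        }
        return 0;
    }
};
// A checker in the same shape as verify_msg_fn: msns must be increasing.
struct monotonic_msn_check {
    uint64_t last_msn = 0;
    int operator()(const toy_msg &m, bool /*is_fresh*/) {
        if (m.msn <= last_msn) {
            return -1;   // out of order: stop iterating, report failure
        }
        last_msn = m.msn;
        return 0;
    }
};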
static int
toku_verify_ftnode_internal(FT_HANDLE brt,
toku_verify_ftnode_internal(FT_HANDLE ft_handle,
MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above,
FTNODE node, int height,
const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
@ -320,10 +385,10 @@ toku_verify_ftnode_internal(FT_HANDLE brt,
{
int result=0;
MSN this_msn;
BLOCKNUM blocknum = node->thisnodename;
BLOCKNUM blocknum = node->blocknum;
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
toku_assert_entire_node_in_memory(node);
toku_ftnode_assert_fully_in_memory(node);
this_msn = node->max_msn_applied_to_node_on_disk;
if (height >= 0) {
@ -334,74 +399,40 @@ toku_verify_ftnode_internal(FT_HANDLE brt,
}
// Verify that all the pivot keys are in order.
for (int i = 0; i < node->n_children-2; i++) {
int compare = compare_pairs(brt, &node->childkeys[i], &node->childkeys[i+1]);
DBT x, y;
int compare = compare_pairs(ft_handle, node->pivotkeys.fill_pivot(i, &x), node->pivotkeys.fill_pivot(i + 1, &y));
VERIFY_ASSERTION(compare < 0, i, "Value is >= the next value");
}
// Verify that all the pivot keys are lesser_pivot < pivot <= greatereq_pivot
for (int i = 0; i < node->n_children-1; i++) {
DBT x;
if (lesser_pivot) {
int compare = compare_pairs(brt, lesser_pivot, &node->childkeys[i]);
int compare = compare_pairs(ft_handle, lesser_pivot, node->pivotkeys.fill_pivot(i, &x));
VERIFY_ASSERTION(compare < 0, i, "Pivot is >= the lower-bound pivot");
}
if (greatereq_pivot) {
int compare = compare_pairs(brt, greatereq_pivot, &node->childkeys[i]);
int compare = compare_pairs(ft_handle, greatereq_pivot, node->pivotkeys.fill_pivot(i, &x));
VERIFY_ASSERTION(compare >= 0, i, "Pivot is < the upper-bound pivot");
}
}
for (int i = 0; i < node->n_children; i++) {
const DBT *curr_less_pivot = (i==0) ? lesser_pivot : &node->childkeys[i-1];
const DBT *curr_geq_pivot = (i==node->n_children-1) ? greatereq_pivot : &node->childkeys[i];
DBT x, y;
const DBT *curr_less_pivot = (i==0) ? lesser_pivot : node->pivotkeys.fill_pivot(i - 1, &x);
const DBT *curr_geq_pivot = (i==node->n_children-1) ? greatereq_pivot : node->pivotkeys.fill_pivot(i, &y);
if (node->height > 0) {
MSN last_msn = ZERO_MSN;
// Verify that messages in the buffers are in the right place.
NONLEAF_CHILDINFO bnc = BNC(node, i);
VERIFY_ASSERTION(verify_sorted_by_key_msn(brt, bnc->buffer, bnc->fresh_message_tree) == 0, i, "fresh_message_tree");
VERIFY_ASSERTION(verify_sorted_by_key_msn(brt, bnc->buffer, bnc->stale_message_tree) == 0, i, "stale_message_tree");
FIFO_ITERATE(bnc->buffer, key, keylen, data, datalen, itype, msn, xid, is_fresh,
({
enum ft_msg_type type = (enum ft_msg_type) itype;
int r = verify_msg_in_child_buffer(brt, type, msn, key, keylen, data, datalen, xid,
curr_less_pivot,
curr_geq_pivot);
VERIFY_ASSERTION(r==0, i, "A message in the buffer is out of place");
VERIFY_ASSERTION((msn.msn > last_msn.msn), i, "msn per msg must be monotonically increasing toward newer messages in buffer");
VERIFY_ASSERTION((msn.msn <= this_msn.msn), i, "all messages must have msn within limit of this node's max_msn_applied_to_node_in_memory");
if (ft_msg_type_applies_once(type)) {
int count;
DBT keydbt;
toku_fill_dbt(&keydbt, key, keylen);
int total_count = 0;
count = count_eq_key_msn(brt, bnc->buffer, bnc->fresh_message_tree, toku_fill_dbt(&keydbt, key, keylen), msn);
total_count += count;
if (is_fresh) {
VERIFY_ASSERTION(count == 1, i, "a fresh message was not found in the fresh message tree");
} else if (messages_have_been_moved) {
VERIFY_ASSERTION(count == 0, i, "a stale message was found in the fresh message tree");
}
VERIFY_ASSERTION(count <= 1, i, "a message was found multiple times in the fresh message tree");
count = count_eq_key_msn(brt, bnc->buffer, bnc->stale_message_tree, &keydbt, msn);
// Verify that messages in the buffers are in the right place.
VERIFY_ASSERTION(verify_sorted_by_key_msn(ft_handle, &bnc->msg_buffer, bnc->fresh_message_tree) == 0, i, "fresh_message_tree");
VERIFY_ASSERTION(verify_sorted_by_key_msn(ft_handle, &bnc->msg_buffer, bnc->stale_message_tree) == 0, i, "stale_message_tree");
total_count += count;
if (is_fresh) {
VERIFY_ASSERTION(count == 0, i, "a fresh message was found in the stale message tree");
} else if (messages_have_been_moved) {
VERIFY_ASSERTION(count == 1, i, "a stale message was not found in the stale message tree");
}
VERIFY_ASSERTION(count <= 1, i, "a message was found multiple times in the stale message tree");
verify_msg_fn verify_msg(ft_handle, bnc, curr_less_pivot, curr_geq_pivot,
blocknum, this_msn, verbose, keep_going_on_failure, messages_have_been_moved);
int r = bnc->msg_buffer.iterate(verify_msg);
if (r != 0) { result = r; goto done; }
VERIFY_ASSERTION(total_count <= 1, i, "a message was found in both message trees (or more than once in a single tree)");
VERIFY_ASSERTION(total_count >= 1, i, "a message was not found in either message tree");
} else {
VERIFY_ASSERTION(ft_msg_type_applies_all(type) || ft_msg_type_does_nothing(type), i, "a message was found that does not apply either to all or to only one key");
struct count_msgs_extra extra = { .count = 0, .msn = msn, .fifo = bnc->buffer };
bnc->broadcast_list.iterate<struct count_msgs_extra, count_msgs>(&extra);
VERIFY_ASSERTION(extra.count == 1, i, "a broadcast message was not found in the broadcast list");
}
last_msn = msn;
}));
struct verify_message_tree_extra extra = { .fifo = bnc->buffer, .broadcast = false, .is_fresh = true, .i = i, .verbose = verbose, .blocknum = node->thisnodename, .keep_going_on_failure = keep_going_on_failure, .messages_have_been_moved = messages_have_been_moved };
int r = bnc->fresh_message_tree.iterate<struct verify_message_tree_extra, verify_message_tree>(&extra);
struct verify_message_tree_extra extra = { .msg_buffer = &bnc->msg_buffer, .broadcast = false, .is_fresh = true, .i = i, .verbose = verbose, .blocknum = node->blocknum, .keep_going_on_failure = keep_going_on_failure, .messages_have_been_moved = messages_have_been_moved };
r = bnc->fresh_message_tree.iterate<struct verify_message_tree_extra, verify_message_tree>(&extra);
if (r != 0) { result = r; goto done; }
extra.is_fresh = false;
r = bnc->stale_message_tree.iterate<struct verify_message_tree_extra, verify_message_tree>(&extra);
@ -424,20 +455,20 @@ toku_verify_ftnode_internal(FT_HANDLE brt,
}
else {
BASEMENTNODE bn = BLB(node, i);
for (uint32_t j = 0; j < bn->data_buffer.omt_size(); j++) {
for (uint32_t j = 0; j < bn->data_buffer.num_klpairs(); j++) {
VERIFY_ASSERTION((rootmsn.msn >= this_msn.msn), 0, "leaf may have latest msn, but cannot be greater than root msn");
DBT kdbt = get_ith_key_dbt(bn, j);
if (curr_less_pivot) {
int compare = compare_pairs(brt, curr_less_pivot, &kdbt);
int compare = compare_pairs(ft_handle, curr_less_pivot, &kdbt);
VERIFY_ASSERTION(compare < 0, j, "The leafentry is >= the lower-bound pivot");
}
if (curr_geq_pivot) {
int compare = compare_pairs(brt, curr_geq_pivot, &kdbt);
int compare = compare_pairs(ft_handle, curr_geq_pivot, &kdbt);
VERIFY_ASSERTION(compare >= 0, j, "The leafentry is < the upper-bound pivot");
}
if (0 < j) {
DBT prev_key_dbt = get_ith_key_dbt(bn, j-1);
int compare = compare_pairs(brt, &prev_key_dbt, &kdbt);
int compare = compare_pairs(ft_handle, &prev_key_dbt, &kdbt);
VERIFY_ASSERTION(compare < 0, j, "Adjacent leafentries are out of order");
}
}
@ -451,7 +482,7 @@ done:
// input is a pinned node, on exit, node is unpinned
int
toku_verify_ftnode (FT_HANDLE brt,
toku_verify_ftnode (FT_HANDLE ft_handle,
MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above,
FTNODE node, int height,
const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
@ -462,7 +493,7 @@ toku_verify_ftnode (FT_HANDLE brt,
MSN this_msn;
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
toku_assert_entire_node_in_memory(node);
toku_ftnode_assert_fully_in_memory(node);
this_msn = node->max_msn_applied_to_node_on_disk;
int result = 0;
@ -471,15 +502,15 @@ toku_verify_ftnode (FT_HANDLE brt,
// Otherwise we'll just do the next call
result = toku_verify_ftnode_internal(
brt, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
verbose, keep_going_on_failure, false);
if (result != 0 && (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR)) goto done;
}
if (node->height > 0) {
toku_move_ftnode_messages_to_stale(brt->ft, node);
toku_move_ftnode_messages_to_stale(ft_handle->ft, node);
}
result2 = toku_verify_ftnode_internal(
brt, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
verbose, keep_going_on_failure, true);
if (result == 0) {
result = result2;
@ -490,15 +521,16 @@ toku_verify_ftnode (FT_HANDLE brt,
if (recurse && node->height > 0) {
for (int i = 0; i < node->n_children; i++) {
FTNODE child_node;
toku_get_node_for_verify(BP_BLOCKNUM(node, i), brt, &child_node);
int r = toku_verify_ftnode(brt, rootmsn,
toku_get_node_for_verify(BP_BLOCKNUM(node, i), ft_handle, &child_node);
DBT x, y;
int r = toku_verify_ftnode(ft_handle, rootmsn,
(toku_bnc_n_entries(BNC(node, i)) > 0
? this_msn
: parentmsn_with_messages),
messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0,
child_node, node->height-1,
(i==0) ? lesser_pivot : &node->childkeys[i-1],
(i==node->n_children-1) ? greatereq_pivot : &node->childkeys[i],
(i==0) ? lesser_pivot : node->pivotkeys.fill_pivot(i - 1, &x),
(i==node->n_children-1) ? greatereq_pivot : node->pivotkeys.fill_pivot(i, &y),
progress_callback, progress_extra,
recurse, verbose, keep_going_on_failure);
if (r) {
@ -508,7 +540,7 @@ toku_verify_ftnode (FT_HANDLE brt,
}
}
done:
toku_unpin_ftnode(brt->ft, node);
toku_unpin_ftnode(ft_handle->ft, node);
if (result == 0 && progress_callback)
result = progress_callback(progress_extra, 0.0);
@ -517,26 +549,26 @@ done:
}
int
toku_verify_ft_with_progress (FT_HANDLE brt, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_on_going) {
assert(brt->ft);
toku_verify_ft_with_progress (FT_HANDLE ft_handle, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_on_going) {
assert(ft_handle->ft);
FTNODE root_node = NULL;
{
uint32_t root_hash;
CACHEKEY root_key;
toku_calculate_root_offset_pointer(brt->ft, &root_key, &root_hash);
toku_get_node_for_verify(root_key, brt, &root_node);
toku_calculate_root_offset_pointer(ft_handle->ft, &root_key, &root_hash);
toku_get_node_for_verify(root_key, ft_handle, &root_node);
}
int r = toku_verify_ftnode(brt, brt->ft->h->max_msn_in_ft, brt->ft->h->max_msn_in_ft, false, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
int r = toku_verify_ftnode(ft_handle, ft_handle->ft->h->max_msn_in_ft, ft_handle->ft->h->max_msn_in_ft, false, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
if (r == 0) {
toku_ft_lock(brt->ft);
brt->ft->h->time_of_last_verification = time(NULL);
brt->ft->h->dirty = 1;
toku_ft_unlock(brt->ft);
toku_ft_lock(ft_handle->ft);
ft_handle->ft->h->time_of_last_verification = time(NULL);
ft_handle->ft->h->dirty = 1;
toku_ft_unlock(ft_handle->ft);
}
return r;
}
int
toku_verify_ft (FT_HANDLE brt) {
return toku_verify_ft_with_progress(brt, NULL, NULL, 0, 0);
toku_verify_ft (FT_HANDLE ft_handle) {
return toku_verify_ft_with_progress(ft_handle, NULL, NULL, 0, 0);
}
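// Illustrative sketch only (hypothetical toy_node/toy_tree_header types),
// showing the overall shape of toku_verify_ft_with_progress above: verify
// recursively from the root with an optional progress callback, and only if
// the whole walk succeeds, stamp the header with the verification time and
// mark it dirty under the header lock.
#include <ctime>
#include <mutex>
struct toy_tree_header {
    std::mutex lock;
    time_t time_of_last_verification = 0;
    bool dirty = false;
};
struct toy_node {
    toy_node *children[2] = {nullptr, nullptr};
    bool consistent = true;   // whatever per-node invariant is being checked
};
typedef int (*progress_cb)(void *extra, float progress);
static int verify_node(const toy_node *node, progress_cb progress, void *extra) {
    if (node == nullptr) return 0;
    if (!node->consistent) return -1;          // analogous to TOKUDB_NEEDS_REPAIR
    for (const toy_node *child : node->children) {
        int r = verify_node(child, progress, extra);
        if (r != 0) return r;
    }
    if (progress != nullptr) {
        int r = progress(extra, 0.0f);          // give the caller a chance to cancel
        if (r != 0) return r;
    }
    return 0;
}
static int verify_tree(toy_node *root, toy_tree_header *h,
                       progress_cb progress, void *extra) {
    int r = verify_node(root, progress, extra);
    if (r == 0) {
        std::lock_guard<std::mutex> guard(h->lock);
        h->time_of_last_verification = time(nullptr);
        h->dirty = true;                        // header must reach disk eventually
    }
    return r;
}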

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,12 +89,15 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "ft.h"
#include "ft-internal.h"
#include "ft-cachetable-wrappers.h"
#include "log-internal.h"
#include <ft/log_header.h>
#include "ft/serialize/block_table.h"
#include "ft/ft.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-internal.h"
#include "ft/logger/log-internal.h"
#include "ft/log_header.h"
#include "ft/node.h"
#include "ft/serialize/ft-serialize.h"
#include "ft/serialize/ft_node-serialize.h"
#include <memory.h>
#include <toku_assert.h>
@ -107,10 +110,10 @@ toku_reset_root_xid_that_created(FT ft, TXNID new_root_xid_that_created) {
// hold lock around setting and clearing of dirty bit
// (see cooperative use of dirty bit in ft_begin_checkpoint())
toku_ft_lock (ft);
toku_ft_lock(ft);
ft->h->root_xid_that_created = new_root_xid_that_created;
ft->h->dirty = 1;
toku_ft_unlock (ft);
toku_ft_unlock(ft);
}
static void
@ -118,9 +121,10 @@ ft_destroy(FT ft) {
//header and checkpoint_header have same Blocktable pointer
//cannot destroy since it is still in use by CURRENT
assert(ft->h->type == FT_CURRENT);
toku_blocktable_destroy(&ft->blocktable);
if (ft->descriptor.dbt.data) toku_free(ft->descriptor.dbt.data);
if (ft->cmp_descriptor.dbt.data) toku_free(ft->cmp_descriptor.dbt.data);
ft->blocktable.destroy();
ft->cmp.destroy();
toku_destroy_dbt(&ft->descriptor.dbt);
toku_destroy_dbt(&ft->cmp_descriptor.dbt);
toku_ft_destroy_reflock(ft);
toku_free(ft->h);
}
@ -187,7 +191,7 @@ ft_log_fassociate_during_checkpoint (CACHEFILE cf, void *header_v) {
}
// Maps to cf->begin_checkpoint_userdata
// Create checkpoint-in-progress versions of header and translation (btt) (and fifo for now...).
// Create checkpoint-in-progress versions of header and translation (btt)
// Has access to fd (it is protected).
//
// Not reentrant for a single FT (see ft_checkpoint)
@ -199,7 +203,7 @@ static void ft_begin_checkpoint (LSN checkpoint_lsn, void *header_v) {
assert(ft->checkpoint_header == NULL);
ft_copy_for_checkpoint_unlocked(ft, checkpoint_lsn);
ft->h->dirty = 0; // this is only place this bit is cleared (in currentheader)
toku_block_translation_note_start_checkpoint_unlocked(ft->blocktable);
ft->blocktable.note_start_checkpoint_unlocked();
toku_ft_unlock (ft);
}
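// Illustrative sketch only (hypothetical toy types), showing the
// begin-checkpoint pattern above: under the header lock, clone the current
// header as the checkpoint-in-progress copy, clear the dirty bit on the
// current header, and tell the block translation layer a checkpoint started.
#include <cassert>
#include <cstdint>
#include <memory>
#include <mutex>
struct toy_header {
    uint64_t checkpoint_lsn = 0;
    bool dirty = false;
};
struct toy_blocktable {
    bool checkpoint_in_progress = false;
    void note_start_checkpoint_unlocked() { checkpoint_in_progress = true; }
};
struct toy_ft {
    std::mutex lock;
    toy_header header;
    std::unique_ptr<toy_header> checkpoint_header;   // non-null only mid-checkpoint
    toy_blocktable blocktable;
};
static void begin_checkpoint(toy_ft *ft, uint64_t checkpoint_lsn) {
    std::lock_guard<std::mutex> guard(ft->lock);
    assert(ft->checkpoint_header == nullptr);        // not reentrant per ft
    ft->checkpoint_header = std::make_unique<toy_header>(ft->header);
    ft->checkpoint_header->checkpoint_lsn = checkpoint_lsn;
    ft->header.dirty = false;   // the only place the current header's bit is cleared
    ft->blocktable.note_start_checkpoint_unlocked();
}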
@ -235,8 +239,6 @@ ft_hack_highest_unused_msn_for_upgrade_for_checkpoint(FT ft) {
static void ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
FT ft = (FT) header_v;
FT_HEADER ch = ft->checkpoint_header;
//printf("%s:%d allocated_limit=%lu writing queue to %lu\n", __FILE__, __LINE__,
// block_allocator_allocated_limit(h->block_allocator), h->unused_blocks.b*h->nodesize);
assert(ch);
assert(ch->type == FT_CHECKPOINT_INPROGRESS);
if (ch->dirty) { // this is only place this bit is tested (in checkpoint_header)
@ -251,16 +253,15 @@ static void ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
ft_hack_highest_unused_msn_for_upgrade_for_checkpoint(ft);
// write translation and header to disk (or at least to OS internal buffer)
toku_serialize_ft_to(fd, ch, ft->blocktable, ft->cf);
toku_serialize_ft_to(fd, ch, &ft->blocktable, ft->cf);
ch->dirty = 0; // this is only place this bit is cleared (in checkpoint_header)
// fsync the cachefile
toku_cachefile_fsync(cf);
ft->h->checkpoint_count++; // checkpoint succeeded, next checkpoint will save to alternate header location
ft->h->checkpoint_lsn = ch->checkpoint_lsn; //Header updated.
}
else {
toku_block_translation_note_skipped_checkpoint(ft->blocktable);
} else {
ft->blocktable.note_skipped_checkpoint();
}
}
@ -268,14 +269,12 @@ static void ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
// free unused disk space
// (i.e. tell BlockAllocator to liberate blocks used by previous checkpoint).
// Must have access to fd (protected)
static void ft_end_checkpoint (CACHEFILE UU(cachefile), int fd, void *header_v) {
static void ft_end_checkpoint(CACHEFILE UU(cf), int fd, void *header_v) {
FT ft = (FT) header_v;
assert(ft->h->type == FT_CURRENT);
toku_block_translation_note_end_checkpoint(ft->blocktable, fd);
if (ft->checkpoint_header) {
toku_free(ft->checkpoint_header);
ft->checkpoint_header = NULL;
}
ft->blocktable.note_end_checkpoint(fd);
toku_free(ft->checkpoint_header);
ft->checkpoint_header = nullptr;
}
// maps to cf->close_userdata
@ -360,11 +359,6 @@ static void ft_note_unpin_by_checkpoint (CACHEFILE UU(cachefile), void *header_v
// End of Functions that are callbacks to the cachefile
/////////////////////////////////////////////////////////////////////////
void toku_node_save_ct_pair(CACHEKEY UU(key), void *value_data, PAIR p) {
FTNODE CAST_FROM_VOIDP(node, value_data);
node->ct_pair = p;
}
static void setup_initial_ft_root_node(FT ft, BLOCKNUM blocknum) {
FTNODE XCALLOC(node);
toku_initialize_empty_ftnode(node, blocknum, 0, 1, ft->h->layout_version, ft->h->flags);
@ -375,7 +369,7 @@ static void setup_initial_ft_root_node(FT ft, BLOCKNUM blocknum) {
toku_cachetable_put(ft->cf, blocknum, fullhash,
node, make_ftnode_pair_attr(node),
get_write_callbacks_for_node(ft),
toku_node_save_ct_pair);
toku_ftnode_save_ct_pair);
toku_unpin_ftnode(ft, node);
}
@ -386,7 +380,8 @@ static void ft_init(FT ft, FT_OPTIONS options, CACHEFILE cf) {
toku_list_init(&ft->live_ft_handles);
ft->compare_fun = options->compare_fun;
// intuitively, the comparator points to the FT's cmp descriptor
ft->cmp.create(options->compare_fun, &ft->cmp_descriptor, options->memcmp_magic);
ft->update_fun = options->update_fun;
if (ft->cf != NULL) {
@ -407,7 +402,7 @@ static void ft_init(FT ft, FT_OPTIONS options, CACHEFILE cf) {
ft_note_pin_by_checkpoint,
ft_note_unpin_by_checkpoint);
toku_block_verify_no_free_blocknums(ft->blocktable);
ft->blocktable.verify_no_free_blocknums();
}
@ -451,55 +446,48 @@ void toku_ft_create(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) {
invariant(ftp);
FT XCALLOC(ft);
memset(&ft->descriptor, 0, sizeof(ft->descriptor));
memset(&ft->cmp_descriptor, 0, sizeof(ft->cmp_descriptor));
ft->h = ft_header_create(options, make_blocknum(0), (txn ? txn->txnid.parent_id64: TXNID_NONE));
toku_ft_init_reflock(ft);
// Assign blocknum for root block, also dirty the header
toku_blocktable_create_new(&ft->blocktable);
toku_allocate_blocknum(ft->blocktable, &ft->h->root_blocknum, ft);
ft->blocktable.create();
ft->blocktable.allocate_blocknum(&ft->h->root_blocknum, ft);
ft_init(ft, options, cf);
*ftp = ft;
}
// TODO: (Zardosht) get rid of brt parameter
int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_acceptable_lsn, FT *header)
// TODO: (Zardosht) get rid of ft parameter
int toku_read_ft_and_store_in_cachefile (FT_HANDLE ft_handle, CACHEFILE cf, LSN max_acceptable_lsn, FT *header)
// If the cachefile already has the header, then just get it.
// If the cachefile has not been initialized, then don't modify anything.
// max_acceptable_lsn is the latest acceptable checkpointed version of the file.
{
{
FT h;
if ((h = (FT) toku_cachefile_get_userdata(cf))!=0) {
*header = h;
assert(brt->options.update_fun == h->update_fun);
assert(brt->options.compare_fun == h->compare_fun);
return 0;
}
FT ft = nullptr;
if ((ft = (FT) toku_cachefile_get_userdata(cf)) != nullptr) {
*header = ft;
assert(ft_handle->options.update_fun == ft->update_fun);
return 0;
}
FT h = nullptr;
int r;
{
int fd = toku_cachefile_get_fd(cf);
r = toku_deserialize_ft_from(fd, max_acceptable_lsn, &h);
if (r == TOKUDB_BAD_CHECKSUM) {
fprintf(stderr, "Checksum failure while reading header in file %s.\n", toku_cachefile_fname_in_env(cf));
assert(false); // make absolutely sure we crash before doing anything else
}
int fd = toku_cachefile_get_fd(cf);
int r = toku_deserialize_ft_from(fd, max_acceptable_lsn, &ft);
if (r == TOKUDB_BAD_CHECKSUM) {
fprintf(stderr, "Checksum failure while reading header in file %s.\n", toku_cachefile_fname_in_env(cf));
assert(false); // make absolutely sure we crash before doing anything else
} else if (r != 0) {
return r;
}
if (r!=0) return r;
// GCC 4.8 seems to get confused by the gotos in the deserialize code and thinks h is maybe uninitialized.
invariant_notnull(h);
h->cf = cf;
h->compare_fun = brt->options.compare_fun;
h->update_fun = brt->options.update_fun;
invariant_notnull(ft);
// intuitively, the comparator points to the FT's cmp descriptor
ft->cmp.create(ft_handle->options.compare_fun, &ft->cmp_descriptor, ft_handle->options.memcmp_magic);
ft->update_fun = ft_handle->options.update_fun;
ft->cf = cf;
toku_cachefile_set_userdata(cf,
(void*)h,
reinterpret_cast<void *>(ft),
ft_log_fassociate_during_checkpoint,
ft_close,
ft_free,
@ -508,7 +496,7 @@ int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_ac
ft_end_checkpoint,
ft_note_pin_by_checkpoint,
ft_note_unpin_by_checkpoint);
*header = h;
*header = ft;
return 0;
}
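// Illustrative sketch only (hypothetical names), showing the reuse-or-load
// pattern of toku_read_ft_and_store_in_cachefile above: if the cache already
// holds a deserialized header for this file, hand that back and just check
// that the caller's callbacks agree; otherwise load it once, install the
// handle-supplied callback, and publish it in the cache.
#include <cassert>
#include <map>
#include <memory>
#include <string>
struct toy_dict_header {
    std::string fname;
    int (*update_fun)(int old_val, int new_val) = nullptr;
};
static std::map<std::string, std::shared_ptr<toy_dict_header>> g_open_headers;
// pretend deserializer: the real code reads the file and verifies a checksum
static std::shared_ptr<toy_dict_header> load_header_from_disk(const std::string &fname) {
    auto h = std::make_shared<toy_dict_header>();
    h->fname = fname;
    return h;
}
static std::shared_ptr<toy_dict_header>
open_header(const std::string &fname, int (*update_fun)(int, int)) {
    auto it = g_open_headers.find(fname);
    if (it != g_open_headers.end()) {
        // already open: the handle's callbacks must match the shared header's
        assert(it->second->update_fun == update_fun);
        return it->second;
    }
    std::shared_ptr<toy_dict_header> h = load_header_from_disk(fname);
    h->update_fun = update_fun;          // handle-supplied callback is installed once
    g_open_headers[fname] = h;
    return h;
}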
@ -550,22 +538,22 @@ void toku_ft_evict_from_memory(FT ft, bool oplsn_valid, LSN oplsn) {
}
// Verifies there exists exactly one ft handle and returns it.
FT_HANDLE toku_ft_get_only_existing_ft_handle(FT h) {
FT_HANDLE toku_ft_get_only_existing_ft_handle(FT ft) {
FT_HANDLE ft_handle_ret = NULL;
toku_ft_grab_reflock(h);
assert(toku_list_num_elements_est(&h->live_ft_handles) == 1);
ft_handle_ret = toku_list_struct(toku_list_head(&h->live_ft_handles), struct ft_handle, live_ft_handle_link);
toku_ft_release_reflock(h);
toku_ft_grab_reflock(ft);
assert(toku_list_num_elements_est(&ft->live_ft_handles) == 1);
ft_handle_ret = toku_list_struct(toku_list_head(&ft->live_ft_handles), struct ft_handle, live_ft_handle_link);
toku_ft_release_reflock(ft);
return ft_handle_ret;
}
// Purpose: set fields in brt_header to capture accountability info for start of HOT optimize.
// Purpose: set fields in ft_header to capture accountability info for start of HOT optimize.
// Note: HOT accountability variables in header are modified only while holding header lock.
// (Header lock is really needed for touching the dirty bit, but it's useful and
// convenient here for keeping the HOT variables threadsafe.)
void
toku_ft_note_hot_begin(FT_HANDLE brt) {
FT ft = brt->ft;
toku_ft_note_hot_begin(FT_HANDLE ft_handle) {
FT ft = ft_handle->ft;
time_t now = time(NULL);
// hold lock around setting and clearing of dirty bit
@ -578,11 +566,11 @@ toku_ft_note_hot_begin(FT_HANDLE brt) {
}
// Purpose: set fields in brt_header to capture accountability info for end of HOT optimize.
// Purpose: set fields in ft_header to capture accountability info for end of HOT optimize.
// Note: See note for toku_ft_note_hot_begin().
void
toku_ft_note_hot_complete(FT_HANDLE brt, bool success, MSN msn_at_start_of_hot) {
FT ft = brt->ft;
toku_ft_note_hot_complete(FT_HANDLE ft_handle, bool success, MSN msn_at_start_of_hot) {
FT ft = ft_handle->ft;
time_t now = time(NULL);
toku_ft_lock(ft);
@ -620,6 +608,7 @@ toku_ft_init(FT ft,
.compression_method = compression_method,
.fanout = fanout,
.flags = 0,
.memcmp_magic = 0,
.compare_fun = NULL,
.update_fun = NULL
};
@ -628,29 +617,29 @@ toku_ft_init(FT ft,
ft->h->checkpoint_lsn = checkpoint_lsn;
}
// Open a brt for use by redirect. The new brt must have the same dict_id as the old_ft passed in. (FILENUM is assigned by the ft_handle_open() function.)
// Open an ft for use by redirect. The new ft must have the same dict_id as the old_ft passed in. (FILENUM is assigned by the ft_handle_open() function.)
static int
ft_handle_open_for_redirect(FT_HANDLE *new_ftp, const char *fname_in_env, TOKUTXN txn, FT old_h) {
FT_HANDLE t;
assert(old_h->dict_id.dictid != DICTIONARY_ID_NONE.dictid);
toku_ft_handle_create(&t);
toku_ft_set_bt_compare(t, old_h->compare_fun);
toku_ft_set_update(t, old_h->update_fun);
toku_ft_handle_set_nodesize(t, old_h->h->nodesize);
toku_ft_handle_set_basementnodesize(t, old_h->h->basementnodesize);
toku_ft_handle_set_compression_method(t, old_h->h->compression_method);
toku_ft_handle_set_fanout(t, old_h->h->fanout);
CACHETABLE ct = toku_cachefile_get_cachetable(old_h->cf);
int r = toku_ft_handle_open_with_dict_id(t, fname_in_env, 0, 0, ct, txn, old_h->dict_id);
ft_handle_open_for_redirect(FT_HANDLE *new_ftp, const char *fname_in_env, TOKUTXN txn, FT old_ft) {
FT_HANDLE ft_handle;
assert(old_ft->dict_id.dictid != DICTIONARY_ID_NONE.dictid);
toku_ft_handle_create(&ft_handle);
toku_ft_set_bt_compare(ft_handle, old_ft->cmp.get_compare_func());
toku_ft_set_update(ft_handle, old_ft->update_fun);
toku_ft_handle_set_nodesize(ft_handle, old_ft->h->nodesize);
toku_ft_handle_set_basementnodesize(ft_handle, old_ft->h->basementnodesize);
toku_ft_handle_set_compression_method(ft_handle, old_ft->h->compression_method);
toku_ft_handle_set_fanout(ft_handle, old_ft->h->fanout);
CACHETABLE ct = toku_cachefile_get_cachetable(old_ft->cf);
int r = toku_ft_handle_open_with_dict_id(ft_handle, fname_in_env, 0, 0, ct, txn, old_ft->dict_id);
if (r != 0) {
goto cleanup;
}
assert(t->ft->dict_id.dictid == old_h->dict_id.dictid);
*new_ftp = t;
assert(ft_handle->ft->dict_id.dictid == old_ft->dict_id.dictid);
*new_ftp = ft_handle;
cleanup:
if (r != 0) {
toku_ft_handle_close(t);
toku_ft_handle_close(ft_handle);
}
return r;
}
@ -658,81 +647,81 @@ ft_handle_open_for_redirect(FT_HANDLE *new_ftp, const char *fname_in_env, TOKUTX
// This function performs most of the work to redirect a dictionary to different file.
// It is called for redirect and to abort a redirect. (This function is almost its own inverse.)
static int
dictionary_redirect_internal(const char *dst_fname_in_env, FT src_h, TOKUTXN txn, FT *dst_hp) {
dictionary_redirect_internal(const char *dst_fname_in_env, FT src_ft, TOKUTXN txn, FT *dst_ftp) {
int r;
FILENUM src_filenum = toku_cachefile_filenum(src_h->cf);
FILENUM src_filenum = toku_cachefile_filenum(src_ft->cf);
FILENUM dst_filenum = FILENUM_NONE;
FT dst_h = NULL;
FT dst_ft = NULL;
struct toku_list *list;
// open a dummy brt based off of
// open a dummy ft based off of
// dst_fname_in_env to get the header
// then we will change all the brt's to have
// their headers point to dst_h instead of src_h
// then we will change all the ft's to have
// their headers point to dst_ft instead of src_ft
FT_HANDLE tmp_dst_ft = NULL;
r = ft_handle_open_for_redirect(&tmp_dst_ft, dst_fname_in_env, txn, src_h);
r = ft_handle_open_for_redirect(&tmp_dst_ft, dst_fname_in_env, txn, src_ft);
if (r != 0) {
goto cleanup;
}
dst_h = tmp_dst_ft->ft;
dst_ft = tmp_dst_ft->ft;
// some sanity checks on dst_filenum
dst_filenum = toku_cachefile_filenum(dst_h->cf);
dst_filenum = toku_cachefile_filenum(dst_ft->cf);
assert(dst_filenum.fileid!=FILENUM_NONE.fileid);
assert(dst_filenum.fileid!=src_filenum.fileid); //Cannot be same file.
// for each live brt, brt->ft is currently src_h
// for each live ft_handle, ft_handle->ft is currently src_ft
// we want to change it to dummy_dst
toku_ft_grab_reflock(src_h);
while (!toku_list_empty(&src_h->live_ft_handles)) {
list = src_h->live_ft_handles.next;
toku_ft_grab_reflock(src_ft);
while (!toku_list_empty(&src_ft->live_ft_handles)) {
list = src_ft->live_ft_handles.next;
FT_HANDLE src_handle = NULL;
src_handle = toku_list_struct(list, struct ft_handle, live_ft_handle_link);
toku_list_remove(&src_handle->live_ft_handle_link);
toku_ft_note_ft_handle_open(dst_h, src_handle);
toku_ft_note_ft_handle_open(dst_ft, src_handle);
if (src_handle->redirect_callback) {
src_handle->redirect_callback(src_handle, src_handle->redirect_callback_extra);
}
}
assert(dst_h);
// making sure that we are not leaking src_h
assert(toku_ft_needed_unlocked(src_h));
toku_ft_release_reflock(src_h);
assert(dst_ft);
// making sure that we are not leaking src_ft
assert(toku_ft_needed_unlocked(src_ft));
toku_ft_release_reflock(src_ft);
toku_ft_handle_close(tmp_dst_ft);
*dst_hp = dst_h;
*dst_ftp = dst_ft;
cleanup:
return r;
}
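// Illustrative sketch only (hypothetical types), showing the handle-migration
// loop at the heart of dictionary_redirect_internal above: while the source
// header still has live handles, pop one, attach it to the destination header,
// and fire its redirect callback so the layer above can repoint its cursors.
#include <cassert>
#include <list>
struct toy_hdr;
struct toy_handle {
    toy_hdr *ft = nullptr;
    void (*redirect_callback)(toy_handle *, void *) = nullptr;
    void *redirect_callback_extra = nullptr;
};
struct toy_hdr {
    std::list<toy_handle *> live_handles;
};
static void redirect_handles(toy_hdr *src, toy_hdr *dst) {
    while (!src->live_handles.empty()) {
        toy_handle *h = src->live_handles.front();
        src->live_handles.pop_front();
        h->ft = dst;                       // handle now points at the new header
        dst->live_handles.push_back(h);
        if (h->redirect_callback != nullptr) {
            h->redirect_callback(h, h->redirect_callback_extra);
        }
    }
    assert(src->live_handles.empty());
}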
//This is the 'abort redirect' function. The redirect of old_h to new_h was done
//and now must be undone, so here we redirect new_h back to old_h.
//This is the 'abort redirect' function. The redirect of old_ft to new_ft was done
//and now must be undone, so here we redirect new_ft back to old_ft.
int
toku_dictionary_redirect_abort(FT old_h, FT new_h, TOKUTXN txn) {
char *old_fname_in_env = toku_cachefile_fname_in_env(old_h->cf);
toku_dictionary_redirect_abort(FT old_ft, FT new_ft, TOKUTXN txn) {
char *old_fname_in_env = toku_cachefile_fname_in_env(old_ft->cf);
int r;
{
FILENUM old_filenum = toku_cachefile_filenum(old_h->cf);
FILENUM new_filenum = toku_cachefile_filenum(new_h->cf);
FILENUM old_filenum = toku_cachefile_filenum(old_ft->cf);
FILENUM new_filenum = toku_cachefile_filenum(new_ft->cf);
assert(old_filenum.fileid!=new_filenum.fileid); //Cannot be same file.
//No living brts in old header.
toku_ft_grab_reflock(old_h);
assert(toku_list_empty(&old_h->live_ft_handles));
toku_ft_release_reflock(old_h);
//No living fts in old header.
toku_ft_grab_reflock(old_ft);
assert(toku_list_empty(&old_ft->live_ft_handles));
toku_ft_release_reflock(old_ft);
}
FT dst_h;
// redirect back from new_h to old_h
r = dictionary_redirect_internal(old_fname_in_env, new_h, txn, &dst_h);
FT dst_ft;
// redirect back from new_ft to old_ft
r = dictionary_redirect_internal(old_fname_in_env, new_ft, txn, &dst_ft);
if (r == 0) {
assert(dst_h == old_h);
assert(dst_ft == old_ft);
}
return r;
}
@ -740,13 +729,13 @@ toku_dictionary_redirect_abort(FT old_h, FT new_h, TOKUTXN txn) {
/****
* on redirect or abort:
* if redirect txn_note_doing_work(txn)
* if redirect connect src brt to txn (txn modified this brt)
* for each src brt
* open brt to dst file (create new brt struct)
* if redirect connect dst brt to txn
* redirect db to new brt
* redirect cursors to new brt
* close all src brts
* if redirect connect src ft to txn (txn modified this ft)
* for each src ft
* open ft to dst file (create new ft struct)
* if redirect connect dst ft to txn
* redirect db to new ft
* redirect cursors to new ft
* close all src fts
* if redirect make rollback log entry
*
* on commit:
@ -758,21 +747,21 @@ int
toku_dictionary_redirect (const char *dst_fname_in_env, FT_HANDLE old_ft_h, TOKUTXN txn) {
// Input args:
// new file name for dictionary (relative to env)
// old_ft_h is a live brt of open handle ({DB, BRT} pair) that currently refers to old dictionary file.
// old_ft_h is a live ft of open handle ({DB, FT_HANDLE} pair) that currently refers to old dictionary file.
// (old_ft_h may be one of many handles to the dictionary.)
// txn that created the loader
// Requires:
// multi operation lock is held.
// The brt is open. (which implies there can be no zombies.)
// The ft is open. (which implies there can be no zombies.)
// The new file must be a valid dictionary.
// The block size and flags in the new file must match the existing BRT.
// The block size and flags in the new file must match the existing FT.
// The new file must already have its descriptor in it (and it must match the existing descriptor).
// Effect:
// Open new FTs (and related header and cachefile) to the new dictionary file with a new FILENUM.
// Redirect all DBs that point to brts that point to the old file to point to brts that point to the new file.
// Redirect all DBs that point to fts that point to the old file to point to fts that point to the new file.
// Copy the dictionary id (dict_id) from the header of the original file to the header of the new file.
// Create a rollback log entry.
// The original BRT, header, cachefile and file remain unchanged. They will be cleaned up on commit.
// The original FT, header, cachefile and file remain unchanged. They will be cleaned up on commit.
// If the txn aborts, then this operation will be undone
int r;
@ -881,18 +870,17 @@ toku_ft_stat64 (FT ft, struct ftstat64_s *s) {
s->verify_time_sec = ft->h->time_of_last_verification;
}
void
toku_ft_get_fractal_tree_info64(FT ft, struct ftinfo64 *s) {
toku_blocktable_get_info64(ft->blocktable, s);
void toku_ft_get_fractal_tree_info64(FT ft, struct ftinfo64 *info) {
ft->blocktable.get_info64(info);
}
int toku_ft_iterate_fractal_tree_block_map(FT ft, int (*iter)(uint64_t,int64_t,int64_t,int64_t,int64_t,void*), void *iter_extra) {
uint64_t this_checkpoint_count = ft->h->checkpoint_count;
return toku_blocktable_iterate_translation_tables(ft->blocktable, this_checkpoint_count, iter, iter_extra);
return ft->blocktable.iterate_translation_tables(this_checkpoint_count, iter, iter_extra);
}
void
toku_ft_update_descriptor(FT ft, DESCRIPTOR d)
toku_ft_update_descriptor(FT ft, DESCRIPTOR desc)
// Effect: Changes the descriptor in a tree (log the change, make sure it makes it to disk eventually).
// requires: the ft is fully user-opened with a valid cachefile.
// descriptor updates cannot happen in parallel for an FT
@ -900,7 +888,7 @@ toku_ft_update_descriptor(FT ft, DESCRIPTOR d)
{
assert(ft->cf);
int fd = toku_cachefile_get_fd(ft->cf);
toku_ft_update_descriptor_with_fd(ft, d, fd);
toku_ft_update_descriptor_with_fd(ft, desc, fd);
}
// update the descriptor for an ft and serialize it using
@ -909,41 +897,30 @@ toku_ft_update_descriptor(FT ft, DESCRIPTOR d)
// update a descriptor before the ft is fully opened and has
// a valid cachefile.
void
toku_ft_update_descriptor_with_fd(FT ft, DESCRIPTOR d, int fd) {
toku_ft_update_descriptor_with_fd(FT ft, DESCRIPTOR desc, int fd) {
// the checksum is four bytes, so that's where the magic number comes from
// make space for the new descriptor and write it out to disk
DISKOFF offset, size;
size = toku_serialize_descriptor_size(d) + 4;
toku_realloc_descriptor_on_disk(ft->blocktable, size, &offset, ft, fd);
toku_serialize_descriptor_contents_to_fd(fd, d, offset);
size = toku_serialize_descriptor_size(desc) + 4;
ft->blocktable.realloc_descriptor_on_disk(size, &offset, ft, fd);
toku_serialize_descriptor_contents_to_fd(fd, desc, offset);
// cleanup the old descriptor and set the in-memory descriptor to the new one
if (ft->descriptor.dbt.data) {
toku_free(ft->descriptor.dbt.data);
}
ft->descriptor.dbt.size = d->dbt.size;
ft->descriptor.dbt.data = toku_memdup(d->dbt.data, d->dbt.size);
toku_destroy_dbt(&ft->descriptor.dbt);
toku_clone_dbt(&ft->descriptor.dbt, desc->dbt);
}
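// Illustrative sketch only (hypothetical toy types, a plain std::vector instead
// of DBTs), showing the order of operations in toku_ft_update_descriptor_with_fd
// above: first make room on disk and write the new descriptor out, then destroy
// the old in-memory copy and clone the new one in its place.
#include <cstdio>
#include <vector>
typedef std::vector<unsigned char> toy_descriptor;   // stand-in for a DBT payload
struct toy_ft_desc {
    toy_descriptor descriptor;        // in-memory copy of what is on disk
};
// pretend serializer: the real code reserves a block via the block table and
// appends a checksum (hence the "+ 4" in the code above)
static bool write_descriptor_to_file(FILE *f, const toy_descriptor &d) {
    return fwrite(d.data(), 1, d.size(), f) == d.size() && fflush(f) == 0;
}
static bool update_descriptor(toy_ft_desc *ft, FILE *f, const toy_descriptor &new_desc) {
    // 1. persist the new descriptor first ...
    if (!write_descriptor_to_file(f, new_desc)) {
        return false;                 // in-memory copy untouched on failure
    }
    // 2. ... then swap the in-memory copy (destroy old, clone new)
    ft->descriptor = new_desc;
    return true;
}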
void
toku_ft_update_cmp_descriptor(FT ft) {
if (ft->cmp_descriptor.dbt.data != NULL) {
toku_free(ft->cmp_descriptor.dbt.data);
}
ft->cmp_descriptor.dbt.size = ft->descriptor.dbt.size;
ft->cmp_descriptor.dbt.data = toku_xmemdup(
ft->descriptor.dbt.data,
ft->descriptor.dbt.size
);
void toku_ft_update_cmp_descriptor(FT ft) {
// cleanup the old cmp descriptor and clone it as the in-memory descriptor
toku_destroy_dbt(&ft->cmp_descriptor.dbt);
toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt);
}
DESCRIPTOR
toku_ft_get_descriptor(FT_HANDLE ft_handle) {
DESCRIPTOR toku_ft_get_descriptor(FT_HANDLE ft_handle) {
return &ft_handle->ft->descriptor;
}
DESCRIPTOR
toku_ft_get_cmp_descriptor(FT_HANDLE ft_handle) {
DESCRIPTOR toku_ft_get_cmp_descriptor(FT_HANDLE ft_handle) {
return &ft_handle->ft->cmp_descriptor;
}
@ -1068,8 +1045,8 @@ garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *e
struct garbage_helper_extra *CAST_FROM_VOIDP(info, extra);
FTNODE node;
FTNODE_DISK_DATA ndd;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->ft);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(info->ft);
int fd = toku_cachefile_get_fd(info->ft->cf);
int r = toku_deserialize_ftnode_from(fd, blocknum, 0, &node, &ndd, &bfe);
if (r != 0) {
@ -1079,8 +1056,8 @@ garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *e
goto exit;
}
for (int i = 0; i < node->n_children; ++i) {
BN_DATA bd = BLB_DATA(node, i);
r = bd->omt_iterate<struct garbage_helper_extra, garbage_leafentry_helper>(info);
bn_data* bd = BLB_DATA(node, i);
r = bd->iterate<struct garbage_helper_extra, garbage_leafentry_helper>(info);
if (r != 0) {
goto exit;
}
@ -1103,7 +1080,7 @@ void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space) {
.total_space = 0,
.used_space = 0
};
toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, garbage_helper, &info, true, true);
ft->blocktable.iterate(block_table::TRANSLATION_CHECKPOINTED, garbage_helper, &info, true, true);
*total_space = info.total_space;
*used_space = info.used_space;
}
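// Illustrative sketch only (hypothetical toy types), showing the accounting
// idea behind toku_ft_get_garbage above: walk every checkpointed block, count
// its full on-disk size as "total", count only the live leaf entries it still
// holds as "used"; the difference is space reclaimable as MVCC garbage.
#include <cstdint>
#include <vector>
struct toy_block {
    uint64_t size_on_disk = 0;
    std::vector<uint64_t> live_leafentry_sizes;   // sizes of entries still reachable
};
static void get_garbage(const std::vector<toy_block> &blocks,
                        uint64_t *total_space, uint64_t *used_space) {
    uint64_t total = 0, used = 0;
    for (const toy_block &b : blocks) {
        total += b.size_on_disk;
        for (uint64_t s : b.live_leafentry_sizes) {
            used += s;
        }
    }
    *total_space = total;
    *used_space = used;      // total - used is the garbage estimate
}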
@ -1113,8 +1090,6 @@ void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space) {
#error
#endif
#define xstr(X) str(X)
#define str(X) #X
#define static_version_string xstr(DB_VERSION_MAJOR) "." \
@ -1124,10 +1099,9 @@ void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space) {
struct toku_product_name_strings_struct toku_product_name_strings;
char toku_product_name[TOKU_MAX_PRODUCT_NAME_LENGTH];
void
tokudb_update_product_name_strings(void) {
//DO ALL STRINGS HERE.. maybe have a separate FT layer version as well
{ // Version string
void tokuft_update_product_name_strings(void) {
// DO ALL STRINGS HERE.. maybe have a separate FT layer version as well
{
int n = snprintf(toku_product_name_strings.db_version,
sizeof(toku_product_name_strings.db_version),
"%s %s", toku_product_name, static_version_string);
@ -1179,7 +1153,7 @@ toku_single_process_lock(const char *lock_dir, const char *which, int *lockfd) {
*lockfd = toku_os_lock_file(lockfname);
if (*lockfd < 0) {
int e = get_error_errno();
fprintf(stderr, "Couldn't start tokudb because some other tokudb process is using the same directory [%s] for [%s]\n", lock_dir, which);
fprintf(stderr, "Couldn't start tokuft because some other tokuft process is using the same directory [%s] for [%s]\n", lock_dir, which);
return e;
}
return 0;
@ -1197,10 +1171,10 @@ toku_single_process_unlock(int *lockfd) {
return 0;
}
int tokudb_num_envs = 0;
int tokuft_num_envs = 0;
int
db_env_set_toku_product_name(const char *name) {
if (tokudb_num_envs > 0) {
if (tokuft_num_envs > 0) {
return EINVAL;
}
if (!name || strlen(name) < 1) {
@ -1211,7 +1185,7 @@ db_env_set_toku_product_name(const char *name) {
}
if (strncmp(toku_product_name, name, sizeof(toku_product_name))) {
strcpy(toku_product_name, name);
tokudb_update_product_name_strings();
tokuft_update_product_name_strings();
}
return 0;
}

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_H
#define FT_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,17 +86,20 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fttypes.h"
#include "ybt.h"
#include <db.h>
#include "cachetable.h"
#include "log.h"
#include "ft-search.h"
#include "ft-ops.h"
#include "compress.h"
#include "ft/cachetable/cachetable.h"
#include "ft/ft-ops.h"
#include "ft/logger/log.h"
#include "util/dbt.h"
typedef struct ft *FT;
typedef struct ft_options *FT_OPTIONS;
// unlink a ft from the filesystem with or without a txn.
// if with a txn, then the unlink happens on commit.
@ -110,10 +111,13 @@ void toku_ft_destroy_reflock(FT ft);
void toku_ft_grab_reflock(FT ft);
void toku_ft_release_reflock(FT ft);
void toku_ft_create(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn);
void toku_ft_free (FT h);
void toku_ft_lock(struct ft *ft);
void toku_ft_unlock(struct ft *ft);
int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_acceptable_lsn, FT *header);
void toku_ft_create(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn);
void toku_ft_free (FT ft);
int toku_read_ft_and_store_in_cachefile (FT_HANDLE ft_h, CACHEFILE cf, LSN max_acceptable_lsn, FT *header);
void toku_ft_note_ft_handle_open(FT ft, FT_HANDLE live);
bool toku_ft_needed_unlocked(FT ft);
@ -123,10 +127,10 @@ bool toku_ft_has_one_reference_unlocked(FT ft);
// will have to read in the ft in a new cachefile and new FT object.
void toku_ft_evict_from_memory(FT ft, bool oplsn_valid, LSN oplsn);
FT_HANDLE toku_ft_get_only_existing_ft_handle(FT h);
FT_HANDLE toku_ft_get_only_existing_ft_handle(FT ft);
void toku_ft_note_hot_begin(FT_HANDLE brt);
void toku_ft_note_hot_complete(FT_HANDLE brt, bool success, MSN msn_at_start_of_hot);
void toku_ft_note_hot_begin(FT_HANDLE ft_h);
void toku_ft_note_hot_complete(FT_HANDLE ft_h, bool success, MSN msn_at_start_of_hot);
void
toku_ft_init(
@ -142,29 +146,29 @@ toku_ft_init(
int toku_dictionary_redirect_abort(FT old_h, FT new_h, TOKUTXN txn) __attribute__ ((warn_unused_result));
int toku_dictionary_redirect (const char *dst_fname_in_env, FT_HANDLE old_ft, TOKUTXN txn);
void toku_reset_root_xid_that_created(FT h, TXNID new_root_xid_that_created);
void toku_reset_root_xid_that_created(FT ft, TXNID new_root_xid_that_created);
// Reset the root_xid_that_created field to the given value.
// This redefines which xid created the dictionary.
void toku_ft_add_txn_ref(FT h);
void toku_ft_remove_txn_ref(FT h);
void toku_ft_add_txn_ref(FT ft);
void toku_ft_remove_txn_ref(FT ft);
void toku_calculate_root_offset_pointer ( FT h, CACHEKEY* root_key, uint32_t *roothash);
void toku_ft_set_new_root_blocknum(FT h, CACHEKEY new_root_key);
LSN toku_ft_checkpoint_lsn(FT h) __attribute__ ((warn_unused_result));
void toku_ft_stat64 (FT h, struct ftstat64_s *s);
void toku_ft_get_fractal_tree_info64 (FT h, struct ftinfo64 *s);
void toku_calculate_root_offset_pointer (FT ft, CACHEKEY* root_key, uint32_t *roothash);
void toku_ft_set_new_root_blocknum(FT ft, CACHEKEY new_root_key);
LSN toku_ft_checkpoint_lsn(FT ft) __attribute__ ((warn_unused_result));
void toku_ft_stat64 (FT ft, struct ftstat64_s *s);
void toku_ft_get_fractal_tree_info64 (FT ft, struct ftinfo64 *s);
int toku_ft_iterate_fractal_tree_block_map(FT ft, int (*iter)(uint64_t,int64_t,int64_t,int64_t,int64_t,void*), void *iter_extra);
// unconditionally set the descriptor for an open FT. can't do this when
// any operation has already occurred on the ft.
// see toku_ft_change_descriptor(), which is the transactional version
// used by the ydb layer. it better describes the client contract.
void toku_ft_update_descriptor(FT ft, DESCRIPTOR d);
void toku_ft_update_descriptor(FT ft, DESCRIPTOR desc);
// use this version if the FT is not fully user-opened with a valid cachefile.
// this is a clean hack to get deserialization code to update a descriptor
// while the FT and cf are in the process of opening, for upgrade purposes
void toku_ft_update_descriptor_with_fd(FT ft, DESCRIPTOR d, int fd);
void toku_ft_update_descriptor_with_fd(FT ft, DESCRIPTOR desc, int fd);
void toku_ft_update_cmp_descriptor(FT ft);
// get the descriptor for a ft. safe to read as long as clients honor the
@ -174,9 +178,17 @@ void toku_ft_update_cmp_descriptor(FT ft);
DESCRIPTOR toku_ft_get_descriptor(FT_HANDLE ft_handle);
DESCRIPTOR toku_ft_get_cmp_descriptor(FT_HANDLE ft_handle);
typedef struct {
// delta versions in basements could be negative
int64_t numrows;
int64_t numbytes;
} STAT64INFO_S, *STAT64INFO;
static const STAT64INFO_S ZEROSTATS = { .numrows = 0, .numbytes = 0};
void toku_ft_update_stats(STAT64INFO headerstats, STAT64INFO_S delta);
void toku_ft_decrease_stats(STAT64INFO headerstats, STAT64INFO_S delta);
typedef void (*remove_ft_ref_callback)(FT ft, void *extra);
void toku_ft_remove_reference(FT ft,
bool oplsn_valid, LSN oplsn,
remove_ft_ref_callback remove_ref, void *extra);
@ -189,7 +201,6 @@ void toku_ft_set_compression_method(FT ft, enum toku_compression_method method);
void toku_ft_get_compression_method(FT ft, enum toku_compression_method *methodp);
void toku_ft_set_fanout(FT ft, unsigned int fanout);
void toku_ft_get_fanout(FT ft, unsigned int *fanout);
void toku_node_save_ct_pair(CACHEKEY UU(key), void *value_data, PAIR p);
// mark the ft as a blackhole. any message injections will be a no op.
void toku_ft_set_blackhole(FT_HANDLE ft_handle);
@ -198,15 +209,17 @@ void toku_ft_set_blackhole(FT_HANDLE ft_handle);
// The difference between the two is MVCC garbage.
void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space);
// TODO: Should be in portability
int get_num_cores(void);
// TODO: Use the cachetable's worker pool instead of something managed by the FT...
struct toku_thread_pool *get_ft_pool(void);
void dump_bad_block(unsigned char *vp, uint64_t size);
// TODO: Should be in portability
int toku_single_process_lock(const char *lock_dir, const char *which, int *lockfd);
int toku_single_process_unlock(int *lockfd);
void tokudb_update_product_name_strings(void);
void tokuft_update_product_name_strings(void);
#define TOKU_MAX_PRODUCT_NAME_LENGTH (256)
extern char toku_product_name[TOKU_MAX_PRODUCT_NAME_LENGTH];
@ -219,5 +232,4 @@ struct toku_product_name_strings_struct {
};
extern struct toku_product_name_strings_struct toku_product_name_strings;
extern int tokudb_num_envs;
#endif
extern int tokuft_num_envs;


@ -1,382 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FTTYPES_H
#define FTTYPES_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <sys/types.h>
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#endif
#define _FILE_OFFSET_BITS 64
#include "toku_assert.h"
#include <db.h>
#include <inttypes.h>
// Use the C++ bool and constants (true false), rather than BOOL, TRUE, and FALSE.
typedef struct ft_handle *FT_HANDLE;
typedef struct ftnode *FTNODE;
typedef struct ftnode_disk_data *FTNODE_DISK_DATA;
typedef struct ftnode_leaf_basement_node *BASEMENTNODE;
typedef struct ftnode_nonleaf_childinfo *NONLEAF_CHILDINFO;
typedef struct sub_block *SUB_BLOCK;
typedef struct ft *FT;
typedef struct ft_header *FT_HEADER;
typedef struct ft_options *FT_OPTIONS;
struct wbuf;
struct dbuf;
typedef unsigned int ITEMLEN;
typedef const void *bytevec;
typedef int64_t DISKOFF; /* Offset in a disk. -1 is the NULL pointer. */
typedef uint64_t TXNID;
typedef struct txnid_pair_s {
TXNID parent_id64;
TXNID child_id64;
} TXNID_PAIR;
#define TXNID_NONE_LIVING ((TXNID)0)
#define TXNID_NONE ((TXNID)0)
#define TXNID_MAX ((TXNID)-1)
static const TXNID_PAIR TXNID_PAIR_NONE = { .parent_id64 = TXNID_NONE, .child_id64 = TXNID_NONE };
typedef struct blocknum_s { int64_t b; } BLOCKNUM; // make a struct so that we will notice type problems.
typedef struct gid_s { uint8_t *gid; } GID; // the gid is of size [DB_GID_SIZE]
typedef TOKU_XA_XID *XIDP; // this is the type that's passed to the logger code (so that we don't have to copy all 152 bytes when only a subset are even valid.)
#define ROLLBACK_NONE ((BLOCKNUM){0})
static inline BLOCKNUM make_blocknum(int64_t b) { BLOCKNUM result={b}; return result; }
// This struct holds information about values stored in the cachetable.
// As one can tell from the names, we are probably violating an
// abstraction layer by using ft-specific names here.
//
// The purpose of having this struct is to give the cachetable a way
// to accumulate the totals we are interested in.
// Breaking this abstraction layer by having these names was the
// easiest way.
//
typedef struct pair_attr_s {
long size; // size PAIR's value takes in memory
long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status
long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status
long rollback_size; // size if PAIR is a rollback node, 0 otherwise, used only for engine status
long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts
bool is_valid;
} PAIR_ATTR;
static inline PAIR_ATTR make_pair_attr(long size) {
PAIR_ATTR result={
.size = size,
.nonleaf_size = 0,
.leaf_size = 0,
.rollback_size = 0,
.cache_pressure_size = 0,
.is_valid = true
};
return result;
}
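A minimal sketch of rolling these per-PAIR attributes up into the totals the comment above mentions; aggregate_attrs is a hypothetical helper and only illustrates the intent of the fields:
static PAIR_ATTR aggregate_attrs(const PAIR_ATTR *attrs, int n) {
    // sum each engine-status field across a set of cachetable PAIRs
    PAIR_ATTR total = make_pair_attr(0);
    for (int i = 0; i < n; i++) {
        total.size                += attrs[i].size;
        total.nonleaf_size        += attrs[i].nonleaf_size;
        total.leaf_size           += attrs[i].leaf_size;
        total.rollback_size       += attrs[i].rollback_size;
        total.cache_pressure_size += attrs[i].cache_pressure_size;
    }
    return total;
}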
typedef struct {
uint32_t len;
char *data;
} BYTESTRING;
/* Log Sequence Number (LSN)
* Make the LSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_lsn { uint64_t lsn; } LSN;
#define ZERO_LSN ((LSN){0})
#define MAX_LSN ((LSN){UINT64_MAX})
/* Message Sequence Number (MSN)
* Make the MSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_msn { uint64_t msn; } MSN;
#define ZERO_MSN ((MSN){0}) // dummy used for message construction, to be filled in when msg is applied to tree
#define MIN_MSN ((MSN){(uint64_t)1 << 62}) // first 2^62 values reserved for messages created before Dr. No (for upgrade)
#define MAX_MSN ((MSN){UINT64_MAX})
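As the comments above explain, BLOCKNUM, LSN, and MSN wrap a single integer in a one-field struct purely so the compiler can tell them apart; a minimal sketch of the mix-up this catches (record_root and illustrate_type_checking are hypothetical helpers):
static void record_root(BLOCKNUM root) { (void) root; }
static void illustrate_type_checking(void) {
    BLOCKNUM root = make_blocknum(7);
    LSN checkpoint = { 42 };           // an LSN, not a block number
    record_root(root);                 // compiles: a BLOCKNUM is expected
    // record_root(checkpoint);        // would not compile: LSN is a distinct struct type
    (void) checkpoint;
}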
typedef struct {
int64_t numrows; // delta versions in basements could be negative
int64_t numbytes;
} STAT64INFO_S, *STAT64INFO;
static const STAT64INFO_S ZEROSTATS = {0,0};
/* At the brt layer, a FILENUM uniquely identifies an open file.
* At the ydb layer, a DICTIONARY_ID uniquely identifies an open dictionary.
* With the introduction of the loader (ticket 2216), it is possible for the file that holds
* an open dictionary to change, so these are now separate and independent unique identifiers.
*/
typedef struct {uint32_t fileid;} FILENUM;
#define FILENUM_NONE ((FILENUM){UINT32_MAX})
typedef struct {uint64_t dictid;} DICTIONARY_ID;
#define DICTIONARY_ID_NONE ((DICTIONARY_ID){0})
typedef struct {
uint32_t num;
FILENUM *filenums;
} FILENUMS;
typedef struct tokulogger *TOKULOGGER;
typedef struct txn_manager *TXN_MANAGER;
#define NULL_LOGGER ((TOKULOGGER)0)
typedef struct tokutxn *TOKUTXN;
typedef struct txninfo *TXNINFO;
#define NULL_TXN ((TOKUTXN)0)
struct logged_btt_pair {
DISKOFF off;
int32_t size;
};
typedef struct cachetable *CACHETABLE;
typedef struct cachefile *CACHEFILE;
typedef struct ctpair *PAIR;
typedef class checkpointer *CHECKPOINTER;
typedef class bn_data *BN_DATA;
/* tree command types */
enum ft_msg_type {
FT_NONE = 0,
FT_INSERT = 1,
FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE.
//FT_DELETE_BOTH = 3,
FT_ABORT_ANY = 4, // Abort any commands on any matching key.
//FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value
FT_COMMIT_ANY = 6,
//FT_COMMIT_BOTH = 7,
FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions).
FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction).
FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (abort specific transaction).
FT_INSERT_NO_OVERWRITE = 11,
FT_OPTIMIZE = 12, // Broadcast
FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode
FT_UPDATE = 14,
FT_UPDATE_BROADCAST_ALL = 15
};
static inline bool
ft_msg_type_applies_once(enum ft_msg_type type)
{
bool ret_val;
switch (type) {
case FT_INSERT_NO_OVERWRITE:
case FT_INSERT:
case FT_DELETE_ANY:
case FT_ABORT_ANY:
case FT_COMMIT_ANY:
case FT_UPDATE:
ret_val = true;
break;
case FT_COMMIT_BROADCAST_ALL:
case FT_COMMIT_BROADCAST_TXN:
case FT_ABORT_BROADCAST_TXN:
case FT_OPTIMIZE:
case FT_OPTIMIZE_FOR_UPGRADE:
case FT_UPDATE_BROADCAST_ALL:
case FT_NONE:
ret_val = false;
break;
default:
assert(false);
}
return ret_val;
}
static inline bool
ft_msg_type_applies_all(enum ft_msg_type type)
{
bool ret_val;
switch (type) {
case FT_NONE:
case FT_INSERT_NO_OVERWRITE:
case FT_INSERT:
case FT_DELETE_ANY:
case FT_ABORT_ANY:
case FT_COMMIT_ANY:
case FT_UPDATE:
ret_val = false;
break;
case FT_COMMIT_BROADCAST_ALL:
case FT_COMMIT_BROADCAST_TXN:
case FT_ABORT_BROADCAST_TXN:
case FT_OPTIMIZE:
case FT_OPTIMIZE_FOR_UPGRADE:
case FT_UPDATE_BROADCAST_ALL:
ret_val = true;
break;
default:
assert(false);
}
return ret_val;
}
static inline bool
ft_msg_type_does_nothing(enum ft_msg_type type)
{
return (type == FT_NONE);
}
typedef struct xids_t *XIDS;
typedef struct fifo_msg_t *FIFO_MSG;
/* tree commands */
struct ft_msg {
enum ft_msg_type type;
MSN msn; // message sequence number
XIDS xids;
union {
/* insert or delete */
struct ft_cmd_insert_delete {
const DBT *key; // for insert, delete, upsertdel
const DBT *val; // for insert, delete, (and it is the "extra" for upsertdel, upsertdel_broadcast_all)
} id;
} u;
};
// Message sent into brt to implement command (insert, delete, etc.)
// This structure supports nested transactions, and obsoletes the earlier brt command structure.
typedef struct ft_msg FT_MSG_S;
typedef struct ft_msg *FT_MSG;
typedef int (*ft_compare_func)(DB *, const DBT *, const DBT *);
typedef void (*setval_func)(const DBT *, void *);
typedef int (*ft_update_func)(DB *, const DBT *, const DBT *, const DBT *, setval_func, void *);
typedef void (*on_redirect_callback)(FT_HANDLE, void*);
typedef void (*remove_ft_ref_callback)(FT, void*);
#define UU(x) x __attribute__((__unused__))
typedef struct memarena *MEMARENA;
typedef struct rollback_log_node *ROLLBACK_LOG_NODE;
typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE;
//
// Types of snapshots that can be taken by a tokutxn
// - TXN_SNAPSHOT_NONE: means that there is no snapshot. Reads do not use snapshot reads.
// used for SERIALIZABLE and READ UNCOMMITTED
// - TXN_SNAPSHOT_ROOT: means that all tokutxns use their root transaction's snapshot
// used for REPEATABLE READ
// - TXN_SNAPSHOT_CHILD: means that each child tokutxn creates its own snapshot
// used for READ COMMITTED
//
typedef enum __TXN_SNAPSHOT_TYPE {
TXN_SNAPSHOT_NONE=0,
TXN_SNAPSHOT_ROOT=1,
TXN_SNAPSHOT_CHILD=2
} TXN_SNAPSHOT_TYPE;
typedef struct ancestors *ANCESTORS;
typedef struct pivot_bounds const * const PIVOT_BOUNDS;
typedef struct ftnode_fetch_extra *FTNODE_FETCH_EXTRA;
typedef struct unlockers *UNLOCKERS;
enum reactivity {
RE_STABLE,
RE_FUSIBLE,
RE_FISSIBLE
};
enum split_mode {
SPLIT_EVENLY,
SPLIT_LEFT_HEAVY,
SPLIT_RIGHT_HEAVY
};
#endif


@ -1,378 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Benchmark various hash functions. */
#include <sys/time.h>
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <toku_assert.h>
#define N 200000000
char *buf;
static double tdiff (struct timeval *a, struct timeval *b) {
return a->tv_sec - b->tv_sec + (1e-6)*(a->tv_usec - b->tv_usec);
}
#define measure_bandwidth(str, body) ({ \
int c; \
struct timeval start,end; \
gettimeofday(&start, 0); \
body; \
gettimeofday(&end, 0); \
double diff = tdiff(&end, &start); \
printf("%s=%08x %d bytes in %8.6fs for %8.3fMB/s\n", str, c, N, diff, N*(1e-6)/diff); \
})
int sum32 (int start, void *buf, int bytecount) {
int *ibuf = buf;
assert(bytecount%4==0);
while (bytecount>0) {
start+=*ibuf;
ibuf++;
bytecount-=4;
}
return start;
}
static const uint32_t m = 0x5bd1e995;
static const int r = 24;
static const uint32_t seed = 0x3dd3b51a;
#define USE_ZERO_CHECKSUM 0
static uint32_t MurmurHash2 ( const void * key, int len)
{
if (USE_ZERO_CHECKSUM) return 0;
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
// Initialize the hash to a 'random' value
uint32_t h = seed;
// Mix 4 bytes at a time into the hash
const unsigned char * data = (const unsigned char *)key;
while(len >= 4)
{
uint32_t k = *(uint32_t *)data;
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
len -= 4;
}
// Handle the last few bytes of the input array
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.
h ^= h >> 29;
h *= m;
h ^= h >> 31;
return h;
}
struct murmur {
int n_bytes_in_k; // How many bytes in k
uint32_t k; // These are the extra bytes. Bytes are shifted into the low-order bits.
uint32_t h; // The hash so far (up to the most recent 4-byte boundary)
};
void murmur_init (struct murmur *mm) {
mm->n_bytes_in_k=0;
mm->k =0;
mm->h = seed;
}
#define MIX() ({ k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; })
#define LD1() data[0]
#define LD2() ((data[0]<<8) | data[1])
#define LD3() ((data[0]<<16) | (data[1]<<8) | data[2])
#define ADD1_0() (mm->k = LD1())
#define ADD1() (mm->k = (k<<8) | LD1())
#define ADD2_0() (mm->k = LD2())
#define ADD2() (mm->k = (k<<16) | LD2())
#define ADD3_0() (mm->k = LD3())
#define ADD3() (mm->k = (k<<24) | LD3())
void murmur_add (struct murmur *mm, const void * key, unsigned int len) {
if (USE_ZERO_CHECKSUM) return;
if (len==0) return;
const int n_bytes_in_k = mm->n_bytes_in_k;
uint32_t k = mm->k;
const unsigned char *data = key;
uint32_t h = mm->h;
switch (n_bytes_in_k) {
case 0:
switch (len) {
case 1: ADD1_0(); mm->n_bytes_in_k = 1; mm->h=h; return;
case 2: ADD2_0(); mm->n_bytes_in_k = 2; mm->h=h; return;
case 3: ADD3_0(); mm->n_bytes_in_k = 3; mm->h=h; return;
default: break;
}
break;
case 1:
switch (len) {
case 1: ADD1(); mm->n_bytes_in_k = 2; mm->h=h; return;
case 2: ADD2(); mm->n_bytes_in_k = 3; mm->h=h; return;
case 3: ADD3(); mm->n_bytes_in_k = 0; MIX(); mm->h=h; return;
default: ADD3(); mm->n_bytes_in_k = 0; MIX(); len-=3; data+=3; break;
}
break;
case 2:
switch (len) {
case 1: ADD1(); mm->n_bytes_in_k = 3; mm->h=h; return;
case 2: ADD2(); mm->n_bytes_in_k = 0; MIX(); mm->h=h; return;
default: ADD2(); mm->n_bytes_in_k = 0; MIX(); len-=2; data+=2; break;
}
break;
case 3:
switch (len) {
case 1: ADD1(); mm->n_bytes_in_k = 0; MIX(); mm->h=h; return;
default: ADD1(); mm->n_bytes_in_k = 0; MIX(); len--; data++; break;
}
break;
default: assert(0);
}
// We've used up the partial bytes at the beginning of k.
assert(mm->n_bytes_in_k==0);
while (len >= 4) {
uint32_t k = toku_dtoh32(*(uint32_t *)data);
//printf(" oldh=%08x k=%08x", h, k);
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
len -= 4;
//printf(" h=%08x\n", h);
}
mm->h=h;
//printf("%s:%d h=%08x\n", __FILE__, __LINE__, h);
{
uint32_t k=0;
switch (len) {
case 3: k = *data << 16; data++;
case 2: k |= *data << 8; data++;
case 1: k |= *data;
}
mm->k = k;
mm->n_bytes_in_k = len;
//printf("now extra=%08x (%d bytes) n_bytes=%d\n", mm->k, len, mm->n_bytes_in_k);
}
}
uint32_t murmur_finish (struct murmur *mm) {
if (USE_ZERO_CHECKSUM) return 0;
uint32_t h = mm->h;
if (mm->n_bytes_in_k>0) {
h ^= mm->k;
h *= m;
}
if (0) {
// The real murmur function does this extra mixing at the end. We don't need that for fingerprint.
h ^= h >> 29;
h *= m;
h ^= h >> 31;
}
return h;
}
struct sum84 {
uint32_t sum;
int i;
};
void sum84_init (struct sum84 *s) { s->sum=0; s->i=0; };
void sum84_add (struct sum84 *s, char *buf, int count) {
while (s->i%4!=0 && count>0) {
char v = *buf;
s->sum ^= v << (s->i%4)*8;
buf++; count--; s->i++;
}
while (count>4) {
s->sum ^= *(int*)buf;
buf+=4; count-=4;
}
while (count>0) {
char v = *buf;
s->sum ^= v << (s->i%4)*8;
buf++; count--; s->i++;
}
}
int sum84_finish (struct sum84 *s) {
return s->sum;
}
uint32_t xor8_add (uint32_t x, char *buf, int count) {
while (count>4) {
x ^= *(int*)buf;
buf+=4; count-=4;
}
while (count>0) {
char v = *buf;
x ^= v;
buf++; count--;
}
return x;
}
uint32_t xor8_finish (uint32_t x) {
return (x ^ (x>>8) ^ (x>>16) ^ (x>>24))&0xff;
}
uint64_t xor8_64_add (uint64_t x, char *buf, int count) {
while (count>8) {
x ^= *(uint64_t*)buf;
buf+=8; count-=8;
}
while (count>0) {
char v = *buf;
x ^= v;
buf++; count--;
}
return x;
}
uint32_t xor8_64_finish (uint64_t x) {
return (x ^ (x>>8) ^ (x>>16) ^ (x>>24) ^ (x>>32) ^ (x>>40) ^ (x>>48) ^ (x>>56))&0xff;
}
static void measure_bandwidths (void) {
measure_bandwidth("crc32 ", c=crc32(0, buf, N));
measure_bandwidth("sum32 ", c=sum32(0, buf, N));
measure_bandwidth("murmur ", c=MurmurHash2(buf, N));
measure_bandwidth("murmurf ", ({ struct murmur mm; murmur_init(&mm); murmur_add(&mm, buf, N); c=murmur_finish(&mm); }));
measure_bandwidth("sum84 ", ({ struct sum84 s; sum84_init(&s); sum84_add(&s, buf, N); c=sum84_finish(&s); }));
measure_bandwidth("xor32 ", ({ c=0; int j; for(j=0; j<N/4; j++) c^=*(int*)buf+j*4; }));
measure_bandwidth("xor8 ", c=xor8_finish(xor8_add(0, buf, N)));
measure_bandwidth("xor8_64 ", c=xor8_64_finish(xor8_64_add(0, buf, N)));
measure_bandwidth("crc32by1 ", ({ c=0; int j; for(j=0; j<N; j++) c=crc32(c, buf+j, 1); }));
measure_bandwidth("crc32by2 ", ({ c=0; int j; for(j=0; j<N; j+=2) c=crc32(c, buf+j, 2); }));
measure_bandwidth("sum8by1 ", ({ c=0; int j; for(j=0; j<N; j++) c+=buf[j]; }));
measure_bandwidth("murmurby1", ({ struct murmur mm; murmur_init(&mm); int j; for(j=0; j<N; j++) murmur_add(&mm, buf+j, 1); c=murmur_finish(&mm); }));
measure_bandwidth("murmurby2", ({ struct murmur mm; murmur_init(&mm); int j; for(j=0; j<N; j+=2) murmur_add(&mm, buf+j, 2); c=murmur_finish(&mm); }));
measure_bandwidth("sum84by1 ", ({ struct sum84 s; sum84_init(&s); int j; for(j=0; j<N; j++) sum84_add(&s, buf+j, 1); c=sum84_finish(&s); }));
measure_bandwidth("xor8by1 ", ({ int j; c=0; for(j=0; j<N; j++) c=xor8_add(c, buf+j, 1); c=xor8_finish(c); }));
measure_bandwidth("xor864by1", ({ int j; uint64_t x=0; for(j=0; j<N; j++) x=xor8_64_add(x, buf+j, 1); c=xor8_64_finish(x); }));
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
buf = malloc(N);
int i;
for (i=0; i<N; i++) buf[i]=random();
measure_bandwidths();
return 0;
}
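For reference, the murmurf and murmurby* rows above time the incremental interface; a minimal sketch of the init/add/finish calling pattern (hash_two_buffers is an illustrative helper):
static uint32_t hash_two_buffers(char *a, unsigned int alen, char *b, unsigned int blen) {
    struct murmur mm;
    murmur_init(&mm);              // start from the fixed seed
    murmur_add(&mm, a, alen);      // trailing partial bytes are buffered in mm.k
    murmur_add(&mm, b, blen);      // ...and folded in once a full word is available
    return murmur_finish(&mm);     // folds in leftovers; skips the one-shot function's final mixing
}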

File diff suppressed because it is too large


@ -1,189 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "key.h"
#include "fttypes.h"
#include <memory.h>
#if 0
int toku_keycompare (bytevec key1b, ITEMLEN key1len, bytevec key2b, ITEMLEN key2len) {
const unsigned char *key1 = key1b;
const unsigned char *key2 = key2b;
while (key1len > 0 && key2len > 0) {
unsigned char b1 = key1[0];
unsigned char b2 = key2[0];
if (b1<b2) return -1;
if (b1>b2) return 1;
key1len--; key1++;
key2len--; key2++;
}
if (key1len<key2len) return -1;
if (key1len>key2len) return 1;
return 0;
}
#elif 0
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len) {
if (key1len==key2len) {
return memcmp(key1,key2,key1len);
} else if (key1len<key2len) {
int r = memcmp(key1,key2,key1len);
if (r<=0) return -1; /* If the keys are the same up to 1's length, then return -1, since key1 is shorter than key2. */
else return 1;
} else {
return -toku_keycompare(key2,key2len,key1,key1len);
}
}
#elif 0
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len) {
if (key1len==key2len) {
return memcmp(key1,key2,key1len);
} else if (key1len<key2len) {
int r = memcmp(key1,key2,key1len);
if (r<=0) return -1; /* If the keys are the same up to 1's length, then return -1, since key1 is shorter than key2. */
else return 1;
} else {
int r = memcmp(key1,key2,key2len);
if (r>=0) return 1; /* If the keys are the same up to 2's length, then return 1 since key1 is longer than key2 */
else return -1;
}
}
#elif 0
/* This one looks tighter, but it does use memcmp... */
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len) {
int comparelen = key1len<key2len ? key1len : key2len;
const unsigned char *k1;
const unsigned char *k2;
for (k1=key1, k2=key2;
comparelen>0;
k1++, k2++, comparelen--) {
if (*k1 != *k2) {
return (int)*k1-(int)*k2;
}
}
if (key1len<key2len) return -1;
if (key1len>key2len) return 1;
return 0;
}
#else
/* unroll that one four times */
// when a and b are chars, returning a-b is safe here because the return type is int. No over/underflow is possible.
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len) {
int comparelen = key1len<key2len ? key1len : key2len;
const unsigned char *k1;
const unsigned char *k2;
for (CAST_FROM_VOIDP(k1, key1), CAST_FROM_VOIDP(k2, key2);
comparelen>4;
k1+=4, k2+=4, comparelen-=4) {
{ int v1=k1[0], v2=k2[0]; if (v1!=v2) return v1-v2; }
{ int v1=k1[1], v2=k2[1]; if (v1!=v2) return v1-v2; }
{ int v1=k1[2], v2=k2[2]; if (v1!=v2) return v1-v2; }
{ int v1=k1[3], v2=k2[3]; if (v1!=v2) return v1-v2; }
}
for (;
comparelen>0;
k1++, k2++, comparelen--) {
if (*k1 != *k2) {
return (int)*k1-(int)*k2;
}
}
if (key1len<key2len) return -1;
if (key1len>key2len) return 1;
return 0;
}
#endif
int
toku_builtin_compare_fun (DB *db __attribute__((__unused__)), const DBT *a, const DBT*b) {
return toku_keycompare(a->data, a->size, b->data, b->size);
}
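A short sketch of the ordering the unrolled comparator implements: bytewise comparison, with the shorter key sorting first when one key is a prefix of the other (keycompare_examples is an illustrative helper; assumes the standard assert macro):
static void keycompare_examples(void) {
    assert(toku_keycompare("a", 1, "b", 1) < 0);     // first differing byte decides
    assert(toku_keycompare("ab", 2, "a", 1) > 0);    // equal prefix: the longer key sorts after
    assert(toku_keycompare("ab", 2, "ab", 2) == 0);  // identical keys compare equal
}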


@ -1,104 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_KEY_H
#define TOKU_KEY_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "ybt.h"
#include "fttypes.h"
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
void toku_test_keycompare (void) ;
int toku_builtin_compare_fun (DB *, const DBT *, const DBT*) __attribute__((__visibility__("default")));
#endif


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,9 +89,10 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "ft.h"
#include "ft-internal.h"
#include "le-cursor.h"
#include "ft/ft.h"
#include "ft/ft-internal.h"
#include "ft/le-cursor.h"
#include "ft/cursor.h"
// A LE_CURSOR is a special purpose FT_CURSOR that:
// - enables prefetching
@ -100,10 +101,6 @@ PATENT RIGHTS GRANT:
// A LE_CURSOR is good for scanning a FT from beginning to end. Useful for hot indexing.
struct le_cursor {
// TODO: remove DBs from the ft layer comparison function
// so this is never necessary
// use a fake db for comparisons.
struct __toku_db fake_db;
FT_CURSOR ft_cursor;
bool neg_infinity; // true when the le cursor is positioned at -infinity (initial setting)
bool pos_infinity; // true when the le cursor is positioned at +infinity (when _next returns DB_NOTFOUND)
@ -123,8 +120,6 @@ toku_le_cursor_create(LE_CURSOR *le_cursor_result, FT_HANDLE ft_handle, TOKUTXN
toku_ft_cursor_set_leaf_mode(le_cursor->ft_cursor);
le_cursor->neg_infinity = false;
le_cursor->pos_infinity = true;
// zero out the fake DB. this is a rare operation so it's not too slow.
memset(&le_cursor->fake_db, 0, sizeof(le_cursor->fake_db));
}
}
@ -169,13 +164,9 @@ toku_le_cursor_is_key_greater_or_equal(LE_CURSOR le_cursor, const DBT *key) {
} else if (le_cursor->pos_infinity) {
result = false; // all keys are less than +infinity
} else {
// get the comparison function and descriptor from the cursor's ft
FT_HANDLE ft_handle = le_cursor->ft_cursor->ft_handle;
ft_compare_func keycompare = toku_ft_get_bt_compare(ft_handle);
le_cursor->fake_db.cmp_descriptor = toku_ft_get_cmp_descriptor(ft_handle);
FT ft = le_cursor->ft_cursor->ft_handle->ft;
// get the current position from the cursor and compare it to the given key.
DBT *cursor_key = &le_cursor->ft_cursor->key;
int r = keycompare(&le_cursor->fake_db, cursor_key, key);
int r = ft->cmp(&le_cursor->ft_cursor->key, key);
if (r <= 0) {
result = true; // key is right of the cursor key
} else {


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -86,13 +86,12 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#ifndef LE_CURSOR_H
#define LE_CURSOR_H
#include "ft-ops.h"
#include "ft/ft-internal.h"
// A leaf entry cursor (LE_CURSOR) is a special type of FT_CURSOR that visits all of the leaf entries in a tree
// and returns the leaf entry to the caller. It maintains a copy of the key that it was last positioned over to
@ -104,10 +103,10 @@ PATENT RIGHTS GRANT:
typedef struct le_cursor *LE_CURSOR;
// Create a leaf cursor for a tree (brt) within a transaction (txn)
// Create a leaf cursor for a tree (ft_h) within a transaction (txn)
// Success: returns 0, stores the LE_CURSOR in the le_cursor_result
// Failure: returns a non-zero error number
int toku_le_cursor_create(LE_CURSOR *le_cursor_result, FT_HANDLE brt, TOKUTXN txn);
int toku_le_cursor_create(LE_CURSOR *le_cursor_result, FT_HANDLE ft_h, TOKUTXN txn);
// Close and free the LE_CURSOR
void toku_le_cursor_close(LE_CURSOR le_cursor);
@ -127,5 +126,3 @@ bool toku_le_cursor_is_key_greater_or_equal(LE_CURSOR le_cursor, const DBT *key)
// extracts the position of le_cursor into estimate. It is the caller's responsibility to handle
// thread safety; the caller (the indexer) does so by ensuring the indexer lock is held
void toku_le_cursor_update_estimate(LE_CURSOR le_cursor, DBT* estimate);
#endif
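A minimal usage sketch built only from the declarations above: create a leaf-entry cursor, test a key against its current position, and close it (check_key_position is an illustrative helper; the FT_HANDLE and TOKUTXN are assumed to come from elsewhere):
static int check_key_position(FT_HANDLE ft_h, TOKUTXN txn, const DBT *key, bool *at_or_after) {
    LE_CURSOR c = NULL;
    int r = toku_le_cursor_create(&c, ft_h, txn);
    if (r != 0) {
        return r;                  // creation failed, nothing to clean up
    }
    // a freshly created cursor sits at +infinity, so this first query returns false
    *at_or_after = toku_le_cursor_is_key_greater_or_equal(c, key);
    toku_le_cursor_close(c);
    return 0;
}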


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,7 +89,7 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "wbuf.h"
#include "serialize/wbuf.h"
#include "leafentry.h"
void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le) {


@ -1,9 +1,6 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_LEAFENTRY_H
#define TOKU_LEAFENTRY_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -33,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -90,17 +87,19 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <util/mempool.h>
#include <util/omt.h>
#include "txn_manager.h"
#include "rbuf.h"
#include "x1764.h"
#include "omt.h"
#include "ft/txn/txn_manager.h"
#include "ft/serialize/rbuf.h"
#include "ft/msg.h"
/*
Memory format of packed leaf entry
@ -212,6 +211,7 @@ void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
int print_klpair (FILE *outf, const void* key, uint32_t keylen, LEAFENTRY v); // Print a leafentry out in human-readable form.
int le_latest_is_del(LEAFENTRY le); // Return true if it is a provisional delete.
int le_val_is_del(LEAFENTRY le, bool is_snapshot_read, TOKUTXN txn); // Returns true if the value that is to be read is empty
bool le_is_clean(LEAFENTRY le); // Return true if the leafentry is clean, i.e. holds only a committed value with no transaction records
bool le_has_xids(LEAFENTRY le, XIDS xids); // Return true if the transaction represented by xids is still provisional in this leafentry (le's xid stack is a superset of or equal to xids)
void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes)
@ -228,10 +228,13 @@ uint64_t le_outermost_uncommitted_xid (LEAFENTRY le);
// r|r!=0&&r!=TOKUDB_ACCEPT: Quit early, return r, because something unexpected went wrong (error case)
typedef int(*LE_ITERATE_CALLBACK)(TXNID id, TOKUTXN context);
int le_iterate_is_del(LEAFENTRY le, LE_ITERATE_CALLBACK f, bool *is_empty, TOKUTXN context);
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, uint32_t *vallenp, TOKUTXN context);
void le_extract_val(LEAFENTRY le,
// should we return the entire leafentry as the val?
bool is_leaf_mode, bool is_snapshot_read,
TOKUTXN ttxn, uint32_t *vallen, void **val);
size_t
leafentry_disksize_13(LEAFENTRY_13 le);
@ -242,11 +245,14 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p);
class bn_data;
void
toku_le_apply_msg(FT_MSG msg,
toku_le_apply_msg(const ft_msg &msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
bn_data* data_buffer, // bn_data storing leafentry, if NULL, means there is no bn_data
uint32_t idx, // index in data_buffer where leafentry is stored (and should be replaced
uint32_t old_keylen,
txn_gc_info *gc_info,
LEAFENTRY *new_leafentry_p,
int64_t * numbytes_delta_p);
@ -262,6 +268,3 @@ toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
txn_gc_info *gc_info,
LEAFENTRY *new_leaf_entry,
int64_t * numbytes_delta_p);
#endif /* TOKU_LEAFENTRY_H */


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -95,8 +95,8 @@ PATENT RIGHTS GRANT:
#include <errno.h>
#include <string.h>
#include "ftloader-internal.h"
#include "ybt.h"
#include "loader/loader-internal.h"
#include "util/dbt.h"
static void error_callback_lock(ft_loader_error_callback loader_error) {
toku_mutex_lock(&loader_error->mutex);


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,16 +89,17 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "dbufio.h"
#include "fttypes.h"
#include <toku_assert.h>
#include <errno.h>
#include <unistd.h>
#include "memory.h"
#include <string.h>
#include "ftloader-internal.h"
#include "ft-internal.h"
#include "ft.h"
#include <unistd.h>
#include "portability/toku_assert.h"
#include "portability/memory.h"
#include "ft/ft-internal.h"
#include "ft/serialize/ft_node-serialize.h"
#include "loader/dbufio.h"
#include "loader/loader-internal.h"
struct dbufio_file {
// i/o thread owns these


@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_DBUFIO_H
#define TOKU_DBUFIO_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,6 +86,8 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved."
#include <toku_portability.h>
@ -108,5 +108,3 @@ int dbufio_fileset_read (DBUFIO_FILESET bfs, int filenum, void *buf_v, size_t co
int panic_dbufio_fileset(DBUFIO_FILESET, int error);
void dbufio_print(DBUFIO_FILESET);
#endif


@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FTLOADER_INTERNAL_H
#define FTLOADER_INTERNAL_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -31,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,28 +86,31 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved."
#include <db.h>
#include "fttypes.h"
#include "ftloader.h"
#include "queue.h"
#include <toku_pthread.h>
#include "dbufio.h"
enum { EXTRACTOR_QUEUE_DEPTH = 2,
FILE_BUFFER_SIZE = 1<<24,
MIN_ROWSET_MEMORY = 1<<23,
MIN_MERGE_FANIN = 2,
FRACTAL_WRITER_QUEUE_DEPTH = 3,
FRACTAL_WRITER_ROWSETS = FRACTAL_WRITER_QUEUE_DEPTH + 2,
DBUFIO_DEPTH = 2,
TARGET_MERGE_BUF_SIZE = 1<<24, // we'd like the merge buffer to be this big.
MIN_MERGE_BUF_SIZE = 1<<20, // always use at least this much
MAX_UNCOMPRESSED_BUF = MIN_MERGE_BUF_SIZE
#include "portability/toku_pthread.h"
#include "loader/dbufio.h"
#include "loader/loader.h"
#include "util/queue.h"
enum {
EXTRACTOR_QUEUE_DEPTH = 2,
FILE_BUFFER_SIZE = 1<<24,
MIN_ROWSET_MEMORY = 1<<23,
MIN_MERGE_FANIN = 2,
FRACTAL_WRITER_QUEUE_DEPTH = 3,
FRACTAL_WRITER_ROWSETS = FRACTAL_WRITER_QUEUE_DEPTH + 2,
DBUFIO_DEPTH = 2,
TARGET_MERGE_BUF_SIZE = 1<<24, // we'd like the merge buffer to be this big.
MIN_MERGE_BUF_SIZE = 1<<20, // always use at least this much
MAX_UNCOMPRESSED_BUF = MIN_MERGE_BUF_SIZE
};
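For quick reference, the shifted constants above expressed in bytes, as compile-time checks (illustrative only, not part of the loader):
static_assert((1 << 24) == 16 * 1024 * 1024, "FILE_BUFFER_SIZE and TARGET_MERGE_BUF_SIZE are 16 MiB");
static_assert((1 << 23) == 8 * 1024 * 1024,  "MIN_ROWSET_MEMORY is 8 MiB");
static_assert((1 << 20) == 1 * 1024 * 1024,  "MIN_MERGE_BUF_SIZE and MAX_UNCOMPRESSED_BUF are 1 MiB");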
/* These functions are exported to allow the tests to compile. */
/* These structures maintain a collection of all the open temporary files used by the loader. */
@ -257,7 +258,7 @@ struct ft_loader_s {
int progress_callback_result; // initially zero, if any call to the poll function callback returns nonzero, we save the result here (and don't call the poll callback function again).
LSN load_lsn; //LSN of the fsynced 'load' log entry. Write this LSN (as checkpoint_lsn) in brt headers made by this loader.
LSN load_lsn; //LSN of the fsynced 'load' log entry. Write this LSN (as checkpoint_lsn) in ft headers made by this loader.
TXNID load_root_xid; //(Root) transaction that performed the load.
QUEUE *fractal_queues; // an array of work queues, one for each secondary index.
@ -280,7 +281,7 @@ uint64_t toku_ft_loader_get_n_rows(FTLOADER bl);
struct fractal_thread_args {
FTLOADER bl;
const DESCRIPTOR descriptor;
int fd; // write the brt into tfd.
int fd; // write the ft into fd.
int progress_allocation;
QUEUE q;
uint64_t total_disksize_estimate;
@ -312,17 +313,17 @@ int toku_merge_some_files_using_dbufio (const bool to_q, FIDX dest_data, QUEUE q
int ft_loader_sort_and_write_rows (struct rowset *rows, struct merge_fileset *fs, FTLOADER bl, int which_db, DB *dest_db, ft_compare_func);
// This is probably only for testing.
int toku_loader_write_brt_from_q_in_C (FTLOADER bl,
const DESCRIPTOR descriptor,
int fd, // write to here
int progress_allocation,
QUEUE q,
uint64_t total_disksize_estimate,
int which_db,
uint32_t target_nodesize,
uint32_t target_basementnodesize,
enum toku_compression_method target_compression_method,
uint32_t fanout);
int toku_loader_write_ft_from_q_in_C (FTLOADER bl,
const DESCRIPTOR descriptor,
int fd, // write to here
int progress_allocation,
QUEUE q,
uint64_t total_disksize_estimate,
int which_db,
uint32_t target_nodesize,
uint32_t target_basementnodesize,
enum toku_compression_method target_compression_method,
uint32_t fanout);
int ft_loader_mergesort_row_array (struct row rows[/*n*/], int n, int which_db, DB *dest_db, ft_compare_func, FTLOADER, struct rowset *);
@ -339,7 +340,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
CACHETABLE cachetable,
generate_row_for_put_func g,
DB *src_db,
int N, FT_HANDLE brts[/*N*/], DB* dbs[/*N*/],
int N, FT_HANDLE ft_hs[/*N*/], DB* dbs[/*N*/],
const char *new_fnames_in_env[/*N*/],
ft_compare_func bt_compare_functions[/*N*/],
const char *temp_file_template,
@ -362,5 +363,3 @@ int toku_ft_loader_get_error(FTLOADER bl, int *loader_errno);
void ft_loader_lock_init(FTLOADER bl);
void ft_loader_lock_destroy(FTLOADER bl);
void ft_loader_set_fractal_workers_count_from_c(FTLOADER bl);
#endif // FTLOADER_INTERNAL_H


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -91,9 +91,7 @@ PATENT RIGHTS GRANT:
#include <toku_portability.h>
#if !TOKU_WINDOWS
#include <arpa/inet.h>
#endif
#include <stdio.h>
#include <memory.h>
@ -101,16 +99,21 @@ PATENT RIGHTS GRANT:
#include <toku_assert.h>
#include <string.h>
#include <fcntl.h>
#include "x1764.h"
#include "ftloader-internal.h"
#include "ft-internal.h"
#include "sub_block.h"
#include "sub_block_map.h"
#include "pqueue.h"
#include "dbufio.h"
#include "leafentry.h"
#include "log-internal.h"
#include "ft.h"
#include "ft/ft.h"
#include "ft/ft-internal.h"
#include "ft/leafentry.h"
#include "ft/loader/loader-internal.h"
#include "ft/loader/pqueue.h"
#include "ft/loader/dbufio.h"
#include "ft/logger/log-internal.h"
#include "ft/node.h"
#include "ft/serialize/block_table.h"
#include "ft/serialize/ft-serialize.h"
#include "ft/serialize/ft_node-serialize.h"
#include "ft/serialize/sub_block.h"
#include "util/x1764.h"
static size_t (*os_fwrite_fun)(const void *,size_t,size_t,FILE*)=NULL;
void ft_loader_set_os_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
@ -423,7 +426,7 @@ void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error) {
destroy_rowset(&bl->primary_rowset);
if (bl->primary_rowset_queue) {
queue_destroy(bl->primary_rowset_queue);
toku_queue_destroy(bl->primary_rowset_queue);
bl->primary_rowset_queue = nullptr;
}
@ -541,7 +544,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
CACHETABLE cachetable,
generate_row_for_put_func g,
DB *src_db,
int N, FT_HANDLE brts[/*N*/], DB* dbs[/*N*/],
int N, FT_HANDLE fts[/*N*/], DB* dbs[/*N*/],
const char *new_fnames_in_env[/*N*/],
ft_compare_func bt_compare_functions[/*N*/],
const char *temp_file_template,
@ -585,11 +588,11 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
#define SET_TO_MY_STRDUP(lval, s) do { char *v = toku_strdup(s); if (!v) { int r = get_error_errno(); toku_ft_loader_internal_destroy(bl, true); return r; } lval = v; } while (0)
MY_CALLOC_N(N, bl->root_xids_that_created);
for (int i=0; i<N; i++) if (brts[i]) bl->root_xids_that_created[i]=brts[i]->ft->h->root_xid_that_created;
for (int i=0; i<N; i++) if (fts[i]) bl->root_xids_that_created[i]=fts[i]->ft->h->root_xid_that_created;
MY_CALLOC_N(N, bl->dbs);
for (int i=0; i<N; i++) if (brts[i]) bl->dbs[i]=dbs[i];
for (int i=0; i<N; i++) if (fts[i]) bl->dbs[i]=dbs[i];
MY_CALLOC_N(N, bl->descriptors);
for (int i=0; i<N; i++) if (brts[i]) bl->descriptors[i]=&brts[i]->ft->descriptor;
for (int i=0; i<N; i++) if (fts[i]) bl->descriptors[i]=&fts[i]->ft->descriptor;
MY_CALLOC_N(N, bl->new_fnames_in_env);
for (int i=0; i<N; i++) SET_TO_MY_STRDUP(bl->new_fnames_in_env[i], new_fnames_in_env[i]);
MY_CALLOC_N(N, bl->extracted_datasizes); // the calloc_n zeroed everything, which is what we want
@ -629,7 +632,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
int r = init_rowset(&bl->primary_rowset, memory_per_rowset_during_extract(bl));
if (r!=0) { toku_ft_loader_internal_destroy(bl, true); return r; }
}
{ int r = queue_create(&bl->primary_rowset_queue, EXTRACTOR_QUEUE_DEPTH);
{ int r = toku_queue_create(&bl->primary_rowset_queue, EXTRACTOR_QUEUE_DEPTH);
if (r!=0) { toku_ft_loader_internal_destroy(bl, true); return r; }
}
{
@ -641,11 +644,11 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
return 0;
}
int toku_ft_loader_open (/* out */ FTLOADER *blp,
int toku_ft_loader_open (FTLOADER *blp, /* out */
CACHETABLE cachetable,
generate_row_for_put_func g,
DB *src_db,
int N, FT_HANDLE brts[/*N*/], DB* dbs[/*N*/],
int N, FT_HANDLE fts[/*N*/], DB* dbs[/*N*/],
const char *new_fnames_in_env[/*N*/],
ft_compare_func bt_compare_functions[/*N*/],
const char *temp_file_template,
@ -655,9 +658,9 @@ int toku_ft_loader_open (/* out */ FTLOADER *blp,
uint64_t reserve_memory_size,
bool compress_intermediates,
bool allow_puts) {
// Effect: called by DB_ENV->create_loader to create a brt loader.
// Effect: called by DB_ENV->create_loader to create an ft loader.
// Arguments:
// blp Return the brt loader here.
// blp Return an ft loader ("bulk loader") here.
// g The function for generating a row
// src_db The source database. Needed by g. May be NULL if that's ok with g.
// N The number of dbs to create.
@ -672,7 +675,7 @@ int toku_ft_loader_open (/* out */ FTLOADER *blp,
int result = 0;
{
int r = toku_ft_loader_internal_init(blp, cachetable, g, src_db,
N, brts, dbs,
N, fts, dbs,
new_fnames_in_env,
bt_compare_functions,
temp_file_template,
@ -1138,7 +1141,7 @@ static void* extractor_thread (void *blv) {
while (1) {
void *item;
{
int rq = queue_deq(bl->primary_rowset_queue, &item, NULL, NULL);
int rq = toku_queue_deq(bl->primary_rowset_queue, &item, NULL, NULL);
if (rq==EOF) break;
invariant(rq==0); // other errors are arbitrarily bad.
}
@ -1169,7 +1172,7 @@ static void enqueue_for_extraction (FTLOADER bl) {
struct rowset *XMALLOC(enqueue_me);
*enqueue_me = bl->primary_rowset;
zero_rowset(&bl->primary_rowset);
int r = queue_enq(bl->primary_rowset_queue, (void*)enqueue_me, 1, NULL);
int r = toku_queue_enq(bl->primary_rowset_queue, (void*)enqueue_me, 1, NULL);
resource_assert_zero(r);
}
@ -1206,7 +1209,7 @@ finish_extractor (FTLOADER bl) {
}
//printf("%s:%d please finish extraction\n", __FILE__, __LINE__);
{
int r = queue_eof(bl->primary_rowset_queue);
int r = toku_queue_eof(bl->primary_rowset_queue);
invariant(r==0);
}
//printf("%s:%d joining\n", __FILE__, __LINE__);
@ -1218,7 +1221,7 @@ finish_extractor (FTLOADER bl) {
bl->extractor_live = false;
}
{
int r = queue_destroy(bl->primary_rowset_queue);
int r = toku_queue_destroy(bl->primary_rowset_queue);
invariant(r==0);
bl->primary_rowset_queue = nullptr;
}
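The toku_queue_* renames above all sit on the same producer/consumer pattern: the extractor enqueues rowsets, a worker dequeues until EOF, and the queue is then torn down. A minimal sketch of that lifecycle, using only the calls and argument shapes visible in this diff; the header path, the assert() error handling, and the produce()/consume() helpers are assumptions for illustration only.

// Sketch only: mirrors the enqueue/dequeue lifecycle shown in the hunks above.
#include <assert.h>
#include <stdio.h>        // for EOF
#include "util/queue.h"   // assumed location of QUEUE and the toku_queue_* API

extern void *produce(void);     // hypothetical: builds a payload (e.g. a rowset)
extern void consume(void *);    // hypothetical: processes a dequeued payload

static void queue_lifecycle_sketch(void) {
    QUEUE q;
    int r = toku_queue_create(&q, 4);        // depth, cf. EXTRACTOR_QUEUE_DEPTH
    assert(r == 0);

    void *item = produce();
    r = toku_queue_enq(q, item, 1, NULL);    // weight 1, total weight not needed back
    assert(r == 0);

    r = toku_queue_eof(q);                   // producer is done; wake any waiters
    assert(r == 0);

    while (1) {                              // consumer drains until EOF
        void *deq;
        int rq = toku_queue_deq(q, &deq, NULL, NULL);
        if (rq == EOF) break;                // queue closed and empty
        assert(rq == 0);
        consume(deq);
    }

    r = toku_queue_destroy(q);
    assert(r == 0);
}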
@ -1378,7 +1381,7 @@ static int process_primary_rows (FTLOADER bl, struct rowset *primary_rowset) {
}
int toku_ft_loader_put (FTLOADER bl, DBT *key, DBT *val)
/* Effect: Put a key-value pair into the brt loader. Called by DB_LOADER->put().
/* Effect: Put a key-value pair into the ft loader. Called by DB_LOADER->put().
* Return value: 0 on success, an error number otherwise.
*/
{
@ -1882,7 +1885,7 @@ int toku_merge_some_files_using_dbufio (const bool to_q, FIDX dest_data, QUEUE q
if (to_q) {
if (row_wont_fit(output_rowset, keys[mini].size + vals[mini].size)) {
{
int r = queue_enq(q, (void*)output_rowset, 1, NULL);
int r = toku_queue_enq(q, (void*)output_rowset, 1, NULL);
if (r!=0) {
result = r;
break;
@ -1958,7 +1961,7 @@ int toku_merge_some_files_using_dbufio (const bool to_q, FIDX dest_data, QUEUE q
}
if (result==0 && to_q) {
int r = queue_enq(q, (void*)output_rowset, 1, NULL);
int r = toku_queue_enq(q, (void*)output_rowset, 1, NULL);
if (r!=0)
result = r;
else
@ -2149,7 +2152,7 @@ int merge_files (struct merge_fileset *fs,
if (result) ft_loader_set_panic(bl, result, true, which_db, nullptr, nullptr);
{
int r = queue_eof(output_q);
int r = toku_queue_eof(output_q);
if (r!=0 && result==0) result = r;
}
// It's conceivable that the progress_allocation could be nonzero (for example if bl->N==0)
@ -2219,16 +2222,16 @@ struct dbout {
int64_t n_translations_limit;
struct translation *translation;
toku_mutex_t mutex;
FT h;
FT ft;
};
static inline void dbout_init(struct dbout *out, FT h) {
static inline void dbout_init(struct dbout *out, FT ft) {
out->fd = -1;
out->current_off = 0;
out->n_translations = out->n_translations_limit = 0;
out->translation = NULL;
toku_mutex_init(&out->mutex, NULL);
out->h = h;
out->ft = ft;
}
static inline void dbout_destroy(struct dbout *out) {
@ -2345,12 +2348,12 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc)
lbuf->nkeys = lbuf->ndata = lbuf->dsize = 0;
lbuf->off = 0;
lbuf->xids = xids_get_root_xids();
lbuf->xids = toku_xids_get_root_xids();
if (xid != TXNID_NONE) {
XIDS new_xids = NULL;
int r = xids_create_child(lbuf->xids, &new_xids, xid);
int r = toku_xids_create_child(lbuf->xids, &new_xids, xid);
assert(r == 0 && new_xids);
xids_destroy(&lbuf->xids);
toku_xids_destroy(&lbuf->xids);
lbuf->xids = new_xids;
}
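The toku_xids_* calls above follow a small lifecycle: start from the root (empty) xid list, optionally push the bulk-load transaction's id as a child, and destroy whatever list you end up holding. A hedged sketch of just that lifecycle, mirroring start_leaf(); the header path is an assumption.

// Sketch only: the xid-list setup used by start_leaf() above.
#include <assert.h>
#include "ft/txn/xids.h"   // assumed header for XIDS and the toku_xids_* API

static XIDS xids_for_optional_txn(TXNID xid) {
    XIDS xids = toku_xids_get_root_xids();          // the empty/root xid list
    if (xid != TXNID_NONE) {
        XIDS child = NULL;
        int r = toku_xids_create_child(xids, &child, xid);
        assert(r == 0 && child);
        toku_xids_destroy(&xids);                   // drop the root list reference
        xids = child;                               // keep the one-element list
    }
    return xids;   // caller eventually calls toku_xids_destroy(&xids), cf. finish_leafnode()
}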
@ -2371,7 +2374,7 @@ static int write_header (struct dbout *out, long long translation_location_on_di
static void drain_writer_q(QUEUE q) {
void *item;
while (1) {
int r = queue_deq(q, &item, NULL, NULL);
int r = toku_queue_deq(q, &item, NULL, NULL);
if (r == EOF)
break;
invariant(r == 0);
@ -2501,7 +2504,7 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
while (result == 0) {
void *item;
{
int rr = queue_deq(q, &item, NULL, NULL);
int rr = toku_queue_deq(q, &item, NULL, NULL);
if (rr == EOF) break;
if (rr != 0) {
ft_loader_set_panic(bl, rr, true, which_db, nullptr, nullptr);
@ -2614,7 +2617,7 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
{
invariant(sts.n_subtrees==1);
out.h->h->root_blocknum = make_blocknum(sts.subtrees[0].block);
out.ft->h->root_blocknum = make_blocknum(sts.subtrees[0].block);
toku_free(sts.subtrees); sts.subtrees = NULL;
// write the descriptor
@ -2630,7 +2633,7 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
char *XMALLOC_N(desc_size, buf);
wbuf_init(&wbuf, buf, desc_size);
toku_serialize_descriptor_contents_to_wbuf(&wbuf, descriptor);
uint32_t checksum = x1764_finish(&wbuf.checksum);
uint32_t checksum = toku_x1764_finish(&wbuf.checksum);
wbuf_int(&wbuf, checksum);
invariant(wbuf.ndone==desc_size);
r = toku_os_write(out.fd, wbuf.buf, wbuf.ndone);
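The renamed checksum call above keeps the existing write-buffer discipline: fill a wbuf, finalize the rolling x1764 checksum, and append it as the trailing 32-bit word before writing the buffer out. A minimal sketch of that tail step, using only calls visible in this hunk; the header path and the assert() check are assumptions.

// Sketch only: the finish-and-append step used when serializing the descriptor above.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "ft/serialize/wbuf.h"   // assumed header for struct wbuf / wbuf_int

static void finish_with_checksum(struct wbuf *wb, size_t total_size) {
    // Caller has wbuf_init()ed *wb to total_size and appended the payload,
    // leaving exactly 4 bytes of room for the checksum.
    uint32_t checksum = toku_x1764_finish(&wb->checksum);  // finalize rolling checksum
    wbuf_int(wb, checksum);                                 // trailing 4-byte checksum
    assert(wb->ndone == total_size);                        // buffer is exactly full
}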
@ -2681,17 +2684,17 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
return result;
}
int toku_loader_write_brt_from_q_in_C (FTLOADER bl,
const DESCRIPTOR descriptor,
int fd, // write to here
int progress_allocation,
QUEUE q,
uint64_t total_disksize_estimate,
int which_db,
uint32_t target_nodesize,
uint32_t target_basementnodesize,
enum toku_compression_method target_compression_method,
uint32_t target_fanout)
int toku_loader_write_ft_from_q_in_C (FTLOADER bl,
const DESCRIPTOR descriptor,
int fd, // write to here
int progress_allocation,
QUEUE q,
uint64_t total_disksize_estimate,
int which_db,
uint32_t target_nodesize,
uint32_t target_basementnodesize,
enum toku_compression_method target_compression_method,
uint32_t target_fanout)
// This is probably only for testing.
{
target_nodesize = target_nodesize == 0 ? default_loader_nodesize : target_nodesize;
@ -2723,7 +2726,7 @@ static int loader_do_i (FTLOADER bl,
struct rowset *rows = &(bl->rows[which_db]);
invariant(rows->data==NULL); // the rows should be all cleaned up already
int r = queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH);
int r = toku_queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH);
if (r) goto error;
{
@ -2767,7 +2770,7 @@ static int loader_do_i (FTLOADER bl,
r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta);
if (r) {
int r2 __attribute__((__unused__)) = queue_destroy(bl->fractal_queues[which_db]);
int r2 __attribute__((__unused__)) = toku_queue_destroy(bl->fractal_queues[which_db]);
// ignore r2, since we already have an error
bl->fractal_queues[which_db] = nullptr;
goto error;
@ -2788,7 +2791,7 @@ static int loader_do_i (FTLOADER bl,
if (r == 0) r = fta.errno_result;
}
} else {
queue_eof(bl->fractal_queues[which_db]);
toku_queue_eof(bl->fractal_queues[which_db]);
r = toku_loader_write_ft_from_q(bl, descriptor, fd, progress_allocation,
bl->fractal_queues[which_db], bl->extracted_datasizes[which_db], which_db,
target_nodesize, target_basementnodesize, target_compression_method, target_fanout);
@ -2797,7 +2800,7 @@ static int loader_do_i (FTLOADER bl,
error: // this is the cleanup code. Even if r==0 (no error) we fall through to here.
if (bl->fractal_queues[which_db]) {
int r2 = queue_destroy(bl->fractal_queues[which_db]);
int r2 = toku_queue_destroy(bl->fractal_queues[which_db]);
invariant(r2==0);
bl->fractal_queues[which_db] = nullptr;
}
@ -2938,17 +2941,13 @@ static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int
// #3588 TODO just make a clean ule and append it to the omt
// #3588 TODO can do the rebalancing here and avoid a lot of work later
FTNODE leafnode = lbuf->node;
uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
DBT thekey = { .data = key, .size = (uint32_t) keylen };
DBT theval = { .data = val, .size = (uint32_t) vallen };
FT_MSG_S cmd = { .type = FT_INSERT,
.msn = ZERO_MSN,
.xids = lbuf->xids,
.u = { .id = { &thekey, &theval } } };
uint64_t workdone=0;
uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
DBT kdbt, vdbt;
ft_msg msg(toku_fill_dbt(&kdbt, key, keylen), toku_fill_dbt(&vdbt, val, vallen), FT_INSERT, ZERO_MSN, lbuf->xids);
uint64_t workdone = 0;
// there's no mvcc garbage in a bulk-loaded FT, so there's no need to pass useful gc info
txn_gc_info gc_info(nullptr, TXNID_NONE, TXNID_NONE, true);
toku_ft_bn_apply_cmd_once(BLB(leafnode,0), &cmd, idx, NULL, &gc_info, &workdone, stats_to_update);
toku_ft_bn_apply_msg_once(BLB(leafnode,0), msg, idx, keylen, NULL, &gc_info, &workdone, stats_to_update);
}
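The rewrite above swaps the old FT_MSG_S designated initializer for the new ft_msg constructor fed by toku_fill_dbt. A minimal sketch of just that construction, mirroring the new code; the header path is an assumption and the message is not applied to a node here.

// Sketch only: building an FT_INSERT message the way add_pair_to_leafnode now does.
#include <stdint.h>
#include "ft/msg.h"   // assumed header for ft_msg, FT_INSERT, ZERO_MSN

static void build_insert_msg_sketch(void *key, uint32_t keylen,
                                    void *val, uint32_t vallen,
                                    XIDS xids) {
    DBT kdbt, vdbt;
    // toku_fill_dbt() points a DBT at caller-owned memory; nothing is copied.
    ft_msg msg(toku_fill_dbt(&kdbt, key, keylen),
               toku_fill_dbt(&vdbt, val, vallen),
               FT_INSERT, ZERO_MSN, xids);
    // ... msg would then be handed to toku_ft_bn_apply_msg_once(), as above ...
    (void) msg;
}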
static int write_literal(struct dbout *out, void*data, size_t len) {
@ -2988,7 +2987,7 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr
toku_free(serialized_leaf);
}
toku_ftnode_free(&lbuf->node);
xids_destroy(&lbuf->xids);
toku_xids_destroy(&lbuf->xids);
toku_free(lbuf);
//printf("Nodewrite %d (%.1f%%):", progress_allocation, 100.0*progress_allocation/PROGRESS_MAX);
@ -3013,7 +3012,7 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
putbuf_int64(&ttable, out->translation[i].off);
putbuf_int64(&ttable, out->translation[i].size);
}
unsigned int checksum = x1764_memory(ttable.buf, ttable.off);
unsigned int checksum = toku_x1764_memory(ttable.buf, ttable.off);
putbuf_int32(&ttable, checksum);
// pad it to 512 zeros
long long encoded_length = ttable.off;
@ -3036,7 +3035,7 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
static int
write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) {
int result = 0;
size_t size = toku_serialize_ft_size(out->h->h);
size_t size = toku_serialize_ft_size(out->ft->h);
size_t alloced_size = roundup_to_multiple(512, size);
struct wbuf wbuf;
char *MALLOC_N_ALIGNED(512, alloced_size, buf);
@ -3044,8 +3043,8 @@ write_header (struct dbout *out, long long translation_location_on_disk, long lo
result = get_error_errno();
} else {
wbuf_init(&wbuf, buf, size);
out->h->h->on_disk_stats = out->h->in_memory_stats;
toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk);
out->ft->h->on_disk_stats = out->ft->in_memory_stats;
toku_serialize_ft_to_wbuf(&wbuf, out->ft->h, translation_location_on_disk, translation_size_on_disk);
for (size_t i=size; i<alloced_size; i++) buf[i]=0; // initialize all those unused spots to zero
if (wbuf.ndone != size)
result = EINVAL;
@ -3167,11 +3166,7 @@ static void write_nonleaf_node (FTLOADER bl, struct dbout *out, int64_t blocknum
FTNODE XMALLOC(node);
toku_initialize_empty_ftnode(node, make_blocknum(blocknum_of_new_node), height, n_children,
FT_LAYOUT_VERSION, 0);
node->totalchildkeylens = 0;
for (int i=0; i<n_children-1; i++) {
toku_clone_dbt(&node->childkeys[i], pivots[i]);
node->totalchildkeylens += pivots[i].size;
}
node->pivotkeys.create_from_dbts(pivots, n_children - 1);
assert(node->bp);
for (int i=0; i<n_children; i++) {
BP_BLOCKNUM(node,i) = make_blocknum(subtree_info[i].block);
@ -3205,14 +3200,14 @@ static void write_nonleaf_node (FTLOADER bl, struct dbout *out, int64_t blocknum
for (int i=0; i<n_children-1; i++) {
toku_free(pivots[i].data);
toku_free(node->childkeys[i].data);
}
for (int i=0; i<n_children; i++) {
destroy_nonleaf_childinfo(BNC(node,i));
}
toku_free(pivots);
// TODO: Should be using toku_destroy_ftnode_internals, which should be renamed to toku_ftnode_destroy
toku_free(node->bp);
toku_free(node->childkeys);
node->pivotkeys.destroy();
toku_free(node);
toku_free(ndd);
toku_free(subtree_info);
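The nonleaf writer above now hands its pivot array to the node's pivotkeys container instead of cloning each key into childkeys and tracking totalchildkeylens by hand; teardown likewise shrinks to a single destroy(). A hedged sketch isolating that pair of calls, with FTNODE, DBT and the pivotkeys member assumed to come from the ft-internal headers as in the function above.

// Sketch only: the pivot handling in write_nonleaf_node, setup and teardown.
static void install_pivots_sketch(FTNODE node, DBT *pivots, int n_children) {
    // Replaces the removed per-pivot toku_clone_dbt() loop and the manual
    // node->totalchildkeylens accounting with one bulk copy.
    node->pivotkeys.create_from_dbts(pivots, n_children - 1);
}

static void discard_pivots_sketch(FTNODE node) {
    // Replaces freeing each node->childkeys[i].data plus the childkeys array.
    node->pivotkeys.destroy();
}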

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FTLOADER_H
#define FTLOADER_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,9 +87,16 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "ft/txn/txn.h"
#include "ft/cachetable/cachetable.h"
#include "ft/comparator.h"
#include "ft/ft-ops.h"
// The loader callbacks are C functions and need to be defined as such
typedef void (*ft_loader_error_func)(DB *, int which_db, int err, DBT *key, DBT *val, void *extra);
@ -102,13 +107,13 @@ typedef struct ft_loader_s *FTLOADER;
int toku_ft_loader_open (FTLOADER *bl,
CACHETABLE cachetable,
generate_row_for_put_func g,
DB *src_db,
int N,
FT_HANDLE brts[/*N*/], DB* dbs[/*N*/],
const char * new_fnames_in_env[/*N*/],
ft_compare_func bt_compare_functions[/*N*/],
const char *temp_file_template,
generate_row_for_put_func g,
DB *src_db,
int N,
FT_HANDLE ft_hs[/*N*/], DB* dbs[/*N*/],
const char * new_fnames_in_env[/*N*/],
ft_compare_func bt_compare_functions[/*N*/],
const char *temp_file_template,
LSN load_lsn,
TOKUTXN txn,
bool reserve_memory,
@ -131,5 +136,3 @@ void toku_ft_loader_set_size_factor (uint32_t factor);
void ft_loader_set_os_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*));
size_t ft_loader_leafentry_size(size_t key_size, size_t val_size, TXNID xid);
#endif // FTLOADER_H
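Several headers in this commit (ftloader.h above, pqueue.h, log-internal.h and logger.h below) drop their classic include guards in favor of #pragma once. The transformation, as a tiny illustrative sketch with placeholder names:

// Before: a guard macro wrapping the whole header.
//   #ifndef FTLOADER_H
//   #define FTLOADER_H
//   ... declarations ...
//   #endif // FTLOADER_H

// After: a single directive at the top of the file, no trailing #endif to maintain.
#pragma once

// ... declarations ...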

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -92,8 +92,8 @@ PATENT RIGHTS GRANT:
#include <toku_portability.h>
#include "toku_os.h"
#include "ft-internal.h"
#include "ftloader-internal.h"
#include "pqueue.h"
#include "loader/loader-internal.h"
#include "loader/pqueue.h"
#define pqueue_left(i) ((i) << 1)
#define pqueue_right(i) (((i) << 1) + 1)

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_PQUEUE_H
#define TOKU_PQUEUE_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,6 +87,8 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
@ -121,6 +121,3 @@ void pqueue_free(pqueue_t *q);
size_t pqueue_size(pqueue_t *q);
int pqueue_insert(pqueue_t *q, pqueue_node_t *d);
int pqueue_pop(pqueue_t *q, pqueue_node_t **d);
#endif //TOKU_PQUEUE_H

View File

@ -1,217 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Time {m,l,s}fence vs. xchgl for a memory barrier. */
/* Timing numbers:
* Intel T2500 2GHZ
do1 9.0ns/loop
mfence: 29.0ns/loop (marginal cost= 20.0ns)
sfence: 17.3ns/loop (marginal cost= 8.3ns)
lfence: 23.6ns/loop (marginal cost= 14.6ns)
xchgl: 35.8ns/loop (marginal cost= 26.8ns)
* AMD Athlon 64 X2 Dual Core Processor 4200+
Timings are more crazy
do1 20.6ns/loop
mfence: 12.9ns/loop (marginal cost= -7.6ns)
sfence: 8.4ns/loop (marginal cost= -12.1ns)
lfence: 20.2ns/loop (marginal cost= -0.3ns)
xchgl: 16.6ns/loop (marginal cost= -3.9ns)
do1 13.0ns/loop
mfence: 25.6ns/loop (marginal cost= 12.6ns)
sfence: 21.0ns/loop (marginal cost= 8.1ns)
lfence: 12.9ns/loop (marginal cost= -0.1ns)
xchgl: 29.3ns/loop (marginal cost= 16.3ns)
*/
#include <sys/time.h>
#include <stdio.h>
#include <portability/toku_atomic.h>
enum { COUNT = 100000000 };
static inline void xchgl (void) {
{
/*
* According to the Intel Architecture Software Developer's
* Manual, Volume 3: System Programming Guide
* (http://www.intel.com/design/pro/manuals/243192.htm), page
* 7-6, "For the P6 family processors, locked operations
* serialize all outstanding load and store operations (that
* is, wait for them to complete)."
* Since xchg is locked by default, it is one way to do membar.
*/
int x=0, y;
asm volatile ("xchgl %0,%1" :"=r" (x) :"m" (y), "0" (x) :"memory");
}
}
static inline void mfence (void) {
asm volatile ("mfence":::"memory");
}
static inline void lfence (void) {
asm volatile ("lfence":::"memory");
}
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
int lock_for_lock_and_unlock;
static inline void lock_and_unlock (void) {
(void)toku_sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
toku_sync_lock_release(&lock_for_lock_and_unlock);
}
double tdiff (struct timeval *start, struct timeval *end) {
return ((end->tv_sec-start->tv_sec + 1e-6*(end->tv_usec + start->tv_usec))/COUNT)*1e9;
}
double nop_cost;
void do1 (volatile int *x) {
int i;
struct timeval start, end;
gettimeofday(&start, 0);
for (i=0; i<COUNT; i++) {
x[0]++;
x[1]++;
x[2]++;
x[3]++;
}
gettimeofday(&end, 0);
printf("do1 %6.1fns/loop\n", nop_cost=tdiff(&start, &end));
}
#define doit(name) void do ##name (volatile int *x) { \
int i; \
struct timeval start, end; \
gettimeofday(&start, 0); \
for (i=0; i<COUNT; i++) { \
x[0]++; \
x[1]++; \
name(); \
x[2]++; \
x[3]++; \
} \
gettimeofday(&end, 0); \
double this_cost = tdiff(&start, &end); \
printf("%15s:%6.1fns/loop (marginal cost=%6.1fns)\n", #name, this_cost, this_cost-nop_cost); \
}
doit(mfence)
doit(lfence)
doit(sfence)
doit(xchgl)
doit(lock_and_unlock);
int main (int argc __attribute__((__unused__)),
char *argv[] __attribute__((__unused__))) {
int x[4];
int i;
for (i=0; i<4; i++) {
do1(x);
domfence(x);
dosfence(x);
dolfence(x);
doxchgl(x);
dolock_and_unlock(x);
}
return 0;
}

View File

@ -1,233 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Test pthread rwlocks in multiprocess environment. */
/* How expensive is
* - Obtaining a read-only lock for the first obtainer.
* - Obtaining it for the second one?
* - The third one? */
#include <toku_assert.h>
#include <fcntl.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
float tdiff (struct timeval *start, struct timeval *end) {
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
}
#define FILE "process.data"
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
int r;
int fd;
void *p;
fd=open(FILE, O_CREAT|O_RDWR|O_TRUNC, 0666); assert(fd>=0);
int i;
for (i=0; i<4096; i++) {
r=write(fd, "\000", 1);
assert(r==1);
}
p=mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (p==MAP_FAILED) {
printf("err=%d %s (EPERM=%d)\n", errno, strerror(errno), EPERM);
}
assert(p!=MAP_FAILED);
r=close(fd); assert(r==0);
pthread_rwlockattr_t attr;
pthread_rwlock_t *lock=p;
r=pthread_rwlockattr_init(&attr); assert(r==0);
r=pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); assert(r==0);
r=pthread_rwlock_init(lock, &attr); assert(r==0);
r=pthread_rwlock_init(lock+1, &attr); assert(r==0);
r=pthread_rwlock_wrlock(lock);
pid_t pid;
if ((pid=fork())==0) {
// I'm the child
r = munmap(p, 4096); assert(r==0);
fd = open(FILE, O_RDWR, 0666); assert(fd>=0);
p=mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
assert(p!=MAP_FAILED);
r=close(fd); assert(r==0);
printf("A0\n");
r=pthread_rwlock_wrlock(lock);
printf("C\n");
sleep(1);
r=pthread_rwlock_unlock(lock);
printf("D\n");
r=pthread_rwlock_rdlock(lock);
printf("E0\n");
sleep(1);
} else {
printf("A1\n");
sleep(1);
printf("B\n");
r=pthread_rwlock_unlock(lock); // release the lock grabbed before the fork
assert(r==0);
sleep(1);
r=pthread_rwlock_rdlock(lock);
assert(r==0);
printf("E1\n");
sleep(1);
int status;
pid_t waited=wait(&status);
assert(waited==pid);
}
return 0;
#if 0
int j;
int i;
int r;
struct timeval start, end;
for (j=0; j<3; j++) {
for (i=0; i<K; i++) {
r=pthread_rwlock_init(&rwlocks[i], NULL);
assert(r==0);
}
gettimeofday(&start, 0);
for (i=0; i<K; i++) {
r = pthread_rwlock_tryrdlock(&rwlocks[i]);
assert(r==0);
}
gettimeofday(&end, 0);
printf("pthread_rwlock_tryrdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
}
for (j=0; j<3; j++) {
for (i=0; i<K; i++) {
r=pthread_rwlock_init(&rwlocks[i], NULL);
assert(r==0);
}
gettimeofday(&start, 0);
for (i=0; i<K; i++) {
r = pthread_rwlock_rdlock(&rwlocks[i]);
assert(r==0);
}
gettimeofday(&end, 0);
printf("pthread_rwlock_rdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
}
for (j=0; j<3; j++) {
for (i=0; i<K; i++) {
blocks[i].state=0;
blocks[i].mutex=0;
}
gettimeofday(&start, 0);
for (i=0; i<K; i++) {
brwl_rlock(&blocks[i]);
}
gettimeofday(&end, 0);
printf("brwl_rlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
}
return 0;
#endif
}

View File

@ -1,272 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* How expensive is
* - Obtaining a read-only lock for the first obtainer.
* - Obtaining it for the second one?
* - The third one? */
#include <toku_assert.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <pthread.h>
#include <portability/toku_atomic.h>
float tdiff (struct timeval *start, struct timeval *end) {
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
}
/* My own rwlock implementation. */
struct brwl {
int mutex;
int state; // 0 for unlocked, -1 for a writer, otherwise many readers
};
static inline int xchg(volatile int *ptr, int x)
{
__asm__("xchgl %0,%1" :"=r" (x) :"m" (*(ptr)), "0" (x) :"memory");
return x;
}
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
static inline void brwl_rlock_fence (struct brwl *l) {
while (xchg(&l->mutex, 1)) ;
l->state++;
sfence();
l->mutex=0;
}
static inline void brwl_rlock_xchg (struct brwl *l) {
while (xchg(&l->mutex, 1)) ;
l->state++;
xchg(&l->mutex, 0);
}
// Something wrong with the compiler for longs
static inline long
fetch_and_add (volatile long *p, long incr)
{
long result = incr;
__asm__ __volatile__ ("lock; xaddl %0, %1" :
"+r" (result), "+m" (*p) : : "memory");
return result;
}
static inline int
fetch_and_add_i (volatile int *p, int incr)
{
int result = incr;
__asm__ __volatile__ ("lock; xadd %0, %1" :
"+r" (result), "+m" (*p) : : "memory");
return result;
}
static inline int
gcc_fetch_and_add_i (volatile int *p, int incr)
{
return toku_sync_fetch_and_add(p, incr);
}
static inline long
gcc_fetch_and_add_l (volatile long *p, long incr)
{
return toku_sync_fetch_and_add(p, incr);
}
// Something wrong with the compiler for longs
/* Returns nonzero if the comparison succeeded. */
static inline long
compare_and_swap_full(volatile long *addr,
long old, long new_val)
{
char result;
__asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1"
: "+m"(*(addr)), "=q"(result)
: "r" (new_val), "a"(old) : "memory");
return (int) result;
}
/* Returns nonzero if the comparison succeeded. */
// Atomically compare *addr to old_val, and replace *addr by new_val
// if the first comparison succeeds. Returns nonzero if the comparison
// succeeded and *addr was updated.
static inline int
compare_and_swap_full_i(volatile int *addr,
int old, int new_val)
{
char result;
__asm__ __volatile__("lock; cmpxchg %2, %0; setz %1"
: "+m"(*(addr)), "=q"(result)
: "r" (new_val), "a"(old) : "memory");
return (int) result;
}
enum {K=100000};
pthread_rwlock_t rwlocks[K];
struct brwl blocks[K];
pthread_mutex_t mlocks[K];
long lvals[K];
int ivals[K];
#define TIME(s, i, init, body) ({ \
int j_tmp; \
printf("%-24s", s); \
for (j_tmp=0; j_tmp<3; j_tmp++) { \
struct timeval start,end; \
int i; \
for (i=0; i<K; i++) { \
init; \
} \
gettimeofday(&start, 0); \
for (i=0; i<K; i++) { \
body; \
} \
gettimeofday(&end, 0); \
printf(" %9.3fus", tdiff(&start,&end)/K); \
} \
printf("\n"); \
})
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
printf("sizeof (pthread_mutex_t) %lu\n", sizeof (pthread_mutex_t));
printf("sizeof (pthread_cond_t) %lu\n", sizeof (pthread_cond_t));
TIME("pthread_mutex_lock_errorcheck", i,
({ int r; pthread_mutexattr_t mattr;
r = pthread_mutexattr_init(&mattr); assert(r == 0);
r = pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_ERRORCHECK_NP); assert(r == 0);
r = pthread_mutex_init(&mlocks[i], &mattr); assert(r==0);
r = pthread_mutexattr_destroy(&mattr); assert(r == 0); }),
({ int r = pthread_mutex_lock(&mlocks[i]); assert(r==0); }));
TIME("pthread_mutex_lock", i,
({ int r = pthread_mutex_init(&mlocks[i], NULL); assert(r==0); }),
({ int r = pthread_mutex_lock(&mlocks[i]); assert(r==0); }));
TIME("pthread_mutex_unlock", i,
({ int r = pthread_mutex_init(&mlocks[i], NULL); assert(r==0); r = pthread_mutex_lock(&mlocks[i]); assert(r==0); }),
({ int r = pthread_mutex_unlock(&mlocks[i]); assert(r==0); }));
TIME("pthread_rwlock_tryrdlock", i,
({ int r = pthread_rwlock_init(&rwlocks[i], NULL); assert(r==0); }),
({ int r = pthread_rwlock_tryrdlock(&rwlocks[i]); assert(r==0); }));
TIME("pthread_rwlock_rdlock", i,
({ int r = pthread_rwlock_init(&rwlocks[i], NULL); assert(r==0); }),
({ int r = pthread_rwlock_rdlock(&rwlocks[i]); assert(r==0); }));
TIME("brwl_rlock_xchg", i,
(blocks[i].state=0, blocks[i].mutex=0),
brwl_rlock_xchg(&blocks[i]));
TIME("brwl_rlock_fence", i,
(blocks[i].state=0, blocks[i].mutex=0),
brwl_rlock_fence(&blocks[i]));
int fa=0;
TIME("fetchadd", i,
(void)0,
fetch_and_add_i(&fa, i));
// printf("fa=%d\n", fa);
fa=0;
TIME("gcc_fetchadd", i,
(void)0,
gcc_fetch_and_add_i(&fa, i));
// printf("fa=%d\n", fa);
long fal = 0;
TIME("gcc_fetchaddlong", i,
(void)0,
gcc_fetch_and_add_l(&fal, i));
// printf("fa=%d\n", fa);
TIME("compare_and_swap", i,
ivals[i]=0,
({ int r=compare_and_swap_full_i(&ivals[i], 0, 1); assert(r==1); }));
return 0;
}

View File

@ -1,247 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/* Like trylock, except use rdstc */
#define _MULTI_THREADED
#include <pthread.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
#include <rdtsc.h>
#include <portability/toku_atomic.h>
float tdiff (struct timeval *start, struct timeval *end) {
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
}
unsigned long long rtdiff (unsigned long long a, unsigned long long b) {
return (b-a);
}
/* Simple function to check the return code and exit the program
if the function call failed
*/
static void compResults(char *string, int rc) {
if (rc) {
printf("Error on : %s, rc=%d",
string, rc);
exit(EXIT_FAILURE);
}
return;
}
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
void *rdlockThread(void *arg __attribute__((unused)))
{
int rc;
int count=0;
unsigned long long t_start, t_end;
printf("Entered thread, getting read lock with mp wait\n");
Retry:
t_start = rdtsc();
rc = pthread_rwlock_tryrdlock(&rwlock);
t_end = rdtsc();
printf("pthread_rwlock_tryrdlock took %llu clocks\n", rtdiff(t_start,t_end));
if (rc == EBUSY) {
if (count >= 10) {
printf("Retried too many times, failure!\n");
exit(EXIT_FAILURE);
}
++count;
printf("Could not get lock, do other work, then RETRY...\n");
sleep(1);
goto Retry;
}
compResults("pthread_rwlock_tryrdlock() 1\n", rc);
sleep(2);
printf("unlock the read lock\n");
t_start = rdtsc();
rc = pthread_rwlock_unlock(&rwlock);
t_end = rdtsc();
compResults("pthread_rwlock_unlock()\n", rc);
printf("Took %llu clocks\n", rtdiff(t_start, t_end));
printf("Secondary thread complete\n");
return NULL;
}
int main(int argc __attribute__((unused)), char **argv)
{
int rc=0;
pthread_t thread;
unsigned long long t_start, t_end;
printf("Enter Testcase - %s\n", argv[0]);
t_start = rdtsc();
t_end = rdtsc();
printf("nop Took %llu clocks\n", rtdiff(t_start, t_end));
{
int N=1000;
int i;
printf("Main, get and release the write lock %d times\n", N);
t_start = rdtsc();
for (i=0; i<N; i++) {
rc = pthread_rwlock_wrlock(&rwlock);
rc = pthread_rwlock_unlock(&rwlock);
}
t_end = rdtsc();
compResults("pthread_rwlock_wrlock()\n", rc);
printf("Took %5.2f clocks/op\n", ((double)(t_end-t_start))/N);
}
printf("Main, get the write lock\n");
t_start = rdtsc();
rc = pthread_rwlock_wrlock(&rwlock);
t_end = rdtsc();
compResults("pthread_rwlock_wrlock()\n", rc);
printf("Took %llu clocks\n", rtdiff(t_start, t_end));
printf("Main, create the try read lock thread\n");
rc = pthread_create(&thread, NULL, rdlockThread, NULL);
compResults("pthread_create\n", rc);
printf("Main, wait a bit holding the write lock\n");
sleep(5);
printf("Main, Now unlock the write lock\n");
t_start = rdtsc();
rc = pthread_rwlock_unlock(&rwlock);
t_end = rdtsc();
compResults("pthread_rwlock_unlock()\n", rc);
printf("Took %llu clocks\n", rtdiff(t_start, t_end));
printf("Main, wait for the thread to end\n");
rc = pthread_join(thread, NULL);
compResults("pthread_join\n", rc);
rc = pthread_rwlock_destroy(&rwlock);
compResults("pthread_rwlock_destroy()\n", rc);
printf("Main completed\n");
{
static int lock_for_lock_and_unlock;
t_start = rdtsc();
(void)toku_sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
t_end = rdtsc();
printf("sync_lock_test_and_set took %llu clocks\n", t_end-t_start);
t_start = rdtsc();
toku_sync_lock_release(&lock_for_lock_and_unlock);
t_end = rdtsc();
printf("sync_lock_release took %llu clocks\n", t_end-t_start);
}
{
t_start = rdtsc();
(void)toku_sync_synchronize();
t_end = rdtsc();
printf("sync_synchornize took %llu clocks\n", t_end-t_start);
}
t_start = rdtsc();
sleep(1);
t_end = rdtsc();
printf("sleep(1) took %llu clocks\n", t_end-t_start);
return 0;
}

View File

@ -1,213 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#define _MULTI_THREADED
#include <pthread.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
float tdiff (struct timeval *start, struct timeval *end) {
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
}
/* Simple function to check the return code and exit the program
if the function call failed
*/
static void compResults(char *string, int rc) {
if (rc) {
printf("Error on : %s, rc=%d",
string, rc);
exit(EXIT_FAILURE);
}
return;
}
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
void *rdlockThread(void *arg __attribute__((unused)))
{
int rc;
int count=0;
struct timeval start, end;
printf("Entered thread, getting read lock with mp wait\n");
Retry:
gettimeofday(&start, 0);
rc = pthread_rwlock_tryrdlock(&rwlock);
gettimeofday(&end, 0);
printf("pthread_rwlock_tryrdlock took %9.3fus\n", tdiff(&start,&end));
if (rc == EBUSY) {
if (count >= 10) {
printf("Retried too many times, failure!\n");
exit(EXIT_FAILURE);
}
++count;
printf("Could not get lock, do other work, then RETRY...\n");
sleep(1);
goto Retry;
}
compResults("pthread_rwlock_tryrdlock() 1\n", rc);
sleep(2);
printf("unlock the read lock\n");
gettimeofday(&start, 0);
rc = pthread_rwlock_unlock(&rwlock);
gettimeofday(&end, 0);
compResults("pthread_rwlock_unlock()\n", rc);
printf("%lu.%6lu to %lu.%6lu is %9.2f\n", start.tv_sec, start.tv_usec, end.tv_sec, end.tv_usec, tdiff(&start, &end));
printf("Secondary thread complete\n");
return NULL;
}
int main(int argc __attribute__((unused)), char **argv)
{
int rc=0;
pthread_t thread;
struct timeval start, end;
printf("Enter Testcase - %s\n", argv[0]);
gettimeofday(&start, 0);
gettimeofday(&end, 0);
printf("nop Took %9.2f\n", tdiff(&start, &end));
{
int N=1000;
int i;
printf("Main, get and release the write lock %d times\n", N);
gettimeofday(&start, 0);
for (i=0; i<N; i++) {
rc = pthread_rwlock_wrlock(&rwlock);
rc = pthread_rwlock_unlock(&rwlock);
}
gettimeofday(&end, 0);
compResults("pthread_rwlock_wrlock()\n", rc);
printf("Took %9.2fns/op\n", 1000*tdiff(&start, &end)/N);
}
printf("Main, get the write lock\n");
gettimeofday(&start, 0);
rc = pthread_rwlock_wrlock(&rwlock);
gettimeofday(&end, 0);
compResults("pthread_rwlock_wrlock()\n", rc);
printf("Took %9.2f\n", tdiff(&start, &end));
printf("Main, create the try read lock thread\n");
rc = pthread_create(&thread, NULL, rdlockThread, NULL);
compResults("pthread_create\n", rc);
printf("Main, wait a bit holding the write lock\n");
sleep(5);
printf("Main, Now unlock the write lock\n");
gettimeofday(&start, 0);
rc = pthread_rwlock_unlock(&rwlock);
gettimeofday(&end, 0);
compResults("pthread_rwlock_unlock()\n", rc);
printf("Took %9.2f\n", tdiff(&start, &end));
printf("Main, wait for the thread to end\n");
rc = pthread_join(thread, NULL);
compResults("pthread_join\n", rc);
rc = pthread_rwlock_destroy(&rwlock);
compResults("pthread_rwlock_destroy()\n", rc);
printf("Main completed\n");
return 0;
}

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef LOG_INTERNAL_H
#define LOG_INTERNAL_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,6 +87,8 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
@ -96,17 +96,18 @@ PATENT RIGHTS GRANT:
#include <sys/types.h>
#include <string.h>
#include <dirent.h>
#include "ft-internal.h"
#include "log.h"
#include "toku_list.h"
#include "memarena.h"
#include "logfilemgr.h"
#include "txn.h"
#include "txn_manager.h"
#include <portability/toku_pthread.h>
#include <util/omt.h>
#include "rollback_log_node_cache.h"
#include "txn_child_manager.h"
#include "portability/toku_list.h"
#include "portability/toku_pthread.h"
#include "ft/ft-internal.h"
#include "ft/logger/log.h"
#include "ft/logger/logfilemgr.h"
#include "ft/txn/txn.h"
#include "ft/txn/txn_manager.h"
#include "ft/txn/rollback_log_node_cache.h"
#include "util/memarena.h"
#include "util/omt.h"
using namespace toku;
// Locking for the logger
@ -117,6 +118,7 @@ using namespace toku;
#define LOGGER_MIN_BUF_SIZE (1<<24)
// TODO: Remove mylock, it has no value
struct mylock {
toku_mutex_t lock;
};
@ -155,7 +157,7 @@ struct tokulogger {
DIR *dir; // descriptor for directory
int fd;
CACHETABLE ct;
int lg_max; // The size of the single file in the log. Default is 100MB in TokuDB
int lg_max; // The size of the single file in the log. Default is 100MB.
// To access these, you must have the input lock
LSN lsn; // the next available lsn
@ -179,8 +181,6 @@ struct tokulogger {
tokutime_t time_spent_writing_to_disk; // how much tokutime did we spend writing to disk?
uint64_t num_wait_buf_long; // how many times we waited >= 100ms for the in buf
void (*remove_finalize_callback) (DICTIONARY_ID, void*); // ydb-level callback to be called when a transaction that ...
void * remove_finalize_callback_extra; // ... deletes a file is committed or when one that creates a file is aborted.
CACHEFILE rollback_cachefile;
rollback_log_node_cache rollback_cache;
TXN_MANAGER txn_manager;
@ -188,99 +188,7 @@ struct tokulogger {
int toku_logger_find_next_unused_log_file(const char *directory, long long *result);
int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_logfiles);
struct txn_roll_info {
// these are number of rollback nodes and rollback entries for this txn.
//
// the current rollback node below has sequence number num_rollback_nodes - 1
// (because they are numbered 0...num-1). often, the current rollback is
// already set to this block num, which means it exists and is available to
// log some entries. if the current rollback is NONE and the number of
// rollback nodes for this transaction is non-zero, then we will use
// the number of rollback nodes to know which sequence number to assign
// to a new one we create
uint64_t num_rollback_nodes;
uint64_t num_rollentries;
uint64_t num_rollentries_processed;
uint64_t rollentry_raw_count; // the total count of every byte in the transaction and all its children.
// spilled rollback nodes are rollback nodes that were gorged by this
// transaction, retired, and saved in a list.
// the spilled rollback head is the block number of the first rollback node
// that makes up the rollback log chain
BLOCKNUM spilled_rollback_head;
// the spilled rollback is the block number of the last rollback node that
// makes up the rollback log chain.
BLOCKNUM spilled_rollback_tail;
// the current rollback node block number we may use. if this is ROLLBACK_NONE,
// then we need to create one and set it here before using it.
BLOCKNUM current_rollback;
};
struct tokutxn {
// These don't change after create:
TXNID_PAIR txnid;
uint64_t snapshot_txnid64; // this is the lsn of the snapshot
const TXN_SNAPSHOT_TYPE snapshot_type;
const bool for_recovery;
const TOKULOGGER logger;
const TOKUTXN parent;
// The child txn is protected by the child_txn_manager lock
// and by the user contract. The user contract states (and is
// enforced at the ydb layer) that a child txn should not be created
// while another child exists. The txn_child_manager will protect
// other threads from trying to read this value while another
// thread commits/aborts the child
TOKUTXN child;
// statically allocated child manager, if this
// txn is a root txn, this manager will be used and set to
// child_manager for this transaction and all of its children
txn_child_manager child_manager_s;
// child manager for this transaction, all of its children,
// and all of its ancestors
txn_child_manager* child_manager;
// These don't change but they're created in a way that's hard to make
// strictly const.
DB_TXN *container_db_txn; // reference to DB_TXN that contains this tokutxn
xid_omt_t *live_root_txn_list; // the root txns live when the root ancestor (self if a root) started.
XIDS xids; // Represents the xid list
TOKUTXN snapshot_next;
TOKUTXN snapshot_prev;
bool begin_was_logged;
bool declared_read_only; // true if the txn was declared read only when began
// These are not read until a commit, prepare, or abort starts, and
// they're "monotonic" (only go false->true) during operation:
bool do_fsync;
bool force_fsync_on_commit; //This transaction NEEDS an fsync once (if) it commits. (commit means root txn)
// Not used until commit, prepare, or abort starts:
LSN do_fsync_lsn;
TOKU_XA_XID xa_xid; // for prepared transactions
TXN_PROGRESS_POLL_FUNCTION progress_poll_fun;
void *progress_poll_fun_extra;
toku_mutex_t txn_lock;
// Protected by the txn lock:
omt<FT> open_fts; // a collection of the fts that we touched. Indexed by filenum.
struct txn_roll_info roll_info; // Info used to manage rollback entries
// mutex that protects the transition of the state variable
// the rest of the variables are used by the txn code and
// hot indexing to ensure that when hot indexing is processing a
// leafentry, a TOKUTXN cannot dissappear or change state out from
// underneath it
toku_mutex_t state_lock;
toku_cond_t state_cond;
TOKUTXN_STATE state;
uint32_t num_pin; // number of threads (all hot indexes) that want this
// txn to not transition to commit or abort
uint64_t client_id;
};
void toku_logger_free_logfiles (char **logfiles, int n_logfiles);
static inline int
txn_has_current_rollback_log(TOKUTXN txn) {
@ -369,5 +277,3 @@ static inline char *fixup_fname(BYTESTRING *f) {
fname[f->len]=0;
return fname;
}
#endif

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_LOGGGER_H
#define TOKU_LOGGGER_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,34 +87,27 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <db.h>
#include <errno.h>
#include <db.h>
#include "fttypes.h"
#include "memory.h"
#include "x1764.h"
#include "portability/memory.h"
#include "portability/toku_portability.h"
#include "ft/logger/recover.h"
#include "ft/txn/rollback.h"
#include "ft/txn/txn.h"
#include "util/bytestring.h"
struct roll_entry;
#include "logger.h"
#include "rollback.h"
#include "recover.h"
#include "txn.h"
static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) {
target->len = val.len;
target->data = (char *) toku_memdup(val.data, (size_t)val.len);
if (target->data==0) {
return get_error_errno();
}
return 0;
}
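toku_copy_BYTESTRING duplicates the underlying buffer with toku_memdup, so a successful copy owns its own allocation and must eventually be released with toku_free. A minimal usage sketch (hypothetical caller, not from this diff):
// Copy a BYTESTRING, use it, then release the duplicated buffer.
static inline int bytestring_copy_example(BYTESTRING src) {
    BYTESTRING dup;
    int r = toku_copy_BYTESTRING(&dup, src);
    if (r != 0) {
        return r;          // allocation failed; errno-style code from get_error_errno()
    }
    // ... read dup.data / dup.len ...
    toku_free(dup.data);   // releases the buffer allocated by toku_memdup
    return 0;
}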
static inline void toku_free_TXNID(TXNID txnid __attribute__((__unused__))) {}
static inline void toku_free_TXNID_PAIR(TXNID_PAIR txnid __attribute__((__unused__))) {}
static inline void toku_free_LSN(LSN lsn __attribute__((__unused__))) {}
static inline void toku_free_uint64_t(uint64_t u __attribute__((__unused__))) {}
static inline void toku_free_uint32_t(uint32_t u __attribute__((__unused__))) {}
@ -130,6 +121,3 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); }
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, bool * upgrade_in_progress);
uint64_t toku_log_upgrade_get_footprint(void);
#endif

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -92,8 +92,8 @@ PATENT RIGHTS GRANT:
#include <ft/log_header.h>
#include "log-internal.h"
#include "logcursor.h"
#include "checkpoint.h"
#include "logger/logcursor.h"
#include "cachetable/checkpoint.h"
static uint64_t footprint = 0; // for debug and accountability
@ -209,10 +209,7 @@ cleanup:
r = toku_logcursor_destroy(&cursor);
assert(r == 0);
cleanup_no_logcursor:
for(int i=0;i<n_logfiles;i++) {
toku_free(logfiles[i]);
}
toku_free(logfiles);
toku_logger_free_logfiles(logfiles, n_logfiles);
FOOTPRINTCAPTURE;
return rval;
}
@ -227,10 +224,6 @@ verify_clean_shutdown_of_log_version(const char *log_dir, uint32_t version, LSN
if (version < TOKU_LOG_VERSION) {
FOOTPRINT(1);
r = verify_clean_shutdown_of_log_version_old(log_dir, last_lsn, last_xid, version);
if (r != 0) {
fprintf(stderr, "Cannot upgrade TokuDB version %d database.", version);
fprintf(stderr, " Previous improper shutdown detected.\n");
}
}
else {
FOOTPRINT(2);
@ -321,10 +314,17 @@ toku_maybe_upgrade_log(const char *env_dir, const char *log_dir, LSN * lsn_of_cl
r = 0; //Logs are up to date
else {
FOOTPRINT(4);
LSN last_lsn= ZERO_LSN;
TXNID last_xid;
LSN last_lsn = ZERO_LSN;
TXNID last_xid = TXNID_NONE;
r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn, &last_xid);
if (r != 0) {
if (TOKU_LOG_VERSION_25 <= version_of_logs_on_disk && version_of_logs_on_disk <= TOKU_LOG_VERSION_27
&& TOKU_LOG_VERSION_27 == TOKU_LOG_VERSION) {
r = 0; // can do recovery on dirty shutdown
} else {
fprintf(stderr, "Cannot upgrade TokuFT version %d database.", version_of_logs_on_disk);
fprintf(stderr, " Previous improper shutdown detected.\n");
}
goto cleanup;
}
FOOTPRINT(5);

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -90,7 +90,7 @@ PATENT RIGHTS GRANT:
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "log-internal.h"
#include "logcursor.h"
#include "logger/logcursor.h"
#include <limits.h>
#include <unistd.h>
@ -167,11 +167,8 @@ static int lc_open_logfile(TOKULOGCURSOR lc, int index) {
lc->cur_fp = fopen(lc->logfiles[index], "rb");
if ( lc->cur_fp == NULL )
return DB_NOTFOUND;
// debug printf("%s:%d %s %p %u\n", __FUNCTION__, __LINE__, lc->logfiles[index], lc->buffer, (unsigned) lc->buffer_size);
#if !TOKU_WINDOWS //Windows reads logs fastest if we use default settings (not use setvbuf to change buffering)
r = setvbuf(lc->cur_fp, (char *) lc->buffer, _IOFBF, lc->buffer_size);
assert(r == 0);
#endif
// position fp past header, ignore 0 length file (t:2384)
unsigned int version=0;
if ( lc_file_len(lc->logfiles[index]) >= 12 ) {
@ -194,7 +191,7 @@ static int lc_check_lsn(TOKULOGCURSOR lc, int dir) {
// int index = lc->cur_logfiles_index;
// fprintf(stderr, "Bad LSN: %d %s direction = %d, lsn.lsn = %" PRIu64 ", cur_lsn.lsn=%" PRIu64 "\n",
// index, lc->logfiles[index], dir, lsn.lsn, lc->cur_lsn.lsn);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
printf("DB_RUNRECOVERY: %s:%d r=%d\n", __FUNCTION__, __LINE__, 0);
return LC_LSN_ERROR;
}
@ -280,11 +277,7 @@ int toku_logcursor_destroy(TOKULOGCURSOR *lc) {
(*lc)->entry_valid = false;
}
r = lc_close_cur_logfile(*lc);
int lf;
for(lf=0;lf<(*lc)->n_logfiles;lf++) {
if ( (*lc)->logfiles[lf] ) toku_free((*lc)->logfiles[lf]);
}
if ( (*lc)->logfiles ) toku_free((*lc)->logfiles);
toku_logger_free_logfiles((*lc)->logfiles, (*lc)->n_logfiles);
if ( (*lc)->logdir ) toku_free((*lc)->logdir);
if ( (*lc)->buffer ) toku_free((*lc)->buffer);
toku_free(*lc);
@ -310,10 +303,10 @@ static int lc_log_read(TOKULOGCURSOR lc)
toku_log_free_log_entry_resources(&(lc->entry));
time_t tnow = time(NULL);
if (r==DB_BADFORMAT) {
fprintf(stderr, "%.24s Tokudb bad log format in %s\n", ctime(&tnow), lc->logfiles[lc->cur_logfiles_index]);
fprintf(stderr, "%.24s TokuFT bad log format in %s\n", ctime(&tnow), lc->logfiles[lc->cur_logfiles_index]);
}
else {
fprintf(stderr, "%.24s Tokudb unexpected log format error '%s' in %s\n", ctime(&tnow), strerror(r), lc->logfiles[lc->cur_logfiles_index]);
fprintf(stderr, "%.24s TokuFT unexpected log format error '%s' in %s\n", ctime(&tnow), strerror(r), lc->logfiles[lc->cur_logfiles_index]);
}
}
return r;
@ -342,10 +335,10 @@ static int lc_log_read_backward(TOKULOGCURSOR lc)
toku_log_free_log_entry_resources(&(lc->entry));
time_t tnow = time(NULL);
if (r==DB_BADFORMAT) {
fprintf(stderr, "%.24s Tokudb bad log format in %s\n", ctime(&tnow), lc->logfiles[lc->cur_logfiles_index]);
fprintf(stderr, "%.24s TokuFT bad log format in %s\n", ctime(&tnow), lc->logfiles[lc->cur_logfiles_index]);
}
else {
fprintf(stderr, "%.24s Tokudb uUnexpected log format error '%s' in %s\n", ctime(&tnow), strerror(r), lc->logfiles[lc->cur_logfiles_index]);
fprintf(stderr, "%.24s TokuFT uUnexpected log format error '%s' in %s\n", ctime(&tnow), strerror(r), lc->logfiles[lc->cur_logfiles_index]);
}
}
return r;
@ -463,10 +456,10 @@ int toku_logcursor_last(TOKULOGCURSOR lc, struct log_entry **le) {
// probably a corrupted last log entry due to a crash
// try scanning forward from the beginning to find the last good entry
time_t tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery repairing log\n", ctime(&tnow));
fprintf(stderr, "%.24s TokuFT recovery repairing log\n", ctime(&tnow));
r = lc_fix_bad_logfile(lc);
if ( r != 0 ) {
fprintf(stderr, "%.24s Tokudb recovery repair unsuccessful\n", ctime(&tnow));
fprintf(stderr, "%.24s TokuFT recovery repair unsuccessful\n", ctime(&tnow));
return DB_BADFORMAT;
}
// try reading again

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKULOGCURSOR_H
#define TOKULOGCURSOR_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,12 +87,13 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <ft/log_header.h>
struct toku_logcursor;
typedef struct toku_logcursor *TOKULOGCURSOR;
@ -127,6 +126,3 @@ int toku_logcursor_last(const TOKULOGCURSOR lc, struct log_entry **le);
int toku_logcursor_log_exists(const TOKULOGCURSOR lc);
void toku_logcursor_print(TOKULOGCURSOR lc);
#endif // TOKULOGCURSOR_H

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,9 +89,9 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "log-internal.h"
#include "logcursor.h"
#include "logfilemgr.h"
#include "logger/log-internal.h"
#include "logger/logcursor.h"
#include "logger/logfilemgr.h"
// for now, implement with singlely-linked-list
// first = oldest (delete from beginning)
@ -186,10 +186,7 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir, TXNID *last_xi
toku_logfilemgr_add_logfile_info(lfm, lf_info);
toku_logcursor_destroy(&cursor);
}
for(int i=0;i<n_logfiles;i++) {
toku_free(logfiles[i]);
}
toku_free(logfiles);
toku_logger_free_logfiles(logfiles, n_logfiles);
*last_xid_if_clean_shutdown = last_xid;
return 0;
}

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKULOGFILEMGR_H
#define TOKULOGFILEMGR_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,12 +87,13 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <ft/log_header.h>
// this is the basic information we need to keep per logfile
struct toku_logfile_info {
int64_t index;
@ -118,6 +117,3 @@ LSN toku_logfilemgr_get_last_lsn(TOKULOGFILEMGR lfm);
void toku_logfilemgr_update_last_lsn(TOKULOGFILEMGR lfm, LSN lsn);
void toku_logfilemgr_print(TOKULOGFILEMGR lfm);
#endif //TOKULOGFILEMGR_H

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -195,7 +195,7 @@ const struct logtype logtypes[] = {
{"BYTESTRING", "iname", 0}, // pathname of file
{"uint8_t", "unlink_on_close", 0},
NULLFIELD}, IGNORE_LOG_BEGIN},
//We do not use a TXNINFO struct since recovery log has
//We do not use a txninfo struct since recovery log has
//FILENUMS and TOKUTXN has FTs (for open_fts)
{"xstillopen", 's', FA{{"TXNID_PAIR", "xid", 0},
{"TXNID_PAIR", "parentxid", 0},
@ -536,7 +536,7 @@ generate_log_writer (void) {
if (strcmp(field_type->name, "timestamp") == 0)
fprintf(cf, " if (timestamp == 0) timestamp = toku_get_timestamp();\n");
fprintf(cf, " wbuf_nocrc_%s(&wbuf, %s);\n", field_type->type, field_type->name));
fprintf(cf, " wbuf_nocrc_int(&wbuf, x1764_memory(wbuf.buf, wbuf.ndone));\n");
fprintf(cf, " wbuf_nocrc_int(&wbuf, toku_x1764_memory(wbuf.buf, wbuf.ndone));\n");
fprintf(cf, " wbuf_nocrc_int(&wbuf, buflen);\n");
fprintf(cf, " assert(wbuf.ndone==buflen);\n");
fprintf(cf, " logger->inbuf.n_in_buf += buflen;\n");
@ -558,7 +558,7 @@ generate_log_reader (void) {
fprintf(cf, " uint32_t checksum_in_file, len_in_file;\n");
fprintf(cf, " r=toku_fread_uint32_t_nocrclen(infile, &checksum_in_file); actual_len+=4; if (r!=0) return r;\n");
fprintf(cf, " r=toku_fread_uint32_t_nocrclen(infile, &len_in_file); actual_len+=4; if (r!=0) return r;\n");
fprintf(cf, " if (checksum_in_file!=x1764_finish(checksum) || len_in_file!=actual_len || len1 != len_in_file) return DB_BADFORMAT;\n");
fprintf(cf, " if (checksum_in_file!=toku_x1764_finish(checksum) || len_in_file!=actual_len || len1 != len_in_file) return DB_BADFORMAT;\n");
fprintf(cf, " return 0;\n");
fprintf(cf, "}\n\n");
});
@ -568,12 +568,12 @@ generate_log_reader (void) {
fprintf(cf, " uint32_t len1; int r;\n");
fprintf(cf, " uint32_t ignorelen=0;\n");
fprintf(cf, " struct x1764 checksum;\n");
fprintf(cf, " x1764_init(&checksum);\n");
fprintf(cf, " toku_x1764_init(&checksum);\n");
fprintf(cf, " r = toku_fread_uint32_t(infile, &len1, &checksum, &ignorelen); if (r!=0) return r;\n");
fprintf(cf, " int cmd=fgetc(infile);\n");
fprintf(cf, " if (cmd==EOF) return EOF;\n");
fprintf(cf, " char cmdchar = (char)cmd;\n");
fprintf(cf, " x1764_add(&checksum, &cmdchar, 1);\n");
fprintf(cf, " toku_x1764_add(&checksum, &cmdchar, 1);\n");
fprintf(cf, " le->cmd=(enum lt_cmd)cmd;\n");
fprintf(cf, " switch ((enum lt_cmd)cmd) {\n");
DO_LOGTYPES(lt, {
@ -639,14 +639,14 @@ generate_logprint (void) {
fprintf(pf, " uint32_t len1, crc_in_file;\n");
fprintf(pf, " uint32_t ignorelen=0;\n");
fprintf(pf, " struct x1764 checksum;\n");
fprintf(pf, " x1764_init(&checksum);\n");
fprintf(pf, " toku_x1764_init(&checksum);\n");
fprintf(pf, " r=toku_fread_uint32_t(f, &len1, &checksum, &ignorelen);\n");
fprintf(pf, " if (r==EOF) return EOF;\n");
fprintf(pf, " cmd=fgetc(f);\n");
fprintf(pf, " if (cmd==EOF) return DB_BADFORMAT;\n");
fprintf(pf, " uint32_t len_in_file, len=1+4; // cmd + len1\n");
fprintf(pf, " char charcmd = (char)cmd;\n");
fprintf(pf, " x1764_add(&checksum, &charcmd, 1);\n");
fprintf(pf, " toku_x1764_add(&checksum, &charcmd, 1);\n");
fprintf(pf, " switch ((enum lt_cmd)cmd) {\n");
DO_LOGTYPES(lt, { if (strlen(lt->name)>maxnamelen) maxnamelen=strlen(lt->name); });
DO_LOGTYPES(lt, {
@ -664,7 +664,7 @@ generate_logprint (void) {
fprintf(pf, "); if (r!=0) return r;\n");
});
fprintf(pf, " {\n");
fprintf(pf, " uint32_t actual_murmur = x1764_finish(&checksum);\n");
fprintf(pf, " uint32_t actual_murmur = toku_x1764_finish(&checksum);\n");
fprintf(pf, " r = toku_fread_uint32_t_nocrclen (f, &crc_in_file); len+=4; if (r!=0) return r;\n");
fprintf(pf, " fprintf(outf, \" crc=%%08x\", crc_in_file);\n");
fprintf(pf, " if (crc_in_file!=actual_murmur) fprintf(outf, \" checksum=%%08x\", actual_murmur);\n");
@ -798,7 +798,7 @@ generate_rollbacks (void) {
fprintf(cf, " }\n assert(0);\n return 0;\n");
fprintf(cf, "}\n");
fprintf2(cf, hf, "int toku_parse_rollback(unsigned char *buf, uint32_t n_bytes, struct roll_entry **itemp, MEMARENA ma)");
fprintf2(cf, hf, "int toku_parse_rollback(unsigned char *buf, uint32_t n_bytes, struct roll_entry **itemp, memarena *ma)");
fprintf(hf, ";\n");
fprintf(cf, " {\n assert(n_bytes>0);\n struct roll_entry *item;\n enum rt_cmd cmd = (enum rt_cmd)(buf[0]);\n size_t mem_needed;\n");
fprintf(cf, " struct rbuf rc = {buf, n_bytes, 1};\n");
@ -806,7 +806,7 @@ generate_rollbacks (void) {
DO_ROLLBACKS(lt, {
fprintf(cf, " case RT_%s:\n", lt->name);
fprintf(cf, " mem_needed = sizeof(item->u.%s) + __builtin_offsetof(struct roll_entry, u.%s);\n", lt->name, lt->name);
fprintf(cf, " CAST_FROM_VOIDP(item, malloc_in_memarena(ma, mem_needed));\n");
fprintf(cf, " CAST_FROM_VOIDP(item, ma->malloc_from_arena(mem_needed));\n");
fprintf(cf, " item->cmd = cmd;\n");
DO_FIELDS(field_type, lt, fprintf(cf, " rbuf_ma_%s(&rc, ma, &item->u.%s.%s);\n", field_type->type, lt->name, field_type->name));
fprintf(cf, " *itemp = item;\n");
@ -849,16 +849,15 @@ int main (int argc, const char *const argv[]) {
pf = fopen(printpath, "w"); assert(pf!=0);
fprintf2(cf, hf, "/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */\n");
fprintf2(cf, hf, "// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:\n");
fprintf(hf, "#ifndef LOG_HEADER_H\n");
fprintf(hf, "#define LOG_HEADER_H\n");
fprintf(hf, "#pragma once\n");
fprintf2(cf, hf, "/* Do not edit this file. This code generated by logformat.c. Copyright (c) 2007-2013 Tokutek Inc. */\n");
fprintf2(cf, hf, "#ident \"Copyright (c) 2007-2013 Tokutek Inc. All rights reserved.\"\n");
fprintf2(cf, pf, "#include <stdint.h>\n");
fprintf2(cf, pf, "#include <sys/time.h>\n");
fprintf2(cf, pf, "#include <ft/fttypes.h>\n");
fprintf2(cf, pf, "#include <ft/log-internal.h>\n");
fprintf2(cf, pf, "#include <ft/logger/log-internal.h>\n");
fprintf(hf, "#include <ft/ft-internal.h>\n");
fprintf(hf, "#include <ft/memarena.h>\n");
fprintf(hf, "#include <util/bytestring.h>\n");
fprintf(hf, "#include <util/memarena.h>\n");
generate_enum();
generate_log_struct();
generate_dispatch();
@ -867,7 +866,6 @@ int main (int argc, const char *const argv[]) {
generate_rollbacks();
generate_log_entry_functions();
generate_logprint();
fprintf(hf, "#endif\n");
{
int r=fclose(hf); assert(r==0);
r=fclose(cf); assert(r==0);

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -94,12 +94,13 @@ PATENT RIGHTS GRANT:
#include <limits.h>
#include <unistd.h>
#include "ft.h"
#include "log-internal.h"
#include "txn_manager.h"
#include "rollback_log_node_cache.h"
#include "huge_page_detection.h"
#include <util/status.h>
#include "ft/serialize/block_table.h"
#include "ft/ft.h"
#include "ft/logger/log-internal.h"
#include "ft/txn/txn_manager.h"
#include "ft/txn/rollback_log_node_cache.h"
#include "util/status.h"
static const int log_format_version=TOKU_LOG_VERSION;
@ -151,8 +152,8 @@ static bool is_a_logfile_any_version (const char *name, uint64_t *number_result,
// added for #2424, improved for #2521
static bool is_a_logfile (const char *name, long long *number_result) {
bool rval;
uint64_t result= 0;
uint32_t version= 0;
uint64_t result;
uint32_t version;
rval = is_a_logfile_any_version(name, &result, &version);
if (rval && version != TOKU_LOG_VERSION)
rval = false;
@ -164,18 +165,12 @@ static bool is_a_logfile (const char *name, long long *number_result) {
// TODO: can't fail
int toku_logger_create (TOKULOGGER *resultp) {
if (complain_and_return_true_if_huge_pages_are_enabled()) {
*resultp = NULL;
errno = TOKUDB_HUGE_PAGES_ENABLED;
return TOKUDB_HUGE_PAGES_ENABLED;
}
TOKULOGGER CALLOC(result);
if (result==0) return get_error_errno();
result->is_open=false;
result->write_log_files = true;
result->trim_log_files = true;
result->directory=0;
result->remove_finalize_callback = NULL;
// fd is uninitialized on purpose
// ct is uninitialized on purpose
result->lg_max = 100<<20; // 100MB default
@ -187,7 +182,7 @@ int toku_logger_create (TOKULOGGER *resultp) {
result->last_completed_checkpoint_lsn = ZERO_LSN;
// next_log_file_number is uninitialized
// n_in_file is uninitialized
result->write_block_size = FT_DEFAULT_NODE_SIZE; // default logging size is the same as the default brt block size
result->write_block_size = FT_DEFAULT_NODE_SIZE; // default logging size is the same as the default ft block size
toku_logfilemgr_create(&result->logfilemgr);
*resultp=result;
ml_init(&result->input_lock);
@ -234,7 +229,7 @@ toku_logger_open_with_last_xid(const char *directory, TOKULOGGER logger, TXNID l
if (logger->is_open) return EINVAL;
int r;
TXNID last_xid_if_clean_shutdown= TXNID_NONE;
TXNID last_xid_if_clean_shutdown = TXNID_NONE;
r = toku_logfilemgr_init(logger->logfilemgr, directory, &last_xid_if_clean_shutdown);
if ( r!=0 )
return r;
@ -274,32 +269,30 @@ bool toku_logger_rollback_is_open (TOKULOGGER logger) {
#define MAX_CACHED_ROLLBACK_NODES 4096
void
toku_logger_initialize_rollback_cache(TOKULOGGER logger, FT ft) {
toku_free_unused_blocknums(ft->blocktable, ft->h->root_blocknum);
void toku_logger_initialize_rollback_cache(TOKULOGGER logger, FT ft) {
ft->blocktable.free_unused_blocknums(ft->h->root_blocknum);
logger->rollback_cache.init(MAX_CACHED_ROLLBACK_NODES);
}
int
toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, bool create) {
int toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, bool create) {
assert(logger->is_open);
assert(!logger->rollback_cachefile);
FT_HANDLE t = NULL; // Note, there is no DB associated with this BRT.
toku_ft_handle_create(&t);
int r = toku_ft_handle_open(t, toku_product_name_strings.rollback_cachefile, create, create, cachetable, NULL_TXN);
FT_HANDLE ft_handle = nullptr; // Note, there is no DB associated with this FT.
toku_ft_handle_create(&ft_handle);
int r = toku_ft_handle_open(ft_handle, toku_product_name_strings.rollback_cachefile, create, create, cachetable, nullptr);
if (r == 0) {
logger->rollback_cachefile = t->ft->cf;
toku_logger_initialize_rollback_cache(logger, t->ft);
FT ft = ft_handle->ft;
logger->rollback_cachefile = ft->cf;
toku_logger_initialize_rollback_cache(logger, ft_handle->ft);
//Verify it is empty
//Must have no data blocks (rollback logs or otherwise).
toku_block_verify_no_data_blocks_except_root(t->ft->blocktable, t->ft->h->root_blocknum);
bool is_empty;
is_empty = toku_ft_is_empty_fast(t);
// Verify it is empty
// Must have no data blocks (rollback logs or otherwise).
ft->blocktable.verify_no_data_blocks_except_root(ft->h->root_blocknum);
bool is_empty = toku_ft_is_empty_fast(ft_handle);
assert(is_empty);
} else {
toku_ft_handle_close(t);
toku_ft_handle_close(ft_handle);
}
return r;
}
@ -313,15 +306,15 @@ void toku_logger_close_rollback_check_empty(TOKULOGGER logger, bool clean_shutdo
CACHEFILE cf = logger->rollback_cachefile; // stored in logger at rollback cachefile open
if (cf) {
FT_HANDLE ft_to_close;
{ //Find "brt"
{ //Find "ft_to_close"
logger->rollback_cache.destroy();
FT CAST_FROM_VOIDP(ft, toku_cachefile_get_userdata(cf));
if (clean_shutdown) {
//Verify it is safe to close it.
assert(!ft->h->dirty); //Must not be dirty.
toku_free_unused_blocknums(ft->blocktable, ft->h->root_blocknum);
//Must have no data blocks (rollback logs or otherwise).
toku_block_verify_no_data_blocks_except_root(ft->blocktable, ft->h->root_blocknum);
ft->blocktable.free_unused_blocknums(ft->h->root_blocknum);
// Must have no data blocks (rollback logs or otherwise).
ft->blocktable.verify_no_data_blocks_except_root(ft->h->root_blocknum);
assert(!ft->h->dirty);
} else {
ft->h->dirty = 0;
@ -428,7 +421,7 @@ wait_till_output_available (TOKULOGGER logger)
// Implementation hint: Use a pthread_cond_wait.
// Entry: Holds the output_condition_lock (but not the inlock)
// Exit: Holds the output_condition_lock and logger->output_is_available
//
//
{
tokutime_t t0 = toku_time_now();
while (!logger->output_is_available) {
@ -497,7 +490,7 @@ release_output (TOKULOGGER logger, LSN fsynced_lsn)
toku_cond_broadcast(&logger->output_condition);
toku_mutex_unlock(&logger->output_condition_lock);
}
static void
swap_inbuf_outbuf (TOKULOGGER logger)
// Effect: Swap the inbuf and outbuf
@ -634,7 +627,7 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu
if (d==0) return get_error_errno();
while ((de=readdir(d))) {
if (de==0) return get_error_errno();
long long thisl;
long long thisl = -1;
if ( is_a_logfile(de->d_name, &thisl) ) {
if ((long long)thisl > maxf) maxf = thisl;
}
@ -700,7 +693,7 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
while ((de=readdir(d))) {
uint64_t thisl;
uint32_t version_ignore;
if ( !(is_a_logfile_any_version(de->d_name, &thisl, &version_ignore)) ) continue; //#2424: Skip over files that don't match the exact logfile template
if ( !(is_a_logfile_any_version(de->d_name, &thisl, &version_ignore)) ) continue; //#2424: Skip over files that don't match the exact logfile template
if (n_results+1>=result_limit) {
result_limit*=2;
XREALLOC_N(result_limit, result);
@ -714,7 +707,7 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
// which are one character longer than old log file names ("xxx.tokulog2"). The comparison function
// won't look beyond the terminating NUL, so an extra character in the comparison string doesn't matter.
// Allow room for terminating NUL after "xxx.tokulog13" even if result[0] is of form "xxx.tokulog2."
int width = sizeof(result[0]+2);
int width = sizeof(result[0]+2);
qsort(result, n_results, width, logfilenamecompare);
*resultp = result;
*n_logfiles = n_results;
@ -722,6 +715,12 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
return d ? closedir(d) : 0;
}
void toku_logger_free_logfiles(char **logfiles, int n_logfiles) {
for (int i = 0; i < n_logfiles; i++)
toku_free(logfiles[i]);
toku_free(logfiles);
}
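This helper centralizes the cleanup loop that the call sites earlier in this diff (the log-upgrade check, toku_logcursor_destroy, toku_logfilemgr_init) previously open-coded. A minimal usage sketch pairing it with toku_logger_find_logfiles, whose declaration appears earlier in log-internal.h (hypothetical caller, error handling trimmed):
// List the log files in a directory, count them, then release the list.
static int count_logfiles_example(const char *log_dir, int *count) {
    char **logfiles = NULL;
    int n_logfiles = 0;
    int r = toku_logger_find_logfiles(log_dir, &logfiles, &n_logfiles);
    if (r != 0) {
        return r;
    }
    *count = n_logfiles;
    toku_logger_free_logfiles(logfiles, n_logfiles); // frees each entry and the array
    return 0;
}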
static int open_logfile (TOKULOGGER logger)
// Entry and Exit: This thread has permission to modify the output.
{
@ -730,7 +729,7 @@ static int open_logfile (TOKULOGGER logger)
snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, logger->next_log_file_number, TOKU_LOG_VERSION);
long long index = logger->next_log_file_number;
if (logger->write_log_files) {
logger->fd = open(fname, O_CREAT+O_WRONLY+O_TRUNC+O_EXCL+O_BINARY, S_IRUSR+S_IWUSR);
logger->fd = open(fname, O_CREAT+O_WRONLY+O_TRUNC+O_EXCL+O_BINARY, S_IRUSR+S_IWUSR);
if (logger->fd==-1) {
return get_error_errno();
}
@ -748,7 +747,7 @@ static int open_logfile (TOKULOGGER logger)
if ( logger->write_log_files ) {
TOKULOGFILEINFO XMALLOC(lf_info);
lf_info->index = index;
lf_info->maxlsn = logger->written_lsn;
lf_info->maxlsn = logger->written_lsn;
lf_info->version = TOKU_LOG_VERSION;
toku_logfilemgr_add_logfile_info(logger->logfilemgr, lf_info);
}
@ -777,7 +776,7 @@ void toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
int n_logfiles = toku_logfilemgr_num_logfiles(lfm);
TOKULOGFILEINFO lf_info = NULL;
if ( logger->write_log_files && logger->trim_log_files) {
while ( n_logfiles > 1 ) { // don't delete current logfile
uint32_t log_version;
@ -857,7 +856,7 @@ void toku_logger_maybe_fsync(TOKULOGGER logger, LSN lsn, int do_fsync, bool hold
}
static void
logger_write_buffer(TOKULOGGER logger, LSN *fsynced_lsn)
logger_write_buffer(TOKULOGGER logger, LSN *fsynced_lsn)
// Entry: Holds the input lock and permission to modify output.
// Exit: Holds only the permission to modify output.
// Effect: Write the buffers to the output. If DO_FSYNC is true, then fsync.
@ -885,7 +884,7 @@ int toku_logger_restart(TOKULOGGER logger, LSN lastlsn)
// close the log file
if ( logger->write_log_files) { // fsyncs don't work to /dev/null
toku_file_fsync_without_accounting(logger->fd);
toku_file_fsync_without_accounting(logger->fd);
}
r = close(logger->fd); assert(r == 0);
logger->fd = -1;
@ -908,7 +907,7 @@ void toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u
if (txn) {
BYTESTRING bs_fname = { .len = (uint32_t) strlen(fname), .data = (char *) fname };
// fsync log on fcreate
toku_log_fcreate (txn->logger, (LSN*)0, 1, txn, toku_txn_get_txnid(txn), filenum,
toku_log_fcreate (txn->logger, (LSN*)0, 1, txn, toku_txn_get_txnid(txn), filenum,
bs_fname, mode, treeflags, nodesize, basementnodesize, compression_method);
}
}
@ -946,7 +945,7 @@ int toku_fread_uint8_t (FILE *f, uint8_t *v, struct x1764 *mm, uint32_t *len) {
int vi=fgetc(f);
if (vi==EOF) return -1;
uint8_t vc=(uint8_t)vi;
x1764_add(mm, &vc, 1);
toku_x1764_add(mm, &vc, 1);
(*len)++;
*v = vc;
return 0;
@ -1011,8 +1010,8 @@ int toku_fread_TXNID (FILE *f, TXNID *txnid, struct x1764 *checksum, uint32_t
}
int toku_fread_TXNID_PAIR (FILE *f, TXNID_PAIR *txnid, struct x1764 *checksum, uint32_t *len) {
TXNID parent= TXNID_NONE;
TXNID child= TXNID_NONE;
TXNID parent;
TXNID child;
int r;
r = toku_fread_TXNID(f, &parent, checksum, len); if (r != 0) { return r; }
r = toku_fread_TXNID(f, &child, checksum, len); if (r != 0) { return r; }
@ -1115,7 +1114,7 @@ int toku_logprint_XIDP (FILE *outf, FILE *inf, const char *fieldname, struct x17
XIDP vp;
int r = toku_fread_XIDP(inf, &vp, checksum, len);
if (r!=0) return r;
fprintf(outf, "%s={formatID=0x%lx gtrid_length=%ld bqual_length=%ld data=", fieldname, vp->formatID, vp->gtrid_length, vp->bqual_length);
fprintf(outf, " %s={formatID=0x%lx gtrid_length=%ld bqual_length=%ld data=", fieldname, vp->formatID, vp->gtrid_length, vp->bqual_length);
toku_print_bytes(outf, vp->gtrid_length + vp->bqual_length, vp->data);
fprintf(outf, "}");
toku_free(vp);
@ -1294,7 +1293,7 @@ static int peek_at_log (TOKULOGGER logger, char* filename, LSN *first_lsn) {
if (logger->write_log_files) printf("couldn't open: %s\n", strerror(er));
return er;
}
enum { SKIP = 12+1+4 }; // read the 12 byte header, the first cmd, and the first len
enum { SKIP = 12+1+4 }; // read the 12 byte header, the first message, and the first len
unsigned char header[SKIP+8];
int r = read(fd, header, SKIP+8);
if (r!=SKIP+8) return 0; // cannot determine that it's archivable, so we'll assume no. If a later-log is archivable is then this one will be too.
@ -1346,7 +1345,7 @@ int toku_logger_log_archive (TOKULOGGER logger, char ***logs_p, int flags) {
for (i=all_n_logs-2; i>=0; i--) { // start at all_n_logs-2 because we never archive the most recent log
r = peek_at_log(logger, all_logs[i], &earliest_lsn_in_logfile);
if (r!=0) continue; // In case of error, just keep going
if (earliest_lsn_in_logfile.lsn <= save_lsn.lsn) {
break;
}
@ -1398,18 +1397,18 @@ void toku_logger_note_checkpoint(TOKULOGGER logger, LSN lsn) {
static LOGGER_STATUS_S logger_status;
#define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(logger_status, k, c, t, "logger: " l, inc)
#define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(logger_status, k, c, t, "logger: " l, inc)
static void
status_init(void) {
// Note, this function initializes the keyname, type, and legend fields.
// Value fields are initialized to zero by compiler.
STATUS_INIT(LOGGER_NEXT_LSN, nullptr, UINT64, "next LSN", TOKU_ENGINE_STATUS);
STATUS_INIT(LOGGER_NUM_WRITES, LOGGER_WRITES, UINT64, "writes", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_BYTES_WRITTEN, LOGGER_WRITES_BYTES, UINT64, "writes (bytes)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_UNCOMPRESSED_BYTES_WRITTEN, LOGGER_WRITES_UNCOMPRESSED_BYTES, UINT64, "writes (uncompressed bytes)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_TOKUTIME_WRITES, LOGGER_WRITES_SECONDS, TOKUTIME, "writes (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_WAIT_BUF_LONG, LOGGER_WAIT_LONG, UINT64, "count", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_NUM_WRITES, LOGGER_WRITES, UINT64, "writes", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_BYTES_WRITTEN, LOGGER_WRITES_BYTES, UINT64, "writes (bytes)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_UNCOMPRESSED_BYTES_WRITTEN, LOGGER_WRITES_UNCOMPRESSED_BYTES, UINT64, "writes (uncompressed bytes)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_TOKUTIME_WRITES, LOGGER_WRITES_SECONDS, TOKUTIME, "writes (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LOGGER_WAIT_BUF_LONG, LOGGER_WAIT_LONG, UINT64, "number of long logger write operations", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
logger_status.initialized = true;
}
#undef STATUS_INIT
@ -1435,7 +1434,7 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS statp) {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Used for upgrade:
// Used for upgrade:
// if any valid log files exist in log_dir, then
// set *found_any_logs to true and set *version_found to version number of latest log
int

View File

@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_LOGGER_H
#define TOKU_LOGGER_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,17 +87,26 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "fttypes.h"
#include "ft_layout_version.h"
#include "ft/serialize/block_table.h"
#include "ft/serialize/ft_layout_version.h"
#include "ft/txn/txn.h"
typedef struct tokulogger *TOKULOGGER;
enum {
TOKU_LOG_VERSION_1 = 1,
TOKU_LOG_VERSION_2 = 2,
//After 2 we linked the log version to the FT_LAYOUT VERSION.
//So it went from 2 to 13 (3-12 do not exist)
TOKU_LOG_VERSION_24 = 24,
TOKU_LOG_VERSION_25 = 25, // change rollinclude rollback log entry
TOKU_LOG_VERSION_26 = 26, // no change from 25
TOKU_LOG_VERSION_27 = 27, // no change from 26
TOKU_LOG_VERSION = FT_LAYOUT_VERSION,
TOKU_LOG_MIN_SUPPORTED_VERSION = FT_LAYOUT_MIN_SUPPORTED_VERSION,
};
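The enum keeps the current log version pinned to FT_LAYOUT_VERSION while still naming the older versions the code can recognize during upgrade. A minimal sketch of the kind of check these constants allow (illustrative only; the actual upgrade gate lives in toku_maybe_upgrade_log earlier in this diff):
// Classify a log version number read from disk.
static inline bool log_version_is_current(uint32_t v) {
    return v == TOKU_LOG_VERSION;
}
static inline bool log_version_is_supported(uint32_t v) {
    return v >= TOKU_LOG_MIN_SUPPORTED_VERSION && v <= TOKU_LOG_VERSION;
}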
@ -109,8 +116,8 @@ int toku_logger_open (const char *directory, TOKULOGGER logger);
int toku_logger_open_with_last_xid(const char *directory, TOKULOGGER logger, TXNID last_xid);
void toku_logger_shutdown(TOKULOGGER logger);
int toku_logger_close(TOKULOGGER *loggerp);
void toku_logger_initialize_rollback_cache(TOKULOGGER logger, FT ft);
int toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, bool create);
void toku_logger_initialize_rollback_cache(TOKULOGGER logger, struct ft *ft);
int toku_logger_open_rollback(TOKULOGGER logger, struct cachetable *ct, bool create);
void toku_logger_close_rollback(TOKULOGGER logger);
void toku_logger_close_rollback_check_empty(TOKULOGGER logger, bool clean_shutdown);
bool toku_logger_rollback_is_open (TOKULOGGER); // return true iff the rollback is open.
@ -118,7 +125,7 @@ bool toku_logger_rollback_is_open (TOKULOGGER); // return true iff the rollback
void toku_logger_fsync (TOKULOGGER logger);
void toku_logger_fsync_if_lsn_not_fsynced(TOKULOGGER logger, LSN lsn);
int toku_logger_is_open(TOKULOGGER logger);
void toku_logger_set_cachetable (TOKULOGGER logger, CACHETABLE ct);
void toku_logger_set_cachetable (TOKULOGGER logger, struct cachetable *ct);
int toku_logger_set_lg_max(TOKULOGGER logger, uint32_t lg_max);
int toku_logger_get_lg_max(TOKULOGGER logger, uint32_t *lg_maxp);
int toku_logger_set_lg_bsize(TOKULOGGER logger, uint32_t bsize);
@ -139,10 +146,24 @@ int toku_logger_restart(TOKULOGGER logger, LSN lastlsn);
// given LSN and delete them.
void toku_logger_maybe_trim_log(TOKULOGGER logger, LSN oldest_open_lsn);
// At the ft layer, a FILENUM uniquely identifies an open file.
struct FILENUM {
uint32_t fileid;
};
static const FILENUM FILENUM_NONE = { .fileid = UINT32_MAX };
struct FILENUMS {
uint32_t num;
FILENUM *filenums;
};
void toku_logger_log_fcreate(TOKUTXN txn, const char *fname, FILENUM filenum, uint32_t mode, uint32_t flags, uint32_t nodesize, uint32_t basementnodesize, enum toku_compression_method compression_method);
void toku_logger_log_fdelete(TOKUTXN txn, FILENUM filenum);
void toku_logger_log_fopen(TOKUTXN txn, const char * fname, FILENUM filenum, uint32_t treeflags);
// the log generation code requires a typedef if we want to pass by pointer
typedef TOKU_XA_XID *XIDP;
int toku_fread_uint8_t (FILE *f, uint8_t *v, struct x1764 *mm, uint32_t *len);
int toku_fread_uint32_t_nocrclen (FILE *f, uint32_t *v);
int toku_fread_uint32_t (FILE *f, uint32_t *v, struct x1764 *checksum, uint32_t *len);
@ -258,8 +279,63 @@ void toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s);
int toku_get_version_of_logs_on_disk(const char *log_dir, bool *found_any_logs, uint32_t *version_found);
TXN_MANAGER toku_logger_get_txn_manager(TOKULOGGER logger);
struct txn_manager *toku_logger_get_txn_manager(TOKULOGGER logger);
static const TOKULOGGER NULL_logger __attribute__((__unused__)) = NULL;
// For serialize / deserialize
#endif /* TOKU_LOGGER_H */
#include "ft/serialize/wbuf.h"
static inline void wbuf_nocrc_FILENUM(struct wbuf *wb, FILENUM fileid) {
wbuf_nocrc_uint(wb, fileid.fileid);
}
static inline void wbuf_FILENUM(struct wbuf *wb, FILENUM fileid) {
wbuf_uint(wb, fileid.fileid);
}
static inline void wbuf_nocrc_FILENUMS(struct wbuf *wb, FILENUMS v) {
wbuf_nocrc_uint(wb, v.num);
for (uint32_t i = 0; i < v.num; i++) {
wbuf_nocrc_FILENUM(wb, v.filenums[i]);
}
}
static inline void wbuf_FILENUMS(struct wbuf *wb, FILENUMS v) {
wbuf_uint(wb, v.num);
for (uint32_t i = 0; i < v.num; i++) {
wbuf_FILENUM(wb, v.filenums[i]);
}
}
static inline void wbuf_nocrc_XIDP (struct wbuf *w, TOKU_XA_XID *xid) {
wbuf_nocrc_uint32_t(w, xid->formatID);
wbuf_nocrc_uint8_t(w, xid->gtrid_length);
wbuf_nocrc_uint8_t(w, xid->bqual_length);
wbuf_nocrc_literal_bytes(w, xid->data, xid->gtrid_length+xid->bqual_length);
}
#include "ft/serialize/rbuf.h"
static inline void rbuf_FILENUM(struct rbuf *rb, FILENUM *filenum) {
filenum->fileid = rbuf_int(rb);
}
static inline void rbuf_ma_FILENUM(struct rbuf *rb, memarena *UU(ma), FILENUM *filenum) {
rbuf_FILENUM(rb, filenum);
}
static inline void rbuf_FILENUMS(struct rbuf *rb, FILENUMS *filenums) {
filenums->num = rbuf_int(rb);
XMALLOC_N(filenums->num, filenums->filenums);
for (uint32_t i = 0; i < filenums->num; i++) {
rbuf_FILENUM(rb, &(filenums->filenums[i]));
}
}
static inline void rbuf_ma_FILENUMS(struct rbuf *rb, memarena *ma, FILENUMS *filenums) {
rbuf_ma_uint32_t(rb, ma, &(filenums->num));
filenums->filenums = (FILENUM *) ma->malloc_from_arena(filenums->num * sizeof(FILENUM));
assert(filenums->filenums != NULL);
for (uint32_t i = 0; i < filenums->num; i++) {
rbuf_ma_FILENUM(rb, ma, &(filenums->filenums[i]));
}
}
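The wbuf_* and rbuf_* helpers above are inverses: a FILENUM written with wbuf_nocrc_FILENUM can be read back with rbuf_FILENUM. A round-trip sketch; wbuf_init and rbuf_init are assumed from the serialize headers included above, and the buffer size is arbitrary:
// Serialize a FILENUM into a small buffer and read it back.
static inline FILENUM filenum_roundtrip_example(FILENUM in) {
    unsigned char buf[16];
    struct wbuf wb;
    wbuf_init(&wb, buf, sizeof buf);   // assumed helper from ft/serialize/wbuf.h
    wbuf_nocrc_FILENUM(&wb, in);       // write without checksum
    struct rbuf rb;
    rbuf_init(&rb, buf, wb.ndone);     // assumed helper from ft/serialize/rbuf.h
    FILENUM out;
    rbuf_FILENUM(&rb, &out);
    return out;
}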

View File

@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,16 +89,17 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <ft/log_header.h>
#include "ft.h"
#include "log-internal.h"
#include "logcursor.h"
#include "cachetable.h"
#include "checkpoint.h"
#include "txn_manager.h"
#include "ft/cachetable/cachetable.h"
#include "ft/cachetable/checkpoint.h"
#include "ft/ft.h"
#include "ft/log_header.h"
#include "ft/logger/log-internal.h"
#include "ft/logger/logcursor.h"
#include "ft/txn/txn_manager.h"
#include "util/omt.h"
int tokudb_recovery_trace = 0; // turn on recovery tracing, default off.
int tokuft_recovery_trace = 0; // turn on recovery tracing, default off.
//#define DO_VERIFY_COUNTS
#ifdef DO_VERIFY_COUNTS
@ -154,9 +155,9 @@ struct file_map_tuple {
struct __toku_db fake_db;
};
static void file_map_tuple_init(struct file_map_tuple *tuple, FILENUM filenum, FT_HANDLE brt, char *iname) {
static void file_map_tuple_init(struct file_map_tuple *tuple, FILENUM filenum, FT_HANDLE ft_handle, char *iname) {
tuple->filenum = filenum;
tuple->ft_handle = brt;
tuple->ft_handle = ft_handle;
tuple->iname = iname;
// use a fake DB for comparisons, using the ft's cmp descriptor
memset(&tuple->fake_db, 0, sizeof(tuple->fake_db));
@ -171,9 +172,9 @@ static void file_map_tuple_destroy(struct file_map_tuple *tuple) {
}
}
// Map filenum to brt
// Map filenum to ft_handle
struct file_map {
OMT filenums;
toku::omt<struct file_map_tuple *> *filenums;
};
// The recovery environment
@ -199,31 +200,33 @@ typedef struct recover_env *RECOVER_ENV;
static void file_map_init(struct file_map *fmap) {
int r = toku_omt_create(&fmap->filenums);
assert(r == 0);
XMALLOC(fmap->filenums);
fmap->filenums->create();
}
static void file_map_destroy(struct file_map *fmap) {
toku_omt_destroy(&fmap->filenums);
fmap->filenums->destroy();
toku_free(fmap->filenums);
fmap->filenums = nullptr;
}
static uint32_t file_map_get_num_dictionaries(struct file_map *fmap) {
return toku_omt_size(fmap->filenums);
return fmap->filenums->size();
}
static void file_map_close_dictionaries(struct file_map *fmap, LSN oplsn) {
int r;
while (1) {
uint32_t n = toku_omt_size(fmap->filenums);
if (n == 0)
uint32_t n = fmap->filenums->size();
if (n == 0) {
break;
OMTVALUE v;
r = toku_omt_fetch(fmap->filenums, n-1, &v);
}
struct file_map_tuple *tuple;
r = fmap->filenums->fetch(n - 1, &tuple);
assert(r == 0);
r = toku_omt_delete_at(fmap->filenums, n-1);
r = fmap->filenums->delete_at(n - 1);
assert(r == 0);
struct file_map_tuple *CAST_FROM_VOIDP(tuple, v);
assert(tuple->ft_handle);
// Logging is on again, but we must pass the right LSN into close.
if (tuple->ft_handle) { // it's a DB, not a rollback file
@ -234,27 +237,29 @@ static void file_map_close_dictionaries(struct file_map *fmap, LSN oplsn) {
}
}
static int file_map_h(OMTVALUE omtv, void *v) {
struct file_map_tuple *CAST_FROM_VOIDP(a, omtv);
FILENUM *CAST_FROM_VOIDP(b, v);
if (a->filenum.fileid < b->fileid) return -1;
if (a->filenum.fileid > b->fileid) return +1;
return 0;
static int file_map_h(struct file_map_tuple *const &a, const FILENUM &b) {
if (a->filenum.fileid < b.fileid) {
return -1;
} else if (a->filenum.fileid > b.fileid) {
return 1;
} else {
return 0;
}
}
static int file_map_insert (struct file_map *fmap, FILENUM fnum, FT_HANDLE brt, char *iname) {
static int file_map_insert (struct file_map *fmap, FILENUM fnum, FT_HANDLE ft_handle, char *iname) {
struct file_map_tuple *XMALLOC(tuple);
file_map_tuple_init(tuple, fnum, brt, iname);
int r = toku_omt_insert(fmap->filenums, tuple, file_map_h, &fnum, NULL);
file_map_tuple_init(tuple, fnum, ft_handle, iname);
int r = fmap->filenums->insert<FILENUM, file_map_h>(tuple, fnum, nullptr);
return r;
}
static void file_map_remove(struct file_map *fmap, FILENUM fnum) {
OMTVALUE v; uint32_t idx;
int r = toku_omt_find_zero(fmap->filenums, file_map_h, &fnum, &v, &idx);
uint32_t idx;
struct file_map_tuple *tuple;
int r = fmap->filenums->find_zero<FILENUM, file_map_h>(fnum, &tuple, &idx);
if (r == 0) {
struct file_map_tuple *CAST_FROM_VOIDP(tuple, v);
r = toku_omt_delete_at(fmap->filenums, idx);
r = fmap->filenums->delete_at(idx);
file_map_tuple_destroy(tuple);
toku_free(tuple);
}
@ -262,14 +267,15 @@ static void file_map_remove(struct file_map *fmap, FILENUM fnum) {
// Look up file info: given FILENUM, return file_map_tuple (or DB_NOTFOUND)
static int file_map_find(struct file_map *fmap, FILENUM fnum, struct file_map_tuple **file_map_tuple) {
OMTVALUE v; uint32_t idx;
int r = toku_omt_find_zero(fmap->filenums, file_map_h, &fnum, &v, &idx);
uint32_t idx;
struct file_map_tuple *tuple;
int r = fmap->filenums->find_zero<FILENUM, file_map_h>(fnum, &tuple, &idx);
if (r == 0) {
struct file_map_tuple *CAST_FROM_VOIDP(tuple, v);
assert(tuple->filenum.fileid == fnum.fileid);
*file_map_tuple = tuple;
} else {
assert(r == DB_NOTFOUND);
}
else assert(r==DB_NOTFOUND);
return r;
}
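With the typed toku::omt, lookups and inserts are keyed by FILENUM through the file_map_h comparator passed as a template argument. A minimal sketch reusing exactly the find_zero<> call pattern from file_map_find above (hypothetical helper, not in this diff):
// Return true if the file map already holds an entry for this FILENUM.
static bool file_map_contains(struct file_map *fmap, FILENUM fnum) {
    uint32_t idx;
    struct file_map_tuple *tuple;
    int r = fmap->filenums->find_zero<FILENUM, file_map_h>(fnum, &tuple, &idx);
    return r == 0;
}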
@ -311,7 +317,7 @@ static int recover_env_init (RECOVER_ENV renv,
renv->cp = toku_cachetable_get_checkpointer(renv->ct);
toku_dbt_array_init(&renv->dest_keys, 1);
toku_dbt_array_init(&renv->dest_vals, 1);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
fprintf(stderr, "%s:%d\n", __FUNCTION__, __LINE__);
return r;
}
@ -319,7 +325,7 @@ static int recover_env_init (RECOVER_ENV renv,
static void recover_env_cleanup (RECOVER_ENV renv) {
int r;
assert(toku_omt_size(renv->fmap.filenums)==0);
invariant_zero(renv->fmap.filenums->size());
file_map_destroy(&renv->fmap);
if (renv->destroy_logger_at_end) {
@ -338,7 +344,7 @@ static void recover_env_cleanup (RECOVER_ENV renv) {
toku_dbt_array_destroy(&renv->dest_keys);
toku_dbt_array_destroy(&renv->dest_vals);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
fprintf(stderr, "%s:%d\n", __FUNCTION__, __LINE__);
}
@ -350,48 +356,48 @@ static const char *recover_state(RECOVER_ENV renv) {
static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, bool must_create, int UU(mode), BYTESTRING *bs_iname, FILENUM filenum, uint32_t treeflags,
TOKUTXN txn, uint32_t nodesize, uint32_t basementnodesize, enum toku_compression_method compression_method, LSN max_acceptable_lsn) {
int r = 0;
FT_HANDLE brt = NULL;
FT_HANDLE ft_handle = NULL;
char *iname = fixup_fname(bs_iname);
toku_ft_handle_create(&brt);
toku_ft_set_flags(brt, treeflags);
toku_ft_handle_create(&ft_handle);
toku_ft_set_flags(ft_handle, treeflags);
if (nodesize != 0) {
toku_ft_handle_set_nodesize(brt, nodesize);
toku_ft_handle_set_nodesize(ft_handle, nodesize);
}
if (basementnodesize != 0) {
toku_ft_handle_set_basementnodesize(brt, basementnodesize);
toku_ft_handle_set_basementnodesize(ft_handle, basementnodesize);
}
if (compression_method != TOKU_DEFAULT_COMPRESSION_METHOD) {
toku_ft_handle_set_compression_method(brt, compression_method);
toku_ft_handle_set_compression_method(ft_handle, compression_method);
}
// set the key compare functions
if (!(treeflags & TOKU_DB_KEYCMP_BUILTIN) && renv->bt_compare) {
toku_ft_set_bt_compare(brt, renv->bt_compare);
toku_ft_set_bt_compare(ft_handle, renv->bt_compare);
}
if (renv->update_function) {
toku_ft_set_update(brt, renv->update_function);
toku_ft_set_update(ft_handle, renv->update_function);
}
// TODO mode (FUTURE FEATURE)
//mode = mode;
r = toku_ft_handle_open_recovery(brt, iname, must_create, must_create, renv->ct, txn, filenum, max_acceptable_lsn);
r = toku_ft_handle_open_recovery(ft_handle, iname, must_create, must_create, renv->ct, txn, filenum, max_acceptable_lsn);
if (r != 0) {
//Note: If ft_handle_open fails, then close_ft will NOT write a header to disk.
//No need to provide lsn, so use the regular toku_ft_handle_close function
toku_ft_handle_close(brt);
toku_ft_handle_close(ft_handle);
toku_free(iname);
if (r == ENOENT) //Not an error to simply be missing.
r = 0;
return r;
}
file_map_insert(&renv->fmap, filenum, brt, iname);
file_map_insert(&renv->fmap, filenum, ft_handle, iname);
return 0;
}
@ -417,7 +423,7 @@ static int toku_recover_begin_checkpoint (struct logtype_begin_checkpoint *l, RE
r = 0; // ignore it (log only has a begin checkpoint)
break;
default:
fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
fprintf(stderr, "TokuFT recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
abort();
break;
}
@ -427,7 +433,7 @@ static int toku_recover_begin_checkpoint (struct logtype_begin_checkpoint *l, RE
static int toku_recover_backward_begin_checkpoint (struct logtype_begin_checkpoint *l, RECOVER_ENV renv) {
int r;
time_t tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery bw_begin_checkpoint at %" PRIu64 " timestamp %" PRIu64 " (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, recover_state(renv));
fprintf(stderr, "%.24s TokuFT recovery bw_begin_checkpoint at %" PRIu64 " timestamp %" PRIu64 " (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, recover_state(renv));
switch (renv->ss.ss) {
case BACKWARD_NEWER_CHECKPOINT_END:
// incomplete checkpoint, nothing to do
@ -439,13 +445,13 @@ static int toku_recover_backward_begin_checkpoint (struct logtype_begin_checkpoi
renv->ss.checkpoint_begin_timestamp = l->timestamp;
renv->goforward = true;
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery turning around at begin checkpoint %" PRIu64 " time %" PRIu64 "\n",
fprintf(stderr, "%.24s TokuFT recovery turning around at begin checkpoint %" PRIu64 " time %" PRIu64 "\n",
ctime(&tnow), l->lsn.lsn,
renv->ss.checkpoint_end_timestamp - renv->ss.checkpoint_begin_timestamp);
r = 0;
break;
default:
fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
fprintf(stderr, "TokuFT recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
abort();
break;
}
@ -475,7 +481,7 @@ static int toku_recover_end_checkpoint (struct logtype_end_checkpoint *l, RECOVE
static int toku_recover_backward_end_checkpoint (struct logtype_end_checkpoint *l, RECOVER_ENV renv) {
time_t tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery bw_end_checkpoint at %" PRIu64 " timestamp %" PRIu64 " xid %" PRIu64 " (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, l->lsn_begin_checkpoint.lsn, recover_state(renv));
fprintf(stderr, "%.24s TokuFT recovery bw_end_checkpoint at %" PRIu64 " timestamp %" PRIu64 " xid %" PRIu64 " (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, l->lsn_begin_checkpoint.lsn, recover_state(renv));
switch (renv->ss.ss) {
case BACKWARD_NEWER_CHECKPOINT_END:
renv->ss.ss = BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END;
@ -484,12 +490,12 @@ static int toku_recover_backward_end_checkpoint (struct logtype_end_checkpoint *
renv->ss.checkpoint_end_timestamp = l->timestamp;
return 0;
case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END:
fprintf(stderr, "Tokudb recovery %s:%d Should not see two end_checkpoint log entries without an intervening begin_checkpoint\n", __FILE__, __LINE__);
fprintf(stderr, "TokuFT recovery %s:%d Should not see two end_checkpoint log entries without an intervening begin_checkpoint\n", __FILE__, __LINE__);
abort();
default:
break;
}
fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
fprintf(stderr, "TokuFT recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss);
abort();
}
@ -826,7 +832,7 @@ static int toku_recover_fcreate (struct logtype_fcreate *l, RECOVER_ENV renv) {
if (r != 0) {
int er = get_error_errno();
if (er != ENOENT) {
fprintf(stderr, "Tokudb recovery %s:%d unlink %s %d\n", __FUNCTION__, __LINE__, iname, er);
fprintf(stderr, "TokuFT recovery %s:%d unlink %s %d\n", __FUNCTION__, __LINE__, iname, er);
toku_free(iname);
return r;
}
@ -1253,7 +1259,7 @@ static int toku_recover_backward_hot_index(struct logtype_hot_index *UU(l), RECO
// Effects: If there are no log files, or if there is a clean "shutdown" at
// the end of the log, then we don't need recovery to run.
// Returns: true if we need recovery, otherwise false.
int tokudb_needs_recovery(const char *log_dir, bool ignore_log_empty) {
int tokuft_needs_recovery(const char *log_dir, bool ignore_log_empty) {
int needs_recovery;
int r;
TOKULOGCURSOR logcursor = NULL;
@ -1377,7 +1383,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
struct log_entry *le = NULL;
time_t tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery starting in env %s\n", ctime(&tnow), env_dir);
fprintf(stderr, "%.24s TokuFT recovery starting in env %s\n", ctime(&tnow), env_dir);
char org_wd[1000];
{
@ -1398,7 +1404,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
r = toku_logcursor_last(logcursor, &le);
if (r != 0) {
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
fprintf(stderr, "RUNRECOVERY: %s:%d r=%d\n", __FUNCTION__, __LINE__, r);
rr = DB_RUNRECOVERY; goto errorexit;
}
@ -1413,10 +1419,10 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
toku_struct_stat buf;
if (toku_stat(env_dir, &buf)!=0) {
rr = get_error_errno();
fprintf(stderr, "%.24s Tokudb recovery error: directory does not exist: %s\n", ctime(&tnow), env_dir);
fprintf(stderr, "%.24s TokuFT recovery error: directory does not exist: %s\n", ctime(&tnow), env_dir);
goto errorexit;
} else if (!S_ISDIR(buf.st_mode)) {
fprintf(stderr, "%.24s Tokudb recovery error: this file is supposed to be a directory, but is not: %s\n", ctime(&tnow), env_dir);
fprintf(stderr, "%.24s TokuFT recovery error: this file is supposed to be a directory, but is not: %s\n", ctime(&tnow), env_dir);
rr = ENOTDIR; goto errorexit;
}
}
@ -1425,13 +1431,13 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
tnow = time(NULL);
time_t tlast;
tlast = tnow;
fprintf(stderr, "%.24s Tokudb recovery scanning backward from %" PRIu64 "\n", ctime(&tnow), lastlsn.lsn);
fprintf(stderr, "%.24s TokuFT recovery scanning backward from %" PRIu64 "\n", ctime(&tnow), lastlsn.lsn);
for (unsigned i=0; 1; i++) {
// get the previous log entry (first time gets the last one)
le = NULL;
r = toku_logcursor_prev(logcursor, &le);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
recover_trace_le(__FUNCTION__, __LINE__, r, le);
if (r != 0) {
if (r == DB_NOTFOUND)
@ -1445,7 +1451,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
tnow = time(NULL);
if (tnow - tlast >= TOKUDB_RECOVERY_PROGRESS_TIME) {
thislsn = toku_log_entry_get_lsn(le);
fprintf(stderr, "%.24s Tokudb recovery scanning backward from %" PRIu64 " at %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, recover_state(renv));
fprintf(stderr, "%.24s TokuFT recovery scanning backward from %" PRIu64 " at %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, recover_state(renv));
tlast = tnow;
}
}
@ -1454,10 +1460,10 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
assert(renv->ss.ss == BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END ||
renv->ss.ss == BACKWARD_NEWER_CHECKPOINT_END);
logtype_dispatch_assign(le, toku_recover_backward_, r, renv);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
recover_trace_le(__FUNCTION__, __LINE__, r, le);
if (r != 0) {
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
fprintf(stderr, "DB_RUNRECOVERY: %s:%d r=%d\n", __FUNCTION__, __LINE__, r);
rr = DB_RUNRECOVERY;
goto errorexit;
@ -1474,7 +1480,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
assert(le);
thislsn = toku_log_entry_get_lsn(le);
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery starts scanning forward to %" PRIu64 " from %" PRIu64 " left %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, lastlsn.lsn - thislsn.lsn, recover_state(renv));
fprintf(stderr, "%.24s TokuFT recovery starts scanning forward to %" PRIu64 " from %" PRIu64 " left %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, lastlsn.lsn - thislsn.lsn, recover_state(renv));
for (unsigned i=0; 1; i++) {
@ -1483,7 +1489,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
tnow = time(NULL);
if (tnow - tlast >= TOKUDB_RECOVERY_PROGRESS_TIME) {
thislsn = toku_log_entry_get_lsn(le);
fprintf(stderr, "%.24s Tokudb recovery scanning forward to %" PRIu64 " at %" PRIu64 " left %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, lastlsn.lsn - thislsn.lsn, recover_state(renv));
fprintf(stderr, "%.24s TokuFT recovery scanning forward to %" PRIu64 " at %" PRIu64 " left %" PRIu64 " (%s)\n", ctime(&tnow), lastlsn.lsn, thislsn.lsn, lastlsn.lsn - thislsn.lsn, recover_state(renv));
tlast = tnow;
}
}
@ -1492,10 +1498,10 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
assert(renv->ss.ss == FORWARD_BETWEEN_CHECKPOINT_BEGIN_END ||
renv->ss.ss == FORWARD_NEWER_CHECKPOINT_END);
logtype_dispatch_assign(le, toku_recover_, r, renv);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
recover_trace_le(__FUNCTION__, __LINE__, r, le);
if (r != 0) {
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
fprintf(stderr, "DB_RUNRECOVERY: %s:%d r=%d\n", __FUNCTION__, __LINE__, r);
rr = DB_RUNRECOVERY;
goto errorexit;
@ -1504,7 +1510,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
// get the next log entry
le = NULL;
r = toku_logcursor_next(logcursor, &le);
if (tokudb_recovery_trace)
if (tokuft_recovery_trace)
recover_trace_le(__FUNCTION__, __LINE__, r, le);
if (r != 0) {
if (r == DB_NOTFOUND)
@ -1532,7 +1538,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
uint32_t n = recover_get_num_live_txns(renv);
if (n > 0) {
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery has %" PRIu32 " live transaction%s\n", ctime(&tnow), n, n > 1 ? "s" : "");
fprintf(stderr, "%.24s TokuFT recovery has %" PRIu32 " live transaction%s\n", ctime(&tnow), n, n > 1 ? "s" : "");
}
}
recover_abort_all_live_txns(renv);
@ -1540,7 +1546,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
uint32_t n = recover_get_num_live_txns(renv);
if (n > 0) {
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery has %" PRIu32 " prepared transaction%s\n", ctime(&tnow), n, n > 1 ? "s" : "");
fprintf(stderr, "%.24s TokuFT recovery has %" PRIu32 " prepared transaction%s\n", ctime(&tnow), n, n > 1 ? "s" : "");
}
}
@ -1549,7 +1555,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
n = file_map_get_num_dictionaries(&renv->fmap);
if (n > 0) {
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery closing %" PRIu32 " dictionar%s\n", ctime(&tnow), n, n > 1 ? "ies" : "y");
fprintf(stderr, "%.24s TokuFT recovery closing %" PRIu32 " dictionar%s\n", ctime(&tnow), n, n > 1 ? "ies" : "y");
}
file_map_close_dictionaries(&renv->fmap, lastlsn);
@ -1561,17 +1567,17 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di
// checkpoint
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery making a checkpoint\n", ctime(&tnow));
fprintf(stderr, "%.24s TokuFT recovery making a checkpoint\n", ctime(&tnow));
r = toku_checkpoint(renv->cp, renv->logger, NULL, NULL, NULL, NULL, RECOVERY_CHECKPOINT);
assert(r == 0);
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery done\n", ctime(&tnow));
fprintf(stderr, "%.24s TokuFT recovery done\n", ctime(&tnow));
return 0;
errorexit:
tnow = time(NULL);
fprintf(stderr, "%.24s Tokudb recovery failed %d\n", ctime(&tnow), rr);
fprintf(stderr, "%.24s TokuFT recovery failed %d\n", ctime(&tnow), rr);
if (logcursor) {
r = toku_logcursor_destroy(&logcursor);
@ -1596,7 +1602,7 @@ toku_recover_unlock(int lockfd) {
return toku_single_process_unlock(&lockfd_copy);
}
int tokudb_recover(DB_ENV *env,
int tokuft_recover(DB_ENV *env,
prepared_txn_callback_t prepared_txn_callback,
keep_cachetable_callback_t keep_cachetable_callback,
TOKULOGGER logger,
@ -1614,7 +1620,7 @@ int tokudb_recover(DB_ENV *env,
return r;
int rr = 0;
if (tokudb_needs_recovery(log_dir, false)) {
if (tokuft_needs_recovery(log_dir, false)) {
struct recover_env renv;
r = recover_env_init(&renv,
env_dir,
@ -1643,7 +1649,7 @@ int tokudb_recover(DB_ENV *env,
// Return 0 if recovery log exists, ENOENT if log is missing
int
tokudb_recover_log_exists(const char * log_dir) {
tokuft_recover_log_exists(const char * log_dir) {
int r;
TOKULOGCURSOR logcursor;


@ -1,7 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKURECOVER_H
#define TOKURECOVER_H
#ident "$Id$"
/*
@ -32,7 +30,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -89,55 +87,53 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <db.h>
#include <errno.h>
#include <db.h>
#include "fttypes.h"
#include "memory.h"
#include "x1764.h"
#include "portability/memory.h"
#include "portability/toku_portability.h"
#include "ft/comparator.h"
#include "ft/ft-ops.h"
#include "util/x1764.h"
typedef void (*prepared_txn_callback_t)(DB_ENV*, TOKUTXN);
typedef void (*keep_cachetable_callback_t)(DB_ENV*, CACHETABLE);
typedef void (*prepared_txn_callback_t)(DB_ENV *env, struct tokutxn *txn);
typedef void (*keep_cachetable_callback_t)(DB_ENV *env, struct cachetable *ct);
// Run tokudb recovery from the log
// Run tokuft recovery from the log
// Returns 0 if success
int tokudb_recover (DB_ENV *env,
prepared_txn_callback_t prepared_txn_callback,
keep_cachetable_callback_t keep_cachetable_callback,
TOKULOGGER logger,
const char *env_dir, const char *log_dir,
ft_compare_func bt_compare,
ft_update_func update_function,
generate_row_for_put_func generate_row_for_put,
generate_row_for_del_func generate_row_for_del,
size_t cachetable_size);
int tokuft_recover(DB_ENV *env,
prepared_txn_callback_t prepared_txn_callback,
keep_cachetable_callback_t keep_cachetable_callback,
struct tokulogger *logger,
const char *env_dir,
const char *log_dir,
ft_compare_func bt_compare,
ft_update_func update_function,
generate_row_for_put_func generate_row_for_put,
generate_row_for_del_func generate_row_for_del,
size_t cachetable_size);
// Effect: Check the tokudb logs to determine whether or not we need to run recovery.
// Effect: Check the tokuft logs to determine whether or not we need to run recovery.
// If the log is empty or if there is a clean shutdown at the end of the log, then we
// don't need to run recovery.
// Returns: true if we need recovery, otherwise false.
int tokudb_needs_recovery(const char *logdir, bool ignore_empty_log);
int tokuft_needs_recovery(const char *logdir, bool ignore_empty_log);
// Return 0 if recovery log exists, ENOENT if log is missing
int tokudb_recover_log_exists(const char * log_dir);
int tokuft_recover_log_exists(const char * log_dir);
// For test only - set callbacks for recovery testing
void toku_recover_set_callback (void (*)(void*), void*);
void toku_recover_set_callback2 (void (*)(void*), void*);
extern int tokudb_recovery_trace;
extern int tokuft_recovery_trace;
int toku_recover_lock (const char *lock_dir, int *lockfd);
int toku_recover_unlock(int lockfd);
static const prepared_txn_callback_t NULL_prepared_txn_callback __attribute__((__unused__)) = NULL;
static const keep_cachetable_callback_t NULL_keep_cachetable_callback __attribute__((__unused__)) = NULL;
#endif // TOKURECOVER_H
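A minimal sketch of how the renamed entry points above are meant to be combined by an embedding layer. The environment, logger, and the comparison/update/row-generation callbacks (my_bt_compare, my_update, my_gen_put, my_gen_del) are placeholders assumed to exist in the caller, and the cachetable size is illustrative only:

    static int maybe_run_recovery(DB_ENV *env, struct tokulogger *logger,
                                  const char *env_dir, const char *log_dir) {
        if (tokuft_recover_log_exists(log_dir) == ENOENT) {
            return 0;  // no recovery log at all, nothing to recover
        }
        if (!tokuft_needs_recovery(log_dir, false /*ignore_empty_log*/)) {
            return 0;  // empty log or clean shutdown at the end of the log
        }
        return tokuft_recover(env,
                              NULL_prepared_txn_callback,
                              NULL_keep_cachetable_callback,
                              logger, env_dir, log_dir,
                              my_bt_compare, my_update,
                              my_gen_put, my_gen_del,
                              16UL << 20);  // cachetable size (bytes), illustrative
    }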


@ -29,7 +29,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -88,46 +88,84 @@ PATENT RIGHTS GRANT:
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#include "portability/toku_portability.h"
#include <toku_portability.h>
#include "fttypes.h"
#include "xids.h"
#include "ft_msg.h"
#include "ft/msg.h"
#include "ft/txn/xids.h"
#include "util/dbt.h"
uint32_t
ft_msg_get_keylen(FT_MSG ft_msg) {
uint32_t rval = ft_msg->u.id.key->size;
return rval;
ft_msg::ft_msg(const DBT *key, const DBT *val, enum ft_msg_type t, MSN m, XIDS x) :
_key(key ? *key : toku_empty_dbt()),
_val(val ? *val : toku_empty_dbt()),
_type(t), _msn(m), _xids(x) {
}
uint32_t
ft_msg_get_vallen(FT_MSG ft_msg) {
uint32_t rval = ft_msg->u.id.val->size;
return rval;
ft_msg ft_msg::deserialize_from_rbuf(struct rbuf *rb, XIDS *x, bool *is_fresh) {
const void *keyp, *valp;
uint32_t keylen, vallen;
enum ft_msg_type t = (enum ft_msg_type) rbuf_char(rb);
*is_fresh = rbuf_char(rb);
MSN m = rbuf_MSN(rb);
toku_xids_create_from_buffer(rb, x);
rbuf_bytes(rb, &keyp, &keylen);
rbuf_bytes(rb, &valp, &vallen);
DBT k, v;
return ft_msg(toku_fill_dbt(&k, keyp, keylen), toku_fill_dbt(&v, valp, vallen), t, m, *x);
}
XIDS
ft_msg_get_xids(FT_MSG ft_msg) {
XIDS rval = ft_msg->xids;
return rval;
ft_msg ft_msg::deserialize_from_rbuf_v13(struct rbuf *rb, MSN m, XIDS *x) {
const void *keyp, *valp;
uint32_t keylen, vallen;
enum ft_msg_type t = (enum ft_msg_type) rbuf_char(rb);
toku_xids_create_from_buffer(rb, x);
rbuf_bytes(rb, &keyp, &keylen);
rbuf_bytes(rb, &valp, &vallen);
DBT k, v;
return ft_msg(toku_fill_dbt(&k, keyp, keylen), toku_fill_dbt(&v, valp, vallen), t, m, *x);
}
void *
ft_msg_get_key(FT_MSG ft_msg) {
void * rval = ft_msg->u.id.key->data;
return rval;
const DBT *ft_msg::kdbt() const {
return &_key;
}
void *
ft_msg_get_val(FT_MSG ft_msg) {
void * rval = ft_msg->u.id.val->data;
return rval;
const DBT *ft_msg::vdbt() const {
return &_val;
}
enum ft_msg_type
ft_msg_get_type(FT_MSG ft_msg) {
enum ft_msg_type rval = ft_msg->type;
return rval;
enum ft_msg_type ft_msg::type() const {
return _type;
}
MSN ft_msg::msn() const {
return _msn;
}
XIDS ft_msg::xids() const {
return _xids;
}
size_t ft_msg::total_size() const {
// Must store two 4-byte lengths
static const size_t key_val_overhead = 8;
// 1 byte type, 1 byte freshness, then 8 byte MSN
static const size_t msg_overhead = 2 + sizeof(MSN);
static const size_t total_overhead = key_val_overhead + msg_overhead;
const size_t keyval_size = _key.size + _val.size;
const size_t xids_size = toku_xids_get_serialize_size(xids());
return total_overhead + keyval_size + xids_size;
}
void ft_msg::serialize_to_wbuf(struct wbuf *wb, bool is_fresh) const {
wbuf_nocrc_char(wb, (unsigned char) _type);
wbuf_nocrc_char(wb, (unsigned char) is_fresh);
wbuf_MSN(wb, _msn);
wbuf_nocrc_xids(wb, _xids);
wbuf_nocrc_bytes(wb, _key.data, _key.size);
wbuf_nocrc_bytes(wb, _val.data, _val.size);
}
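A minimal sketch of constructing and inspecting an ft_msg with the class above. `xids' is assumed to come from the transaction layer, and ZERO_MSN (declared in the ft/msg.h hunk below) is the placeholder MSN that gets filled in when the message is applied to the tree:

    DBT kdbt, vdbt;
    const char k = 'a', v = 'b';
    ft_msg msg(toku_fill_dbt(&kdbt, &k, sizeof k),
               toku_fill_dbt(&vdbt, &v, sizeof v),
               FT_INSERT, ZERO_MSN, xids);   // xids: assumed to already exist
    // accessors return the same key/value bytes and metadata that were passed in
    invariant(msg.type() == FT_INSERT);
    invariant(msg.kdbt()->size == sizeof k);
    size_t wire_size = msg.total_size();     // fixed overhead + key + val + xids
    (void) wire_size;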


@ -1,5 +1,11 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/* The purpose of this file is to provide access to the ft_msg,
* which is the ephemeral version of the messages that lives in
* a message buffer.
*/
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -29,7 +35,7 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
@ -86,145 +92,155 @@ PATENT RIGHTS GRANT:
under this License.
*/
#pragma once
#include <db.h>
#include "portability/toku_assert.h"
#include "portability/toku_stdint.h"
#include "ft/txn/xids.h"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <memory.h>
#include <string.h>
#include <db.h>
// Message Sequence Number (MSN)
typedef struct __toku_msn { uint64_t msn; } MSN;
#include "omt.h"
// dummy used for message construction, to be filled in when msg is applied to tree
static const MSN ZERO_MSN = { .msn = 0 };
int
toku_omt_create_steal_sorted_array(OMT *omtp, OMTVALUE **valuesp, uint32_t numvalues, uint32_t capacity) {
OMT XMALLOC(omt);
omt->create_steal_sorted_array(valuesp, numvalues, capacity);
*omtp = omt;
return 0;
}
// first 2^62 values reserved for messages created before Dr. No (for upgrade)
static const MSN MIN_MSN = { .msn = 1ULL << 62 };
static const MSN MAX_MSN = { .msn = UINT64_MAX };
//TODO: Put all omt API functions here.
int toku_omt_create (OMT *omtp) {
OMT XMALLOC(omt);
omt->create();
*omtp = omt;
return 0;
}
void toku_omt_destroy(OMT *omtp) {
OMT omt=*omtp;
omt->destroy();
toku_free(omt);
*omtp=NULL;
}
uint32_t toku_omt_size(OMT V) {
return V->size();
}
int toku_omt_create_from_sorted_array(OMT *omtp, OMTVALUE *values, uint32_t numvalues) {
OMT XMALLOC(omt);
omt->create_from_sorted_array(values, numvalues);
*omtp=omt;
return 0;
}
int toku_omt_insert_at(OMT omt, OMTVALUE value, uint32_t index) {
return omt->insert_at(value, index);
}
int toku_omt_set_at (OMT omt, OMTVALUE value, uint32_t index) {
return omt->set_at(value, index);
}
int toku_omt_delete_at(OMT omt, uint32_t index) {
return omt->delete_at(index);
}
int toku_omt_fetch(OMT omt, uint32_t i, OMTVALUE *v) {
return omt->fetch(i, v);
}
struct functor {
int (*f)(OMTVALUE, uint32_t, void *);
void *v;
/* tree command types */
enum ft_msg_type {
FT_NONE = 0,
FT_INSERT = 1,
FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE.
//FT_DELETE_BOTH = 3,
FT_ABORT_ANY = 4, // Abort any commands on any matching key.
//FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value
FT_COMMIT_ANY = 6,
//FT_COMMIT_BOTH = 7,
FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions).
FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction).
FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (abort specific transaction).
FT_INSERT_NO_OVERWRITE = 11,
FT_OPTIMIZE = 12, // Broadcast
FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode
FT_UPDATE = 14,
FT_UPDATE_BROADCAST_ALL = 15
};
static_assert(std::is_pod<functor>::value, "not POD");
int call_functor(const OMTVALUE &v, uint32_t idx, functor *const ftor);
int call_functor(const OMTVALUE &v, uint32_t idx, functor *const ftor) {
return ftor->f(const_cast<OMTVALUE>(v), idx, ftor->v);
}
int toku_omt_iterate(OMT omt, int (*f)(OMTVALUE, uint32_t, void*), void*v) {
struct functor ftor = { .f = f, .v = v };
return omt->iterate<functor, call_functor>(&ftor);
}
int toku_omt_iterate_on_range(OMT omt, uint32_t left, uint32_t right, int (*f)(OMTVALUE, uint32_t, void*), void*v) {
struct functor ftor = { .f = f, .v = v };
return omt->iterate_on_range<functor, call_functor>(left, right, &ftor);
}
struct heftor {
int (*h)(OMTVALUE, void *v);
void *v;
};
static_assert(std::is_pod<heftor>::value, "not POD");
int call_heftor(const OMTVALUE &v, const heftor &htor);
int call_heftor(const OMTVALUE &v, const heftor &htor) {
return htor.h(const_cast<OMTVALUE>(v), htor.v);
}
int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v, uint32_t *index) {
struct heftor htor = { .h = h, .v = v };
return omt->insert<heftor, call_heftor>(value, htor, index);
}
int toku_omt_find_zero(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, uint32_t *index) {
struct heftor htor = { .h = h, .v = extra };
return V->find_zero<heftor, call_heftor>(htor, value, index);
}
int toku_omt_find(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, uint32_t *index) {
struct heftor htor = { .h = h, .v = extra };
return V->find<heftor, call_heftor>(htor, direction, value, index);
}
int toku_omt_split_at(OMT omt, OMT *newomtp, uint32_t index) {
OMT XMALLOC(newomt);
int r = omt->split_at(newomt, index);
if (r != 0) {
toku_free(newomt);
} else {
*newomtp = newomt;
static inline bool
ft_msg_type_applies_once(enum ft_msg_type type)
{
bool ret_val;
switch (type) {
case FT_INSERT_NO_OVERWRITE:
case FT_INSERT:
case FT_DELETE_ANY:
case FT_ABORT_ANY:
case FT_COMMIT_ANY:
case FT_UPDATE:
ret_val = true;
break;
case FT_COMMIT_BROADCAST_ALL:
case FT_COMMIT_BROADCAST_TXN:
case FT_ABORT_BROADCAST_TXN:
case FT_OPTIMIZE:
case FT_OPTIMIZE_FOR_UPGRADE:
case FT_UPDATE_BROADCAST_ALL:
case FT_NONE:
ret_val = false;
break;
default:
assert(false);
}
return r;
return ret_val;
}
int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomtp) {
OMT XMALLOC(newomt);
newomt->merge(leftomt, rightomt);
toku_free(leftomt);
toku_free(rightomt);
*newomtp = newomt;
return 0;
static inline bool
ft_msg_type_applies_all(enum ft_msg_type type)
{
bool ret_val;
switch (type) {
case FT_NONE:
case FT_INSERT_NO_OVERWRITE:
case FT_INSERT:
case FT_DELETE_ANY:
case FT_ABORT_ANY:
case FT_COMMIT_ANY:
case FT_UPDATE:
ret_val = false;
break;
case FT_COMMIT_BROADCAST_ALL:
case FT_COMMIT_BROADCAST_TXN:
case FT_ABORT_BROADCAST_TXN:
case FT_OPTIMIZE:
case FT_OPTIMIZE_FOR_UPGRADE:
case FT_UPDATE_BROADCAST_ALL:
ret_val = true;
break;
default:
assert(false);
}
return ret_val;
}
int toku_omt_clone_noptr(OMT *dest, OMT src) {
OMT XMALLOC(omt);
omt->clone(*src);
*dest = omt;
return 0;
static inline bool
ft_msg_type_does_nothing(enum ft_msg_type type)
{
return (type == FT_NONE);
}
void toku_omt_clear(OMT omt) {
omt->clear();
class ft_msg {
public:
ft_msg(const DBT *key, const DBT *val, enum ft_msg_type t, MSN m, XIDS x);
enum ft_msg_type type() const;
MSN msn() const;
XIDS xids() const;
const DBT *kdbt() const;
const DBT *vdbt() const;
size_t total_size() const;
void serialize_to_wbuf(struct wbuf *wb, bool is_fresh) const;
// deserialization goes through a static factory function so the ft msg
// API stays completely const and there's no default constructor
static ft_msg deserialize_from_rbuf(struct rbuf *rb, XIDS *xids, bool *is_fresh);
// Version 13/14 messages did not have an msn - so `m' is the MSN
// that will be assigned to the message that gets deserialized.
static ft_msg deserialize_from_rbuf_v13(struct rbuf *rb, MSN m, XIDS *xids);
private:
const DBT _key;
const DBT _val;
enum ft_msg_type _type;
MSN _msn;
XIDS _xids;
};
// For serialize / deserialize
#include "ft/serialize/wbuf.h"
static inline void wbuf_MSN(struct wbuf *wb, MSN msn) {
wbuf_ulonglong(wb, msn.msn);
}
size_t toku_omt_memory_size (OMT omt) {
return omt->memory_size();
}
#include "ft/serialize/rbuf.h"
static inline MSN rbuf_MSN(struct rbuf *rb) {
MSN msn = { .msn = rbuf_ulonglong(rb) };
return msn;
}
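A small sketch, using only the declarations above, of how the type predicates split messages into the two delivery classes used elsewhere in this commit (per-key vs. broadcast); it mirrors the offset routing in message_buffer::deserialize_from_rbuf():

    static const char *ft_msg_delivery_class(enum ft_msg_type type) {
        if (ft_msg_type_applies_once(type)) {
            return "applies to a single key (fresh or stale message tree)";
        }
        if (ft_msg_type_applies_all(type)) {
            return "broadcast: applies to every leafentry";
        }
        // only FT_NONE is left
        assert(ft_msg_type_does_nothing(type));
        return "no-op";
    }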


@ -0,0 +1,318 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#include "ft/msg_buffer.h"
#include "util/dbt.h"
void message_buffer::create() {
_num_entries = 0;
_memory = nullptr;
_memory_size = 0;
_memory_used = 0;
}
void message_buffer::clone(message_buffer *src) {
_num_entries = src->_num_entries;
_memory_used = src->_memory_used;
_memory_size = src->_memory_size;
XMALLOC_N(_memory_size, _memory);
memcpy(_memory, src->_memory, _memory_size);
}
void message_buffer::destroy() {
if (_memory != nullptr) {
toku_free(_memory);
}
}
void message_buffer::deserialize_from_rbuf(struct rbuf *rb,
int32_t **fresh_offsets, int32_t *nfresh,
int32_t **stale_offsets, int32_t *nstale,
int32_t **broadcast_offsets, int32_t *nbroadcast) {
// read the number of messages in this buffer
int n_in_this_buffer = rbuf_int(rb);
if (fresh_offsets != nullptr) {
XMALLOC_N(n_in_this_buffer, *fresh_offsets);
}
if (stale_offsets != nullptr) {
XMALLOC_N(n_in_this_buffer, *stale_offsets);
}
if (broadcast_offsets != nullptr) {
XMALLOC_N(n_in_this_buffer, *broadcast_offsets);
}
_resize(rb->size + 64); // rb->size is a good hint for how big the buffer will be
// deserialize each message individually, noting whether it was fresh
// and putting its buffer offset in the appropriate offsets array
for (int i = 0; i < n_in_this_buffer; i++) {
XIDS xids;
bool is_fresh;
const ft_msg msg = ft_msg::deserialize_from_rbuf(rb, &xids, &is_fresh);
int32_t *dest;
if (ft_msg_type_applies_once(msg.type())) {
if (is_fresh) {
dest = fresh_offsets ? *fresh_offsets + (*nfresh)++ : nullptr;
} else {
dest = stale_offsets ? *stale_offsets + (*nstale)++ : nullptr;
}
} else {
invariant(ft_msg_type_applies_all(msg.type()) || ft_msg_type_does_nothing(msg.type()));
dest = broadcast_offsets ? *broadcast_offsets + (*nbroadcast)++ : nullptr;
}
enqueue(msg, is_fresh, dest);
toku_xids_destroy(&xids);
}
invariant(_num_entries == n_in_this_buffer);
}
MSN message_buffer::deserialize_from_rbuf_v13(struct rbuf *rb,
MSN *highest_unused_msn_for_upgrade,
int32_t **fresh_offsets, int32_t *nfresh,
int32_t **broadcast_offsets, int32_t *nbroadcast) {
// read the number of messages in this buffer
int n_in_this_buffer = rbuf_int(rb);
if (fresh_offsets != nullptr) {
XMALLOC_N(n_in_this_buffer, *fresh_offsets);
}
if (broadcast_offsets != nullptr) {
XMALLOC_N(n_in_this_buffer, *broadcast_offsets);
}
// Atomically decrement the header's MSN count by the number
// of messages in the buffer.
MSN highest_msn_in_this_buffer = {
.msn = toku_sync_sub_and_fetch(&highest_unused_msn_for_upgrade->msn, n_in_this_buffer)
};
// Create the message buffers from the deserialized buffer.
for (int i = 0; i < n_in_this_buffer; i++) {
XIDS xids;
// There were no stale messages at this version, so call it fresh.
const bool is_fresh = true;
// Increment our MSN, the last message should have the
// newest/highest MSN. See above for a full explanation.
highest_msn_in_this_buffer.msn++;
const ft_msg msg = ft_msg::deserialize_from_rbuf_v13(rb, highest_msn_in_this_buffer, &xids);
int32_t *dest;
if (ft_msg_type_applies_once(msg.type())) {
dest = fresh_offsets ? *fresh_offsets + (*nfresh)++ : nullptr;
} else {
invariant(ft_msg_type_applies_all(msg.type()) || ft_msg_type_does_nothing(msg.type()));
dest = broadcast_offsets ? *broadcast_offsets + (*nbroadcast)++ : nullptr;
}
enqueue(msg, is_fresh, dest);
toku_xids_destroy(&xids);
}
return highest_msn_in_this_buffer;
}
void message_buffer::_resize(size_t new_size) {
XREALLOC_N(new_size, _memory);
_memory_size = new_size;
}
static int next_power_of_two (int n) {
int r = 4096;
while (r < n) {
r*=2;
assert(r>0);
}
return r;
}
struct message_buffer::buffer_entry *message_buffer::get_buffer_entry(int32_t offset) const {
return (struct buffer_entry *) (_memory + offset);
}
void message_buffer::enqueue(const ft_msg &msg, bool is_fresh, int32_t *offset) {
int need_space_here = msg_memsize_in_buffer(msg);
int need_space_total = _memory_used + need_space_here;
if (_memory == nullptr || need_space_total > _memory_size) {
// resize the buffer to the next power of 2 greater than the needed space
int next_2 = next_power_of_two(need_space_total);
_resize(next_2);
}
uint32_t keylen = msg.kdbt()->size;
uint32_t datalen = msg.vdbt()->size;
struct buffer_entry *entry = get_buffer_entry(_memory_used);
entry->type = (unsigned char) msg.type();
entry->msn = msg.msn();
toku_xids_cpy(&entry->xids_s, msg.xids());
entry->is_fresh = is_fresh;
unsigned char *e_key = toku_xids_get_end_of_array(&entry->xids_s);
entry->keylen = keylen;
memcpy(e_key, msg.kdbt()->data, keylen);
entry->vallen = datalen;
memcpy(e_key + keylen, msg.vdbt()->data, datalen);
if (offset) {
*offset = _memory_used;
}
_num_entries++;
_memory_used += need_space_here;
}
void message_buffer::set_freshness(int32_t offset, bool is_fresh) {
struct buffer_entry *entry = get_buffer_entry(offset);
entry->is_fresh = is_fresh;
}
bool message_buffer::get_freshness(int32_t offset) const {
struct buffer_entry *entry = get_buffer_entry(offset);
return entry->is_fresh;
}
ft_msg message_buffer::get_message(int32_t offset, DBT *keydbt, DBT *valdbt) const {
struct buffer_entry *entry = get_buffer_entry(offset);
uint32_t keylen = entry->keylen;
uint32_t vallen = entry->vallen;
enum ft_msg_type type = (enum ft_msg_type) entry->type;
MSN msn = entry->msn;
const XIDS xids = (XIDS) &entry->xids_s;
const void *key = toku_xids_get_end_of_array(xids);
const void *val = (uint8_t *) key + entry->keylen;
return ft_msg(toku_fill_dbt(keydbt, key, keylen), toku_fill_dbt(valdbt, val, vallen), type, msn, xids);
}
void message_buffer::get_message_key_msn(int32_t offset, DBT *key, MSN *msn) const {
struct buffer_entry *entry = get_buffer_entry(offset);
if (key != nullptr) {
toku_fill_dbt(key, toku_xids_get_end_of_array((XIDS) &entry->xids_s), entry->keylen);
}
if (msn != nullptr) {
*msn = entry->msn;
}
}
int message_buffer::num_entries() const {
return _num_entries;
}
size_t message_buffer::buffer_size_in_use() const {
return _memory_used;
}
size_t message_buffer::memory_size_in_use() const {
return sizeof(*this) + _memory_used;
}
size_t message_buffer::memory_footprint() const {
return sizeof(*this) + toku_memory_footprint(_memory, _memory_used);
}
bool message_buffer::equals(message_buffer *other) const {
return (_memory_used == other->_memory_used &&
memcmp(_memory, other->_memory, _memory_used) == 0);
}
void message_buffer::serialize_to_wbuf(struct wbuf *wb) const {
wbuf_nocrc_int(wb, _num_entries);
struct msg_serialize_fn {
struct wbuf *wb;
msg_serialize_fn(struct wbuf *w) : wb(w) { }
int operator()(const ft_msg &msg, bool is_fresh) {
msg.serialize_to_wbuf(wb, is_fresh);
return 0;
}
} serialize_fn(wb);
iterate(serialize_fn);
}
size_t message_buffer::msg_memsize_in_buffer(const ft_msg &msg) {
const uint32_t keylen = msg.kdbt()->size;
const uint32_t datalen = msg.vdbt()->size;
const size_t xidslen = toku_xids_get_size(msg.xids());
return sizeof(struct buffer_entry) + keylen + datalen + xidslen - sizeof(XIDS_S);
}
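A minimal end-to-end sketch of the buffer above: enqueue one message and read it back by its offset. `msg' is assumed to be an ft_msg built as in the ft_msg example earlier; error handling is omitted:

    message_buffer mb;
    mb.create();

    int32_t offset;
    mb.enqueue(msg, true /*is_fresh*/, &offset);   // copies key/val/xids into mb

    DBT k, v;
    const ft_msg copy = mb.get_message(offset, &k, &v);  // k/v point into mb's memory
    invariant(copy.type() == msg.type());
    invariant(mb.get_freshness(offset));
    invariant(mb.num_entries() == 1);

    mb.destroy();   // frees the internal memory array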


@ -1,10 +1,6 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef XIDS_INTERNAL_H
#define XIDS_INTERNAL_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
@ -33,8 +29,8 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2014 Tokutek, Inc.
DISCLAIMER:
@ -90,29 +86,96 @@ PATENT RIGHTS GRANT:
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#pragma once
#include "ft/msg.h"
#include "ft/txn/xids.h"
#include "util/dbt.h"
// Variable size list of transaction ids (known in design doc as xids<>).
// ids[0] is the outermost transaction.
// ids[num_xids - 1] is the innermost transaction.
// Should only be accessed by accessor functions xids_xxx, not directly.
class message_buffer {
public:
void create();
// If the xids struct is unpacked, the compiler aligns the ids[] and we waste a lot of space
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
void clone(message_buffer *dst);
typedef struct __attribute__((__packed__)) xids_t {
uint8_t num_xids; // maximum value of MAX_TRANSACTION_RECORDS - 1 ...
// ... because transaction 0 is implicit
TXNID ids[];
} XIDS_S;
void destroy();
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
// effect: deserializes a message buffer from the given rbuf
// returns: *fresh_offsets (etc) malloc'd to be num_entries large and
// populated with *nfresh (etc) offsets in the message buffer
// requires: if fresh_offsets (etc) != nullptr, then nfresh != nullptr
void deserialize_from_rbuf(struct rbuf *rb,
int32_t **fresh_offsets, int32_t *nfresh,
int32_t **stale_offsets, int32_t *nstale,
int32_t **broadcast_offsets, int32_t *nbroadcast);
// effect: deserializes a message buffer whose messages are at version 13/14
// returns: similar to deserialize_from_rbuf(), except there are no stale messages
// and each message is assigned a sequential value from *highest_unused_msn_for_upgrade,
// which is modified as needed using toku_sync_sub_and_fetch()
// returns: the highest MSN assigned to any message in this buffer
// requires: similar to deserialize_from_rbuf(), and highest_unused_msn_for_upgrade != nullptr
MSN deserialize_from_rbuf_v13(struct rbuf *rb,
MSN *highest_unused_msn_for_upgrade,
int32_t **fresh_offsets, int32_t *nfresh,
int32_t **broadcast_offsets, int32_t *nbroadcast);
#endif
void enqueue(const ft_msg &msg, bool is_fresh, int32_t *offset);
void set_freshness(int32_t offset, bool is_fresh);
bool get_freshness(int32_t offset) const;
ft_msg get_message(int32_t offset, DBT *keydbt, DBT *valdbt) const;
void get_message_key_msn(int32_t offset, DBT *key, MSN *msn) const;
int num_entries() const;
size_t buffer_size_in_use() const;
size_t memory_size_in_use() const;
size_t memory_footprint() const;
template <typename F>
int iterate(F &fn) const {
for (int32_t offset = 0; offset < _memory_used; ) {
DBT k, v;
const ft_msg msg = get_message(offset, &k, &v);
bool is_fresh = get_freshness(offset);
int r = fn(msg, is_fresh);
if (r != 0) {
return r;
}
offset += msg_memsize_in_buffer(msg);
}
return 0;
}
bool equals(message_buffer *other) const;
void serialize_to_wbuf(struct wbuf *wb) const;
static size_t msg_memsize_in_buffer(const ft_msg &msg);
private:
void _resize(size_t new_size);
// If this isn't packed, the compiler aligns the xids array and we waste a lot of space
struct __attribute__((__packed__)) buffer_entry {
unsigned int keylen;
unsigned int vallen;
unsigned char type;
bool is_fresh;
MSN msn;
XIDS_S xids_s;
};
struct buffer_entry *get_buffer_entry(int32_t offset) const;
int _num_entries;
char *_memory; // An array of bytes into which buffer entries are embedded.
int _memory_size; // How big is _memory
int _memory_used; // How many bytes are in use?
};
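A short sketch of the iterate<> template above: any functor exposing int operator()(const ft_msg &, bool is_fresh) can walk the buffer, and a nonzero return stops the walk early, the same shape as the msg_serialize_fn functor used by serialize_to_wbuf(). `mb' is assumed to be a populated message_buffer:

    struct count_fresh_fn {
        int n_fresh;
        count_fresh_fn() : n_fresh(0) { }
        int operator()(const ft_msg &msg, bool is_fresh) {
            (void) msg;
            if (is_fresh) {
                n_fresh++;
            }
            return 0;   // keep walking; nonzero would stop iterate() early
        }
    };

    count_fresh_fn fn;
    mb.iterate(fn);      // mb: assumed populated message_buffer
    // fn.n_fresh now holds the number of fresh messages in the buffer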

File diff suppressed because it is too large


@ -0,0 +1,588 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#pragma once
#include "ft/bndata.h"
#include "ft/comparator.h"
#include "ft/ft.h"
#include "ft/msg_buffer.h"
/* Pivot keys.
* Child 0's keys are <= pivotkeys[0].
* Child 1's keys are > pivotkeys[0] and <= pivotkeys[1].
* etc
*/
class ftnode_pivot_keys {
public:
// effect: create an empty set of pivot keys
void create_empty();
// effect: create pivot keys by copying the given DBT array
void create_from_dbts(const DBT *keys, int n);
// effect: create pivot keys as a clone of an existing set of pivotkeys
void create_from_pivot_keys(const ftnode_pivot_keys &pivotkeys);
void destroy();
// effect: deserialize pivot keys previously serialized by serialize_to_wbuf()
void deserialize_from_rbuf(struct rbuf *rb, int n);
// returns: unowned DBT representing the i'th pivot key
DBT get_pivot(int i) const;
// effect: fills a DBT with the i'th pivot key
// returns: the given dbt
DBT *fill_pivot(int i, DBT *dbt) const;
// effect: insert a pivot into the i'th position, shifting others to the right
void insert_at(const DBT *key, int i);
// effect: append pivotkeys to the end of our own pivot keys
void append(const ftnode_pivot_keys &pivotkeys);
// effect: replace the pivot at the i'th position
void replace_at(const DBT *key, int i);
// effect: removes the i'th pivot key, shifting others to the left
void delete_at(int i);
// effect: split the pivot keys, removing all pivots at position greater
// than or equal to `i' and storing them in *other
// requires: *other is empty (size == 0)
void split_at(int i, ftnode_pivot_keys *other);
// effect: serialize pivot keys to a wbuf
// requires: wbuf has at least ftnode_pivot_keys::total_size() bytes available
void serialize_to_wbuf(struct wbuf *wb) const;
int num_pivots() const;
// return: the total size of this data structure
size_t total_size() const;
// return: the sum of the keys sizes of each pivot (for serialization)
size_t serialized_size() const;
private:
inline size_t _align4(size_t x) const {
return roundup_to_multiple(4, x);
}
// effect: create pivot keys, in fixed key format, by copying the given key array
void _create_from_fixed_keys(const char *fixedkeys, size_t fixed_keylen, int n);
char *_fixed_key(int i) const {
return &_fixed_keys[i * _fixed_keylen_aligned];
}
bool _fixed_format() const {
return _fixed_keys != nullptr;
}
void sanity_check() const;
void _insert_at_dbt(const DBT *key, int i);
void _append_dbt(const ftnode_pivot_keys &pivotkeys);
void _replace_at_dbt(const DBT *key, int i);
void _delete_at_dbt(int i);
void _split_at_dbt(int i, ftnode_pivot_keys *other);
void _insert_at_fixed(const DBT *key, int i);
void _append_fixed(const ftnode_pivot_keys &pivotkeys);
void _replace_at_fixed(const DBT *key, int i);
void _delete_at_fixed(int i);
void _split_at_fixed(int i, ftnode_pivot_keys *other);
// adds/destroys keys at a certain index (in dbt format),
// maintaining _total_size, but not _num_pivots
void _add_key_dbt(const DBT *key, int i);
void _destroy_key_dbt(int i);
// conversions to and from packed key array format
void _convert_to_dbt_format();
void _convert_to_fixed_format();
// If every key is _fixed_keylen long, then _fixed_keys is a
// packed array of keys..
char *_fixed_keys;
// The actual length of the fixed key
size_t _fixed_keylen;
// The aligned length that we use for fixed key storage
size_t _fixed_keylen_aligned;
// ..otherwise _fixed_keys is null and we store an array of dbts,
// each representing a key. this is simpler but less cache-efficient.
DBT *_dbt_keys;
int _num_pivots;
size_t _total_size;
};
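A minimal sketch of the pivot-key container above, assuming two caller-owned keys; per the comments, create_from_dbts() copies the keys and get_pivot() returns an unowned DBT referencing the container's own copy:

    DBT keys[2];
    toku_fill_dbt(&keys[0], "b", 1);
    toku_fill_dbt(&keys[1], "m", 1);

    ftnode_pivot_keys pivots;
    pivots.create_from_dbts(keys, 2);      // copies the two keys
    // pivots.num_pivots() == 2 here
    DBT p0 = pivots.get_pivot(0);          // unowned view of the first pivot
    (void) p0;
    pivots.destroy();                      // frees the copied keys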
// TODO: class me up
struct ftnode {
MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
unsigned int flags;
BLOCKNUM blocknum; // Which block number is this node?
int layout_version; // What version of the data structure?
int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging)
int layout_version_read_from_disk; // transient, not serialized to disk, (useful for debugging)
uint32_t build_id; // build_id (svn rev number) of software that wrote this node to disk
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
int dirty;
uint32_t fullhash;
// for internal nodes, if n_children==fanout+1 then the tree needs to be rebalanced.
// for leaf nodes, represents number of basement nodes
int n_children;
ftnode_pivot_keys pivotkeys;
// What's the oldest referenced xid that this node knows about? The real oldest
// referenced xid might be younger, but this is our best estimate. We use it
// as a heuristic to transition provisional mvcc entries from provisional to
// committed (from implicity committed to really committed).
//
// A better heuristic would be the oldest live txnid, but we use this since it
// still works well most of the time, and its readily available on the inject
// code path.
TXNID oldest_referenced_xid_known;
// array of size n_children, consisting of ftnode partitions
// each one is associated with a child
// for internal nodes, the ith partition corresponds to the ith message buffer
// for leaf nodes, the ith partition corresponds to the ith basement node
struct ftnode_partition *bp;
struct ctpair *ct_pair;
};
typedef struct ftnode *FTNODE;
// data of an available partition of a leaf ftnode
struct ftnode_leaf_basement_node {
bn_data data_buffer;
unsigned int seqinsert; // number of sequential inserts to this leaf
MSN max_msn_applied; // max message sequence number applied
bool stale_ancestor_messages_applied;
STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk
};
typedef struct ftnode_leaf_basement_node *BASEMENTNODE;
enum pt_state { // declare this to be packed so that when used below it will only take 1 byte.
PT_INVALID = 0,
PT_ON_DISK = 1,
PT_COMPRESSED = 2,
PT_AVAIL = 3};
enum ftnode_child_tag {
BCT_INVALID = 0,
BCT_NULL,
BCT_SUBBLOCK,
BCT_LEAF,
BCT_NONLEAF
};
typedef toku::omt<int32_t> off_omt_t;
typedef toku::omt<int32_t, int32_t, true> marked_off_omt_t;
// data of an available partition of a nonleaf ftnode
struct ftnode_nonleaf_childinfo {
message_buffer msg_buffer;
off_omt_t broadcast_list;
marked_off_omt_t fresh_message_tree;
off_omt_t stale_message_tree;
uint64_t flow[2]; // current and last checkpoint
};
typedef struct ftnode_nonleaf_childinfo *NONLEAF_CHILDINFO;
typedef struct ftnode_child_pointer {
union {
struct sub_block *subblock;
struct ftnode_nonleaf_childinfo *nonleaf;
struct ftnode_leaf_basement_node *leaf;
} u;
enum ftnode_child_tag tag;
} FTNODE_CHILD_POINTER;
struct ftnode_disk_data {
//
// stores the offset to the beginning of the partition on disk from the ftnode, and the length, needed to read a partition off of disk
// the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
// The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
// The SIZE is the size of the compressed partition.
// Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be.
// However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
uint32_t start;
uint32_t size;
};
typedef struct ftnode_disk_data *FTNODE_DISK_DATA;
// TODO: Turn these into functions instead of macros
#define BP_START(node_dd,i) ((node_dd)[i].start)
#define BP_SIZE(node_dd,i) ((node_dd)[i].size)
// a ftnode partition, associated with a child of a node
struct ftnode_partition {
// the following three variables are used for nonleaf nodes
// for leaf nodes, they are meaningless
BLOCKNUM blocknum; // blocknum of child
// How many bytes worth of work was performed by messages in each buffer.
uint64_t workdone;
//
// pointer to the partition. Depending on the state, they may be different things
// if state == PT_INVALID, then the node was just initialized and ptr == NULL
// if state == PT_ON_DISK, then ptr == NULL
// if state == PT_COMPRESSED, then ptr points to a struct sub_block*
// if state == PT_AVAIL, then ptr is:
// a struct ftnode_nonleaf_childinfo for internal nodes,
// a struct ftnode_leaf_basement_node for leaf nodes
//
struct ftnode_child_pointer ptr;
//
// at any time, the partitions may be in one of the following three states (stored in pt_state):
// PT_INVALID - means that the partition was just initialized
// PT_ON_DISK - means that the partition is not in memory and needs to be read from disk. To use, must read off disk and decompress
// PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
// PT_AVAIL - means the partition is decompressed and in memory
//
enum pt_state state; // make this an enum to make debugging easier.
// clock count used to for pe_callback to determine if a node should be evicted or not
// for now, saturating the count at 1
uint8_t clock_count;
};
//
// TODO: Fix all these names
// Organize declarations
// Fix widespread parameter ordering inconsistencies
//
BASEMENTNODE toku_create_empty_bn(void);
BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer.
NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo);
BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn);
NONLEAF_CHILDINFO toku_create_empty_nl(void);
void destroy_basement_node (BASEMENTNODE bn);
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl);
void toku_destroy_ftnode_internals(FTNODE node);
void toku_ftnode_free (FTNODE *node);
bool toku_ftnode_fully_in_memory(FTNODE node);
void toku_ftnode_assert_fully_in_memory(FTNODE node);
void toku_evict_bn_from_memory(FTNODE node, int childnum, FT ft);
BASEMENTNODE toku_detach_bn(FTNODE node, int childnum);
void toku_ftnode_update_disk_stats(FTNODE ftnode, FT ft, bool for_checkpoint);
void toku_ftnode_clone_partitions(FTNODE node, FTNODE cloned_node);
void toku_initialize_empty_ftnode(FTNODE node, BLOCKNUM blocknum, int height, int num_children,
int layout_version, unsigned int flags);
int toku_ftnode_which_child(FTNODE node, const DBT *k, const toku::comparator &cmp);
void toku_ftnode_save_ct_pair(CACHEKEY key, void *value_data, PAIR p);
//
// TODO: put the heaviside functions into their respective 'struct .*extra;' namespaces
//
struct toku_msg_buffer_key_msn_heaviside_extra {
const toku::comparator &cmp;
message_buffer *msg_buffer;
const DBT *key;
MSN msn;
toku_msg_buffer_key_msn_heaviside_extra(const toku::comparator &c, message_buffer *mb, const DBT *k, MSN m) :
cmp(c), msg_buffer(mb), key(k), msn(m) {
}
};
int toku_msg_buffer_key_msn_heaviside(const int32_t &v, const struct toku_msg_buffer_key_msn_heaviside_extra &extra);
struct toku_msg_buffer_key_msn_cmp_extra {
const toku::comparator &cmp;
message_buffer *msg_buffer;
toku_msg_buffer_key_msn_cmp_extra(const toku::comparator &c, message_buffer *mb) :
cmp(c), msg_buffer(mb) {
}
};
int toku_msg_buffer_key_msn_cmp(const struct toku_msg_buffer_key_msn_cmp_extra &extrap, const int &a, const int &b);
struct toku_msg_leafval_heaviside_extra {
const toku::comparator &cmp;
DBT const *const key;
toku_msg_leafval_heaviside_extra(const toku::comparator &c, const DBT *k) :
cmp(c), key(k) {
}
};
int toku_msg_leafval_heaviside(DBT const &kdbt, const struct toku_msg_leafval_heaviside_extra &be);
unsigned int toku_bnc_nbytesinbuf(NONLEAF_CHILDINFO bnc);
int toku_bnc_n_entries(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, uint32_t keylen, const void *data, uint32_t datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, const toku::comparator &cmp);
void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
void toku_bnc_flush_to_child(FT ft, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID parent_oldest_referenced_xid_known);
bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull));
bool toku_ftnode_nonleaf_is_gorged(FTNODE node, uint32_t nodesize);
uint32_t toku_ftnode_leaf_num_entries(FTNODE node);
void toku_ftnode_leaf_rebalance(FTNODE node, unsigned int basementnodesize);
void toku_ftnode_leaf_run_gc(FT ft, FTNODE node);
enum reactivity {
RE_STABLE,
RE_FUSIBLE,
RE_FISSIBLE
};
enum reactivity toku_ftnode_get_reactivity(FT ft, FTNODE node);
enum reactivity toku_ftnode_get_nonleaf_reactivity(FTNODE node, unsigned int fanout);
enum reactivity toku_ftnode_get_leaf_reactivity(FTNODE node, uint32_t nodesize);
/**
* Finds the next child for HOT to flush to, given that everything up to
* and including k has been flattened.
*
* If k falls between pivots in node, then we return the childnum where k
* lies.
*
* If k is equal to some pivot, then we return the next (to the right)
* childnum.
*/
int toku_ftnode_hot_next_child(FTNODE node, const DBT *k, const toku::comparator &cmp);
void toku_ftnode_put_msg(const toku::comparator &cmp, ft_update_func update_fun,
FTNODE node, int target_childnum,
const ft_msg &msg, bool is_fresh, txn_gc_info *gc_info,
size_t flow_deltas[], STAT64INFO stats_to_update);
void toku_ft_bn_apply_msg_once(BASEMENTNODE bn, const ft_msg &msg, uint32_t idx,
uint32_t le_keylen, LEAFENTRY le, txn_gc_info *gc_info,
uint64_t *workdonep, STAT64INFO stats_to_update);
void toku_ft_bn_apply_msg(const toku::comparator &cmp, ft_update_func update_fun,
BASEMENTNODE bn, const ft_msg &msg, txn_gc_info *gc_info,
uint64_t *workdone, STAT64INFO stats_to_update);
void toku_ft_leaf_apply_msg(const toku::comparator &cmp, ft_update_func update_fun,
FTNODE node, int target_childnum,
const ft_msg &msg, txn_gc_info *gc_info,
uint64_t *workdone, STAT64INFO stats_to_update);
//
// Message management for orthopush
//
struct ancestors {
// This is the root node if next is NULL (since the root has no ancestors)
FTNODE node;
// Which buffer holds messages destined to the node whose ancestors this list represents.
int childnum;
struct ancestors *next;
};
typedef struct ancestors *ANCESTORS;
void toku_ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc);
void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
// TODO: Should ft_handle just be FT?
class pivot_bounds;
void toku_apply_ancestors_messages_to_node(FT_HANDLE t, FTNODE node, ANCESTORS ancestors,
const pivot_bounds &bounds,
bool *msgs_applied, int child_to_read);
bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors,
const pivot_bounds &bounds,
MSN *const max_msn_in_path, int child_to_read);
void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);
struct ft_search;
int toku_ft_search_which_child(const toku::comparator &cmp, FTNODE node, ft_search *search);
//
// internal node inline functions
// TODO: Turn the macros into real functions
//
static inline void set_BNULL(FTNODE node, int i) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
node->bp[i].ptr.tag = BCT_NULL;
}
static inline bool is_BNULL (FTNODE node, int i) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
return node->bp[i].ptr.tag == BCT_NULL;
}
static inline NONLEAF_CHILDINFO BNC(FTNODE node, int i) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
paranoid_invariant(p.tag==BCT_NONLEAF);
return p.u.nonleaf;
}
static inline void set_BNC(FTNODE node, int i, NONLEAF_CHILDINFO nl) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
p->tag = BCT_NONLEAF;
p->u.nonleaf = nl;
}
static inline BASEMENTNODE BLB(FTNODE node, int i) {
paranoid_invariant(i >= 0);
// The optimizer really doesn't like it when we compare
// i to n_children as signed integers. So we assert that
// n_children is in fact positive before doing a comparison
// on the values forcibly cast to unsigned ints.
paranoid_invariant(node->n_children > 0);
paranoid_invariant((unsigned) i < (unsigned) node->n_children);
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
paranoid_invariant(p.tag==BCT_LEAF);
return p.u.leaf;
}
static inline void set_BLB(FTNODE node, int i, BASEMENTNODE bn) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
p->tag = BCT_LEAF;
p->u.leaf = bn;
}
static inline struct sub_block *BSB(FTNODE node, int i) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
paranoid_invariant(p.tag==BCT_SUBBLOCK);
return p.u.subblock;
}
static inline void set_BSB(FTNODE node, int i, struct sub_block *sb) {
paranoid_invariant(i >= 0);
paranoid_invariant(i < node->n_children);
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
p->tag = BCT_SUBBLOCK;
p->u.subblock = sb;
}
// ftnode partition macros
// BP stands for ftnode_partition
#define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone)
//
// macros for managing a node's clock
// Should be managed by ft-ops.c, NOT by serialize/deserialize
//
//
// BP_TOUCH_CLOCK uses a compare and swap because multiple threads
// that have a read lock on an internal node may try to touch the clock
// simultaneously
//
#define BP_TOUCH_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
#define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
#define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0)
// not crazy about having these two here, one is for the case where we create new
// nodes, such as in splits and creating new roots, and the other is for when
// we are deserializing a node and not all bp's are touched
#define BP_INIT_TOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
#define BP_INIT_UNTOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
// ftnode leaf basementnode macros,
#define BLB_MAX_MSN_APPLIED(node,i) (BLB(node,i)->max_msn_applied)
#define BLB_MAX_DSN_APPLIED(node,i) (BLB(node,i)->max_dsn_applied)
#define BLB_DATA(node,i) (&(BLB(node,i)->data_buffer))
#define BLB_NBYTESINDATA(node,i) (BLB_DATA(node,i)->get_disk_size())
#define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert)
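A hedged sketch of how the partition accessors and macros above are typically combined: check BP_STATE before touching a partition, then pick BNC() for internal nodes or the BLB macros for leaves. `node' and `childnum' are placeholders assumed to reference a pinned, in-memory FTNODE:

    if (BP_STATE(node, childnum) == PT_AVAIL) {
        if (node->height > 0) {
            NONLEAF_CHILDINFO bnc = BNC(node, childnum);
            int queued = toku_bnc_n_entries(bnc);          // messages waiting in this buffer
            (void) queued;
        } else {
            const auto bytes = BLB_NBYTESINDATA(node, childnum);  // basement node payload size
            (void) bytes;
        }
        BP_TOUCH_CLOCK(node, childnum);    // mark the partition as recently used
    }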


@ -1,416 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#if !defined(TOKU_OMT_H)
#define TOKU_OMT_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// Order Maintenance Tree (OMT)
//
// Maintains a collection of totally ordered values, where each value has an integer weight.
// The OMT is a mutable datatype.
//
// The Abstraction:
//
// An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
// The vector is numbered from $0$ to $|V|-1$.
// Each value has a weight. The weight of the $i$th element is denoted $w(V_i)$.
//
// We can create a new OMT, which is the empty vector.
//
// We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
// $|V'|=1+|V|$ and
//
// V'_j = V_j      if $j<i$
//        x        if $j=i$
//        V_{j-1}  if $j>i$.
//
// We can specify $i$ using a kind of function instead of as an integer.
// Let $b$ be a function mapping from values to nonzero integers, such that
// the signum of $b$ is monotonically increasing.
// We can specify $i$ as the minimum integer such that $b(V_i)>0$.
//
// We look up a value using its index, or using a Heaviside function.
// For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
// When looking up values, we can look up
// $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.)
// (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
// $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
// $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.)
//
// When looking up a value using a Heaviside function, we get the value and its index.
//
// We can also split an OMT into two OMTs, splitting the weight of the values evenly.
// Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
// The resulting two OMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
// All of the values from the original OMT go into one of the new OMTs.
// If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
// the new left OMT or the new right OMT is larger.
//
// Performance:
// Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
// The memory required is O(|V|).
//
// The programming API:
//typedef struct value *OMTVALUE; // A slight improvement over using void*.
#include <util/omt.h>
typedef void *OMTVALUE;
typedef toku::omt<OMTVALUE> *OMT;
int toku_omt_create (OMT *omtp);
// Effect: Create an empty OMT. Stores it in *omtp.
// Requires: omtp != NULL
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *omtp)
// Performance: constant time.
int toku_omt_create_from_sorted_array(OMT *omtp, OMTVALUE *values, uint32_t numvalues);
// Effect: Create an OMT containing values. The number of values is in numvalues.
// Stores the new OMT in *omtp.
// Requires: omtp != NULL
// Requires: values != NULL
// Requires: values is sorted
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *omtp)
// Performance: time=O(numvalues)
// Rationale: Normally, inserting N values takes O(N lg N) amortized time.
// If the N values are known in advance, are sorted, and
// the structure is empty, we can batch insert them much faster.
int toku_omt_create_steal_sorted_array(OMT *omtp, OMTVALUE **valuesp, uint32_t numvalues, uint32_t steal_capacity);
// Effect: Create an OMT containing values. The number of values is in numvalues.
// On success the OMT takes ownership of *valuesp array, and sets valuesp=NULL.
// Requires: omtp != NULL
// Requires: valuesp != NULL
// Requires: *valuesp is sorted
// Requires: *valuesp was allocated with toku_malloc
// Requires: Capacity of the *valuesp array is <= steal_capacity
// Requires: On success, *valuesp may not be accessed again by the caller.
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *omtp)
// EINVAL *valuesp == NULL or numvalues > capacity
// Performance: time=O(1)
// Rationale: toku_omt_create_from_sorted_array takes O(numvalues) time.
// By taking ownership of the array, we save a malloc and memcpy,
// and possibly a free (if the caller is done with the array).
void toku_omt_destroy(OMT *omtp);
// Effect: Destroy an OMT, freeing all its memory.
// Does not free the OMTVALUEs stored in the OMT.
// Those values may be freed before or after calling toku_omt_destroy.
// Also sets *omtp=NULL.
// Requires: omtp != NULL
// Requires: *omtp != NULL
// Rationale: The usage is to do something like
// toku_omt_destroy(&s->omt);
// and now s->omt will have a NULL pointer instead of a dangling freed pointer.
// Rationale: Returns no values since free() cannot fail.
// Rationale: Does not free the OMTVALUEs to reduce complexity.
// Performance: time=O(toku_omt_size(*omtp))
uint32_t toku_omt_size(OMT V);
// Effect: return |V|.
// Requires: V != NULL
// Performance: time=O(1)
int toku_omt_iterate_on_range(OMT omt, uint32_t left, uint32_t right, int (*f)(OMTVALUE, uint32_t, void*), void*v);
// Effect: Iterate over the values of the omt, from left to right, calling f on each value.
// The second argument passed to f is the index of the value.
// The third argument passed to f is v.
// The indices run from 0 (inclusive) to toku_omt_size(omt) (exclusive).
// We will iterate only over [left,right)
//
// Requires: omt != NULL
// left <= right
// Requires: f != NULL
// Returns:
// If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_omt_iterate.
// If f always returns zero, then toku_omt_iterate returns 0.
// Requires: Don't modify omt while running. (E.g., f may not insert or delete values from omt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in omt.
// Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.
int toku_omt_iterate(OMT omt, int (*f)(OMTVALUE, uint32_t, void*), void*v);
// Effect: Iterate over the values of the omt, from left to right, calling f on each value.
// The second argument passed to f is the index of the value.
// The third argument passed to f is v.
// The indices run from 0 (inclusive) to toku_omt_size(omt) (exclusive).
// Requires: omt != NULL
// Requires: f != NULL
// Returns:
// If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_omt_iterate.
// If f always returns zero, then toku_omt_iterate returns 0.
// Requires: Don't modify omt while running. (E.g., f may not insert or delete values from omt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in omt.
// Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.
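// Illustrative sketch (not part of the original header): a callback with the
// signature expected by toku_omt_iterate / toku_omt_iterate_on_range. It assumes
// the stored OMTVALUEs point to ints; the names below are hypothetical.
struct example_count_extra {
    int needle;
    int count;
};
static inline int example_count_matches(OMTVALUE value, uint32_t /*idx*/, void *extra) {
    struct example_count_extra *e = (struct example_count_extra *) extra;
    if (*(int *) value == e->needle) {
        e->count++;
    }
    return 0; // returning 0 continues the iteration; a nonzero return stops it and is passed back to the caller
}
// Usage (sketch): struct example_count_extra e = { 42, 0 };
//                 int r = toku_omt_iterate(omt, example_count_matches, &e);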
int toku_omt_insert_at(OMT omt, OMTVALUE value, uint32_t idx);
// Effect: Increases indexes of all items at slot >= index by 1.
// Insert value into the position at index.
//
// Returns:
// 0 success
// EINVAL if index>toku_omt_size(omt)
// ENOMEM
// On error, omt is unchanged.
// Performance: time=O(\log N) amortized time.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
int toku_omt_set_at (OMT omt, OMTVALUE value, uint32_t idx);
// Effect: Replaces the item at index with value.
// Returns:
// 0 success
// EINVAL if index>=toku_omt_size(omt)
// On error, omt is unchanged.
// Performance: time=O(\log N)
// Rationale: The BRT needs to be able to replace a value with another copy of the same value (allocated in a different location)
int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v, uint32_t *idx);
// Effect: Insert value into the OMT.
// If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
// Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
// If no such i exists, then let i be |V|
// Then this has the same effect as
// omt_insert_at(tree, value, i);
// If index!=NULL then i is stored in *index
// Requires: The signum of h must be monotonically increasing.
// Returns:
// 0 success
// DB_KEYEXIST the key is present (h was equal to zero for some value)
// ENOMEM
// On nonzero return, omt is unchanged.
// On nonzero non-DB_KEYEXIST return, *index is unchanged.
// Performance: time=O(\log N) amortized.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
int toku_omt_delete_at(OMT omt, uint32_t idx);
// Effect: Delete the item in slot index.
// Decreases indexes of all items at slot >= index by 1.
// Returns
// 0 success
// EINVAL if index>=toku_omt_size(omt)
// On error, omt is unchanged.
// Rationale: To delete an item, first find its index using toku_omt_find, then delete it.
// Performance: time=O(\log N) amortized.
int toku_omt_fetch (OMT V, uint32_t i, OMTVALUE *v);
// Effect: Set *v=V_i
// If c!=NULL then set c's abstract offset to i.
// Requires: v != NULL
// Returns
// 0 success
// EINVAL if index>=toku_omt_size(omt)
// On nonzero return, *v is unchanged, and c (if nonnull) is either
// invalidated or unchanged.
// Performance: time=O(\log N)
// Implementation Notes: It is possible that c was previously valid and was
// associated with a different OMT. If c is changed by this
// function, the function must remove c's association with the old
// OMT, and associate it with the new OMT.
int toku_omt_find_zero(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, uint32_t *idx);
// Effect: Find the smallest i such that h(V_i, extra)>=0
// If there is such an i and h(V_i,extra)==0 then set *index=i and return 0.
// If there is such an i and h(V_i,extra)>0 then set *index=i and return DB_NOTFOUND.
// If there is no such i then set *index=toku_omt_size(V) and return DB_NOTFOUND.
// Requires: index!=NULL
int toku_omt_find(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, uint32_t *idx);
// Effect:
// If direction >0 then find the smallest i such that h(V_i,extra)>0.
// If direction <0 then find the largest i such that h(V_i,extra)<0.
// (Direction may not be equal to zero.)
// If value!=NULL then store V_i in *value
// If index!=NULL then store i in *index.
// Requires: The signum of h is monotonically increasing.
// Returns
// 0 success
// DB_NOTFOUND no such value is found.
// On nonzero return, *value and *index are unchanged, and c (if nonnull) is either invalidated
// or unchanged.
// Performance: time=O(\log N)
// Rationale:
// Here's how to use the find function to find various things
// Cases for find:
// find first value: ( h(v)=+1, direction=+1 )
// find last value ( h(v)=-1, direction=-1 )
// find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 )
// find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 )
// find X or successor to X ( same as find first X. )
//
// Rationale: To help understand heaviside functions and the behavior of find:
// There are 7 kinds of heaviside functions.
// The signum of h must be monotonically increasing.
// Given a function of the following form, A is the element
// returned for direction>0, B is the element returned
// for direction<0, C is the element returned for
// direction==0 (see find_zero) (with a return of 0), and D is the element
// returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
// If any of A, B, or C are not found, then asking for the
// associated direction will return DB_NOTFOUND.
// See find_zero for more information.
//
// Let the following represent the signum of the heaviside function.
//
// -...-
//     A
//      D
//
// +...+
// B
// D
//
// 0...0
// C
//
// -...-0...0
//     AC
//
// 0...0+...+
// C    B
//
// -...-+...+
//     AB
//      D
//
// -...-0...0+...+
//     AC   B
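// Illustrative sketch (not part of the original header): a heaviside function
// for the "find first X" case listed above, i.e. find the first value >= x.
// It assumes the stored OMTVALUEs point to ints and that extra points to the
// target int; the names are hypothetical.
static inline int example_ge_heaviside(OMTVALUE value, void *extra) {
    int v = *(int *) value;
    int x = *(int *) extra;
    return (v < x) ? -1 : +1; // signum is monotonically increasing over the sorted values
}
// Usage (sketch):
//   OMTVALUE result; uint32_t idx;
//   int r = toku_omt_find(omt, example_ge_heaviside, &x, +1, &result, &idx);
//   r == 0      => result/idx refer to the first value >= x
//   DB_NOTFOUND => every value is < x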
int toku_omt_split_at(OMT omt, OMT *newomt, uint32_t idx);
// Effect: Create a new OMT, storing it in *newomt.
// The values to the right of index (starting at index) are moved to *newomt.
// Requires: omt != NULL
// Requires: newomt != NULL
// Returns
// 0 success,
// EINVAL if index > toku_omt_size(omt)
// ENOMEM
// On nonzero return, omt and *newomt are unmodified.
// Performance: time=O(n)
// Rationale: We don't need a split-evenly operation. We need to split items so that their total sizes
// are even, and other similar splitting criteria. It's easy to split evenly by calling toku_omt_size(), and dividing by two.
int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomt);
// Effect: Appends leftomt and rightomt to produce a new omt.
// Sets *newomt to the new omt.
// On success, leftomt and rightomt are destroyed.
// Returns 0 on success
// ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.
int toku_omt_clone_noptr(OMT *dest, OMT src);
// Effect: Creates a copy of an omt.
// Sets *dest to the clone
// Each element is assumed to be stored directly in the omt, that is, the OMTVALUEs are not pointers, they are data. Thus no extra memory allocation is required.
// Returns 0 on success
// ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time between O(n) and O(n log n), depending on how long it
// takes to traverse src.
void toku_omt_clear(OMT omt);
// Effect: Set the tree to be empty.
// Note: Will not reallocate or resize any memory, since returning void precludes calling malloc.
// Performance: time=O(1)
size_t toku_omt_memory_size (OMT omt);
// Effect: Return the size (in bytes) of the omt, as it resides in main memory. Don't include any of the OMTVALUES.
#endif /* #ifndef TOKU_OMT_H */

View File

@ -0,0 +1,491 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <string>
#include "portability/memory.h"
#include "ft/node.h"
#include "ft/serialize/rbuf.h"
#include "ft/serialize/wbuf.h"
void ftnode_pivot_keys::create_empty() {
_num_pivots = 0;
_total_size = 0;
_fixed_keys = nullptr;
_fixed_keylen = 0;
_fixed_keylen_aligned = 0;
_dbt_keys = nullptr;
}
void ftnode_pivot_keys::create_from_dbts(const DBT *keys, int n) {
create_empty();
_num_pivots = n;
// see if every key has the same length
bool keys_same_size = true;
for (int i = 1; i < _num_pivots; i++) {
if (keys[i].size != keys[i - 1].size) {
keys_same_size = false;
break;
}
}
if (keys_same_size && _num_pivots > 0) {
// if so, store pivots in a tightly packed array of fixed length keys
_fixed_keylen = keys[0].size;
_fixed_keylen_aligned = _align4(_fixed_keylen);
_total_size = _fixed_keylen_aligned * _num_pivots;
XMALLOC_N_ALIGNED(64, _total_size, _fixed_keys);
for (int i = 0; i < _num_pivots; i++) {
invariant(keys[i].size == _fixed_keylen);
memcpy(_fixed_key(i), keys[i].data, _fixed_keylen);
}
} else {
// otherwise we'll just store the pivots in an array of dbts
XMALLOC_N_ALIGNED(64, _num_pivots, _dbt_keys);
for (int i = 0; i < _num_pivots; i++) {
size_t size = keys[i].size;
toku_memdup_dbt(&_dbt_keys[i], keys[i].data, size);
_total_size += size;
}
}
sanity_check();
}
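// Illustrative note (not part of the original source): assuming _align4() rounds
// its argument up to the next multiple of 4, ten pivots of 10-byte keys end up in
// fixed format as one packed allocation of 10 * 12 = 120 bytes (_fixed_keylen = 10,
// _fixed_keylen_aligned = 12, _total_size = 120), whereas pivots of varying sizes
// fall back to ten separately allocated DBTs and _total_size is the sum of the
// unaligned key sizes.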
void ftnode_pivot_keys::_create_from_fixed_keys(const char *fixedkeys, size_t fixed_keylen, int n) {
create_empty();
_num_pivots = n;
_fixed_keylen = fixed_keylen;
_fixed_keylen_aligned = _align4(fixed_keylen);
_total_size = _fixed_keylen_aligned * _num_pivots;
XMEMDUP_N(_fixed_keys, fixedkeys, _total_size);
}
// effect: create pivot keys as a clone of an existing set of pivotkeys
void ftnode_pivot_keys::create_from_pivot_keys(const ftnode_pivot_keys &pivotkeys) {
if (pivotkeys._fixed_format()) {
_create_from_fixed_keys(pivotkeys._fixed_keys, pivotkeys._fixed_keylen, pivotkeys._num_pivots);
} else {
create_from_dbts(pivotkeys._dbt_keys, pivotkeys._num_pivots);
}
sanity_check();
}
void ftnode_pivot_keys::destroy() {
if (_dbt_keys != nullptr) {
for (int i = 0; i < _num_pivots; i++) {
toku_destroy_dbt(&_dbt_keys[i]);
}
toku_free(_dbt_keys);
_dbt_keys = nullptr;
}
if (_fixed_keys != nullptr) {
toku_free(_fixed_keys);
_fixed_keys = nullptr;
}
_fixed_keylen = 0;
_fixed_keylen_aligned = 0;
_num_pivots = 0;
_total_size = 0;
}
void ftnode_pivot_keys::_convert_to_fixed_format() {
invariant(!_fixed_format());
// convert to a tightly packed array of fixed length keys
_fixed_keylen = _dbt_keys[0].size;
_fixed_keylen_aligned = _align4(_fixed_keylen);
_total_size = _fixed_keylen_aligned * _num_pivots;
XMALLOC_N_ALIGNED(64, _total_size, _fixed_keys);
for (int i = 0; i < _num_pivots; i++) {
invariant(_dbt_keys[i].size == _fixed_keylen);
memcpy(_fixed_key(i), _dbt_keys[i].data, _fixed_keylen);
}
// destroy the dbt array format
for (int i = 0; i < _num_pivots; i++) {
toku_destroy_dbt(&_dbt_keys[i]);
}
toku_free(_dbt_keys);
_dbt_keys = nullptr;
invariant(_fixed_format());
sanity_check();
}
void ftnode_pivot_keys::_convert_to_dbt_format() {
invariant(_fixed_format());
// convert to an array of dbts
REALLOC_N_ALIGNED(64, _num_pivots, _dbt_keys);
for (int i = 0; i < _num_pivots; i++) {
toku_memdup_dbt(&_dbt_keys[i], _fixed_key(i), _fixed_keylen);
}
// pivot sizes are not rounded up to alignment in dbt format
_total_size = _num_pivots * _fixed_keylen;
// destroy the fixed key format
toku_free(_fixed_keys);
_fixed_keys = nullptr;
_fixed_keylen = 0;
_fixed_keylen_aligned = 0;
invariant(!_fixed_format());
sanity_check();
}
void ftnode_pivot_keys::deserialize_from_rbuf(struct rbuf *rb, int n) {
_num_pivots = n;
_total_size = 0;
_fixed_keys = nullptr;
_fixed_keylen = 0;
_dbt_keys = nullptr;
XMALLOC_N_ALIGNED(64, _num_pivots, _dbt_keys);
bool keys_same_size = true;
for (int i = 0; i < _num_pivots; i++) {
const void *pivotkeyptr;
uint32_t size;
rbuf_bytes(rb, &pivotkeyptr, &size);
toku_memdup_dbt(&_dbt_keys[i], pivotkeyptr, size);
_total_size += size;
if (i > 0 && keys_same_size && _dbt_keys[i].size != _dbt_keys[i - 1].size) {
// not all keys are the same size, we'll stick to the dbt array format
keys_same_size = false;
}
}
if (keys_same_size && _num_pivots > 0) {
_convert_to_fixed_format();
}
sanity_check();
}
DBT ftnode_pivot_keys::get_pivot(int i) const {
paranoid_invariant(i < _num_pivots);
if (_fixed_format()) {
paranoid_invariant(i * _fixed_keylen_aligned < _total_size);
DBT dbt;
toku_fill_dbt(&dbt, _fixed_key(i), _fixed_keylen);
return dbt;
} else {
return _dbt_keys[i];
}
}
DBT *ftnode_pivot_keys::fill_pivot(int i, DBT *dbt) const {
paranoid_invariant(i < _num_pivots);
if (_fixed_format()) {
toku_fill_dbt(dbt, _fixed_key(i), _fixed_keylen);
} else {
toku_copyref_dbt(dbt, _dbt_keys[i]);
}
return dbt;
}
void ftnode_pivot_keys::_add_key_dbt(const DBT *key, int i) {
toku_clone_dbt(&_dbt_keys[i], *key);
_total_size += _dbt_keys[i].size;
}
void ftnode_pivot_keys::_destroy_key_dbt(int i) {
invariant(_total_size >= _dbt_keys[i].size);
_total_size -= _dbt_keys[i].size;
toku_destroy_dbt(&_dbt_keys[i]);
}
void ftnode_pivot_keys::_insert_at_dbt(const DBT *key, int i) {
// make space for a new pivot, slide existing keys to the right
REALLOC_N_ALIGNED(64, _num_pivots + 1, _dbt_keys);
memmove(&_dbt_keys[i + 1], &_dbt_keys[i], (_num_pivots - i) * sizeof(DBT));
_add_key_dbt(key, i);
}
void ftnode_pivot_keys::_insert_at_fixed(const DBT *key, int i) {
REALLOC_N_ALIGNED(64, (_num_pivots + 1) * _fixed_keylen_aligned, _fixed_keys);
// TODO: This is not going to be valgrind-safe, because we do not initialize the space
// between _fixed_keylen and _fixed_keylen_aligned (but we probably should)
memmove(_fixed_key(i + 1), _fixed_key(i), (_num_pivots - i) * _fixed_keylen_aligned);
memcpy(_fixed_key(i), key->data, _fixed_keylen);
_total_size += _fixed_keylen_aligned;
}
void ftnode_pivot_keys::insert_at(const DBT *key, int i) {
invariant(i <= _num_pivots); // it's ok to insert at the end, so we check <= n
// if the new key doesn't have the same size, we can't be in fixed format
if (_fixed_format() && key->size != _fixed_keylen) {
_convert_to_dbt_format();
}
if (_fixed_format()) {
_insert_at_fixed(key, i);
} else {
_insert_at_dbt(key, i);
}
_num_pivots++;
invariant(total_size() > 0);
}
void ftnode_pivot_keys::_append_dbt(const ftnode_pivot_keys &pivotkeys) {
REALLOC_N_ALIGNED(64, _num_pivots + pivotkeys._num_pivots, _dbt_keys);
bool other_fixed = pivotkeys._fixed_format();
for (int i = 0; i < pivotkeys._num_pivots; i++) {
size_t size = other_fixed ? pivotkeys._fixed_keylen :
pivotkeys._dbt_keys[i].size;
toku_memdup_dbt(&_dbt_keys[_num_pivots + i],
other_fixed ? pivotkeys._fixed_key(i) :
pivotkeys._dbt_keys[i].data,
size);
_total_size += size;
}
}
void ftnode_pivot_keys::_append_fixed(const ftnode_pivot_keys &pivotkeys) {
if (pivotkeys._fixed_format() && pivotkeys._fixed_keylen == _fixed_keylen) {
// other pivotkeys have the same fixed keylen
REALLOC_N_ALIGNED(64, (_num_pivots + pivotkeys._num_pivots) * _fixed_keylen_aligned, _fixed_keys);
memcpy(_fixed_key(_num_pivots), pivotkeys._fixed_keys, pivotkeys._total_size);
_total_size += pivotkeys._total_size;
} else {
// must convert to dbt format, the other pivotkeys have keys of a different length
_convert_to_dbt_format();
_append_dbt(pivotkeys);
}
}
void ftnode_pivot_keys::append(const ftnode_pivot_keys &pivotkeys) {
if (_fixed_format()) {
_append_fixed(pivotkeys);
} else {
_append_dbt(pivotkeys);
}
_num_pivots += pivotkeys._num_pivots;
sanity_check();
}
void ftnode_pivot_keys::_replace_at_dbt(const DBT *key, int i) {
_destroy_key_dbt(i);
_add_key_dbt(key, i);
}
void ftnode_pivot_keys::_replace_at_fixed(const DBT *key, int i) {
if (key->size == _fixed_keylen) {
memcpy(_fixed_key(i), key->data, _fixed_keylen);
} else {
// must convert to dbt format, replacement key has different length
_convert_to_dbt_format();
_replace_at_dbt(key, i);
}
}
void ftnode_pivot_keys::replace_at(const DBT *key, int i) {
if (i < _num_pivots) {
if (_fixed_format()) {
_replace_at_fixed(key, i);
} else {
_replace_at_dbt(key, i);
}
} else {
invariant(i == _num_pivots); // appending to the end is ok
insert_at(key, i);
}
invariant(total_size() > 0);
}
void ftnode_pivot_keys::_delete_at_fixed(int i) {
memmove(_fixed_key(i), _fixed_key(i + 1), (_num_pivots - 1 - i) * _fixed_keylen_aligned);
_total_size -= _fixed_keylen_aligned;
}
void ftnode_pivot_keys::_delete_at_dbt(int i) {
// slide over existing keys, then shrink down to size
_destroy_key_dbt(i);
memmove(&_dbt_keys[i], &_dbt_keys[i + 1], (_num_pivots - 1 - i) * sizeof(DBT));
REALLOC_N_ALIGNED(64, _num_pivots - 1, _dbt_keys);
}
void ftnode_pivot_keys::delete_at(int i) {
invariant(i < _num_pivots);
if (_fixed_format()) {
_delete_at_fixed(i);
} else {
_delete_at_dbt(i);
}
_num_pivots--;
}
void ftnode_pivot_keys::_split_at_fixed(int i, ftnode_pivot_keys *other) {
// recreate the other set of pivots from index >= i
other->_create_from_fixed_keys(_fixed_key(i), _fixed_keylen, _num_pivots - i);
// shrink down to size
_total_size = i * _fixed_keylen_aligned;
REALLOC_N_ALIGNED(64, _total_size, _fixed_keys);
}
void ftnode_pivot_keys::_split_at_dbt(int i, ftnode_pivot_keys *other) {
// recreate the other set of pivots from index >= i
other->create_from_dbts(&_dbt_keys[i], _num_pivots - i);
// destroy everything greater, shrink down to size
for (int k = i; k < _num_pivots; k++) {
_destroy_key_dbt(k);
}
REALLOC_N_ALIGNED(64, i, _dbt_keys);
}
void ftnode_pivot_keys::split_at(int i, ftnode_pivot_keys *other) {
if (i < _num_pivots) {
if (_fixed_format()) {
_split_at_fixed(i, other);
} else {
_split_at_dbt(i, other);
}
_num_pivots = i;
}
sanity_check();
}
void ftnode_pivot_keys::serialize_to_wbuf(struct wbuf *wb) const {
bool fixed = _fixed_format();
size_t written = 0;
for (int i = 0; i < _num_pivots; i++) {
size_t size = fixed ? _fixed_keylen : _dbt_keys[i].size;
invariant(size);
wbuf_nocrc_bytes(wb, fixed ? _fixed_key(i) : _dbt_keys[i].data, size);
written += size;
}
invariant(written == serialized_size());
}
int ftnode_pivot_keys::num_pivots() const {
// if we have fixed size keys, the number of pivots should be consistent
paranoid_invariant(_fixed_keys == nullptr || (_total_size == _fixed_keylen_aligned * _num_pivots));
return _num_pivots;
}
size_t ftnode_pivot_keys::total_size() const {
// if we have fixed size keys, the total size should be consistent
paranoid_invariant(_fixed_keys == nullptr || (_total_size == _fixed_keylen_aligned * _num_pivots));
return _total_size;
}
size_t ftnode_pivot_keys::serialized_size() const {
// we only return the size that will be used when serialized, so we calculate based
// on the fixed keylen and not the aligned keylen.
return _fixed_format() ? _num_pivots * _fixed_keylen : _total_size;
}
void ftnode_pivot_keys::sanity_check() const {
if (_fixed_format()) {
invariant(_dbt_keys == nullptr);
invariant(_fixed_keylen_aligned == _align4(_fixed_keylen));
invariant(_num_pivots * _fixed_keylen <= _total_size);
invariant(_num_pivots * _fixed_keylen_aligned == _total_size);
} else {
invariant(_num_pivots == 0 || _dbt_keys != nullptr);
size_t size = 0;
for (int i = 0; i < _num_pivots; i++) {
size += _dbt_keys[i].size;
}
invariant(size == _total_size);
}
}
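// Illustrative usage sketch (not part of the original source): building a
// pivotkeys object from two DBTs and reading one back. Error handling is
// omitted and the variable names are hypothetical.
//
//   DBT keys[2] = { key0, key1 };          // key0/key1 previously filled in
//   ftnode_pivot_keys pk;
//   pk.create_from_dbts(keys, 2);          // copies the key bytes
//   invariant(pk.num_pivots() == 2);
//   DBT first = pk.get_pivot(0);           // refers to pk's internal storage
//   invariant(first.size == keys[0].size);
//   pk.destroy();                          // frees pk's copies; keys[] untouched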

View File

@ -0,0 +1,513 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2009-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#ident "$Id$"
#include <algorithm>
#include <string.h>
#include "portability/memory.h"
#include "portability/toku_assert.h"
#include "portability/toku_stdint.h"
#include "portability/toku_stdlib.h"
#include "ft/serialize/block_allocator.h"
#include "ft/serialize/block_allocator_strategy.h"
#if TOKU_DEBUG_PARANOID
#define VALIDATE() validate()
#else
#define VALIDATE()
#endif
static FILE *ba_trace_file = nullptr;
void block_allocator::maybe_initialize_trace(void) {
const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");
if (ba_trace_path != nullptr) {
ba_trace_file = toku_os_fopen(ba_trace_path, "w");
if (ba_trace_file == nullptr) {
fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
"but it could not be opened for writing (errno %d)\n",
ba_trace_path, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
}
}
}
void block_allocator::maybe_close_trace() {
if (ba_trace_file != nullptr) {
int r = toku_os_fclose(ba_trace_file);
if (r != 0) {
fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
r, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
}
}
}
void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with direct I/O
assert(alignment >= 512 && (alignment % 512) == 0);
_reserve_at_beginning = reserve_at_beginning;
_alignment = alignment;
_n_blocks = 0;
_blocks_array_size = 1;
XMALLOC_N(_blocks_array_size, _blocks_array);
_n_bytes_in_use = reserve_at_beginning;
_strategy = BA_STRATEGY_FIRST_FIT;
memset(&_trace_lock, 0, sizeof(toku_mutex_t));
toku_mutex_init(&_trace_lock, nullptr);
VALIDATE();
}
void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
_create_internal(reserve_at_beginning, alignment);
_trace_create();
}
void block_allocator::destroy() {
toku_free(_blocks_array);
_trace_destroy();
toku_mutex_destroy(&_trace_lock);
}
void block_allocator::set_strategy(enum allocation_strategy strategy) {
_strategy = strategy;
}
void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
if (_n_blocks + n_to_add > _blocks_array_size) {
uint64_t new_size = _n_blocks + n_to_add;
uint64_t at_least = _blocks_array_size * 2;
if (at_least > new_size) {
new_size = at_least;
}
_blocks_array_size = new_size;
XREALLOC_N(_blocks_array_size, _blocks_array);
}
}
void block_allocator::grow_blocks_array() {
grow_blocks_array_by(1);
}
void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks) {
_create_internal(reserve_at_beginning, alignment);
_n_blocks = n_blocks;
grow_blocks_array_by(_n_blocks);
memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
std::sort(_blocks_array, _blocks_array + _n_blocks);
for (uint64_t i = 0; i < _n_blocks; i++) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(_blocks_array[i].size > 0);
invariant(_blocks_array[i].offset >= _reserve_at_beginning);
invariant(_blocks_array[i].offset % _alignment == 0);
_n_bytes_in_use += _blocks_array[i].size;
}
VALIDATE();
_trace_create_from_blockpairs();
}
// Effect: align a value by rounding up.
static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
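// Illustrative examples (not in the original source): align(700, 512) == 1024,
// align(512, 512) == 512, and align(0, 512) == 0.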
struct block_allocator::blockpair *
block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
switch (_strategy) {
case BA_STRATEGY_FIRST_FIT:
return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_BEST_FIT:
return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_HEAT_ZONE:
return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
case BA_STRATEGY_PADDED_FIT:
return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
default:
abort();
}
}
// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
struct blockpair *bp;
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(size > 0);
grow_blocks_array();
_n_bytes_in_use += size;
uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
if (_n_blocks == 0) {
// First and only block
assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
_blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
_blocks_array[0].size = size;
*offset = _blocks_array[0].offset;
goto done;
} else if (end_of_reserve + size <= _blocks_array[0].offset ) {
// Check to see if the space immediately after the reserve is big enough to hold the new block.
bp = &_blocks_array[0];
memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
bp[0].offset = end_of_reserve;
bp[0].size = size;
*offset = end_of_reserve;
goto done;
}
bp = choose_block_to_alloc_after(size, heat);
if (bp != nullptr) {
// our allocation strategy chose the space after `bp' to fit the new block
uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
uint64_t blocknum = bp - _blocks_array;
invariant(&_blocks_array[blocknum] == bp);
invariant(blocknum < _n_blocks);
memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
bp[1].offset = answer_offset;
bp[1].size = size;
*offset = answer_offset;
} else {
// It didn't fit anywhere, so fit it on the end.
assert(_n_blocks < _blocks_array_size);
bp = &_blocks_array[_n_blocks];
uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
bp->offset = answer_offset;
bp->size = size;
*offset = answer_offset;
}
done:
_n_blocks++;
VALIDATE();
_trace_alloc(size, heat, *offset);
}
// Find the index in the blocks array that has a particular offset. Requires that the block exist.
// Use binary search so it runs fast.
int64_t block_allocator::find_block(uint64_t offset) {
VALIDATE();
if (_n_blocks == 1) {
assert(_blocks_array[0].offset == offset);
return 0;
}
uint64_t lo = 0;
uint64_t hi = _n_blocks;
while (1) {
assert(lo < hi); // otherwise no such block exists.
uint64_t mid = (lo + hi) / 2;
uint64_t thisoff = _blocks_array[mid].offset;
if (thisoff < offset) {
lo = mid + 1;
} else if (thisoff > offset) {
hi = mid;
} else {
return mid;
}
}
}
// To support 0-sized blocks, we need to include size as an input to this function.
// All 0-sized blocks at the same offset can be considered identical, but
// a 0-sized block can share offset with a non-zero sized block.
// The non-zero sized block is not exchangeable with a zero sized block (or vice versa),
// so inserting 0-sized blocks can cause corruption here.
void block_allocator::free_block(uint64_t offset) {
VALIDATE();
int64_t bn = find_block(offset);
assert(bn >= 0); // we require that there is a block with that offset.
_n_bytes_in_use -= _blocks_array[bn].size;
memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
(_n_blocks - bn - 1) * sizeof(struct blockpair));
_n_blocks--;
VALIDATE();
_trace_free(offset);
}
uint64_t block_allocator::block_size(uint64_t offset) {
int64_t bn = find_block(offset);
assert(bn >=0); // we require that there is a block with that offset.
return _blocks_array[bn].size;
}
uint64_t block_allocator::allocated_limit() const {
if (_n_blocks == 0) {
return _reserve_at_beginning;
} else {
struct blockpair *last = &_blocks_array[_n_blocks - 1];
return last->offset + last->size;
}
}
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if such a block exists, return nonzero if b is too big.
int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
if (b == 0) {
*offset = 0;
*size = _reserve_at_beginning;
return 0;
} else if (b > _n_blocks) {
return -1;
} else {
*offset = _blocks_array[b - 1].offset;
*size = _blocks_array[b - 1].size;
return 0;
}
}
// Requires: report->file_size_bytes is filled in
// Requires: report->data_bytes is filled in
// Requires: report->checkpoint_bytes_additional is filled in
void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
report->unused_bytes = 0;
report->unused_blocks = 0;
report->largest_unused_block = 0;
if (_n_blocks > 0) {
//Deal with space before block 0 and after reserve:
{
struct blockpair *bp = &_blocks_array[0];
assert(bp->offset >= align(_reserve_at_beginning, _alignment));
uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space between blocks:
for (uint64_t blocknum = 0; blocknum + 1 < _n_blocks; blocknum++) {
// Consider the space after blocknum
struct blockpair *bp = &_blocks_array[blocknum];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
uint64_t next_offset = bp[1].offset;
uint64_t free_space = next_offset - end_of_this_block;
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space after last block
{
struct blockpair *bp = &_blocks_array[_n_blocks-1];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
} else {
// No blocks. Just the reserve.
uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
}
void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
report->data_bytes = _n_bytes_in_use;
report->data_blocks = _n_blocks;
report->file_size_bytes = 0;
report->checkpoint_bytes_additional = 0;
get_unused_statistics(report);
}
void block_allocator::validate() const {
uint64_t n_bytes_in_use = _reserve_at_beginning;
for (uint64_t i = 0; i < _n_blocks; i++) {
n_bytes_in_use += _blocks_array[i].size;
if (i > 0) {
assert(_blocks_array[i].offset > _blocks_array[i - 1].offset);
assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size );
}
}
assert(n_bytes_in_use == _n_bytes_in_use);
}
// Tracing
void block_allocator::_trace_create(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
this, _reserve_at_beginning, _alignment);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_create_from_blockpairs(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
this, _reserve_at_beginning, _alignment);
for (uint64_t i = 0; i < _n_blocks; i++) {
fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
_blocks_array[i].offset, _blocks_array[i].size);
}
fprintf(ba_trace_file, "\n");
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_destroy(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
this, size, heat, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_free(uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}

View File

@ -0,0 +1,267 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#pragma once
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <db.h>
#include "portability/toku_pthread.h"
#include "portability/toku_stdint.h"
// Block allocator.
//
// A block allocator manages the allocation of variable-sized blocks.
// The translation of block numbers to addresses is handled elsewhere.
// The allocation of block numbers is handled elsewhere.
//
// When creating a block allocator we also specify a certain-sized
// block at the beginning that is preallocated (and cannot be allocated or freed)
//
// We can allocate blocks of a particular size at a particular location.
// We can allocate blocks of a particular size at a location chosen by the allocator.
// We can free blocks.
// We can determine the size of a block.
class block_allocator {
public:
static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096;
// How much must be reserved at the beginning for the block?
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
// So 4096 should be enough.
static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096;
static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0,
"block allocator header must have proper alignment");
static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
enum allocation_strategy {
BA_STRATEGY_FIRST_FIT = 1,
BA_STRATEGY_BEST_FIT,
BA_STRATEGY_PADDED_FIT,
BA_STRATEGY_HEAT_ZONE
};
struct blockpair {
uint64_t offset;
uint64_t size;
blockpair(uint64_t o, uint64_t s) :
offset(o), size(s) {
}
int operator<(const struct blockpair &rhs) const {
return offset < rhs.offset;
}
int operator<(const uint64_t &o) const {
return offset < o;
}
};
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// All blocks will start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// alignment (IN) Block alignment.
void create(uint64_t reserve_at_beginning, uint64_t alignment);
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs'
// All blocks will start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// pairs, unowned array of pairs to copy
// n_blocks, Size of pairs array
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// alignment (IN) Block alignment.
void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks);
// Effect: Destroy this block allocator
void destroy();
// Effect: Set the allocation strategy that the allocator should use
// Requires: No other threads are operating on this block allocator
void set_strategy(enum allocation_strategy strategy);
// Effect: Allocate a block of the specified size at an address chosen by the allocator.
// Aborts if anything goes wrong.
// The block address will be a multiple of the alignment.
// Parameters:
// size (IN): The size of the block. (The size does not have to be aligned.)
// offset (OUT): The location of the block.
// heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint)
// Heat values are lexicographically ordered (like integers), but their specific values are arbitrary
void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset);
// Effect: Free the block at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
void free_block(uint64_t offset);
// Effect: Return the size of the block that starts at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
uint64_t block_size(uint64_t offset);
// Effect: Check to see if the block allocator is OK. This may take a long time.
// Usage Hints: Probably only use this for unit tests.
// TODO: Private?
void validate() const;
// Effect: Return the unallocated block address of "infinite" size.
// That is, return the smallest address that is above all the allocated blocks.
uint64_t allocated_limit() const;
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number, return nonzero if b is too big.
// Rationale: This is probably useful only for tests.
int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size);
// Effect: Fill in report to indicate how the file is used.
// Requires:
// report->file_size_bytes is filled in
// report->data_bytes is filled in
// report->checkpoint_bytes_additional is filled in
void get_unused_statistics(TOKU_DB_FRAGMENTATION report);
// Effect: Fill in report->data_bytes with the number of bytes in use
// Fill in report->data_blocks with the number of blockpairs in use
// Fill in unused statistics using this->get_unused_statistics()
// Requires:
// report->file_size is ignored on return
// report->checkpoint_bytes_additional is ignored on return
void get_statistics(TOKU_DB_FRAGMENTATION report);
// Block allocator tracing.
// - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file
// should be written to.
// - Trace may be replayed by ba_trace_replay tool in tools/ directory
// eg: "cat mytracefile | ba_trace_replay"
static void maybe_initialize_trace();
static void maybe_close_trace();
private:
void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment);
void grow_blocks_array_by(uint64_t n_to_add);
void grow_blocks_array();
int64_t find_block(uint64_t offset);
struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat);
// Tracing
toku_mutex_t _trace_lock;
void _trace_create(void);
void _trace_create_from_blockpairs(void);
void _trace_destroy(void);
void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset);
void _trace_free(uint64_t offset);
// How much to reserve at the beginning
uint64_t _reserve_at_beginning;
// Block alignment
uint64_t _alignment;
// How many blocks
uint64_t _n_blocks;
// How big is the blocks_array. Must be >= n_blocks.
uint64_t _blocks_array_size;
// These blocks are sorted by address.
struct blockpair *_blocks_array;
// Including the reserve_at_beginning
uint64_t _n_bytes_in_use;
// The allocation strategy we are using
enum allocation_strategy _strategy;
};
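// Illustrative usage sketch (not part of the original header, and nothing in
// TokuFT calls it): shows how the declarations above fit together. Reserve the
// standard header area, allocate two blocks, query one, free it, and tear the
// allocator down. The function name is made up for this example.
static inline void block_allocator_usage_example(void) {
    block_allocator ba;
    ba.create(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
              block_allocator::BLOCK_ALLOCATOR_ALIGNMENT);
    ba.set_strategy(block_allocator::BA_STRATEGY_FIRST_FIT);

    uint64_t offset_a, offset_b;
    ba.alloc_block(1024, 0, &offset_a);  // cold 1KB block, allocator picks the offset
    ba.alloc_block(4096, 1, &offset_b);  // warmer 4KB block (heat > 0)
    // block_size(offset_a) now reports 1024; both offsets are multiples of the alignment.

    ba.free_block(offset_a);
    ba.destroy();
}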


@ -0,0 +1,274 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#include <algorithm>
#include <string.h>
#include "portability/toku_assert.h"
#include "ft/serialize/block_allocator_strategy.h"
static uint64_t _align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
static uint64_t _roundup_to_power_of_two(uint64_t value) {
uint64_t r = 4096;
while (r < value) {
r *= 2;
invariant(r > 0);
}
return r;
}
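// Illustrative sketch (not part of the original file): a few worked values for
// the helpers above. _align() rounds up to the next multiple of the alignment,
// and _roundup_to_power_of_two() rounds up to a power of two, never below 4096.
__attribute__((__unused__))
static void _alignment_helpers_example(void) {
    invariant(_align(1, 4096) == 4096);                 // 1 rounds up to the 4096 boundary
    invariant(_align(4096, 4096) == 4096);              // already-aligned values are unchanged
    invariant(_align(4097, 512) == 4608);               // next multiple of 512 above 4097
    invariant(_roundup_to_power_of_two(1) == 4096);     // never smaller than 4096
    invariant(_roundup_to_power_of_two(5000) == 8192);  // next power of two above 5000
}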
// First fit block allocation
static struct block_allocator::blockpair *
_first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[0];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp++) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1]
invariant(bp - blocks_array < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
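// Illustrative sketch (not part of the original file): with existing blocks at
// [0, 4096) and [16384, 20480), a first-fit request for 4096 bytes at 4096-byte
// alignment returns the first blockpair, meaning the caller would place the new
// block in the gap right after it, at offset 4096.
__attribute__((__unused__))
static void _first_fit_example(void) {
    struct block_allocator::blockpair blocks[2] = {
        block_allocator::blockpair(0, 4096),
        block_allocator::blockpair(16384, 4096),
    };
    struct block_allocator::blockpair *bp = _first_fit(blocks, 2, 4096, 4096, 0);
    invariant(bp == &blocks[0]);
    invariant(_align(bp->offset + bp->size, 4096) == 4096);  // the offset the caller would use
}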
static struct block_allocator::blockpair *
_first_fit_bw(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[-1];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp--) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) {
invariant(blocks_array - bp < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
struct block_allocator::blockpair *
block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
// Best fit block allocation
struct block_allocator::blockpair *
block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
struct block_allocator::blockpair *best_bp = nullptr;
uint64_t best_hole_size = 0;
for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) {
// Consider the space after blocknum
struct block_allocator::blockpair *bp = &blocks_array[blocknum];
uint64_t possible_offset = _align(bp->offset + bp->size, alignment);
uint64_t possible_end_offset = possible_offset + size;
if (possible_end_offset <= bp[1].offset) {
// It fits here. Is it the best fit?
uint64_t hole_size = bp[1].offset - possible_end_offset;
if (best_bp == nullptr || hole_size < best_hole_size) {
best_hole_size = hole_size;
best_bp = bp;
}
}
}
return best_bp;
}
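// Illustrative sketch (not part of the original file): with blocks at [0, 4096),
// [20480, 24576) and [28672, 32768), a 4096-byte request at 4096-byte alignment
// fits in both gaps. First fit takes the large gap after the first block; best
// fit prefers the gap after the second block, which it fills exactly.
__attribute__((__unused__))
static void _best_fit_example(void) {
    struct block_allocator::blockpair blocks[3] = {
        block_allocator::blockpair(0, 4096),
        block_allocator::blockpair(20480, 4096),
        block_allocator::blockpair(28672, 4096),
    };
    invariant(block_allocator_strategy::first_fit(blocks, 3, 4096, 4096) == &blocks[0]);
    invariant(block_allocator_strategy::best_fit(blocks, 3, 4096, 4096) == &blocks[1]);
}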
static uint64_t padded_fit_alignment = 4096;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_padded_fit_alignment_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT");
if (s != nullptr && strlen(s) > 0) {
const int64_t alignment = strtoll(s, nullptr, 10);
if (alignment <= 0) {
fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), "
"but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n",
s, padded_fit_alignment);
} else {
padded_fit_alignment = _roundup_to_power_of_two(alignment);
fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n",
padded_fit_alignment);
}
}
}
// First fit into a block that is oversized by up to max_padding.
// The hope is that if we purposefully waste a bit of space at allocation
// time we'll be more likely to reuse this block later.
struct block_allocator::blockpair *
block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment);
}
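// Illustrative sketch (not part of the original file), assuming the default
// padded fit alignment of 4096 (TOKU_BA_PADDED_FIT_ALIGNMENT unset): a 1000-byte
// request at 512-byte alignment lands at offset 4096 instead of 1024, leaving
// deliberate pad space after the existing 1000-byte block at offset 0.
__attribute__((__unused__))
static void _padded_fit_example(void) {
    struct block_allocator::blockpair blocks[2] = {
        block_allocator::blockpair(0, 1000),
        block_allocator::blockpair(65536, 4096),
    };
    struct block_allocator::blockpair *bp =
        block_allocator_strategy::padded_fit(blocks, 2, 1000, 512);
    invariant(bp == &blocks[0]);
    // The chosen offset is aligned to the padded alignment (4096), not just to 512.
    invariant(_align(bp->offset + bp->size, _align(padded_fit_alignment, 512)) == 4096);
}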
static double hot_zone_threshold = 0.85;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_hot_zone_threshold_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD");
if (s != nullptr && strlen(s) > 0) {
const double hot_zone = strtod(s, nullptr);
if (hot_zone < 1 || hot_zone > 99) {
fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), "
"but it's out of range (should be an integer 1 through 99). defaulting to 85\n", s);
hot_zone_threshold = 0.85;
} else {
fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s);
hot_zone_threshold = hot_zone / 100;
}
}
}
struct block_allocator::blockpair *
block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat) {
if (heat > 0) {
struct block_allocator::blockpair *bp, *boundary_bp;
// Hot allocation. Find the beginning of the hot zone.
boundary_bp = &blocks_array[n_blocks - 1];
uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment);
uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset);
boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset);
uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp;
uint64_t blocks_outside_zone = boundary_bp - blocks_array;
invariant(blocks_in_zone + blocks_outside_zone == n_blocks);
if (blocks_in_zone > 0) {
// Find the first fit in the hot zone, going forward.
bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0);
if (bp != nullptr) {
return bp;
}
}
if (blocks_outside_zone > 0) {
// Find the first fit in the cold zone, going backwards.
bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]);
if (bp != nullptr) {
return bp;
}
}
} else {
// Cold allocations are simply first-fit from the beginning.
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
return nullptr;
}
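// Illustrative sketch (not part of the original file), assuming the default hot
// zone threshold of 0.85 (TOKU_BA_HOT_ZONE_THRESHOLD unset): with blocks at
// [0, 4096), [90112, 94208) and [98304, 102400), a hot request (heat > 0) for
// 4096 bytes skips the large gap near the start of the file and goes into the
// hot zone near the end (the gap after the second block, at offset 94208), while
// the same request with heat == 0 takes the plain first-fit gap at offset 4096.
__attribute__((__unused__))
static void _heat_zone_example(void) {
    struct block_allocator::blockpair blocks[3] = {
        block_allocator::blockpair(0, 4096),
        block_allocator::blockpair(90112, 4096),
        block_allocator::blockpair(98304, 4096),
    };
    invariant(block_allocator_strategy::heat_zone(blocks, 3, 4096, 4096, 1) == &blocks[1]);
    invariant(block_allocator_strategy::heat_zone(blocks, 3, 4096, 4096, 0) == &blocks[0]);
}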


@ -1,8 +1,5 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ifndef _TDB_INTERNAL_H
#define _TDB_INTERNAL_H
/*
COPYING CONDITIONS NOTICE:
@ -32,8 +29,8 @@ COPYING CONDITIONS NOTICE:
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
TokuFT, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
@ -89,22 +86,30 @@ PATENT RIGHTS GRANT:
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#pragma once
#include "toku_list.h"
// Included by db.h; defines some internal structures. These structures are inlined in some versions of db.h,
// where the types DB_TXN and so forth have been defined.
#include <db.h>
//// This list structure is repeated here (from toku_list.h) so that the db.h file will be standalone. Any code that depends on this list matching the structure in toku_list.h
//// will get flagged by the compiler if someone changes one but not the other. See #2276.
//struct toku_list {
// struct toku_list *next, *prev;
//};
#include "ft/serialize/block_allocator.h"
struct simple_dbt {
uint32_t len;
void *data;
// Block allocation strategy implementations
class block_allocator_strategy {
public:
static struct block_allocator::blockpair *
first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
static struct block_allocator::blockpair *
best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
static struct block_allocator::blockpair *
padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
static struct block_allocator::blockpair *
heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat);
};
// end of _TDB_INTERNAL_H:
#endif
