1 // btree version threadskv10h futex version
2 // with reworked bt_deletekey code,
3 // phase-fair re-entrant reader writer lock,
4 // librarian page split code,
5 // duplicate key management
6 // bi-directional cursors
7 // ACID batched key-value updates
8 // LSM B-trees for write optimization
9 // larger sized leaf pages than non-leaf
10 // and LSM B-tree find & count operations
14 // author: karl malbrain, malbrain@cal.berkeley.edu
17 This work, including the source code, documentation
18 and related data, is placed into the public domain.
20 The orginal author is Karl Malbrain.
22 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
23 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
24 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
25 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
26 RESULTING FROM THE USE, MODIFICATION, OR
27 REDISTRIBUTION OF THIS SOFTWARE.
30 // Please see the project home page for documentation
31 // code.google.com/p/high-concurrency-btree
33 #define _FILE_OFFSET_BITS 64
34 #define _LARGEFILE64_SOURCE
38 #include <xmmintrin.h>
39 #include <linux/futex.h>
40 #include <sys/syscall.h>
54 #define WIN32_LEAN_AND_MEAN
68 typedef unsigned long long uid;
69 typedef unsigned long long logseqno;
72 typedef unsigned long long off64_t;
73 typedef unsigned short ushort;
74 typedef unsigned int uint;
77 #define BT_ro 0x6f72 // ro
78 #define BT_rw 0x7772 // rw
80 #define BT_maxbits 26 // maximum page size in bits
81 #define BT_minbits 9 // minimum page size in bits
82 #define BT_minpage (1 << BT_minbits) // minimum page size
83 #define BT_maxpage (1 << BT_maxbits) // maximum page size
85 // BTree page number constants
86 #define ALLOC_page 0 // allocation page
87 #define ROOT_page 1 // root of the btree
88 #define LATCH_page 2 // first page of latches
90 #define SEG_bits 16 // number of leaf pages in a segment in bits
91 #define MIN_seg 32 // initial number of mapping segments
93 // Number of levels to create in a new BTree
97 There are six lock types for each node in four independent sets:
98 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
99 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
100 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
101 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
102 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
103 6. (set 4) LinkModification: Exclusive. Update of a node's left link is underway. Incompatible with another LinkModification.
118 volatile unsigned char xcl[1];
119 volatile unsigned char filler;
120 volatile ushort waiters[1];
126 // definition for reader/writer reentrant lock implementation
131 ushort readers; // number of readers holding lock
133 ushort line; // owner source line number
135 ushort dup; // re-entrant lock count
136 pid_t tid; // owner pid
139 // hash table entries
143 uint entry; // Latch table entry at head of chain
146 // latch manager table structure
149 uid page_no; // latch set page number
150 MutexLatch modify[1]; // modify entry lite latch
151 RWLock readwr[1]; // read/write page lock
152 RWLock access[1]; // Access Intent/Page delete
153 RWLock parent[1]; // Posting of fence key in parent
154 RWLock link[1]; // left link update in progress
155 uint split; // right split page atomic insert
156 uint next; // next entry in hash table chain
157 uint prev; // prev entry in hash table chain
158 uint pin; // number of accessing threads
161 // Define the length of the page record numbers
165 // Page key slot definition.
167 // Keys are marked dead, but remain on the page until
168 // it cleanup is called. The fence key (highest key) for
169 // a leaf page is always present, even after cleanup.
173 // In addition to the Unique keys that occupy slots
174 // there are Librarian and Duplicate key
175 // slots occupying the key slot array.
177 // The Librarian slots are dead keys that
178 // serve as filler, available to add new Unique
179 // or Dup slots that are inserted into the B-tree.
181 // The Duplicate slots have had their key bytes extended
182 // by 6 bytes to contain a binary duplicate key uniqueifier.
193 uint off:BT_maxbits; // page offset for key start
194 uint type:3; // type of slot
195 uint dead:1; // set for deleted slot
198 // The key structure occupies space at the upper end of
199 // each page. It's a length byte followed by the key
203 unsigned char len; // this can be changed to a ushort or uint
204 unsigned char key[0];
207 // the value structure also occupies space at the upper
208 // end of the page. Each key is immediately followed by a value.
211 unsigned char len; // this can be changed to a ushort or uint
212 unsigned char value[0];
215 #define BT_maxkey 255 // maximum number of bytes in a key
216 #define BT_keyarray (BT_maxkey + sizeof(BtKey))
218 // The first part of an index page.
219 // It is immediately followed
220 // by the BtSlot array of keys.
222 typedef struct BtPage_ {
223 uint cnt; // count of keys in page
224 uint act; // count of active keys
225 uint min; // next key/value offset
226 uint fence; // page fence key offset
227 uint garbage; // page garbage in bytes
228 unsigned char lvl; // level of page, zero = leaf
229 unsigned char free; // page is on the free chain
230 unsigned char kill; // page is being deleted
231 unsigned char nopromote; // page is being constructed
232 uid right, left; // page numbers to right and left
235 // The loadpage interface object
238 BtPage page; // current page pointer
239 BtLatchSet *latch; // current page latch set
242 // structure for latch manager on shared ALLOC_page
245 uid allocpage; // page number of first available page
246 uid freechain; // head of free page_nos chain
247 uid leafchain; // head of leaf page_nos chain
248 uid leaf_page; // page number of leftmost leaf
249 uid rightleaf; // page number of rightmost leaf
250 uid leafpromote; // next leaf page to try promotion
251 unsigned long long leafpages; // number of active leaf pages
252 unsigned long long upperpages; // number of active upper pages
253 unsigned char leaf_xtra; // leaf page size in xtra bits
254 unsigned char page_bits; // base page size in bits
255 uint nlatchpage; // size of buffer pool & latchsets
256 uint latchtotal; // number of page latch entries
257 uint latchvictim; // next latch entry to test for pin
258 uint latchhash; // number of latch hash table slots
259 MutexLatch lock[1]; // allocation area lite latch
260 MutexLatch promote[1]; // promotion lite latch
263 // The object structure for Btree access
266 uint page_size; // base page size
267 uint page_bits; // base page size in bits
268 uint leaf_xtra; // leaf xtra bits
274 BtPageZero *pagezero; // mapped allocation page
275 BtHashEntry *hashtable; // the buffer pool hash table entries
276 BtLatchSet *latchsets; // mapped latch set from buffer pool
277 uint maxleaves; // leaf page count to begin promote
278 int err; // last error
279 int line; // last error line no
280 int found; // number of keys found by delete
281 int type; // type of LSM tree 0=cache, 1=main
282 uint maxseg; // max number of memory mapped segments
283 uint segments; // number of memory mapped segments in use
284 MutexLatch maps[1]; // segment table mutex
285 unsigned char **pages; // memory mapped segments of b-tree
289 BtMgr *mgr; // buffer manager for entire process
290 BtMgr *main; // buffer manager for main btree
291 pid_t tid; // thread-id of thread
292 BtPageSet cacheset[1]; // cached page frame for cache btree
293 BtPageSet mainset[1]; // cached page frame for main btree
294 uint cacheslot; // slot number in cacheset
295 uint mainslot; // slot number in mainset
296 ushort phase; // 1 = main btree 0 = cache btree 2 = both
306 uint entry:31; // latch table entry number
307 uint reuse:1; // reused previous page
308 uint slot; // slot on page
309 uint src; // source slot
312 // Catastrophic errors
327 extern void bt_close (BtDb *bt);
328 extern BtDb *bt_open (BtMgr *mgr, BtMgr *main);
329 extern BTERR bt_writepage (BtMgr *mgr, BtPage page, uid page_no, uint leaf);
330 extern void bt_lockpage(BtLock mode, BtLatchSet *latch, pid_t tid, uint line);
331 extern void bt_unlockpage(BtLock mode, BtLatchSet *latch, uint line);
332 extern BTERR bt_insertkey (BtMgr *mgr, unsigned char *key, uint len, uint lvl, void *value, uint vallen, BtSlotType type);
333 extern BTERR bt_deletekey (BtMgr *mgr, unsigned char *key, uint len, uint lvl);
335 extern int bt_findkey (BtDb *db, unsigned char *key, uint keylen, unsigned char *value, uint valmax);
337 extern BTERR bt_startkey (BtDb *db, unsigned char *key, uint len);
338 extern BTERR bt_nextkey (BtDb *bt);
340 extern uint bt_lastkey (BtDb *bt);
341 extern uint bt_prevkey (BtDb *bt);
344 extern BtMgr *bt_mgr (char *name, uint bits, uint leaf_xtra, uint poolsize);
345 extern void bt_mgrclose (BtMgr *mgr);
347 // atomic transaction functions
348 BTERR bt_atomicexec(BtMgr *mgr, BtPage source, uint count, pid_t tid);
349 BTERR bt_promote (BtDb *bt);
351 // The page is allocated from low and hi ends.
352 // The key slots are allocated from the bottom,
353 // while the text and value of the key
354 // are allocated from the top. When the two
355 // areas meet, the page is split into two.
357 // A key consists of a length byte, two bytes of
358 // index number (0 - 65535), and up to 253 bytes
361 // Associated with each key is a value byte string
362 // containing any value desired.
364 // The b-tree root is always located at page 1.
365 // The first leaf page of level zero is always
366 // located on page 2.
368 // The b-tree pages are linked with next
369 // pointers to facilitate enumerators,
370 // and provide for concurrency.
372 // When to root page fills, it is split in two and
373 // the tree height is raised by a new root at page
374 // one with two keys.
376 // Deleted keys are marked with a dead bit until
377 // page cleanup. The fence key for a leaf node is
380 // To achieve maximum concurrency one page is locked at a time
381 // as the tree is traversed to find leaf key in question. The right
382 // page numbers are used in cases where the page is being split,
385 // Page 0 is dedicated to lock for new page extensions,
386 // and chains empty pages together for reuse. It also
387 // contains the latch manager hash table.
389 // The ParentModification lock on a node is obtained to serialize posting
390 // or changing the fence key for a node.
392 // Empty pages are chained together through the ALLOC page and reused.
394 // Access macros to address slot and key values from the page
395 // Page slots use 1 based indexing.
397 #define slotptr(page, slot) (((BtSlot *)(page+1)) + ((slot)-1))
398 #define keyptr(page, slot) ((BtKey*)((unsigned char*)(page) + slotptr(page, slot)->off))
399 #define valptr(page, slot) ((BtVal*)(keyptr(page,slot)->key + keyptr(page,slot)->len))
400 #define fenceptr(page) ((BtKey*)((unsigned char*)(page) + page->fence))
402 void bt_putid(unsigned char *dest, uid id)
407 dest[i] = (unsigned char)id, id >>= 8;
410 uid bt_getid(unsigned char *src)
415 for( i = 0; i < BtId; i++ )
416 id <<= 8, id |= *src++;
421 // lite weight spin lock Latch Manager
425 return syscall(SYS_gettid);
428 int sys_futex(void *addr1, int op, int val1, struct timespec *timeout, void *addr2, int val3)
430 return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
433 void bt_mutexlock(MutexLatch *latch)
435 uint idx, waited = 0;
439 for( idx = 0; idx < 100; idx++ ) {
440 *prev->value = __sync_fetch_and_or (latch->value, 1);
441 if( !*prev->bits->xcl ) {
443 __sync_fetch_and_sub (latch->bits->waiters, 1);
449 __sync_fetch_and_add (latch->bits->waiters, 1);
450 *prev->bits->waiters += 1;
454 sys_futex (latch->value, FUTEX_WAIT, *prev->value, NULL, NULL, 0);
458 int bt_mutextry(MutexLatch *latch)
460 return !__sync_lock_test_and_set (latch->bits->xcl, 1);
463 void bt_releasemutex(MutexLatch *latch)
467 *prev->value = __sync_fetch_and_and (latch->value, 0xffff0000);
469 if( *prev->bits->waiters )
470 sys_futex( latch->value, FUTEX_WAKE, 1, NULL, NULL, 0 );
473 // reader/writer lock implementation
475 void WriteLock (RWLock *lock, pid_t tid, uint line)
477 if( tid && lock->tid == tid ) {
481 bt_mutexlock (lock->xcl);
482 bt_mutexlock (lock->wrt);
483 bt_releasemutex (lock->xcl);
490 void WriteRelease (RWLock *lock)
497 bt_releasemutex (lock->wrt);
500 void ReadLock (RWLock *lock)
502 bt_mutexlock (lock->xcl);
504 if( !__sync_fetch_and_add (&lock->readers, 1) )
505 bt_mutexlock (lock->wrt);
507 bt_releasemutex (lock->xcl);
510 void ReadRelease (RWLock *lock)
512 if( __sync_fetch_and_sub (&lock->readers, 1) == 1 )
513 bt_releasemutex (lock->wrt);
516 // read page into buffer pool from permanent location in Btree file
518 BTERR bt_readpage (BtMgr *mgr, BtPage page, uid page_no, uint leaf)
520 uint page_size = mgr->page_size;
523 page_size <<= mgr->leaf_xtra;
525 if( pread(mgr->idx, page, page_size, page_no << mgr->page_bits) < page_size )
526 return mgr->err = BTERR_read;
531 // write page to location in Btree file
533 BTERR bt_writepage (BtMgr *mgr, BtPage page, uid page_no, uint leaf)
535 uint page_size = mgr->page_size;
538 page_size <<= mgr->leaf_xtra;
540 if( pwrite(mgr->idx, page, page_size, page_no << mgr->page_bits) < page_size )
541 return mgr->err = BTERR_wrt;
546 // decrement pin count
548 void bt_unpinlatch (BtLatchSet *latch)
550 bt_mutexlock(latch->modify);
552 bt_releasemutex(latch->modify);
555 // return the btree cached page address
557 BtPage bt_mappage (BtMgr *mgr, BtLatchSet *latch)
559 uint segment = latch->page_no >> SEG_bits;
560 int flag = PROT_READ | PROT_WRITE;
561 uid mask = (uid)1 << SEG_bits;
564 bt_mutexlock (mgr->maps);
568 if( segment < mgr->segments ) {
569 page = (BtPage)(mgr->pages[segment] + ((latch->page_no & mask) << mgr->page_bits));
571 bt_releasemutex (mgr->maps);
575 if( mgr->segments < mgr->maxseg ) {
576 mgr->pages[mgr->segments] = mmap (0, (uid)mgr->page_size << SEG_bits, flag, MAP_SHARED, mgr->idx, (uid)mgr->segments << mgr->page_bits << SEG_bits);
582 mgr->pages = realloc (mgr->pages, mgr->maxseg * sizeof(void *));
586 // return next available latch entry
587 // and with latch entry locked
589 uint bt_availnext (BtMgr *mgr)
596 entry = __sync_fetch_and_add (&mgr->pagezero->latchvictim, 1) + 1;
598 entry = _InterlockedIncrement (&mgr->pagezero->latchvictim);
600 entry %= mgr->pagezero->latchtotal;
605 latch = mgr->latchsets + entry;
607 if( !bt_mutextry(latch->modify) )
610 // return this entry if it is not pinned
615 bt_releasemutex(latch->modify);
619 // pin latch in latch pool
621 BtLatchSet *bt_pinlatch (BtMgr *mgr, uid page_no)
623 uint hashidx = page_no % mgr->pagezero->latchhash;
628 // try to find our entry
630 bt_mutexlock(mgr->hashtable[hashidx].latch);
632 if( entry = mgr->hashtable[hashidx].entry ) do
634 latch = mgr->latchsets + entry;
636 if( page_no == latch->page_no )
638 } while( entry = latch->next );
640 // found our entry: increment pin
643 latch = mgr->latchsets + entry;
644 bt_mutexlock(latch->modify);
646 bt_releasemutex(latch->modify);
647 bt_releasemutex(mgr->hashtable[hashidx].latch);
651 // find and reuse unpinned entry
654 entry = bt_availnext (mgr);
655 latch = mgr->latchsets + entry;
656 oldidx = latch->page_no % mgr->pagezero->latchhash;
658 // skip over this entry if latch not available
661 if( oldidx != hashidx )
662 if( !bt_mutextry (mgr->hashtable[oldidx].latch) ) {
663 bt_releasemutex(latch->modify);
667 // if latch is on a different hash chain
668 // unlink from the old page_no chain
671 if( oldidx != hashidx ) {
673 mgr->latchsets[latch->prev].next = latch->next;
675 mgr->hashtable[oldidx].entry = latch->next;
678 mgr->latchsets[latch->next].prev = latch->prev;
680 bt_releasemutex (mgr->hashtable[oldidx].latch);
683 // link page as head of hash table chain
684 // if this is a never before used entry,
685 // or it was previously on a different
686 // hash table chain. Otherwise, just
687 // leave it in its current hash table
690 if( !latch->page_no || hashidx != oldidx ) {
691 if( latch->next = mgr->hashtable[hashidx].entry )
692 mgr->latchsets[latch->next].prev = entry;
694 mgr->hashtable[hashidx].entry = entry;
698 // fill in latch structure
700 latch->page_no = page_no;
703 bt_releasemutex (latch->modify);
704 bt_releasemutex (mgr->hashtable[hashidx].latch);
708 void bt_mgrclose (BtMgr *mgr)
710 char *name = mgr->type ? "Main" : "Cache";
716 // flush previously written dirty pages
717 // and write recovery buffer to disk
719 fdatasync (mgr->idx);
722 while( mgr->segments )
723 munmap (mgr->pages[--mgr->segments], (uid)mgr->page_size << SEG_bits);
725 while( mgr->segments ) {
726 FlushViewOfFile(mgr->pages[--mgr->segments], 0);
727 UnmapViewOfFile(mgr->pages[mgr->Segments]);
734 FlushFileBuffers(mgr->idx);
735 CloseHandle(mgr->idx);
740 // close and release memory
742 void bt_close (BtDb *bt)
747 void bt_initpage (BtMgr *mgr, BtPage page, uid leaf_page_no, uint lvl)
749 BtSlot *node = slotptr(page, 1);
750 unsigned char value[BtId];
755 page_no = lvl ? ROOT_page : leaf_page_no;
756 node->off = mgr->page_size;
759 node->off <<= mgr->leaf_xtra;
761 node->off -= 3 + (lvl ? BtId + sizeof(BtVal): sizeof(BtVal));
762 node->type = Librarian;
765 node->off = node[-1].off;
766 key = keyptr(page, 2);
767 key = keyptr(page, 1);
768 key->len = 2; // create stopper key
772 bt_putid(value, leaf_page_no);
773 val = valptr(page, 1);
774 val->len = lvl ? BtId : 0;
775 memcpy (val->value, value, val->len);
777 page->fence = node->off;
778 page->min = node->off;
783 if( bt_writepage (mgr, page, page_no, !lvl) ) {
784 fprintf (stderr, "Unable to create btree page %d\n", page_no);
789 // open/create new btree buffer manager
791 // call with file_name, BT_openmode, bits in page size (e.g. 16),
792 // extra bits for leaves (e.g. 4) size of latch pool (e.g. 500)
794 BtMgr *bt_mgr (char *name, uint pagebits, uint leafxtra, uint nodemax)
796 uint lvl, attr, last, slot, idx, blk;
797 int flag, initit = 0;
798 BtPageZero *pagezero;
799 struct flock lock[1];
807 // determine sanity of page size and buffer pool
809 if( leafxtra | pagebits )
810 if( leafxtra + pagebits > BT_maxbits )
811 fprintf (stderr, "pagebits + leafxtra > maxbits\n"), exit(1);
814 if( pagebits < BT_minbits )
815 fprintf (stderr, "pagebits < minbits\n"), exit(1);
818 mgr = calloc (1, sizeof(BtMgr));
820 mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
822 if( mgr->idx == -1 ) {
823 fprintf (stderr, "Unable to create/open btree file %s\n", name);
824 return free(mgr), NULL;
827 memset (lock, 0, sizeof(lock));
828 lock->l_len = sizeof(struct BtPage_);
829 lock->l_type = F_WRLCK;
831 if( fcntl (mgr->idx, F_SETLKW, lock) < 0 ) {
832 fprintf(stderr, "unable to lock record zero %s\n", name);
836 mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));
837 attr = FILE_ATTRIBUTE_NORMAL;
838 mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
840 if( mgr->idx == INVALID_HANDLE_VALUE ) {
841 fprintf (stderr, "Unable to create/open btree file %s\n", name);
842 return GlobalFree(mgr), NULL;
847 pagezero = valloc (BT_maxpage);
848 page = (BtPage)pagezero;
851 // read minimum page size to get root info
852 // to support raw disk partition files
853 // check if page_bits == 0 on the disk.
855 if( size = lseek (mgr->idx, 0L, 2) )
856 if( pread(mgr->idx, pagezero, BT_minpage, 0) == BT_minpage )
857 if( pagezero->page_bits ) {
858 pagebits = pagezero->page_bits;
859 leafxtra = pagezero->leaf_xtra;
863 return free(mgr), free(pagezero), NULL;
867 pagezero = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);
868 size = GetFileSize(mgr->idx, amt);
871 if( !ReadFile(mgr->idx, (char *)pagezero, BT_minpage, amt, NULL) )
872 return bt_mgrclose (mgr), NULL;
873 pagebits = pagezero->page_bits;
874 leafxtra = pagezero->leaf_xtra;
879 mgr->page_size = 1 << pagebits;
880 mgr->page_bits = pagebits;
881 mgr->leaf_xtra = leafxtra;
886 // calculate number of latch table & hash entries
888 memset (pagezero, 0, 1 << pagebits);
889 pagezero->nlatchpage = nodemax/16 * sizeof(BtHashEntry);
891 pagezero->nlatchpage += sizeof(BtLatchSet) * nodemax + mgr->page_size - 1;
892 pagezero->nlatchpage >>= mgr->page_bits;
893 pagezero->latchtotal = nodemax;
895 pagezero->latchhash = (((uid)pagezero->nlatchpage<< mgr->page_bits) - nodemax * sizeof(BtLatchSet)) / sizeof(BtHashEntry);
897 // initialize an empty b-tree with alloc page, root page, leaf page
898 // and page(s) of latches and page pool cache
900 pagezero->page_bits = mgr->page_bits;
901 pagezero->leaf_xtra = leafxtra;
902 pagezero->upperpages = 1;
903 pagezero->leafpages = 1;
905 leaf_page = pagezero->leaf_page = pagezero->nlatchpage + LATCH_page;
907 // round first leafpage up to leafxtra boundary
909 if( pagezero->leaf_page & ((1 << leafxtra) - 1)) {
910 blk = pagezero->leaf_page;
911 pagezero->leaf_page |= (1 << leafxtra) - 1;
912 pagezero->freechain = pagezero->leaf_page++;
913 leaf_page = pagezero->leaf_page;
917 pagezero->rightleaf = pagezero->leaf_page;
918 pagezero->allocpage = pagezero->leaf_page + (1 << leafxtra);
920 if( pwrite (mgr->idx, pagezero, 1 << pagebits, 0) < 1 << pagebits) {
921 fprintf (stderr, "Unable to create btree page zero\n");
922 return bt_mgrclose (mgr), NULL;
925 // initialize root level 1 page
927 memset (page, 0, 1 << pagebits);
928 bt_initpage (mgr, page, leaf_page, 1);
930 // chain unused pages as first freelist
932 memset (page, 0, 1 << pagebits);
934 while( blk & ((1 << leafxtra) - 1) ) {
935 if( bt_writepage (mgr, page, blk, 0) ) {
936 fprintf(stderr, "unable to write initial free blk %d\r\n", blk);
942 // initialize first page of leaves
944 memset (page, 0, 1 << pagebits);
945 bt_initpage (mgr, page, leaf_page, 0);
951 VirtualFree (pagezero, 0, MEM_RELEASE);
954 lock->l_type = F_UNLCK;
956 if( fcntl (mgr->idx, F_SETLK, lock) < 0 ) {
957 fprintf (stderr, "Unable to unlock page zero\n");
964 mgr->maxseg = MIN_seg;
965 mgr->pages = calloc (MIN_seg, sizeof(unsigned char *));
967 flag = PROT_READ | PROT_WRITE;
968 mgr->pages[0] = mmap (0, (uid)mgr->page_size << SEG_bits, flag, MAP_SHARED, mgr->idx, 0);
970 if( mgr->pages[0] == MAP_FAILED ) {
971 fprintf (stderr, "Unable to mmap pagezero btree segment, error = %d\n", errno);
972 return bt_mgrclose (mgr), NULL;
975 mgr->pagezero = (BtPageZero *)mgr->pages[0];
976 // mlock (mgr->pagezero, mgr->page_size);
978 // allocate latch pool
980 mgr->latchsets = (BtLatchSet *)(mgr->pages[0] + ((uid)LATCH_page << mgr->page_bits));
981 mgr->hashtable = (BtHashEntry *)(mgr->latchsets + mgr->pagezero->latchtotal);
986 // open BTree access method
987 // based on buffer manager
989 BtDb *bt_open (BtMgr *mgr, BtMgr *main)
991 BtDb *bt = malloc (sizeof(*bt));
993 memset (bt, 0, sizeof(*bt));
994 bt->tid = sys_gettid();
1000 // compare two keys, return > 0, = 0, or < 0
1001 // =0: keys are same
1004 // as the comparison value
1006 int keycmp (BtKey* key1, unsigned char *key2, uint len2)
1008 uint len1 = key1->len;
1011 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
1022 // place write, read, or parent lock on requested page_no.
1024 void bt_lockpage(BtLock mode, BtLatchSet *latch, pid_t tid, uint line)
1028 ReadLock (latch->readwr);
1031 WriteLock (latch->readwr, tid, line);
1034 ReadLock (latch->access);
1037 WriteLock (latch->access, tid, line);
1040 WriteLock (latch->parent, tid, line);
1043 WriteLock (latch->link, tid, line);
1048 // remove write, read, or parent lock on requested page
1050 void bt_unlockpage(BtLock mode, BtLatchSet *latch, uint line)
1054 ReadRelease (latch->readwr);
1057 WriteRelease (latch->readwr);
1060 ReadRelease (latch->access);
1063 WriteRelease (latch->access);
1066 WriteRelease (latch->parent);
1069 WriteRelease (latch->link);
1074 // allocate a new page
1075 // return with page latched, but unlocked.
1076 // contents is cleared for lvl > 0
1078 int bt_newpage(BtMgr *mgr, BtPageSet *set, BtPage contents)
1080 uint page_size = mgr->page_size, blk;
1084 // lock allocation page
1086 bt_mutexlock(mgr->pagezero->lock);
1088 if( contents->lvl ) {
1089 freechain = &mgr->pagezero->freechain;
1090 mgr->pagezero->upperpages++;
1092 freechain = &mgr->pagezero->leafchain;
1093 mgr->pagezero->leafpages++;
1094 page_size <<= mgr->leaf_xtra;
1097 // use empty chain first
1098 // else allocate new page
1100 if( page_no = *freechain ) {
1101 if( set->latch = bt_pinlatch (mgr, page_no) )
1102 set->page = bt_mappage (mgr, set->latch);
1104 return mgr->line = __LINE__, mgr->err = BTERR_struct;
1106 *freechain = set->page->right;
1108 // the page is currently nopromote and this
1109 // will keep bt_promote out.
1111 // contents will replace this bit
1112 // and pin will keep bt_promote out
1114 contents->nopromote = 0;
1116 memcpy (set->page, contents, page_size);
1118 // if( msync (mgr->pagezero, mgr->page_size, MS_SYNC) < 0 )
1119 // fprintf(stderr, "msync error %d line %d\n", errno, __LINE__);
1121 bt_releasemutex(mgr->pagezero->lock);
1125 // obtain next available page number
1126 // suitable for leaf or higher level
1128 page_no = mgr->pagezero->allocpage;
1129 mgr->pagezero->allocpage += 1 << mgr->leaf_xtra;
1131 // keep bt_promote out of this page
1133 contents->nopromote = 1;
1135 // unlock allocation latch and
1136 // extend file into new page.
1138 // if( msync (mgr->pagezero, mgr->page_size, MS_SYNC) < 0 )
1139 // fprintf(stderr, "msync error %d line %d\n", errno, __LINE__);
1141 if( bt_writepage (mgr, contents, page_no, !contents->lvl) )
1142 fprintf(stderr, "Write %lld error %d\n", page_no + blk, errno);
1144 // chain together unused non-leaf allocation
1146 if( contents->lvl ) {
1147 memset (contents, 0, mgr->page_size);
1149 for( blk = 1; blk < 1 << mgr->leaf_xtra; blk++ ) {
1150 if( bt_writepage (mgr, contents, page_no + blk, 0) )
1151 fprintf(stderr, "Write %lld error %d\n", page_no + blk, errno);
1152 contents->right = page_no + blk;
1153 *freechain = page_no + blk;
1157 bt_releasemutex(mgr->pagezero->lock);
1159 if( set->latch = bt_pinlatch (mgr, page_no) )
1160 set->page = bt_mappage (mgr, set->latch);
1164 // now pin will keep bt_promote out
1166 set->page->nopromote = 0;
1170 // find slot in page for given key at a given level
1172 int bt_findslot (BtPage page, unsigned char *key, uint len)
1174 uint diff, higher = page->cnt, low = 1, slot;
1177 // make stopper key an infinite fence value
1184 // low is the lowest candidate.
1185 // loop ends when they meet
1187 // higher is already
1188 // tested as .ge. the passed key.
1190 while( diff = higher - low ) {
1191 slot = low + ( diff >> 1 );
1192 if( keycmp (keyptr(page, slot), key, len) < 0 )
1195 higher = slot, good++;
1198 // return zero if key is on right link page
1200 return good ? higher : 0;
1203 // find and load page at given level for given key
1204 // leave page rd or wr locked as requested
1206 int bt_loadpage (BtMgr *mgr, BtPageSet *set, unsigned char *key, uint len, uint lvl, BtLock lock, pid_t tid)
1208 uid page_no = ROOT_page, prevpage_no = 0;
1209 uint drill = 0xff, slot;
1210 uint mode, prevmode;
1215 // start at root of btree and drill down
1218 if( set->latch = bt_pinlatch (mgr, page_no) )
1219 set->page = bt_mappage (mgr, set->latch);
1223 if( page_no > ROOT_page )
1224 bt_lockpage(BtLockAccess, set->latch, tid, __LINE__);
1226 // release & unpin parent or left sibling page
1229 bt_unlockpage(prevmode, prev->latch, __LINE__);
1230 bt_unpinlatch (prev->latch);
1234 // obtain mode lock using lock coupling through AccessLock
1235 // determine lock mode of drill level
1237 mode = (drill == lvl) ? lock : BtLockRead;
1238 bt_lockpage(mode, set->latch, tid, __LINE__);
1240 // grab our fence key
1242 ptr=fenceptr(set->page);
1244 if( set->page->free )
1245 return mgr->err = BTERR_struct, mgr->line = __LINE__, 0;
1247 if( page_no > ROOT_page )
1248 bt_unlockpage(BtLockAccess, set->latch, __LINE__);
1250 // re-read and re-lock root after determining actual level of root
1252 if( set->page->lvl != drill) {
1253 if( set->latch->page_no != ROOT_page )
1254 return mgr->err = BTERR_struct, mgr->line = __LINE__, 0;
1256 drill = set->page->lvl;
1258 if( lock != BtLockRead && drill == lvl ) {
1259 bt_unlockpage(mode, set->latch, __LINE__);
1260 bt_unpinlatch (set->latch);
1265 prevpage_no = set->latch->page_no;
1269 // if requested key is beyond our fence,
1270 // slide to the right
1272 if( keycmp (ptr, key, len) < 0 )
1273 if( page_no = set->page->right )
1276 // if page is part of a delete operation,
1277 // slide to the left;
1279 if( set->page->kill ) {
1280 bt_lockpage(BtLockLink, set->latch, tid, __LINE__);
1281 page_no = set->page->left;
1282 bt_unlockpage(BtLockLink, set->latch, __LINE__);
1286 // find key on page at this level
1287 // and descend to requested level
1289 if( slot = bt_findslot (set->page, key, len) ) {
1293 // find next non-dead slot -- the fence key if nothing else
1295 while( slotptr(set->page, slot)->dead )
1296 if( slot++ < set->page->cnt )
1299 return mgr->err = BTERR_struct, mgr->line = __LINE__, 0;
1301 val = valptr(set->page, slot);
1303 if( val->len == BtId )
1304 page_no = bt_getid(val->value);
1306 return mgr->line = __LINE__, mgr->err = BTERR_struct, 0;
1312 // slide right into next page
1314 page_no = set->page->right;
1318 // return error on end of right chain
1320 mgr->line = __LINE__, mgr->err = BTERR_struct;
1321 return 0; // return error
1324 // return page to free list
1325 // page must be delete, link & write locked
1326 // and have no keys pointing to it.
1328 void bt_freepage (BtMgr *mgr, BtPageSet *set)
1332 // lock allocation page
1334 bt_mutexlock (mgr->pagezero->lock);
1336 if( set->page->lvl ) {
1337 freechain = &mgr->pagezero->freechain;
1338 mgr->pagezero->upperpages--;
1340 freechain = &mgr->pagezero->leafchain;
1341 mgr->pagezero->leafpages--;
1346 set->page->right = *freechain;
1347 *freechain = set->latch->page_no;
1348 set->page->free = 1;
1350 // if( msync (mgr->pagezero, mgr->page_size, MS_SYNC) < 0 )
1351 // fprintf(stderr, "msync error %d line %d\n", errno, __LINE__);
1353 // unlock released page
1354 // and unlock allocation page
1356 bt_unlockpage (BtLockDelete, set->latch, __LINE__);
1357 bt_unlockpage (BtLockWrite, set->latch, __LINE__);
1358 bt_unlockpage (BtLockLink, set->latch, __LINE__);
1359 bt_unpinlatch (set->latch);
1360 bt_releasemutex (mgr->pagezero->lock);
1363 // a fence key was deleted from an interiour level page
1364 // push new fence value upwards
1366 BTERR bt_fixfence (BtMgr *mgr, BtPageSet *set, uint lvl)
1368 unsigned char leftkey[BT_keyarray], rightkey[BT_keyarray];
1369 unsigned char value[BtId];
1373 // remove the old fence value
1375 ptr = fenceptr(set->page);
1376 memcpy (rightkey, ptr, ptr->len + sizeof(BtKey));
1377 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1378 set->page->fence = slotptr(set->page, set->page->cnt)->off;
1380 // cache new fence value
1382 ptr = fenceptr(set->page);
1383 memcpy (leftkey, ptr, ptr->len + sizeof(BtKey));
1385 bt_lockpage (BtLockParent, set->latch, 0, __LINE__);
1386 bt_unlockpage (BtLockWrite, set->latch, __LINE__);
1388 // insert new (now smaller) fence key
1390 bt_putid (value, set->latch->page_no);
1391 ptr = (BtKey*)leftkey;
1393 if( bt_insertkey (mgr, ptr->key, ptr->len, lvl+1, value, BtId, Unique) )
1396 // now delete old fence key
1398 ptr = (BtKey*)rightkey;
1400 if( bt_deletekey (mgr, ptr->key, ptr->len, lvl+1) )
1403 bt_unlockpage (BtLockParent, set->latch, __LINE__);
1404 bt_unpinlatch(set->latch);
1408 // root has a single child
1409 // collapse a level from the tree
1411 BTERR bt_collapseroot (BtMgr *mgr, BtPageSet *root)
1418 // find the child entry and promote as new root contents
1421 for( idx = 0; idx++ < root->page->cnt; )
1422 if( !slotptr(root->page, idx)->dead )
1425 val = valptr(root->page, idx);
1427 if( val->len == BtId )
1428 page_no = bt_getid (valptr(root->page, idx)->value);
1430 return mgr->line = __LINE__, mgr->err = BTERR_struct;
1432 if( child->latch = bt_pinlatch (mgr, page_no) )
1433 child->page = bt_mappage (mgr, child->latch);
1437 bt_lockpage (BtLockDelete, child->latch, 0, __LINE__);
1438 bt_lockpage (BtLockWrite, child->latch, 0, __LINE__);
1440 memcpy (root->page, child->page, mgr->page_size);
1441 bt_freepage (mgr, child);
1443 } while( root->page->lvl > 1 && root->page->act == 1 );
1445 bt_unlockpage (BtLockWrite, root->latch, __LINE__);
1446 bt_unpinlatch (root->latch);
1450 // delete a page and manage key
1451 // call with page writelocked
1453 // returns with page unpinned
1454 // from the page pool.
1456 BTERR bt_deletepage (BtMgr *mgr, BtPageSet *set, uint lvl)
1458 unsigned char higherfence[BT_keyarray], lowerfence[BT_keyarray];
1459 uint page_size = mgr->page_size, kill;
1460 BtPageSet right[1], temp[1];
1461 unsigned char value[BtId];
1462 uid page_no, right2;
1466 page_size <<= mgr->leaf_xtra;
1468 // cache original copy of original fence key
1469 // that is going to be deleted.
1471 ptr = fenceptr(set->page);
1472 memcpy (lowerfence, ptr, ptr->len + sizeof(BtKey));
1474 // pin and lock our right page
1476 page_no = set->page->right;
1478 if( right->latch = bt_pinlatch (mgr, page_no) )
1479 right->page = bt_mappage (mgr, right->latch);
1483 bt_lockpage (BtLockWrite, right->latch, 0, __LINE__);
1485 if( right->page->kill || set->page->kill )
1486 return mgr->line = __LINE__, mgr->err = BTERR_struct;
1488 // pull contents of right sibling over our empty page
1489 // preserving our left page number, and its right page number.
1491 bt_lockpage (BtLockLink, set->latch, 0, __LINE__);
1492 page_no = set->page->left;
1493 memcpy (set->page, right->page, page_size);
1494 set->page->left = page_no;
1495 bt_unlockpage (BtLockLink, set->latch, __LINE__);
1497 // fix left link from far right page
1499 if( right2 = set->page->right ) {
1500 if( temp->latch = bt_pinlatch (mgr, right2) )
1501 temp->page = bt_mappage (mgr, temp->latch);
1505 bt_lockpage (BtLockAccess, temp->latch, 0, __LINE__);
1506 bt_lockpage(BtLockLink, temp->latch, 0, __LINE__);
1507 temp->page->left = set->latch->page_no;
1508 bt_unlockpage(BtLockLink, temp->latch, __LINE__);
1509 bt_unlockpage(BtLockAccess, temp->latch, __LINE__);
1510 bt_unpinlatch (temp->latch);
1511 } else if( !lvl ) { // our page is now rightmost leaf
1512 bt_mutexlock (mgr->pagezero->lock);
1513 mgr->pagezero->rightleaf = set->latch->page_no;
1514 bt_releasemutex(mgr->pagezero->lock);
1517 ptr = fenceptr(set->page);
1518 memcpy (higherfence, ptr, ptr->len + sizeof(BtKey));
1520 // mark right page as being deleted and release lock
1521 // keep lock on parent modification.
1523 right->page->kill = 1;
1524 bt_lockpage (BtLockParent, right->latch, 0, __LINE__);
1525 bt_unlockpage (BtLockWrite, right->latch, __LINE__);
1527 bt_lockpage (BtLockParent, set->latch, 0, __LINE__);
1528 bt_unlockpage (BtLockWrite, set->latch, __LINE__);
1530 // redirect the new higher key directly to our new node
1532 ptr = (BtKey *)higherfence;
1533 bt_putid (value, set->latch->page_no);
1535 if( bt_insertkey (mgr, ptr->key, ptr->len, lvl+1, value, BtId, Update) )
1538 // delete our original fence key in parent
1540 ptr = (BtKey *)lowerfence;
1542 if( bt_deletekey (mgr, ptr->key, ptr->len, lvl+1) )
1545 // wait for all access to drain away with delete lock,
1546 // then obtain write lock to right node and free it.
1548 bt_lockpage (BtLockDelete, right->latch, 0, __LINE__);
1549 bt_lockpage (BtLockWrite, right->latch, 0, __LINE__);
1550 bt_lockpage (BtLockLink, right->latch, 0, __LINE__);
1551 bt_unlockpage (BtLockParent, right->latch, __LINE__);
1552 bt_freepage (mgr, right);
1554 // release parent lock to our node
1556 bt_unlockpage (BtLockParent, set->latch, __LINE__);
1557 bt_unpinlatch (set->latch);
1561 // find and delete key on page by marking delete flag bit
1562 // if page becomes empty, delete it from the btree
1564 BTERR bt_deletekey (BtMgr *mgr, unsigned char *key, uint len, uint lvl)
1566 uint slot, idx, found, fence;
1572 if( slot = bt_loadpage (mgr, set, key, len, lvl, BtLockWrite, 0) ) {
1573 node = slotptr(set->page, slot);
1574 ptr = keyptr(set->page, slot);
1578 // if librarian slot, advance to real slot
1580 if( node->type == Librarian ) {
1581 ptr = keyptr(set->page, ++slot);
1582 node = slotptr(set->page, slot);
1585 fence = slot == set->page->cnt;
1587 // delete the key, ignore request if already dead
1589 if( found = !keycmp (ptr, key, len) )
1590 if( found = node->dead == 0 ) {
1591 val = valptr(set->page,slot);
1592 set->page->garbage += ptr->len + val->len + sizeof(BtKey) + sizeof(BtVal);
1596 // collapse empty slots beneath the fence
1597 // on interiour nodes
1600 while( idx = set->page->cnt - 1 )
1601 if( slotptr(set->page, idx)->dead ) {
1602 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1603 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1611 // did we delete a fence key in an upper level?
1613 if( lvl && set->page->act && fence )
1614 return bt_fixfence (mgr, set, lvl);
1616 // do we need to collapse root?
1618 if( lvl > 1 && set->latch->page_no == ROOT_page && set->page->act == 1 )
1619 return bt_collapseroot (mgr, set);
1621 // delete empty page
1623 if( !set->page->act )
1624 return bt_deletepage (mgr, set, set->page->lvl);
1626 bt_unlockpage(BtLockWrite, set->latch, __LINE__);
1627 bt_unpinlatch (set->latch);
1631 // check page for space available,
1632 // clean if necessary and return
1633 // 0 - page needs splitting
1634 // >0 new slot value
1636 uint bt_cleanpage(BtMgr *mgr, BtPageSet *set, uint keylen, uint slot, uint vallen)
1638 uint page_size = mgr->page_size;
1639 BtPage page = set->page, frame;
1640 uint cnt = 0, idx = 0;
1641 uint max = page->cnt;
1646 if( !set->page->lvl )
1647 page_size <<= mgr->leaf_xtra;
1649 if( page->min >= (max+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal))
1652 // skip cleanup and proceed to split
1653 // if there's not enough garbage
1656 if( page->garbage < page_size / 5 )
1659 frame = malloc (page_size);
1660 memcpy (frame, page, page_size);
1662 // skip page info and set rest of page to zero
1664 memset (page+1, 0, page_size - sizeof(*page));
1666 page->min = page_size;
1670 // clean up page first by
1671 // removing dead keys
1673 while( cnt++ < max ) {
1677 if( cnt < max || frame->lvl )
1678 if( slotptr(frame,cnt)->dead )
1681 // copy the value across
1683 val = valptr(frame, cnt);
1684 page->min -= val->len + sizeof(BtVal);
1685 memcpy ((unsigned char *)page + page->min, val, val->len + sizeof(BtVal));
1687 // copy the key across
1689 key = keyptr(frame, cnt);
1690 page->min -= key->len + sizeof(BtKey);
1691 memcpy ((unsigned char *)page + page->min, key, key->len + sizeof(BtKey));
1693 // make a librarian slot
1695 slotptr(page, ++idx)->off = page->min;
1696 slotptr(page, idx)->type = Librarian;
1697 slotptr(page, idx)->dead = 1;
1701 slotptr(page, ++idx)->off = page->min;
1702 slotptr(page, idx)->type = slotptr(frame, cnt)->type;
1704 if( !(slotptr(page, idx)->dead = slotptr(frame, cnt)->dead) )
1708 page->fence = page->min;
1712 // see if page has enough space now, or does it need splitting?
1714 if( page->min >= (idx+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal) )
1720 // split the root and raise the height of the btree
1722 BTERR bt_splitroot(BtMgr *mgr, BtPageSet *root, BtLatchSet *right)
1724 unsigned char leftkey[BT_keyarray];
1725 uint nxt = mgr->page_size;
1726 unsigned char value[BtId];
1733 frame = malloc (mgr->page_size);
1734 memcpy (frame, root->page, mgr->page_size);
1736 // save left page fence key for new root
1738 ptr = fenceptr(root->page);
1739 memcpy (leftkey, ptr, ptr->len + sizeof(BtKey));
1741 // Obtain an empty page to use, and copy the current
1742 // root contents into it, e.g. lower keys
1744 if( bt_newpage(mgr, left, frame) )
1747 left_page_no = left->latch->page_no;
1748 bt_unpinlatch (left->latch);
1751 // left link the pages together
1753 page = bt_mappage (mgr, right);
1754 page->left = left_page_no;
1756 // preserve the page info at the bottom
1757 // of higher keys and set rest to zero
1759 memset(root->page+1, 0, mgr->page_size - sizeof(*root->page));
1761 // insert stopper key at top of newroot page
1762 // and increase the root height
1764 nxt -= BtId + sizeof(BtVal);
1765 bt_putid (value, right->page_no);
1766 val = (BtVal *)((unsigned char *)root->page + nxt);
1767 memcpy (val->value, value, BtId);
1770 nxt -= 2 + sizeof(BtKey);
1771 root->page->fence = nxt;
1773 slotptr(root->page, 2)->off = nxt;
1774 ptr = (BtKey *)((unsigned char *)root->page + nxt);
1779 // insert lower keys page fence key on newroot page as first key
1781 nxt -= BtId + sizeof(BtVal);
1782 bt_putid (value, left_page_no);
1783 val = (BtVal *)((unsigned char *)root->page + nxt);
1784 memcpy (val->value, value, BtId);
1787 ptr = (BtKey *)leftkey;
1788 nxt -= ptr->len + sizeof(BtKey);
1789 slotptr(root->page, 1)->off = nxt;
1790 memcpy ((unsigned char *)root->page + nxt, leftkey, ptr->len + sizeof(BtKey));
1792 root->page->right = 0;
1793 root->page->min = nxt; // reset lowest used offset and key count
1794 root->page->cnt = 2;
1795 root->page->act = 2;
1798 // release and unpin root pages
1800 bt_unlockpage(BtLockWrite, root->latch, __LINE__);
1801 bt_unpinlatch (root->latch);
1803 bt_unpinlatch (right);
1807 // split already locked full node
1809 // return pool entry for new right
1810 // page, pinned & unlocked
1812 uint bt_splitpage (BtMgr *mgr, BtPageSet *set, uint linkleft)
1814 uint page_size = mgr->page_size;
1815 BtPageSet right[1], temp[1];
1816 uint cnt = 0, idx = 0, max;
1817 uint lvl = set->page->lvl;
1825 if( !set->page->lvl )
1826 page_size <<= mgr->leaf_xtra;
1828 // split higher half of keys to frame
1830 frame = malloc (page_size);
1831 memset (frame, 0, page_size);
1832 frame->min = page_size;
1833 max = set->page->cnt;
1837 while( cnt++ < max ) {
1838 if( cnt < max || set->page->lvl )
1839 if( slotptr(set->page, cnt)->dead )
1842 val = valptr(set->page, cnt);
1843 frame->min -= val->len + sizeof(BtVal);
1844 memcpy ((unsigned char *)frame + frame->min, val, val->len + sizeof(BtVal));
1846 key = keyptr(set->page, cnt);
1847 frame->min -= key->len + sizeof(BtKey);
1848 memcpy ((unsigned char *)frame + frame->min, key, key->len + sizeof(BtKey));
1850 // add librarian slot
1852 slotptr(frame, ++idx)->off = frame->min;
1853 slotptr(frame, idx)->type = Librarian;
1854 slotptr(frame, idx)->dead = 1;
1858 slotptr(frame, ++idx)->off = frame->min;
1859 slotptr(frame, idx)->type = slotptr(set->page, cnt)->type;
1861 if( !(slotptr(frame, idx)->dead = slotptr(set->page, cnt)->dead) )
1865 frame->fence = frame->min;
1871 if( set->latch->page_no > ROOT_page ) {
1872 right2 = set->page->right;
1873 frame->right = right2;
1876 frame->left = set->latch->page_no;
1879 // get new free page and write higher keys to it.
1881 if( bt_newpage(mgr, right, frame) )
1884 // link far right's left pointer to new page
1886 if( linkleft && set->latch->page_no > ROOT_page )
1888 if( temp->latch = bt_pinlatch (mgr, right2) )
1889 temp->page = bt_mappage (mgr, temp->latch);
1893 bt_lockpage(BtLockLink, temp->latch, 0, __LINE__);
1894 temp->page->left = right->latch->page_no;
1895 bt_unlockpage(BtLockLink, temp->latch, __LINE__);
1896 bt_unpinlatch (temp->latch);
1897 } else if( !lvl ) { // page is rightmost leaf
1898 bt_mutexlock (mgr->pagezero->lock);
1899 mgr->pagezero->rightleaf = right->latch->page_no;
1900 bt_releasemutex(mgr->pagezero->lock);
1903 // process lower keys
1905 memcpy (frame, set->page, page_size);
1906 memset (set->page+1, 0, page_size - sizeof(*set->page));
1908 set->page->min = page_size;
1909 set->page->garbage = 0;
1915 // assemble page of smaller keys
1917 while( cnt++ < max ) {
1918 if( slotptr(frame, cnt)->dead )
1920 val = valptr(frame, cnt);
1921 set->page->min -= val->len + sizeof(BtVal);
1922 memcpy ((unsigned char *)set->page + set->page->min, val, val->len + sizeof(BtVal));
1924 key = keyptr(frame, cnt);
1925 set->page->min -= key->len + sizeof(BtKey);
1926 memcpy ((unsigned char *)set->page + set->page->min, key, key->len + sizeof(BtKey));
1928 // add librarian slot
1930 slotptr(set->page, ++idx)->off = set->page->min;
1931 slotptr(set->page, idx)->type = Librarian;
1932 slotptr(set->page, idx)->dead = 1;
1936 slotptr(set->page, ++idx)->off = set->page->min;
1937 slotptr(set->page, idx)->type = slotptr(frame, cnt)->type;
1941 set->page->right = right->latch->page_no;
1942 set->page->fence = set->page->min;
1943 set->page->cnt = idx;
1946 entry = right->latch - mgr->latchsets;
1950 // fix keys for newly split page
1951 // call with both pages pinned & locked
1952 // return unlocked and unpinned
1954 BTERR bt_splitkeys (BtMgr *mgr, BtPageSet *set, BtLatchSet *right)
1956 unsigned char leftkey[BT_keyarray], rightkey[BT_keyarray];
1957 unsigned char value[BtId];
1958 uint lvl = set->page->lvl;
1964 // if current page is the root page, split it
1966 if( set->latch->page_no == ROOT_page )
1967 return bt_splitroot (mgr, set, right);
1969 ptr = fenceptr(set->page);
1970 memcpy (leftkey, ptr, ptr->len + sizeof(BtKey));
1972 page = bt_mappage (mgr, right);
1974 ptr = fenceptr(page);
1975 memcpy (rightkey, ptr, ptr->len + sizeof(BtKey));
1977 // splice in far right page's left page_no
1979 if( right2 = page->right ) {
1980 if( temp->latch = bt_pinlatch (mgr, right2) )
1981 temp->page = bt_mappage (mgr, temp->latch);
1985 bt_lockpage(BtLockLink, temp->latch, 0, __LINE__);
1986 temp->page->left = right->page_no;
1987 bt_unlockpage(BtLockLink, temp->latch, __LINE__);
1988 bt_unpinlatch (temp->latch);
1989 } else if( !lvl ) { // right page is far right page
1990 bt_mutexlock (mgr->pagezero->lock);
1991 mgr->pagezero->rightleaf = right->page_no;
1992 bt_releasemutex(mgr->pagezero->lock);
1994 // insert new fences in their parent pages
1996 bt_lockpage (BtLockParent, right, 0, __LINE__);
1998 bt_lockpage (BtLockParent, set->latch, 0, __LINE__);
1999 bt_unlockpage (BtLockWrite, set->latch, __LINE__);
2001 // insert new fence for reformulated left block of smaller keys
2003 bt_putid (value, set->latch->page_no);
2004 ptr = (BtKey *)leftkey;
2006 if( bt_insertkey (mgr, ptr->key, ptr->len, lvl+1, value, BtId, Unique) )
2009 // switch fence for right block of larger keys to new right page
2011 bt_putid (value, right->page_no);
2012 ptr = (BtKey *)rightkey;
2014 if( bt_insertkey (mgr, ptr->key, ptr->len, lvl+1, value, BtId, Unique) )
2017 bt_unlockpage (BtLockParent, set->latch, __LINE__);
2018 bt_unpinlatch (set->latch);
2020 bt_unlockpage (BtLockParent, right, __LINE__);
2021 bt_unpinlatch (right);
2025 // install new key and value onto page
2026 // page must already be checked for
2029 BTERR bt_insertslot (BtMgr *mgr, BtPageSet *set, uint slot, unsigned char *key,uint keylen, unsigned char *value, uint vallen, uint type)
2031 uint idx, librarian;
2037 // if previous slot is a librarian slot, use it
2040 if( slotptr(set->page, slot-1)->type == Librarian )
2043 // copy value onto page
2045 set->page->min -= vallen + sizeof(BtVal);
2046 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2047 memcpy (val->value, value, vallen);
2050 // copy key onto page
2052 set->page->min -= keylen + sizeof(BtKey);
2053 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2054 memcpy (ptr->key, key, keylen);
2057 // find first empty slot at or above our insert slot
2059 for( idx = slot; idx < set->page->cnt; idx++ )
2060 if( slotptr(set->page, idx)->dead )
2063 // now insert key into array before slot.
2065 // if we're going all the way to the top,
2066 // add as many librarian slots as
2069 if( idx == set->page->cnt ) {
2070 int avail = 4 * set->page->min / 5 - sizeof(*set->page) - ++set->page->cnt * sizeof(BtSlot);
2072 librarian = ++idx - slot;
2073 avail /= sizeof(BtSlot);
2078 if( librarian > avail )
2082 rate = (idx - slot) / librarian;
2083 set->page->cnt += librarian;
2088 librarian = 0, rate = 0;
2090 // transfer slots and add librarian slots
2092 while( idx > slot ) {
2093 *slotptr(set->page, idx) = *slotptr(set->page, idx-librarian-1);
2095 // add librarian slot per rate
2098 if( (idx - slot)/2 <= librarian * rate ) {
2099 node = slotptr(set->page, --idx);
2100 node->off = node[1].off;
2101 node->type = Librarian;
2113 node = slotptr(set->page, slot);
2114 node->off = set->page->min;
2120 // Insert new key into the btree at given level.
2121 // either add a new key or update/add an existing one
2123 BTERR bt_insertkey (BtMgr *mgr, unsigned char *key, uint keylen, uint lvl, void *value, uint vallen, BtSlotType type)
2125 uint slot, idx, len, entry;
2131 while( 1 ) { // find the page and slot for the current key
2132 if( slot = bt_loadpage (mgr, set, key, keylen, lvl, BtLockWrite, 0) ) {
2133 node = slotptr(set->page, slot);
2134 ptr = keyptr(set->page, slot);
2138 // if librarian slot == found slot, advance to real slot
2140 if( node->type == Librarian ) {
2141 node = slotptr(set->page, ++slot);
2142 ptr = keyptr(set->page, slot);
2145 // if inserting a duplicate key or unique
2146 // key that doesn't exist on the page,
2147 // check for adequate space on the page
2148 // and insert the new key before slot.
2153 if( !keycmp (ptr, key, keylen) )
2156 if( slot = bt_cleanpage (mgr, set, keylen, slot, vallen) )
2157 if( bt_insertslot (mgr, set, slot, key, keylen, value, vallen, type) )
2162 if( entry = bt_splitpage (mgr, set, 1) )
2163 if( !bt_splitkeys (mgr, set, entry + mgr->latchsets) )
2169 if( keycmp (ptr, key, keylen) )
2175 // if key already exists, update value and return
2177 val = valptr(set->page, slot);
2179 if( val->len >= vallen ) {
2185 set->page->garbage += val->len - vallen;
2187 memcpy (val->value, value, vallen);
2191 // new update value doesn't fit in existing value area
2192 // make sure page has room
2195 set->page->garbage += val->len + ptr->len + sizeof(BtKey) + sizeof(BtVal);
2202 if( slot = bt_cleanpage (mgr, set, keylen, slot, vallen) )
2205 if( entry = bt_splitpage (mgr, set, 1) )
2206 if( !bt_splitkeys (mgr, set, entry + mgr->latchsets) )
2212 // copy key and value onto page and update slot
2214 set->page->min -= vallen + sizeof(BtVal);
2215 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2216 memcpy (val->value, value, vallen);
2219 set->page->min -= keylen + sizeof(BtKey);
2220 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2221 memcpy (ptr->key, key, keylen);
2224 slotptr(set->page,slot)->off = set->page->min;
2227 bt_unlockpage(BtLockWrite, set->latch, __LINE__);
2228 bt_unpinlatch (set->latch);
2232 // determine actual page where key is located
2233 // return slot number
2235 uint bt_atomicpage (BtMgr *mgr, BtPage source, AtomicTxn *locks, uint idx, BtPageSet *set)
2237 BtKey *key = keyptr(source,locks[idx].src), *ptr;
2238 uint slot = locks[idx].slot;
2241 if( locks[idx].reuse )
2242 entry = locks[idx-1].entry;
2244 entry = locks[idx].entry;
2247 set->latch = mgr->latchsets + entry;
2248 set->page = bt_mappage (mgr, set->latch);
2252 // find where our key is located
2253 // on current page or pages split on
2254 // same page txn operations.
2257 set->latch = mgr->latchsets + entry;
2258 set->page = bt_mappage (mgr, set->latch);
2260 if( slot = bt_findslot(set->page, key->key, key->len) ) {
2261 if( slotptr(set->page, slot)->type == Librarian )
2263 if( locks[idx].reuse )
2264 locks[idx].entry = entry;
2267 } while( entry = set->latch->split );
2269 mgr->line = __LINE__, mgr->err = BTERR_atomic;
2273 BTERR bt_atomicinsert (BtMgr *mgr, BtPage source, AtomicTxn *locks, uint idx)
2275 BtKey *key = keyptr(source, locks[idx].src);
2276 BtVal *val = valptr(source, locks[idx].src);
2281 while( slot = bt_atomicpage (mgr, source, locks, idx, set) ) {
2282 if( slot = bt_cleanpage(mgr, set, key->len, slot, val->len) ) {
2283 if( bt_insertslot (mgr, set, slot, key->key, key->len, val->value, val->len, slotptr(source,locks[idx].src)->type) )
2291 if( entry = bt_splitpage (mgr, set, 0) )
2292 latch = mgr->latchsets + entry;
2296 // splice right page into split chain
2299 bt_lockpage(BtLockWrite, latch, 0, __LINE__);
2300 latch->split = set->latch->split;
2301 set->latch->split = entry;
2303 // clear slot number for atomic page
2305 locks[idx].slot = 0;
2308 return mgr->line = __LINE__, mgr->err = BTERR_atomic;
2311 // perform delete from smaller btree
2312 // insert a delete slot if not found there
2314 BTERR bt_atomicdelete (BtMgr *mgr, BtPage source, AtomicTxn *locks, uint idx)
2316 BtKey *key = keyptr(source, locks[idx].src);
2324 while( slot = bt_atomicpage (mgr, source, locks, idx, set) ) {
2325 node = slotptr(set->page, slot);
2326 ptr = keyptr(set->page, slot);
2327 val = valptr(set->page, slot);
2329 // if slot is not found on cache btree, insert a delete slot
2330 // otherwise ignore the request.
2332 if( keycmp (ptr, key->key, key->len) )
2334 if( slot = bt_cleanpage(mgr, set, key->len, slot, 0) )
2335 return bt_insertslot (mgr, set, slot, key->key, key->len, NULL, 0, Delete);
2336 else { // split page before inserting Delete slot
2337 if( entry = bt_splitpage (mgr, set, 0) )
2338 latch = mgr->latchsets + entry;
2342 // splice right page into split chain
2345 bt_lockpage(BtLockWrite, latch, 0, __LINE__);
2346 latch->split = set->latch->split;
2347 set->latch->split = entry;
2349 // clear slot number for atomic page
2351 locks[idx].slot = 0;
2357 // if node is already dead,
2358 // ignore the request.
2360 if( node->type == Delete || node->dead )
2363 // if main LSM btree, delete the slot
2364 // else change to delete type.
2370 node->type = Delete;
2372 __sync_fetch_and_add(&mgr->found, 1);
2376 return mgr->line = __LINE__, mgr->err = BTERR_struct;
2379 // release master's splits from right to left
2381 void bt_atomicrelease (BtMgr *mgr, uint entry)
2383 BtLatchSet *latch = mgr->latchsets + entry;
2386 bt_atomicrelease (mgr, latch->split);
2389 bt_unlockpage(BtLockWrite, latch, __LINE__);
2390 bt_unpinlatch(latch);
2393 int qsortcmp (BtSlot *slot1, BtSlot *slot2, BtPage page)
2395 BtKey *key1 = (BtKey *)((char *)page + slot1->off);
2396 BtKey *key2 = (BtKey *)((char *)page + slot2->off);
2398 return keycmp (key1, key2->key, key2->len);
2400 // atomic modification of a batch of keys.
2402 BTERR bt_atomictxn (BtDb *bt, BtPage source)
2404 uint src, idx, slot, samepage, entry, que = 0;
2405 BtKey *key, *ptr, *key2;
2410 // stable sort the list of keys into order to
2411 // prevent deadlocks between threads.
2413 for( src = 1; src++ < source->cnt; ) {
2414 *temp = *slotptr(source,src);
2415 key = keyptr (source,src);
2417 for( idx = src; --idx; ) {
2418 key2 = keyptr (source,idx);
2419 if( keycmp (key, key2->key, key2->len) < 0 ) {
2420 *slotptr(source,idx+1) = *slotptr(source,idx);
2421 *slotptr(source,idx) = *temp;
2427 qsort_r (slotptr(source,1), source->cnt, sizeof(BtSlot), (__compar_d_fn_t)qsortcmp, source);
2429 // perform the individual actions in the transaction
2431 if( bt_atomicexec (bt->mgr, source, source->cnt, bt->tid) )
2432 return bt->mgr->err;
2434 // if number of active pages
2435 // is greater than the buffer pool
2436 // promote page into larger btree
2439 if( bt->mgr->pagezero->leafpages > bt->mgr->maxleaves )
2440 if( bt_promote (bt) )
2441 return bt->mgr->err;
2448 // execute the source list of inserts/deletes
2450 BTERR bt_atomicexec(BtMgr *mgr, BtPage source, uint count, pid_t tid)
2452 uint slot, src, idx, samepage, entry, outidx;
2453 BtPageSet set[1], prev[1];
2454 unsigned char value[BtId];
2462 locks = calloc (count, sizeof(AtomicTxn));
2463 memset (set, 0, sizeof(BtPageSet));
2466 // Load the leaf page for each key
2467 // group same page references with reuse bit
2469 for( src = 0; src++ < count; ) {
2470 if( slotptr(source,src)->dead )
2473 key = keyptr(source, src);
2475 // first determine if this modification falls
2476 // on the same page as the previous modification
2477 // note that the far right leaf page is a special case
2479 if( samepage = !!set->page )
2480 samepage = !set->page->right || keycmp (ptr, key->key, key->len) >= 0;
2483 if( slot = bt_loadpage(mgr, set, key->key, key->len, 0, BtLockWrite, tid) )
2484 ptr = fenceptr(set->page), set->latch->split = 0;
2491 if( slotptr(set->page, slot)->type == Librarian )
2494 entry = set->latch - mgr->latchsets;
2495 locks[outidx].reuse = samepage;
2496 locks[outidx].entry = entry;
2497 locks[outidx].slot = slot;
2498 locks[outidx].src = src;
2502 // insert or delete each key
2503 // process any splits or merges
2504 // run through txn list backwards
2508 for( src = outidx; src--; ) {
2509 if( locks[src].reuse )
2512 // perform the txn operations
2513 // from smaller to larger on
2516 for( idx = src; idx < samepage; idx++ )
2517 switch( slotptr(source,locks[idx].src)->type ) {
2519 if( bt_atomicdelete (mgr, source, locks, idx) )
2525 if( bt_atomicinsert (mgr, source, locks, idx) )
2530 bt_atomicpage (mgr, source, locks, idx, set);
2534 // after the same page operations have finished,
2535 // process master page for splits or deletion.
2537 latch = prev->latch = mgr->latchsets + locks[src].entry;
2538 prev->page = bt_mappage (mgr, prev->latch);
2541 // pick-up all splits from master page
2542 // each one is already pinned & WriteLocked.
2544 while( entry = prev->latch->split ) {
2545 set->latch = mgr->latchsets + entry;
2546 set->page = bt_mappage (mgr, set->latch);
2548 // delete empty master page by undoing its split
2549 // (this is potentially another empty page)
2550 // note that there are no pointers to it yet
2552 if( !prev->page->act ) {
2553 set->page->left = prev->page->left;
2554 memcpy (prev->page, set->page, mgr->page_size << mgr->leaf_xtra);
2555 bt_lockpage (BtLockDelete, set->latch, 0, __LINE__);
2556 bt_lockpage (BtLockLink, set->latch, 0, __LINE__);
2557 prev->latch->split = set->latch->split;
2558 bt_freepage (mgr, set);
2562 // remove empty split page from the split chain
2563 // and return it to the free list. No other
2564 // thread has its page number yet.
2566 if( !set->page->act ) {
2567 prev->page->right = set->page->right;
2568 prev->latch->split = set->latch->split;
2570 bt_lockpage (BtLockDelete, set->latch, 0, __LINE__);
2571 bt_lockpage (BtLockLink, set->latch, 0, __LINE__);
2572 bt_freepage (mgr, set);
2576 // update prev's fence key
2578 ptr = fenceptr(prev->page);
2579 bt_putid (value, prev->latch->page_no);
2581 if( bt_insertkey (mgr, ptr->key, ptr->len, 1, value, BtId, Unique) )
2584 // splice in the left link into the split page
2586 set->page->left = prev->latch->page_no;
2590 // update left pointer in next right page from last split page
2591 // (if all splits were reversed or none occurred, latch->split == 0)
2593 if( latch->split ) {
2594 // fix left pointer in master's original (now split)
2595 // far right sibling or set rightmost page in page zero
2597 if( right_page_no = prev->page->right ) {
2598 if( set->latch = bt_pinlatch (mgr, right_page_no) )
2599 set->page = bt_mappage (mgr, set->latch);
2603 bt_lockpage (BtLockLink, set->latch, 0, __LINE__);
2604 set->page->left = prev->latch->page_no;
2605 bt_unlockpage (BtLockLink, set->latch, __LINE__);
2606 bt_unpinlatch (set->latch);
2607 } else { // prev is rightmost page
2608 bt_mutexlock (mgr->pagezero->lock);
2609 mgr->pagezero->rightleaf = prev->latch->page_no;
2610 bt_releasemutex(mgr->pagezero->lock);
2613 // switch the original fence key from the
2614 // master page to the last split page.
2616 ptr = fenceptr(prev->page);
2617 bt_putid (value, prev->latch->page_no);
2619 if( bt_insertkey (mgr, ptr->key, ptr->len, 1, value, BtId, Update) )
2622 // unlock and unpin the split pages
2624 bt_atomicrelease (mgr, latch->split);
2626 // unlock and unpin the master page
2629 bt_unlockpage(BtLockWrite, latch, __LINE__);
2630 bt_unpinlatch(latch);
2634 // since there are no splits remaining, we're
2635 // finished if master page occupied
2637 if( prev->page->act ) {
2638 bt_unlockpage(BtLockWrite, prev->latch, __LINE__);
2639 bt_unpinlatch(prev->latch);
2643 // any and all splits were reversed, and the
2644 // master page located in prev is empty, delete it
2646 if( bt_deletepage (mgr, prev, 0) )
2652 for( idx = 0; idx++ < count; ) {
2653 if( slotptr(source,idx)->dead )
2656 slotptr(source,idx)->dead = 1;
2664 // pick & promote a page into the larger btree
2666 BTERR bt_promote (BtDb *bt)
2675 bt_mutexlock(bt->mgr->pagezero->promote);
2678 if( bt->mgr->pagezero->leafpromote < bt->mgr->pagezero->allocpage )
2679 page_no = bt->mgr->pagezero->leafpromote;
2681 page_no = bt->mgr->pagezero->leaf_page;
2683 bt->mgr->pagezero->leafpromote = page_no + (1 << bt->mgr->leaf_xtra);
2685 if( page_no < bt->mgr->pagezero->leaf_page )
2688 if( set->latch = bt_pinlatch (bt->mgr, page_no) )
2689 set->page = bt_mappage (bt->mgr,set->latch);
2691 // skip upper level pages
2693 if( set->page->lvl ) {
2695 bt_releasemutex(set->latch->modify);
2699 if( !bt_mutextry(set->latch->modify) ) {
2701 bt_releasemutex(set->latch->modify);
2705 // skip this page if it was pinned
2707 if( set->latch->pin > 1 ) {
2709 bt_releasemutex(set->latch->modify);
2713 // page has no right sibling
2715 if( !set->page->right ) {
2717 bt_releasemutex(set->latch->modify);
2721 // page is being killed or constructed
2723 if( set->page->nopromote || set->page->kill ) {
2725 bt_releasemutex(set->latch->modify);
2729 // leave it locked for the
2730 // duration of the promotion.
2732 bt_releasemutex(bt->mgr->pagezero->promote);
2733 bt_lockpage (BtLockWrite, set->latch, 0, __LINE__);
2734 bt_releasemutex(set->latch->modify);
2736 // transfer slots in our selected page to the main btree
2738 if( !((page_no>>bt->mgr->leaf_xtra)%100) )
2739 fprintf(stderr, "Promote page %lld, %d keys\n", page_no, set->page->act);
2741 if( bt_atomicexec (bt->main, set->page, set->page->cnt, bt->tid) ) {
2742 fprintf (stderr, "Promote error = %d line = %d\n", bt->main->err, bt->main->line);
2743 return bt->main->err;
2746 // now delete the page
2748 if( bt_deletepage (bt->mgr, set, 0) )
2749 fprintf (stderr, "Promote: delete page err = %d\n", bt->mgr->err);
2751 return bt->mgr->err;
2755 // find unique key == given key, or first duplicate key in
2756 // leaf level and return number of value bytes
2757 // or (-1) if not found.
2759 int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax)
2768 for( type = 0; type < 2; type++ )
2769 if( slot = bt_loadpage (type ? bt->main : bt->mgr, set, key, keylen, 0, BtLockRead, 0) ) {
2770 node = slotptr(set->page, slot);
2772 // skip librarian slot place holder
2774 if( node->type == Librarian )
2775 node = slotptr(set->page, ++slot);
2777 ptr = keyptr(set->page, slot);
2779 // not there if we reach the stopper key
2780 // or the key doesn't match what's on the page.
2782 if( slot == set->page->cnt )
2783 if( !set->page->right ) {
2784 bt_unlockpage (BtLockRead, set->latch, __LINE__);
2785 bt_unpinlatch (set->latch);
2789 if( keycmp (ptr, key, keylen) ) {
2790 bt_unlockpage (BtLockRead, set->latch, __LINE__);
2791 bt_unpinlatch (set->latch);
2795 // key matches, return >= 0 value bytes copied
2796 // or -1 if not there.
2798 if( node->type == Delete || node->dead ) {
2803 val = valptr (set->page,slot);
2805 if( valmax > val->len )
2808 memcpy (value, val->value, valmax);
2817 bt_unlockpage (BtLockRead, set->latch, __LINE__);
2818 bt_unpinlatch (set->latch);
2823 // set cursor to highest slot on right-most page
2825 BTERR bt_lastkey (BtDb *bt)
2827 uid cache_page_no = bt->mgr->pagezero->rightleaf;
2828 uid main_page_no = bt->main->pagezero->rightleaf;
2830 if( bt->cacheset->latch = bt_pinlatch (bt->mgr, cache_page_no) )
2831 bt->cacheset->page = bt_mappage (bt->mgr, bt->cacheset->latch);
2833 return bt->mgr->err;
2835 bt_lockpage(BtLockRead, bt->cacheset->latch, 0, __LINE__);
2836 bt->cacheslot = bt->cacheset->page->cnt;
2838 if( bt->mainset->latch = bt_pinlatch (bt->main, main_page_no) )
2839 bt->mainset->page = bt_mappage (bt->main, bt->mainset->latch);
2841 return bt->main->err;
2843 bt_lockpage(BtLockRead, bt->mainset->latch, 0, __LINE__);
2844 bt->mainslot = bt->mainset->page->cnt;
2849 // return previous slot on cursor page
2851 uint bt_prevslot (BtMgr *mgr, BtPageSet *set, uint slot)
2853 uid next, us = set->latch->page_no;
2857 if( slotptr(set->page, slot)->dead )
2862 next = set->page->left;
2868 bt_unlockpage(BtLockRead, set->latch, __LINE__);
2869 bt_unpinlatch (set->latch);
2871 if( set->latch = bt_pinlatch (mgr, next) )
2872 set->page = bt_mappage (mgr, set->latch);
2876 bt_lockpage(BtLockRead, set->latch, 0, __LINE__);
2877 next = set->page->right;
2879 } while( next != us );
2881 slot = set->page->cnt + 1;
2885 // advance to previous key
2887 BTERR bt_prevkey (BtDb *bt)
2891 // first advance last key(s) one previous slot
2894 switch( bt->phase ) {
2896 bt->cacheslot = bt_prevslot (bt->mgr, bt->cacheset, bt->cacheslot);
2899 bt->mainslot = bt_prevslot (bt->main, bt->mainset, bt->mainslot);
2902 bt->cacheslot = bt_prevslot (bt->mgr, bt->cacheset, bt->cacheslot);
2903 bt->mainslot = bt_prevslot (bt->main, bt->mainset, bt->mainslot);
2909 if( bt->cacheslot ) {
2910 bt->cachenode = slotptr(bt->cacheset->page, bt->cacheslot);
2911 bt->cachekey = keyptr(bt->cacheset->page, bt->cacheslot);
2912 bt->cacheval = valptr(bt->cacheset->page, bt->cacheslot);
2915 if( bt->mainslot ) {
2916 bt->mainnode = slotptr(bt->mainset->page, bt->mainslot);
2917 bt->mainkey = keyptr(bt->mainset->page, bt->mainslot);
2918 bt->mainval = valptr(bt->mainset->page, bt->mainslot);
2921 if( bt->mainslot && bt->cacheslot )
2922 cmp = keycmp (bt->cachekey, bt->mainkey->key, bt->mainkey->len);
2923 else if( bt->cacheslot )
2925 else if( bt->mainslot )
2930 // cache key is larger
2934 if( bt->cachenode->type == Delete )
2936 return bt->cacheslot;
2939 // main key is larger
2943 return bt->mainslot;
2950 if( bt->cachenode->type == Delete )
2953 return bt->cacheslot;
2957 // advance to next slot in cache or main btree
2958 // return 0 for EOF/error
2960 uint bt_nextslot (BtMgr *mgr, BtPageSet *set, uint slot)
2966 while( slot++ < set->page->cnt )
2967 if( slotptr(set->page, slot)->dead )
2969 else if( slot < set->page->cnt || set->page->right )
2974 bt_unlockpage(BtLockRead, set->latch, __LINE__);
2975 bt_unpinlatch (set->latch);
2977 if( page_no = set->page->right )
2978 if( set->latch = bt_pinlatch (mgr, page_no) )
2979 set->page = bt_mappage (mgr, set->latch);
2985 // obtain access lock using lock chaining with Access mode
2987 bt_lockpage(BtLockAccess, set->latch, 0, __LINE__);
2988 bt_lockpage(BtLockRead, set->latch, 0, __LINE__);
2989 bt_unlockpage(BtLockAccess, set->latch, __LINE__);
2994 // advance to next key
2996 BTERR bt_nextkey (BtDb *bt)
3000 // first advance last key(s) one next slot
3003 switch( bt->phase ) {
3005 bt->cacheslot = bt_nextslot (bt->mgr, bt->cacheset, bt->cacheslot);
3008 bt->mainslot = bt_nextslot (bt->main, bt->mainset, bt->mainslot);
3011 bt->cacheslot = bt_nextslot (bt->mgr, bt->cacheset, bt->cacheslot);
3012 bt->mainslot = bt_nextslot (bt->main, bt->mainset, bt->mainslot);
3018 if( bt->cacheslot ) {
3019 bt->cachenode = slotptr(bt->cacheset->page, bt->cacheslot);
3020 bt->cachekey = keyptr(bt->cacheset->page, bt->cacheslot);
3021 bt->cacheval = valptr(bt->cacheset->page, bt->cacheslot);
3024 if( bt->mainslot ) {
3025 bt->mainnode = slotptr(bt->mainset->page, bt->mainslot);
3026 bt->mainkey = keyptr(bt->mainset->page, bt->mainslot);
3027 bt->mainval = valptr(bt->mainset->page, bt->mainslot);
3030 if( bt->mainslot && bt->cacheslot )
3031 cmp = keycmp (bt->cachekey, bt->mainkey->key, bt->mainkey->len);
3032 else if( bt->mainslot )
3034 else if( bt->cacheslot )
3039 // main key is larger
3040 // return smaller key
3044 if( bt->cachenode->type == Delete )
3046 return bt->cacheslot;
3049 // cache key is larger
3053 return bt->mainslot;
3060 if( bt->cachenode->type == Delete )
3063 return bt->cacheslot;
3067 // start sweep of keys
3069 BTERR bt_startkey (BtDb *bt, unsigned char *key, uint len)
3076 if( slot = bt_loadpage (bt->mgr, bt->cacheset, key, len, 0, BtLockRead, 0) )
3077 bt->cacheslot = slot - 1;
3079 return bt->mgr->err;
3083 if( slot = bt_loadpage (bt->main, bt->mainset, key, len, 0, BtLockRead, 0) )
3084 bt->mainslot = slot - 1;
3086 return bt->mgr->err;
3092 // flush cache pages to main btree
3094 BTERR bt_flushmain (BtDb *bt)
3096 uint count, cnt = 0;
3099 while( bt->mgr->pagezero->leafpages > 0 ) {
3100 if( set->latch = bt_pinlatch (bt->mgr, bt->mgr->pagezero->leaf_page) )
3101 set->page = bt_mappage (bt->mgr, set->latch);
3103 return bt->mgr->err;
3105 bt_lockpage(BtLockWrite, set->latch, 0, __LINE__);
3106 count = set->page->cnt;
3108 if( !set->page->right )
3111 if( !(cnt++ % 100) )
3112 fprintf(stderr, "Promote LEAF_page %d with %d keys\n", cnt, set->page->act);
3114 if( bt_atomicexec (bt->main, set->page, count, bt->tid) )
3115 return bt->mgr->line = bt->main->line, bt->mgr->err = bt->main->err;
3117 if( set->page->right )
3118 if( bt_deletepage (bt->mgr, set, 0) )
3119 return bt->mgr->err;
3123 bt_unlockpage(BtLockWrite, set->latch, __LINE__);
3124 bt_unpinlatch (set->latch);
3128 // leaf page count is off
3130 bt->mgr->line = __LINE__;
3131 return bt->mgr->err = BTERR_ovflw;
3137 double getCpuTime(int type)
3140 FILETIME xittime[1];
3141 FILETIME systime[1];
3142 FILETIME usrtime[1];
3143 SYSTEMTIME timeconv[1];
3146 memset (timeconv, 0, sizeof(SYSTEMTIME));
3150 GetSystemTimeAsFileTime (xittime);
3151 FileTimeToSystemTime (xittime, timeconv);
3152 ans = (double)timeconv->wDayOfWeek * 3600 * 24;
3155 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
3156 FileTimeToSystemTime (usrtime, timeconv);
3159 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
3160 FileTimeToSystemTime (systime, timeconv);
3164 ans += (double)timeconv->wHour * 3600;
3165 ans += (double)timeconv->wMinute * 60;
3166 ans += (double)timeconv->wSecond;
3167 ans += (double)timeconv->wMilliseconds / 1000;
3172 #include <sys/resource.h>
3174 double getCpuTime(int type)
3176 struct rusage used[1];
3177 struct timeval tv[1];
3181 gettimeofday(tv, NULL);
3182 return (double)tv->tv_sec + (double)tv->tv_usec / 1000000;
3185 getrusage(RUSAGE_SELF, used);
3186 return (double)used->ru_utime.tv_sec + (double)used->ru_utime.tv_usec / 1000000;
3189 getrusage(RUSAGE_SELF, used);
3190 return (double)used->ru_stime.tv_sec + (double)used->ru_stime.tv_usec / 1000000;
3197 void bt_poolaudit (BtMgr *mgr, char *type)
3199 BtLatchSet *latch, test[1];
3202 memset (test, 0, sizeof(test));
3204 if( memcmp (test, mgr->latchsets, sizeof(test)) )
3205 fprintf(stderr, "%s latchset zero overwritten\n", type);
3207 for( entry = 0; ++entry < mgr->pagezero->latchtotal; ) {
3208 latch = mgr->latchsets + entry;
3210 if( *latch->modify->value )
3211 fprintf(stderr, "%s latchset %d modifylocked for page %lld\n", type, entry, latch->page_no);
3214 fprintf(stderr, "%s latchset %d pinned %d times for page %lld\n", type, entry, latch->pin, latch->page_no);
3227 // standalone program to index file of keys
3228 // then list them onto std-out
3231 void *index_file (void *arg)
3233 uint __stdcall index_file (void *arg)
3236 int line = 0, found = 0, cnt = 0, cachecnt, idx;
3237 int ch, len = 0, slot, type = 0;
3238 unsigned char key[BT_maxkey];
3239 unsigned char buff[65536];
3240 uint nxt = sizeof(buff);
3241 ThreadArg *args = arg;
3253 bt = bt_open (args->mgr, args->main);
3254 page = (BtPage)buff;
3256 if( args->idx < strlen (args->type) )
3257 ch = args->type[args->idx];
3259 ch = args->type[strlen(args->type) - 1];
3264 bt_poolaudit(bt->mgr, "cache");
3265 bt_poolaudit(bt->main, "main");
3269 fprintf(stderr, "started flushing cache to main btree\n");
3272 if( bt_flushmain(bt) )
3273 fprintf(stderr, "Error %d Line: %d\n", bt->mgr->err, bt->mgr->line), exit(0);
3285 if( type == Delete )
3286 fprintf(stderr, "started TXN pennysort delete for %s\n", args->infile);
3288 fprintf(stderr, "started TXN pennysort insert for %s\n", args->infile);
3290 if( type == Delete )
3291 fprintf(stderr, "started pennysort delete for %s\n", args->infile);
3293 fprintf(stderr, "started pennysort insert for %s\n", args->infile);
3295 if( in = fopen (args->infile, "rb") )
3296 while( ch = getc(in), ch != EOF )
3302 if( bt_insertkey (bt->mgr, key, 10, 0, key + 10, len - 10, Unique) )
3303 fprintf(stderr, "Error %d Line: %d source: %d\n", bt->mgr->err, bt->mgr->line, line), exit(0);
3309 memcpy (buff + nxt, key + 10, len - 10);
3311 buff[nxt] = len - 10;
3313 memcpy (buff + nxt, key, 10);
3316 slotptr(page,++cnt)->off = nxt;
3317 slotptr(page,cnt)->type = type;
3318 slotptr(page,cnt)->dead = 0;
3321 if( cnt < args->num )
3328 if( bt_atomictxn (bt, page) )
3329 fprintf(stderr, "Error %d Line: %d source: %d\n", bt->mgr->err, bt->mgr->line, line), exit(0);
3334 else if( len < BT_maxkey )
3336 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
3340 fprintf(stderr, "started indexing for %s\n", args->infile);
3341 if( in = fopen (args->infile, "r") )
3342 while( ch = getc(in), ch != EOF )
3347 if( bt_insertkey (bt->mgr, key, len, 0, NULL, 0, Unique) )
3348 fprintf(stderr, "Error %d Line: %d source: %d\n", bt->mgr->err, bt->mgr->line, line), exit(0);
3351 else if( len < BT_maxkey )
3353 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
3357 fprintf(stderr, "started finding keys for %s\n", args->infile);
3358 if( in = fopen (args->infile, "rb") )
3359 while( ch = getc(in), ch != EOF )
3363 if( bt_findkey (bt, key, len, NULL, 0) == 0 )
3365 else if( bt->mgr->err )
3366 fprintf(stderr, "Error %d Syserr %d Line: %d source: %d\n", bt->mgr->err, errno, bt->mgr->line, line), exit(0);
3369 else if( len < BT_maxkey )
3371 fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found);
3375 fprintf(stderr, "started forward scan\n");
3376 if( bt_startkey (bt, NULL, 0) )
3377 fprintf(stderr, "unable to begin scan error %d Line: %d\n", bt->mgr->err, bt->mgr->line);
3379 while( bt_nextkey (bt) ) {
3380 if( bt->phase == 1 ) {
3381 len = bt->mainkey->len;
3383 if( bt->mainnode->type == Duplicate )
3386 fwrite (bt->mainkey->key, len, 1, stdout);
3387 fwrite (bt->mainval->value, bt->mainval->len, 1, stdout);
3389 len = bt->cachekey->len;
3391 if( bt->cachenode->type == Duplicate )
3394 fwrite (bt->cachekey->key, len, 1, stdout);
3395 fwrite (bt->cacheval->value, bt->cacheval->len, 1, stdout);
3398 fputc ('\n', stdout);
3402 bt_unlockpage(BtLockRead, bt->cacheset->latch, __LINE__);
3403 bt_unpinlatch (bt->cacheset->latch);
3405 bt_unlockpage(BtLockRead, bt->mainset->latch, __LINE__);
3406 bt_unpinlatch (bt->mainset->latch);
3408 fprintf(stderr, " Total keys read %d\n", cnt);
3412 fprintf(stderr, "started reverse scan\n");
3413 if( bt_lastkey (bt) )
3414 fprintf(stderr, "unable to begin scan error %d Line: %d\n", bt->mgr->err, bt->mgr->line);
3416 while( bt_prevkey (bt) ) {
3417 if( bt->phase == 1 ) {
3418 len = bt->mainkey->len;
3420 if( bt->mainnode->type == Duplicate )
3423 fwrite (bt->mainkey->key, len, 1, stdout);
3424 fwrite (bt->mainval->value, bt->mainval->len, 1, stdout);
3426 len = bt->cachekey->len;
3428 if( bt->cachenode->type == Duplicate )
3431 fwrite (bt->cachekey->key, len, 1, stdout);
3432 fwrite (bt->cacheval->value, bt->cacheval->len, 1, stdout);
3435 fputc ('\n', stdout);
3439 bt_unlockpage(BtLockRead, bt->cacheset->latch, __LINE__);
3440 bt_unpinlatch (bt->cacheset->latch);
3442 bt_unlockpage(BtLockRead, bt->mainset->latch, __LINE__);
3443 bt_unpinlatch (bt->mainset->latch);
3445 fprintf(stderr, " Total keys read %d\n", cnt);
3449 fprintf(stderr, "started counting LSM cache btree\n");
3450 memset (counts, 0, sizeof(counts));
3451 page_no = bt->mgr->pagezero->leaf_page;
3453 size = bt->mgr->page_size << bt->mgr->leaf_xtra;
3454 page = malloc(size);
3457 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
3459 while( page_no < bt->mgr->pagezero->allocpage ) {
3460 if( bt_readpage (bt->mgr, page, page_no, 0) )
3461 fprintf(stderr, "Unable to read page %lld from cache\n", page_no), exit(1);
3462 if( !page->lvl && !page->free ) {
3465 for( idx = 0; idx++ < page->cnt; ) {
3466 BtSlot *node = slotptr (page, idx);
3467 counts[node->type][node->dead]++;
3470 page_no += 1 << bt->mgr->leaf_xtra;
3473 cachecnt = --cnt; // remove stopper key
3474 counts[Unique][0]--;
3476 fprintf(stderr, " Unique : %d dead: %d\n", counts[Unique][0], counts[Unique][1]);
3477 fprintf(stderr, " Duplicates: %d dead: %d\n", counts[Duplicate][0], counts[Duplicate][1]);
3478 fprintf(stderr, " Librarian : %d dead: %d\n", counts[Librarian][0], counts[Librarian][1]);
3479 fprintf(stderr, " Deletion : %d dead: %d\n", counts[Delete][0], counts[Delete][1]);
3480 fprintf(stderr, "total cache keys count: %d\n", cachecnt);
3483 fprintf(stderr, "started counting LSM main btree\n");
3484 memset (counts, 0, sizeof(counts));
3485 size = bt->main->page_size << bt->main->leaf_xtra;
3486 page_no = bt->mgr->pagezero->leaf_page;
3487 page = malloc(size);
3491 posix_fadvise( bt->main->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
3493 while( page_no < bt->main->pagezero->allocpage ) {
3494 if( bt_readpage (bt->main, page, page_no, 0) )
3495 fprintf(stderr, "Unable to read page %lld from main\n", page_no), exit(1);
3496 if( !page->lvl && !page->free ) {
3499 for( idx = 0; idx++ < page->cnt; ) {
3500 BtSlot *node = slotptr (page, idx);
3501 counts[node->type][node->dead]++;
3504 page_no += 1 << bt->main->leaf_xtra;
3507 cnt--; // remove stopper key
3508 counts[Unique][0]--;
3510 fprintf(stderr, " Unique : %d dead: %d\n", counts[Unique][0], counts[Unique][1]);
3511 fprintf(stderr, " Duplicates: %d dead: %d\n", counts[Duplicate][0], counts[Duplicate][1]);
3512 fprintf(stderr, " Librarian : %d dead: %d\n", counts[Librarian][0], counts[Librarian][1]);
3513 fprintf(stderr, " Deletion : %d dead: %d\n", counts[Delete][0], counts[Delete][1]);
3514 fprintf(stderr, "total main keys count : %d\n", cnt);
3515 fprintf(stderr, "Total keys counted : %d\n", cnt + cachecnt);
3528 typedef struct timeval timer;
3530 int main (int argc, char **argv)
3532 int idx, cnt, len, slot, err;
3540 uint mainleafpool = 0;
3541 uint mainleafxtra = 0;
3557 fprintf (stderr, "Usage: %s idx_file main_file cmds [pagebits leafbits poolsize txnsize mainbits mainleafbits mainpool maxleaves src_file1 src_file2 ... ]\n", argv[0]);
3558 fprintf (stderr, " where idx_file is the name of the cache btree file\n");
3559 fprintf (stderr, " where main_file is the name of the main btree file\n");
3560 fprintf (stderr, " cmds is a string of (r)ev scan/(w)rite/(s)can/(d)elete/(f)ind/(p)ennysort/(c)ount/(m)ainflush/(a)udit, with a one character command for each input src_file. A command can also be given with no input file\n");
3561 fprintf (stderr, " pagebits is the page size in bits for the cache btree\n");
3562 fprintf (stderr, " leafbits is the number of xtra bits for a leaf page\n");
3563 fprintf (stderr, " poolsize is the number of latches in latch pool for the cache btree\n");
3564 fprintf (stderr, " txnsize = n to block transactions into n units, or zero for no transactions\n");
3565 fprintf (stderr, " mainbits is the page size of the main btree in bits\n");
3566 fprintf (stderr, " mainpool is the number of latches in the main latch pool\n");
3567 fprintf (stderr, " maxleaves is the threashold for LSM leaf page promotion\n");
3568 fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n");
3572 start = getCpuTime(0);
3575 bits = atoi(argv[4]);
3578 leafxtra = atoi(argv[5]);
3581 poolsize = atoi(argv[6]);
3584 num = atoi(argv[7]);
3587 mainbits = atoi(argv[8]);
3590 mainleafxtra = atoi(argv[9]);
3593 mainpool = atoi(argv[10]);
3596 maxleaves = atoi(argv[11]);
3604 threads = malloc (cnt * sizeof(pthread_t));
3606 threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE));
3608 args = malloc ((cnt + 1) * sizeof(ThreadArg));
3610 mgr = bt_mgr (argv[1], bits, leafxtra, poolsize);
3613 fprintf(stderr, "Index Open Error %s\n", argv[1]);
3616 mgr->maxleaves = maxleaves;
3620 main = bt_mgr (argv[2], mainbits, mainleafxtra, mainpool);
3623 fprintf(stderr, "Index Open Error %s\n", argv[2]);
3631 for( idx = 0; idx < cnt; idx++ ) {
3632 args[idx].infile = argv[idx + 12];
3633 args[idx].type = argv[3];
3634 args[idx].main = main;
3635 args[idx].mgr = mgr;
3636 args[idx].num = num;
3637 args[idx].idx = idx;
3639 if( err = pthread_create (threads + idx, NULL, index_file, args + idx) )
3640 fprintf(stderr, "Error creating thread %d\n", err);
3642 threads[idx] = (HANDLE)_beginthreadex(NULL, 131072, index_file, args + idx, 0, NULL);
3646 args[0].infile = argv[12];
3647 args[0].type = argv[3];
3648 args[0].main = main;
3655 // wait for termination
3659 for( idx = 0; idx < cnt; idx++ )
3660 pthread_join (threads[idx], NULL);
3663 WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
3666 for( idx = 0; idx < cnt; idx++ )
3667 CloseHandle(threads[idx]);
3669 fprintf(stderr, "cache %lld leaves %lld upper %d found\n", mgr->pagezero->leafpages, mgr->pagezero->upperpages, mgr->found);
3671 fprintf(stderr, "main %lld leaves %lld upper %d found\n", main->pagezero->leafpages, main->pagezero->upperpages, main->found);
3678 elapsed = getCpuTime(0) - start;
3679 fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
3680 elapsed = getCpuTime(1);
3681 fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
3682 elapsed = getCpuTime(2);
3683 fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
3686 BtKey *bt_fence (BtPage page)
3688 return fenceptr(page);
3691 BtKey *bt_key (BtPage page, uint slot)
3693 return keyptr(page,slot);
3696 BtSlot *bt_slot (BtPage page, uint slot)
3698 return slotptr(page,slot);