1 // btree version threadskv6 sched_yield version
2 // with reworked bt_deletekey code,
3 // phase-fair reader writer lock,
4 // librarian page split code,
5 // duplicate key management
6 // bi-directional cursors
7 // and traditional buffer pool manager
11 // author: karl malbrain, malbrain@cal.berkeley.edu
14 This work, including the source code, documentation
15 and related data, is placed into the public domain.
17 The orginal author is Karl Malbrain.
19 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
20 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
21 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
22 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
23 RESULTING FROM THE USE, MODIFICATION, OR
24 REDISTRIBUTION OF THIS SOFTWARE.
27 // Please see the project home page for documentation
28 // code.google.com/p/high-concurrency-btree
30 #define _FILE_OFFSET_BITS 64
31 #define _LARGEFILE64_SOURCE
47 #define WIN32_LEAN_AND_MEAN
61 typedef unsigned long long uid;
64 typedef unsigned long long off64_t;
65 typedef unsigned short ushort;
66 typedef unsigned int uint;
69 #define BT_ro 0x6f72 // ro
70 #define BT_rw 0x7772 // rw
72 #define BT_maxbits 24 // maximum page size in bits
73 #define BT_minbits 9 // minimum page size in bits
74 #define BT_minpage (1 << BT_minbits) // minimum page size
75 #define BT_maxpage (1 << BT_maxbits) // maximum page size
77 // BTree page number constants
78 #define ALLOC_page 0 // allocation page
79 #define ROOT_page 1 // root of the btree
80 #define LEAF_page 2 // first page of leaves
82 // Number of levels to create in a new BTree
87 There are five lock types for each node in three independent sets:
88 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
89 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
90 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
91 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
92 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
103 // definition for phase-fair reader/writer lock implementation
117 // definition for spin latch implementation
119 // exclusive is set for write access
120 // share is count of read accessors
121 // grant write lock when share == 0
123 volatile typedef struct {
134 // hash table entries
137 volatile uint slot; // Latch table entry at head of chain
138 BtSpinLatch latch[1];
141 // latch manager table structure
144 volatile uid page_no; // latch set page number
145 RWLock readwr[1]; // read/write page lock
146 RWLock access[1]; // Access Intent/Page delete
147 RWLock parent[1]; // Posting of fence key in parent
148 volatile uint next; // next entry in hash table chain
149 volatile uint prev; // prev entry in hash table chain
150 volatile ushort pin; // number of outstanding latches
151 ushort dirty:1; // page in cache is dirty
154 // Define the length of the page record numbers
158 // Page key slot definition.
160 // Keys are marked dead, but remain on the page until
161 // it cleanup is called. The fence key (highest key) for
162 // a leaf page is always present, even after cleanup.
166 // In addition to the Unique keys that occupy slots
167 // there are Librarian and Duplicate key
168 // slots occupying the key slot array.
170 // The Librarian slots are dead keys that
171 // serve as filler, available to add new Unique
172 // or Dup slots that are inserted into the B-tree.
174 // The Duplicate slots have had their key bytes extended
175 // by 6 bytes to contain a binary duplicate key uniqueifier.
184 uint off:BT_maxbits; // page offset for key start
185 uint type:3; // type of slot
186 uint dead:1; // set for deleted slot
189 // The key structure occupies space at the upper end of
190 // each page. It's a length byte followed by the key
194 unsigned char len; // this can be changed to a ushort or uint
195 unsigned char key[0];
198 // the value structure also occupies space at the upper
199 // end of the page. Each key is immediately followed by a value.
202 unsigned char len; // this can be changed to a ushort or uint
203 unsigned char value[0];
206 #define BT_maxkey 255 // maximum number of bytes in a key
207 #define BT_keyarray (BT_maxkey + sizeof(BtKey))
209 // The first part of an index page.
210 // It is immediately followed
211 // by the BtSlot array of keys.
213 // note that this structure size
214 // must be a multiple of 8 bytes
215 // in order to place dups correctly.
217 typedef struct BtPage_ {
218 uint cnt; // count of keys in page
219 uint act; // count of active keys
220 uint min; // next key offset
221 uint garbage; // page garbage in bytes
222 unsigned char bits:7; // page size in bits
223 unsigned char free:1; // page is on free chain
224 unsigned char lvl:7; // level of page
225 unsigned char kill:1; // page is being deleted
226 unsigned char left[BtId]; // page number to left
227 unsigned char filler[2]; // padding to multiple of 8
228 unsigned char right[BtId]; // page number to right
231 // The loadpage interface object
234 uid page_no; // current page number
235 BtPage page; // current page pointer
236 BtLatchSet *latch; // current page latch set
239 // structure for latch manager on ALLOC_page
242 struct BtPage_ alloc[1]; // next page_no in right ptr
243 unsigned long long dups[1]; // global duplicate key uniqueifier
244 unsigned char chain[BtId]; // head of free page_nos chain
247 // The object structure for Btree access
250 uint page_size; // page size
251 uint page_bits; // page size in bits
257 BtPageZero *pagezero; // mapped allocation page
258 BtSpinLatch lock[1]; // allocation area lite latch
259 uint latchdeployed; // highest number of latch entries deployed
260 uint nlatchpage; // number of latch pages at BT_latch
261 uint latchtotal; // number of page latch entries
262 uint latchhash; // number of latch hash table slots
263 uint latchvictim; // next latch entry to examine
264 BtHashEntry *hashtable; // the anonymous mapping buffer pool
265 BtLatchSet *latchsets; // mapped latch set from latch pages
266 unsigned char *pagepool; // mapped to the buffer pool pages
268 HANDLE halloc; // allocation handle
269 HANDLE hpool; // buffer pool handle
274 BtMgr *mgr; // buffer manager for thread
275 BtPage cursor; // cached frame for start/next (never mapped)
276 BtPage frame; // spare frame for the page split (never mapped)
277 uid cursor_page; // current cursor page number
278 unsigned char *mem; // frame, cursor, page memory buffer
279 unsigned char key[BT_keyarray]; // last found complete key
280 int found; // last delete or insert was found
281 int err; // last error
282 int reads, writes; // number of reads and writes from the btree
296 #define CLOCK_bit 0x8000
299 extern void bt_close (BtDb *bt);
300 extern BtDb *bt_open (BtMgr *mgr);
301 extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, void *value, uint vallen, uint update);
302 extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl);
303 extern int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax);
304 extern BtKey *bt_foundkey (BtDb *bt);
305 extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len);
306 extern uint bt_nextkey (BtDb *bt, uint slot);
309 extern BtMgr *bt_mgr (char *name, uint bits, uint poolsize);
310 void bt_mgrclose (BtMgr *mgr);
312 // Helper functions to return slot values
313 // from the cursor page.
315 extern BtKey *bt_key (BtDb *bt, uint slot);
316 extern BtVal *bt_val (BtDb *bt, uint slot);
318 // The page is allocated from low and hi ends.
319 // The key slots are allocated from the bottom,
320 // while the text and value of the key
321 // are allocated from the top. When the two
322 // areas meet, the page is split into two.
324 // A key consists of a length byte, two bytes of
325 // index number (0 - 65535), and up to 253 bytes
328 // Associated with each key is a value byte string
329 // containing any value desired.
331 // The b-tree root is always located at page 1.
332 // The first leaf page of level zero is always
333 // located on page 2.
335 // The b-tree pages are linked with next
336 // pointers to facilitate enumerators,
337 // and provide for concurrency.
339 // When to root page fills, it is split in two and
340 // the tree height is raised by a new root at page
341 // one with two keys.
343 // Deleted keys are marked with a dead bit until
344 // page cleanup. The fence key for a leaf node is
347 // To achieve maximum concurrency one page is locked at a time
348 // as the tree is traversed to find leaf key in question. The right
349 // page numbers are used in cases where the page is being split,
352 // Page 0 is dedicated to lock for new page extensions,
353 // and chains empty pages together for reuse. It also
354 // contains the latch manager hash table.
356 // The ParentModification lock on a node is obtained to serialize posting
357 // or changing the fence key for a node.
359 // Empty pages are chained together through the ALLOC page and reused.
361 // Access macros to address slot and key values from the page
362 // Page slots use 1 based indexing.
364 #define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1))
365 #define keyptr(page, slot) ((BtKey*)((unsigned char*)(page) + slotptr(page, slot)->off))
366 #define valptr(page, slot) ((BtVal*)(keyptr(page,slot)->key + keyptr(page,slot)->len))
368 void bt_putid(unsigned char *dest, uid id)
373 dest[i] = (unsigned char)id, id >>= 8;
376 uid bt_getid(unsigned char *src)
381 for( i = 0; i < BtId; i++ )
382 id <<= 8, id |= *src++;
387 uid bt_newdup (BtDb *bt)
390 return __sync_fetch_and_add (bt->mgr->pagezero->dups, 1) + 1;
392 return _InterlockedIncrement64(bt->mgr->pagezero->dups, 1);
396 // Phase-Fair reader/writer lock implementation
398 void WriteLock (RWLock *lock)
403 tix = __sync_fetch_and_add (lock->ticket, 1);
405 tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
407 // wait for our ticket to come up
409 while( tix != lock->serving[0] )
416 w = PRES | (tix & PHID);
418 r = __sync_fetch_and_add (lock->rin, w);
420 r = _InterlockedExchangeAdd16 (lock->rin, w);
422 while( r != *lock->rout )
430 void WriteRelease (RWLock *lock)
433 __sync_fetch_and_and (lock->rin, ~MASK);
435 _InterlockedAnd16 (lock->rin, ~MASK);
440 void ReadLock (RWLock *lock)
444 w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
446 w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
449 while( w == (*lock->rin & MASK) )
457 void ReadRelease (RWLock *lock)
460 __sync_fetch_and_add (lock->rout, RINC);
462 _InterlockedExchangeAdd16 (lock->rout, RINC);
466 // Spin Latch Manager
468 // wait until write lock mode is clear
469 // and add 1 to the share count
471 void bt_spinreadlock(BtSpinLatch *latch)
477 prev = __sync_fetch_and_add ((ushort *)latch, SHARE);
479 prev = _InterlockedExchangeAdd16((ushort *)latch, SHARE);
481 // see if exclusive request is granted or pending
486 prev = __sync_fetch_and_add ((ushort *)latch, -SHARE);
488 prev = _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
491 } while( sched_yield(), 1 );
493 } while( SwitchToThread(), 1 );
497 // wait for other read and write latches to relinquish
499 void bt_spinwritelock(BtSpinLatch *latch)
505 prev = __sync_fetch_and_or((ushort *)latch, PEND | XCL);
507 prev = _InterlockedOr16((ushort *)latch, PEND | XCL);
510 if( !(prev & ~BOTH) )
514 __sync_fetch_and_and ((ushort *)latch, ~XCL);
516 _InterlockedAnd16((ushort *)latch, ~XCL);
519 } while( sched_yield(), 1 );
521 } while( SwitchToThread(), 1 );
525 // try to obtain write lock
527 // return 1 if obtained,
530 int bt_spinwritetry(BtSpinLatch *latch)
535 prev = __sync_fetch_and_or((ushort *)latch, XCL);
537 prev = _InterlockedOr16((ushort *)latch, XCL);
539 // take write access if all bits are clear
542 if( !(prev & ~BOTH) )
546 __sync_fetch_and_and ((ushort *)latch, ~XCL);
548 _InterlockedAnd16((ushort *)latch, ~XCL);
555 void bt_spinreleasewrite(BtSpinLatch *latch)
558 __sync_fetch_and_and((ushort *)latch, ~BOTH);
560 _InterlockedAnd16((ushort *)latch, ~BOTH);
564 // decrement reader count
566 void bt_spinreleaseread(BtSpinLatch *latch)
569 __sync_fetch_and_add((ushort *)latch, -SHARE);
571 _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
575 // read page from permanent location in Btree file
577 BTERR bt_readpage (BtMgr *mgr, BtPage page, uid page_no)
579 off64_t off = page_no << mgr->page_bits;
582 if( pread (mgr->idx, page, mgr->page_size, page_no << mgr->page_bits) < mgr->page_size ) {
583 fprintf (stderr, "Unable to read page %.8x errno = %d\n", page_no, errno);
590 memset (ovl, 0, sizeof(OVERLAPPED));
592 ovl->OffsetHigh = off >> 32;
594 if( !ReadFile(mgr->idx, page, mgr->page_size, amt, ovl)) {
595 fprintf (stderr, "Unable to read page %.8x GetLastError = %d\n", page_no, GetLastError());
598 if( *amt < mgr->page_size ) {
599 fprintf (stderr, "Unable to read page %.8x GetLastError = %d\n", page_no, GetLastError());
606 // write page to permanent location in Btree file
607 // clear the dirty bit
609 BTERR bt_writepage (BtMgr *mgr, BtPage page, uid page_no)
611 off64_t off = page_no << mgr->page_bits;
614 if( pwrite(mgr->idx, page, mgr->page_size, off) < mgr->page_size )
620 memset (ovl, 0, sizeof(OVERLAPPED));
622 ovl->OffsetHigh = off >> 32;
624 if( !WriteFile(mgr->idx, page, mgr->page_size, amt, ovl) )
627 if( *amt < mgr->page_size )
633 // link latch table entry into head of latch hash table
635 BTERR bt_latchlink (BtDb *bt, uint hashidx, uint slot, uid page_no, uint loadit)
637 BtPage page = (BtPage)((uid)slot * bt->mgr->page_size + bt->mgr->pagepool);
638 BtLatchSet *latch = bt->mgr->latchsets + slot;
640 if( latch->next = bt->mgr->hashtable[hashidx].slot )
641 bt->mgr->latchsets[latch->next].prev = slot;
643 bt->mgr->hashtable[hashidx].slot = slot;
644 latch->pin = CLOCK_bit | 1; // initialize clock
645 latch->page_no = page_no;
649 if( bt->err = bt_readpage (bt->mgr, page, page_no) )
659 void bt_unpinlatch (BtLatchSet *set)
662 __sync_fetch_and_add(&set->pin, -1);
664 _InterlockedDecrement16 (&set->pin);
668 // map the btree cached page onto current page
670 BtPage bt_mappage (BtDb *bt, BtLatchSet *latch)
672 BtPage page = (BtPage)((uid)(latch - bt->mgr->latchsets) * bt->mgr->page_size + bt->mgr->pagepool);
677 // find existing latchset or inspire new one
678 // return with latchset pinned
680 BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no, uint loadit)
682 uint hashidx = page_no % bt->mgr->latchhash;
690 // try to find our entry
692 bt_spinwritelock(bt->mgr->hashtable[hashidx].latch);
694 if( slot = bt->mgr->hashtable[hashidx].slot ) do
696 latch = bt->mgr->latchsets + slot;
697 if( page_no == latch->page_no )
699 } while( slot = latch->next );
705 latch = bt->mgr->latchsets + slot;
707 __sync_fetch_and_add(&latch->pin, 1);
708 __sync_fetch_and_or(&latch->pin, CLOCK_bit);
710 _InterlockedIncrement16 (&latch->pin);
711 _InterlockedOr16 (&latch->pin, CLOCK_bit);
713 bt_spinreleasewrite(bt->mgr->hashtable[hashidx].latch);
717 // see if there are any unused pool entries
719 slot = __sync_fetch_and_add (&bt->mgr->latchdeployed, 1) + 1;
721 slot = _InterlockedIncrement (&bt->mgr->latchdeployed);
724 if( slot < bt->mgr->latchtotal ) {
725 latch = bt->mgr->latchsets + slot;
726 if( bt_latchlink (bt, hashidx, slot, page_no, loadit) )
728 bt_spinreleasewrite (bt->mgr->hashtable[hashidx].latch);
733 __sync_fetch_and_add (&bt->mgr->latchdeployed, -1);
735 _InterlockedDecrement (&bt->mgr->latchdeployed);
737 // find and reuse previous entry on victim
741 slot = __sync_fetch_and_add(&bt->mgr->latchvictim, 1);
743 slot = _InterlockedIncrement (&bt->mgr->latchvictim) - 1;
745 // try to get write lock on hash chain
746 // skip entry if not obtained
747 // or has outstanding pins
749 slot %= bt->mgr->latchtotal;
754 latch = bt->mgr->latchsets + slot;
755 idx = latch->page_no % bt->mgr->latchhash;
757 // see we are on same chain as hashidx
762 if( !bt_spinwritetry (bt->mgr->hashtable[idx].latch) )
765 if( latch->pin & CLOCK_bit ) {
767 __sync_fetch_and_add(&latch->pin, -CLOCK_bit);
769 _InterlockedExchangeAdd16 (&latch->pin, -CLOCK_bit);
771 bt_spinreleasewrite (bt->mgr->hashtable[idx].latch);
775 // update permanent page area in btree
777 page = (BtPage)((uid)slot * bt->mgr->page_size + bt->mgr->pagepool);
780 if( bt->err = bt_writepage (bt->mgr, page, latch->page_no) )
783 latch->dirty = 0, bt->writes++;
785 // unlink our available slot from its hash chain
788 bt->mgr->latchsets[latch->prev].next = latch->next;
790 bt->mgr->hashtable[idx].slot = latch->next;
793 bt->mgr->latchsets[latch->next].prev = latch->prev;
795 bt_spinreleasewrite (bt->mgr->hashtable[idx].latch);
797 if( bt_latchlink (bt, hashidx, slot, page_no, loadit) )
800 bt_spinreleasewrite (bt->mgr->hashtable[hashidx].latch);
805 void bt_mgrclose (BtMgr *mgr)
812 // flush dirty pool pages to the btree
814 for( slot = 1; slot <= mgr->latchdeployed; slot++ ) {
815 page = (BtPage)((uid)slot * mgr->page_size + mgr->pagepool);
816 latch = mgr->latchsets + slot;
819 bt_writepage(mgr, page, latch->page_no);
820 latch->dirty = 0, num++;
824 fprintf(stderr, "%d buffer pool pages flushed\n", num);
827 munmap (mgr->hashtable, mgr->nlatchpage * mgr->page_size);
828 munmap (mgr->pagezero, mgr->page_size);
830 FlushViewOfFile(mgr->pagezero, 0);
831 UnmapViewOfFile(mgr->pagezero);
832 UnmapViewOfFile(mgr->hashtable);
833 CloseHandle(mgr->halloc);
834 CloseHandle(mgr->hpool);
840 FlushFileBuffers(mgr->idx);
841 CloseHandle(mgr->idx);
846 // close and release memory
848 void bt_close (BtDb *bt)
855 VirtualFree (bt->mem, 0, MEM_RELEASE);
860 // open/create new btree buffer manager
862 // call with file_name, BT_openmode, bits in page size (e.g. 16),
863 // size of page pool (e.g. 262144)
865 BtMgr *bt_mgr (char *name, uint bits, uint nodemax)
867 uint lvl, attr, last, slot, idx;
868 uint nlatchpage, latchhash;
869 unsigned char value[BtId];
870 int flag, initit = 0;
871 BtPageZero *pagezero;
878 // determine sanity of page size and buffer pool
880 if( bits > BT_maxbits )
882 else if( bits < BT_minbits )
886 fprintf(stderr, "Buffer pool too small: %d\n", nodemax);
891 mgr = calloc (1, sizeof(BtMgr));
893 mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
895 if( mgr->idx == -1 ) {
896 fprintf (stderr, "Unable to open btree file\n");
897 return free(mgr), NULL;
900 mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));
901 attr = FILE_ATTRIBUTE_NORMAL;
902 mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
904 if( mgr->idx == INVALID_HANDLE_VALUE )
905 return GlobalFree(mgr), NULL;
909 pagezero = valloc (BT_maxpage);
912 // read minimum page size to get root info
913 // to support raw disk partition files
914 // check if bits == 0 on the disk.
916 if( size = lseek (mgr->idx, 0L, 2) )
917 if( pread(mgr->idx, pagezero, BT_minpage, 0) == BT_minpage )
918 if( pagezero->alloc->bits )
919 bits = pagezero->alloc->bits;
923 return free(mgr), free(pagezero), NULL;
927 pagezero = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);
928 size = GetFileSize(mgr->idx, amt);
931 if( !ReadFile(mgr->idx, (char *)pagezero, BT_minpage, amt, NULL) )
932 return bt_mgrclose (mgr), NULL;
933 bits = pagezero->alloc->bits;
938 mgr->page_size = 1 << bits;
939 mgr->page_bits = bits;
941 // calculate number of latch hash table entries
943 mgr->nlatchpage = (nodemax/16 * sizeof(BtHashEntry) + mgr->page_size - 1) / mgr->page_size;
944 mgr->latchhash = mgr->nlatchpage * mgr->page_size / sizeof(BtHashEntry);
946 mgr->nlatchpage += nodemax; // size of the buffer pool in pages
947 mgr->nlatchpage += (sizeof(BtLatchSet) * nodemax + mgr->page_size - 1)/mgr->page_size;
948 mgr->latchtotal = nodemax;
953 // initialize an empty b-tree with latch page, root page, page of leaves
954 // and page(s) of latches and page pool cache
956 memset (pagezero, 0, 1 << bits);
957 pagezero->alloc->bits = mgr->page_bits;
958 bt_putid(pagezero->alloc->right, MIN_lvl+1);
960 // initialize left-most LEAF page in
963 bt_putid (pagezero->alloc->left, LEAF_page);
965 if( bt_writepage (mgr, pagezero->alloc, 0) ) {
966 fprintf (stderr, "Unable to create btree page zero\n");
967 return bt_mgrclose (mgr), NULL;
970 memset (pagezero, 0, 1 << bits);
971 pagezero->alloc->bits = mgr->page_bits;
973 for( lvl=MIN_lvl; lvl--; ) {
974 slotptr(pagezero->alloc, 1)->off = mgr->page_size - 3 - (lvl ? BtId + sizeof(BtVal): sizeof(BtVal));
975 key = keyptr(pagezero->alloc, 1);
976 key->len = 2; // create stopper key
980 bt_putid(value, MIN_lvl - lvl + 1);
981 val = valptr(pagezero->alloc, 1);
982 val->len = lvl ? BtId : 0;
983 memcpy (val->value, value, val->len);
985 pagezero->alloc->min = slotptr(pagezero->alloc, 1)->off;
986 pagezero->alloc->lvl = lvl;
987 pagezero->alloc->cnt = 1;
988 pagezero->alloc->act = 1;
990 if( bt_writepage (mgr, pagezero->alloc, MIN_lvl - lvl) ) {
991 fprintf (stderr, "Unable to create btree page zero\n");
992 return bt_mgrclose (mgr), NULL;
1000 VirtualFree (pagezero, 0, MEM_RELEASE);
1003 // mlock the pagezero page
1005 flag = PROT_READ | PROT_WRITE;
1006 mgr->pagezero = mmap (0, mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size);
1007 if( mgr->pagezero == MAP_FAILED ) {
1008 fprintf (stderr, "Unable to mmap btree page zero, error = %d\n", errno);
1009 return bt_mgrclose (mgr), NULL;
1011 mlock (mgr->pagezero, mgr->page_size);
1013 mgr->hashtable = (void *)mmap (0, (uid)mgr->nlatchpage * mgr->page_size, flag, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
1014 if( mgr->hashtable == MAP_FAILED ) {
1015 fprintf (stderr, "Unable to mmap anonymous buffer pool pages, error = %d\n", errno);
1016 return bt_mgrclose (mgr), NULL;
1019 flag = PAGE_READWRITE;
1020 mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, mgr->page_size, NULL);
1021 if( !mgr->halloc ) {
1022 fprintf (stderr, "Unable to create page zero memory mapping, error = %d\n", GetLastError());
1023 return bt_mgrclose (mgr), NULL;
1026 flag = FILE_MAP_WRITE;
1027 mgr->pagezero = MapViewOfFile(mgr->halloc, flag, 0, 0, mgr->page_size);
1028 if( !mgr->pagezero ) {
1029 fprintf (stderr, "Unable to map page zero, error = %d\n", GetLastError());
1030 return bt_mgrclose (mgr), NULL;
1033 flag = PAGE_READWRITE;
1034 size = (uid)mgr->nlatchpage << mgr->page_bits;
1035 mgr->hpool = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, flag, size >> 32, size, NULL);
1037 fprintf (stderr, "Unable to create buffer pool memory mapping, error = %d\n", GetLastError());
1038 return bt_mgrclose (mgr), NULL;
1041 flag = FILE_MAP_WRITE;
1042 mgr->hashtable = MapViewOfFile(mgr->pool, flag, 0, 0, size);
1043 if( !mgr->hashtable ) {
1044 fprintf (stderr, "Unable to map buffer pool, error = %d\n", GetLastError());
1045 return bt_mgrclose (mgr), NULL;
1049 mgr->pagepool = (unsigned char *)mgr->hashtable + (uid)(mgr->nlatchpage - mgr->latchtotal) * mgr->page_size;
1050 mgr->latchsets = (BtLatchSet *)(mgr->pagepool - (uid)mgr->latchtotal * sizeof(BtLatchSet));
1055 // open BTree access method
1056 // based on buffer manager
1058 BtDb *bt_open (BtMgr *mgr)
1060 BtDb *bt = malloc (sizeof(*bt));
1062 memset (bt, 0, sizeof(*bt));
1065 bt->mem = valloc (2 *mgr->page_size);
1067 bt->mem = VirtualAlloc(NULL, 2 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE);
1069 bt->frame = (BtPage)bt->mem;
1070 bt->cursor = (BtPage)(bt->mem + 1 * mgr->page_size);
1074 // compare two keys, returning > 0, = 0, or < 0
1075 // as the comparison value
1077 int keycmp (BtKey* key1, unsigned char *key2, uint len2)
1079 uint len1 = key1->len;
1082 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
1093 // place write, read, or parent lock on requested page_no.
1095 void bt_lockpage(BtLock mode, BtLatchSet *set)
1099 ReadLock (set->readwr);
1102 WriteLock (set->readwr);
1105 ReadLock (set->access);
1108 WriteLock (set->access);
1111 WriteLock (set->parent);
1116 // remove write, read, or parent lock on requested page
1118 void bt_unlockpage(BtLock mode, BtLatchSet *set)
1122 ReadRelease (set->readwr);
1125 WriteRelease (set->readwr);
1128 ReadRelease (set->access);
1131 WriteRelease (set->access);
1134 WriteRelease (set->parent);
1139 // allocate a new page
1140 // return with page latched.
1142 int bt_newpage(BtDb *bt, BtPageSet *set, BtPage contents)
1146 // lock allocation page
1148 bt_spinwritelock(bt->mgr->lock);
1150 // use empty chain first
1151 // else allocate empty page
1153 if( set->page_no = bt_getid(bt->mgr->pagezero->chain) ) {
1154 if( set->latch = bt_pinlatch (bt, set->page_no, 1) )
1155 set->page = bt_mappage (bt, set->latch);
1157 return bt->err = BTERR_struct, -1;
1159 bt_putid(bt->mgr->pagezero->chain, bt_getid(set->page->right));
1160 bt_spinreleasewrite(bt->mgr->lock);
1162 memcpy (set->page, contents, bt->mgr->page_size);
1163 set->latch->dirty = 1;
1167 set->page_no = bt_getid(bt->mgr->pagezero->alloc->right);
1168 bt_putid(bt->mgr->pagezero->alloc->right, set->page_no+1);
1170 // unlock allocation latch
1172 bt_spinreleasewrite(bt->mgr->lock);
1174 // don't load cache from btree page
1176 if( set->latch = bt_pinlatch (bt, set->page_no, 0) )
1177 set->page = bt_mappage (bt, set->latch);
1179 return bt->err = BTERR_struct;
1181 memcpy (set->page, contents, bt->mgr->page_size);
1182 set->latch->dirty = 1;
1186 // find slot in page for given key at a given level
1188 int bt_findslot (BtPageSet *set, unsigned char *key, uint len)
1190 uint diff, higher = set->page->cnt, low = 1, slot;
1193 // make stopper key an infinite fence value
1195 if( bt_getid (set->page->right) )
1200 // low is the lowest candidate.
1201 // loop ends when they meet
1203 // higher is already
1204 // tested as .ge. the passed key.
1206 while( diff = higher - low ) {
1207 slot = low + ( diff >> 1 );
1208 if( keycmp (keyptr(set->page, slot), key, len) < 0 )
1211 higher = slot, good++;
1214 // return zero if key is on right link page
1216 return good ? higher : 0;
1219 // find and load page at given level for given key
1220 // leave page rd or wr locked as requested
1222 int bt_loadpage (BtDb *bt, BtPageSet *set, unsigned char *key, uint len, uint lvl, BtLock lock)
1224 uid page_no = ROOT_page, prevpage = 0;
1225 uint drill = 0xff, slot;
1226 BtLatchSet *prevlatch;
1227 uint mode, prevmode;
1229 // start at root of btree and drill down
1232 // determine lock mode of drill level
1233 mode = (drill == lvl) ? lock : BtLockRead;
1235 if( set->latch = bt_pinlatch (bt, page_no, 1) )
1236 set->page_no = page_no;
1240 // obtain access lock using lock chaining with Access mode
1242 if( page_no > ROOT_page )
1243 bt_lockpage(BtLockAccess, set->latch);
1245 set->page = bt_mappage (bt, set->latch);
1247 // release & unpin parent page
1250 bt_unlockpage(prevmode, prevlatch);
1251 bt_unpinlatch (prevlatch);
1255 // obtain read lock using lock chaining
1257 bt_lockpage(mode, set->latch);
1259 if( set->page->free )
1260 return bt->err = BTERR_struct, 0;
1262 if( page_no > ROOT_page )
1263 bt_unlockpage(BtLockAccess, set->latch);
1265 // re-read and re-lock root after determining actual level of root
1267 if( set->page->lvl != drill) {
1268 if( set->page_no != ROOT_page )
1269 return bt->err = BTERR_struct, 0;
1271 drill = set->page->lvl;
1273 if( lock != BtLockRead && drill == lvl ) {
1274 bt_unlockpage(mode, set->latch);
1275 bt_unpinlatch (set->latch);
1280 prevpage = set->page_no;
1281 prevlatch = set->latch;
1284 // find key on page at this level
1285 // and descend to requested level
1287 if( set->page->kill )
1290 if( slot = bt_findslot (set, key, len) ) {
1294 while( slotptr(set->page, slot)->dead )
1295 if( slot++ < set->page->cnt )
1300 page_no = bt_getid(valptr(set->page, slot)->value);
1305 // or slide right into next page
1308 page_no = bt_getid(set->page->right);
1312 // return error on end of right chain
1314 bt->err = BTERR_struct;
1315 return 0; // return error
1318 // return page to free list
1319 // page must be delete & write locked
1321 void bt_freepage (BtDb *bt, BtPageSet *set)
1323 // lock allocation page
1325 bt_spinwritelock (bt->mgr->lock);
1328 memcpy(set->page->right, bt->mgr->pagezero->chain, BtId);
1329 bt_putid(bt->mgr->pagezero->chain, set->page_no);
1330 set->latch->dirty = 1;
1331 set->page->free = 1;
1333 // unlock released page
1335 bt_unlockpage (BtLockDelete, set->latch);
1336 bt_unlockpage (BtLockWrite, set->latch);
1337 bt_unpinlatch (set->latch);
1339 // unlock allocation page
1341 bt_spinreleasewrite (bt->mgr->lock);
1344 // a fence key was deleted from a page
1345 // push new fence value upwards
1347 BTERR bt_fixfence (BtDb *bt, BtPageSet *set, uint lvl)
1349 unsigned char leftkey[BT_keyarray], rightkey[BT_keyarray];
1350 unsigned char value[BtId];
1354 // remove the old fence value
1356 ptr = keyptr(set->page, set->page->cnt);
1357 memcpy (rightkey, ptr, ptr->len + sizeof(BtKey));
1358 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1359 set->latch->dirty = 1;
1361 // cache new fence value
1363 ptr = keyptr(set->page, set->page->cnt);
1364 memcpy (leftkey, ptr, ptr->len + sizeof(BtKey));
1366 bt_lockpage (BtLockParent, set->latch);
1367 bt_unlockpage (BtLockWrite, set->latch);
1369 // insert new (now smaller) fence key
1371 bt_putid (value, set->page_no);
1372 ptr = (BtKey*)leftkey;
1374 if( bt_insertkey (bt, ptr->key, ptr->len, lvl+1, value, BtId, 1) )
1377 // now delete old fence key
1379 ptr = (BtKey*)rightkey;
1381 if( bt_deletekey (bt, ptr->key, ptr->len, lvl+1) )
1384 bt_unlockpage (BtLockParent, set->latch);
1385 bt_unpinlatch(set->latch);
1389 // root has a single child
1390 // collapse a level from the tree
1392 BTERR bt_collapseroot (BtDb *bt, BtPageSet *root)
1397 // find the child entry and promote as new root contents
1400 for( idx = 0; idx++ < root->page->cnt; )
1401 if( !slotptr(root->page, idx)->dead )
1404 child->page_no = bt_getid (valptr(root->page, idx)->value);
1406 if( child->latch = bt_pinlatch (bt, child->page_no, 1) )
1407 child->page = bt_mappage (bt, child->latch);
1411 bt_lockpage (BtLockDelete, child->latch);
1412 bt_lockpage (BtLockWrite, child->latch);
1414 memcpy (root->page, child->page, bt->mgr->page_size);
1415 root->latch->dirty = 1;
1417 bt_freepage (bt, child);
1419 } while( root->page->lvl > 1 && root->page->act == 1 );
1421 bt_unlockpage (BtLockWrite, root->latch);
1422 bt_unpinlatch (root->latch);
1426 // maintain reverse scan pointers by
1427 // linking left pointer of far right node
1429 BTERR bt_linkleft (BtDb *bt, uid left_page_no, uid right_page_no)
1431 BtPageSet right2[1];
1433 // keep track of rightmost leaf page
1435 if( !right_page_no ) {
1436 bt_putid (bt->mgr->pagezero->alloc->left, left_page_no);
1440 // link right page to left page
1442 if( right2->latch = bt_pinlatch (bt, right_page_no, 1) )
1443 right2->page = bt_mappage (bt, right2->latch);
1447 bt_lockpage (BtLockWrite, right2->latch);
1449 bt_putid(right2->page->left, left_page_no);
1450 bt_unlockpage (BtLockWrite, right2->latch);
1451 bt_unpinlatch (right2->latch);
1455 // find and delete key on page by marking delete flag bit
1456 // if page becomes empty, delete it from the btree
1458 BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl)
1460 unsigned char lowerfence[BT_keyarray], higherfence[BT_keyarray];
1461 uint slot, idx, found, fence;
1462 BtPageSet set[1], right[1];
1463 unsigned char value[BtId];
1467 if( slot = bt_loadpage (bt, set, key, len, lvl, BtLockWrite) )
1468 ptr = keyptr(set->page, slot);
1472 // if librarian slot, advance to real slot
1474 if( slotptr(set->page, slot)->type == Librarian )
1475 ptr = keyptr(set->page, ++slot);
1477 fence = slot == set->page->cnt;
1479 // if key is found delete it, otherwise ignore request
1481 if( found = !keycmp (ptr, key, len) )
1482 if( found = slotptr(set->page, slot)->dead == 0 ) {
1483 val = valptr(set->page,slot);
1484 slotptr(set->page, slot)->dead = 1;
1485 set->page->garbage += ptr->len + val->len + sizeof(BtKey) + sizeof(BtVal);
1488 // collapse empty slots beneath the fence
1490 while( idx = set->page->cnt - 1 )
1491 if( slotptr(set->page, idx)->dead ) {
1492 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1493 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1498 // did we delete a fence key in an upper level?
1500 if( found && lvl && set->page->act && fence )
1501 if( bt_fixfence (bt, set, lvl) )
1504 return bt->found = found, 0;
1506 // do we need to collapse root?
1508 if( lvl > 1 && set->page_no == ROOT_page && set->page->act == 1 )
1509 if( bt_collapseroot (bt, set) )
1512 return bt->found = found, 0;
1514 // return if page is not empty
1516 if( set->page->act ) {
1517 set->latch->dirty = 1;
1518 bt_unlockpage(BtLockWrite, set->latch);
1519 bt_unpinlatch (set->latch);
1520 return bt->found = found, 0;
1523 // cache copy of fence key
1524 // to post in parent
1526 ptr = keyptr(set->page, set->page->cnt);
1527 memcpy (lowerfence, ptr, ptr->len + sizeof(BtKey));
1529 // obtain lock on right page
1531 right->page_no = bt_getid(set->page->right);
1533 if( right->latch = bt_pinlatch (bt, right->page_no, 1) )
1534 right->page = bt_mappage (bt, right->latch);
1538 bt_lockpage (BtLockWrite, right->latch);
1540 if( right->page->kill )
1541 return bt->err = BTERR_struct;
1543 // transfer left link
1545 memcpy (right->page->left, set->page->left, BtId);
1547 // pull contents of right peer into our empty page
1549 memcpy (set->page, right->page, bt->mgr->page_size);
1550 set->latch->dirty = 1;
1555 if( bt_linkleft (bt, set->page_no, bt_getid (set->page->right)) )
1558 // cache copy of key to update
1560 ptr = keyptr(right->page, right->page->cnt);
1561 memcpy (higherfence, ptr, ptr->len + sizeof(BtKey));
1563 // mark right page deleted and point it to left page
1564 // until we can post parent updates
1566 bt_putid (right->page->right, set->page_no);
1567 right->latch->dirty = 1;
1568 right->page->kill = 1;
1570 bt_lockpage (BtLockParent, right->latch);
1571 bt_unlockpage (BtLockWrite, right->latch);
1573 bt_lockpage (BtLockParent, set->latch);
1574 bt_unlockpage (BtLockWrite, set->latch);
1576 // redirect higher key directly to our new node contents
1578 bt_putid (value, set->page_no);
1579 ptr = (BtKey*)higherfence;
1581 if( bt_insertkey (bt, ptr->key, ptr->len, lvl+1, value, BtId, 1) )
1584 // delete old lower key to our node
1586 ptr = (BtKey*)lowerfence;
1588 if( bt_deletekey (bt, ptr->key, ptr->len, lvl+1) )
1591 // obtain delete and write locks to right node
1593 bt_unlockpage (BtLockParent, right->latch);
1594 bt_lockpage (BtLockDelete, right->latch);
1595 bt_lockpage (BtLockWrite, right->latch);
1596 bt_freepage (bt, right);
1598 bt_unlockpage (BtLockParent, set->latch);
1599 bt_unpinlatch (set->latch);
1604 BtKey *bt_foundkey (BtDb *bt)
1606 return (BtKey*)(bt->key);
1609 // advance to next slot
1611 uint bt_findnext (BtDb *bt, BtPageSet *set, uint slot)
1613 BtLatchSet *prevlatch;
1616 if( slot < set->page->cnt )
1619 prevlatch = set->latch;
1621 if( page_no = bt_getid(set->page->right) )
1622 if( set->latch = bt_pinlatch (bt, page_no, 1) )
1623 set->page = bt_mappage (bt, set->latch);
1627 return bt->err = BTERR_struct, 0;
1629 // obtain access lock using lock chaining with Access mode
1631 bt_lockpage(BtLockAccess, set->latch);
1633 bt_unlockpage(BtLockRead, prevlatch);
1634 bt_unpinlatch (prevlatch);
1636 bt_lockpage(BtLockRead, set->latch);
1637 bt_unlockpage(BtLockAccess, set->latch);
1639 set->page_no = page_no;
1643 // find unique key or first duplicate key in
1644 // leaf level and return number of value bytes
1645 // or (-1) if not found. Setup key for bt_foundkey
1647 int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax)
1655 if( slot = bt_loadpage (bt, set, key, keylen, 0, BtLockRead) )
1657 ptr = keyptr(set->page, slot);
1659 // skip librarian slot place holder
1661 if( slotptr(set->page, slot)->type == Librarian )
1662 ptr = keyptr(set->page, ++slot);
1664 // return actual key found
1666 memcpy (bt->key, ptr, ptr->len + sizeof(BtKey));
1669 if( slotptr(set->page, slot)->type == Duplicate )
1672 // not there if we reach the stopper key
1674 if( slot == set->page->cnt )
1675 if( !bt_getid (set->page->right) )
1678 // if key exists, return >= 0 value bytes copied
1679 // otherwise return (-1)
1681 if( slotptr(set->page, slot)->dead )
1685 if( !memcmp (ptr->key, key, len) ) {
1686 val = valptr (set->page,slot);
1687 if( valmax > val->len )
1689 memcpy (value, val->value, valmax);
1695 } while( slot = bt_findnext (bt, set, slot) );
1697 bt_unlockpage (BtLockRead, set->latch);
1698 bt_unpinlatch (set->latch);
1702 // check page for space available,
1703 // clean if necessary and return
1704 // 0 - page needs splitting
1705 // >0 new slot value
1707 uint bt_cleanpage(BtDb *bt, BtPageSet *set, uint keylen, uint slot, uint vallen)
1709 uint nxt = bt->mgr->page_size;
1710 BtPage page = set->page;
1711 uint cnt = 0, idx = 0;
1712 uint max = page->cnt;
1717 if( page->min >= (max+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal))
1720 // skip cleanup and proceed to split
1721 // if there's not enough garbage
1724 if( page->garbage < nxt / 5 )
1727 memcpy (bt->frame, page, bt->mgr->page_size);
1729 // skip page info and set rest of page to zero
1731 memset (page+1, 0, bt->mgr->page_size - sizeof(*page));
1732 set->latch->dirty = 1;
1736 // clean up page first by
1737 // removing deleted keys
1739 while( cnt++ < max ) {
1742 if( cnt < max && slotptr(bt->frame,cnt)->dead )
1745 // copy the value across
1747 val = valptr(bt->frame, cnt);
1748 nxt -= val->len + sizeof(BtVal);
1749 memcpy ((unsigned char *)page + nxt, val, val->len + sizeof(BtVal));
1751 // copy the key across
1753 key = keyptr(bt->frame, cnt);
1754 nxt -= key->len + sizeof(BtKey);
1755 memcpy ((unsigned char *)page + nxt, key, key->len + sizeof(BtKey));
1757 // make a librarian slot
1760 slotptr(page, ++idx)->off = nxt;
1761 slotptr(page, idx)->type = Librarian;
1762 slotptr(page, idx)->dead = 1;
1767 slotptr(page, ++idx)->off = nxt;
1768 slotptr(page, idx)->type = slotptr(bt->frame, cnt)->type;
1770 if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) )
1777 // see if page has enough space now, or does it need splitting?
1779 if( page->min >= (idx+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal) )
1785 // split the root and raise the height of the btree
1787 BTERR bt_splitroot(BtDb *bt, BtPageSet *root, BtKey *leftkey, BtPageSet *right)
1789 uint nxt = bt->mgr->page_size;
1790 unsigned char value[BtId];
1795 // Obtain an empty page to use, and copy the current
1796 // root contents into it, e.g. lower keys
1798 if( bt_newpage(bt, left, root->page) )
1801 bt_unpinlatch (left->latch);
1803 // set left from higher to lower page
1805 bt_putid (right->page->left, left->page_no);
1806 bt_unpinlatch (right->latch);
1808 // preserve the page info at the bottom
1809 // of higher keys and set rest to zero
1811 memset(root->page+1, 0, bt->mgr->page_size - sizeof(*root->page));
1813 // insert stopper key at top of newroot page
1814 // and increase the root height
1816 nxt -= BtId + sizeof(BtVal);
1817 bt_putid (value, right->page_no);
1818 val = (BtVal *)((unsigned char *)root->page + nxt);
1819 memcpy (val->value, value, BtId);
1822 nxt -= 2 + sizeof(BtKey);
1823 slotptr(root->page, 2)->off = nxt;
1824 ptr = (BtKey *)((unsigned char *)root->page + nxt);
1829 // insert lower keys page fence key on newroot page as first key
1831 nxt -= BtId + sizeof(BtVal);
1832 bt_putid (value, left->page_no);
1833 val = (BtVal *)((unsigned char *)root->page + nxt);
1834 memcpy (val->value, value, BtId);
1837 nxt -= leftkey->len + sizeof(BtKey);
1838 slotptr(root->page, 1)->off = nxt;
1839 memcpy ((unsigned char *)root->page + nxt, leftkey, leftkey->len + sizeof(BtKey));
1841 bt_putid(root->page->right, 0);
1842 root->page->min = nxt; // reset lowest used offset and key count
1843 root->page->cnt = 2;
1844 root->page->act = 2;
1847 // release and unpin root pages
1849 bt_unlockpage(BtLockWrite, root->latch);
1850 bt_unpinlatch (root->latch);
1854 // split already locked full node
1857 BTERR bt_splitpage (BtDb *bt, BtPageSet *set)
1859 unsigned char fencekey[BT_keyarray], rightkey[BT_keyarray];
1860 uint cnt = 0, idx = 0, max, nxt = bt->mgr->page_size;
1861 unsigned char value[BtId];
1862 uint lvl = set->page->lvl;
1869 // split higher half of keys to bt->frame
1871 memset (bt->frame, 0, bt->mgr->page_size);
1872 max = set->page->cnt;
1876 while( cnt++ < max ) {
1877 if( slotptr(set->page, cnt)->dead && cnt < max )
1879 src = valptr(set->page, cnt);
1880 nxt -= src->len + sizeof(BtVal);
1881 memcpy ((unsigned char *)bt->frame + nxt, src, src->len + sizeof(BtVal));
1883 key = keyptr(set->page, cnt);
1884 nxt -= key->len + sizeof(BtKey);
1885 ptr = (BtKey*)((unsigned char *)bt->frame + nxt);
1886 memcpy (ptr, key, key->len + sizeof(BtKey));
1888 // add librarian slot
1891 slotptr(bt->frame, ++idx)->off = nxt;
1892 slotptr(bt->frame, idx)->type = Librarian;
1893 slotptr(bt->frame, idx)->dead = 1;
1898 slotptr(bt->frame, ++idx)->off = nxt;
1899 slotptr(bt->frame, idx)->type = slotptr(set->page, cnt)->type;
1901 if( !(slotptr(bt->frame, idx)->dead = slotptr(set->page, cnt)->dead) )
1905 // remember existing fence key for new page to the right
1907 memcpy (rightkey, key, key->len + sizeof(BtKey));
1909 bt->frame->bits = bt->mgr->page_bits;
1910 bt->frame->min = nxt;
1911 bt->frame->cnt = idx;
1912 bt->frame->lvl = lvl;
1916 if( set->page_no > ROOT_page ) {
1917 bt_putid (bt->frame->right, bt_getid (set->page->right));
1918 bt_putid(bt->frame->left, set->page_no);
1921 // get new free page and write higher keys to it.
1923 if( bt_newpage(bt, right, bt->frame) )
1928 if( set->page_no > ROOT_page && !lvl )
1929 if( bt_linkleft (bt, right->page_no, bt_getid (set->page->right)) )
1932 // update lower keys to continue in old page
1934 memcpy (bt->frame, set->page, bt->mgr->page_size);
1935 memset (set->page+1, 0, bt->mgr->page_size - sizeof(*set->page));
1936 set->latch->dirty = 1;
1938 nxt = bt->mgr->page_size;
1939 set->page->garbage = 0;
1945 if( slotptr(bt->frame, max)->type == Librarian )
1948 // assemble page of smaller keys
1950 while( cnt++ < max ) {
1951 if( slotptr(bt->frame, cnt)->dead )
1953 val = valptr(bt->frame, cnt);
1954 nxt -= val->len + sizeof(BtVal);
1955 memcpy ((unsigned char *)set->page + nxt, val, val->len + sizeof(BtVal));
1957 key = keyptr(bt->frame, cnt);
1958 nxt -= key->len + sizeof(BtKey);
1959 memcpy ((unsigned char *)set->page + nxt, key, key->len + sizeof(BtKey));
1961 // add librarian slot
1964 slotptr(set->page, ++idx)->off = nxt;
1965 slotptr(set->page, idx)->type = Librarian;
1966 slotptr(set->page, idx)->dead = 1;
1971 slotptr(set->page, ++idx)->off = nxt;
1972 slotptr(set->page, idx)->type = slotptr(bt->frame, cnt)->type;
1976 // remember fence key for smaller page
1978 memcpy(fencekey, key, key->len + sizeof(BtKey));
1980 bt_putid(set->page->right, right->page_no);
1981 set->page->min = nxt;
1982 set->page->cnt = idx;
1984 // if current page is the root page, split it
1986 if( set->page_no == ROOT_page )
1987 return bt_splitroot (bt, set, (BtKey *)fencekey, right);
1989 // insert new fences in their parent pages
1991 bt_lockpage (BtLockParent, right->latch);
1993 bt_lockpage (BtLockParent, set->latch);
1994 bt_unlockpage (BtLockWrite, set->latch);
1996 // insert new fence for reformulated left block of smaller keys
1998 bt_putid (value, set->page_no);
2000 if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, value, BtId, 1) )
2003 // switch fence for right block of larger keys to new right page
2005 bt_putid (value, right->page_no);
2007 if( bt_insertkey (bt, rightkey+1, *rightkey, lvl+1, value, BtId, 1) )
2010 bt_unlockpage (BtLockParent, set->latch);
2011 bt_unpinlatch (set->latch);
2013 bt_unlockpage (BtLockParent, right->latch);
2014 bt_unpinlatch (right->latch);
2018 // install new key and value onto page
2019 // page must already be checked for
2022 BTERR bt_insertslot (BtDb *bt, BtPageSet *set, uint slot, unsigned char *key,uint keylen, unsigned char *value, uint vallen, uint type)
2024 uint idx, librarian;
2029 // if found slot > desired slot and previous slot
2030 // is a librarian slot, use it
2033 if( slotptr(set->page, slot-1)->type == Librarian )
2036 // copy value onto page
2038 set->page->min -= vallen + sizeof(BtVal);
2039 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2040 memcpy (val->value, value, vallen);
2043 // copy key onto page
2045 set->page->min -= keylen + sizeof(BtKey);
2046 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2047 memcpy (ptr->key, key, keylen);
2050 // find first empty slot
2052 for( idx = slot; idx < set->page->cnt; idx++ )
2053 if( slotptr(set->page, idx)->dead )
2056 // now insert key into array before slot
2058 if( idx == set->page->cnt )
2059 idx += 2, set->page->cnt += 2, librarian = 2;
2063 set->latch->dirty = 1;
2066 while( idx > slot + librarian - 1 )
2067 *slotptr(set->page, idx) = *slotptr(set->page, idx - librarian), idx--;
2069 // add librarian slot
2071 if( librarian > 1 ) {
2072 node = slotptr(set->page, slot++);
2073 node->off = set->page->min;
2074 node->type = Librarian;
2080 node = slotptr(set->page, slot);
2081 node->off = set->page->min;
2085 bt_unlockpage (BtLockWrite, set->latch);
2086 bt_unpinlatch (set->latch);
2090 // Insert new key into the btree at given level.
2091 // either add a new key or update/add an existing one
2093 BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint keylen, uint lvl, void *value, uint vallen, uint unique)
2095 unsigned char newkey[BT_keyarray];
2096 uint slot, idx, len;
2103 // set up the key we're working on
2105 ins = (BtKey*)newkey;
2106 memcpy (ins->key, key, keylen);
2109 // is this a non-unique index value?
2115 sequence = bt_newdup (bt);
2116 bt_putid (ins->key + ins->len + sizeof(BtKey), sequence);
2120 while( 1 ) { // find the page and slot for the current key
2121 if( slot = bt_loadpage (bt, set, ins->key, ins->len, lvl, BtLockWrite) )
2122 ptr = keyptr(set->page, slot);
2125 bt->err = BTERR_ovflw;
2129 // if librarian slot == found slot, advance to real slot
2131 if( slotptr(set->page, slot)->type == Librarian )
2132 if( !keycmp (ptr, key, keylen) )
2133 ptr = keyptr(set->page, ++slot);
2137 if( slotptr(set->page, slot)->type == Duplicate )
2140 // if inserting a duplicate key or unique key
2141 // check for adequate space on the page
2142 // and insert the new key before slot.
2144 if( unique && (len != ins->len || memcmp (ptr->key, ins->key, ins->len)) || !unique ) {
2145 if( !(slot = bt_cleanpage (bt, set, ins->len, slot, vallen)) )
2146 if( bt_splitpage (bt, set) )
2151 return bt_insertslot (bt, set, slot, ins->key, ins->len, value, vallen, type);
2154 // if key already exists, update value and return
2156 val = valptr(set->page, slot);
2158 if( val->len >= vallen ) {
2159 if( slotptr(set->page, slot)->dead )
2161 set->page->garbage += val->len - vallen;
2162 set->latch->dirty = 1;
2163 slotptr(set->page, slot)->dead = 0;
2165 memcpy (val->value, value, vallen);
2166 bt_unlockpage(BtLockWrite, set->latch);
2167 bt_unpinlatch (set->latch);
2171 // new update value doesn't fit in existing value area
2173 if( !slotptr(set->page, slot)->dead )
2174 set->page->garbage += val->len + ptr->len + sizeof(BtKey) + sizeof(BtVal);
2176 slotptr(set->page, slot)->dead = 0;
2180 if( !(slot = bt_cleanpage (bt, set, keylen, slot, vallen)) )
2181 if( bt_splitpage (bt, set) )
2186 set->page->min -= vallen + sizeof(BtVal);
2187 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2188 memcpy (val->value, value, vallen);
2191 set->latch->dirty = 1;
2192 set->page->min -= keylen + sizeof(BtKey);
2193 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2194 memcpy (ptr->key, key, keylen);
2197 slotptr(set->page, slot)->off = set->page->min;
2198 bt_unlockpage(BtLockWrite, set->latch);
2199 bt_unpinlatch (set->latch);
2205 // set cursor to highest slot on highest page
2207 uint bt_lastkey (BtDb *bt)
2209 uid page_no = bt_getid (bt->mgr->pagezero->alloc->left);
2213 if( set->latch = bt_pinlatch (bt, page_no, 1) )
2214 set->page = bt_mappage (bt, set->latch);
2218 bt_lockpage(BtLockRead, set->latch);
2220 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2221 slot = set->page->cnt;
2223 bt_unlockpage(BtLockRead, set->latch);
2224 bt_unpinlatch (set->latch);
2228 // return previous slot on cursor page
2230 uint bt_prevkey (BtDb *bt, uint slot)
2238 if( left = bt_getid(bt->cursor->left) )
2239 bt->cursor_page = left;
2243 if( set->latch = bt_pinlatch (bt, left, 1) )
2244 set->page = bt_mappage (bt, set->latch);
2248 bt_lockpage(BtLockRead, set->latch);
2249 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2250 bt_unlockpage(BtLockRead, set->latch);
2251 bt_unpinlatch (set->latch);
2252 return bt->cursor->cnt;
2255 // return next slot on cursor page
2256 // or slide cursor right into next page
2258 uint bt_nextkey (BtDb *bt, uint slot)
2264 right = bt_getid(bt->cursor->right);
2266 while( slot++ < bt->cursor->cnt )
2267 if( slotptr(bt->cursor,slot)->dead )
2269 else if( right || (slot < bt->cursor->cnt) ) // skip infinite stopper
2277 bt->cursor_page = right;
2279 if( set->latch = bt_pinlatch (bt, right, 1) )
2280 set->page = bt_mappage (bt, set->latch);
2284 bt_lockpage(BtLockRead, set->latch);
2286 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2288 bt_unlockpage(BtLockRead, set->latch);
2289 bt_unpinlatch (set->latch);
2297 // cache page of keys into cursor and return starting slot for given key
2299 uint bt_startkey (BtDb *bt, unsigned char *key, uint len)
2304 // cache page for retrieval
2306 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
2307 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2311 bt->cursor_page = set->page_no;
2313 bt_unlockpage(BtLockRead, set->latch);
2314 bt_unpinlatch (set->latch);
2318 BtKey *bt_key(BtDb *bt, uint slot)
2320 return keyptr(bt->cursor, slot);
2323 BtVal *bt_val(BtDb *bt, uint slot)
2325 return valptr(bt->cursor,slot);
2331 double getCpuTime(int type)
2334 FILETIME xittime[1];
2335 FILETIME systime[1];
2336 FILETIME usrtime[1];
2337 SYSTEMTIME timeconv[1];
2340 memset (timeconv, 0, sizeof(SYSTEMTIME));
2344 GetSystemTimeAsFileTime (xittime);
2345 FileTimeToSystemTime (xittime, timeconv);
2346 ans = (double)timeconv->wDayOfWeek * 3600 * 24;
2349 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2350 FileTimeToSystemTime (usrtime, timeconv);
2353 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2354 FileTimeToSystemTime (systime, timeconv);
2358 ans += (double)timeconv->wHour * 3600;
2359 ans += (double)timeconv->wMinute * 60;
2360 ans += (double)timeconv->wSecond;
2361 ans += (double)timeconv->wMilliseconds / 1000;
2366 #include <sys/resource.h>
2368 double getCpuTime(int type)
2370 struct rusage used[1];
2371 struct timeval tv[1];
2375 gettimeofday(tv, NULL);
2376 return (double)tv->tv_sec + (double)tv->tv_usec / 1000000;
2379 getrusage(RUSAGE_SELF, used);
2380 return (double)used->ru_utime.tv_sec + (double)used->ru_utime.tv_usec / 1000000;
2383 getrusage(RUSAGE_SELF, used);
2384 return (double)used->ru_stime.tv_sec + (double)used->ru_stime.tv_usec / 1000000;
2391 void bt_poolaudit (BtMgr *mgr)
2396 while( slot++ < mgr->latchdeployed ) {
2397 latch = mgr->latchsets + slot;
2399 if( *latch->readwr->rin & MASK )
2400 fprintf(stderr, "latchset %d rwlocked for page %.8x\n", slot, latch->page_no);
2401 memset ((ushort *)latch->readwr, 0, sizeof(RWLock));
2403 if( *latch->access->rin & MASK )
2404 fprintf(stderr, "latchset %d accesslocked for page %.8x\n", slot, latch->page_no);
2405 memset ((ushort *)latch->access, 0, sizeof(RWLock));
2407 if( *latch->parent->rin & MASK )
2408 fprintf(stderr, "latchset %d parentlocked for page %.8x\n", slot, latch->page_no);
2409 memset ((ushort *)latch->parent, 0, sizeof(RWLock));
2411 if( latch->pin & ~CLOCK_bit ) {
2412 fprintf(stderr, "latchset %d pinned for page %.8x\n", slot, latch->page_no);
2418 uint bt_latchaudit (BtDb *bt)
2420 ushort idx, hashidx;
2426 if( *(ushort *)(bt->mgr->lock) )
2427 fprintf(stderr, "Alloc page locked\n");
2428 *(ushort *)(bt->mgr->lock) = 0;
2430 for( idx = 1; idx <= bt->mgr->latchdeployed; idx++ ) {
2431 latch = bt->mgr->latchsets + idx;
2432 if( *latch->readwr->rin & MASK )
2433 fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
2434 memset ((ushort *)latch->readwr, 0, sizeof(RWLock));
2436 if( *latch->access->rin & MASK )
2437 fprintf(stderr, "latchset %d accesslocked for page %.8x\n", idx, latch->page_no);
2438 memset ((ushort *)latch->access, 0, sizeof(RWLock));
2440 if( *latch->parent->rin & MASK )
2441 fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
2442 memset ((ushort *)latch->parent, 0, sizeof(RWLock));
2445 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2450 for( hashidx = 0; hashidx < bt->mgr->latchhash; hashidx++ ) {
2451 if( *(ushort *)(bt->mgr->hashtable[hashidx].latch) )
2452 fprintf(stderr, "hash entry %d locked\n", hashidx);
2454 *(ushort *)(bt->mgr->hashtable[hashidx].latch) = 0;
2456 if( idx = bt->mgr->hashtable[hashidx].slot ) do {
2457 latch = bt->mgr->latchsets + idx;
2459 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2460 } while( idx = latch->next );
2463 page_no = LEAF_page;
2465 while( page_no < bt_getid(bt->mgr->pagezero->alloc->right) ) {
2466 uid off = page_no << bt->mgr->page_bits;
2468 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2472 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2474 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2475 return bt->err = BTERR_map;
2477 if( *amt < bt->mgr->page_size )
2478 return bt->err = BTERR_map;
2480 if( !bt->frame->free && !bt->frame->lvl )
2481 cnt += bt->frame->act;
2485 cnt--; // remove stopper key
2486 fprintf(stderr, " Total keys read %d\n", cnt);
2500 // standalone program to index file of keys
2501 // then list them onto std-out
2504 void *index_file (void *arg)
2506 uint __stdcall index_file (void *arg)
2509 int line = 0, found = 0, cnt = 0, unique;
2510 uid next, page_no = LEAF_page; // start on first page of leaves
2511 unsigned char key[BT_maxkey];
2512 ThreadArg *args = arg;
2513 int ch, len = 0, slot;
2520 bt = bt_open (args->mgr);
2522 unique = (args->type[1] | 0x20) != 'd';
2524 switch(args->type[0] | 0x20)
2527 fprintf(stderr, "started latch mgr audit\n");
2528 cnt = bt_latchaudit (bt);
2529 fprintf(stderr, "finished latch mgr audit, found %d keys\n", cnt);
2533 fprintf(stderr, "started pennysort for %s\n", args->infile);
2534 if( in = fopen (args->infile, "rb") )
2535 while( ch = getc(in), ch != EOF )
2540 if( bt_insertkey (bt, key, 10, 0, key + 10, len - 10, unique) )
2541 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2544 else if( len < BT_maxkey )
2546 fprintf(stderr, "finished %s for %d keys: %d reads %d writes\n", args->infile, line, bt->reads, bt->writes);
2550 fprintf(stderr, "started indexing for %s\n", args->infile);
2551 if( in = fopen (args->infile, "rb") )
2552 while( ch = getc(in), ch != EOF )
2557 if( args->num == 1 )
2558 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2560 else if( args->num )
2561 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2563 if( bt_insertkey (bt, key, len, 0, NULL, 0, unique) )
2564 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2567 else if( len < BT_maxkey )
2569 fprintf(stderr, "finished %s for %d keys: %d reads %d writes\n", args->infile, line, bt->reads, bt->writes);
2573 fprintf(stderr, "started deleting keys for %s\n", args->infile);
2574 if( in = fopen (args->infile, "rb") )
2575 while( ch = getc(in), ch != EOF )
2579 if( args->num == 1 )
2580 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2582 else if( args->num )
2583 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2585 if( bt_findkey (bt, key, len, NULL, 0) < 0 )
2586 fprintf(stderr, "Cannot find key for Line: %d\n", line), exit(0);
2587 ptr = (BtKey*)(bt->key);
2590 if( bt_deletekey (bt, ptr->key, ptr->len, 0) )
2591 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2594 else if( len < BT_maxkey )
2596 fprintf(stderr, "finished %s for %d keys, %d found: %d reads %d writes\n", args->infile, line, found, bt->reads, bt->writes);
2600 fprintf(stderr, "started finding keys for %s\n", args->infile);
2601 if( in = fopen (args->infile, "rb") )
2602 while( ch = getc(in), ch != EOF )
2606 if( args->num == 1 )
2607 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2609 else if( args->num )
2610 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2612 if( bt_findkey (bt, key, len, NULL, 0) == 0 )
2615 fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
2618 else if( len < BT_maxkey )
2620 fprintf(stderr, "finished %s for %d keys, found %d: %d reads %d writes\n", args->infile, line, found, bt->reads, bt->writes);
2624 fprintf(stderr, "started scanning\n");
2626 if( set->latch = bt_pinlatch (bt, page_no, 1) )
2627 set->page = bt_mappage (bt, set->latch);
2629 fprintf(stderr, "unable to obtain latch"), exit(1);
2630 bt_lockpage (BtLockRead, set->latch);
2631 next = bt_getid (set->page->right);
2633 for( slot = 0; slot++ < set->page->cnt; )
2634 if( next || slot < set->page->cnt )
2635 if( !slotptr(set->page, slot)->dead ) {
2636 ptr = keyptr(set->page, slot);
2639 if( slotptr(set->page, slot)->type == Duplicate )
2642 fwrite (ptr->key, len, 1, stdout);
2643 val = valptr(set->page, slot);
2644 fwrite (val->value, val->len, 1, stdout);
2645 fputc ('\n', stdout);
2649 bt_unlockpage (BtLockRead, set->latch);
2650 bt_unpinlatch (set->latch);
2651 } while( page_no = next );
2653 fprintf(stderr, " Total keys read %d: %d reads, %d writes\n", cnt, bt->reads, bt->writes);
2657 fprintf(stderr, "started reverse scan\n");
2658 if( slot = bt_lastkey (bt) )
2659 while( slot = bt_prevkey (bt, slot) ) {
2660 if( slotptr(bt->cursor, slot)->dead )
2663 ptr = keyptr(bt->cursor, slot);
2666 if( slotptr(bt->cursor, slot)->type == Duplicate )
2669 fwrite (ptr->key, len, 1, stdout);
2670 val = valptr(bt->cursor, slot);
2671 fwrite (val->value, val->len, 1, stdout);
2672 fputc ('\n', stdout);
2676 fprintf(stderr, " Total keys read %d: %d reads, %d writes\n", cnt, bt->reads, bt->writes);
2681 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2683 fprintf(stderr, "started counting\n");
2684 page_no = LEAF_page;
2686 while( page_no < bt_getid(bt->mgr->pagezero->alloc->right) ) {
2687 if( bt_readpage (bt->mgr, bt->frame, page_no) )
2690 if( !bt->frame->free && !bt->frame->lvl )
2691 cnt += bt->frame->act;
2697 cnt--; // remove stopper key
2698 fprintf(stderr, " Total keys read %d: %d reads, %d writes\n", cnt, bt->reads, bt->writes);
2710 typedef struct timeval timer;
2712 int main (int argc, char **argv)
2714 int idx, cnt, len, slot, err;
2715 int segsize, bits = 16;
2732 fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits buffer_pool_size line_numbers src_file1 src_file2 ... ]\n", argv[0]);
2733 fprintf (stderr, " where page_bits is the page size in bits\n");
2734 fprintf (stderr, " buffer_pool_size is the number of pages in buffer pool\n");
2735 fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n");
2736 fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n");
2740 start = getCpuTime(0);
2743 bits = atoi(argv[3]);
2746 poolsize = atoi(argv[4]);
2749 fprintf (stderr, "Warning: no mapped_pool\n");
2752 num = atoi(argv[5]);
2756 threads = malloc (cnt * sizeof(pthread_t));
2758 threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE));
2760 args = malloc (cnt * sizeof(ThreadArg));
2762 mgr = bt_mgr ((argv[1]), bits, poolsize);
2765 fprintf(stderr, "Index Open Error %s\n", argv[1]);
2771 for( idx = 0; idx < cnt; idx++ ) {
2772 args[idx].infile = argv[idx + 6];
2773 args[idx].type = argv[2];
2774 args[idx].mgr = mgr;
2775 args[idx].num = num;
2776 args[idx].idx = idx;
2778 if( err = pthread_create (threads + idx, NULL, index_file, args + idx) )
2779 fprintf(stderr, "Error creating thread %d\n", err);
2781 threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL);
2785 // wait for termination
2788 for( idx = 0; idx < cnt; idx++ )
2789 pthread_join (threads[idx], NULL);
2791 WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
2793 for( idx = 0; idx < cnt; idx++ )
2794 CloseHandle(threads[idx]);
2800 elapsed = getCpuTime(0) - start;
2801 fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2802 elapsed = getCpuTime(1);
2803 fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2804 elapsed = getCpuTime(2);
2805 fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);