1 // btree version threadskv5 sched_yield version
2 // with reworked bt_deletekey code,
3 // phase-fair reader writer lock,
4 // librarian page split code,
5 // duplicate key management
6 // and bi-directional cursors
10 // author: karl malbrain, malbrain@cal.berkeley.edu
13 This work, including the source code, documentation
14 and related data, is placed into the public domain.
16 The orginal author is Karl Malbrain.
18 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
19 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
20 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
21 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
22 RESULTING FROM THE USE, MODIFICATION, OR
23 REDISTRIBUTION OF THIS SOFTWARE.
26 // Please see the project home page for documentation
27 // code.google.com/p/high-concurrency-btree
29 #define _FILE_OFFSET_BITS 64
30 #define _LARGEFILE64_SOURCE
46 #define WIN32_LEAN_AND_MEAN
60 typedef unsigned long long uid;
63 typedef unsigned long long off64_t;
64 typedef unsigned short ushort;
65 typedef unsigned int uint;
68 #define BT_latchtable 128 // number of latch manager slots
70 #define BT_ro 0x6f72 // ro
71 #define BT_rw 0x7772 // rw
73 #define BT_maxbits 24 // maximum page size in bits
74 #define BT_minbits 9 // minimum page size in bits
75 #define BT_minpage (1 << BT_minbits) // minimum page size
76 #define BT_maxpage (1 << BT_maxbits) // maximum page size
79 There are five lock types for each node in three independent sets:
80 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
81 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
82 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
83 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
84 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
95 // definition for phase-fair reader/writer lock implementation
109 // definition for spin latch implementation
111 // exclusive is set for write access
112 // share is count of read accessors
113 // grant write lock when share == 0
115 volatile typedef struct {
126 // hash table entries
129 BtSpinLatch latch[1];
130 volatile ushort slot; // Latch table entry at head of chain
133 // latch manager table structure
136 RWLock readwr[1]; // read/write page lock
137 RWLock access[1]; // Access Intent/Page delete
138 RWLock parent[1]; // Posting of fence key in parent
139 BtSpinLatch busy[1]; // slot is being moved between chains
140 volatile ushort next; // next entry in hash table chain
141 volatile ushort prev; // prev entry in hash table chain
142 volatile ushort pin; // number of outstanding locks
143 volatile ushort hash; // hash slot entry is under
144 volatile uid page_no; // latch set page number
147 // Define the length of the page record numbers
151 // Page key slot definition.
153 // Keys are marked dead, but remain on the page until
154 // it cleanup is called. The fence key (highest key) for
155 // a leaf page is always present, even after cleanup.
159 // In addition to the Unique keys that occupy slots
160 // there are Librarian and Duplicate key
161 // slots occupying the key slot array.
163 // The Librarian slots are dead keys that
164 // serve as filler, available to add new Unique
165 // or Dup slots that are inserted into the B-tree.
167 // The Duplicate slots have had their key bytes extended
168 // by 6 bytes to contain a binary duplicate key uniqueifier.
177 uint off:BT_maxbits; // page offset for key start
178 uint type:3; // type of slot
179 uint dead:1; // set for deleted slot
182 // The key structure occupies space at the upper end of
183 // each page. It's a length byte followed by the key
187 unsigned char len; // this can be changed to a ushort or uint
188 unsigned char key[0];
191 // the value structure also occupies space at the upper
192 // end of the page. Each key is immediately followed by a value.
195 unsigned char len; // this can be changed to a ushort or uint
196 unsigned char value[0];
199 #define BT_maxkey 255 // maximum number of bytes in a key
200 #define BT_keyarray (BT_maxkey + sizeof(BtKey))
202 // The first part of an index page.
203 // It is immediately followed
204 // by the BtSlot array of keys.
206 // note that this structure size
207 // must be a multiple of 8 bytes
208 // in order to place dups correctly.
210 typedef struct BtPage_ {
211 uint cnt; // count of keys in page
212 uint act; // count of active keys
213 uint min; // next key offset
214 uint garbage; // page garbage in bytes
215 unsigned char bits:7; // page size in bits
216 unsigned char free:1; // page is on free chain
217 unsigned char lvl:7; // level of page
218 unsigned char kill:1; // page is being deleted
219 unsigned char left[BtId]; // page number to left
220 unsigned char filler[2]; // padding to multiple of 8
221 unsigned char right[BtId]; // page number to right
224 // The memory mapping pool table buffer manager entry
227 uid basepage; // mapped base page number
228 char *map; // mapped memory pointer
229 ushort slot; // slot index in this array
230 ushort pin; // mapped page pin counter
231 void *hashprev; // previous pool entry for the same hash idx
232 void *hashnext; // next pool entry for the same hash idx
234 HANDLE hmap; // Windows memory mapping handle
238 #define CLOCK_bit 0x8000 // bit in pool->pin
240 // The loadpage interface object
243 uid page_no; // current page number
244 BtPage page; // current page pointer
245 BtPool *pool; // current page pool
246 BtLatchSet *latch; // current page latch set
249 // structure for latch manager on ALLOC_page
252 struct BtPage_ alloc[1]; // next page_no in right ptr
253 unsigned long long dups[1]; // global duplicate key uniqueifier
254 unsigned char chain[BtId]; // head of free page_nos chain
255 BtSpinLatch lock[1]; // allocation area lite latch
256 ushort latchdeployed; // highest number of latch entries deployed
257 ushort nlatchpage; // number of latch pages at BT_latch
258 ushort latchtotal; // number of page latch entries
259 ushort latchhash; // number of latch hash table slots
260 ushort latchvictim; // next latch entry to examine
261 BtHashEntry table[0]; // the hash table
264 // The object structure for Btree access
267 uint page_size; // page size
268 uint page_bits; // page size in bits
269 uint seg_bits; // seg size in pages in bits
270 uint mode; // read-write mode
276 ushort poolcnt; // highest page pool node in use
277 ushort poolmax; // highest page pool node allocated
278 ushort poolmask; // total number of pages in mmap segment - 1
279 ushort hashsize; // size of Hash Table for pool entries
280 volatile uint evicted; // last evicted hash table slot
281 ushort *hash; // pool index for hash entries
282 BtSpinLatch *latch; // latches for hash table slots
283 BtLatchMgr *latchmgr; // mapped latch page from allocation page
284 BtLatchSet *latchsets; // mapped latch set from latch pages
285 BtPool *pool; // memory pool page segments
287 HANDLE halloc; // allocation and latch table handle
292 BtMgr *mgr; // buffer manager for thread
293 BtPage cursor; // cached frame for start/next (never mapped)
294 BtPage frame; // spare frame for the page split (never mapped)
295 uid cursor_page; // current cursor page number
296 unsigned char *mem; // frame, cursor, page memory buffer
297 unsigned char key[BT_keyarray]; // last found complete key
298 int found; // last delete or insert was found
299 int err; // last error
313 extern void bt_close (BtDb *bt);
314 extern BtDb *bt_open (BtMgr *mgr);
315 extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, void *value, uint vallen, uint update);
316 extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl);
317 extern int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax);
318 extern BtKey *bt_foundkey (BtDb *bt);
319 extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len);
320 extern uint bt_nextkey (BtDb *bt, uint slot);
323 extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize);
324 void bt_mgrclose (BtMgr *mgr);
326 // Helper functions to return slot values
327 // from the cursor page.
329 extern BtKey *bt_key (BtDb *bt, uint slot);
330 extern BtVal *bt_val (BtDb *bt, uint slot);
332 // BTree page number constants
333 #define ALLOC_page 0 // allocation & latch manager hash table
334 #define ROOT_page 1 // root of the btree
335 #define LEAF_page 2 // first page of leaves
336 #define LATCH_page 3 // pages for latch manager
338 // Number of levels to create in a new BTree
342 // The page is allocated from low and hi ends.
343 // The key slots are allocated from the bottom,
344 // while the text and value of the key
345 // are allocated from the top. When the two
346 // areas meet, the page is split into two.
348 // A key consists of a length byte, two bytes of
349 // index number (0 - 65534), and up to 253 bytes
352 // Associated with each key is a value byte string
353 // containing any value desired.
355 // The b-tree root is always located at page 1.
356 // The first leaf page of level zero is always
357 // located on page 2.
359 // The b-tree pages are linked with next
360 // pointers to facilitate enumerators,
361 // and provide for concurrency.
363 // When to root page fills, it is split in two and
364 // the tree height is raised by a new root at page
365 // one with two keys.
367 // Deleted keys are marked with a dead bit until
368 // page cleanup. The fence key for a leaf node is
371 // Groups of pages called segments from the btree are optionally
372 // cached with a memory mapped pool. A hash table is used to keep
373 // track of the cached segments. This behaviour is controlled
374 // by the cache block size parameter to bt_open.
376 // To achieve maximum concurrency one page is locked at a time
377 // as the tree is traversed to find leaf key in question. The right
378 // page numbers are used in cases where the page is being split,
381 // Page 0 is dedicated to lock for new page extensions,
382 // and chains empty pages together for reuse. It also
383 // contains the latch manager hash table.
385 // The ParentModification lock on a node is obtained to serialize posting
386 // or changing the fence key for a node.
388 // Empty pages are chained together through the ALLOC page and reused.
390 // Access macros to address slot and key values from the page
391 // Page slots use 1 based indexing.
393 #define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1))
394 #define keyptr(page, slot) ((BtKey*)((unsigned char*)(page) + slotptr(page, slot)->off))
395 #define valptr(page, slot) ((BtVal*)(keyptr(page,slot)->key + keyptr(page,slot)->len))
397 void bt_putid(unsigned char *dest, uid id)
402 dest[i] = (unsigned char)id, id >>= 8;
405 uid bt_getid(unsigned char *src)
410 for( i = 0; i < BtId; i++ )
411 id <<= 8, id |= *src++;
416 uid bt_newdup (BtDb *bt)
419 return __sync_fetch_and_add (bt->mgr->latchmgr->dups, 1) + 1;
421 return _InterlockedIncrement64(bt->mgr->latchmgr->dups, 1);
425 // Phase-Fair reader/writer lock implementation
427 void WriteLock (RWLock *lock)
432 tix = __sync_fetch_and_add (lock->ticket, 1);
434 tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
436 // wait for our ticket to come up
438 while( tix != lock->serving[0] )
445 w = PRES | (tix & PHID);
447 r = __sync_fetch_and_add (lock->rin, w);
449 r = _InterlockedExchangeAdd16 (lock->rin, w);
451 while( r != *lock->rout )
459 void WriteRelease (RWLock *lock)
462 __sync_fetch_and_and (lock->rin, ~MASK);
464 _InterlockedAnd16 (lock->rin, ~MASK);
469 void ReadLock (RWLock *lock)
473 w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
475 w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
478 while( w == (*lock->rin & MASK) )
486 void ReadRelease (RWLock *lock)
489 __sync_fetch_and_add (lock->rout, RINC);
491 _InterlockedExchangeAdd16 (lock->rout, RINC);
495 // Spin Latch Manager
497 // wait until write lock mode is clear
498 // and add 1 to the share count
500 void bt_spinreadlock(BtSpinLatch *latch)
506 prev = __sync_fetch_and_add ((ushort *)latch, SHARE);
508 prev = _InterlockedExchangeAdd16((ushort *)latch, SHARE);
510 // see if exclusive request is granted or pending
515 prev = __sync_fetch_and_add ((ushort *)latch, -SHARE);
517 prev = _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
520 } while( sched_yield(), 1 );
522 } while( SwitchToThread(), 1 );
526 // wait for other read and write latches to relinquish
528 void bt_spinwritelock(BtSpinLatch *latch)
534 prev = __sync_fetch_and_or((ushort *)latch, PEND | XCL);
536 prev = _InterlockedOr16((ushort *)latch, PEND | XCL);
539 if( !(prev & ~BOTH) )
543 __sync_fetch_and_and ((ushort *)latch, ~XCL);
545 _InterlockedAnd16((ushort *)latch, ~XCL);
548 } while( sched_yield(), 1 );
550 } while( SwitchToThread(), 1 );
554 // try to obtain write lock
556 // return 1 if obtained,
559 int bt_spinwritetry(BtSpinLatch *latch)
564 prev = __sync_fetch_and_or((ushort *)latch, XCL);
566 prev = _InterlockedOr16((ushort *)latch, XCL);
568 // take write access if all bits are clear
571 if( !(prev & ~BOTH) )
575 __sync_fetch_and_and ((ushort *)latch, ~XCL);
577 _InterlockedAnd16((ushort *)latch, ~XCL);
584 void bt_spinreleasewrite(BtSpinLatch *latch)
587 __sync_fetch_and_and((ushort *)latch, ~BOTH);
589 _InterlockedAnd16((ushort *)latch, ~BOTH);
593 // decrement reader count
595 void bt_spinreleaseread(BtSpinLatch *latch)
598 __sync_fetch_and_add((ushort *)latch, -SHARE);
600 _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
604 // link latch table entry into latch hash table
606 void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no)
608 BtLatchSet *set = bt->mgr->latchsets + victim;
610 if( set->next = bt->mgr->latchmgr->table[hashidx].slot )
611 bt->mgr->latchsets[set->next].prev = victim;
613 bt->mgr->latchmgr->table[hashidx].slot = victim;
614 set->page_no = page_no;
621 void bt_unpinlatch (BtLatchSet *set)
624 __sync_fetch_and_add(&set->pin, -1);
626 _InterlockedDecrement16 (&set->pin);
630 // find existing latchset or inspire new one
631 // return with latchset pinned
633 BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no)
635 ushort hashidx = page_no % bt->mgr->latchmgr->latchhash;
636 ushort slot, avail = 0, victim, idx;
639 // obtain read lock on hash table entry
641 bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch);
643 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
645 set = bt->mgr->latchsets + slot;
646 if( page_no == set->page_no )
648 } while( slot = set->next );
652 __sync_fetch_and_add(&set->pin, 1);
654 _InterlockedIncrement16 (&set->pin);
658 bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch);
663 // try again, this time with write lock
665 bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch);
667 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
669 set = bt->mgr->latchsets + slot;
670 if( page_no == set->page_no )
672 if( !set->pin && !avail )
674 } while( slot = set->next );
676 // found our entry, or take over an unpinned one
678 if( slot || (slot = avail) ) {
679 set = bt->mgr->latchsets + slot;
681 __sync_fetch_and_add(&set->pin, 1);
683 _InterlockedIncrement16 (&set->pin);
685 set->page_no = page_no;
686 bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch);
690 // see if there are any unused entries
692 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, 1) + 1;
694 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed);
697 if( victim < bt->mgr->latchmgr->latchtotal ) {
698 set = bt->mgr->latchsets + victim;
700 __sync_fetch_and_add(&set->pin, 1);
702 _InterlockedIncrement16 (&set->pin);
704 bt_latchlink (bt, hashidx, victim, page_no);
705 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
710 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1);
712 victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed);
714 // find and reuse previous lock entry
718 victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1);
720 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1;
722 // we don't use slot zero
724 if( victim %= bt->mgr->latchmgr->latchtotal )
725 set = bt->mgr->latchsets + victim;
729 // take control of our slot
730 // from other threads
732 if( set->pin || !bt_spinwritetry (set->busy) )
737 // try to get write lock on hash chain
738 // skip entry if not obtained
739 // or has outstanding locks
741 if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) {
742 bt_spinreleasewrite (set->busy);
747 bt_spinreleasewrite (set->busy);
748 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
752 // unlink our available victim from its hash chain
755 bt->mgr->latchsets[set->prev].next = set->next;
757 bt->mgr->latchmgr->table[idx].slot = set->next;
760 bt->mgr->latchsets[set->next].prev = set->prev;
762 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
764 __sync_fetch_and_add(&set->pin, 1);
766 _InterlockedIncrement16 (&set->pin);
768 bt_latchlink (bt, hashidx, victim, page_no);
769 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
770 bt_spinreleasewrite (set->busy);
775 void bt_mgrclose (BtMgr *mgr)
780 // release mapped pages
781 // note that slot zero is never used
783 for( slot = 1; slot < mgr->poolmax; slot++ ) {
784 pool = mgr->pool + slot;
787 munmap (pool->map, (uid)(mgr->poolmask+1) << mgr->page_bits);
790 FlushViewOfFile(pool->map, 0);
791 UnmapViewOfFile(pool->map);
792 CloseHandle(pool->hmap);
798 munmap (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
799 munmap (mgr->latchmgr, 2 * mgr->page_size);
801 FlushViewOfFile(mgr->latchmgr, 0);
802 UnmapViewOfFile(mgr->latchmgr);
803 CloseHandle(mgr->halloc);
809 free ((void *)mgr->latch);
812 FlushFileBuffers(mgr->idx);
813 CloseHandle(mgr->idx);
814 GlobalFree (mgr->pool);
815 GlobalFree (mgr->hash);
816 GlobalFree ((void *)mgr->latch);
821 // close and release memory
823 void bt_close (BtDb *bt)
830 VirtualFree (bt->mem, 0, MEM_RELEASE);
835 // open/create new btree buffer manager
837 // call with file_name, BT_openmode, bits in page size (e.g. 16),
838 // size of mapped page pool (e.g. 8192)
840 BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize)
842 uint lvl, attr, cacheblk, last, slot, idx;
843 uint nlatchpage, latchhash;
844 unsigned char value[BtId];
845 int flag, initit = 0;
846 BtLatchMgr *latchmgr;
854 SYSTEM_INFO sysinfo[1];
857 // determine sanity of page size and buffer pool
859 if( bits > BT_maxbits )
861 else if( bits < BT_minbits )
865 return NULL; // must have buffer pool
868 mgr = calloc (1, sizeof(BtMgr));
870 mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
873 return free(mgr), NULL;
875 cacheblk = 4096; // minimum mmap segment size for unix
878 mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));
879 attr = FILE_ATTRIBUTE_NORMAL;
880 mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
882 if( mgr->idx == INVALID_HANDLE_VALUE )
883 return GlobalFree(mgr), NULL;
885 // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity
886 GetSystemInfo(sysinfo);
887 cacheblk = sysinfo->dwAllocationGranularity;
891 latchmgr = malloc (BT_maxpage);
894 // read minimum page size to get root info
895 // to support raw disk partition files
896 // check if bits == 0 on the disk.
898 if( size = lseek (mgr->idx, 0L, 2) ) {
899 if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage )
900 if( latchmgr->alloc->bits )
901 bits = latchmgr->alloc->bits;
905 return free(mgr), free(latchmgr), NULL;
906 } else if( mode == BT_ro )
907 return free(latchmgr), bt_mgrclose (mgr), NULL;
909 latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);
910 size = GetFileSize(mgr->idx, amt);
913 if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) )
914 return bt_mgrclose (mgr), NULL;
915 bits = latchmgr->alloc->bits;
916 } else if( mode == BT_ro )
917 return bt_mgrclose (mgr), NULL;
922 mgr->page_size = 1 << bits;
923 mgr->page_bits = bits;
925 mgr->poolmax = poolmax;
928 if( cacheblk < mgr->page_size )
929 cacheblk = mgr->page_size;
931 // mask for partial memmaps
933 mgr->poolmask = (cacheblk >> bits) - 1;
935 // see if requested size of pages per memmap is greater
937 if( (1 << segsize) > mgr->poolmask )
938 mgr->poolmask = (1 << segsize) - 1;
942 while( (1 << mgr->seg_bits) <= mgr->poolmask )
945 mgr->hashsize = hashsize;
948 mgr->pool = calloc (poolmax, sizeof(BtPool));
949 mgr->hash = calloc (hashsize, sizeof(ushort));
950 mgr->latch = calloc (hashsize, sizeof(BtSpinLatch));
952 mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool));
953 mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort));
954 mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch));
960 // initialize an empty b-tree with latch page, root page, page of leaves
961 // and page(s) of latches
963 memset (latchmgr, 0, 1 << bits);
964 nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1;
965 bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage);
966 latchmgr->alloc->bits = mgr->page_bits;
968 latchmgr->nlatchpage = nlatchpage;
969 latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet));
971 // initialize latch manager
973 latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry);
975 // size of hash table = total number of latchsets
977 if( latchhash > latchmgr->latchtotal )
978 latchhash = latchmgr->latchtotal;
980 latchmgr->latchhash = latchhash;
982 // initialize left-most LEAF page in
985 bt_putid (latchmgr->alloc->left, LEAF_page);
988 if( pwrite (mgr->idx, latchmgr, mgr->page_size, 0) < mgr->page_size )
989 return bt_mgrclose (mgr), NULL;
991 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
992 return bt_mgrclose (mgr), NULL;
994 if( *amt < mgr->page_size )
995 return bt_mgrclose (mgr), NULL;
998 memset (latchmgr, 0, 1 << bits);
999 latchmgr->alloc->bits = mgr->page_bits;
1001 for( lvl=MIN_lvl; lvl--; ) {
1002 slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3 - (lvl ? BtId + sizeof(BtVal): sizeof(BtVal));
1003 key = keyptr(latchmgr->alloc, 1);
1004 key->len = 2; // create stopper key
1008 bt_putid(value, MIN_lvl - lvl + 1);
1009 val = valptr(latchmgr->alloc, 1);
1010 val->len = lvl ? BtId : 0;
1011 memcpy (val->value, value, val->len);
1013 latchmgr->alloc->min = slotptr(latchmgr->alloc, 1)->off;
1014 latchmgr->alloc->lvl = lvl;
1015 latchmgr->alloc->cnt = 1;
1016 latchmgr->alloc->act = 1;
1018 if( pwrite (mgr->idx, latchmgr, mgr->page_size, (MIN_lvl - lvl) << bits) < mgr->page_size )
1019 return bt_mgrclose (mgr), NULL;
1021 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1022 return bt_mgrclose (mgr), NULL;
1024 if( *amt < mgr->page_size )
1025 return bt_mgrclose (mgr), NULL;
1029 // clear out latch manager locks
1030 // and rest of pages to round out segment
1032 memset(latchmgr, 0, mgr->page_size);
1035 while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) {
1037 pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits);
1039 SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN);
1040 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1041 return bt_mgrclose (mgr), NULL;
1042 if( *amt < mgr->page_size )
1043 return bt_mgrclose (mgr), NULL;
1050 // mlock the root page and the latchmgr page
1052 flag = PROT_READ | PROT_WRITE;
1053 mgr->latchmgr = mmap (0, 2 * mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size);
1054 if( mgr->latchmgr == MAP_FAILED )
1055 return bt_mgrclose (mgr), NULL;
1056 mlock (mgr->latchmgr, 2 * mgr->page_size);
1058 mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size);
1059 if( mgr->latchsets == MAP_FAILED )
1060 return bt_mgrclose (mgr), NULL;
1061 mlock (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
1063 flag = PAGE_READWRITE;
1064 mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, NULL);
1066 return bt_mgrclose (mgr), NULL;
1068 flag = FILE_MAP_WRITE;
1069 mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size);
1070 if( !mgr->latchmgr )
1071 return GetLastError(), bt_mgrclose (mgr), NULL;
1073 mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size);
1079 VirtualFree (latchmgr, 0, MEM_RELEASE);
1084 // open BTree access method
1085 // based on buffer manager
1087 BtDb *bt_open (BtMgr *mgr)
1089 BtDb *bt = malloc (sizeof(*bt));
1091 memset (bt, 0, sizeof(*bt));
1094 bt->mem = malloc (2 *mgr->page_size);
1096 bt->mem = VirtualAlloc(NULL, 2 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE);
1098 bt->frame = (BtPage)bt->mem;
1099 bt->cursor = (BtPage)(bt->mem + 1 * mgr->page_size);
1103 // compare two keys, returning > 0, = 0, or < 0
1104 // as the comparison value
1106 int keycmp (BtKey* key1, unsigned char *key2, uint len2)
1108 uint len1 = key1->len;
1111 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
1124 // find segment in pool
1125 // must be called with hashslot idx locked
1126 // return NULL if not there
1127 // otherwise return node
1129 BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx)
1134 // compute start of hash chain in pool
1136 if( slot = bt->mgr->hash[idx] )
1137 pool = bt->mgr->pool + slot;
1141 page_no &= ~bt->mgr->poolmask;
1143 while( pool->basepage != page_no )
1144 if( pool = pool->hashnext )
1152 // add segment to hash table
1154 void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx)
1159 pool->hashprev = pool->hashnext = NULL;
1160 pool->basepage = page_no & ~bt->mgr->poolmask;
1161 pool->pin = CLOCK_bit + 1;
1163 if( slot = bt->mgr->hash[idx] ) {
1164 node = bt->mgr->pool + slot;
1165 pool->hashnext = node;
1166 node->hashprev = pool;
1169 bt->mgr->hash[idx] = pool->slot;
1172 // map new buffer pool segment to virtual memory
1174 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
1176 off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits;
1177 off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits);
1181 flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE );
1182 pool->map = mmap (0, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off);
1183 if( pool->map == MAP_FAILED )
1184 return bt->err = BTERR_map;
1187 flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE );
1188 pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL);
1190 return bt->err = BTERR_map;
1192 flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE );
1193 pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1195 return bt->err = BTERR_map;
1200 // calculate page within pool
1202 BtPage bt_page (BtDb *bt, BtPool *pool, uid page_no)
1204 uint subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping
1207 page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits));
1208 // madvise (page, bt->mgr->page_size, MADV_WILLNEED);
1214 void bt_unpinpool (BtPool *pool)
1217 __sync_fetch_and_add(&pool->pin, -1);
1219 _InterlockedDecrement16 (&pool->pin);
1223 // find or place requested page in segment-pool
1224 // return pool table entry, incrementing pin
1226 BtPool *bt_pinpool(BtDb *bt, uid page_no)
1228 uint slot, hashidx, idx, victim;
1229 BtPool *pool, *node, *next;
1231 // lock hash table chain
1233 hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1234 bt_spinwritelock (&bt->mgr->latch[hashidx]);
1236 // look up in hash table
1238 if( pool = bt_findpool(bt, page_no, hashidx) ) {
1240 __sync_fetch_and_or(&pool->pin, CLOCK_bit);
1241 __sync_fetch_and_add(&pool->pin, 1);
1243 _InterlockedOr16 (&pool->pin, CLOCK_bit);
1244 _InterlockedIncrement16 (&pool->pin);
1246 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1250 // allocate a new pool node
1251 // and add to hash table
1254 slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1);
1256 slot = _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1;
1259 if( ++slot < bt->mgr->poolmax ) {
1260 pool = bt->mgr->pool + slot;
1263 if( bt_mapsegment(bt, pool, page_no) )
1266 bt_linkhash(bt, pool, page_no, hashidx);
1267 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1271 // pool table is full
1272 // find best pool entry to evict
1275 __sync_fetch_and_add(&bt->mgr->poolcnt, -1);
1277 _InterlockedDecrement16 (&bt->mgr->poolcnt);
1282 victim = __sync_fetch_and_add(&bt->mgr->evicted, 1);
1284 victim = _InterlockedIncrement (&bt->mgr->evicted) - 1;
1286 victim %= bt->mgr->poolmax;
1287 pool = bt->mgr->pool + victim;
1288 idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1293 // try to get write lock
1294 // skip entry if not obtained
1296 if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
1299 // skip this entry if
1301 // or clock bit is set
1305 __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
1307 _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
1309 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1313 // unlink victim pool node from hash table
1315 if( node = pool->hashprev )
1316 node->hashnext = pool->hashnext;
1317 else if( node = pool->hashnext )
1318 bt->mgr->hash[idx] = node->slot;
1320 bt->mgr->hash[idx] = 0;
1322 if( node = pool->hashnext )
1323 node->hashprev = pool->hashprev;
1325 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1327 // remove old file mapping
1329 munmap (pool->map, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1331 // FlushViewOfFile(pool->map, 0);
1332 UnmapViewOfFile(pool->map);
1333 CloseHandle(pool->hmap);
1337 // create new pool mapping
1338 // and link into hash table
1340 if( bt_mapsegment(bt, pool, page_no) )
1343 bt_linkhash(bt, pool, page_no, hashidx);
1344 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1349 // place write, read, or parent lock on requested page_no.
1351 void bt_lockpage(BtLock mode, BtLatchSet *set)
1355 ReadLock (set->readwr);
1358 WriteLock (set->readwr);
1361 ReadLock (set->access);
1364 WriteLock (set->access);
1367 WriteLock (set->parent);
1372 // remove write, read, or parent lock on requested page
1374 void bt_unlockpage(BtLock mode, BtLatchSet *set)
1378 ReadRelease (set->readwr);
1381 WriteRelease (set->readwr);
1384 ReadRelease (set->access);
1387 WriteRelease (set->access);
1390 WriteRelease (set->parent);
1395 // allocate a new page
1397 BTERR bt_newpage(BtDb *bt, BtPageSet *set)
1401 // lock allocation page
1403 bt_spinwritelock(bt->mgr->latchmgr->lock);
1405 // use empty chain first
1406 // else allocate empty page
1408 if( set->page_no = bt_getid(bt->mgr->latchmgr->chain) ) {
1409 if( set->pool = bt_pinpool (bt, set->page_no) )
1410 set->page = bt_page (bt, set->pool, set->page_no);
1412 return bt->err = BTERR_struct;
1414 bt_putid(bt->mgr->latchmgr->chain, bt_getid(set->page->right));
1416 set->page_no = bt_getid(bt->mgr->latchmgr->alloc->right);
1417 bt_putid(bt->mgr->latchmgr->alloc->right, set->page_no+1);
1419 // if writing first page of pool block, set file length thru last page
1421 if( (set->page_no & bt->mgr->poolmask) == 0 )
1422 ftruncate (bt->mgr->idx, (set->page_no + bt->mgr->poolmask + 1) << bt->mgr->page_bits);
1424 if( set->pool = bt_pinpool (bt, set->page_no) )
1425 set->page = bt_page (bt, set->pool, set->page_no);
1430 // unlock allocation latch
1432 bt_spinreleasewrite(bt->mgr->latchmgr->lock);
1436 // find slot in page for given key at a given level
1438 int bt_findslot (BtPageSet *set, unsigned char *key, uint len)
1440 uint diff, higher = set->page->cnt, low = 1, slot;
1443 // make stopper key an infinite fence value
1445 if( bt_getid (set->page->right) )
1450 // low is the lowest candidate.
1451 // loop ends when they meet
1453 // higher is already
1454 // tested as .ge. the passed key.
1456 while( diff = higher - low ) {
1457 slot = low + ( diff >> 1 );
1458 if( keycmp (keyptr(set->page, slot), key, len) < 0 )
1461 higher = slot, good++;
1464 // return zero if key is on right link page
1466 return good ? higher : 0;
1469 // find and load page at given level for given key
1470 // leave page rd or wr locked as requested
1472 int bt_loadpage (BtDb *bt, BtPageSet *set, unsigned char *key, uint len, uint lvl, BtLock lock)
1474 uid page_no = ROOT_page, prevpage = 0;
1475 uint drill = 0xff, slot;
1476 BtLatchSet *prevlatch;
1477 uint mode, prevmode;
1480 // start at root of btree and drill down
1483 // determine lock mode of drill level
1484 mode = (drill == lvl) ? lock : BtLockRead;
1486 set->latch = bt_pinlatch (bt, page_no);
1487 set->page_no = page_no;
1489 // pin page contents
1491 if( set->pool = bt_pinpool (bt, page_no) )
1492 set->page = bt_page (bt, set->pool, page_no);
1496 // obtain access lock using lock chaining with Access mode
1498 if( page_no > ROOT_page )
1499 bt_lockpage(BtLockAccess, set->latch);
1501 // release & unpin parent page
1504 bt_unlockpage(prevmode, prevlatch);
1505 bt_unpinlatch (prevlatch);
1506 bt_unpinpool (prevpool);
1510 // obtain read lock using lock chaining
1512 bt_lockpage(mode, set->latch);
1514 if( set->page->free )
1515 return bt->err = BTERR_struct, 0;
1517 if( page_no > ROOT_page )
1518 bt_unlockpage(BtLockAccess, set->latch);
1520 // re-read and re-lock root after determining actual level of root
1522 if( set->page->lvl != drill) {
1523 if( set->page_no != ROOT_page )
1524 return bt->err = BTERR_struct, 0;
1526 drill = set->page->lvl;
1528 if( lock != BtLockRead && drill == lvl ) {
1529 bt_unlockpage(mode, set->latch);
1530 bt_unpinlatch (set->latch);
1531 bt_unpinpool (set->pool);
1536 prevpage = set->page_no;
1537 prevlatch = set->latch;
1538 prevpool = set->pool;
1541 // find key on page at this level
1542 // and descend to requested level
1544 if( set->page->kill )
1547 if( slot = bt_findslot (set, key, len) ) {
1551 while( slotptr(set->page, slot)->dead )
1552 if( slot++ < set->page->cnt )
1557 page_no = bt_getid(valptr(set->page, slot)->value);
1562 // or slide right into next page
1565 page_no = bt_getid(set->page->right);
1569 // return error on end of right chain
1571 bt->err = BTERR_struct;
1572 return 0; // return error
1575 // return page to free list
1576 // page must be delete & write locked
1578 void bt_freepage (BtDb *bt, BtPageSet *set)
1580 // lock allocation page
1582 bt_spinwritelock (bt->mgr->latchmgr->lock);
1585 memcpy(set->page->right, bt->mgr->latchmgr->chain, BtId);
1586 bt_putid(bt->mgr->latchmgr->chain, set->page_no);
1587 set->page->free = 1;
1589 // unlock released page
1591 bt_unlockpage (BtLockDelete, set->latch);
1592 bt_unlockpage (BtLockWrite, set->latch);
1593 bt_unpinlatch (set->latch);
1594 bt_unpinpool (set->pool);
1596 // unlock allocation page
1598 bt_spinreleasewrite (bt->mgr->latchmgr->lock);
1601 // a fence key was deleted from a page
1602 // push new fence value upwards
1604 BTERR bt_fixfence (BtDb *bt, BtPageSet *set, uint lvl)
1606 unsigned char leftkey[BT_keyarray], rightkey[BT_keyarray];
1607 unsigned char value[BtId];
1611 // remove the old fence value
1613 ptr = keyptr(set->page, set->page->cnt);
1614 memcpy (rightkey, ptr, ptr->len + sizeof(BtKey));
1615 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1617 // cache new fence value
1619 ptr = keyptr(set->page, set->page->cnt);
1620 memcpy (leftkey, ptr, ptr->len + sizeof(BtKey));
1622 bt_lockpage (BtLockParent, set->latch);
1623 bt_unlockpage (BtLockWrite, set->latch);
1625 // insert new (now smaller) fence key
1627 bt_putid (value, set->page_no);
1628 ptr = (BtKey*)leftkey;
1630 if( bt_insertkey (bt, ptr->key, ptr->len, lvl+1, value, BtId, 1) )
1633 // now delete old fence key
1635 ptr = (BtKey*)rightkey;
1637 if( bt_deletekey (bt, ptr->key, ptr->len, lvl+1) )
1640 bt_unlockpage (BtLockParent, set->latch);
1641 bt_unpinlatch(set->latch);
1642 bt_unpinpool (set->pool);
1646 // root has a single child
1647 // collapse a level from the tree
1649 BTERR bt_collapseroot (BtDb *bt, BtPageSet *root)
1654 // find the child entry and promote as new root contents
1657 for( idx = 0; idx++ < root->page->cnt; )
1658 if( !slotptr(root->page, idx)->dead )
1661 child->page_no = bt_getid (valptr(root->page, idx)->value);
1663 child->latch = bt_pinlatch (bt, child->page_no);
1664 bt_lockpage (BtLockDelete, child->latch);
1665 bt_lockpage (BtLockWrite, child->latch);
1667 if( child->pool = bt_pinpool (bt, child->page_no) )
1668 child->page = bt_page (bt, child->pool, child->page_no);
1672 memcpy (root->page, child->page, bt->mgr->page_size);
1673 bt_freepage (bt, child);
1675 } while( root->page->lvl > 1 && root->page->act == 1 );
1677 bt_unlockpage (BtLockWrite, root->latch);
1678 bt_unpinlatch (root->latch);
1679 bt_unpinpool (root->pool);
1683 // maintain reverse scan pointers by
1684 // linking left pointer of far right node
1686 BTERR bt_linkleft (BtDb *bt, uid left_page_no, uid right_page_no)
1688 BtPageSet right2[1];
1690 // keep track of rightmost leaf page
1692 if( !right_page_no ) {
1693 bt_putid (bt->mgr->latchmgr->alloc->left, left_page_no);
1697 // link right page to left page
1699 right2->latch = bt_pinlatch (bt, right_page_no);
1700 bt_lockpage (BtLockWrite, right2->latch);
1702 if( right2->pool = bt_pinpool (bt, right_page_no) )
1703 right2->page = bt_page (bt, right2->pool, right_page_no);
1707 bt_putid(right2->page->left, left_page_no);
1708 bt_unlockpage (BtLockWrite, right2->latch);
1709 bt_unpinlatch (right2->latch);
1713 // find and delete key on page by marking delete flag bit
1714 // if page becomes empty, delete it from the btree
1716 BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl)
1718 unsigned char lowerfence[BT_keyarray], higherfence[BT_keyarray];
1719 BtPageSet set[1], right[1], right2[1];
1720 uint slot, idx, found, fence;
1721 unsigned char value[BtId];
1725 if( slot = bt_loadpage (bt, set, key, len, lvl, BtLockWrite) )
1726 ptr = keyptr(set->page, slot);
1730 // if librarian slot, advance to real slot
1732 if( slotptr(set->page, slot)->type == Librarian )
1733 ptr = keyptr(set->page, ++slot);
1735 fence = slot == set->page->cnt;
1737 // if key is found delete it, otherwise ignore request
1739 if( found = !keycmp (ptr, key, len) )
1740 if( found = slotptr(set->page, slot)->dead == 0 ) {
1741 val = valptr(set->page,slot);
1742 slotptr(set->page, slot)->dead = 1;
1743 set->page->garbage += ptr->len + val->len + sizeof(BtKey) + sizeof(BtVal);
1746 // collapse empty slots beneath the fence
1748 while( idx = set->page->cnt - 1 )
1749 if( slotptr(set->page, idx)->dead ) {
1750 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1751 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1756 // did we delete a fence key in an upper level?
1758 if( found && lvl && set->page->act && fence )
1759 if( bt_fixfence (bt, set, lvl) )
1762 return bt->found = found, 0;
1764 // do we need to collapse root?
1766 if( lvl > 1 && set->page_no == ROOT_page && set->page->act == 1 )
1767 if( bt_collapseroot (bt, set) )
1770 return bt->found = found, 0;
1772 // return if page is not empty
1774 if( set->page->act ) {
1775 bt_unlockpage(BtLockWrite, set->latch);
1776 bt_unpinlatch (set->latch);
1777 bt_unpinpool (set->pool);
1778 return bt->found = found, 0;
1781 // cache copy of fence key
1782 // to post in parent
1784 ptr = keyptr(set->page, set->page->cnt);
1785 memcpy (lowerfence, ptr, ptr->len + sizeof(BtKey));
1787 // obtain lock on right page
1789 right->page_no = bt_getid(set->page->right);
1790 right->latch = bt_pinlatch (bt, right->page_no);
1791 bt_lockpage (BtLockWrite, right->latch);
1793 // pin page contents
1795 if( right->pool = bt_pinpool (bt, right->page_no) )
1796 right->page = bt_page (bt, right->pool, right->page_no);
1800 if( right->page->kill )
1801 return bt->err = BTERR_struct;
1803 // transfer left link
1805 memcpy (right->page->left, set->page->left, BtId);
1807 // pull contents of right peer into our empty page
1809 memcpy (set->page, right->page, bt->mgr->page_size);
1814 if( bt_linkleft (bt, set->page_no, bt_getid (set->page->right)) )
1817 // cache copy of key to update
1819 ptr = keyptr(right->page, right->page->cnt);
1820 memcpy (higherfence, ptr, ptr->len + sizeof(BtKey));
1822 // mark right page deleted and point it to left page
1823 // until we can post parent updates
1825 bt_putid (right->page->right, set->page_no);
1826 right->page->kill = 1;
1828 bt_lockpage (BtLockParent, right->latch);
1829 bt_unlockpage (BtLockWrite, right->latch);
1831 bt_lockpage (BtLockParent, set->latch);
1832 bt_unlockpage (BtLockWrite, set->latch);
1834 // redirect higher key directly to our new node contents
1836 bt_putid (value, set->page_no);
1837 ptr = (BtKey*)higherfence;
1839 if( bt_insertkey (bt, ptr->key, ptr->len, lvl+1, value, BtId, 1) )
1842 // delete old lower key to our node
1844 ptr = (BtKey*)lowerfence;
1846 if( bt_deletekey (bt, ptr->key, ptr->len, lvl+1) )
1849 // obtain delete and write locks to right node
1851 bt_unlockpage (BtLockParent, right->latch);
1852 bt_lockpage (BtLockDelete, right->latch);
1853 bt_lockpage (BtLockWrite, right->latch);
1854 bt_freepage (bt, right);
1856 bt_unlockpage (BtLockParent, set->latch);
1857 bt_unpinlatch (set->latch);
1858 bt_unpinpool (set->pool);
1863 BtKey *bt_foundkey (BtDb *bt)
1865 return (BtKey*)(bt->key);
1868 // advance to next slot
1870 uint bt_findnext (BtDb *bt, BtPageSet *set, uint slot)
1872 BtLatchSet *prevlatch;
1876 if( slot < set->page->cnt )
1879 prevlatch = set->latch;
1880 prevpool = set->pool;
1882 if( page_no = bt_getid(set->page->right) )
1883 set->latch = bt_pinlatch (bt, page_no);
1885 return bt->err = BTERR_struct, 0;
1887 // pin page contents
1889 if( set->pool = bt_pinpool (bt, page_no) )
1890 set->page = bt_page (bt, set->pool, page_no);
1894 // obtain access lock using lock chaining with Access mode
1896 bt_lockpage(BtLockAccess, set->latch);
1898 bt_unlockpage(BtLockRead, prevlatch);
1899 bt_unpinlatch (prevlatch);
1900 bt_unpinpool (prevpool);
1902 bt_lockpage(BtLockRead, set->latch);
1903 bt_unlockpage(BtLockAccess, set->latch);
1905 set->page_no = page_no;
1909 // find unique key or first duplicate key in
1910 // leaf level and return number of value bytes
1911 // or (-1) if not found. Setup key for bt_foundkey
1913 int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax)
1921 if( slot = bt_loadpage (bt, set, key, keylen, 0, BtLockRead) )
1923 ptr = keyptr(set->page, slot);
1925 // skip librarian slot place holder
1927 if( slotptr(set->page, slot)->type == Librarian )
1928 ptr = keyptr(set->page, ++slot);
1930 // return actual key found
1932 memcpy (bt->key, ptr, ptr->len + sizeof(BtKey));
1935 if( slotptr(set->page, slot)->type == Duplicate )
1938 // if key exists, return >= 0 value bytes copied
1939 // otherwise return (-1)
1941 if( slotptr(set->page, slot)->dead )
1945 if( !memcmp (ptr->key, key, len) ) {
1946 val = valptr (set->page,slot);
1947 if( valmax > val->len )
1949 memcpy (value, val->value, valmax);
1955 } while( slot = bt_findnext (bt, set, slot) );
1957 bt_unlockpage (BtLockRead, set->latch);
1958 bt_unpinlatch (set->latch);
1959 bt_unpinpool (set->pool);
1963 // check page for space available,
1964 // clean if necessary and return
1965 // 0 - page needs splitting
1966 // >0 new slot value
1968 uint bt_cleanpage(BtDb *bt, BtPage page, uint keylen, uint slot, uint vallen)
1970 uint nxt = bt->mgr->page_size;
1971 uint cnt = 0, idx = 0;
1972 uint max = page->cnt;
1977 if( page->min >= (max+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal))
1980 // skip cleanup and proceed to split
1981 // if there's not enough garbage
1984 if( page->garbage < nxt / 5 )
1987 memcpy (bt->frame, page, bt->mgr->page_size);
1989 // skip page info and set rest of page to zero
1991 memset (page+1, 0, bt->mgr->page_size - sizeof(*page));
1995 // clean up page first by
1996 // removing deleted keys
1998 while( cnt++ < max ) {
2001 if( cnt < max && slotptr(bt->frame,cnt)->dead )
2004 // copy the value across
2006 val = valptr(bt->frame, cnt);
2007 nxt -= val->len + sizeof(BtVal);
2008 ((unsigned char *)page)[nxt] = val->len;
2009 memcpy ((unsigned char *)page + nxt + sizeof(BtVal), val->value, val->len);
2011 // copy the key across
2013 key = keyptr(bt->frame, cnt);
2014 nxt -= key->len + sizeof(BtKey);
2015 memcpy ((unsigned char *)page + nxt, key, key->len + sizeof(BtKey));
2017 // make a librarian slot
2020 slotptr(page, ++idx)->off = nxt;
2021 slotptr(page, idx)->type = Librarian;
2022 slotptr(page, idx)->dead = 1;
2027 slotptr(page, ++idx)->off = nxt;
2028 slotptr(page, idx)->type = slotptr(bt->frame, cnt)->type;
2030 if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) )
2037 // see if page has enough space now, or does it need splitting?
2039 if( page->min >= (idx+2) * sizeof(BtSlot) + sizeof(*page) + keylen + sizeof(BtKey) + vallen + sizeof(BtVal) )
2045 // split the root and raise the height of the btree
2047 BTERR bt_splitroot(BtDb *bt, BtPageSet *root, BtKey *leftkey, BtPageSet *right)
2049 uint nxt = bt->mgr->page_size;
2050 unsigned char value[BtId];
2055 // Obtain an empty page to use, and copy the current
2056 // root contents into it, e.g. lower keys
2058 if( bt_newpage(bt, left) )
2061 memcpy(left->page, root->page, bt->mgr->page_size);
2062 bt_unpinpool (left->pool);
2064 // set left from higher to lower page
2066 bt_putid (right->page->left, left->page_no);
2067 bt_unpinpool (right->pool);
2069 // preserve the page info at the bottom
2070 // of higher keys and set rest to zero
2072 memset(root->page+1, 0, bt->mgr->page_size - sizeof(*root->page));
2074 // insert stopper key at top of newroot page
2075 // and increase the root height
2077 nxt -= BtId + sizeof(BtVal);
2078 bt_putid (value, right->page_no);
2079 val = (BtVal *)((unsigned char *)root->page + nxt);
2080 memcpy (val->value, value, BtId);
2083 nxt -= 2 + sizeof(BtKey);
2084 slotptr(root->page, 2)->off = nxt;
2085 ptr = (BtKey *)((unsigned char *)root->page + nxt);
2090 // insert lower keys page fence key on newroot page as first key
2092 nxt -= BtId + sizeof(BtVal);
2093 bt_putid (value, left->page_no);
2094 val = (BtVal *)((unsigned char *)root->page + nxt);
2095 memcpy (val->value, value, BtId);
2098 nxt -= leftkey->len + sizeof(BtKey);
2099 slotptr(root->page, 1)->off = nxt;
2100 memcpy ((unsigned char *)root->page + nxt, leftkey, leftkey->len + sizeof(BtKey));
2102 bt_putid(root->page->right, 0);
2103 root->page->min = nxt; // reset lowest used offset and key count
2104 root->page->cnt = 2;
2105 root->page->act = 2;
2108 // release and unpin root
2110 bt_unlockpage(BtLockWrite, root->latch);
2111 bt_unpinlatch (root->latch);
2112 bt_unpinpool (root->pool);
2116 // split already locked full node
2119 BTERR bt_splitpage (BtDb *bt, BtPageSet *set)
2121 unsigned char fencekey[BT_keyarray], rightkey[BT_keyarray];
2122 uint cnt = 0, idx = 0, max, nxt = bt->mgr->page_size;
2123 unsigned char value[BtId];
2124 uint lvl = set->page->lvl;
2131 // split higher half of keys to bt->frame
2133 memset (bt->frame, 0, bt->mgr->page_size);
2134 max = set->page->cnt;
2138 while( cnt++ < max ) {
2139 if( slotptr(set->page, cnt)->dead && cnt < max )
2141 src = valptr(set->page, cnt);
2142 nxt -= src->len + sizeof(BtVal);
2143 val = (BtVal*)((unsigned char *)bt->frame + nxt);
2144 memcpy (val->value, src->value, src->len);
2145 val->len = src->len;
2147 key = keyptr(set->page, cnt);
2148 nxt -= key->len + sizeof(BtKey);
2149 ptr = (BtKey*)((unsigned char *)bt->frame + nxt);
2150 memcpy (ptr, key, key->len + sizeof(BtKey));
2152 // add librarian slot
2155 slotptr(bt->frame, ++idx)->off = nxt;
2156 slotptr(bt->frame, idx)->type = Librarian;
2157 slotptr(bt->frame, idx)->dead = 1;
2162 slotptr(bt->frame, ++idx)->off = nxt;
2163 slotptr(bt->frame, idx)->type = slotptr(set->page, cnt)->type;
2165 if( !(slotptr(bt->frame, idx)->dead = slotptr(set->page, cnt)->dead) )
2169 // remember existing fence key for new page to the right
2171 memcpy (rightkey, key, key->len + sizeof(BtKey));
2173 bt->frame->bits = bt->mgr->page_bits;
2174 bt->frame->min = nxt;
2175 bt->frame->cnt = idx;
2176 bt->frame->lvl = lvl;
2180 if( set->page_no > ROOT_page ) {
2181 bt_putid (bt->frame->right, bt_getid (set->page->right));
2182 bt_putid(bt->frame->left, set->page_no);
2185 // get new free page and write higher keys to it.
2187 if( bt_newpage(bt, right) )
2192 if( set->page_no > ROOT_page && !lvl )
2193 if( bt_linkleft (bt, right->page_no, bt_getid (set->page->right)) )
2196 memcpy (right->page, bt->frame, bt->mgr->page_size);
2197 bt_unpinpool (right->pool);
2199 // update lower keys to continue in old page
2201 memcpy (bt->frame, set->page, bt->mgr->page_size);
2202 memset (set->page+1, 0, bt->mgr->page_size - sizeof(*set->page));
2203 nxt = bt->mgr->page_size;
2204 set->page->garbage = 0;
2210 if( slotptr(bt->frame, max)->type == Librarian )
2213 // assemble page of smaller keys
2215 while( cnt++ < max ) {
2216 if( slotptr(bt->frame, cnt)->dead )
2218 val = valptr(bt->frame, cnt);
2219 nxt -= val->len + sizeof(BtVal);
2220 ((unsigned char *)set->page)[nxt] = val->len;
2221 memcpy ((unsigned char *)set->page + nxt + sizeof(BtVal), val->value, val->len);
2223 key = keyptr(bt->frame, cnt);
2224 nxt -= key->len + sizeof(BtKey);
2225 memcpy ((unsigned char *)set->page + nxt, key, key->len + sizeof(BtKey));
2227 // add librarian slot
2230 slotptr(set->page, ++idx)->off = nxt;
2231 slotptr(set->page, idx)->type = Librarian;
2232 slotptr(set->page, idx)->dead = 1;
2237 slotptr(set->page, ++idx)->off = nxt;
2238 slotptr(set->page, idx)->type = slotptr(bt->frame, cnt)->type;
2242 // remember fence key for smaller page
2244 memcpy(fencekey, key, key->len + sizeof(BtKey));
2246 bt_putid(set->page->right, right->page_no);
2247 set->page->min = nxt;
2248 set->page->cnt = idx;
2250 // if current page is the root page, split it
2252 if( set->page_no == ROOT_page )
2253 return bt_splitroot (bt, set, (BtKey *)fencekey, right);
2255 // insert new fences in their parent pages
2257 right->latch = bt_pinlatch (bt, right->page_no);
2258 bt_lockpage (BtLockParent, right->latch);
2260 bt_lockpage (BtLockParent, set->latch);
2261 bt_unlockpage (BtLockWrite, set->latch);
2263 // insert new fence for reformulated left block of smaller keys
2265 bt_putid (value, set->page_no);
2267 if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, value, BtId, 1) )
2270 // switch fence for right block of larger keys to new right page
2272 bt_putid (value, right->page_no);
2274 if( bt_insertkey (bt, rightkey+1, *rightkey, lvl+1, value, BtId, 1) )
2277 bt_unlockpage (BtLockParent, set->latch);
2278 bt_unpinlatch (set->latch);
2279 bt_unpinpool (set->pool);
2281 bt_unlockpage (BtLockParent, right->latch);
2282 bt_unpinlatch (right->latch);
2286 // install new key and value onto page
2287 // page must already be checked for
2290 BTERR bt_insertslot (BtDb *bt, BtPageSet *set, uint slot, unsigned char *key,uint keylen, unsigned char *value, uint vallen, uint type)
2292 uint idx, librarian;
2297 // if found slot > desired slot and previous slot
2298 // is a librarian slot, use it
2301 if( slotptr(set->page, slot-1)->type == Librarian )
2304 // copy value onto page
2306 set->page->min -= vallen + sizeof(BtVal);
2307 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2308 memcpy (val->value, value, vallen);
2311 // copy key onto page
2313 set->page->min -= keylen + sizeof(BtKey);
2314 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2315 memcpy (ptr->key, key, keylen);
2318 // find first empty slot
2320 for( idx = slot; idx < set->page->cnt; idx++ )
2321 if( slotptr(set->page, idx)->dead )
2324 // now insert key into array before slot
2326 if( idx == set->page->cnt )
2327 idx += 2, set->page->cnt += 2, librarian = 2;
2333 while( idx > slot + librarian - 1 )
2334 *slotptr(set->page, idx) = *slotptr(set->page, idx - librarian), idx--;
2336 // add librarian slot
2338 if( librarian > 1 ) {
2339 node = slotptr(set->page, slot++);
2340 node->off = set->page->min;
2341 node->type = Librarian;
2347 node = slotptr(set->page, slot);
2348 node->off = set->page->min;
2352 bt_unlockpage (BtLockWrite, set->latch);
2353 bt_unpinlatch (set->latch);
2354 bt_unpinpool (set->pool);
2358 // Insert new key into the btree at given level.
2359 // either add a new key or update/add an existing one
2361 BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint keylen, uint lvl, void *value, uint vallen, uint unique)
2363 unsigned char newkey[BT_keyarray];
2364 uint slot, idx, len;
2371 // set up the key we're working on
2373 ins = (BtKey*)newkey;
2374 memcpy (ins->key, key, keylen);
2377 // is this a non-unique index value?
2383 sequence = bt_newdup (bt);
2384 bt_putid (ins->key + ins->len + sizeof(BtKey), sequence);
2388 while( 1 ) { // find the page and slot for the current key
2389 if( slot = bt_loadpage (bt, set, ins->key, ins->len, lvl, BtLockWrite) )
2390 ptr = keyptr(set->page, slot);
2393 bt->err = BTERR_ovflw;
2397 // if librarian slot == found slot, advance to real slot
2399 if( slotptr(set->page, slot)->type == Librarian )
2400 if( !keycmp (ptr, key, keylen) )
2401 ptr = keyptr(set->page, ++slot);
2405 if( slotptr(set->page, slot)->type == Duplicate )
2408 // if inserting a duplicate key or unique key
2409 // check for adequate space on the page
2410 // and insert the new key before slot.
2412 if( unique && (len != ins->len || memcmp (ptr->key, ins->key, ins->len)) || !unique ) {
2413 if( !(slot = bt_cleanpage (bt, set->page, ins->len, slot, vallen)) )
2414 if( bt_splitpage (bt, set) )
2419 return bt_insertslot (bt, set, slot, ins->key, ins->len, value, vallen, type);
2422 // if key already exists, update value and return
2424 val = valptr(set->page, slot);
2426 if( val->len >= vallen ) {
2427 if( slotptr(set->page, slot)->dead )
2429 set->page->garbage += val->len - vallen;
2430 slotptr(set->page, slot)->dead = 0;
2432 memcpy (val->value, value, vallen);
2433 bt_unlockpage(BtLockWrite, set->latch);
2434 bt_unpinlatch (set->latch);
2435 bt_unpinpool (set->pool);
2439 // new update value doesn't fit in existing value area
2441 if( !slotptr(set->page, slot)->dead )
2442 set->page->garbage += val->len + ptr->len + sizeof(BtKey) + sizeof(BtVal);
2444 slotptr(set->page, slot)->dead = 0;
2448 if( !(slot = bt_cleanpage (bt, set->page, keylen, slot, vallen)) )
2449 if( bt_splitpage (bt, set) )
2454 set->page->min -= vallen + sizeof(BtVal);
2455 val = (BtVal*)((unsigned char *)set->page + set->page->min);
2456 memcpy (val->value, value, vallen);
2459 set->page->min -= keylen + sizeof(BtKey);
2460 ptr = (BtKey*)((unsigned char *)set->page + set->page->min);
2461 memcpy (ptr->key, key, keylen);
2464 slotptr(set->page, slot)->off = set->page->min;
2465 bt_unlockpage(BtLockWrite, set->latch);
2466 bt_unpinlatch (set->latch);
2467 bt_unpinpool (set->pool);
2473 // set cursor to highest slot on highest page
2475 uint bt_lastkey (BtDb *bt)
2477 uid page_no = bt_getid (bt->mgr->latchmgr->alloc->left);
2481 if( set->pool = bt_pinpool (bt, page_no) )
2482 set->page = bt_page (bt, set->pool, page_no);
2486 set->latch = bt_pinlatch (bt, page_no);
2487 bt_lockpage(BtLockRead, set->latch);
2489 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2490 slot = set->page->cnt;
2492 bt_unlockpage(BtLockRead, set->latch);
2493 bt_unpinlatch (set->latch);
2494 bt_unpinpool (set->pool);
2499 // return previous slot on cursor page
2501 uint bt_prevkey (BtDb *bt, uint slot)
2509 if( left = bt_getid(bt->cursor->left) )
2510 bt->cursor_page = left;
2514 if( set->pool = bt_pinpool (bt, left) )
2515 set->page = bt_page (bt, set->pool, left);
2519 set->latch = bt_pinlatch (bt, left);
2520 bt_lockpage(BtLockRead, set->latch);
2522 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2524 bt_unlockpage(BtLockRead, set->latch);
2525 bt_unpinlatch (set->latch);
2526 bt_unpinpool (set->pool);
2527 return bt->cursor->cnt;
2530 // return next slot on cursor page
2531 // or slide cursor right into next page
2533 uint bt_nextkey (BtDb *bt, uint slot)
2539 right = bt_getid(bt->cursor->right);
2541 while( slot++ < bt->cursor->cnt )
2542 if( slotptr(bt->cursor,slot)->dead )
2544 else if( right || (slot < bt->cursor->cnt) ) // skip infinite stopper
2552 bt->cursor_page = right;
2554 if( set->pool = bt_pinpool (bt, right) )
2555 set->page = bt_page (bt, set->pool, right);
2559 set->latch = bt_pinlatch (bt, right);
2560 bt_lockpage(BtLockRead, set->latch);
2562 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2564 bt_unlockpage(BtLockRead, set->latch);
2565 bt_unpinlatch (set->latch);
2566 bt_unpinpool (set->pool);
2574 // cache page of keys into cursor and return starting slot for given key
2576 uint bt_startkey (BtDb *bt, unsigned char *key, uint len)
2581 // cache page for retrieval
2583 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
2584 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2588 bt->cursor_page = set->page_no;
2590 bt_unlockpage(BtLockRead, set->latch);
2591 bt_unpinlatch (set->latch);
2592 bt_unpinpool (set->pool);
2596 BtKey *bt_key(BtDb *bt, uint slot)
2598 return keyptr(bt->cursor, slot);
2601 BtVal *bt_val(BtDb *bt, uint slot)
2603 return valptr(bt->cursor,slot);
2609 double getCpuTime(int type)
2612 FILETIME xittime[1];
2613 FILETIME systime[1];
2614 FILETIME usrtime[1];
2615 SYSTEMTIME timeconv[1];
2618 memset (timeconv, 0, sizeof(SYSTEMTIME));
2622 GetSystemTimeAsFileTime (xittime);
2623 FileTimeToSystemTime (xittime, timeconv);
2624 ans = (double)timeconv->wDayOfWeek * 3600 * 24;
2627 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2628 FileTimeToSystemTime (usrtime, timeconv);
2631 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2632 FileTimeToSystemTime (systime, timeconv);
2636 ans += (double)timeconv->wHour * 3600;
2637 ans += (double)timeconv->wMinute * 60;
2638 ans += (double)timeconv->wSecond;
2639 ans += (double)timeconv->wMilliseconds / 1000;
2644 #include <sys/resource.h>
2646 double getCpuTime(int type)
2648 struct rusage used[1];
2649 struct timeval tv[1];
2653 gettimeofday(tv, NULL);
2654 return (double)tv->tv_sec + (double)tv->tv_usec / 1000000;
2657 getrusage(RUSAGE_SELF, used);
2658 return (double)used->ru_utime.tv_sec + (double)used->ru_utime.tv_usec / 1000000;
2661 getrusage(RUSAGE_SELF, used);
2662 return (double)used->ru_stime.tv_sec + (double)used->ru_stime.tv_usec / 1000000;
2669 uint bt_latchaudit (BtDb *bt)
2671 ushort idx, hashidx;
2678 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2680 if( *(ushort *)(bt->mgr->latchmgr->lock) )
2681 fprintf(stderr, "Alloc page locked\n");
2682 *(ushort *)(bt->mgr->latchmgr->lock) = 0;
2684 for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
2685 latch = bt->mgr->latchsets + idx;
2686 if( *latch->readwr->rin & MASK )
2687 fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
2688 memset ((ushort *)latch->readwr, 0, sizeof(RWLock));
2690 if( *latch->access->rin & MASK )
2691 fprintf(stderr, "latchset %d accesslocked for page %.8x\n", idx, latch->page_no);
2692 memset ((ushort *)latch->access, 0, sizeof(RWLock));
2694 if( *latch->parent->rin & MASK )
2695 fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
2696 memset ((ushort *)latch->parent, 0, sizeof(RWLock));
2699 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2704 for( hashidx = 0; hashidx < bt->mgr->latchmgr->latchhash; hashidx++ ) {
2705 if( *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) )
2706 fprintf(stderr, "hash entry %d locked\n", hashidx);
2708 *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) = 0;
2710 if( idx = bt->mgr->latchmgr->table[hashidx].slot ) do {
2711 latch = bt->mgr->latchsets + idx;
2712 if( *(ushort *)latch->busy )
2713 fprintf(stderr, "latchset %d busylocked for page %.8x\n", idx, latch->page_no);
2714 *(ushort *)latch->busy = 0;
2716 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2717 } while( idx = latch->next );
2720 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2721 page_no = LEAF_page;
2723 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2724 off64_t off = page_no << bt->mgr->page_bits;
2726 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2730 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2732 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2733 fprintf(stderr, "page %.8x unable to read\n", page_no);
2735 if( *amt < bt->mgr->page_size )
2736 fprintf(stderr, "page %.8x unable to read\n", page_no);
2738 if( !bt->frame->free ) {
2739 for( idx = 0; idx++ < bt->frame->cnt - 1; ) {
2740 ptr = keyptr(bt->frame, idx+1);
2741 if( keycmp (keyptr(bt->frame, idx), ptr->key, ptr->len) >= 0 )
2742 fprintf(stderr, "page %.8x idx %.2x out of order\n", page_no, idx);
2744 if( !bt->frame->lvl )
2745 cnt += bt->frame->act;
2747 if( page_no > LEAF_page )
2762 // standalone program to index file of keys
2763 // then list them onto std-out
2766 void *index_file (void *arg)
2768 uint __stdcall index_file (void *arg)
2771 int line = 0, found = 0, cnt = 0, unique;
2772 uid next, page_no = LEAF_page; // start on first page of leaves
2773 unsigned char key[BT_maxkey];
2774 ThreadArg *args = arg;
2775 int ch, len = 0, slot;
2782 bt = bt_open (args->mgr);
2784 unique = (args->type[1] | 0x20) != 'd';
2786 switch(args->type[0] | 0x20)
2789 fprintf(stderr, "started latch mgr audit\n");
2790 cnt = bt_latchaudit (bt);
2791 fprintf(stderr, "finished latch mgr audit, found %d keys\n", cnt);
2795 fprintf(stderr, "started pennysort for %s\n", args->infile);
2796 if( in = fopen (args->infile, "rb") )
2797 while( ch = getc(in), ch != EOF )
2802 if( bt_insertkey (bt, key, 10, 0, key + 10, len - 10, unique) )
2803 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2806 else if( len < BT_maxkey )
2808 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2812 fprintf(stderr, "started indexing for %s\n", args->infile);
2813 if( in = fopen (args->infile, "rb") )
2814 while( ch = getc(in), ch != EOF )
2819 if( args->num == 1 )
2820 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2822 else if( args->num )
2823 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2825 if( bt_insertkey (bt, key, len, 0, NULL, 0, unique) )
2826 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2829 else if( len < BT_maxkey )
2831 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2835 fprintf(stderr, "started deleting keys for %s\n", args->infile);
2836 if( in = fopen (args->infile, "rb") )
2837 while( ch = getc(in), ch != EOF )
2841 if( args->num == 1 )
2842 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2844 else if( args->num )
2845 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2847 if( bt_findkey (bt, key, len, NULL, 0) < 0 )
2848 fprintf(stderr, "Cannot find key for Line: %d\n", line), exit(0);
2849 ptr = (BtKey*)(bt->key);
2852 if( bt_deletekey (bt, ptr->key, ptr->len, 0) )
2853 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2856 else if( len < BT_maxkey )
2858 fprintf(stderr, "finished %s for %d keys, %d found\n", args->infile, line, found);
2862 fprintf(stderr, "started finding keys for %s\n", args->infile);
2863 if( in = fopen (args->infile, "rb") )
2864 while( ch = getc(in), ch != EOF )
2868 if( args->num == 1 )
2869 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2871 else if( args->num )
2872 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2874 if( bt_findkey (bt, key, len, NULL, 0) == 0 )
2877 fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
2880 else if( len < BT_maxkey )
2882 fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found);
2886 fprintf(stderr, "started scanning\n");
2888 if( set->pool = bt_pinpool (bt, page_no) )
2889 set->page = bt_page (bt, set->pool, page_no);
2892 set->latch = bt_pinlatch (bt, page_no);
2893 bt_lockpage (BtLockRead, set->latch);
2894 next = bt_getid (set->page->right);
2896 for( slot = 0; slot++ < set->page->cnt; )
2897 if( next || slot < set->page->cnt )
2898 if( !slotptr(set->page, slot)->dead ) {
2899 ptr = keyptr(set->page, slot);
2902 if( slotptr(set->page, slot)->type == Duplicate )
2905 fwrite (ptr->key, len, 1, stdout);
2906 val = valptr(set->page, slot);
2907 fwrite (val->value, val->len, 1, stdout);
2908 fputc ('\n', stdout);
2912 bt_unlockpage (BtLockRead, set->latch);
2913 bt_unpinlatch (set->latch);
2914 bt_unpinpool (set->pool);
2915 } while( page_no = next );
2917 fprintf(stderr, " Total keys read %d\n", cnt);
2921 fprintf(stderr, "started reverse scan\n");
2922 if( slot = bt_lastkey (bt) )
2923 while( slot = bt_prevkey (bt, slot) ) {
2924 if( slotptr(bt->cursor, slot)->dead )
2927 ptr = keyptr(bt->cursor, slot);
2930 if( slotptr(bt->cursor, slot)->type == Duplicate )
2933 fwrite (ptr->key, len, 1, stdout);
2934 val = valptr(bt->cursor, slot);
2935 fwrite (val->value, val->len, 1, stdout);
2936 fputc ('\n', stdout);
2940 fprintf(stderr, " Total keys read %d\n", cnt);
2945 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2947 fprintf(stderr, "started counting\n");
2948 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2949 page_no = LEAF_page;
2951 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2952 uid off = page_no << bt->mgr->page_bits;
2954 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2958 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2960 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2961 return bt->err = BTERR_map;
2963 if( *amt < bt->mgr->page_size )
2964 return bt->err = BTERR_map;
2966 if( !bt->frame->free && !bt->frame->lvl )
2967 cnt += bt->frame->act;
2968 if( page_no > LEAF_page )
2973 cnt--; // remove stopper key
2974 fprintf(stderr, " Total keys read %d\n", cnt);
2986 typedef struct timeval timer;
2988 int main (int argc, char **argv)
2990 int idx, cnt, len, slot, err;
2991 int segsize, bits = 16;
3008 fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... ]\n", argv[0]);
3009 fprintf (stderr, " where page_bits is the page size in bits\n");
3010 fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n");
3011 fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n");
3012 fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n");
3013 fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n");
3017 start = getCpuTime(0);
3020 bits = atoi(argv[3]);
3023 poolsize = atoi(argv[4]);
3026 fprintf (stderr, "Warning: no mapped_pool\n");
3028 if( poolsize > 65535 )
3029 fprintf (stderr, "Warning: mapped_pool > 65535 segments\n");
3032 segsize = atoi(argv[5]);
3034 segsize = 4; // 16 pages per mmap segment
3037 num = atoi(argv[6]);
3041 threads = malloc (cnt * sizeof(pthread_t));
3043 threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE));
3045 args = malloc (cnt * sizeof(ThreadArg));
3047 mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8);
3050 fprintf(stderr, "Index Open Error %s\n", argv[1]);
3056 for( idx = 0; idx < cnt; idx++ ) {
3057 args[idx].infile = argv[idx + 7];
3058 args[idx].type = argv[2];
3059 args[idx].mgr = mgr;
3060 args[idx].num = num;
3061 args[idx].idx = idx;
3063 if( err = pthread_create (threads + idx, NULL, index_file, args + idx) )
3064 fprintf(stderr, "Error creating thread %d\n", err);
3066 threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL);
3070 // wait for termination
3073 for( idx = 0; idx < cnt; idx++ )
3074 pthread_join (threads[idx], NULL);
3076 WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
3078 for( idx = 0; idx < cnt; idx++ )
3079 CloseHandle(threads[idx]);
3082 elapsed = getCpuTime(0) - start;
3083 fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
3084 elapsed = getCpuTime(1);
3085 fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
3086 elapsed = getCpuTime(2);
3087 fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);