1 // btree version threadskv4b sched_yield version
2 // with reworked bt_deletekey code,
3 // phase-fair reader writer lock,
4 // librarian page split code,
5 // and duplicate key management
9 // author: karl malbrain, malbrain@cal.berkeley.edu
12 This work, including the source code, documentation
13 and related data, is placed into the public domain.
15 The orginal author is Karl Malbrain.
17 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
18 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
19 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
20 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
21 RESULTING FROM THE USE, MODIFICATION, OR
22 REDISTRIBUTION OF THIS SOFTWARE.
25 // Please see the project home page for documentation
26 // code.google.com/p/high-concurrency-btree
28 #define _FILE_OFFSET_BITS 64
29 #define _LARGEFILE64_SOURCE
45 #define WIN32_LEAN_AND_MEAN
59 typedef unsigned long long uid;
62 typedef unsigned long long off64_t;
63 typedef unsigned short ushort;
64 typedef unsigned int uint;
67 #define BT_latchtable 128 // number of latch manager slots
69 #define BT_ro 0x6f72 // ro
70 #define BT_rw 0x7772 // rw
72 #define BT_maxbits 24 // maximum page size in bits
73 #define BT_minbits 9 // minimum page size in bits
74 #define BT_minpage (1 << BT_minbits) // minimum page size
75 #define BT_maxpage (1 << BT_maxbits) // maximum page size
78 There are five lock types for each node in three independent sets:
79 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
80 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
81 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
82 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
83 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
94 // definition for phase-fair reader/writer lock implementation
108 // definition for spin latch implementation
110 // exclusive is set for write access
111 // share is count of read accessors
112 // grant write lock when share == 0
114 volatile typedef struct {
125 // hash table entries
128 BtSpinLatch latch[1];
129 volatile ushort slot; // Latch table entry at head of chain
132 // latch manager table structure
135 RWLock readwr[1]; // read/write page lock
136 RWLock access[1]; // Access Intent/Page delete
137 RWLock parent[1]; // Posting of fence key in parent
138 BtSpinLatch busy[1]; // slot is being moved between chains
139 volatile ushort next; // next entry in hash table chain
140 volatile ushort prev; // prev entry in hash table chain
141 volatile ushort pin; // number of outstanding locks
142 volatile ushort hash; // hash slot entry is under
143 volatile uid page_no; // latch set page number
146 // Define the length of the page and key pointers
150 // Page key slot definition.
152 // Keys are marked dead, but remain on the page until
153 // it cleanup is called. The fence key (highest key) for
154 // a leaf page is always present, even after cleanup.
158 // In addition to the Unique keys that occupy slots
159 // there are Librarian and Duplicate key
160 // slots occupying the key slot array.
162 // The Librarian slots are dead keys that
163 // serve as filler, available to add new Unique
164 // or Dup slots that are inserted into the B-tree.
166 // The Duplicate slots have had their key bytes extended
167 // by 6 bytes to contain a binary duplicate key uniqueifier.
176 uint off:BT_maxbits; // page offset for key start
177 uint type:3; // type of slot
178 uint dead:1; // set for deleted slot
181 // The key structure occupies space at the upper end of
182 // each page. It's a length byte followed by the key
187 unsigned char key[1];
190 // the value structure also occupies space at the upper
191 // end of the page. Each key is immediately followed by a value.
195 unsigned char value[1];
198 // The first part of an index page.
199 // It is immediately followed
200 // by the BtSlot array of keys.
202 // note that this structure size
203 // must be a multiple of 8 bytes
204 // in order to place dups correctly.
206 typedef struct BtPage_ {
207 uint cnt; // count of keys in page
208 uint act; // count of active keys
209 uint min; // next key offset
210 uint garbage; // page garbage in bytes
211 unsigned char bits:7; // page size in bits
212 unsigned char free:1; // page is on free chain
213 unsigned char lvl:7; // level of page
214 unsigned char kill:1; // page is being deleted
215 unsigned char right[BtId]; // page number to right
218 // The memory mapping pool table buffer manager entry
221 uid basepage; // mapped base page number
222 char *map; // mapped memory pointer
223 ushort slot; // slot index in this array
224 ushort pin; // mapped page pin counter
225 void *hashprev; // previous pool entry for the same hash idx
226 void *hashnext; // next pool entry for the same hash idx
228 HANDLE hmap; // Windows memory mapping handle
232 #define CLOCK_bit 0x8000 // bit in pool->pin
234 // The loadpage interface object
237 uid page_no; // current page number
238 BtPage page; // current page pointer
239 BtPool *pool; // current page pool
240 BtLatchSet *latch; // current page latch set
243 // structure for latch manager on ALLOC_page
246 struct BtPage_ alloc[1]; // next page_no in right ptr
247 unsigned long long dups[1]; // global duplicate key uniqueifier
248 unsigned char chain[BtId]; // head of free page_nos chain
249 BtSpinLatch lock[1]; // allocation area lite latch
250 ushort latchdeployed; // highest number of latch entries deployed
251 ushort nlatchpage; // number of latch pages at BT_latch
252 ushort latchtotal; // number of page latch entries
253 ushort latchhash; // number of latch hash table slots
254 ushort latchvictim; // next latch entry to examine
255 BtHashEntry table[0]; // the hash table
258 // The object structure for Btree access
261 uint page_size; // page size
262 uint page_bits; // page size in bits
263 uint seg_bits; // seg size in pages in bits
264 uint mode; // read-write mode
270 ushort poolcnt; // highest page pool node in use
271 ushort poolmax; // highest page pool node allocated
272 ushort poolmask; // total number of pages in mmap segment - 1
273 ushort hashsize; // size of Hash Table for pool entries
274 volatile uint evicted; // last evicted hash table slot
275 ushort *hash; // pool index for hash entries
276 BtSpinLatch *latch; // latches for hash table slots
277 BtLatchMgr *latchmgr; // mapped latch page from allocation page
278 BtLatchSet *latchsets; // mapped latch set from latch pages
279 BtPool *pool; // memory pool page segments
281 HANDLE halloc; // allocation and latch table handle
286 BtMgr *mgr; // buffer manager for thread
287 BtPage cursor; // cached frame for start/next (never mapped)
288 BtPage frame; // spare frame for the page split (never mapped)
289 uid cursor_page; // current cursor page number
290 unsigned char *mem; // frame, cursor, page memory buffer
291 unsigned char key[256]; // last found complete key
292 int found; // last delete or insert was found
293 int err; // last error
307 extern void bt_close (BtDb *bt);
308 extern BtDb *bt_open (BtMgr *mgr);
309 extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, void *value, uint vallen, uint update);
310 extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl);
311 extern int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax);
312 extern BtKey bt_foundkey (BtDb *bt);
313 extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len);
314 extern uint bt_nextkey (BtDb *bt, uint slot);
317 extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize);
318 void bt_mgrclose (BtMgr *mgr);
320 // Helper functions to return slot values
321 // from the cursor page.
323 extern BtKey bt_key (BtDb *bt, uint slot);
324 extern BtVal bt_val (BtDb *bt, uint slot);
326 // BTree page number constants
327 #define ALLOC_page 0 // allocation & latch manager hash table
328 #define ROOT_page 1 // root of the btree
329 #define LEAF_page 2 // first page of leaves
330 #define LATCH_page 3 // pages for latch manager
332 // Number of levels to create in a new BTree
336 // The page is allocated from low and hi ends.
337 // The key slots are allocated from the bottom,
338 // while the text and value of the key
339 // are allocated from the top. When the two
340 // areas meet, the page is split into two.
342 // A key consists of a length byte, two bytes of
343 // index number (0 - 65534), and up to 253 bytes
346 // Associated with each key is a value byte string
347 // containing any value desired.
349 // The b-tree root is always located at page 1.
350 // The first leaf page of level zero is always
351 // located on page 2.
353 // The b-tree pages are linked with next
354 // pointers to facilitate enumerators,
355 // and provide for concurrency.
357 // When to root page fills, it is split in two and
358 // the tree height is raised by a new root at page
359 // one with two keys.
361 // Deleted keys are marked with a dead bit until
362 // page cleanup. The fence key for a leaf node is
365 // Groups of pages called segments from the btree are optionally
366 // cached with a memory mapped pool. A hash table is used to keep
367 // track of the cached segments. This behaviour is controlled
368 // by the cache block size parameter to bt_open.
370 // To achieve maximum concurrency one page is locked at a time
371 // as the tree is traversed to find leaf key in question. The right
372 // page numbers are used in cases where the page is being split,
375 // Page 0 is dedicated to lock for new page extensions,
376 // and chains empty pages together for reuse. It also
377 // contains the latch manager hash table.
379 // The ParentModification lock on a node is obtained to serialize posting
380 // or changing the fence key for a node.
382 // Empty pages are chained together through the ALLOC page and reused.
384 // Access macros to address slot and key values from the page
385 // Page slots use 1 based indexing.
387 #define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1))
388 #define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off))
389 #define valptr(page, slot) ((BtVal)(keyptr(page,slot)->key + keyptr(page,slot)->len))
391 void bt_putid(unsigned char *dest, uid id)
396 dest[i] = (unsigned char)id, id >>= 8;
399 uid bt_getid(unsigned char *src)
404 for( i = 0; i < BtId; i++ )
405 id <<= 8, id |= *src++;
410 uid bt_newdup (BtDb *bt)
413 return __sync_fetch_and_add (bt->mgr->latchmgr->dups, 1) + 1;
415 return _InterlockedIncrement64(bt->mgr->latchmgr->dups, 1);
419 // Phase-Fair reader/writer lock implementation
421 void WriteLock (RWLock *lock)
426 tix = __sync_fetch_and_add (lock->ticket, 1);
428 tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
430 // wait for our ticket to come up
432 while( tix != lock->serving[0] )
439 w = PRES | (tix & PHID);
441 r = __sync_fetch_and_add (lock->rin, w);
443 r = _InterlockedExchangeAdd16 (lock->rin, w);
445 while( r != *lock->rout )
453 void WriteRelease (RWLock *lock)
456 __sync_fetch_and_and (lock->rin, ~MASK);
458 _InterlockedAnd16 (lock->rin, ~MASK);
463 void ReadLock (RWLock *lock)
467 w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
469 w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
472 while( w == (*lock->rin & MASK) )
480 void ReadRelease (RWLock *lock)
483 __sync_fetch_and_add (lock->rout, RINC);
485 _InterlockedExchangeAdd16 (lock->rout, RINC);
489 // Spin Latch Manager
491 // wait until write lock mode is clear
492 // and add 1 to the share count
494 void bt_spinreadlock(BtSpinLatch *latch)
500 prev = __sync_fetch_and_add ((ushort *)latch, SHARE);
502 prev = _InterlockedExchangeAdd16((ushort *)latch, SHARE);
504 // see if exclusive request is granted or pending
509 prev = __sync_fetch_and_add ((ushort *)latch, -SHARE);
511 prev = _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
514 } while( sched_yield(), 1 );
516 } while( SwitchToThread(), 1 );
520 // wait for other read and write latches to relinquish
522 void bt_spinwritelock(BtSpinLatch *latch)
528 prev = __sync_fetch_and_or((ushort *)latch, PEND | XCL);
530 prev = _InterlockedOr16((ushort *)latch, PEND | XCL);
533 if( !(prev & ~BOTH) )
537 __sync_fetch_and_and ((ushort *)latch, ~XCL);
539 _InterlockedAnd16((ushort *)latch, ~XCL);
542 } while( sched_yield(), 1 );
544 } while( SwitchToThread(), 1 );
548 // try to obtain write lock
550 // return 1 if obtained,
553 int bt_spinwritetry(BtSpinLatch *latch)
558 prev = __sync_fetch_and_or((ushort *)latch, XCL);
560 prev = _InterlockedOr16((ushort *)latch, XCL);
562 // take write access if all bits are clear
565 if( !(prev & ~BOTH) )
569 __sync_fetch_and_and ((ushort *)latch, ~XCL);
571 _InterlockedAnd16((ushort *)latch, ~XCL);
578 void bt_spinreleasewrite(BtSpinLatch *latch)
581 __sync_fetch_and_and((ushort *)latch, ~BOTH);
583 _InterlockedAnd16((ushort *)latch, ~BOTH);
587 // decrement reader count
589 void bt_spinreleaseread(BtSpinLatch *latch)
592 __sync_fetch_and_add((ushort *)latch, -SHARE);
594 _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
598 // link latch table entry into latch hash table
600 void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no)
602 BtLatchSet *set = bt->mgr->latchsets + victim;
604 if( set->next = bt->mgr->latchmgr->table[hashidx].slot )
605 bt->mgr->latchsets[set->next].prev = victim;
607 bt->mgr->latchmgr->table[hashidx].slot = victim;
608 set->page_no = page_no;
615 void bt_unpinlatch (BtLatchSet *set)
618 __sync_fetch_and_add(&set->pin, -1);
620 _InterlockedDecrement16 (&set->pin);
624 // find existing latchset or inspire new one
625 // return with latchset pinned
627 BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no)
629 ushort hashidx = page_no % bt->mgr->latchmgr->latchhash;
630 ushort slot, avail = 0, victim, idx;
633 // obtain read lock on hash table entry
635 bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch);
637 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
639 set = bt->mgr->latchsets + slot;
640 if( page_no == set->page_no )
642 } while( slot = set->next );
646 __sync_fetch_and_add(&set->pin, 1);
648 _InterlockedIncrement16 (&set->pin);
652 bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch);
657 // try again, this time with write lock
659 bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch);
661 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
663 set = bt->mgr->latchsets + slot;
664 if( page_no == set->page_no )
666 if( !set->pin && !avail )
668 } while( slot = set->next );
670 // found our entry, or take over an unpinned one
672 if( slot || (slot = avail) ) {
673 set = bt->mgr->latchsets + slot;
675 __sync_fetch_and_add(&set->pin, 1);
677 _InterlockedIncrement16 (&set->pin);
679 set->page_no = page_no;
680 bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch);
684 // see if there are any unused entries
686 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, 1) + 1;
688 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed);
691 if( victim < bt->mgr->latchmgr->latchtotal ) {
692 set = bt->mgr->latchsets + victim;
694 __sync_fetch_and_add(&set->pin, 1);
696 _InterlockedIncrement16 (&set->pin);
698 bt_latchlink (bt, hashidx, victim, page_no);
699 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
704 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1);
706 victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed);
708 // find and reuse previous lock entry
712 victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1);
714 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1;
716 // we don't use slot zero
718 if( victim %= bt->mgr->latchmgr->latchtotal )
719 set = bt->mgr->latchsets + victim;
723 // take control of our slot
724 // from other threads
726 if( set->pin || !bt_spinwritetry (set->busy) )
731 // try to get write lock on hash chain
732 // skip entry if not obtained
733 // or has outstanding locks
735 if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) {
736 bt_spinreleasewrite (set->busy);
741 bt_spinreleasewrite (set->busy);
742 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
746 // unlink our available victim from its hash chain
749 bt->mgr->latchsets[set->prev].next = set->next;
751 bt->mgr->latchmgr->table[idx].slot = set->next;
754 bt->mgr->latchsets[set->next].prev = set->prev;
756 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
758 __sync_fetch_and_add(&set->pin, 1);
760 _InterlockedIncrement16 (&set->pin);
762 bt_latchlink (bt, hashidx, victim, page_no);
763 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
764 bt_spinreleasewrite (set->busy);
769 void bt_mgrclose (BtMgr *mgr)
774 // release mapped pages
775 // note that slot zero is never used
777 for( slot = 1; slot < mgr->poolmax; slot++ ) {
778 pool = mgr->pool + slot;
781 munmap (pool->map, (uid)(mgr->poolmask+1) << mgr->page_bits);
784 FlushViewOfFile(pool->map, 0);
785 UnmapViewOfFile(pool->map);
786 CloseHandle(pool->hmap);
792 munmap (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
793 munmap (mgr->latchmgr, 2 * mgr->page_size);
795 FlushViewOfFile(mgr->latchmgr, 0);
796 UnmapViewOfFile(mgr->latchmgr);
797 CloseHandle(mgr->halloc);
803 free ((void *)mgr->latch);
806 FlushFileBuffers(mgr->idx);
807 CloseHandle(mgr->idx);
808 GlobalFree (mgr->pool);
809 GlobalFree (mgr->hash);
810 GlobalFree ((void *)mgr->latch);
815 // close and release memory
817 void bt_close (BtDb *bt)
824 VirtualFree (bt->mem, 0, MEM_RELEASE);
829 // open/create new btree buffer manager
831 // call with file_name, BT_openmode, bits in page size (e.g. 16),
832 // size of mapped page pool (e.g. 8192)
834 BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize)
836 uint lvl, attr, cacheblk, last, slot, idx;
837 uint nlatchpage, latchhash;
838 unsigned char value[BtId];
839 BtLatchMgr *latchmgr;
848 SYSTEM_INFO sysinfo[1];
851 // determine sanity of page size and buffer pool
853 if( bits > BT_maxbits )
855 else if( bits < BT_minbits )
859 return NULL; // must have buffer pool
862 mgr = calloc (1, sizeof(BtMgr));
864 mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
867 return free(mgr), NULL;
869 cacheblk = 4096; // minimum mmap segment size for unix
872 mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));
873 attr = FILE_ATTRIBUTE_NORMAL;
874 mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
876 if( mgr->idx == INVALID_HANDLE_VALUE )
877 return GlobalFree(mgr), NULL;
879 // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity
880 GetSystemInfo(sysinfo);
881 cacheblk = sysinfo->dwAllocationGranularity;
885 latchmgr = malloc (BT_maxpage);
888 // read minimum page size to get root info
890 if( size = lseek (mgr->idx, 0L, 2) ) {
891 if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage )
892 bits = latchmgr->alloc->bits;
894 return free(mgr), free(latchmgr), NULL;
895 } else if( mode == BT_ro )
896 return free(latchmgr), bt_mgrclose (mgr), NULL;
898 latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);
899 size = GetFileSize(mgr->idx, amt);
902 if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) )
903 return bt_mgrclose (mgr), NULL;
904 bits = latchmgr->alloc->bits;
905 } else if( mode == BT_ro )
906 return bt_mgrclose (mgr), NULL;
909 mgr->page_size = 1 << bits;
910 mgr->page_bits = bits;
912 mgr->poolmax = poolmax;
915 if( cacheblk < mgr->page_size )
916 cacheblk = mgr->page_size;
918 // mask for partial memmaps
920 mgr->poolmask = (cacheblk >> bits) - 1;
922 // see if requested size of pages per memmap is greater
924 if( (1 << segsize) > mgr->poolmask )
925 mgr->poolmask = (1 << segsize) - 1;
929 while( (1 << mgr->seg_bits) <= mgr->poolmask )
932 mgr->hashsize = hashsize;
935 mgr->pool = calloc (poolmax, sizeof(BtPool));
936 mgr->hash = calloc (hashsize, sizeof(ushort));
937 mgr->latch = calloc (hashsize, sizeof(BtSpinLatch));
939 mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool));
940 mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort));
941 mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch));
947 // initialize an empty b-tree with latch page, root page, page of leaves
948 // and page(s) of latches
950 memset (latchmgr, 0, 1 << bits);
951 nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1;
952 bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage);
953 latchmgr->alloc->bits = mgr->page_bits;
955 latchmgr->nlatchpage = nlatchpage;
956 latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet));
958 // initialize latch manager
960 latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry);
962 // size of hash table = total number of latchsets
964 if( latchhash > latchmgr->latchtotal )
965 latchhash = latchmgr->latchtotal;
967 latchmgr->latchhash = latchhash;
970 if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size )
971 return bt_mgrclose (mgr), NULL;
973 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
974 return bt_mgrclose (mgr), NULL;
976 if( *amt < mgr->page_size )
977 return bt_mgrclose (mgr), NULL;
980 memset (latchmgr, 0, 1 << bits);
981 latchmgr->alloc->bits = mgr->page_bits;
983 for( lvl=MIN_lvl; lvl--; ) {
984 slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3 - (lvl ? BtId + 1: 1);
985 key = keyptr(latchmgr->alloc, 1);
986 key->len = 2; // create stopper key
990 bt_putid(value, MIN_lvl - lvl + 1);
991 val = valptr(latchmgr->alloc, 1);
992 val->len = lvl ? BtId : 0;
993 memcpy (val->value, value, val->len);
995 latchmgr->alloc->min = slotptr(latchmgr->alloc, 1)->off;
996 latchmgr->alloc->lvl = lvl;
997 latchmgr->alloc->cnt = 1;
998 latchmgr->alloc->act = 1;
1000 if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size )
1001 return bt_mgrclose (mgr), NULL;
1003 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1004 return bt_mgrclose (mgr), NULL;
1006 if( *amt < mgr->page_size )
1007 return bt_mgrclose (mgr), NULL;
1011 // clear out latch manager locks
1012 // and rest of pages to round out segment
1014 memset(latchmgr, 0, mgr->page_size);
1017 while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) {
1019 pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits);
1021 SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN);
1022 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1023 return bt_mgrclose (mgr), NULL;
1024 if( *amt < mgr->page_size )
1025 return bt_mgrclose (mgr), NULL;
1032 // mlock the root page and the latchmgr page
1034 flag = PROT_READ | PROT_WRITE;
1035 mgr->latchmgr = mmap (0, 2 * mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size);
1036 if( mgr->latchmgr == MAP_FAILED )
1037 return bt_mgrclose (mgr), NULL;
1038 mlock (mgr->latchmgr, 2 * mgr->page_size);
1040 mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size);
1041 if( mgr->latchsets == MAP_FAILED )
1042 return bt_mgrclose (mgr), NULL;
1043 mlock (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
1045 flag = PAGE_READWRITE;
1046 mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, NULL);
1048 return bt_mgrclose (mgr), NULL;
1050 flag = FILE_MAP_WRITE;
1051 mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size);
1052 if( !mgr->latchmgr )
1053 return GetLastError(), bt_mgrclose (mgr), NULL;
1055 mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size);
1061 VirtualFree (latchmgr, 0, MEM_RELEASE);
1066 // open BTree access method
1067 // based on buffer manager
1069 BtDb *bt_open (BtMgr *mgr)
1071 BtDb *bt = malloc (sizeof(*bt));
1073 memset (bt, 0, sizeof(*bt));
1076 bt->mem = malloc (2 *mgr->page_size);
1078 bt->mem = VirtualAlloc(NULL, 2 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE);
1080 bt->frame = (BtPage)bt->mem;
1081 bt->cursor = (BtPage)(bt->mem + 1 * mgr->page_size);
1085 // compare two keys, returning > 0, = 0, or < 0
1086 // as the comparison value
1088 int keycmp (BtKey key1, unsigned char *key2, uint len2)
1090 uint len1 = key1->len;
1093 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
1106 // find segment in pool
1107 // must be called with hashslot idx locked
1108 // return NULL if not there
1109 // otherwise return node
1111 BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx)
1116 // compute start of hash chain in pool
1118 if( slot = bt->mgr->hash[idx] )
1119 pool = bt->mgr->pool + slot;
1123 page_no &= ~bt->mgr->poolmask;
1125 while( pool->basepage != page_no )
1126 if( pool = pool->hashnext )
1134 // add segment to hash table
1136 void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx)
1141 pool->hashprev = pool->hashnext = NULL;
1142 pool->basepage = page_no & ~bt->mgr->poolmask;
1143 pool->pin = CLOCK_bit + 1;
1145 if( slot = bt->mgr->hash[idx] ) {
1146 node = bt->mgr->pool + slot;
1147 pool->hashnext = node;
1148 node->hashprev = pool;
1151 bt->mgr->hash[idx] = pool->slot;
1154 // map new buffer pool segment to virtual memory
1156 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
1158 off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits;
1159 off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits);
1163 flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE );
1164 pool->map = mmap (0, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off);
1165 if( pool->map == MAP_FAILED )
1166 return bt->err = BTERR_map;
1169 flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE );
1170 pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL);
1172 return bt->err = BTERR_map;
1174 flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE );
1175 pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1177 return bt->err = BTERR_map;
1182 // calculate page within pool
1184 BtPage bt_page (BtDb *bt, BtPool *pool, uid page_no)
1186 uint subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping
1189 page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits));
1190 // madvise (page, bt->mgr->page_size, MADV_WILLNEED);
1196 void bt_unpinpool (BtPool *pool)
1199 __sync_fetch_and_add(&pool->pin, -1);
1201 _InterlockedDecrement16 (&pool->pin);
1205 // find or place requested page in segment-pool
1206 // return pool table entry, incrementing pin
1208 BtPool *bt_pinpool(BtDb *bt, uid page_no)
1210 uint slot, hashidx, idx, victim;
1211 BtPool *pool, *node, *next;
1213 // lock hash table chain
1215 hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1216 bt_spinwritelock (&bt->mgr->latch[hashidx]);
1218 // look up in hash table
1220 if( pool = bt_findpool(bt, page_no, hashidx) ) {
1222 __sync_fetch_and_or(&pool->pin, CLOCK_bit);
1223 __sync_fetch_and_add(&pool->pin, 1);
1225 _InterlockedOr16 (&pool->pin, CLOCK_bit);
1226 _InterlockedIncrement16 (&pool->pin);
1228 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1232 // allocate a new pool node
1233 // and add to hash table
1236 slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1);
1238 slot = _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1;
1241 if( ++slot < bt->mgr->poolmax ) {
1242 pool = bt->mgr->pool + slot;
1245 if( bt_mapsegment(bt, pool, page_no) )
1248 bt_linkhash(bt, pool, page_no, hashidx);
1249 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1253 // pool table is full
1254 // find best pool entry to evict
1257 __sync_fetch_and_add(&bt->mgr->poolcnt, -1);
1259 _InterlockedDecrement16 (&bt->mgr->poolcnt);
1264 victim = __sync_fetch_and_add(&bt->mgr->evicted, 1);
1266 victim = _InterlockedIncrement (&bt->mgr->evicted) - 1;
1268 victim %= bt->mgr->poolmax;
1269 pool = bt->mgr->pool + victim;
1270 idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1275 // try to get write lock
1276 // skip entry if not obtained
1278 if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
1281 // skip this entry if
1283 // or clock bit is set
1287 __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
1289 _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
1291 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1295 // unlink victim pool node from hash table
1297 if( node = pool->hashprev )
1298 node->hashnext = pool->hashnext;
1299 else if( node = pool->hashnext )
1300 bt->mgr->hash[idx] = node->slot;
1302 bt->mgr->hash[idx] = 0;
1304 if( node = pool->hashnext )
1305 node->hashprev = pool->hashprev;
1307 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1309 // remove old file mapping
1311 munmap (pool->map, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1313 // FlushViewOfFile(pool->map, 0);
1314 UnmapViewOfFile(pool->map);
1315 CloseHandle(pool->hmap);
1319 // create new pool mapping
1320 // and link into hash table
1322 if( bt_mapsegment(bt, pool, page_no) )
1325 bt_linkhash(bt, pool, page_no, hashidx);
1326 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1331 // place write, read, or parent lock on requested page_no.
1333 void bt_lockpage(BtLock mode, BtLatchSet *set)
1337 ReadLock (set->readwr);
1340 WriteLock (set->readwr);
1343 ReadLock (set->access);
1346 WriteLock (set->access);
1349 WriteLock (set->parent);
1354 // remove write, read, or parent lock on requested page
1356 void bt_unlockpage(BtLock mode, BtLatchSet *set)
1360 ReadRelease (set->readwr);
1363 WriteRelease (set->readwr);
1366 ReadRelease (set->access);
1369 WriteRelease (set->access);
1372 WriteRelease (set->parent);
1377 // allocate a new page and write page into it
1379 uid bt_newpage(BtDb *bt, BtPage page)
1385 // lock allocation page
1387 bt_spinwritelock(bt->mgr->latchmgr->lock);
1389 // use empty chain first
1390 // else allocate empty page
1392 if( new_page = bt_getid(bt->mgr->latchmgr->chain) ) {
1393 if( set->pool = bt_pinpool (bt, new_page) )
1394 set->page = bt_page (bt, set->pool, new_page);
1398 bt_putid(bt->mgr->latchmgr->chain, bt_getid(set->page->right));
1399 bt_unpinpool (set->pool);
1401 new_page = bt_getid(bt->mgr->latchmgr->alloc->right);
1402 bt_putid(bt->mgr->latchmgr->alloc->right, new_page+1);
1404 // if writing first page of pool block, set file length thru last page
1406 if( (new_page & bt->mgr->poolmask) == 0 )
1407 ftruncate (bt->mgr->idx, (new_page + bt->mgr->poolmask + 1) << bt->mgr->page_bits);
1411 // unlock allocation latch
1413 bt_spinreleasewrite(bt->mgr->latchmgr->lock);
1416 // bring new page into pool and copy page.
1417 // this will extend the file into the new pages on WIN32.
1419 if( set->pool = bt_pinpool (bt, new_page) )
1420 set->page = bt_page (bt, set->pool, new_page);
1424 memcpy(set->page, page, bt->mgr->page_size);
1425 bt_unpinpool (set->pool);
1428 // unlock allocation latch
1430 bt_spinreleasewrite(bt->mgr->latchmgr->lock);
1435 // find slot in page for given key at a given level
1437 int bt_findslot (BtPageSet *set, unsigned char *key, uint len)
1439 uint diff, higher = set->page->cnt, low = 1, slot;
1442 // make stopper key an infinite fence value
1444 if( bt_getid (set->page->right) )
1449 // low is the lowest candidate.
1450 // loop ends when they meet
1452 // higher is already
1453 // tested as .ge. the passed key.
1455 while( diff = higher - low ) {
1456 slot = low + ( diff >> 1 );
1457 if( keycmp (keyptr(set->page, slot), key, len) < 0 )
1460 higher = slot, good++;
1463 // return zero if key is on right link page
1465 return good ? higher : 0;
1468 // find and load page at given level for given key
1469 // leave page rd or wr locked as requested
1471 int bt_loadpage (BtDb *bt, BtPageSet *set, unsigned char *key, uint len, uint lvl, BtLock lock)
1473 uid page_no = ROOT_page, prevpage = 0;
1474 uint drill = 0xff, slot;
1475 BtLatchSet *prevlatch;
1476 uint mode, prevmode;
1479 // start at root of btree and drill down
1482 // determine lock mode of drill level
1483 mode = (drill == lvl) ? lock : BtLockRead;
1485 set->latch = bt_pinlatch (bt, page_no);
1486 set->page_no = page_no;
1488 // pin page contents
1490 if( set->pool = bt_pinpool (bt, page_no) )
1491 set->page = bt_page (bt, set->pool, page_no);
1495 // obtain access lock using lock chaining with Access mode
1497 if( page_no > ROOT_page )
1498 bt_lockpage(BtLockAccess, set->latch);
1500 // release & unpin parent page
1503 bt_unlockpage(prevmode, prevlatch);
1504 bt_unpinlatch (prevlatch);
1505 bt_unpinpool (prevpool);
1509 // obtain read lock using lock chaining
1511 bt_lockpage(mode, set->latch);
1513 if( set->page->free )
1514 return bt->err = BTERR_struct, 0;
1516 if( page_no > ROOT_page )
1517 bt_unlockpage(BtLockAccess, set->latch);
1519 // re-read and re-lock root after determining actual level of root
1521 if( set->page->lvl != drill) {
1522 if( set->page_no != ROOT_page )
1523 return bt->err = BTERR_struct, 0;
1525 drill = set->page->lvl;
1527 if( lock != BtLockRead && drill == lvl ) {
1528 bt_unlockpage(mode, set->latch);
1529 bt_unpinlatch (set->latch);
1530 bt_unpinpool (set->pool);
1535 prevpage = set->page_no;
1536 prevlatch = set->latch;
1537 prevpool = set->pool;
1540 // find key on page at this level
1541 // and descend to requested level
1543 if( set->page->kill )
1546 if( slot = bt_findslot (set, key, len) ) {
1550 while( slotptr(set->page, slot)->dead )
1551 if( slot++ < set->page->cnt )
1556 page_no = bt_getid(valptr(set->page, slot)->value);
1561 // or slide right into next page
1564 page_no = bt_getid(set->page->right);
1568 // return error on end of right chain
1570 bt->err = BTERR_struct;
1571 return 0; // return error
1574 // return page to free list
1575 // page must be delete & write locked
1577 void bt_freepage (BtDb *bt, BtPageSet *set)
1579 // lock allocation page
1581 bt_spinwritelock (bt->mgr->latchmgr->lock);
1584 memcpy(set->page->right, bt->mgr->latchmgr->chain, BtId);
1585 bt_putid(bt->mgr->latchmgr->chain, set->page_no);
1586 set->page->free = 1;
1588 // unlock released page
1590 bt_unlockpage (BtLockDelete, set->latch);
1591 bt_unlockpage (BtLockWrite, set->latch);
1592 bt_unpinlatch (set->latch);
1593 bt_unpinpool (set->pool);
1595 // unlock allocation page
1597 bt_spinreleasewrite (bt->mgr->latchmgr->lock);
1600 // a fence key was deleted from a page
1601 // push new fence value upwards
1603 BTERR bt_fixfence (BtDb *bt, BtPageSet *set, uint lvl)
1605 unsigned char leftkey[256], rightkey[256];
1606 unsigned char value[BtId];
1609 // remove the old fence value
1611 ptr = keyptr(set->page, set->page->cnt);
1612 memcpy (rightkey, ptr, ptr->len + 1);
1613 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1615 // cache new fence value
1617 ptr = keyptr(set->page, set->page->cnt);
1618 memcpy (leftkey, ptr, ptr->len + 1);
1620 bt_lockpage (BtLockParent, set->latch);
1621 bt_unlockpage (BtLockWrite, set->latch);
1623 // insert new (now smaller) fence key
1625 bt_putid (value, set->page_no);
1627 if( bt_insertkey (bt, leftkey+1, *leftkey, lvl+1, value, BtId, 1) )
1630 // now delete old fence key
1632 if( bt_deletekey (bt, rightkey+1, *rightkey, lvl+1) )
1635 bt_unlockpage (BtLockParent, set->latch);
1636 bt_unpinlatch(set->latch);
1637 bt_unpinpool (set->pool);
1641 // root has a single child
1642 // collapse a level from the tree
1644 BTERR bt_collapseroot (BtDb *bt, BtPageSet *root)
1649 // find the child entry and promote as new root contents
1652 for( idx = 0; idx++ < root->page->cnt; )
1653 if( !slotptr(root->page, idx)->dead )
1656 child->page_no = bt_getid (valptr(root->page, idx)->value);
1658 child->latch = bt_pinlatch (bt, child->page_no);
1659 bt_lockpage (BtLockDelete, child->latch);
1660 bt_lockpage (BtLockWrite, child->latch);
1662 if( child->pool = bt_pinpool (bt, child->page_no) )
1663 child->page = bt_page (bt, child->pool, child->page_no);
1667 memcpy (root->page, child->page, bt->mgr->page_size);
1668 bt_freepage (bt, child);
1670 } while( root->page->lvl > 1 && root->page->act == 1 );
1672 bt_unlockpage (BtLockWrite, root->latch);
1673 bt_unpinlatch (root->latch);
1674 bt_unpinpool (root->pool);
1678 // find and delete key on page by marking delete flag bit
1679 // if page becomes empty, delete it from the btree
1681 BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl)
1683 unsigned char lowerfence[256], higherfence[256];
1684 uint slot, idx, found, fence;
1685 BtPageSet set[1], right[1];
1686 unsigned char value[BtId];
1690 if( slot = bt_loadpage (bt, set, key, len, lvl, BtLockWrite) )
1691 ptr = keyptr(set->page, slot);
1695 // if librarian slot, advance to real slot
1697 if( slotptr(set->page, slot)->type == Librarian )
1698 ptr = keyptr(set->page, ++slot);
1700 fence = slot == set->page->cnt;
1702 // if key is found delete it, otherwise ignore request
1704 if( found = !keycmp (ptr, key, len) )
1705 if( found = slotptr(set->page, slot)->dead == 0 ) {
1706 val = valptr(set->page,slot);
1707 slotptr(set->page, slot)->dead = 1;
1708 set->page->garbage += ptr->len + val->len + 2;
1711 // collapse empty slots beneath our fence
1713 while( idx = set->page->cnt - 1 )
1714 if( slotptr(set->page, idx)->dead ) {
1715 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1716 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1721 // did we delete a fence key in an upper level?
1723 if( found && lvl && fence && set->page->act )
1724 if( bt_fixfence (bt, set, lvl) )
1727 return bt->found = found, 0;
1729 // do we need to collapse root?
1731 if( lvl > 1 && set->page_no == ROOT_page && set->page->act == 1 )
1732 if( bt_collapseroot (bt, set) )
1735 return bt->found = found, 0;
1737 // return if page is not empty
1739 if( set->page->act ) {
1740 bt_unlockpage(BtLockWrite, set->latch);
1741 bt_unpinlatch (set->latch);
1742 bt_unpinpool (set->pool);
1743 return bt->found = found, 0;
1746 // cache copy of fence key
1747 // to post in parent
1749 ptr = keyptr(set->page, set->page->cnt);
1750 memcpy (lowerfence, ptr, ptr->len + 1);
1752 // obtain lock on right page
1754 right->page_no = bt_getid(set->page->right);
1755 right->latch = bt_pinlatch (bt, right->page_no);
1756 bt_lockpage (BtLockWrite, right->latch);
1758 // pin page contents
1760 if( right->pool = bt_pinpool (bt, right->page_no) )
1761 right->page = bt_page (bt, right->pool, right->page_no);
1765 if( right->page->kill )
1766 return bt->err = BTERR_struct;
1768 // pull contents of right peer into our empty page
1770 memcpy (set->page, right->page, bt->mgr->page_size);
1772 // cache copy of key to update
1774 ptr = keyptr(right->page, right->page->cnt);
1775 memcpy (higherfence, ptr, ptr->len + 1);
1777 // mark right page deleted and point it to left page
1778 // until we can post parent updates
1780 bt_putid (right->page->right, set->page_no);
1781 right->page->kill = 1;
1783 bt_lockpage (BtLockParent, right->latch);
1784 bt_unlockpage (BtLockWrite, right->latch);
1786 bt_lockpage (BtLockParent, set->latch);
1787 bt_unlockpage (BtLockWrite, set->latch);
1789 // redirect higher key directly to our new node contents
1791 bt_putid (value, set->page_no);
1793 if( bt_insertkey (bt, higherfence+1, *higherfence, lvl+1, value, BtId, 1) )
1796 // delete old lower key to our node
1798 if( bt_deletekey (bt, lowerfence+1, *lowerfence, lvl+1) )
1801 // obtain delete and write locks to right node
1803 bt_unlockpage (BtLockParent, right->latch);
1804 bt_lockpage (BtLockDelete, right->latch);
1805 bt_lockpage (BtLockWrite, right->latch);
1806 bt_freepage (bt, right);
1808 bt_unlockpage (BtLockParent, set->latch);
1809 bt_unpinlatch (set->latch);
1810 bt_unpinpool (set->pool);
1815 BtKey bt_foundkey (BtDb *bt)
1817 return (BtKey)(bt->key);
1820 // advance to next slot
1822 uint bt_findnext (BtDb *bt, BtPageSet *set, uint slot)
1824 BtLatchSet *prevlatch;
1828 if( slot < set->page->cnt )
1831 prevlatch = set->latch;
1832 prevpool = set->pool;
1834 if( page_no = bt_getid(set->page->right) )
1835 set->latch = bt_pinlatch (bt, page_no);
1837 return bt->err = BTERR_struct, 0;
1839 // pin page contents
1841 if( set->pool = bt_pinpool (bt, page_no) )
1842 set->page = bt_page (bt, set->pool, page_no);
1846 // obtain access lock using lock chaining with Access mode
1848 bt_lockpage(BtLockAccess, set->latch);
1850 bt_unlockpage(BtLockRead, prevlatch);
1851 bt_unpinlatch (prevlatch);
1852 bt_unpinpool (prevpool);
1854 bt_lockpage(BtLockRead, set->latch);
1855 bt_unlockpage(BtLockAccess, set->latch);
1857 set->page_no = page_no;
1861 // find unique key or first duplicate key in
1862 // leaf level and return number of value bytes
1863 // or (-1) if not found. Setup key for bt_foundkey
1865 int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax)
1873 if( slot = bt_loadpage (bt, set, key, keylen, 0, BtLockRead) )
1875 ptr = keyptr(set->page, slot);
1877 // skip librarian slot place holder
1879 if( slotptr(set->page, slot)->type == Librarian )
1880 ptr = keyptr(set->page, ++slot);
1882 if( slotptr(set->page, slot)->dead )
1885 // return actual key found
1887 memcpy (bt->key, ptr, ptr->len + 1);
1890 if( slotptr(set->page, slot)->type == Duplicate )
1893 // not there if we reach the stopper key
1895 if( slot == set->page->cnt )
1896 if( !bt_getid (set->page->right) )
1899 // if key exists, return >= 0 value bytes copied
1900 // otherwise return (-1)
1903 if( !memcmp (ptr->key, key, len) ) {
1904 val = valptr (set->page,slot);
1905 if( valmax > val->len )
1907 memcpy (value, val->value, valmax);
1913 } while( slot = bt_findnext (bt, set, slot) );
1915 bt_unlockpage (BtLockRead, set->latch);
1916 bt_unpinlatch (set->latch);
1917 bt_unpinpool (set->pool);
1921 // check page for space available,
1922 // clean if necessary and return
1923 // 0 - page needs splitting
1924 // >0 new slot value
1926 uint bt_cleanpage(BtDb *bt, BtPage page, uint keylen, uint slot, uint vallen)
1928 uint nxt = bt->mgr->page_size;
1929 uint cnt = 0, idx = 0;
1930 uint max = page->cnt;
1935 if( page->min >= (max+2) * sizeof(BtSlot) + sizeof(*page) + keylen + 1 + vallen + 1)
1938 // skip cleanup and proceed to split
1939 // if there's not enough garbage
1942 if( page->garbage < nxt / 5 )
1945 memcpy (bt->frame, page, bt->mgr->page_size);
1947 // skip page info and set rest of page to zero
1949 memset (page+1, 0, bt->mgr->page_size - sizeof(*page));
1953 // clean up page first by
1954 // removing deleted keys
1956 while( cnt++ < max ) {
1959 if( cnt < max && slotptr(bt->frame,cnt)->dead )
1962 // copy the value across
1964 val = valptr(bt->frame, cnt);
1965 nxt -= val->len + 1;
1966 ((unsigned char *)page)[nxt] = val->len;
1967 memcpy ((unsigned char *)page + nxt + 1, val->value, val->len);
1969 // copy the key across
1971 key = keyptr(bt->frame, cnt);
1972 nxt -= key->len + 1;
1973 memcpy ((unsigned char *)page + nxt, key, key->len + 1);
1975 // make a librarian slot
1978 slotptr(page, ++idx)->off = nxt;
1979 slotptr(page, idx)->type = Librarian;
1980 slotptr(page, idx)->dead = 1;
1985 slotptr(page, ++idx)->off = nxt;
1986 slotptr(page, idx)->type = slotptr(bt->frame, cnt)->type;
1988 if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) )
1995 // see if page has enough space now, or does it need splitting?
1997 if( page->min >= (idx+2) * sizeof(BtSlot) + sizeof(*page) + keylen + 1 + vallen + 1 )
2003 // split the root and raise the height of the btree
2005 BTERR bt_splitroot(BtDb *bt, BtPageSet *root, unsigned char *leftkey, uid page_no2)
2007 uint nxt = bt->mgr->page_size;
2008 unsigned char value[BtId];
2011 // Obtain an empty page to use, and copy the current
2012 // root contents into it, e.g. lower keys
2014 if( !(left = bt_newpage(bt, root->page)) )
2017 // preserve the page info at the bottom
2018 // of higher keys and set rest to zero
2020 memset(root->page+1, 0, bt->mgr->page_size - sizeof(*root->page));
2022 // insert lower keys page fence key on newroot page as first key
2025 bt_putid (value, left);
2026 ((unsigned char *)root->page)[nxt] = BtId;
2027 memcpy ((unsigned char *)root->page + nxt + 1, value, BtId);
2029 nxt -= *leftkey + 1;
2030 memcpy ((unsigned char *)root->page + nxt, leftkey, *leftkey + 1);
2031 slotptr(root->page, 1)->off = nxt;
2033 // insert stopper key on newroot page
2034 // and increase the root height
2036 nxt -= 3 + BtId + 1;
2037 ((unsigned char *)root->page)[nxt] = 2;
2038 ((unsigned char *)root->page)[nxt+1] = 0xff;
2039 ((unsigned char *)root->page)[nxt+2] = 0xff;
2041 bt_putid (value, page_no2);
2042 ((unsigned char *)root->page)[nxt+3] = BtId;
2043 memcpy ((unsigned char *)root->page + nxt + 4, value, BtId);
2044 slotptr(root->page, 2)->off = nxt;
2046 bt_putid(root->page->right, 0);
2047 root->page->min = nxt; // reset lowest used offset and key count
2048 root->page->cnt = 2;
2049 root->page->act = 2;
2052 // release and unpin root
2054 bt_unlockpage(BtLockWrite, root->latch);
2055 bt_unpinlatch (root->latch);
2056 bt_unpinpool (root->pool);
2060 // split already locked full node
2063 BTERR bt_splitpage (BtDb *bt, BtPageSet *set)
2065 uint cnt = 0, idx = 0, max, nxt = bt->mgr->page_size;
2066 unsigned char fencekey[256], rightkey[256];
2067 unsigned char value[BtId];
2068 uint lvl = set->page->lvl;
2074 // split higher half of keys to bt->frame
2076 memset (bt->frame, 0, bt->mgr->page_size);
2077 max = set->page->cnt;
2081 while( cnt++ < max ) {
2082 if( slotptr(set->page, cnt)->dead && cnt < max )
2084 val = valptr(set->page, cnt);
2085 nxt -= val->len + 1;
2086 ((unsigned char *)bt->frame)[nxt] = val->len;
2087 memcpy ((unsigned char *)bt->frame + nxt + 1, val->value, val->len);
2089 key = keyptr(set->page, cnt);
2090 nxt -= key->len + 1;
2091 memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1);
2093 // add librarian slot
2096 slotptr(bt->frame, ++idx)->off = nxt;
2097 slotptr(bt->frame, idx)->type = Librarian;
2098 slotptr(bt->frame, idx)->dead = 1;
2103 slotptr(bt->frame, ++idx)->off = nxt;
2104 slotptr(bt->frame, idx)->type = slotptr(set->page, cnt)->type;
2106 if( !(slotptr(bt->frame, idx)->dead = slotptr(set->page, cnt)->dead) )
2110 // remember existing fence key for new page to the right
2112 memcpy (rightkey, key, key->len + 1);
2114 bt->frame->bits = bt->mgr->page_bits;
2115 bt->frame->min = nxt;
2116 bt->frame->cnt = idx;
2117 bt->frame->lvl = lvl;
2121 if( set->page_no > ROOT_page )
2122 memcpy (bt->frame->right, set->page->right, BtId);
2124 // get new free page and write higher keys to it.
2126 if( !(right->page_no = bt_newpage(bt, bt->frame)) )
2129 // update lower keys to continue in old page
2131 memcpy (bt->frame, set->page, bt->mgr->page_size);
2132 memset (set->page+1, 0, bt->mgr->page_size - sizeof(*set->page));
2133 nxt = bt->mgr->page_size;
2134 set->page->garbage = 0;
2140 if( slotptr(bt->frame, max)->type == Librarian )
2143 // assemble page of smaller keys
2145 while( cnt++ < max ) {
2146 if( slotptr(bt->frame, cnt)->dead )
2148 val = valptr(bt->frame, cnt);
2149 nxt -= val->len + 1;
2150 ((unsigned char *)set->page)[nxt] = val->len;
2151 memcpy ((unsigned char *)set->page + nxt + 1, val->value, val->len);
2153 key = keyptr(bt->frame, cnt);
2154 nxt -= key->len + 1;
2155 memcpy ((unsigned char *)set->page + nxt, key, key->len + 1);
2157 // add librarian slot
2160 slotptr(set->page, ++idx)->off = nxt;
2161 slotptr(set->page, idx)->type = Librarian;
2162 slotptr(set->page, idx)->dead = 1;
2167 slotptr(set->page, ++idx)->off = nxt;
2168 slotptr(set->page, idx)->type = slotptr(bt->frame, cnt)->type;
2172 // remember fence key for smaller page
2174 memcpy(fencekey, key, key->len + 1);
2176 bt_putid(set->page->right, right->page_no);
2177 set->page->min = nxt;
2178 set->page->cnt = idx;
2180 // if current page is the root page, split it
2182 if( set->page_no == ROOT_page )
2183 return bt_splitroot (bt, set, fencekey, right->page_no);
2185 // insert new fences in their parent pages
2187 right->latch = bt_pinlatch (bt, right->page_no);
2188 bt_lockpage (BtLockParent, right->latch);
2190 bt_lockpage (BtLockParent, set->latch);
2191 bt_unlockpage (BtLockWrite, set->latch);
2193 // insert new fence for reformulated left block of smaller keys
2195 bt_putid (value, set->page_no);
2197 if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, value, BtId, 1) )
2200 // switch fence for right block of larger keys to new right page
2202 bt_putid (value, right->page_no);
2204 if( bt_insertkey (bt, rightkey+1, *rightkey, lvl+1, value, BtId, 1) )
2207 bt_unlockpage (BtLockParent, set->latch);
2208 bt_unpinlatch (set->latch);
2209 bt_unpinpool (set->pool);
2211 bt_unlockpage (BtLockParent, right->latch);
2212 bt_unpinlatch (right->latch);
2216 // install new key and value onto page
2217 // page must already be checked for
2220 BTERR bt_insertslot (BtDb *bt, BtPageSet *set, uint slot, unsigned char *key,uint keylen, unsigned char *value, uint vallen, uint type)
2222 uint idx, librarian;
2225 // if found slot > desired slot and previous slot
2226 // is a librarian slot, use it
2229 if( slotptr(set->page, slot-1)->type == Librarian )
2232 // copy value onto page
2234 set->page->min -= vallen + 1; // reset lowest used offset
2235 ((unsigned char *)set->page)[set->page->min] = vallen;
2236 memcpy ((unsigned char *)set->page + set->page->min +1, value, vallen );
2238 // copy key onto page
2240 set->page->min -= keylen + 1; // reset lowest used offset
2241 ((unsigned char *)set->page)[set->page->min] = keylen;
2242 memcpy ((unsigned char *)set->page + set->page->min +1, key, keylen );
2244 // find first empty slot
2246 for( idx = slot; idx < set->page->cnt; idx++ )
2247 if( slotptr(set->page, idx)->dead )
2250 // now insert key into array before slot
2252 if( idx == set->page->cnt )
2253 idx += 2, set->page->cnt += 2, librarian = 2;
2259 while( idx > slot + librarian - 1 )
2260 *slotptr(set->page, idx) = *slotptr(set->page, idx - librarian), idx--;
2262 // add librarian slot
2264 if( librarian > 1 ) {
2265 node = slotptr(set->page, slot++);
2266 node->off = set->page->min;
2267 node->type = Librarian;
2273 node = slotptr(set->page, slot);
2274 node->off = set->page->min;
2278 bt_unlockpage (BtLockWrite, set->latch);
2279 bt_unpinlatch (set->latch);
2280 bt_unpinpool (set->pool);
2284 // Insert new key into the btree at given level.
2285 // either add a new key or update/add an existing one
2287 BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint keylen, uint lvl, void *value, uint vallen, uint unique)
2289 unsigned char newkey[256];
2290 uint slot, idx, len;
2297 // set up the key we're working on
2299 memcpy (newkey + 1, key, keylen);
2302 // is this a non-unique index value?
2308 sequence = bt_newdup (bt);
2309 bt_putid (newkey + *newkey + 1, sequence);
2313 while( 1 ) { // find the page and slot for the current key
2314 if( slot = bt_loadpage (bt, set, newkey + 1, *newkey, lvl, BtLockWrite) )
2315 ptr = keyptr(set->page, slot);
2318 bt->err = BTERR_ovflw;
2322 // if librarian slot == found slot, advance to real slot
2324 if( slotptr(set->page, slot)->type == Librarian )
2325 if( !keycmp (ptr, key, keylen) )
2326 ptr = keyptr(set->page, ++slot);
2330 if( slotptr(set->page, slot)->type == Duplicate )
2333 // if inserting a duplicate key or unique key
2334 // check for adequate space on the page
2335 // and insert the new key before slot.
2337 if( unique && (len != *newkey || memcmp (ptr->key, newkey+1, *newkey)) || !unique ) {
2338 if( !(slot = bt_cleanpage (bt, set->page, *newkey, slot, vallen)) )
2339 if( bt_splitpage (bt, set) )
2344 return bt_insertslot (bt, set, slot, newkey + 1, *newkey, value, vallen, type);
2347 // if key already exists, update value and return
2349 if( val = valptr(set->page, slot), val->len >= vallen ) {
2350 if( slotptr(set->page, slot)->dead )
2352 set->page->garbage += val->len - vallen;
2353 slotptr(set->page, slot)->dead = 0;
2355 memcpy (val->value, value, vallen);
2356 bt_unlockpage(BtLockWrite, set->latch);
2357 bt_unpinlatch (set->latch);
2358 bt_unpinpool (set->pool);
2362 // new update value doesn't fit in existing value area
2364 if( !slotptr(set->page, slot)->dead )
2365 set->page->garbage += val->len + ptr->len + 2;
2367 slotptr(set->page, slot)->dead = 0;
2371 if( !(slot = bt_cleanpage (bt, set->page, keylen, slot, vallen)) )
2372 if( bt_splitpage (bt, set) )
2377 set->page->min -= vallen + 1;
2378 ((unsigned char *)set->page)[set->page->min] = vallen;
2379 memcpy ((unsigned char *)set->page + set->page->min +1, value, vallen);
2381 set->page->min -= keylen + 1;
2382 ((unsigned char *)set->page)[set->page->min] = keylen;
2383 memcpy ((unsigned char *)set->page + set->page->min +1, key, keylen);
2385 slotptr(set->page, slot)->off = set->page->min;
2386 bt_unlockpage(BtLockWrite, set->latch);
2387 bt_unpinlatch (set->latch);
2388 bt_unpinpool (set->pool);
2394 // return next slot for cursor page
2395 // or slide cursor right into next page
2397 uint bt_nextkey (BtDb *bt, uint slot)
2403 right = bt_getid(bt->cursor->right);
2405 while( slot++ < bt->cursor->cnt )
2406 if( slotptr(bt->cursor,slot)->dead )
2408 else if( right || (slot < bt->cursor->cnt) ) // skip infinite stopper
2416 bt->cursor_page = right;
2418 if( set->pool = bt_pinpool (bt, right) )
2419 set->page = bt_page (bt, set->pool, right);
2423 set->latch = bt_pinlatch (bt, right);
2424 bt_lockpage(BtLockRead, set->latch);
2426 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2428 bt_unlockpage(BtLockRead, set->latch);
2429 bt_unpinlatch (set->latch);
2430 bt_unpinpool (set->pool);
2438 // cache page of keys into cursor and return starting slot for given key
2440 uint bt_startkey (BtDb *bt, unsigned char *key, uint len)
2445 // cache page for retrieval
2447 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
2448 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2452 bt->cursor_page = set->page_no;
2454 bt_unlockpage(BtLockRead, set->latch);
2455 bt_unpinlatch (set->latch);
2456 bt_unpinpool (set->pool);
2460 BtKey bt_key(BtDb *bt, uint slot)
2462 return keyptr(bt->cursor, slot);
2465 BtVal bt_val(BtDb *bt, uint slot)
2467 return valptr(bt->cursor,slot);
2473 double getCpuTime(int type)
2476 FILETIME xittime[1];
2477 FILETIME systime[1];
2478 FILETIME usrtime[1];
2479 SYSTEMTIME timeconv[1];
2482 memset (timeconv, 0, sizeof(SYSTEMTIME));
2486 GetSystemTimeAsFileTime (xittime);
2487 FileTimeToSystemTime (xittime, timeconv);
2488 ans = (double)timeconv->wDayOfWeek * 3600 * 24;
2491 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2492 FileTimeToSystemTime (usrtime, timeconv);
2495 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2496 FileTimeToSystemTime (systime, timeconv);
2500 ans += (double)timeconv->wHour * 3600;
2501 ans += (double)timeconv->wMinute * 60;
2502 ans += (double)timeconv->wSecond;
2503 ans += (double)timeconv->wMilliseconds / 1000;
2508 #include <sys/resource.h>
2510 double getCpuTime(int type)
2512 struct rusage used[1];
2513 struct timeval tv[1];
2517 gettimeofday(tv, NULL);
2518 return (double)tv->tv_sec + (double)tv->tv_usec / 1000000;
2521 getrusage(RUSAGE_SELF, used);
2522 return (double)used->ru_utime.tv_sec + (double)used->ru_utime.tv_usec / 1000000;
2525 getrusage(RUSAGE_SELF, used);
2526 return (double)used->ru_stime.tv_sec + (double)used->ru_stime.tv_usec / 1000000;
2533 uint bt_latchaudit (BtDb *bt)
2535 ushort idx, hashidx;
2542 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2544 if( *(ushort *)(bt->mgr->latchmgr->lock) )
2545 fprintf(stderr, "Alloc page locked\n");
2546 *(ushort *)(bt->mgr->latchmgr->lock) = 0;
2548 for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
2549 latch = bt->mgr->latchsets + idx;
2550 if( *latch->readwr->rin & MASK )
2551 fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
2552 memset ((ushort *)latch->readwr, 0, sizeof(RWLock));
2554 if( *latch->access->rin & MASK )
2555 fprintf(stderr, "latchset %d accesslocked for page %.8x\n", idx, latch->page_no);
2556 memset ((ushort *)latch->access, 0, sizeof(RWLock));
2558 if( *latch->parent->rin & MASK )
2559 fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
2560 memset ((ushort *)latch->parent, 0, sizeof(RWLock));
2563 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2568 for( hashidx = 0; hashidx < bt->mgr->latchmgr->latchhash; hashidx++ ) {
2569 if( *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) )
2570 fprintf(stderr, "hash entry %d locked\n", hashidx);
2572 *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) = 0;
2574 if( idx = bt->mgr->latchmgr->table[hashidx].slot ) do {
2575 latch = bt->mgr->latchsets + idx;
2576 if( *(ushort *)latch->busy )
2577 fprintf(stderr, "latchset %d busylocked for page %.8x\n", idx, latch->page_no);
2578 *(ushort *)latch->busy = 0;
2580 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2581 } while( idx = latch->next );
2584 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2585 page_no = LEAF_page;
2587 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2588 off64_t off = page_no << bt->mgr->page_bits;
2590 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2594 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2596 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2597 fprintf(stderr, "page %.8x unable to read\n", page_no);
2599 if( *amt < bt->mgr->page_size )
2600 fprintf(stderr, "page %.8x unable to read\n", page_no);
2602 if( !bt->frame->free ) {
2603 for( idx = 0; idx++ < bt->frame->cnt - 1; ) {
2604 ptr = keyptr(bt->frame, idx+1);
2605 if( keycmp (keyptr(bt->frame, idx), ptr->key, ptr->len) >= 0 )
2606 fprintf(stderr, "page %.8x idx %.2x out of order\n", page_no, idx);
2608 if( !bt->frame->lvl )
2609 cnt += bt->frame->act;
2611 if( page_no > LEAF_page )
2626 // standalone program to index file of keys
2627 // then list them onto std-out
2630 void *index_file (void *arg)
2632 uint __stdcall index_file (void *arg)
2635 int line = 0, found = 0, cnt = 0;
2636 uid next, page_no = LEAF_page; // start on first page of leaves
2637 unsigned char key[256];
2638 ThreadArg *args = arg;
2639 int ch, len = 0, slot;
2645 bt = bt_open (args->mgr);
2647 switch(args->type[0] | 0x20)
2650 fprintf(stderr, "started latch mgr audit\n");
2651 cnt = bt_latchaudit (bt);
2652 fprintf(stderr, "finished latch mgr audit, found %d keys\n", cnt);
2656 fprintf(stderr, "started pennysort for %s\n", args->infile);
2657 if( in = fopen (args->infile, "rb") )
2658 while( ch = getc(in), ch != EOF )
2663 if( bt_insertkey (bt, key, 10, 0, key + 10, len - 10, 0) )
2664 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2667 else if( len < 255 )
2669 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2673 fprintf(stderr, "started indexing for %s\n", args->infile);
2674 if( in = fopen (args->infile, "rb") )
2675 while( ch = getc(in), ch != EOF )
2680 if( args->num == 1 )
2681 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2683 else if( args->num )
2684 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2686 if( bt_insertkey (bt, key, len, 0, NULL, 0, 0) )
2687 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2690 else if( len < 255 )
2692 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2696 fprintf(stderr, "started deleting keys for %s\n", args->infile);
2697 if( in = fopen (args->infile, "rb") )
2698 while( ch = getc(in), ch != EOF )
2702 if( args->num == 1 )
2703 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2705 else if( args->num )
2706 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2708 if( bt_findkey (bt, key, len, NULL, 0) < 0 )
2709 fprintf(stderr, "Cannot find key for Line: %d\n", line), exit(0);
2710 ptr = (BtKey)(bt->key);
2712 if( bt_deletekey (bt, ptr->key, ptr->len, 0) )
2713 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2717 else if( len < 255 )
2719 fprintf(stderr, "finished %s for %d keys, %d found\n", args->infile, line, found);
2723 fprintf(stderr, "started finding keys for %s\n", args->infile);
2724 if( in = fopen (args->infile, "rb") )
2725 while( ch = getc(in), ch != EOF )
2729 if( args->num == 1 )
2730 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2732 else if( args->num )
2733 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2735 if( bt_findkey (bt, key, len, NULL, 0) == 0 )
2738 fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
2741 else if( len < 255 )
2743 fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found);
2747 fprintf(stderr, "started scanning\n");
2749 if( set->pool = bt_pinpool (bt, page_no) )
2750 set->page = bt_page (bt, set->pool, page_no);
2753 set->latch = bt_pinlatch (bt, page_no);
2754 bt_lockpage (BtLockRead, set->latch);
2755 next = bt_getid (set->page->right);
2757 for( slot = 0; slot++ < set->page->cnt; )
2758 if( next || slot < set->page->cnt )
2759 if( !slotptr(set->page, slot)->dead ) {
2760 ptr = keyptr(set->page, slot);
2763 if( slotptr(set->page, slot)->type == Duplicate )
2766 fwrite (ptr->key, len, 1, stdout);
2767 fputc ('\n', stdout);
2771 bt_unlockpage (BtLockRead, set->latch);
2772 bt_unpinlatch (set->latch);
2773 bt_unpinpool (set->pool);
2774 } while( page_no = next );
2776 fprintf(stderr, " Total keys read %d\n", cnt);
2781 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2783 fprintf(stderr, "started counting\n");
2784 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2785 page_no = LEAF_page;
2787 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2788 uid off = page_no << bt->mgr->page_bits;
2790 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2794 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2796 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2797 return bt->err = BTERR_map;
2799 if( *amt < bt->mgr->page_size )
2800 return bt->err = BTERR_map;
2802 if( !bt->frame->free && !bt->frame->lvl )
2803 cnt += bt->frame->act;
2804 if( page_no > LEAF_page )
2809 cnt--; // remove stopper key
2810 fprintf(stderr, " Total keys read %d\n", cnt);
2822 typedef struct timeval timer;
2824 int main (int argc, char **argv)
2826 int idx, cnt, len, slot, err;
2827 int segsize, bits = 16;
2844 fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... ]\n", argv[0]);
2845 fprintf (stderr, " where page_bits is the page size in bits\n");
2846 fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n");
2847 fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n");
2848 fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n");
2849 fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n");
2853 start = getCpuTime(0);
2856 bits = atoi(argv[3]);
2859 poolsize = atoi(argv[4]);
2862 fprintf (stderr, "Warning: no mapped_pool\n");
2864 if( poolsize > 65535 )
2865 fprintf (stderr, "Warning: mapped_pool > 65535 segments\n");
2868 segsize = atoi(argv[5]);
2870 segsize = 4; // 16 pages per mmap segment
2873 num = atoi(argv[6]);
2877 threads = malloc (cnt * sizeof(pthread_t));
2879 threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE));
2881 args = malloc (cnt * sizeof(ThreadArg));
2883 mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8);
2886 fprintf(stderr, "Index Open Error %s\n", argv[1]);
2892 for( idx = 0; idx < cnt; idx++ ) {
2893 args[idx].infile = argv[idx + 7];
2894 args[idx].type = argv[2];
2895 args[idx].mgr = mgr;
2896 args[idx].num = num;
2897 args[idx].idx = idx;
2899 if( err = pthread_create (threads + idx, NULL, index_file, args + idx) )
2900 fprintf(stderr, "Error creating thread %d\n", err);
2902 threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL);
2906 // wait for termination
2909 for( idx = 0; idx < cnt; idx++ )
2910 pthread_join (threads[idx], NULL);
2912 WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
2914 for( idx = 0; idx < cnt; idx++ )
2915 CloseHandle(threads[idx]);
2918 elapsed = getCpuTime(0) - start;
2919 fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2920 elapsed = getCpuTime(1);
2921 fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2922 elapsed = getCpuTime(2);
2923 fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);