1 // btree version threadskv4b sched_yield version
2 // with reworked bt_deletekey code,
3 // phase-fair reader writer lock,
4 // librarian page split code,
5 // and duplicate key management
9 // author: karl malbrain, malbrain@cal.berkeley.edu
12 This work, including the source code, documentation
13 and related data, is placed into the public domain.
15 The orginal author is Karl Malbrain.
17 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
18 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
19 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
20 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
21 RESULTING FROM THE USE, MODIFICATION, OR
22 REDISTRIBUTION OF THIS SOFTWARE.
25 // Please see the project home page for documentation
26 // code.google.com/p/high-concurrency-btree
28 #define _FILE_OFFSET_BITS 64
29 #define _LARGEFILE64_SOURCE
45 #define WIN32_LEAN_AND_MEAN
59 typedef unsigned long long uid;
62 typedef unsigned long long off64_t;
63 typedef unsigned short ushort;
64 typedef unsigned int uint;
67 #define BT_latchtable 128 // number of latch manager slots
69 #define BT_ro 0x6f72 // ro
70 #define BT_rw 0x7772 // rw
72 #define BT_maxbits 24 // maximum page size in bits
73 #define BT_minbits 9 // minimum page size in bits
74 #define BT_minpage (1 << BT_minbits) // minimum page size
75 #define BT_maxpage (1 << BT_maxbits) // maximum page size
78 There are five lock types for each node in three independent sets:
79 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
80 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
81 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
82 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
83 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
94 // definition for phase-fair reader/writer lock implementation
108 // definition for spin latch implementation
110 // exclusive is set for write access
111 // share is count of read accessors
112 // grant write lock when share == 0
114 volatile typedef struct {
125 // hash table entries
128 BtSpinLatch latch[1];
129 volatile ushort slot; // Latch table entry at head of chain
132 // latch manager table structure
135 RWLock readwr[1]; // read/write page lock
136 RWLock access[1]; // Access Intent/Page delete
137 RWLock parent[1]; // Posting of fence key in parent
138 BtSpinLatch busy[1]; // slot is being moved between chains
139 volatile ushort next; // next entry in hash table chain
140 volatile ushort prev; // prev entry in hash table chain
141 volatile ushort pin; // number of outstanding locks
142 volatile ushort hash; // hash slot entry is under
143 volatile uid page_no; // latch set page number
146 // Define the length of the page and key pointers
150 // Page key slot definition.
152 // Keys are marked dead, but remain on the page until
153 // it cleanup is called. The fence key (highest key) for
154 // a leaf page is always present, even after cleanup.
158 // In addition to the Unique keys that occupy slots
159 // there are Librarian and Duplicate key
160 // slots occupying the key slot array.
162 // The Librarian slots are dead keys that
163 // serve as filler, available to add new Unique
164 // or Dup slots that are inserted into the B-tree.
166 // The Duplicate slots have had their key bytes extended
167 // by 6 bytes to contain a binary duplicate key uniqueifier.
176 uint off:BT_maxbits; // page offset for key start
177 uint type:3; // type of slot
178 uint dead:1; // set for deleted slot
181 // The key structure occupies space at the upper end of
182 // each page. It's a length byte followed by the key
187 unsigned char key[1];
190 // the value structure also occupies space at the upper
191 // end of the page. Each key is immediately followed by a value.
195 unsigned char value[1];
198 // The first part of an index page.
199 // It is immediately followed
200 // by the BtSlot array of keys.
202 // note that this structure size
203 // must be a multiple of 8 bytes
204 // in order to place dups correctly.
206 typedef struct BtPage_ {
207 uint cnt; // count of keys in page
208 uint act; // count of active keys
209 uint min; // next key offset
210 uint garbage; // page garbage in bytes
211 unsigned char bits:7; // page size in bits
212 unsigned char free:1; // page is on free chain
213 unsigned char lvl:7; // level of page
214 unsigned char kill:1; // page is being deleted
215 unsigned char right[BtId]; // page number to right
218 // The memory mapping pool table buffer manager entry
221 uid basepage; // mapped base page number
222 char *map; // mapped memory pointer
223 ushort slot; // slot index in this array
224 ushort pin; // mapped page pin counter
225 void *hashprev; // previous pool entry for the same hash idx
226 void *hashnext; // next pool entry for the same hash idx
228 HANDLE hmap; // Windows memory mapping handle
232 #define CLOCK_bit 0x8000 // bit in pool->pin
234 // The loadpage interface object
237 uid page_no; // current page number
238 BtPage page; // current page pointer
239 BtPool *pool; // current page pool
240 BtLatchSet *latch; // current page latch set
243 // structure for latch manager on ALLOC_page
246 struct BtPage_ alloc[1]; // next page_no in right ptr
247 unsigned long long dups[1]; // global duplicate key uniqueifier
248 unsigned char chain[BtId]; // head of free page_nos chain
249 BtSpinLatch lock[1]; // allocation area lite latch
250 ushort latchdeployed; // highest number of latch entries deployed
251 ushort nlatchpage; // number of latch pages at BT_latch
252 ushort latchtotal; // number of page latch entries
253 ushort latchhash; // number of latch hash table slots
254 ushort latchvictim; // next latch entry to examine
255 BtHashEntry table[0]; // the hash table
258 // The object structure for Btree access
261 uint page_size; // page size
262 uint page_bits; // page size in bits
263 uint seg_bits; // seg size in pages in bits
264 uint mode; // read-write mode
270 ushort poolcnt; // highest page pool node in use
271 ushort poolmax; // highest page pool node allocated
272 ushort poolmask; // total number of pages in mmap segment - 1
273 ushort hashsize; // size of Hash Table for pool entries
274 volatile uint evicted; // last evicted hash table slot
275 ushort *hash; // pool index for hash entries
276 BtSpinLatch *latch; // latches for hash table slots
277 BtLatchMgr *latchmgr; // mapped latch page from allocation page
278 BtLatchSet *latchsets; // mapped latch set from latch pages
279 BtPool *pool; // memory pool page segments
281 HANDLE halloc; // allocation and latch table handle
286 BtMgr *mgr; // buffer manager for thread
287 BtPage cursor; // cached frame for start/next (never mapped)
288 BtPage frame; // spare frame for the page split (never mapped)
289 uid cursor_page; // current cursor page number
290 unsigned char *mem; // frame, cursor, page memory buffer
291 unsigned char key[256]; // last found complete key
292 int found; // last delete or insert was found
293 int err; // last error
307 extern void bt_close (BtDb *bt);
308 extern BtDb *bt_open (BtMgr *mgr);
309 extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, void *value, uint vallen, uint update);
310 extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl);
311 extern int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax);
312 extern BtKey bt_foundkey (BtDb *bt);
313 extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len);
314 extern uint bt_nextkey (BtDb *bt, uint slot);
317 extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize);
318 void bt_mgrclose (BtMgr *mgr);
320 // Helper functions to return slot values
321 // from the cursor page.
323 extern BtKey bt_key (BtDb *bt, uint slot);
324 extern BtVal bt_val (BtDb *bt, uint slot);
326 // BTree page number constants
327 #define ALLOC_page 0 // allocation & latch manager hash table
328 #define ROOT_page 1 // root of the btree
329 #define LEAF_page 2 // first page of leaves
330 #define LATCH_page 3 // pages for latch manager
332 // Number of levels to create in a new BTree
336 // The page is allocated from low and hi ends.
337 // The key slots are allocated from the bottom,
338 // while the text and value of the key
339 // are allocated from the top. When the two
340 // areas meet, the page is split into two.
342 // A key consists of a length byte, two bytes of
343 // index number (0 - 65534), and up to 253 bytes
346 // Associated with each key is a value byte string
347 // containing any value desired.
349 // The b-tree root is always located at page 1.
350 // The first leaf page of level zero is always
351 // located on page 2.
353 // The b-tree pages are linked with next
354 // pointers to facilitate enumerators,
355 // and provide for concurrency.
357 // When to root page fills, it is split in two and
358 // the tree height is raised by a new root at page
359 // one with two keys.
361 // Deleted keys are marked with a dead bit until
362 // page cleanup. The fence key for a leaf node is
365 // Groups of pages called segments from the btree are optionally
366 // cached with a memory mapped pool. A hash table is used to keep
367 // track of the cached segments. This behaviour is controlled
368 // by the cache block size parameter to bt_open.
370 // To achieve maximum concurrency one page is locked at a time
371 // as the tree is traversed to find leaf key in question. The right
372 // page numbers are used in cases where the page is being split,
375 // Page 0 is dedicated to lock for new page extensions,
376 // and chains empty pages together for reuse. It also
377 // contains the latch manager hash table.
379 // The ParentModification lock on a node is obtained to serialize posting
380 // or changing the fence key for a node.
382 // Empty pages are chained together through the ALLOC page and reused.
384 // Access macros to address slot and key values from the page
385 // Page slots use 1 based indexing.
387 #define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1))
388 #define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off))
389 #define valptr(page, slot) ((BtVal)(keyptr(page,slot)->key + keyptr(page,slot)->len))
391 void bt_putid(unsigned char *dest, uid id)
396 dest[i] = (unsigned char)id, id >>= 8;
399 uid bt_getid(unsigned char *src)
404 for( i = 0; i < BtId; i++ )
405 id <<= 8, id |= *src++;
410 uid bt_newdup (BtDb *bt)
413 return __sync_fetch_and_add (bt->mgr->latchmgr->dups, 1) + 1;
415 return _InterlockedIncrement64(bt->mgr->latchmgr->dups, 1);
419 // Phase-Fair reader/writer lock implementation
421 void WriteLock (RWLock *lock)
426 tix = __sync_fetch_and_add (lock->ticket, 1);
428 tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
430 // wait for our ticket to come up
432 while( tix != lock->serving[0] )
439 w = PRES | (tix & PHID);
441 r = __sync_fetch_and_add (lock->rin, w);
443 r = _InterlockedExchangeAdd16 (lock->rin, w);
445 while( r != *lock->rout )
453 void WriteRelease (RWLock *lock)
456 __sync_fetch_and_and (lock->rin, ~MASK);
458 _InterlockedAnd16 (lock->rin, ~MASK);
463 void ReadLock (RWLock *lock)
467 w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
469 w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
472 while( w == (*lock->rin & MASK) )
480 void ReadRelease (RWLock *lock)
483 __sync_fetch_and_add (lock->rout, RINC);
485 _InterlockedExchangeAdd16 (lock->rout, RINC);
489 // Spin Latch Manager
491 // wait until write lock mode is clear
492 // and add 1 to the share count
494 void bt_spinreadlock(BtSpinLatch *latch)
500 prev = __sync_fetch_and_add ((ushort *)latch, SHARE);
502 prev = _InterlockedExchangeAdd16((ushort *)latch, SHARE);
504 // see if exclusive request is granted or pending
509 prev = __sync_fetch_and_add ((ushort *)latch, -SHARE);
511 prev = _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
514 } while( sched_yield(), 1 );
516 } while( SwitchToThread(), 1 );
520 // wait for other read and write latches to relinquish
522 void bt_spinwritelock(BtSpinLatch *latch)
528 prev = __sync_fetch_and_or((ushort *)latch, PEND | XCL);
530 prev = _InterlockedOr16((ushort *)latch, PEND | XCL);
533 if( !(prev & ~BOTH) )
537 __sync_fetch_and_and ((ushort *)latch, ~XCL);
539 _InterlockedAnd16((ushort *)latch, ~XCL);
542 } while( sched_yield(), 1 );
544 } while( SwitchToThread(), 1 );
548 // try to obtain write lock
550 // return 1 if obtained,
553 int bt_spinwritetry(BtSpinLatch *latch)
558 prev = __sync_fetch_and_or((ushort *)latch, XCL);
560 prev = _InterlockedOr16((ushort *)latch, XCL);
562 // take write access if all bits are clear
565 if( !(prev & ~BOTH) )
569 __sync_fetch_and_and ((ushort *)latch, ~XCL);
571 _InterlockedAnd16((ushort *)latch, ~XCL);
578 void bt_spinreleasewrite(BtSpinLatch *latch)
581 __sync_fetch_and_and((ushort *)latch, ~BOTH);
583 _InterlockedAnd16((ushort *)latch, ~BOTH);
587 // decrement reader count
589 void bt_spinreleaseread(BtSpinLatch *latch)
592 __sync_fetch_and_add((ushort *)latch, -SHARE);
594 _InterlockedExchangeAdd16((ushort *)latch, -SHARE);
598 // link latch table entry into latch hash table
600 void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no)
602 BtLatchSet *set = bt->mgr->latchsets + victim;
604 if( set->next = bt->mgr->latchmgr->table[hashidx].slot )
605 bt->mgr->latchsets[set->next].prev = victim;
607 bt->mgr->latchmgr->table[hashidx].slot = victim;
608 set->page_no = page_no;
615 void bt_unpinlatch (BtLatchSet *set)
618 __sync_fetch_and_add(&set->pin, -1);
620 _InterlockedDecrement16 (&set->pin);
624 // find existing latchset or inspire new one
625 // return with latchset pinned
627 BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no)
629 ushort hashidx = page_no % bt->mgr->latchmgr->latchhash;
630 ushort slot, avail = 0, victim, idx;
633 // obtain read lock on hash table entry
635 bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch);
637 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
639 set = bt->mgr->latchsets + slot;
640 if( page_no == set->page_no )
642 } while( slot = set->next );
646 __sync_fetch_and_add(&set->pin, 1);
648 _InterlockedIncrement16 (&set->pin);
652 bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch);
657 // try again, this time with write lock
659 bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch);
661 if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do
663 set = bt->mgr->latchsets + slot;
664 if( page_no == set->page_no )
666 if( !set->pin && !avail )
668 } while( slot = set->next );
670 // found our entry, or take over an unpinned one
672 if( slot || (slot = avail) ) {
673 set = bt->mgr->latchsets + slot;
675 __sync_fetch_and_add(&set->pin, 1);
677 _InterlockedIncrement16 (&set->pin);
679 set->page_no = page_no;
680 bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch);
684 // see if there are any unused entries
686 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, 1) + 1;
688 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed);
691 if( victim < bt->mgr->latchmgr->latchtotal ) {
692 set = bt->mgr->latchsets + victim;
694 __sync_fetch_and_add(&set->pin, 1);
696 _InterlockedIncrement16 (&set->pin);
698 bt_latchlink (bt, hashidx, victim, page_no);
699 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
704 victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1);
706 victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed);
708 // find and reuse previous lock entry
712 victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1);
714 victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1;
716 // we don't use slot zero
718 if( victim %= bt->mgr->latchmgr->latchtotal )
719 set = bt->mgr->latchsets + victim;
723 // take control of our slot
724 // from other threads
726 if( set->pin || !bt_spinwritetry (set->busy) )
731 // try to get write lock on hash chain
732 // skip entry if not obtained
733 // or has outstanding locks
735 if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) {
736 bt_spinreleasewrite (set->busy);
741 bt_spinreleasewrite (set->busy);
742 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
746 // unlink our available victim from its hash chain
749 bt->mgr->latchsets[set->prev].next = set->next;
751 bt->mgr->latchmgr->table[idx].slot = set->next;
754 bt->mgr->latchsets[set->next].prev = set->prev;
756 bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch);
758 __sync_fetch_and_add(&set->pin, 1);
760 _InterlockedIncrement16 (&set->pin);
762 bt_latchlink (bt, hashidx, victim, page_no);
763 bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch);
764 bt_spinreleasewrite (set->busy);
769 void bt_mgrclose (BtMgr *mgr)
774 // release mapped pages
775 // note that slot zero is never used
777 for( slot = 1; slot < mgr->poolmax; slot++ ) {
778 pool = mgr->pool + slot;
781 munmap (pool->map, (uid)(mgr->poolmask+1) << mgr->page_bits);
784 FlushViewOfFile(pool->map, 0);
785 UnmapViewOfFile(pool->map);
786 CloseHandle(pool->hmap);
792 munmap (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
793 munmap (mgr->latchmgr, 2 * mgr->page_size);
795 FlushViewOfFile(mgr->latchmgr, 0);
796 UnmapViewOfFile(mgr->latchmgr);
797 CloseHandle(mgr->halloc);
803 free ((void *)mgr->latch);
806 FlushFileBuffers(mgr->idx);
807 CloseHandle(mgr->idx);
808 GlobalFree (mgr->pool);
809 GlobalFree (mgr->hash);
810 GlobalFree ((void *)mgr->latch);
815 // close and release memory
817 void bt_close (BtDb *bt)
824 VirtualFree (bt->mem, 0, MEM_RELEASE);
829 // open/create new btree buffer manager
831 // call with file_name, BT_openmode, bits in page size (e.g. 16),
832 // size of mapped page pool (e.g. 8192)
834 BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize)
836 uint lvl, attr, cacheblk, last, slot, idx;
837 uint nlatchpage, latchhash;
838 unsigned char value[BtId];
839 BtLatchMgr *latchmgr;
848 SYSTEM_INFO sysinfo[1];
851 // determine sanity of page size and buffer pool
853 if( bits > BT_maxbits )
855 else if( bits < BT_minbits )
859 return NULL; // must have buffer pool
862 mgr = calloc (1, sizeof(BtMgr));
864 mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
867 return free(mgr), NULL;
869 cacheblk = 4096; // minimum mmap segment size for unix
872 mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr));
873 attr = FILE_ATTRIBUTE_NORMAL;
874 mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
876 if( mgr->idx == INVALID_HANDLE_VALUE )
877 return GlobalFree(mgr), NULL;
879 // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity
880 GetSystemInfo(sysinfo);
881 cacheblk = sysinfo->dwAllocationGranularity;
885 latchmgr = malloc (BT_maxpage);
888 // read minimum page size to get root info
890 if( size = lseek (mgr->idx, 0L, 2) ) {
891 if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage )
892 bits = latchmgr->alloc->bits;
894 return free(mgr), free(latchmgr), NULL;
895 } else if( mode == BT_ro )
896 return free(latchmgr), bt_mgrclose (mgr), NULL;
898 latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE);
899 size = GetFileSize(mgr->idx, amt);
902 if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) )
903 return bt_mgrclose (mgr), NULL;
904 bits = latchmgr->alloc->bits;
905 } else if( mode == BT_ro )
906 return bt_mgrclose (mgr), NULL;
909 mgr->page_size = 1 << bits;
910 mgr->page_bits = bits;
912 mgr->poolmax = poolmax;
915 if( cacheblk < mgr->page_size )
916 cacheblk = mgr->page_size;
918 // mask for partial memmaps
920 mgr->poolmask = (cacheblk >> bits) - 1;
922 // see if requested size of pages per memmap is greater
924 if( (1 << segsize) > mgr->poolmask )
925 mgr->poolmask = (1 << segsize) - 1;
929 while( (1 << mgr->seg_bits) <= mgr->poolmask )
932 mgr->hashsize = hashsize;
935 mgr->pool = calloc (poolmax, sizeof(BtPool));
936 mgr->hash = calloc (hashsize, sizeof(ushort));
937 mgr->latch = calloc (hashsize, sizeof(BtSpinLatch));
939 mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool));
940 mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort));
941 mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch));
947 // initialize an empty b-tree with latch page, root page, page of leaves
948 // and page(s) of latches
950 memset (latchmgr, 0, 1 << bits);
951 nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1;
952 bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage);
953 latchmgr->alloc->bits = mgr->page_bits;
955 latchmgr->nlatchpage = nlatchpage;
956 latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet));
958 // initialize latch manager
960 latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry);
962 // size of hash table = total number of latchsets
964 if( latchhash > latchmgr->latchtotal )
965 latchhash = latchmgr->latchtotal;
967 latchmgr->latchhash = latchhash;
970 if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size )
971 return bt_mgrclose (mgr), NULL;
973 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
974 return bt_mgrclose (mgr), NULL;
976 if( *amt < mgr->page_size )
977 return bt_mgrclose (mgr), NULL;
980 memset (latchmgr, 0, 1 << bits);
981 latchmgr->alloc->bits = mgr->page_bits;
983 for( lvl=MIN_lvl; lvl--; ) {
984 slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3 - (lvl ? BtId + 1: 1);
985 key = keyptr(latchmgr->alloc, 1);
986 key->len = 2; // create stopper key
990 bt_putid(value, MIN_lvl - lvl + 1);
991 val = valptr(latchmgr->alloc, 1);
992 val->len = lvl ? BtId : 0;
993 memcpy (val->value, value, val->len);
995 latchmgr->alloc->min = slotptr(latchmgr->alloc, 1)->off;
996 latchmgr->alloc->lvl = lvl;
997 latchmgr->alloc->cnt = 1;
998 latchmgr->alloc->act = 1;
1000 if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size )
1001 return bt_mgrclose (mgr), NULL;
1003 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1004 return bt_mgrclose (mgr), NULL;
1006 if( *amt < mgr->page_size )
1007 return bt_mgrclose (mgr), NULL;
1011 // clear out latch manager locks
1012 // and rest of pages to round out segment
1014 memset(latchmgr, 0, mgr->page_size);
1017 while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) {
1019 pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits);
1021 SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN);
1022 if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) )
1023 return bt_mgrclose (mgr), NULL;
1024 if( *amt < mgr->page_size )
1025 return bt_mgrclose (mgr), NULL;
1032 // mlock the root page and the latchmgr page
1034 flag = PROT_READ | PROT_WRITE;
1035 mgr->latchmgr = mmap (0, 2 * mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size);
1036 if( mgr->latchmgr == MAP_FAILED )
1037 return bt_mgrclose (mgr), NULL;
1038 mlock (mgr->latchmgr, 2 * mgr->page_size);
1040 mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size);
1041 if( mgr->latchsets == MAP_FAILED )
1042 return bt_mgrclose (mgr), NULL;
1043 mlock (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size);
1045 flag = PAGE_READWRITE;
1046 mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, NULL);
1048 return bt_mgrclose (mgr), NULL;
1050 flag = FILE_MAP_WRITE;
1051 mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size);
1052 if( !mgr->latchmgr )
1053 return GetLastError(), bt_mgrclose (mgr), NULL;
1055 mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size);
1061 VirtualFree (latchmgr, 0, MEM_RELEASE);
1066 // open BTree access method
1067 // based on buffer manager
1069 BtDb *bt_open (BtMgr *mgr)
1071 BtDb *bt = malloc (sizeof(*bt));
1073 memset (bt, 0, sizeof(*bt));
1076 bt->mem = malloc (2 *mgr->page_size);
1078 bt->mem = VirtualAlloc(NULL, 2 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE);
1080 bt->frame = (BtPage)bt->mem;
1081 bt->cursor = (BtPage)(bt->mem + 1 * mgr->page_size);
1085 // compare two keys, returning > 0, = 0, or < 0
1086 // as the comparison value
1088 int keycmp (BtKey key1, unsigned char *key2, uint len2)
1090 uint len1 = key1->len;
1093 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
1106 // find segment in pool
1107 // must be called with hashslot idx locked
1108 // return NULL if not there
1109 // otherwise return node
1111 BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx)
1116 // compute start of hash chain in pool
1118 if( slot = bt->mgr->hash[idx] )
1119 pool = bt->mgr->pool + slot;
1123 page_no &= ~bt->mgr->poolmask;
1125 while( pool->basepage != page_no )
1126 if( pool = pool->hashnext )
1134 // add segment to hash table
1136 void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx)
1141 pool->hashprev = pool->hashnext = NULL;
1142 pool->basepage = page_no & ~bt->mgr->poolmask;
1143 pool->pin = CLOCK_bit + 1;
1145 if( slot = bt->mgr->hash[idx] ) {
1146 node = bt->mgr->pool + slot;
1147 pool->hashnext = node;
1148 node->hashprev = pool;
1151 bt->mgr->hash[idx] = pool->slot;
1154 // map new buffer pool segment to virtual memory
1156 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
1158 off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits;
1159 off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits);
1163 flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE );
1164 pool->map = mmap (0, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off);
1165 if( pool->map == MAP_FAILED )
1166 return bt->err = BTERR_map;
1169 flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE );
1170 pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL);
1172 return bt->err = BTERR_map;
1174 flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE );
1175 pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1177 return bt->err = BTERR_map;
1182 // calculate page within pool
1184 BtPage bt_page (BtDb *bt, BtPool *pool, uid page_no)
1186 uint subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping
1189 page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits));
1190 // madvise (page, bt->mgr->page_size, MADV_WILLNEED);
1196 void bt_unpinpool (BtPool *pool)
1199 __sync_fetch_and_add(&pool->pin, -1);
1201 _InterlockedDecrement16 (&pool->pin);
1205 // find or place requested page in segment-pool
1206 // return pool table entry, incrementing pin
1208 BtPool *bt_pinpool(BtDb *bt, uid page_no)
1210 uint slot, hashidx, idx, victim;
1211 BtPool *pool, *node, *next;
1213 // lock hash table chain
1215 hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1216 bt_spinwritelock (&bt->mgr->latch[hashidx]);
1218 // look up in hash table
1220 if( pool = bt_findpool(bt, page_no, hashidx) ) {
1222 __sync_fetch_and_or(&pool->pin, CLOCK_bit);
1223 __sync_fetch_and_add(&pool->pin, 1);
1225 _InterlockedOr16 (&pool->pin, CLOCK_bit);
1226 _InterlockedIncrement16 (&pool->pin);
1228 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1232 // allocate a new pool node
1233 // and add to hash table
1236 slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1);
1238 slot = _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1;
1241 if( ++slot < bt->mgr->poolmax ) {
1242 pool = bt->mgr->pool + slot;
1245 if( bt_mapsegment(bt, pool, page_no) )
1248 bt_linkhash(bt, pool, page_no, hashidx);
1249 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1253 // pool table is full
1254 // find best pool entry to evict
1257 __sync_fetch_and_add(&bt->mgr->poolcnt, -1);
1259 _InterlockedDecrement16 (&bt->mgr->poolcnt);
1264 victim = __sync_fetch_and_add(&bt->mgr->evicted, 1);
1266 victim = _InterlockedIncrement (&bt->mgr->evicted) - 1;
1268 victim %= bt->mgr->poolmax;
1269 pool = bt->mgr->pool + victim;
1270 idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
1275 // try to get write lock
1276 // skip entry if not obtained
1278 if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
1281 // skip this entry if
1283 // or clock bit is set
1287 __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
1289 _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
1291 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1295 // unlink victim pool node from hash table
1297 if( node = pool->hashprev )
1298 node->hashnext = pool->hashnext;
1299 else if( node = pool->hashnext )
1300 bt->mgr->hash[idx] = node->slot;
1302 bt->mgr->hash[idx] = 0;
1304 if( node = pool->hashnext )
1305 node->hashprev = pool->hashprev;
1307 bt_spinreleasewrite (&bt->mgr->latch[idx]);
1309 // remove old file mapping
1311 munmap (pool->map, (uid)(bt->mgr->poolmask+1) << bt->mgr->page_bits);
1313 // FlushViewOfFile(pool->map, 0);
1314 UnmapViewOfFile(pool->map);
1315 CloseHandle(pool->hmap);
1319 // create new pool mapping
1320 // and link into hash table
1322 if( bt_mapsegment(bt, pool, page_no) )
1325 bt_linkhash(bt, pool, page_no, hashidx);
1326 bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
1331 // place write, read, or parent lock on requested page_no.
1333 void bt_lockpage(BtLock mode, BtLatchSet *set)
1337 ReadLock (set->readwr);
1340 WriteLock (set->readwr);
1343 ReadLock (set->access);
1346 WriteLock (set->access);
1349 WriteLock (set->parent);
1354 // remove write, read, or parent lock on requested page
1356 void bt_unlockpage(BtLock mode, BtLatchSet *set)
1360 ReadRelease (set->readwr);
1363 WriteRelease (set->readwr);
1366 ReadRelease (set->access);
1369 WriteRelease (set->access);
1372 WriteRelease (set->parent);
1377 // allocate a new page and write page into it
1379 uid bt_newpage(BtDb *bt, BtPage page)
1385 // lock allocation page
1387 bt_spinwritelock(bt->mgr->latchmgr->lock);
1389 // use empty chain first
1390 // else allocate empty page
1392 if( new_page = bt_getid(bt->mgr->latchmgr->chain) ) {
1393 if( set->pool = bt_pinpool (bt, new_page) )
1394 set->page = bt_page (bt, set->pool, new_page);
1398 bt_putid(bt->mgr->latchmgr->chain, bt_getid(set->page->right));
1399 bt_unpinpool (set->pool);
1401 new_page = bt_getid(bt->mgr->latchmgr->alloc->right);
1402 bt_putid(bt->mgr->latchmgr->alloc->right, new_page+1);
1404 // if writing first page of pool block, set file length thru last page
1406 if( (new_page & bt->mgr->poolmask) == 0 )
1407 ftruncate (bt->mgr->idx, (new_page + bt->mgr->poolmask + 1) << bt->mgr->page_bits);
1411 // unlock allocation latch
1413 bt_spinreleasewrite(bt->mgr->latchmgr->lock);
1416 // bring new page into pool and copy page.
1417 // this will extend the file into the new pages on WIN32.
1419 if( set->pool = bt_pinpool (bt, new_page) )
1420 set->page = bt_page (bt, set->pool, new_page);
1424 memcpy(set->page, page, bt->mgr->page_size);
1425 bt_unpinpool (set->pool);
1428 // unlock allocation latch
1430 bt_spinreleasewrite(bt->mgr->latchmgr->lock);
1435 // find slot in page for given key at a given level
1437 int bt_findslot (BtPageSet *set, unsigned char *key, uint len)
1439 uint diff, higher = set->page->cnt, low = 1, slot;
1442 // make stopper key an infinite fence value
1444 if( bt_getid (set->page->right) )
1449 // low is the lowest candidate.
1450 // loop ends when they meet
1452 // higher is already
1453 // tested as .ge. the passed key.
1455 while( diff = higher - low ) {
1456 slot = low + ( diff >> 1 );
1457 if( keycmp (keyptr(set->page, slot), key, len) < 0 )
1460 higher = slot, good++;
1463 // return zero if key is on right link page
1465 return good ? higher : 0;
1468 // find and load page at given level for given key
1469 // leave page rd or wr locked as requested
1471 int bt_loadpage (BtDb *bt, BtPageSet *set, unsigned char *key, uint len, uint lvl, BtLock lock)
1473 uid page_no = ROOT_page, prevpage = 0;
1474 uint drill = 0xff, slot;
1475 BtLatchSet *prevlatch;
1476 uint mode, prevmode;
1479 // start at root of btree and drill down
1482 // determine lock mode of drill level
1483 mode = (drill == lvl) ? lock : BtLockRead;
1485 set->latch = bt_pinlatch (bt, page_no);
1486 set->page_no = page_no;
1488 // pin page contents
1490 if( set->pool = bt_pinpool (bt, page_no) )
1491 set->page = bt_page (bt, set->pool, page_no);
1495 // obtain access lock using lock chaining with Access mode
1497 if( page_no > ROOT_page )
1498 bt_lockpage(BtLockAccess, set->latch);
1500 // release & unpin parent page
1503 bt_unlockpage(prevmode, prevlatch);
1504 bt_unpinlatch (prevlatch);
1505 bt_unpinpool (prevpool);
1509 // obtain read lock using lock chaining
1511 bt_lockpage(mode, set->latch);
1513 if( set->page->free )
1514 return bt->err = BTERR_struct, 0;
1516 if( page_no > ROOT_page )
1517 bt_unlockpage(BtLockAccess, set->latch);
1519 // re-read and re-lock root after determining actual level of root
1521 if( set->page->lvl != drill) {
1522 if( set->page_no != ROOT_page )
1523 return bt->err = BTERR_struct, 0;
1525 drill = set->page->lvl;
1527 if( lock != BtLockRead && drill == lvl ) {
1528 bt_unlockpage(mode, set->latch);
1529 bt_unpinlatch (set->latch);
1530 bt_unpinpool (set->pool);
1535 prevpage = set->page_no;
1536 prevlatch = set->latch;
1537 prevpool = set->pool;
1540 // find key on page at this level
1541 // and descend to requested level
1543 if( set->page->kill )
1546 if( slot = bt_findslot (set, key, len) ) {
1550 while( slotptr(set->page, slot)->dead )
1551 if( slot++ < set->page->cnt )
1556 page_no = bt_getid(valptr(set->page, slot)->value);
1561 // or slide right into next page
1564 page_no = bt_getid(set->page->right);
1568 // return error on end of right chain
1570 bt->err = BTERR_struct;
1571 return 0; // return error
1574 // return page to free list
1575 // page must be delete & write locked
1577 void bt_freepage (BtDb *bt, BtPageSet *set)
1579 // lock allocation page
1581 bt_spinwritelock (bt->mgr->latchmgr->lock);
1584 memcpy(set->page->right, bt->mgr->latchmgr->chain, BtId);
1585 bt_putid(bt->mgr->latchmgr->chain, set->page_no);
1586 set->page->free = 1;
1588 // unlock released page
1590 bt_unlockpage (BtLockDelete, set->latch);
1591 bt_unlockpage (BtLockWrite, set->latch);
1592 bt_unpinlatch (set->latch);
1593 bt_unpinpool (set->pool);
1595 // unlock allocation page
1597 bt_spinreleasewrite (bt->mgr->latchmgr->lock);
1600 // a fence key was deleted from a page
1601 // push new fence value upwards
1603 BTERR bt_fixfence (BtDb *bt, BtPageSet *set, uint lvl)
1605 unsigned char leftkey[256], rightkey[256];
1606 unsigned char value[BtId];
1609 // remove the old fence value
1611 ptr = keyptr(set->page, set->page->cnt);
1612 memcpy (rightkey, ptr, ptr->len + 1);
1613 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1615 // cache new fence value
1617 ptr = keyptr(set->page, set->page->cnt);
1618 memcpy (leftkey, ptr, ptr->len + 1);
1620 bt_lockpage (BtLockParent, set->latch);
1621 bt_unlockpage (BtLockWrite, set->latch);
1623 // insert new (now smaller) fence key
1625 bt_putid (value, set->page_no);
1627 if( bt_insertkey (bt, leftkey+1, *leftkey, lvl+1, value, BtId, 1) )
1630 // now delete old fence key
1632 if( bt_deletekey (bt, rightkey+1, *rightkey, lvl+1) )
1635 bt_unlockpage (BtLockParent, set->latch);
1636 bt_unpinlatch(set->latch);
1637 bt_unpinpool (set->pool);
1641 // root has a single child
1642 // collapse a level from the tree
1644 BTERR bt_collapseroot (BtDb *bt, BtPageSet *root)
1649 // find the child entry and promote as new root contents
1652 for( idx = 0; idx++ < root->page->cnt; )
1653 if( !slotptr(root->page, idx)->dead )
1656 child->page_no = bt_getid (valptr(root->page, idx)->value);
1658 child->latch = bt_pinlatch (bt, child->page_no);
1659 bt_lockpage (BtLockDelete, child->latch);
1660 bt_lockpage (BtLockWrite, child->latch);
1662 if( child->pool = bt_pinpool (bt, child->page_no) )
1663 child->page = bt_page (bt, child->pool, child->page_no);
1667 memcpy (root->page, child->page, bt->mgr->page_size);
1668 bt_freepage (bt, child);
1670 } while( root->page->lvl > 1 && root->page->act == 1 );
1672 bt_unlockpage (BtLockWrite, root->latch);
1673 bt_unpinlatch (root->latch);
1674 bt_unpinpool (root->pool);
1678 // find and delete key on page by marking delete flag bit
1679 // if page becomes empty, delete it from the btree
1681 BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl)
1683 unsigned char lowerfence[256], higherfence[256];
1684 uint slot, idx, found, fence;
1685 BtPageSet set[1], right[1];
1686 unsigned char value[BtId];
1690 if( slot = bt_loadpage (bt, set, key, len, lvl, BtLockWrite) )
1691 ptr = keyptr(set->page, slot);
1695 // if librarian slot, advance to real slot
1697 if( slotptr(set->page, slot)->type == Librarian )
1698 ptr = keyptr(set->page, ++slot);
1700 fence = slot == set->page->cnt;
1702 // if key is found delete it, otherwise ignore request
1704 if( found = !keycmp (ptr, key, len) )
1705 if( found = slotptr(set->page, slot)->dead == 0 ) {
1706 val = valptr(set->page,slot);
1707 slotptr(set->page, slot)->dead = 1;
1708 set->page->garbage += ptr->len + val->len + 2;
1711 // collapse empty slots beneath our fence
1713 while( idx = set->page->cnt - 1 )
1714 if( slotptr(set->page, idx)->dead ) {
1715 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1716 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1721 // did we delete a fence key in an upper level?
1723 if( found && lvl && fence && set->page->act )
1724 if( bt_fixfence (bt, set, lvl) )
1727 return bt->found = found, 0;
1729 // do we need to collapse root?
1731 if( lvl > 1 && set->page_no == ROOT_page && set->page->act == 1 )
1732 if( bt_collapseroot (bt, set) )
1735 return bt->found = found, 0;
1737 // return if page is not empty
1739 if( set->page->act ) {
1740 bt_unlockpage(BtLockWrite, set->latch);
1741 bt_unpinlatch (set->latch);
1742 bt_unpinpool (set->pool);
1743 return bt->found = found, 0;
1746 // cache copy of fence key
1747 // to post in parent
1749 ptr = keyptr(set->page, set->page->cnt);
1750 memcpy (lowerfence, ptr, ptr->len + 1);
1752 // obtain lock on right page
1754 right->page_no = bt_getid(set->page->right);
1755 right->latch = bt_pinlatch (bt, right->page_no);
1756 bt_lockpage (BtLockWrite, right->latch);
1758 // pin page contents
1760 if( right->pool = bt_pinpool (bt, right->page_no) )
1761 right->page = bt_page (bt, right->pool, right->page_no);
1765 if( right->page->kill )
1766 return bt->err = BTERR_struct;
1768 // pull contents of right peer into our empty page
1770 memcpy (set->page, right->page, bt->mgr->page_size);
1772 // cache copy of key to update
1774 ptr = keyptr(right->page, right->page->cnt);
1775 memcpy (higherfence, ptr, ptr->len + 1);
1777 // mark right page deleted and point it to left page
1778 // until we can post parent updates
1780 bt_putid (right->page->right, set->page_no);
1781 right->page->kill = 1;
1783 bt_lockpage (BtLockParent, right->latch);
1784 bt_unlockpage (BtLockWrite, right->latch);
1786 bt_lockpage (BtLockParent, set->latch);
1787 bt_unlockpage (BtLockWrite, set->latch);
1789 // redirect higher key directly to our new node contents
1791 bt_putid (value, set->page_no);
1793 if( bt_insertkey (bt, higherfence+1, *higherfence, lvl+1, value, BtId, 1) )
1796 // delete old lower key to our node
1798 if( bt_deletekey (bt, lowerfence+1, *lowerfence, lvl+1) )
1801 // obtain delete and write locks to right node
1803 bt_unlockpage (BtLockParent, right->latch);
1804 bt_lockpage (BtLockDelete, right->latch);
1805 bt_lockpage (BtLockWrite, right->latch);
1806 bt_freepage (bt, right);
1808 bt_unlockpage (BtLockParent, set->latch);
1809 bt_unpinlatch (set->latch);
1810 bt_unpinpool (set->pool);
1815 BtKey bt_foundkey (BtDb *bt)
1817 return (BtKey)(bt->key);
1820 // advance to next slot
1822 uint bt_findnext (BtDb *bt, BtPageSet *set, uint slot)
1824 BtLatchSet *prevlatch;
1828 if( slot < set->page->cnt )
1831 prevlatch = set->latch;
1832 prevpool = set->pool;
1834 if( page_no = bt_getid(set->page->right) )
1835 set->latch = bt_pinlatch (bt, page_no);
1837 return bt->err = BTERR_struct, 0;
1839 // pin page contents
1841 if( set->pool = bt_pinpool (bt, page_no) )
1842 set->page = bt_page (bt, set->pool, page_no);
1846 // obtain access lock using lock chaining with Access mode
1848 bt_lockpage(BtLockAccess, set->latch);
1850 bt_unlockpage(BtLockRead, prevlatch);
1851 bt_unpinlatch (prevlatch);
1852 bt_unpinpool (prevpool);
1854 bt_lockpage(BtLockRead, set->latch);
1855 bt_unlockpage(BtLockAccess, set->latch);
1857 set->page_no = page_no;
1861 // find unique key or first duplicate key in
1862 // leaf level and return number of value bytes
1863 // or (-1) if not found. Setup key for bt_foundkey
1865 int bt_findkey (BtDb *bt, unsigned char *key, uint keylen, unsigned char *value, uint valmax)
1873 if( slot = bt_loadpage (bt, set, key, keylen, 0, BtLockRead) )
1875 ptr = keyptr(set->page, slot);
1877 // skip librarian slot place holder
1879 if( slotptr(set->page, slot)->type == Librarian )
1880 ptr = keyptr(set->page, ++slot);
1882 if( slotptr(set->page, slot)->dead )
1885 // return actual key found
1887 memcpy (bt->key, ptr, ptr->len + 1);
1890 if( slotptr(set->page, slot)->type == Duplicate )
1893 // if key exists, return >= 0 value bytes copied
1894 // otherwise return (-1)
1897 if( !memcmp (ptr->key, key, len) ) {
1898 val = valptr (set->page,slot);
1899 if( valmax > val->len )
1901 memcpy (value, val->value, valmax);
1907 } while( slot = bt_findnext (bt, set, slot) );
1909 bt_unlockpage (BtLockRead, set->latch);
1910 bt_unpinlatch (set->latch);
1911 bt_unpinpool (set->pool);
1915 // check page for space available,
1916 // clean if necessary and return
1917 // 0 - page needs splitting
1918 // >0 new slot value
1920 uint bt_cleanpage(BtDb *bt, BtPage page, uint keylen, uint slot, uint vallen)
1922 uint nxt = bt->mgr->page_size;
1923 uint cnt = 0, idx = 0;
1924 uint max = page->cnt;
1929 if( page->min >= (max+2) * sizeof(BtSlot) + sizeof(*page) + keylen + 1 + vallen + 1)
1932 // skip cleanup and proceed to split
1933 // if there's not enough garbage
1936 if( page->garbage < nxt / 5 )
1939 memcpy (bt->frame, page, bt->mgr->page_size);
1941 // skip page info and set rest of page to zero
1943 memset (page+1, 0, bt->mgr->page_size - sizeof(*page));
1947 // clean up page first by
1948 // removing deleted keys
1950 while( cnt++ < max ) {
1953 if( cnt < max && slotptr(bt->frame,cnt)->dead )
1956 // copy the value across
1958 val = valptr(bt->frame, cnt);
1959 nxt -= val->len + 1;
1960 ((unsigned char *)page)[nxt] = val->len;
1961 memcpy ((unsigned char *)page + nxt + 1, val->value, val->len);
1963 // copy the key across
1965 key = keyptr(bt->frame, cnt);
1966 nxt -= key->len + 1;
1967 memcpy ((unsigned char *)page + nxt, key, key->len + 1);
1969 // make a librarian slot
1972 slotptr(page, ++idx)->off = nxt;
1973 slotptr(page, idx)->type = Librarian;
1974 slotptr(page, idx)->dead = 1;
1979 slotptr(page, ++idx)->off = nxt;
1980 slotptr(page, idx)->type = slotptr(bt->frame, cnt)->type;
1982 if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) )
1989 // see if page has enough space now, or does it need splitting?
1991 if( page->min >= (idx+2) * sizeof(BtSlot) + sizeof(*page) + keylen + 1 + vallen + 1 )
1997 // split the root and raise the height of the btree
1999 BTERR bt_splitroot(BtDb *bt, BtPageSet *root, unsigned char *leftkey, uid page_no2)
2001 uint nxt = bt->mgr->page_size;
2002 unsigned char value[BtId];
2005 // Obtain an empty page to use, and copy the current
2006 // root contents into it, e.g. lower keys
2008 if( !(left = bt_newpage(bt, root->page)) )
2011 // preserve the page info at the bottom
2012 // of higher keys and set rest to zero
2014 memset(root->page+1, 0, bt->mgr->page_size - sizeof(*root->page));
2016 // insert lower keys page fence key on newroot page as first key
2019 bt_putid (value, left);
2020 ((unsigned char *)root->page)[nxt] = BtId;
2021 memcpy ((unsigned char *)root->page + nxt + 1, value, BtId);
2023 nxt -= *leftkey + 1;
2024 memcpy ((unsigned char *)root->page + nxt, leftkey, *leftkey + 1);
2025 slotptr(root->page, 1)->off = nxt;
2027 // insert stopper key on newroot page
2028 // and increase the root height
2030 nxt -= 3 + BtId + 1;
2031 ((unsigned char *)root->page)[nxt] = 2;
2032 ((unsigned char *)root->page)[nxt+1] = 0xff;
2033 ((unsigned char *)root->page)[nxt+2] = 0xff;
2035 bt_putid (value, page_no2);
2036 ((unsigned char *)root->page)[nxt+3] = BtId;
2037 memcpy ((unsigned char *)root->page + nxt + 4, value, BtId);
2038 slotptr(root->page, 2)->off = nxt;
2040 bt_putid(root->page->right, 0);
2041 root->page->min = nxt; // reset lowest used offset and key count
2042 root->page->cnt = 2;
2043 root->page->act = 2;
2046 // release and unpin root
2048 bt_unlockpage(BtLockWrite, root->latch);
2049 bt_unpinlatch (root->latch);
2050 bt_unpinpool (root->pool);
2054 // split already locked full node
2057 BTERR bt_splitpage (BtDb *bt, BtPageSet *set)
2059 uint cnt = 0, idx = 0, max, nxt = bt->mgr->page_size;
2060 unsigned char fencekey[256], rightkey[256];
2061 unsigned char value[BtId];
2062 uint lvl = set->page->lvl;
2068 // split higher half of keys to bt->frame
2070 memset (bt->frame, 0, bt->mgr->page_size);
2071 max = set->page->cnt;
2075 while( cnt++ < max ) {
2076 if( slotptr(set->page, cnt)->dead && cnt < max )
2078 val = valptr(set->page, cnt);
2079 nxt -= val->len + 1;
2080 ((unsigned char *)bt->frame)[nxt] = val->len;
2081 memcpy ((unsigned char *)bt->frame + nxt + 1, val->value, val->len);
2083 key = keyptr(set->page, cnt);
2084 nxt -= key->len + 1;
2085 memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1);
2087 // add librarian slot
2090 slotptr(bt->frame, ++idx)->off = nxt;
2091 slotptr(bt->frame, idx)->type = Librarian;
2092 slotptr(bt->frame, idx)->dead = 1;
2097 slotptr(bt->frame, ++idx)->off = nxt;
2098 slotptr(bt->frame, idx)->type = slotptr(set->page, cnt)->type;
2100 if( !(slotptr(bt->frame, idx)->dead = slotptr(set->page, cnt)->dead) )
2104 // remember existing fence key for new page to the right
2106 memcpy (rightkey, key, key->len + 1);
2108 bt->frame->bits = bt->mgr->page_bits;
2109 bt->frame->min = nxt;
2110 bt->frame->cnt = idx;
2111 bt->frame->lvl = lvl;
2115 if( set->page_no > ROOT_page )
2116 memcpy (bt->frame->right, set->page->right, BtId);
2118 // get new free page and write higher keys to it.
2120 if( !(right->page_no = bt_newpage(bt, bt->frame)) )
2123 // update lower keys to continue in old page
2125 memcpy (bt->frame, set->page, bt->mgr->page_size);
2126 memset (set->page+1, 0, bt->mgr->page_size - sizeof(*set->page));
2127 nxt = bt->mgr->page_size;
2128 set->page->garbage = 0;
2134 if( slotptr(bt->frame, max)->type == Librarian )
2137 // assemble page of smaller keys
2139 while( cnt++ < max ) {
2140 if( slotptr(bt->frame, cnt)->dead )
2142 val = valptr(bt->frame, cnt);
2143 nxt -= val->len + 1;
2144 ((unsigned char *)set->page)[nxt] = val->len;
2145 memcpy ((unsigned char *)set->page + nxt + 1, val->value, val->len);
2147 key = keyptr(bt->frame, cnt);
2148 nxt -= key->len + 1;
2149 memcpy ((unsigned char *)set->page + nxt, key, key->len + 1);
2151 // add librarian slot
2154 slotptr(set->page, ++idx)->off = nxt;
2155 slotptr(set->page, idx)->type = Librarian;
2156 slotptr(set->page, idx)->dead = 1;
2161 slotptr(set->page, ++idx)->off = nxt;
2162 slotptr(set->page, idx)->type = slotptr(bt->frame, cnt)->type;
2166 // remember fence key for smaller page
2168 memcpy(fencekey, key, key->len + 1);
2170 bt_putid(set->page->right, right->page_no);
2171 set->page->min = nxt;
2172 set->page->cnt = idx;
2174 // if current page is the root page, split it
2176 if( set->page_no == ROOT_page )
2177 return bt_splitroot (bt, set, fencekey, right->page_no);
2179 // insert new fences in their parent pages
2181 right->latch = bt_pinlatch (bt, right->page_no);
2182 bt_lockpage (BtLockParent, right->latch);
2184 bt_lockpage (BtLockParent, set->latch);
2185 bt_unlockpage (BtLockWrite, set->latch);
2187 // insert new fence for reformulated left block of smaller keys
2189 bt_putid (value, set->page_no);
2191 if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, value, BtId, 1) )
2194 // switch fence for right block of larger keys to new right page
2196 bt_putid (value, right->page_no);
2198 if( bt_insertkey (bt, rightkey+1, *rightkey, lvl+1, value, BtId, 1) )
2201 bt_unlockpage (BtLockParent, set->latch);
2202 bt_unpinlatch (set->latch);
2203 bt_unpinpool (set->pool);
2205 bt_unlockpage (BtLockParent, right->latch);
2206 bt_unpinlatch (right->latch);
2210 // install new key and value onto page
2211 // page must already be checked for
2214 BTERR bt_insertslot (BtDb *bt, BtPageSet *set, uint slot, unsigned char *key,uint keylen, unsigned char *value, uint vallen, uint type)
2216 uint idx, librarian;
2219 // if found slot > desired slot and previous slot
2220 // is a librarian slot, use it
2223 if( slotptr(set->page, slot-1)->type == Librarian )
2226 // copy value onto page
2228 set->page->min -= vallen + 1; // reset lowest used offset
2229 ((unsigned char *)set->page)[set->page->min] = vallen;
2230 memcpy ((unsigned char *)set->page + set->page->min +1, value, vallen );
2232 // copy key onto page
2234 set->page->min -= keylen + 1; // reset lowest used offset
2235 ((unsigned char *)set->page)[set->page->min] = keylen;
2236 memcpy ((unsigned char *)set->page + set->page->min +1, key, keylen );
2238 // find first empty slot
2240 for( idx = slot; idx < set->page->cnt; idx++ )
2241 if( slotptr(set->page, idx)->dead )
2244 // now insert key into array before slot
2246 if( idx == set->page->cnt )
2247 idx += 2, set->page->cnt += 2, librarian = 2;
2253 while( idx > slot + librarian - 1 )
2254 *slotptr(set->page, idx) = *slotptr(set->page, idx - librarian), idx--;
2256 // add librarian slot
2258 if( librarian > 1 ) {
2259 node = slotptr(set->page, slot++);
2260 node->off = set->page->min;
2261 node->type = Librarian;
2267 node = slotptr(set->page, slot);
2268 node->off = set->page->min;
2272 bt_unlockpage (BtLockWrite, set->latch);
2273 bt_unpinlatch (set->latch);
2274 bt_unpinpool (set->pool);
2278 // Insert new key into the btree at given level.
2279 // either add a new key or update/add an existing one
2281 BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint keylen, uint lvl, void *value, uint vallen, uint unique)
2283 unsigned char newkey[256];
2284 uint slot, idx, len;
2291 // set up the key we're working on
2293 memcpy (newkey + 1, key, keylen);
2296 // is this a non-unique index value?
2302 sequence = bt_newdup (bt);
2303 bt_putid (newkey + *newkey + 1, sequence);
2307 while( 1 ) { // find the page and slot for the current key
2308 if( slot = bt_loadpage (bt, set, newkey + 1, *newkey, lvl, BtLockWrite) )
2309 ptr = keyptr(set->page, slot);
2312 bt->err = BTERR_ovflw;
2316 // if librarian slot == found slot, advance to real slot
2318 if( slotptr(set->page, slot)->type == Librarian )
2319 if( !keycmp (ptr, key, keylen) )
2320 ptr = keyptr(set->page, ++slot);
2324 if( slotptr(set->page, slot)->type == Duplicate )
2327 // if inserting a duplicate key or unique key
2328 // check for adequate space on the page
2329 // and insert the new key before slot.
2331 if( unique && (len != *newkey || memcmp (ptr->key, newkey+1, *newkey)) || !unique ) {
2332 if( !(slot = bt_cleanpage (bt, set->page, *newkey, slot, vallen)) )
2333 if( bt_splitpage (bt, set) )
2338 return bt_insertslot (bt, set, slot, newkey + 1, *newkey, value, vallen, type);
2341 // if key already exists, update value and return
2343 if( val = valptr(set->page, slot), val->len >= vallen ) {
2344 if( slotptr(set->page, slot)->dead )
2346 set->page->garbage += val->len - vallen;
2347 slotptr(set->page, slot)->dead = 0;
2349 memcpy (val->value, value, vallen);
2350 bt_unlockpage(BtLockWrite, set->latch);
2351 bt_unpinlatch (set->latch);
2352 bt_unpinpool (set->pool);
2356 // new update value doesn't fit in existing value area
2358 if( !slotptr(set->page, slot)->dead )
2359 set->page->garbage += val->len + ptr->len + 2;
2361 slotptr(set->page, slot)->dead = 0;
2365 if( !(slot = bt_cleanpage (bt, set->page, keylen, slot, vallen)) )
2366 if( bt_splitpage (bt, set) )
2371 set->page->min -= vallen + 1;
2372 ((unsigned char *)set->page)[set->page->min] = vallen;
2373 memcpy ((unsigned char *)set->page + set->page->min +1, value, vallen);
2375 set->page->min -= keylen + 1;
2376 ((unsigned char *)set->page)[set->page->min] = keylen;
2377 memcpy ((unsigned char *)set->page + set->page->min +1, key, keylen);
2379 slotptr(set->page, slot)->off = set->page->min;
2380 bt_unlockpage(BtLockWrite, set->latch);
2381 bt_unpinlatch (set->latch);
2382 bt_unpinpool (set->pool);
2388 // return next slot for cursor page
2389 // or slide cursor right into next page
2391 uint bt_nextkey (BtDb *bt, uint slot)
2397 right = bt_getid(bt->cursor->right);
2399 while( slot++ < bt->cursor->cnt )
2400 if( slotptr(bt->cursor,slot)->dead )
2402 else if( right || (slot < bt->cursor->cnt) ) // skip infinite stopper
2410 bt->cursor_page = right;
2412 if( set->pool = bt_pinpool (bt, right) )
2413 set->page = bt_page (bt, set->pool, right);
2417 set->latch = bt_pinlatch (bt, right);
2418 bt_lockpage(BtLockRead, set->latch);
2420 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2422 bt_unlockpage(BtLockRead, set->latch);
2423 bt_unpinlatch (set->latch);
2424 bt_unpinpool (set->pool);
2432 // cache page of keys into cursor and return starting slot for given key
2434 uint bt_startkey (BtDb *bt, unsigned char *key, uint len)
2439 // cache page for retrieval
2441 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
2442 memcpy (bt->cursor, set->page, bt->mgr->page_size);
2446 bt->cursor_page = set->page_no;
2448 bt_unlockpage(BtLockRead, set->latch);
2449 bt_unpinlatch (set->latch);
2450 bt_unpinpool (set->pool);
2454 BtKey bt_key(BtDb *bt, uint slot)
2456 return keyptr(bt->cursor, slot);
2459 BtVal bt_val(BtDb *bt, uint slot)
2461 return valptr(bt->cursor,slot);
2467 double getCpuTime(int type)
2470 FILETIME xittime[1];
2471 FILETIME systime[1];
2472 FILETIME usrtime[1];
2473 SYSTEMTIME timeconv[1];
2476 memset (timeconv, 0, sizeof(SYSTEMTIME));
2480 GetSystemTimeAsFileTime (xittime);
2481 FileTimeToSystemTime (xittime, timeconv);
2482 ans = (double)timeconv->wDayOfWeek * 3600 * 24;
2485 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2486 FileTimeToSystemTime (usrtime, timeconv);
2489 GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime);
2490 FileTimeToSystemTime (systime, timeconv);
2494 ans += (double)timeconv->wHour * 3600;
2495 ans += (double)timeconv->wMinute * 60;
2496 ans += (double)timeconv->wSecond;
2497 ans += (double)timeconv->wMilliseconds / 1000;
2502 #include <sys/resource.h>
2504 double getCpuTime(int type)
2506 struct rusage used[1];
2507 struct timeval tv[1];
2511 gettimeofday(tv, NULL);
2512 return (double)tv->tv_sec + (double)tv->tv_usec / 1000000;
2515 getrusage(RUSAGE_SELF, used);
2516 return (double)used->ru_utime.tv_sec + (double)used->ru_utime.tv_usec / 1000000;
2519 getrusage(RUSAGE_SELF, used);
2520 return (double)used->ru_stime.tv_sec + (double)used->ru_stime.tv_usec / 1000000;
2527 uint bt_latchaudit (BtDb *bt)
2529 ushort idx, hashidx;
2536 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2538 if( *(ushort *)(bt->mgr->latchmgr->lock) )
2539 fprintf(stderr, "Alloc page locked\n");
2540 *(ushort *)(bt->mgr->latchmgr->lock) = 0;
2542 for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
2543 latch = bt->mgr->latchsets + idx;
2544 if( *latch->readwr->rin & MASK )
2545 fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
2546 memset ((ushort *)latch->readwr, 0, sizeof(RWLock));
2548 if( *latch->access->rin & MASK )
2549 fprintf(stderr, "latchset %d accesslocked for page %.8x\n", idx, latch->page_no);
2550 memset ((ushort *)latch->access, 0, sizeof(RWLock));
2552 if( *latch->parent->rin & MASK )
2553 fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
2554 memset ((ushort *)latch->parent, 0, sizeof(RWLock));
2557 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2562 for( hashidx = 0; hashidx < bt->mgr->latchmgr->latchhash; hashidx++ ) {
2563 if( *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) )
2564 fprintf(stderr, "hash entry %d locked\n", hashidx);
2566 *(ushort *)(bt->mgr->latchmgr->table[hashidx].latch) = 0;
2568 if( idx = bt->mgr->latchmgr->table[hashidx].slot ) do {
2569 latch = bt->mgr->latchsets + idx;
2570 if( *(ushort *)latch->busy )
2571 fprintf(stderr, "latchset %d busylocked for page %.8x\n", idx, latch->page_no);
2572 *(ushort *)latch->busy = 0;
2574 fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
2575 } while( idx = latch->next );
2578 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2579 page_no = LEAF_page;
2581 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2582 off64_t off = page_no << bt->mgr->page_bits;
2584 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2588 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2590 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2591 fprintf(stderr, "page %.8x unable to read\n", page_no);
2593 if( *amt < bt->mgr->page_size )
2594 fprintf(stderr, "page %.8x unable to read\n", page_no);
2596 if( !bt->frame->free ) {
2597 for( idx = 0; idx++ < bt->frame->cnt - 1; ) {
2598 ptr = keyptr(bt->frame, idx+1);
2599 if( keycmp (keyptr(bt->frame, idx), ptr->key, ptr->len) >= 0 )
2600 fprintf(stderr, "page %.8x idx %.2x out of order\n", page_no, idx);
2602 if( !bt->frame->lvl )
2603 cnt += bt->frame->act;
2605 if( page_no > LEAF_page )
2620 // standalone program to index file of keys
2621 // then list them onto std-out
2624 void *index_file (void *arg)
2626 uint __stdcall index_file (void *arg)
2629 int line = 0, found = 0, cnt = 0;
2630 uid next, page_no = LEAF_page; // start on first page of leaves
2631 unsigned char key[256];
2632 ThreadArg *args = arg;
2633 int ch, len = 0, slot;
2639 bt = bt_open (args->mgr);
2641 switch(args->type[0] | 0x20)
2644 fprintf(stderr, "started latch mgr audit\n");
2645 cnt = bt_latchaudit (bt);
2646 fprintf(stderr, "finished latch mgr audit, found %d keys\n", cnt);
2650 fprintf(stderr, "started pennysort for %s\n", args->infile);
2651 if( in = fopen (args->infile, "rb") )
2652 while( ch = getc(in), ch != EOF )
2657 if( bt_insertkey (bt, key, 10, 0, key + 10, len - 10, 0) )
2658 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2661 else if( len < 255 )
2663 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2667 fprintf(stderr, "started indexing for %s\n", args->infile);
2668 if( in = fopen (args->infile, "rb") )
2669 while( ch = getc(in), ch != EOF )
2674 if( args->num == 1 )
2675 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2677 else if( args->num )
2678 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2680 if( bt_insertkey (bt, key, len, 0, NULL, 0, 0) )
2681 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2684 else if( len < 255 )
2686 fprintf(stderr, "finished %s for %d keys\n", args->infile, line);
2690 fprintf(stderr, "started deleting keys for %s\n", args->infile);
2691 if( in = fopen (args->infile, "rb") )
2692 while( ch = getc(in), ch != EOF )
2696 if( args->num == 1 )
2697 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2699 else if( args->num )
2700 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2702 if( bt_findkey (bt, key, len, NULL, 0) < 0 )
2703 fprintf(stderr, "Cannot find key for Line: %d\n", line), exit(0);
2704 ptr = (BtKey)(bt->key);
2706 if( bt_deletekey (bt, ptr->key, ptr->len, 0) )
2707 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2711 else if( len < 255 )
2713 fprintf(stderr, "finished %s for %d keys, %d found\n", args->infile, line, found);
2717 fprintf(stderr, "started finding keys for %s\n", args->infile);
2718 if( in = fopen (args->infile, "rb") )
2719 while( ch = getc(in), ch != EOF )
2723 if( args->num == 1 )
2724 sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9;
2726 else if( args->num )
2727 sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9;
2729 if( bt_findkey (bt, key, len, NULL, 0) == 0 )
2732 fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
2735 else if( len < 255 )
2737 fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found);
2741 fprintf(stderr, "started scanning\n");
2743 if( set->pool = bt_pinpool (bt, page_no) )
2744 set->page = bt_page (bt, set->pool, page_no);
2747 set->latch = bt_pinlatch (bt, page_no);
2748 bt_lockpage (BtLockRead, set->latch);
2749 next = bt_getid (set->page->right);
2751 for( slot = 0; slot++ < set->page->cnt; )
2752 if( next || slot < set->page->cnt )
2753 if( !slotptr(set->page, slot)->dead ) {
2754 ptr = keyptr(set->page, slot);
2757 if( slotptr(set->page, slot)->type == Duplicate )
2760 fwrite (ptr->key, len, 1, stdout);
2761 fputc ('\n', stdout);
2765 bt_unlockpage (BtLockRead, set->latch);
2766 bt_unpinlatch (set->latch);
2767 bt_unpinpool (set->pool);
2768 } while( page_no = next );
2770 fprintf(stderr, " Total keys read %d\n", cnt);
2775 posix_fadvise( bt->mgr->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
2777 fprintf(stderr, "started counting\n");
2778 next = bt->mgr->latchmgr->nlatchpage + LATCH_page;
2779 page_no = LEAF_page;
2781 while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
2782 uid off = page_no << bt->mgr->page_bits;
2784 pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
2788 SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
2790 if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
2791 return bt->err = BTERR_map;
2793 if( *amt < bt->mgr->page_size )
2794 return bt->err = BTERR_map;
2796 if( !bt->frame->free && !bt->frame->lvl )
2797 cnt += bt->frame->act;
2798 if( page_no > LEAF_page )
2803 cnt--; // remove stopper key
2804 fprintf(stderr, " Total keys read %d\n", cnt);
2816 typedef struct timeval timer;
2818 int main (int argc, char **argv)
2820 int idx, cnt, len, slot, err;
2821 int segsize, bits = 16;
2838 fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... ]\n", argv[0]);
2839 fprintf (stderr, " where page_bits is the page size in bits\n");
2840 fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n");
2841 fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n");
2842 fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n");
2843 fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n");
2847 start = getCpuTime(0);
2850 bits = atoi(argv[3]);
2853 poolsize = atoi(argv[4]);
2856 fprintf (stderr, "Warning: no mapped_pool\n");
2858 if( poolsize > 65535 )
2859 fprintf (stderr, "Warning: mapped_pool > 65535 segments\n");
2862 segsize = atoi(argv[5]);
2864 segsize = 4; // 16 pages per mmap segment
2867 num = atoi(argv[6]);
2871 threads = malloc (cnt * sizeof(pthread_t));
2873 threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE));
2875 args = malloc (cnt * sizeof(ThreadArg));
2877 mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8);
2880 fprintf(stderr, "Index Open Error %s\n", argv[1]);
2886 for( idx = 0; idx < cnt; idx++ ) {
2887 args[idx].infile = argv[idx + 7];
2888 args[idx].type = argv[2];
2889 args[idx].mgr = mgr;
2890 args[idx].num = num;
2891 args[idx].idx = idx;
2893 if( err = pthread_create (threads + idx, NULL, index_file, args + idx) )
2894 fprintf(stderr, "Error creating thread %d\n", err);
2896 threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL);
2900 // wait for termination
2903 for( idx = 0; idx < cnt; idx++ )
2904 pthread_join (threads[idx], NULL);
2906 WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
2908 for( idx = 0; idx < cnt; idx++ )
2909 CloseHandle(threads[idx]);
2912 elapsed = getCpuTime(0) - start;
2913 fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2914 elapsed = getCpuTime(1);
2915 fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
2916 elapsed = getCpuTime(2);
2917 fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);