4 // author: karl malbrain, malbrain@cal.berkeley.edu
7 This work, including the source code, documentation
8 and related data, is placed into the public domain.
10 The orginal author is Karl Malbrain.
12 THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
13 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
14 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
15 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
16 RESULTING FROM THE USE, MODIFICATION, OR
17 REDISTRIBUTION OF THIS SOFTWARE.
20 // Please see the project home page for documentation
21 // code.google.com/p/high-concurrency-btree
23 #define _FILE_OFFSET_BITS 64
24 #define _LARGEFILE64_SOURCE
40 #define WIN32_LEAN_AND_MEAN
51 typedef unsigned long long uid;
54 typedef unsigned long long off64_t;
55 typedef unsigned short ushort;
56 typedef unsigned int uint;
59 #define BT_ro 0x6f72 // ro
60 #define BT_rw 0x7772 // rw
61 #define BT_fl 0x6c66 // fl
63 #define BT_maxbits 24 // maximum page size in bits
64 #define BT_minbits 9 // minimum page size in bits
65 #define BT_minpage (1 << BT_minbits) // minimum page size
68 There are five lock types for each node in three independent sets:
69 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
70 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent.
71 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock.
72 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks.
73 5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification.
84 // Define the length of the page and key pointers
88 // Page key slot definition.
90 // If BT_maxbits is 15 or less, you can save 2 bytes
91 // for each key stored by making the first two uints
92 // into ushorts. You can also save 4 bytes by removing
93 // the tod field from the key.
95 // Keys are marked dead, but remain on the page until
99 uint off:BT_maxbits; // page offset for key start
100 uint dead:1; // set for deleted key
101 uint tod; // time-stamp for key
102 unsigned char id[BtId]; // id associated with key
105 // The key structure occupies space at the upper end of
106 // each page. It's a length byte followed by the value
111 unsigned char key[0];
114 // The first part of an index page.
115 // It is immediately followed
116 // by the BtSlot array of keys.
118 typedef struct BtPage_ {
119 uint cnt; // count of keys in page
120 uint act; // count of active keys
121 uint min; // next key offset
122 unsigned char bits; // page size in bits
123 unsigned char lvl:6; // level of page
124 unsigned char dirty:1; // page is dirty
125 unsigned char posted:1; // page fence is posted
126 unsigned char right[BtId]; // page number to right
127 unsigned char fence[256]; // page fence key
130 // The loadpage interface object
137 // The memory mapping hash table entry
140 BtPage page; // mapped page pointer
141 uid page_no; // mapped page number
142 void *lruprev; // least recently used previous cache block
143 void *lrunext; // lru next cache block
144 void *hashprev; // previous cache block for the same hash idx
145 void *hashnext; // next cache block for the same hash idx
151 // The object structure for Btree access
153 typedef struct _BtDb {
154 uint page_size; // each page size
155 uint page_bits; // each page size in bits
156 uint seg_bits; // segment size in pages in bits
157 uid page_no; // current page number
158 uid cursor_page; // current cursor page number
160 uint mode; // read-write mode
161 uint mapped_io; // use memory mapping
162 BtPage temp; // temporary frame buffer (memory mapped/file IO)
163 BtPage parent; // current page's parent node (memory mapped/file IO)
164 BtPage alloc; // frame for alloc page (memory mapped/file IO)
165 BtPage cursor; // cached frame for start/next (never mapped)
166 BtPage frame; // spare frame for the page split (never mapped)
167 BtPage zero; // zeroes frame buffer (never mapped)
168 BtPage page; // temporary page (memory mapped/file IO)
174 unsigned char *mem; // frame, cursor, page memory buffer
175 int nodecnt; // highest page cache segment in use
176 int nodemax; // highest page cache segment allocated
177 int hashmask; // number of pages in segments - 1
178 int hashsize; // size of hash table
179 int posted; // last loadpage found posted key
180 int found; // last insert/delete found key
181 BtHash *lrufirst; // lru list head
182 BtHash *lrulast; // lru list tail
183 ushort *cache; // hash table for cached segments
184 BtHash nodes[1]; // segment cache follows
198 extern void bt_close (BtDb *bt);
199 extern BtDb *bt_open (char *name, uint mode, uint bits, uint cacheblk, uint pgblk);
200 extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, uid id, uint tod);
201 extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len);
202 extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len);
203 extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len);
204 extern uint bt_nextkey (BtDb *bt, uint slot);
206 // Internal functions
208 BTERR bt_removepage (BtDb *bt, uid page_no, uint lvl, unsigned char *pagefence);
210 // Helper functions to return slot values
212 extern BtKey bt_key (BtDb *bt, uint slot);
213 extern uid bt_uid (BtDb *bt, uint slot);
214 extern uint bt_tod (BtDb *bt, uint slot);
216 // BTree page number constants
220 // Number of levels to create in a new BTree
224 // The page is allocated from low and hi ends.
225 // The key offsets and row-id's are allocated
226 // from the bottom, while the text of the key
227 // is allocated from the top. When the two
228 // areas meet, the page is split into two.
230 // A key consists of a length byte, two bytes of
231 // index number (0 - 65534), and up to 253 bytes
232 // of key value. Duplicate keys are discarded.
233 // Associated with each key is a 48 bit row-id.
235 // The b-tree root is always located at page 1.
236 // The first leaf page of level zero is always
237 // located on page 2.
239 // The b-tree pages are linked with right
240 // pointers to facilitate enumerators,
241 // and provide for concurrency.
243 // When to root page fills, it is split in two and
244 // the tree height is raised by a new root at page
245 // one with two keys.
247 // Deleted keys are marked with a dead bit until
250 // Groups of pages from the btree are optionally
251 // cached with memory mapping. A hash table is used to keep
252 // track of the cached pages. This behaviour is controlled
253 // by the number of cache blocks parameter and pages per block
256 // To achieve maximum concurrency one page is locked at a time
257 // as the tree is traversed to find leaf key in question. The right
258 // page numbers are used in cases where the page is being split,
261 // Page 0 is dedicated to lock for new page extensions,
262 // and chains empty pages together for reuse.
264 // Parent locks are obtained to prevent resplitting or deleting a node
265 // before its fence is posted into its upper level.
267 // Empty nodes are chained together through the ALLOC page and reused.
269 // A special open mode of BT_fl is provided to safely access files on
270 // WIN32 networks. WIN32 network operations should not use memory mapping.
271 // This WIN32 mode sets FILE_FLAG_NOBUFFERING and FILE_FLAG_WRITETHROUGH
272 // to prevent local caching of network file contents.
274 // Access macros to address slot and key values from the page.
275 // Page slots use 1 based indexing.
277 #define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1))
278 #define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off))
280 void bt_putid(unsigned char *dest, uid id)
285 dest[i] = (unsigned char)id, id >>= 8;
288 uid bt_getid(unsigned char *src)
293 for( i = 0; i < BtId; i++ )
294 id <<= 8, id |= *src++;
299 // place write, read, or parent lock on requested page_no.
301 BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode)
303 off64_t off = page_no << bt->page_bits;
305 int flag = PROT_READ | ( bt->mode == BT_ro ? 0 : PROT_WRITE );
306 struct flock lock[1];
312 if( mode == BtLockRead || mode == BtLockWrite )
313 off += 1 * sizeof(*bt->page); // use second segment
315 if( mode == BtLockParent )
316 off += 2 * sizeof(*bt->page); // use third segment
319 memset (lock, 0, sizeof(lock));
322 lock->l_type = (mode == BtLockDelete || mode == BtLockWrite || mode == BtLockParent) ? F_WRLCK : F_RDLCK;
323 lock->l_len = sizeof(*bt->page);
326 if( fcntl (bt->idx, F_SETLKW, lock) < 0 )
327 return bt->err = BTERR_lock;
331 memset (ovl, 0, sizeof(ovl));
332 ovl->OffsetHigh = (uint)(off >> 32);
333 ovl->Offset = (uint)off;
334 len = sizeof(*bt->page);
336 // use large offsets to
337 // simulate advisory locking
339 ovl->OffsetHigh |= 0x80000000;
341 if( mode == BtLockDelete || mode == BtLockWrite || mode == BtLockParent )
342 flags |= LOCKFILE_EXCLUSIVE_LOCK;
344 if( LockFileEx (bt->idx, flags, 0, len, 0L, ovl) )
347 return bt->err = BTERR_lock;
351 // remove write, read, or parent lock on requested page_no.
353 BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode)
355 off64_t off = page_no << bt->page_bits;
357 struct flock lock[1];
363 if( mode == BtLockRead || mode == BtLockWrite )
364 off += 1 * sizeof(*bt->page); // use second segment
366 if( mode == BtLockParent )
367 off += 2 * sizeof(*bt->page); // use third segment
370 memset (lock, 0, sizeof(lock));
373 lock->l_type = F_UNLCK;
374 lock->l_len = sizeof(*bt->page);
377 if( fcntl (bt->idx, F_SETLK, lock) < 0 )
378 return bt->err = BTERR_lock;
380 memset (ovl, 0, sizeof(ovl));
381 ovl->OffsetHigh = (uint)(off >> 32);
382 ovl->Offset = (uint)off;
383 len = sizeof(*bt->page);
385 // use large offsets to
386 // simulate advisory locking
388 ovl->OffsetHigh |= 0x80000000;
390 if( !UnlockFileEx (bt->idx, 0, len, 0, ovl) )
391 return GetLastError(), bt->err = BTERR_lock;
397 // close and release memory
399 void bt_close (BtDb *bt)
403 // release mapped pages
405 if( hash = bt->lrufirst )
406 do munmap (hash->page, (bt->hashmask+1) << bt->page_bits);
407 while(hash = hash->lrunext);
415 if( hash = bt->lrufirst )
418 FlushViewOfFile(hash->page, 0);
419 UnmapViewOfFile(hash->page);
420 CloseHandle(hash->hmap);
421 } while(hash = hash->lrunext);
424 VirtualFree (bt->mem, 0, MEM_RELEASE);
425 FlushFileBuffers(bt->idx);
426 CloseHandle(bt->idx);
427 GlobalFree (bt->cache);
432 // open/create new btree
433 // call with file_name, BT_openmode, bits in page size (e.g. 16),
434 // size of mapped page cache (e.g. 8192) or zero for no mapping.
436 BtDb *bt_open (char *name, uint mode, uint bits, uint nodemax, uint pgblk)
438 uint lvl, attr, cacheblk, last;
439 BtLock lockmode = BtLockWrite;
446 SYSTEM_INFO sysinfo[1];
450 bt = malloc (sizeof(BtDb) + nodemax * sizeof(BtHash));
451 memset (bt, 0, sizeof(BtDb));
453 switch (mode & 0x7fff)
457 bt->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
462 bt->idx = open ((char*)name, O_RDONLY);
463 lockmode = BtLockRead;
467 return free(bt), NULL;
470 cacheblk = 4096; // page size for unix
475 bt = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtDb) + nodemax * sizeof(BtHash));
476 attr = FILE_ATTRIBUTE_NORMAL;
477 switch (mode & 0x7fff)
480 attr |= FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING;
483 bt->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL);
488 bt->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL);
489 lockmode = BtLockRead;
492 if( bt->idx == INVALID_HANDLE_VALUE )
493 return GlobalFree(bt), NULL;
495 // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity
496 GetSystemInfo(sysinfo);
499 cacheblk = sysinfo->dwAllocationGranularity;
504 // determine sanity of page size
506 if( bits > BT_maxbits )
508 else if( bits < BT_minbits )
511 if ( bt_lockpage(bt, ALLOC_page, lockmode) )
512 return bt_close (bt), NULL;
517 // read minimum page size to get root info
519 if( size = lseek (bt->idx, 0L, 2) ) {
520 alloc = malloc (BT_minpage);
521 pread(bt->idx, alloc, BT_minpage, 0);
524 } else if( mode == BT_ro )
525 return bt_close (bt), NULL;
527 size = GetFileSize(bt->idx, amt);
530 alloc = VirtualAlloc(NULL, BT_minpage, MEM_COMMIT, PAGE_READWRITE);
531 if( !ReadFile(bt->idx, (char *)alloc, BT_minpage, amt, NULL) )
532 return bt_close (bt), NULL;
534 VirtualFree (alloc, 0, MEM_RELEASE);
535 } else if( mode == BT_ro )
536 return bt_close (bt), NULL;
539 bt->page_size = 1 << bits;
540 bt->page_bits = bits;
542 bt->nodemax = nodemax;
545 // setup cache mapping
548 if( cacheblk < bt->page_size )
549 cacheblk = bt->page_size;
551 bt->hashsize = nodemax / 8;
552 bt->hashmask = (cacheblk >> bits) - 1;
556 // requested number of pages per memmap segment
559 if( (1 << pgblk) > bt->hashmask )
560 bt->hashmask = (1 << pgblk) - 1;
564 while( (1 << bt->seg_bits) <= bt->hashmask )
568 bt->mem = malloc (7 *bt->page_size);
569 bt->cache = calloc (bt->hashsize, sizeof(ushort));
571 bt->mem = VirtualAlloc(NULL, 7 * bt->page_size, MEM_COMMIT, PAGE_READWRITE);
572 bt->cache = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, bt->hashsize * sizeof(ushort));
574 bt->frame = (BtPage)bt->mem;
575 bt->cursor = (BtPage)(bt->mem + bt->page_size);
576 bt->page = (BtPage)(bt->mem + 2 * bt->page_size);
577 bt->alloc = (BtPage)(bt->mem + 3 * bt->page_size);
578 bt->temp = (BtPage)(bt->mem + 4 * bt->page_size);
579 bt->parent = (BtPage)(bt->mem + 5 * bt->page_size);
580 bt->zero = (BtPage)(bt->mem + 6 * bt->page_size);
583 if ( bt_unlockpage(bt, ALLOC_page, lockmode) )
584 return bt_close (bt), NULL;
589 // initializes an empty b-tree with root page and page of leaves
591 memset (bt->alloc, 0, bt->page_size);
592 bt_putid(bt->alloc->right, MIN_lvl+1);
593 bt->alloc->bits = bt->page_bits;
596 if( write (bt->idx, bt->alloc, bt->page_size) < bt->page_size )
597 return bt_close (bt), NULL;
599 if( !WriteFile (bt->idx, (char *)bt->alloc, bt->page_size, amt, NULL) )
600 return bt_close (bt), NULL;
602 if( *amt < bt->page_size )
603 return bt_close (bt), NULL;
606 memset (bt->frame, 0, bt->page_size);
607 bt->frame->bits = bt->page_bits;
608 bt->frame->posted = 1;
610 for( lvl=MIN_lvl; lvl--; ) {
611 slotptr(bt->frame, 1)->off = offsetof(struct BtPage_, fence);
612 bt_putid(slotptr(bt->frame, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0); // next(lower) page number
613 bt->frame->fence[0] = 2;
614 bt->frame->fence[1] = 0xff;
615 bt->frame->fence[2] = 0xff;
616 bt->frame->min = bt->page_size;
617 bt->frame->lvl = lvl;
621 if( write (bt->idx, bt->frame, bt->page_size) < bt->page_size )
622 return bt_close (bt), NULL;
624 if( !WriteFile (bt->idx, (char *)bt->frame, bt->page_size, amt, NULL) )
625 return bt_close (bt), NULL;
627 if( *amt < bt->page_size )
628 return bt_close (bt), NULL;
632 // create empty page area by writing last page of first
633 // cache area (other pages are zeroed by O/S)
635 if( bt->mapped_io && bt->hashmask ) {
636 memset(bt->frame, 0, bt->page_size);
639 while( last < MIN_lvl + 1 )
640 last += bt->hashmask + 1;
642 pwrite(bt->idx, bt->frame, bt->page_size, last << bt->page_bits);
644 SetFilePointer (bt->idx, last << bt->page_bits, NULL, FILE_BEGIN);
645 if( !WriteFile (bt->idx, (char *)bt->frame, bt->page_size, amt, NULL) )
646 return bt_close (bt), NULL;
647 if( *amt < bt->page_size )
648 return bt_close (bt), NULL;
652 if( bt_unlockpage(bt, ALLOC_page, lockmode) )
653 return bt_close (bt), NULL;
658 // compare two keys, returning > 0, = 0, or < 0
659 // as the comparison value
661 int keycmp (BtKey key1, unsigned char *key2, uint len2)
663 uint len1 = key1->len;
666 if( ans = memcmp (key1->key, key2, len1 > len2 ? len2 : len1) )
677 // Update current page of btree by writing file contents
678 // or flushing mapped area to disk.
680 BTERR bt_update (BtDb *bt, BtPage page, uid page_no)
682 off64_t off = page_no << bt->page_bits;
685 if ( !bt->mapped_io )
686 if ( pwrite(bt->idx, page, bt->page_size, off) != bt->page_size )
687 return bt->err = BTERR_wrt;
690 if ( !bt->mapped_io )
692 SetFilePointer (bt->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
693 if( !WriteFile (bt->idx, (char *)page, bt->page_size, amt, NULL) )
694 return GetLastError(), bt->err = BTERR_wrt;
696 if( *amt < bt->page_size )
697 return GetLastError(), bt->err = BTERR_wrt;
699 else if ( bt->mode == BT_fl ) {
700 FlushViewOfFile(page, bt->page_size);
701 FlushFileBuffers(bt->idx);
707 // find page in cache
709 BtHash *bt_findhash(BtDb *bt, uid page_no)
714 // compute cache block first page and hash idx
716 page_no &= ~bt->hashmask;
717 idx = (uint)(page_no >> bt->seg_bits) % bt->hashsize;
720 hash = bt->nodes + bt->cache[idx];
724 do if( hash->page_no == page_no )
726 while(hash = hash->hashnext );
731 // add page cache entry to hash index
733 void bt_linkhash(BtDb *bt, BtHash *node, uid page_no)
735 uint idx = (uint)(page_no >> bt->seg_bits) % bt->hashsize;
738 if( bt->cache[idx] ) {
739 node->hashnext = hash = bt->nodes + bt->cache[idx];
740 hash->hashprev = node;
743 node->hashprev = NULL;
744 bt->cache[idx] = (ushort)(node - bt->nodes);
747 // remove cache entry from hash table
749 void bt_unlinkhash(BtDb *bt, BtHash *node)
751 uint idx = (uint)(node->page_no >> bt->seg_bits) % bt->hashsize;
755 if( hash = node->hashprev )
756 hash->hashnext = node->hashnext;
757 else if( hash = node->hashnext )
758 bt->cache[idx] = (ushort)(hash - bt->nodes);
762 if( hash = node->hashnext )
763 hash->hashprev = node->hashprev;
766 // add cache page to lru chain and map pages
768 BtPage bt_linklru(BtDb *bt, BtHash *hash, uid page_no)
771 off64_t off = (page_no & ~bt->hashmask) << bt->page_bits;
772 off64_t limit = off + ((bt->hashmask+1) << bt->page_bits);
775 memset(hash, 0, sizeof(BtHash));
776 hash->page_no = (page_no & ~bt->hashmask);
777 bt_linkhash(bt, hash, page_no);
779 if( node = hash->lrunext = bt->lrufirst )
780 node->lruprev = hash;
787 flag = PROT_READ | ( bt->mode == BT_ro ? 0 : PROT_WRITE );
788 hash->page = (BtPage)mmap (0, (bt->hashmask+1) << bt->page_bits, flag, MAP_SHARED, bt->idx, off);
789 if( hash->page == MAP_FAILED )
790 return bt->err = BTERR_map, (BtPage)NULL;
793 flag = ( bt->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE );
794 hash->hmap = CreateFileMapping(bt->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL);
796 return bt->err = BTERR_map, NULL;
798 flag = ( bt->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE );
799 hash->page = MapViewOfFile(hash->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->hashmask+1) << bt->page_bits);
801 return bt->err = BTERR_map, NULL;
804 return (BtPage)((char*)hash->page + ((uint)(page_no & bt->hashmask) << bt->page_bits));
807 // find or place requested page in page-cache
808 // return memory address where page is located.
810 BtPage bt_hashpage(BtDb *bt, uid page_no)
812 BtHash *hash, *node, *next;
815 // find page in cache and move to top of lru list
817 if( hash = bt_findhash(bt, page_no) ) {
818 page = (BtPage)((char*)hash->page + ((uint)(page_no & bt->hashmask) << bt->page_bits));
819 // swap node in lru list
820 if( node = hash->lruprev ) {
821 if( next = node->lrunext = hash->lrunext )
822 next->lruprev = node;
826 if( next = hash->lrunext = bt->lrufirst )
827 next->lruprev = hash;
829 return bt->err = BTERR_hash, (BtPage)NULL;
831 hash->lruprev = NULL;
837 // map pages and add to cache entry
839 if( bt->nodecnt < bt->nodemax ) {
840 hash = bt->nodes + ++bt->nodecnt;
841 return bt_linklru(bt, hash, page_no);
844 // hash table is already full, replace last lru entry from the cache
846 if( hash = bt->lrulast ) {
847 // unlink from lru list
848 if( node = bt->lrulast = hash->lruprev )
849 node->lrunext = NULL;
851 return bt->err = BTERR_hash, (BtPage)NULL;
854 munmap (hash->page, (bt->hashmask+1) << bt->page_bits);
856 FlushViewOfFile(hash->page, 0);
857 UnmapViewOfFile(hash->page);
858 CloseHandle(hash->hmap);
860 // unlink from hash table
862 bt_unlinkhash(bt, hash);
864 // map and add to cache
866 return bt_linklru(bt, hash, page_no);
869 return bt->err = BTERR_hash, (BtPage)NULL;
872 // map a btree page onto current page
874 BTERR bt_mappage (BtDb *bt, BtPage *page, uid page_no)
876 off64_t off = page_no << bt->page_bits;
881 if( bt->mapped_io ) {
883 *page = bt_hashpage(bt, page_no);
887 if ( pread(bt->idx, *page, bt->page_size, off) < bt->page_size )
888 return bt->err = BTERR_map;
890 SetFilePointer (bt->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
892 if( !ReadFile(bt->idx, *page, bt->page_size, amt, NULL) )
893 return bt->err = BTERR_map;
895 if( *amt < bt->page_size )
896 return bt->err = BTERR_map;
901 // allocate a new page and write page into it
903 uid bt_newpage(BtDb *bt, BtPage page)
911 if ( bt_lockpage(bt, ALLOC_page, BtLockWrite) )
914 if( bt_mappage (bt, &bt->alloc, ALLOC_page) )
917 // use empty chain first
918 // else allocate empty page
920 if( new_page = bt_getid(bt->alloc[1].right) ) {
921 if( bt_mappage (bt, &bt->temp, new_page) )
922 return 0; // don't unlock on error
923 memcpy(bt->alloc[1].right, bt->temp->right, BtId);
926 new_page = bt_getid(bt->alloc->right);
927 bt_putid(bt->alloc->right, new_page+1);
931 if( bt_update(bt, bt->alloc, ALLOC_page) )
932 return 0; // don't unlock on error
934 if( !bt->mapped_io ) {
935 if( bt_update(bt, page, new_page) )
936 return 0; //don't unlock on error
940 if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) )
947 if ( pwrite(bt->idx, page, bt->page_size, new_page << bt->page_bits) < bt->page_size )
948 return bt->err = BTERR_wrt, 0;
950 // if writing first page of hash block, zero last page in the block
952 if ( !reuse && bt->hashmask > 0 && (new_page & bt->hashmask) == 0 )
954 // use temp buffer to write zeros
955 memset(bt->zero, 0, bt->page_size);
956 if ( pwrite(bt->idx,bt->zero, bt->page_size, (new_page | bt->hashmask) << bt->page_bits) < bt->page_size )
957 return bt->err = BTERR_wrt, 0;
960 // bring new page into page-cache and copy page.
961 // this will extend the file into the new pages.
963 if( !(pmap = (char*)bt_hashpage(bt, new_page & ~bt->hashmask)) )
966 memcpy(pmap+((new_page & bt->hashmask) << bt->page_bits), page, bt->page_size);
971 if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) )
977 // find slot in page for given key at a given level
978 // return 0 if beyond fence value
980 int bt_findslot (BtPageSet *set, unsigned char *key, uint len)
982 uint diff, higher = set->page->cnt, low = 1, slot;
984 // is page being deleted? if so,
985 // tell caller to follow right link
987 if( !set->page->act )
990 // make stopper key an infinite fence value
992 if( bt_getid (set->page->right) )
995 // low is the next candidate, higher is already
996 // tested as .ge. the given key, loop ends when they meet
998 while( diff = higher - low ) {
999 slot = low + ( diff >> 1 );
1000 if( keycmp (keyptr(set->page, slot), key, len) < 0 )
1006 if( higher <= set->page->cnt )
1009 // if leaf page, compare against fence value
1011 // return zero if key is on right link page
1012 // or return slot beyond last key
1014 if( set->page->lvl || keycmp ((BtKey)set->page->fence, key, len) < 0 )
1020 // find and load page at given level for given key
1021 // leave page rd or wr locked as requested
1023 int bt_loadpage (BtDb *bt, BtPageSet *set, unsigned char *key, uint len, uint lvl, uint lock)
1025 uid page_no = ROOT_page, prevpage = 0;
1026 uint drill = 0xff, slot;
1027 uint mode, prevmode;
1030 // start at root of btree and drill down
1033 // determine lock mode of drill level
1034 mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead;
1036 set->page_no = page_no;
1038 // obtain access lock using lock chaining
1040 if( page_no > ROOT_page )
1041 if( bt_lockpage(bt, page_no, BtLockAccess) )
1045 if( bt_unlockpage(bt, prevpage, prevmode) )
1048 // obtain read lock using lock chaining
1050 if( bt_lockpage(bt, page_no, mode) )
1053 if( page_no > ROOT_page )
1054 if( bt_unlockpage(bt, page_no, BtLockAccess) )
1057 // map/obtain page contents
1059 if( bt_mappage (bt, &set->page, page_no) )
1062 // re-read and re-lock root after determining actual level of root
1064 if( set->page->lvl != drill) {
1065 if ( page_no != ROOT_page )
1066 return bt->err = BTERR_struct, 0;
1068 drill = set->page->lvl;
1070 if( lock == BtLockWrite && drill == lvl )
1071 if( bt_unlockpage(bt, page_no, mode) )
1080 // find key on page at this level
1081 // and descend to requested level
1083 if( slot = bt_findslot (set, key, len) ) {
1085 return bt->posted = posted, slot;
1087 if( slot > set->page->cnt )
1088 return bt->err = BTERR_struct, 0;
1090 // if drilling down, find next active key
1092 while( slotptr(set->page, slot)->dead )
1093 if( slot++ < set->page->cnt )
1096 return bt->err = BTERR_struct, 0;
1098 page_no = bt_getid(slotptr(set->page, slot)->id);
1104 // or slide right into next page
1105 // (slide left from deleted page)
1107 page_no = bt_getid(set->page->right);
1112 // return error on end of right chain
1114 bt->err = BTERR_struct;
1115 return 0; // return error
1118 // drill down fixing fence values for left sibling tree
1119 // starting with current left page in bt->temp
1120 // call with bt->temp write locked
1121 // return with all pages unlocked.
1123 BTERR bt_fixfences (BtDb *bt, uid page_no, unsigned char *newfence)
1125 unsigned char oldfence[256];
1129 memcpy (oldfence, bt->temp->fence, 256);
1131 // start at left page of btree and drill down
1132 // to update their fence values
1135 // obtain access lock using lock chaining
1138 if( bt_unlockpage(bt, prevpage, BtLockWrite) )
1140 if( bt_unlockpage(bt, prevpage, BtLockParent) )
1143 // obtain parent/fence key maintenance lock
1145 if( bt_lockpage(bt, page_no, BtLockParent) )
1148 if( bt_lockpage(bt, page_no, BtLockAccess) )
1151 if( bt_unlockpage(bt, prevpage, BtLockWrite) )
1154 // obtain write lock using lock chaining
1156 if( bt_lockpage(bt, page_no, BtLockWrite) )
1159 if( bt_unlockpage(bt, page_no, BtLockAccess) )
1162 // map/obtain page contents
1164 if( bt_mappage (bt, &bt->temp, page_no) )
1168 chk = keycmp ((BtKey)bt->temp->fence, oldfence + 1, *oldfence);
1172 page_no = bt_getid (bt->temp->right);
1177 return bt->err = BTERR_struct;
1179 memcpy (bt->temp->fence, newfence, 256);
1181 if( bt_update (bt, bt->temp, page_no) )
1184 // return when we reach a leaf page
1186 if( !bt->temp->lvl ) {
1187 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1189 return bt_unlockpage (bt, page_no, BtLockParent);
1192 page_no = bt_getid(slotptr(bt->temp, bt->temp->cnt)->id);
1196 // return error on end of right chain
1198 return bt->err = BTERR_struct;
1201 // return page to free list
1202 // page must be delete & write locked
1204 BTERR bt_freepage (BtDb *bt, BtPage page, uid page_no)
1206 // lock & map allocation page
1208 if( bt_lockpage (bt, ALLOC_page, BtLockWrite) )
1211 if( bt_mappage (bt, &bt->alloc, ALLOC_page) )
1214 // store chain in second right
1215 bt_putid(page->right, bt_getid(bt->alloc[1].right));
1216 bt_putid(bt->alloc[1].right, page_no);
1218 if( bt_update(bt, bt->alloc, ALLOC_page) )
1220 if( bt_update(bt, page, page_no) )
1225 if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) )
1228 // remove write lock on deleted node
1230 if( bt_unlockpage(bt, page_no, BtLockWrite) )
1233 return bt_unlockpage (bt, page_no, BtLockDelete);
1236 // remove the root level by promoting its only child
1238 BTERR bt_removeroot (BtDb *bt, BtPage root, BtPage child, uid page_no)
1244 if( bt_lockpage (bt, next, BtLockDelete) )
1246 if( bt_lockpage (bt, next, BtLockWrite) )
1249 if( bt_mappage (bt, &child, next) )
1255 memcpy (root, child, bt->page_size);
1256 next = bt_getid (slotptr(child, child->cnt)->id);
1258 if( bt_freepage (bt, child, page_no) )
1260 } while( root->lvl > 1 && root->cnt == 1 );
1262 if( bt_update (bt, root, ROOT_page) )
1265 return bt_unlockpage (bt, ROOT_page, BtLockWrite);
1268 // pull right page over ourselves in simple merge
1270 BTERR bt_mergeright (BtDb *bt, uid page_no, BtPageSet *parent, uint slot)
1275 // find our right neighbor
1276 // right must exist because the stopper prevents
1277 // the rightmost page from deleting
1279 for( idx = slot; idx++ < parent->page->cnt; )
1280 if( !slotptr(parent->page, idx)->dead )
1283 right = bt_getid (slotptr (parent->page, idx)->id);
1285 if( right != bt_getid (bt->page->right) )
1286 return bt->err = BTERR_struct;
1288 if( bt_lockpage (bt, right, BtLockDelete) )
1291 if( bt_lockpage (bt, right, BtLockWrite) )
1294 if( bt_mappage (bt, &bt->temp, right) )
1297 memcpy (bt->page, bt->temp, bt->page_size);
1299 if( bt_update(bt, bt->page, page_no) )
1302 // install ourselves as child page
1303 // and delete ourselves from parent
1305 bt_putid (slotptr(parent->page, idx)->id, page_no);
1306 slotptr(parent->page, slot)->dead = 1;
1307 parent->page->act--;
1309 // collapse any empty slots
1311 while( idx = parent->page->cnt - 1 )
1312 if( slotptr(parent->page, idx)->dead ) {
1313 *slotptr(parent->page, idx) = *slotptr(parent->page, idx + 1);
1314 memset (slotptr(parent->page, parent->page->cnt--), 0, sizeof(BtSlot));
1318 if( bt_freepage (bt, bt->temp, right) )
1321 // do we need to remove a btree level?
1322 // (leave the first page of leaves alone)
1324 if( parent->page_no == ROOT_page && parent->page->cnt == 1 )
1326 return bt_removeroot (bt, parent->page, bt->page, page_no);
1328 if( bt_update (bt, parent->page, parent->page_no) )
1331 if( bt_unlockpage (bt, parent->page_no, BtLockWrite) )
1334 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1337 if( bt_unlockpage (bt, page_no, BtLockDelete) )
1343 // remove both child and parent from the btree
1344 // from the fence position in the parent
1346 BTERR bt_removeparent (BtDb *bt, uid page_no, BtPageSet *parent, uint lvl)
1348 unsigned char rightfence[256], pagefence[256];
1349 uid right, ppage_no;
1351 right = bt_getid (bt->page->right);
1353 if( bt_lockpage (bt, right, BtLockParent) )
1356 if( bt_lockpage (bt, right, BtLockAccess) )
1359 if( bt_lockpage (bt, right, BtLockWrite) )
1362 if( bt_unlockpage (bt, right, BtLockAccess) )
1365 if( bt_mappage (bt, &bt->temp, right) )
1368 // save right page fence value and
1369 // parent fence value
1371 memcpy (rightfence, bt->temp->fence, 256);
1372 memcpy (pagefence, parent->page->fence, 256);
1373 ppage_no = parent->page_no;
1375 // pull right sibling over ourselves and unlock
1377 memcpy (bt->page, bt->temp, bt->page_size);
1379 if( bt_update(bt, bt->page, page_no) )
1382 if( bt_unlockpage (bt, page_no, BtLockDelete) )
1385 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1388 // install ourselves into right link from old right page
1390 bt->temp->act = 0; // tell bt_findslot to go right (left)
1391 bt_putid (bt->temp->right, page_no);
1393 if( bt_update (bt, bt->temp, right) )
1396 if( bt_unlockpage (bt, right, BtLockWrite) )
1399 // remove our slot from our parent
1400 // clear act to signal bt_findslot to move right
1402 slotptr(parent->page, parent->page->cnt)->dead = 1;
1403 parent->page->act = 0;
1405 if( bt_update (bt, parent->page, ppage_no) )
1407 if( bt_unlockpage (bt, ppage_no, BtLockWrite) )
1410 // redirect right page pointer in its parent to our left
1411 // and free the right page
1413 if( bt_insertkey (bt, rightfence+1, *rightfence, lvl, page_no, time(NULL)) )
1416 if( bt_removepage (bt, ppage_no, lvl, pagefence) )
1419 if( bt_unlockpage (bt, right, BtLockParent) )
1422 // wait for others to drain away
1424 if( bt_lockpage (bt, right, BtLockDelete) )
1427 if( bt_lockpage (bt, right, BtLockWrite) )
1430 if( bt_mappage (bt, &bt->temp, right) )
1433 return bt_freepage (bt, bt->temp, right);
1436 // remove page from btree
1437 // call with page unlocked
1438 // returns with page on free list
1440 BTERR bt_removepage (BtDb *bt, uid page_no, uint lvl, unsigned char *pagefence)
1442 BtPageSet parent[1];
1447 parent->page = bt->parent;
1449 // load and lock our parent
1452 if( !(slot = bt_loadpage (bt, parent, pagefence+1, *pagefence, lvl+1, BtLockWrite)) )
1455 // wait until we are posted in our parent
1458 if( bt_unlockpage (bt, parent->page_no, BtLockWrite) )
1468 // wait for others to finish
1469 // and obtain final WriteLock
1471 if( bt_lockpage (bt, page_no, BtLockDelete) )
1474 if( bt_lockpage (bt, page_no, BtLockWrite) )
1477 if( bt_mappage (bt, &bt->page, page_no) )
1480 // was page re-established?
1482 if( bt->page->act ) {
1483 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1485 if( bt_unlockpage (bt, page_no, BtLockDelete) )
1488 return bt_unlockpage (bt, parent->page_no, BtLockWrite);
1491 // can we do a simple merge entirely
1492 // between siblings on the parent page?
1494 if( slot < parent->page->cnt )
1495 return bt_mergeright(bt, page_no, parent, slot);
1497 // find our left neighbor in our parent page
1499 for( idx = slot; --idx; )
1500 if( !slotptr(parent->page, idx)->dead )
1503 // if no left neighbor, delete ourselves and our parent
1506 return bt_removeparent (bt, page_no, parent, lvl+1);
1508 // lock and map our left neighbor's page
1510 left = bt_getid (slotptr(parent->page, idx)->id);
1512 // wait our turn on fence key maintenance
1514 if( bt_lockpage(bt, left, BtLockParent) )
1517 if( bt_lockpage(bt, left, BtLockAccess) )
1520 if( bt_lockpage(bt, left, BtLockWrite) )
1523 if( bt_unlockpage(bt, left, BtLockAccess) )
1526 if( bt_mappage (bt, &bt->temp, left) )
1529 // wait until sibling is in our parent
1531 if( bt_getid (bt->temp->right) != page_no ) {
1532 if( bt_unlockpage (bt, parent->page_no, BtLockWrite) )
1534 if( bt_unlockpage (bt, left, BtLockWrite) )
1536 if( bt_unlockpage (bt, left, BtLockParent) )
1538 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1540 if( bt_unlockpage (bt, page_no, BtLockDelete) )
1550 // delete our left sibling from parent
1552 slotptr(parent->page,idx)->dead = 1;
1553 parent->page->dirty = 1;
1554 parent->page->act--;
1556 // redirect our parent slot to our left sibling
1558 bt_putid (slotptr(parent->page, slot)->id, left);
1560 // update left page with our fence key
1562 memcpy (bt->temp->right, bt->page->right, BtId);
1564 // collapse dead slots from parent
1566 while( idx = parent->page->cnt - 1 )
1567 if( slotptr(parent->page, idx)->dead ) {
1568 *slotptr(parent->page, idx) = *slotptr(parent->page, parent->page->cnt);
1569 memset (slotptr(parent->page, parent->page->cnt--), 0, sizeof(BtSlot));
1573 // update parent page
1575 if( bt_update (bt, parent->page, parent->page_no) )
1578 // go down the left node's fence keys to the leaf level
1579 // and update the fence keys in each page
1581 if( bt_fixfences (bt, left, pagefence) )
1584 if( bt_unlockpage (bt, parent->page_no, BtLockWrite) )
1587 // free our original page
1589 if( bt_mappage (bt, &bt->temp, page_no) )
1592 return bt_freepage (bt, bt->temp,page_no);
1595 // find and delete key on page by marking delete flag bit
1596 // when page becomes empty, delete it
1598 BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len)
1600 unsigned char pagefence[256];
1601 uint slot, found, act, idx;
1605 set->page = bt->page;
1607 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockWrite) )
1608 ptr = keyptr(set->page, slot);
1612 // if key is found delete it, otherwise ignore request
1614 if( found = slot <= set->page->cnt )
1615 if( found = !keycmp (ptr, key, len) )
1616 if( found = slotptr(set->page, slot)->dead == 0 ) {
1617 slotptr(set->page,slot)->dead = 1;
1618 set->page->dirty = 1;
1621 // collapse empty slots
1623 while( idx = set->page->cnt - 1 )
1624 if( slotptr(set->page, idx)->dead ) {
1625 *slotptr(set->page, idx) = *slotptr(set->page, idx + 1);
1626 memset (slotptr(set->page, set->page->cnt--), 0, sizeof(BtSlot));
1630 if( bt_update(bt, set->page, set->page_no) )
1634 // delete page when empty
1636 memcpy (pagefence, set->page->fence, 256);
1637 act = set->page->act;
1639 if( bt_unlockpage(bt, set->page_no, BtLockWrite) )
1643 if( bt_removepage (bt, set->page_no, 0, pagefence) )
1650 // find key in leaf level and return row-id
1652 uid bt_findkey (BtDb *bt, unsigned char *key, uint len)
1659 set->page = bt->page;
1661 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
1662 ptr = keyptr(set->page, slot);
1666 // if key exists, return row-id
1667 // otherwise return 0
1669 if( slot <= set->page->cnt )
1670 if( !keycmp (ptr, key, len) )
1671 id = bt_getid(slotptr(set->page,slot)->id);
1673 if ( bt_unlockpage(bt, set->page_no, BtLockRead) )
1679 // check page for space available,
1680 // clean if necessary and return
1681 // 0 - page needs splitting
1682 // >0 - new slot value
1684 uint bt_cleanpage(BtDb *bt, BtPage page, uint amt, uint slot)
1686 uint nxt = bt->page_size, off;
1687 uint cnt = 0, idx = 0;
1688 uint max = page->cnt;
1692 if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 )
1695 // skip cleanup if nothing to reclaim
1700 memcpy (bt->frame, page, bt->page_size);
1702 // skip page info and set rest of page to zero
1704 memset (page+1, 0, bt->page_size - sizeof(*page));
1708 while( cnt++ < max ) {
1711 if( slotptr(bt->frame,cnt)->dead )
1715 if( !page->lvl || cnt < max ) {
1716 key = keyptr(bt->frame, cnt);
1717 off = nxt -= key->len + 1;
1718 memcpy ((unsigned char *)page + nxt, key, key->len + 1);
1720 off = offsetof(struct BtPage_, fence);
1723 memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId);
1724 slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod;
1725 slotptr(page, idx)->off = off;
1731 if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 )
1737 // split the root and raise the height of the btree
1739 BTERR bt_splitroot(BtDb *bt, BtPageSet *root, uid page_no2)
1741 unsigned char leftkey[256];
1742 uint nxt = bt->page_size;
1745 // Obtain an empty page to use, and copy the current
1746 // root contents into it, e.g. lower keys
1748 memcpy (leftkey, root->page->fence, 256);
1749 root->page->posted = 1;
1751 if( !(new_page = bt_newpage(bt, root->page)) )
1754 // preserve the page info at the bottom
1755 // of higher keys and set rest to zero
1757 memset(root->page+1, 0, bt->page_size - sizeof(*root->page));
1758 memset(root->page->fence, 0, 256);
1759 root->page->fence[0] = 2;
1760 root->page->fence[1] = 0xff;
1761 root->page->fence[2] = 0xff;
1763 // insert new page fence key on newroot page
1765 nxt -= *leftkey + 1;
1766 memcpy ((unsigned char *)root->page + nxt, leftkey, *leftkey + 1);
1767 bt_putid(slotptr(root->page, 1)->id, new_page);
1768 slotptr(root->page, 1)->off = nxt;
1770 // insert stopper key on newroot page
1771 // and increase the root height
1773 bt_putid(slotptr(root->page, 2)->id, page_no2);
1774 slotptr(root->page, 2)->off = offsetof(struct BtPage_, fence);
1776 bt_putid(root->page->right, 0);
1777 root->page->min = nxt; // reset lowest used offset and key count
1778 root->page->cnt = 2;
1779 root->page->act = 2;
1782 // update and release root
1784 if( bt_update(bt, root->page, root->page_no) )
1787 return bt_unlockpage(bt, root->page_no, BtLockWrite);
1790 // split already locked full node
1793 BTERR bt_splitpage (BtDb *bt, BtPageSet *set)
1795 uint cnt = 0, idx = 0, max, nxt = bt->page_size, off;
1796 uid right, page_no = set->page_no;
1797 unsigned char fencekey[256];
1798 uint lvl = set->page->lvl;
1801 // split higher half of keys to bt->frame
1803 memset (bt->frame, 0, bt->page_size);
1804 max = set->page->cnt;
1808 while( cnt++ < max ) {
1809 if( !lvl || cnt < max ) {
1810 key = keyptr(set->page, cnt);
1811 off = nxt -= key->len + 1;
1812 memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1);
1814 off = offsetof(struct BtPage_, fence);
1816 memcpy(slotptr(bt->frame,++idx)->id, slotptr(set->page,cnt)->id, BtId);
1817 slotptr(bt->frame, idx)->tod = slotptr(set->page, cnt)->tod;
1818 slotptr(bt->frame, idx)->off = off;
1822 if( page_no == ROOT_page )
1823 bt->frame->posted = 1;
1825 memcpy (bt->frame->fence, set->page->fence, 256);
1826 bt->frame->bits = bt->page_bits;
1827 bt->frame->min = nxt;
1828 bt->frame->cnt = idx;
1829 bt->frame->lvl = lvl;
1833 if( page_no > ROOT_page )
1834 memcpy (bt->frame->right, set->page->right, BtId);
1836 // get new free page and write higher keys to it.
1838 if( !(right = bt_newpage(bt, bt->frame)) )
1841 // update lower keys to continue in old page
1843 memcpy (bt->frame, set->page, bt->page_size);
1844 memset (set->page+1, 0, bt->page_size - sizeof(*set->page));
1845 nxt = bt->page_size;
1846 set->page->posted = 0;
1847 set->page->dirty = 0;
1852 // assemble page of smaller keys
1854 while( cnt++ < max / 2 ) {
1855 key = keyptr(bt->frame, cnt);
1857 if( !lvl || cnt < max / 2 ) {
1858 off = nxt -= key->len + 1;
1859 memcpy ((unsigned char *)set->page + nxt, key, key->len + 1);
1861 off = offsetof(struct BtPage_, fence);
1863 memcpy(slotptr(set->page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId);
1864 slotptr(set->page, idx)->tod = slotptr(bt->frame, cnt)->tod;
1865 slotptr(set->page, idx)->off = off;
1869 // install fence key for smaller key page
1871 memset(set->page->fence, 0, 256);
1872 memcpy(set->page->fence, key, key->len + 1);
1874 bt_putid(set->page->right, right);
1875 set->page->min = nxt;
1876 set->page->cnt = idx;
1878 // if current page is the root page, split it
1880 if( page_no == ROOT_page )
1881 return bt_splitroot (bt, set, right);
1883 if( bt_update (bt, set->page, page_no) )
1886 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1889 // insert new fences in their parent pages
1892 if( bt_lockpage (bt, page_no, BtLockParent) )
1895 if( bt_lockpage (bt, page_no, BtLockRead) )
1898 if( bt_mappage (bt, &set->page, page_no) )
1901 memcpy (fencekey, set->page->fence, 256);
1903 if( set->page->posted ) {
1904 if( bt_unlockpage (bt, page_no, BtLockParent) )
1907 return bt_unlockpage (bt, page_no, BtLockRead);
1910 if( bt_unlockpage (bt, page_no, BtLockRead) )
1913 if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, page_no, time(NULL)) )
1916 if( bt_lockpage (bt, page_no, BtLockWrite) )
1919 if( bt_mappage (bt, &set->page, page_no) )
1922 right = bt_getid (set->page->right);
1923 set->page->posted = 1;
1925 if( bt_update (bt, set->page, page_no) )
1928 if( bt_unlockpage (bt, page_no, BtLockWrite) )
1931 if( bt_unlockpage (bt, page_no, BtLockParent) )
1934 if( !(page_no = right) )
1937 if( bt_mappage (bt, &set->page, page_no) )
1944 // Insert new key into the btree at requested level.
1946 BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, uid id, uint tod)
1952 set->page = bt->page;
1955 if( slot = bt_loadpage (bt, set, key, len, lvl, BtLockWrite) )
1956 ptr = keyptr(set->page, slot);
1960 bt->err = BTERR_ovflw;
1964 // if key already exists, update id and return
1966 if( slot <= set->page->cnt )
1967 if( !keycmp (ptr, key, len) ) {
1968 if( slotptr(set->page, slot)->dead )
1971 slotptr(set->page, slot)->dead = 0;
1972 slotptr(set->page, slot)->tod = tod;
1973 bt_putid(slotptr(set->page,slot)->id, id);
1975 if ( bt_update(bt, set->page, set->page_no) )
1978 return bt_unlockpage(bt, set->page_no, BtLockWrite);
1981 // check if page has enough space
1983 if( slot = bt_cleanpage (bt, set->page, len, slot) )
1986 if( bt_splitpage (bt, set) )
1990 // calculate next available slot and copy key into page
1992 set->page->min -= len + 1; // reset lowest used offset
1993 ((unsigned char *)set->page)[set->page->min] = len;
1994 memcpy ((unsigned char *)set->page + set->page->min +1, key, len );
1996 for( idx = slot; idx <= set->page->cnt; idx++ )
1997 if( slotptr(set->page, idx)->dead )
2000 // now insert key into array before slot
2002 if( idx > set->page->cnt )
2008 *slotptr(set->page, idx) = *slotptr(set->page, idx -1), idx--;
2010 bt_putid(slotptr(set->page,slot)->id, id);
2011 slotptr(set->page, slot)->off = set->page->min;
2012 slotptr(set->page, slot)->tod = tod;
2013 slotptr(set->page, slot)->dead = 0;
2015 if( bt_update(bt, set->page, set->page_no) )
2018 return bt_unlockpage(bt, set->page_no, BtLockWrite);
2021 // cache page of keys into cursor and return starting slot for given key
2023 uint bt_startkey (BtDb *bt, unsigned char *key, uint len)
2028 set->page = bt->page;
2030 // cache page for retrieval
2031 if( slot = bt_loadpage (bt, set, key, len, 0, BtLockRead) )
2032 memcpy (bt->cursor, set->page, bt->page_size);
2033 bt->cursor_page = set->page_no;
2034 if ( bt_unlockpage(bt, set->page_no, BtLockRead) )
2040 // return next slot for cursor page
2041 // or slide cursor right into next page
2043 uint bt_nextkey (BtDb *bt, uint slot)
2049 right = bt_getid(bt->cursor->right);
2050 while( slot++ < bt->cursor->cnt )
2051 if( slotptr(bt->cursor,slot)->dead )
2053 else if( right || (slot < bt->cursor->cnt)) // skip infinite stopper
2061 bt->cursor_page = right;
2062 set->page = bt->page;
2064 if( bt_lockpage(bt, right, BtLockRead) )
2067 if( bt_mappage (bt, &set->page, right) )
2070 memcpy (bt->cursor, set->page, bt->page_size);
2072 if( bt_unlockpage(bt, right, BtLockRead) )
2081 BtKey bt_key(BtDb *bt, uint slot)
2083 return keyptr(bt->cursor, slot);
2086 uid bt_uid(BtDb *bt, uint slot)
2088 return bt_getid(slotptr(bt->cursor,slot)->id);
2091 uint bt_tod(BtDb *bt, uint slot)
2093 return slotptr(bt->cursor,slot)->tod;
2098 // standalone program to index file of keys
2099 // then list them onto std-out
2101 int main (int argc, char **argv)
2103 uint slot, line = 0, off = 0, found = 0;
2104 int dead, ch, cnt = 0, bits = 12;
2105 unsigned char key[256];
2106 clock_t done, start;
2117 fprintf (stderr, "Usage: %s idx_file src_file Read/Write/Scan/Delete/Find [page_bits mapped_pool_segments pages_per_segment start_line_number]\n", argv[0]);
2118 fprintf (stderr, " page_bits: size of btree page in bits\n");
2119 fprintf (stderr, " mapped_pool_segments: size of buffer pool in segments\n");
2120 fprintf (stderr, " pages_per_segment: size of buffer pool segment in pages in bits\n");
2128 bits = atoi(argv[4]);
2131 map = atoi(argv[5]);
2134 fprintf (stderr, "Warning: buffer_pool > 65536 segments\n");
2136 if( map && map < 8 )
2137 fprintf (stderr, "Buffer_pool too small\n");
2140 pgblk = atoi(argv[6]);
2142 if( bits + pgblk > 30 )
2143 fprintf (stderr, "Warning: very large buffer pool segment size\n");
2146 off = atoi(argv[7]);
2148 bt = bt_open ((argv[1]), BT_rw, bits, map, pgblk);
2151 fprintf(stderr, "Index Open Error %s\n", argv[1]);
2155 switch(argv[3][0]| 0x20)
2158 fprintf(stderr, "started indexing for %s\n", argv[2]);
2159 if( argc > 2 && (in = fopen (argv[2], "rb")) )
2160 while( ch = getc(in), ch != EOF )
2164 sprintf((char *)key+len, "%.9d", line + off), len += 9;
2166 if( bt_insertkey (bt, key, len, 0, ++line, *tod) )
2167 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2170 else if( len < 245 )
2172 fprintf(stderr, "finished adding keys, %d \n", line);
2176 fprintf(stderr, "started deleting keys for %s\n", argv[2]);
2177 if( argc > 2 && (in = fopen (argv[2], "rb")) )
2178 while( ch = getc(in), ch != EOF )
2182 sprintf((char *)key+len, "%.9d", line + off), len += 9;
2184 if( bt_deletekey (bt, key, len) )
2185 fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0);
2188 else if( len < 245 )
2190 fprintf(stderr, "finished deleting keys, %d \n", line);
2194 fprintf(stderr, "started finding keys for %s\n", argv[2]);
2195 if( argc > 2 && (in = fopen (argv[2], "rb")) )
2196 while( ch = getc(in), ch != EOF )
2200 sprintf((char *)key+len, "%.9d", line + off), len += 9;
2202 if( bt_findkey (bt, key, len) )
2205 fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
2208 else if( len < 245 )
2210 fprintf(stderr, "finished search of %d keys, found %d\n", line, found);
2220 fprintf(stderr, " Time to complete: %.2f seconds\n", (float)(done - start) / CLOCKS_PER_SEC);
2225 fprintf(stderr, "started reading\n");
2227 if( slot = bt_startkey (bt, key, len) )
2230 fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0);
2232 while( slot = bt_nextkey (bt, slot) )
2234 ptr = bt_key(bt, slot);
2235 fwrite (ptr->key, ptr->len, 1, stdout);
2236 fputc ('\n', stdout);
2239 fprintf(stderr, " Total keys read %d\n", cnt);