From 42c2e785f24492e30f4262e7332bd41c7c672a56 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 24 Jan 2014 12:07:34 -0800 Subject: [PATCH] release of multi-threaded/multi-process btree code --- fosterbtree.c | 2054 ------------------------- fosterbtreea.c | 2075 -------------------------- fosterbtreeb.c | 2192 --------------------------- fosterbtreec.c | 2166 --------------------------- fosterbtreed.c | 2130 -------------------------- fosterbtreee1.c | 2529 ------------------------------- fosterbtreee2.c | 2554 -------------------------------- fosterbtreef1.c | 2447 ------------------------------ fosterbtreef2.c => threads2j.c | 1071 ++++++------- 9 files changed, 486 insertions(+), 18732 deletions(-) delete mode 100644 fosterbtree.c delete mode 100644 fosterbtreea.c delete mode 100644 fosterbtreeb.c delete mode 100644 fosterbtreec.c delete mode 100644 fosterbtreed.c delete mode 100644 fosterbtreee1.c delete mode 100644 fosterbtreee2.c delete mode 100644 fosterbtreef1.c rename fosterbtreef2.c => threads2j.c (73%) diff --git a/fosterbtree.c b/fosterbtree.c deleted file mode 100644 index 1b9735e..0000000 --- a/fosterbtree.c +++ /dev/null @@ -1,2054 +0,0 @@ -// foster btree -// 26 MAY 2013 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. 
-*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. 
- -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. - -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits:7; // page size in bits - unsigned char kill:1; // page is being deleted - unsigned char lvl; // level of page - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for latch table implementation - -enum { - Write = 1, - Share = 2 -} LockMode; - -// latch table lock structure - -// mode is set for write access -// share is count of read accessors -// grant write lock when share == 0 - -typedef struct { - int mode:1; - int share:31; -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children -} BtLatchSet; - -// The memory mapping hash table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - uint pin; // mapped page pin counter - uint slot; // slot index in this array - void *hashprev; // previous cache block for the same hash idx - void *hashnext; // next cache block for the same hash idx -#ifndef unix - HANDLE hmap; -#endif -// array of page latch sets, one for each page in map segment - BtLatchSet pagelatch[0]; -} BtHash; - -// The object structure for Btree access - -typedef struct { - uint 
page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; -#else - HANDLE idx; -#endif - uint nodecnt; // highest page cache node in use - uint nodemax; // highest page cache node allocated - uint hashmask; // number of pages in mmap segment - uint hashsize; // size of Hash Table - uint evicted; // last evicted hash slot - ushort *cache; // hash index for memory pool - BtLatch *latch; // latches for hash table slots - char *nodes; // memory pool page hash nodes -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage alloc; // frame buffer for alloc page ( page 0 ) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_again, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint cacheblk, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); 
-extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 -#define ROOT_page 1 - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. - -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. 
- -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtHash *hash; -uint slot; - - // release mapped pages - - for( slot = 0; slot < mgr->nodemax; slot++ ) { - hash = (BtHash *)(mgr->nodes + slot * (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - if( hash->slot ) -#ifdef unix - munmap (hash->map, (mgr->hashmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(hash->map, 0); - UnmapViewOfFile(hash->map); - CloseHandle(hash->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->nodes); - free (mgr->cache); - free (mgr->latch); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->nodes); - GlobalFree (mgr->cache); - GlobalFree (mgr->latch); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); - free (bt); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); - GlobalFree (bt); -#endif -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page cache (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint nodemax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last; -BtPage alloc; -int lockmode; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !nodemax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = open ((char*)name, O_RDONLY); - lockmode = 0; - break; - } - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL); - lockmode = 0; - break; - } - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - alloc = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, alloc, BT_minpage, 0) == BT_minpage ) - bits = alloc->bits; - else - return free(mgr), free(alloc), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - alloc = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = 
GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->nodemax = nodemax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->hashmask = (cacheblk >> bits) - 1; - - // see if requested number of pages per memmap is greater - - if( (1 << segsize) > mgr->hashmask ) - mgr->hashmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->hashmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->nodes = calloc (cacheblk, (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - mgr->cache = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtLatch)); -#else - mgr->nodes = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cacheblk * (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - mgr->cache = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); -#endif - - if( size || *amt ) - goto mgrxit; - - // initializes an empty b-tree with root page and page of leaves - - memset (alloc, 0, 1 << bits); - bt_putid(slotptr(alloc, 2)->id, MIN_lvl+1); - alloc->bits = mgr->page_bits; - -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (alloc, 0, 1 << bits); - alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(alloc, 1)->id, lvl ? 
MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - alloc->min = mgr->page_size - 3; - alloc->lvl = lvl; - alloc->cnt = 1; - alloc->act = 1; -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // create empty page area by writing last page of first - // cache area (other pages are zeroed by O/S) - - if( mgr->hashmask ) { - memset(alloc, 0, mgr->page_size); - last = mgr->hashmask; - - while( last < MIN_lvl + 1 ) - last += mgr->hashmask + 1; - -#ifdef unix - pwrite(mgr->idx, alloc, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - -mgrxit: -#ifdef unix - free (alloc); -#else - VirtualFree (alloc, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_readlock(BtLatch *latch) -{ - do { - // add one to counter, check write bit - -#ifdef unix - if( ~__sync_fetch_and_add((int *)latch, Share) & Write ) - return; -#else - if( ~InterlockedAdd((int *)latch, Share) & Write ) - return; -#endif - // didn't get latch, reset counter by one - -#ifdef unix - __sync_fetch_and_add((int *)latch, -Share); -#else - InterlockedAdd ((int *)latch, -Share); -#endif - - // and yield -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -int prev, ours = 0; - - do { - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((int *)latch, Write); -#else - prev = InterlockedOr((int *)latch, Write); -#endif - - if( ~prev & 1 ) - ours++; // it's ours - - if( !(prev >> 1) && ours ) - return; - - // otherwise yield - -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write locked - -int bt_writetry(BtLatch *latch) -{ -int prev, ours = 0; - - do { - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((int *)latch, Write); -#else - prev = InterlockedOr((int *)latch, Write); -#endif - - if( ~prev & 1 ) - ours++; // it's ours - - if( !ours ) - return 0; - - if( !(prev >> 1) && ours ) - return 1; - - // otherwise yield -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and((int *)latch, ~Write); -#else - InterlockedAnd ((int *)latch, ~Write); -#endif -} - -// decrement reader count - 
-void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_add((int *)latch, -Share); -#else - InterlockedAdd((int *)latch, -Share); -#endif -} - -// Buffer Pool mgr - -// find segment in cache -// return NULL if not there -// otherwise return node - -BtHash *bt_findhash(BtDb *bt, uid page_no, uint idx) -{ -BtHash *hash; -uint slot; - - // compute cache block first page and hash idx - - if( slot = bt->mgr->cache[idx] ) - hash = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - else - return NULL; - - page_no &= ~bt->mgr->hashmask; - - while( hash->basepage != page_no ) - if( hash = hash->hashnext ) - continue; - else - return NULL; - - return hash; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtHash *hash, uid page_no, int idx) -{ -BtHash *node; -uint slot; - - hash->hashprev = hash->hashnext = NULL; - hash->basepage = page_no & ~bt->mgr->hashmask; - hash->pin = 1; - hash->lru = 1; - - if( slot = bt->mgr->cache[idx] ) { - node = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - hash->hashnext = node; - node->hashprev = hash; - } - - bt->mgr->cache[idx] = hash->slot; -} - -// find best segment to evict from buffer pool - -BtHash *bt_findlru (BtDb *bt, uint slot) -{ -unsigned long long int target = ~0LL; -BtHash *hash = NULL, *node; - - if( !slot ) - return NULL; - - node = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - hash = node; - } while( node = node->hashnext ); - - return hash; -} - -// map new segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtHash *hash, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->hashmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->hashmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( 
bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - hash->map = mmap (0, (bt->mgr->hashmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( hash->map == MAP_FAILED ) - return bt->err = BTERR_map; -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - hash->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !hash->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE ); - hash->map = MapViewOfFile(hash->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->hashmask+1) << bt->mgr->page_bits); - if( !hash->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-cache -// return hash table entry - -BtHash *bt_hashpage(BtDb *bt, uid page_no) -{ -BtHash *hash, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( hash = bt_findhash(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&hash->pin, 1); -#else - InterlockedIncrement (&hash->pin); -#endif - bt_releaseread (&bt->mgr->latch[idx]); - hash->lru++; - return hash; - } - - // upgrade to write lock - - bt_releaseread (&bt->mgr->latch[idx]); - bt_writelock (&bt->mgr->latch[idx]); - - // try to find page in cache with write lock - - if( hash = bt_findhash(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&hash->pin, 1); -#else - InterlockedIncrement (&hash->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - hash->lru++; - return hash; - } - - // allocate a new hash node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->nodecnt, 1); -#else - slot = InterlockedIncrement (&bt->mgr->nodecnt) - 1; -#endif - - if( ++slot < bt->mgr->nodemax ) { - hash = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + 
(bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - hash->slot = slot; - - if( bt_mapsegment(bt, hash, page_no) ) - return NULL; - - bt_linkhash(bt, hash, page_no, idx); - bt_releasewrite (&bt->mgr->latch[idx]); - return hash; - } - - // hash table is full - // find best cache entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->nodecnt, -1); -#else - InterlockedDecrement (&bt->mgr->nodecnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = InterlockedIncrement (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_writetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(hash = bt_findlru(bt, bt->mgr->cache[victim])) ) { - bt_releasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim hash node from hash table - - if( node = hash->hashprev ) - node->hashnext = hash->hashnext; - else if( node = hash->hashnext ) - bt->mgr->cache[victim] = node->slot; - else - bt->mgr->cache[victim] = 0; - - if( node = hash->hashnext ) - node->hashprev = hash->hashprev; - - // remove old file mapping -#ifdef unix - munmap (hash->map, (bt->mgr->hashmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(hash->map, 0); - UnmapViewOfFile(hash->map); - CloseHandle(hash->hmap); -#endif - hash->map = NULL; - bt_releasewrite (&bt->mgr->latch[victim]); - - // create new file mapping - // and link into hash table - - if( bt_mapsegment(bt, hash, page_no) ) - return NULL; - - bt_linkhash(bt, hash, page_no, idx); - bt_releasewrite (&bt->mgr->latch[idx]); - return hash; - } -} - -// place write, read, or parent lock on requested page_no. 
-// pin to buffer pool - -BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *page) -{ -BtLatchSet *set; -BtHash *hash; -uint subpage; - - // find/create maping in hash table - - if( hash = bt_hashpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->hashmask); // page within mapping - else - return bt->err; - - set = hash->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - - if( page ) - *page = (BtPage)(hash->map + (subpage << bt->mgr->page_bits)); - - return bt->err = 0; -} - -// remove write, read, or parent lock on requested page_no. - -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode) -{ -uint subpage, idx; -BtLatchSet *set; -BtHash *hash; - - // since page is pinned - // it should still be in the buffer pool - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - if( hash = bt_findhash(bt, page_no, idx) ) - subpage = (uint)(page_no & bt->mgr->hashmask); - else - return bt->err = BTERR_hash; - - bt_releaseread (&bt->mgr->latch[idx]); - set = hash->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&hash->pin, -1); -#else - InterlockedDecrement (&hash->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page that has no tree pointers -// place on free chain out of 
allocator page - -BTERR bt_freepage(BtDb *bt, uid page_no) -{ - // obtain delete lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - // obtain write lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockWrite, &bt->temp) ) - return bt->err; - - // lock allocation page - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return bt->err; - - // store chain in first key - bt_putid(slotptr(bt->temp, 1)->id, bt_getid(slotptr(bt->alloc, 1)->id)); - bt_putid(slotptr(bt->alloc, 1)->id, page_no); - - // unlock page zero - - if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return bt->err; - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - return 0; -} - -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -uid new_page; -BtPage pmap; -int reuse; - - // lock page zero - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return 0; - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(slotptr(bt->alloc, 1)->id) ) { - if( bt_lockpage (bt, new_page, BtLockWrite, &bt->temp) ) - return 0; - bt_putid(slotptr(bt->alloc, 1)->id, bt_getid(slotptr(bt->temp, 1)->id)); - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(slotptr(bt->alloc, 2)->id); - bt_putid(slotptr(bt->alloc, 2)->id, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of hash block, zero last page in the block - - if ( !reuse && bt->mgr->hashmask > 0 && (new_page & bt->mgr->hashmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( 
pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->hashmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into page-cache and copy page. - // this will extend the file into the new pages. - - if( bt_lockpage(bt, new_page, BtLockWrite, &pmap) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; -#endif - // unlock page zero - - if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return 0; - - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) -{ -uid page_no = ROOT_page, prevpage = 0; -uint drill = 0xff, slot; -uint mode, prevmode; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? 
BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( bt_lockpage(bt, page_no, BtLockAccess, NULL) ) - return 0; - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode) ) - return 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( bt_lockpage(bt, page_no, mode, &bt->page) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode) ) - return 0; - else - continue; - } - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? - - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - else - drill--; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; - - // continue down / right using overlapping locks - // to protect pages being killed or split. 
- - prevmode = mode; - prevpage = bt->page_no; - page_no = bt_getid(slotptr(bt->page, slot)->id); - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) - slotptr(bt->page,slot)->dead = 1, bt->page->act--; - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // obtain Parent lock over write lock - - if( bt_lockpage(bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if ( bt_lockpage(bt, right, BtLockWrite, &bt->temp) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as deleted and point it to left page - // until we can post updates at higher level. 
- - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - bt_unlockpage(bt, bt->page_no, BtLockWrite); - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return id; -} - -void bt_cleanpage(BtDb *bt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key in list - if( cnt < max && 
slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - page->min = nxt; - page->cnt = idx; -} - -// add key to page -// return with page unlocked - -BTERR bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split the root and raise the height of the btree - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it. Strip foster child key. - // Save left fence key. 
- - bt->page->act--; - bt->page->cnt--; - bt->page->foster--; - key = keyptr(bt->page, bt->page->cnt); - memcpy (fencekey, key, key->len + 1); - - if( !(new_page = bt_newpage(bt, bt->page)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split already locked full node -// return unlocked. - -BTERR bt_splitpage (BtDb *bt, uint len) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // perform cleanup - - bt_cleanpage(bt); - - // return if enough space now - - if( page->min >= (page->cnt + 1) * sizeof(BtSlot) + sizeof(*page) + len + 1) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. 
- - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. - - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // assemble old foster child keys - // add new foster child fence - - cnt = bt->frame->cnt - bt->frame->foster - 1; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // link new right page - - bt_putid (page->right, new_page); - - // put new page as smallest foster child key - - page->cnt = idx; - cnt = page->cnt - page->foster++; - bt_putid (slotptr(page,cnt)->id, 
new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock on page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix highest foster child on page - - if( bt_lockpage (bt, page_no, BtLockParent, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockRead, &page) ) - return bt->err; - - // get our old fence key - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - // get our new fence key length - - key = keyptr(page, page->cnt - 1); - len = key->len; - - if( bt_unlockpage (bt, page_no, BtLockRead) ) - return bt->err; - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space - - if( bt->page->min < (bt->page->cnt + 1) * sizeof(BtSlot) + sizeof(*bt->page) + len + 1) - if( bt_splitpage (bt, len) ) - return bt->err; - else - continue; - else - break; - } while( 1 ); - - // wait for readers from parent get their locks - - if( bt_lockpage (bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockWrite, &page) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove foster child from our page - // add our new fence key to parent - - page->cnt--; - page->act--; - page->foster--; - key = keyptr(page, page->cnt); - - if( bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockDelete) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockParent) ) - return bt->err; - - return bt_unlockpage (bt, page_no, BtLockWrite); -} - -// Insert new key into the btree at leaf 
level. - -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite); - } - - // check if page has enough space - - if( page->min >= (page->cnt + 1) * sizeof(BtSlot) + sizeof(*page) + len + 1) - break; - - if( bt_splitpage (bt, len) ) - return bt->err; - } - - return bt_addkeytopage (bt, slot, key, len, id, tod); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( bt_lockpage(bt, right, BtLockRead, &page) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return 
keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char *infile; - char type; - BtMgr *mgr; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0; -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - if( bt_insertkey (bt, key, len, ++line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d 
keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint map = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits hash_size src_file1 src_file2 ... ]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " hash_size is the size of buffer pool hash table\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - map = atoi(argv[4]); - - if( map > 65536 ) - fprintf (stderr, "Warning: mapped_pool > 65536 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - cnt = argc - 6; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * 
sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, map, segsize, map / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 6]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - - cnt = 0; - len = key[0] = 0; - bt = bt_open (mgr); - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) - cnt++; - - fprintf(stderr, " Total keys read %d\n", cnt); - - bt_close (bt); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreea.c b/fosterbtreea.c deleted file mode 100644 index 4a96ff3..0000000 --- a/fosterbtreea.c +++ /dev/null @@ -1,2075 +0,0 @@ -// foster btree version a -// 16 DEC 2013 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. 
- -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. 
- -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. - -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits:6; // page size in bits - unsigned char dirty:1; // page needs to be cleaned - unsigned char kill:1; // page is being deleted - unsigned char lvl; // level of page - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for latch table implementation - -enum { - Write = 1, - Share = 2 -} LockMode; - -// latch table lock structure - -// mode is set for write access -// share is count of read accessors -// grant write lock when share == 0 - -typedef struct { - int mode:1; - int share:31; -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children -} BtLatchSet; - -// The memory mapping hash table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory 
pointer - uint pin; // mapped page pin counter - uint slot; // slot index in this array - void *hashprev; // previous cache block for the same hash idx - void *hashnext; // next cache block for the same hash idx -#ifndef unix - HANDLE hmap; -#endif -// array of page latch sets, one for each page in map segment - BtLatchSet pagelatch[0]; -} BtHash; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; -#else - HANDLE idx; -#endif - uint nodecnt; // highest page cache node in use - uint nodemax; // highest page cache node allocated - uint hashmask; // number of pages in mmap segment - uint hashsize; // size of Hash Table - uint evicted; // last evicted hash slot - ushort *cache; // hash index for memory pool - BtLatch *latch; // latches for hash table slots - char *nodes; // memory pool page hash nodes -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage alloc; // frame buffer for alloc page ( page 0 ) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_again, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid 
bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint cacheblk, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 -#define ROOT_page 1 - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. - -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. 
- -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. - -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtHash *hash; -uint slot; - - // release mapped pages - - for( slot = 0; slot < mgr->nodemax; slot++ ) { - hash = (BtHash *)(mgr->nodes + slot * (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - if( hash->slot ) -#ifdef unix - munmap (hash->map, (mgr->hashmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(hash->map, 0); - UnmapViewOfFile(hash->map); - CloseHandle(hash->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->nodes); - free (mgr->cache); - free (mgr->latch); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->nodes); - GlobalFree (mgr->cache); - GlobalFree (mgr->latch); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); - free (bt); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); - GlobalFree (bt); -#endif -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page cache (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint nodemax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last; -BtPage alloc; -int lockmode; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !nodemax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = open ((char*)name, O_RDONLY); - lockmode = 0; - break; - } - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL); - lockmode = 0; - break; - } - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - alloc = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, alloc, BT_minpage, 0) == BT_minpage ) - bits = alloc->bits; - else - return free(mgr), free(alloc), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - alloc = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = 
GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->nodemax = nodemax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->hashmask = (cacheblk >> bits) - 1; - - // see if requested number of pages per memmap is greater - - if( (1 << segsize) > mgr->hashmask ) - mgr->hashmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->hashmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->nodes = calloc (nodemax, (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - mgr->cache = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtLatch)); -#else - mgr->nodes = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cacheblk * (sizeof(BtHash) + (mgr->hashmask + 1) * sizeof(BtLatchSet))); - mgr->cache = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); -#endif - - if( size || *amt ) - goto mgrxit; - - // initializes an empty b-tree with root page and page of leaves - - memset (alloc, 0, 1 << bits); - bt_putid(slotptr(alloc, 2)->id, MIN_lvl+1); - alloc->bits = mgr->page_bits; - -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (alloc, 0, 1 << bits); - alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(alloc, 1)->id, lvl ? 
MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - alloc->min = mgr->page_size - 3; - alloc->lvl = lvl; - alloc->cnt = 1; - alloc->act = 1; -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // create empty page area by writing last page of first - // cache area (other pages are zeroed by O/S) - - if( mgr->hashmask ) { - memset(alloc, 0, mgr->page_size); - last = mgr->hashmask; - - while( last < MIN_lvl + 1 ) - last += mgr->hashmask + 1; - -#ifdef unix - pwrite(mgr->idx, alloc, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - -mgrxit: -#ifdef unix - free (alloc); -#else - VirtualFree (alloc, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_readlock(BtLatch *latch) -{ - do { - // add one to counter, check write bit - -#ifdef unix - if( ~__sync_fetch_and_add((int *)latch, Share) & Write ) - return; -#else - if( ~InterlockedAdd((int *)latch, Share) & Write ) - return; -#endif - // didn't get latch, reset counter by one - -#ifdef unix - __sync_fetch_and_add((int *)latch, -Share); -#else - InterlockedAdd ((int *)latch, -Share); -#endif - - // and yield -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -int prev, ours = 0; - - do { - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((int *)latch, Write); -#else - prev = InterlockedOr((int *)latch, Write); -#endif - - if( ~prev & 1 ) - ours++; // it's ours - - if( !(prev >> 1) && ours ) - return; - - // otherwise yield - -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write locked - -int bt_writetry(BtLatch *latch) -{ -int prev, ours = 0; - - do { - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((int *)latch, Write); -#else - prev = InterlockedOr((int *)latch, Write); -#endif - - if( ~prev & 1 ) - ours++; // it's ours - - if( !ours ) - return 0; - - if( !(prev >> 1) && ours ) - return 1; - - // otherwise yield -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and((int *)latch, ~Write); -#else - InterlockedAnd ((int *)latch, ~Write); -#endif -} - -// decrement reader count - 
-void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_add((int *)latch, -Share); -#else - InterlockedAdd((int *)latch, -Share); -#endif -} - -// Buffer Pool mgr - -// find segment in cache -// return NULL if not there -// otherwise return node - -BtHash *bt_findhash(BtDb *bt, uid page_no, uint idx) -{ -BtHash *hash; -uint slot; - - // compute cache block first page and hash idx - - if( slot = bt->mgr->cache[idx] ) - hash = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - else - return NULL; - - page_no &= ~bt->mgr->hashmask; - - while( hash->basepage != page_no ) - if( hash = hash->hashnext ) - continue; - else - return NULL; - - return hash; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtHash *hash, uid page_no, int idx) -{ -BtHash *node; -uint slot; - - hash->hashprev = hash->hashnext = NULL; - hash->basepage = page_no & ~bt->mgr->hashmask; - hash->pin = 1; - hash->lru = 1; - - if( slot = bt->mgr->cache[idx] ) { - node = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - hash->hashnext = node; - node->hashprev = hash; - } - - bt->mgr->cache[idx] = hash->slot; -} - -// find best segment to evict from buffer pool - -BtHash *bt_findlru (BtDb *bt, uint slot) -{ -unsigned long long int target = ~0LL; -BtHash *hash = NULL, *node; - - if( !slot ) - return NULL; - - node = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + (bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - hash = node; - } while( node = node->hashnext ); - - return hash; -} - -// map new segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtHash *hash, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->hashmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->hashmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( 
bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - hash->map = mmap (0, (bt->mgr->hashmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( hash->map == MAP_FAILED ) - return bt->err = BTERR_map; -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - hash->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !hash->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE ); - hash->map = MapViewOfFile(hash->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->hashmask+1) << bt->mgr->page_bits); - if( !hash->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-cache -// return hash table entry - -BtHash *bt_hashpage(BtDb *bt, uid page_no) -{ -BtHash *hash, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( hash = bt_findhash(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&hash->pin, 1); -#else - InterlockedIncrement (&hash->pin); -#endif - bt_releaseread (&bt->mgr->latch[idx]); - hash->lru++; - return hash; - } - - // upgrade to write lock - - bt_releaseread (&bt->mgr->latch[idx]); - bt_writelock (&bt->mgr->latch[idx]); - - // try to find page in cache with write lock - - if( hash = bt_findhash(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&hash->pin, 1); -#else - InterlockedIncrement (&hash->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - hash->lru++; - return hash; - } - - // allocate a new hash node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->nodecnt, 1); -#else - slot = InterlockedIncrement (&bt->mgr->nodecnt) - 1; -#endif - - if( ++slot < bt->mgr->nodemax ) { - hash = (BtHash *)(bt->mgr->nodes + slot * (sizeof(BtHash) + 
(bt->mgr->hashmask + 1) * sizeof(BtLatchSet))); - hash->slot = slot; - - if( bt_mapsegment(bt, hash, page_no) ) - return NULL; - - bt_linkhash(bt, hash, page_no, idx); - bt_releasewrite (&bt->mgr->latch[idx]); - return hash; - } - - // hash table is full - // find best cache entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->nodecnt, -1); -#else - InterlockedDecrement (&bt->mgr->nodecnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = InterlockedIncrement (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_writetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(hash = bt_findlru(bt, bt->mgr->cache[victim])) ) { - bt_releasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim hash node from hash table - - if( node = hash->hashprev ) - node->hashnext = hash->hashnext; - else if( node = hash->hashnext ) - bt->mgr->cache[victim] = node->slot; - else - bt->mgr->cache[victim] = 0; - - if( node = hash->hashnext ) - node->hashprev = hash->hashprev; - - // remove old file mapping -#ifdef unix - munmap (hash->map, (bt->mgr->hashmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(hash->map, 0); - UnmapViewOfFile(hash->map); - CloseHandle(hash->hmap); -#endif - hash->map = NULL; - bt_releasewrite (&bt->mgr->latch[victim]); - - // create new file mapping - // and link into hash table - - if( bt_mapsegment(bt, hash, page_no) ) - return NULL; - - bt_linkhash(bt, hash, page_no, idx); - bt_releasewrite (&bt->mgr->latch[idx]); - return hash; - } -} - -// place write, read, or parent lock on requested page_no. 
-// pin to buffer pool - -BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *page) -{ -BtLatchSet *set; -BtHash *hash; -uint subpage; - - // find/create maping in hash table - - if( hash = bt_hashpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->hashmask); // page within mapping - else - return bt->err; - - set = hash->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - - if( page ) - *page = (BtPage)(hash->map + (subpage << bt->mgr->page_bits)); - - return bt->err = 0; -} - -// remove write, read, or parent lock on requested page_no. - -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode) -{ -uint subpage, idx; -BtLatchSet *set; -BtHash *hash; - - // since page is pinned - // it should still be in the buffer pool - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - if( hash = bt_findhash(bt, page_no, idx) ) - subpage = (uint)(page_no & bt->mgr->hashmask); - else - return bt->err = BTERR_hash; - - bt_releaseread (&bt->mgr->latch[idx]); - set = hash->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&hash->pin, -1); -#else - InterlockedDecrement (&hash->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page that has no tree pointers -// place on free chain out of 
allocator page - -BTERR bt_freepage(BtDb *bt, uid page_no) -{ - // obtain delete lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - // obtain write lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockWrite, &bt->temp) ) - return bt->err; - - // lock allocation page - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return bt->err; - - // store chain in first key - bt_putid(slotptr(bt->temp, 1)->id, bt_getid(slotptr(bt->alloc, 1)->id)); - bt_putid(slotptr(bt->alloc, 1)->id, page_no); - - // unlock page zero - - if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return bt->err; - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - return 0; -} - -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -uid new_page; -BtPage pmap; -int reuse; - - // lock page zero - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return 0; - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(slotptr(bt->alloc, 1)->id) ) { - if( bt_lockpage (bt, new_page, BtLockWrite, &bt->temp) ) - return 0; - bt_putid(slotptr(bt->alloc, 1)->id, bt_getid(slotptr(bt->temp, 1)->id)); - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(slotptr(bt->alloc, 2)->id); - bt_putid(slotptr(bt->alloc, 2)->id, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of hash block, zero last page in the block - - if ( !reuse && bt->mgr->hashmask > 0 && (new_page & bt->mgr->hashmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( 
pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->hashmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into page-cache and copy page. - // this will extend the file into the new pages. - - if( bt_lockpage(bt, new_page, BtLockWrite, &pmap) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; -#endif - // unlock page zero - - if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return 0; - - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) -{ -uid page_no = ROOT_page, prevpage = 0; -uint drill = 0xff, slot; -uint mode, prevmode; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? 
BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( bt_lockpage(bt, page_no, BtLockAccess, NULL) ) - return 0; - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode) ) - return 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( bt_lockpage(bt, page_no, mode, &bt->page) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode) ) - return 0; - else - continue; - } - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? - - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - else - drill--; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; - - // continue down / right using overlapping locks - // to protect pages being killed or split. 
- - prevmode = mode; - prevpage = bt->page_no; - page_no = bt_getid(slotptr(bt->page, slot)->id); - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // obtain Parent lock over write lock - - if( bt_lockpage(bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if ( bt_lockpage(bt, right, BtLockWrite, &bt->temp) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as deleted and point it to left page - // until we can post updates at higher level. 
- - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - bt_unlockpage(bt, bt->page_no, BtLockWrite); - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip cleanup if nothing to reclaim - - if( !page->dirty ) - return 0; - - memcpy 
(bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key in list - if( cnt < max && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - page->min = nxt; - page->cnt = idx; - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to page -// return with page unlocked - -BTERR bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split the root and raise the height of the btree - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // 
Obtain an empty page to use, and copy the left page - // contents into it. Strip foster child key. - // Save left fence key. - - bt->page->act--; - bt->page->cnt--; - bt->page->foster--; - key = keyptr(bt->page, bt->page->cnt); - memcpy (fencekey, key, key->len + 1); - - if( !(new_page = bt_newpage(bt, bt->page)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split already locked full node -// return unlocked. - -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. 
- - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. - - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // assemble old foster child keys - // add new foster child fence - - cnt = bt->frame->cnt - bt->frame->foster - 1; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // link new right page - - bt_putid (page->right, new_page); - - // put new page as smallest foster child key - - page->min = nxt; - page->cnt = idx; - cnt = page->cnt - page->foster++; - bt_putid 
(slotptr(page,cnt)->id, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock on page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix highest foster child on page - - if( bt_lockpage (bt, page_no, BtLockParent, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockRead, &page) ) - return bt->err; - - // get our old fence key - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead) ) - return bt->err; - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for largest possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // wait until readers from parent get their locks - - if( bt_lockpage (bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockWrite, &page) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove foster child from our page - // add our new fence key to parent - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - if( bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockDelete) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockParent) ) - return bt->err; - - return bt_unlockpage (bt, page_no, BtLockWrite); -} - -// Insert new key into the btree at leaf level. 
- -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - return bt_addkeytopage (bt, slot, key, len, id, tod); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( bt_lockpage(bt, right, BtLockRead, &page) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - 
return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, num; - char *infile; - BtMgr *mgr; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0; -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num ) - sprintf((char *)key+len, "%.9d", line), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, 
"finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint map = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - map = atoi(argv[4]); - - if( map > 65536 ) - fprintf (stderr, "Warning: mapped_pool > 65536 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, map, segsize, map / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - 
time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - - cnt = 0; - len = key[0] = 0; - bt = bt_open (mgr); - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) - cnt++; - - fprintf(stderr, " Total keys read %d\n", cnt); - - bt_close (bt); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreeb.c b/fosterbtreeb.c deleted file mode 100644 index 2211cb0..0000000 --- a/fosterbtreeb.c +++ /dev/null @@ -1,2192 +0,0 @@ -// foster btree version b -// 22 DEC 2013 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. 
-*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. 
- -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. - -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:6; // level of page - unsigned char kill:1; // page is being deleted - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for latch table implementation - -enum { - Write = 1, - Pending = 2, - Share = 4 -} LockMode; - -// latch table lock structure - -// exclusive is set for write access -// share is count of read accessors -// grant write lock when share == 0 - -typedef struct { - volatile uint exclusive:1; - volatile uint request:1; - volatile uint share:30; -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - uint pin; // mapped page pin counter - uint slot; // slot index in this array - void *hashprev; // previous pool entry for the same hash idx - void *hashnext; // next pool entry for the same hash idx -#ifndef unix - HANDLE hmap; -#endif -// array of page latch sets, one for each page 
in map segment - BtLatchSet pagelatch[0]; -} BtPool; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - uint poolcnt; // highest page pool node in use - uint poolmax; // highest page pool node allocated - uint poolmask; // total size of pages in mmap segment - 1 - uint hashsize; // size of Hash Table for pool entries - volatile uint evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtLatch *latch; // latches for hash table slots - char *nodes; // memory pool page segments -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage alloc; // frame buffer for alloc page ( page 0 ) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -bt_mgr (char *name, uint 
mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 -#define ROOT_page 1 -#define LEAF_page 2 - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. - -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. 
- -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = (BtPool *)(mgr->nodes + slot * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->nodes); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->nodes); - GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last; -BtPage alloc; -int lockmode; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = open ((char*)name, O_RDONLY); - lockmode = 0; - break; - } - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL); - lockmode = 0; - break; - } - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - alloc = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, alloc, BT_minpage, 0) == BT_minpage ) - bits = alloc->bits; - else - return free(mgr), free(alloc), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - alloc = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = 
GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->nodes = calloc (poolmax, (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 1) / 8); -#else - mgr->nodes = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); -#endif - - if( size || *amt ) - goto mgrxit; - - // initializes an empty b-tree with root page and page of leaves - - memset (alloc, 0, 1 << bits); - bt_putid(alloc->right, MIN_lvl+1); - alloc->bits = mgr->page_bits; - -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (alloc, 0, 1 << bits); - alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(alloc, 1)->off = mgr->page_size - 3; - 
bt_putid(slotptr(alloc, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - alloc->min = mgr->page_size - 3; - alloc->lvl = lvl; - alloc->cnt = 1; - alloc->act = 1; -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // create empty page area by writing last page of first - // segment area (other pages are zeroed by O/S) - - if( mgr->poolmask ) { - memset(alloc, 0, mgr->page_size); - last = mgr->poolmask; - - while( last < MIN_lvl + 1 ) - last += mgr->poolmask + 1; - -#ifdef unix - pwrite(mgr->idx, alloc, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - -mgrxit: -#ifdef unix - free (alloc); -#else - VirtualFree (alloc, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 
> len2 ? len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_readlock(BtLatch *latch) -{ - do { - // see if exclusive request is pending, or granted - - if( !(volatile int)latch->request && !(volatile int)latch->exclusive ) { - // add one to counter, check write bit -#ifdef unix - if( ~__sync_fetch_and_add((volatile int *)latch, Share) & Write ) - return; -#else - if( ~_InterlockedExchangeAdd((volatile int *)latch, Share) & Write ) - return; -#endif - // didn't get latch, reduce counter by one - -#ifdef unix - __sync_fetch_and_add((volatile int *)latch, -Share); -#else - _InterlockedExchangeAdd ((volatile int *)latch, -Share); -#endif - } - - // and yield -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -int prev; - - do { - // set exclusive access pending - -#ifdef unix - __sync_fetch_and_or((int *)latch, Pending); -#else - _InterlockedOr((int *)latch, Pending); -#endif - - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((volatile int *)latch, Write); -#else - prev = _InterlockedOr((volatile int *)latch, Write); -#endif - - // did we get exclusive access? 
- // if so, clear write pending - - if( !(prev & ~Pending) ) { -#ifdef unix - __sync_fetch_and_and((volatile int *)latch, ~Pending); -#else - _InterlockedAnd((volatile int *)latch, ~Pending); -#endif - return; - } - - // reset our Write mode if it was clear before - - if( !(prev & Write) ) { -#ifdef unix - __sync_fetch_and_and((volatile int *)latch, ~Write); -#else - _InterlockedAnd((volatile int *)latch, ~Write); -#endif - } - - // otherwise yield - -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write locked - -int bt_writetry(BtLatch *latch) -{ -int prev; - - // see if we can get write access - // with no readers -#ifdef unix - prev = __sync_fetch_and_or((volatile int *)latch, Write); -#else - prev = _InterlockedOr((volatile int *)latch, Write); -#endif - - // did we get exclusive access? - // if so, return OK - - if( !(prev & ~Pending) ) - return 1; - - // reset our Write mode if it was clear before - - if( !(prev & Write) ) { -#ifdef unix - __sync_fetch_and_and((volatile int *)latch, ~Write); -#else - _InterlockedAnd((volatile int *)latch, ~Write); -#endif - } - return 0; -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and((int *)latch, ~Write); -#else - _InterlockedAnd ((int *)latch, ~Write); -#endif -} - -// decrement reader count - -void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - __sync_fetch_and_add((int *)latch, -Share); -#else - _InterlockedExchangeAdd((int *)latch, -Share); -#endif -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * 
sizeof(BtLatchSet))); - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( slot = bt->mgr->hash[idx] ) { - node = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->hashnext = node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = (BtPool *)(bt->mgr->nodes + hashslot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1) / 8, 0, (bt->mgr->poolmask + 1)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? 
PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpage(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_releaseread (&bt->mgr->latch[idx]); - bt_writelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot = _InterlockedIncrement (&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - 
_InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); -#else - _InterlockedDecrement (&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_writetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_releasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_releasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. 
-// pin to buffer pool and return page pointer - -BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *pageptr) -{ -BtLatchSet *set; -BtPool *pool; -uint subpage; -BtPage page; - - // find/create maping in pool table - // and pin our pool slot - - if( pool = bt_pinpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping - else - return bt->err; - - set = pool->pagelatch + subpage; - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( !((bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1)/8)[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1)/8)[idx] |= 1 << bit; - } - } -#endif - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - - if( pageptr ) - *pageptr = page; - - return bt->err = 0; -} - -// remove write, read, or parent lock on requested page_no. 
- -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode) -{ -uint subpage, idx; -BtLatchSet *set; -BtPool *pool; - - // since page is pinned - // it should still be in the buffer pool - // and is in no danger of being a victim for reuse - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - if( pool = bt_findpool(bt, page_no, idx) ) - subpage = (uint)(page_no & bt->mgr->poolmask); - else - return bt->err = BTERR_hash; - - bt_releaseread (&bt->mgr->latch[idx]); - set = pool->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); -#else - _InterlockedDecrement (&pool->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page -// place on free chain out of allocator page - -BTERR bt_freepage(BtDb *bt, uid page_no) -{ - // obtain delete lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - // obtain write lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockWrite, &bt->temp) ) - return bt->err; - - // lock allocation page - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return bt->err; - - // store chain in second right - bt_putid(bt->temp->right, bt_getid(bt->alloc[1].right)); - bt_putid(bt->alloc[1].right, page_no); - - // unlock page zero - - if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return bt->err; - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - return 0; -} 
- -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -uid new_page; -BtPage pmap; -int reuse; - - // lock page zero - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return 0; - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->alloc[1].right) ) { - if( bt_lockpage (bt, new_page, BtLockWrite, &bt->temp) ) - return 0; - bt_putid(bt->alloc[1].right, bt_getid(bt->temp->right)); - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(bt->alloc->right); - bt_putid(bt->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into pool and copy page. - // this will extend the file into the new pages. - - if( bt_lockpage(bt, new_page, BtLockWrite, &pmap) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; -#endif - // unlock page zero - - if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return 0; - - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. 
the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) -{ -uid page_no = ROOT_page, prevpage = 0; -uint drill = 0xff, slot; -uint mode, prevmode; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( bt_lockpage(bt, page_no, BtLockAccess, NULL) ) - return 0; - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode) ) - return 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( bt_lockpage(bt, page_no, mode, &bt->page) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode) ) - return 0; - else - continue; - } - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? 
- - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - else - drill--; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; - - // continue down / right using overlapping locks - // to protect pages being killed or split. - - prevmode = mode; - prevpage = bt->page_no; - page_no = bt_getid(slotptr(bt->page, slot)->id); - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // obtain Parent lock over write lock - - if( bt_lockpage(bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if ( bt_lockpage(bt, right, BtLockWrite, &bt->temp) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as 
deleted and point it to left page - // until we can post updates at higher level. - - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - bt_unlockpage(bt, bt->page_no, BtLockWrite); - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip 
cleanup if nothing to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key and foster children in list - if( cnt < max - page->foster && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to page -// return with page unlocked - -BTERR bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split the root and raise the height 
of the btree - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - root->act--; - root->cnt--; - root->foster--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split already locked full node -// return unlocked. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new page to add - // as foster child - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // insert new foster child at beginning of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys if any - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock on page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix fence key and highest 
foster child on page - - if( bt_lockpage (bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // get our highest foster child key to find in parent node - - if( bt_lockpage (bt, page_no, BtLockRead, &page) ) - return bt->err; - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead) ) - return bt->err; - -try_again: - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for any possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // see if we are still a foster child from another node - - if( bt_getid (slotptr(bt->page, slot)->id) != page_no ) { - bt_unlockpage (bt, bt->page_no, BtLockWrite); -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - goto try_again; - } - - // wait until readers from parent get their locks - - if( bt_lockpage (bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockWrite, &page) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove highest foster child from our page - // add our new fence key to parent - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - if( bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockDelete) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - return bt_unlockpage (bt, page_no, BtLockParent); -} - -// Insert new key into the btree at leaf level. 
- -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - return bt_addkeytopage (bt, slot, key, len, id, tod); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( bt_lockpage(bt, right, BtLockRead, &page) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - 
return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 
'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - bt_lockpage (bt, page_no, BtLockRead, &page); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (bt, page_no, BtLockRead); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreec.c b/fosterbtreec.c deleted file mode 100644 index 3b07013..0000000 --- a/fosterbtreec.c +++ /dev/null @@ -1,2166 +0,0 @@ -// foster btree version d -// 26 DEC 2013 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. (set 1) AccessIntent: Sharable. Going to Read the node. 
Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. 
- -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:6; // level of page - unsigned char kill:1; // page is being deleted - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// latch table lock structure - -// exclusive is set for write access -// share is count of read accessors -// pending is count of waiting writers -// grant write lock when share == 0 - -typedef struct { -#ifdef unix - volatile uint exclusive:1; - volatile uint pending:15; - volatile uint share:16; - pthread_mutex_t mut[1]; - pthread_cond_t cond[1]; -#else - SRWLOCK srw[1]; -#endif -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - uint pin; // mapped page pin counter - uint slot; // slot index in this array - void *hashprev; // previous pool entry for the same hash idx - void *hashnext; // next pool entry for the same hash idx -#ifndef unix - HANDLE hmap; -#endif -// array of page latch sets, one for each page in map segment - BtLatchSet pagelatch[0]; -} BtPool; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - uint poolcnt; // highest page pool node in use - uint poolmax; // highest page pool node allocated - uint 
poolmask; // total size of pages in mmap segment - 1 - uint hashsize; // size of Hash Table for pool entries - volatile uint evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtLatch *latch; // latches for hash table slots - char *nodes; // memory pool page segments -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage alloc; // frame buffer for alloc page ( page 0 ) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 -#define ROOT_page 1 -#define LEAF_page 2 - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated 
//  from low and hi ends.
//  The key offsets and row-id's are allocated
//  from the bottom,  while the text of the key
//  is allocated from the top.  When the two
//  areas meet, the page is split into two.

//  A key consists of a length byte, two bytes of
//  index number (0 - 65534), and up to 253 bytes
//  of key value.  Duplicate keys are discarded.
//  Associated with each key is a 48 bit row-id.

//  The b-tree root is always located at page 1.
//	The first leaf page of level zero is always
//	located on page 2.

//	When the root page fills, it is split in two and
//	the tree height is raised by a new root at page
//	one with two keys.

//	Deleted keys are marked with a dead bit until
//	page cleanup.  The fence key for a node is always
//	present, even after deletion and cleanup.

//  Groups of pages called segments from the btree are
//  cached with memory mapping. A hash table is used to keep
//  track of the cached segments.  This behaviour is controlled
//  by the cache block size parameter to bt_open.

//  To achieve maximum concurrency one page is locked at a time
//  as the tree is traversed to find the leaf key in question.

//	An adoption traversal leaves the parent node locked as the
//	tree is traversed to the level in question.

//  Page 0 is dedicated to lock for new page extensions,
//	and chains empty pages together for reuse.

//	Empty pages are chained together through the ALLOC page and reused.
- -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = (BtPool *)(mgr->nodes + slot * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->nodes); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->nodes); - GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last, slot, idx; -BtPage alloc; -int lockmode; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = open ((char*)name, O_RDONLY); - lockmode = 0; - break; - } - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL); - lockmode = 0; - break; - } - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - alloc = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, alloc, BT_minpage, 0) == BT_minpage ) - bits = alloc->bits; - else - return free(mgr), free(alloc), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - alloc = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = 
GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->nodes = calloc (poolmax, (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 1) / 8); -#else - mgr->nodes = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); -#endif - - // initialize buffer pool page latches - - for( slot = 1; slot < poolmax; slot++ ) { - BtLatchSet *latchset = (BtLatchSet *)(mgr->nodes + slot * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - for( idx = 0; idx < mgr->poolmask + 1; idx++ ) { -#ifdef unix - pthread_mutex_init (latchset[idx].readwr->mut, NULL); - pthread_cond_init (latchset[idx].readwr->cond, NULL); - pthread_mutex_init (latchset[idx].access->mut, NULL); - pthread_cond_init (latchset[idx].access->cond, NULL); - pthread_mutex_init (latchset[idx].parent->mut, NULL); - pthread_cond_init (latchset[idx].parent->cond, NULL); -#else - InitializeSRWLock (latchset[idx].readwr->srw); - 
InitializeSRWLock (latchset[idx].access->srw); - InitializeSRWLock (latchset[idx].parent->srw); -#endif - } - } - - // initialize buffer pool mgr latches - - for( slot = 0; slot < hashsize; slot++ ) { -#ifdef unix - pthread_mutex_init (mgr->latch[slot].mut, NULL); - pthread_cond_init (mgr->latch[slot].cond, NULL); -#else - InitializeSRWLock (mgr->latch[slot].srw); -#endif - } - - if( size || *amt ) - goto mgrxit; - - // initializes an empty b-tree with root page and page of leaves - - memset (alloc, 0, 1 << bits); - bt_putid(alloc->right, MIN_lvl+1); - alloc->bits = mgr->page_bits; - -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (alloc, 0, 1 << bits); - alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(alloc, 1)->id, lvl ? 
MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - alloc->min = mgr->page_size - 3; - alloc->lvl = lvl; - alloc->cnt = 1; - alloc->act = 1; -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // create empty page area by writing last page of first - // segment area (other pages are zeroed by O/S) - - if( mgr->poolmask ) { - memset(alloc, 0, mgr->page_size); - last = mgr->poolmask; - - while( last < MIN_lvl + 1 ) - last += mgr->poolmask + 1; - -#ifdef unix - pwrite(mgr->idx, alloc, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - -mgrxit: -#ifdef unix - free (alloc); -#else - VirtualFree (alloc, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -// wait if exclusive request is pending, or granted -// and add 1 to the share count - -void bt_readlock(BtLatch *latch) -{ -#ifdef unix - pthread_mutex_lock (latch->mut); - - while( latch->pending || latch->exclusive ) - pthread_cond_wait (latch->cond, latch->mut); - - // add one to readers counter - - latch->share++; - pthread_mutex_unlock (latch->mut); -#else - AcquireSRWLockShared (latch->srw); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -#ifdef unix - pthread_mutex_lock (latch->mut); - latch->pending++; - - while( latch->share || latch->exclusive ) - pthread_cond_wait (latch->cond, latch->mut); - - latch->exclusive = 1; - latch->pending--; - pthread_mutex_unlock (latch->mut); -#else - AcquireSRWLockExclusive (latch->srw); -#endif -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write or read locked - -int bt_writetry(BtLatch *latch) -{ -int result = 0; - -#ifdef unix - pthread_mutex_lock (latch->mut); - - if( !latch->share && !latch->exclusive ) - result = latch->exclusive = 1; - - pthread_mutex_unlock (latch->mut); -#else - result = TryAcquireSRWLockExclusive (latch->srw); -#endif - return result; -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - pthread_mutex_lock (latch->mut); - latch->exclusive = 0; - pthread_cond_broadcast (latch->cond); - pthread_mutex_unlock (latch->mut); -#else - ReleaseSRWLockExclusive (latch->srw); -#endif -} - -// decrement reader count - -void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - pthread_mutex_lock (latch->mut); - - if( !--latch->share && latch->pending ) - pthread_cond_broadcast (latch->cond); - - pthread_mutex_unlock (latch->mut); -#else - ReleaseSRWLockShared (latch->srw); -#endif -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with 
hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( slot = bt->mgr->hash[idx] ) { - node = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->hashnext = node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = (BtPool *)(bt->mgr->nodes + hashslot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 
0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1) / 8, 0, (bt->mgr->poolmask + 1)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpage(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_releaseread (&bt->mgr->latch[idx]); - bt_writelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot = _InterlockedIncrement 
(&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); -#else - _InterlockedDecrement (&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_writetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_releasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_releasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); 
-#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. -// pin to buffer pool and return page pointer - -BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *pageptr) -{ -BtLatchSet *set; -BtPool *pool; -uint subpage; -BtPage page; - - // find/create maping in pool table - // and pin our pool slot - - if( pool = bt_pinpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping - else - return bt->err; - - set = pool->pagelatch + subpage; - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( !((bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1)/8)[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * (bt->mgr->poolmask + 1)/8)[idx] |= 1 << bit; - } - } -#endif - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - - if( pageptr ) - *pageptr = page; - - return bt->err = 0; -} - -// remove write, read, or parent lock on requested page_no. 
- -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode) -{ -uint subpage, idx; -BtLatchSet *set; -BtPool *pool; - - // since page is pinned - // it should still be in the buffer pool - // and is in no danger of being a victim for reuse - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - if( pool = bt_findpool(bt, page_no, idx) ) - subpage = (uint)(page_no & bt->mgr->poolmask); - else - return bt->err = BTERR_hash; - - bt_releaseread (&bt->mgr->latch[idx]); - set = pool->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); -#else - _InterlockedDecrement (&pool->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page -// place on free chain out of allocator page - -BTERR bt_freepage(BtDb *bt, uid page_no) -{ - // obtain delete lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - // obtain write lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockWrite, &bt->temp) ) - return bt->err; - - // lock allocation page - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return bt->err; - - // store chain in second right - bt_putid(bt->temp->right, bt_getid(bt->alloc[1].right)); - bt_putid(bt->alloc[1].right, page_no); - - // unlock page zero - - if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return bt->err; - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - return 0; -} 
- -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -uid new_page; -BtPage pmap; -int reuse; - - // lock page zero - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return 0; - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->alloc[1].right) ) { - if( bt_lockpage (bt, new_page, BtLockWrite, &bt->temp) ) - return 0; - bt_putid(bt->alloc[1].right, bt_getid(bt->temp->right)); - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(bt->alloc->right); - bt_putid(bt->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into pool and copy page. - // this will extend the file into the new pages. - - if( bt_lockpage(bt, new_page, BtLockWrite, &pmap) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; -#endif - // unlock page zero - - if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return 0; - - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. 
the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) -{ -uid page_no = ROOT_page, prevpage = 0; -uint drill = 0xff, slot; -uint mode, prevmode; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( bt_lockpage(bt, page_no, BtLockAccess, NULL) ) - return 0; - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode) ) - return 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( bt_lockpage(bt, page_no, mode, &bt->page) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode) ) - return 0; - else - continue; - } - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? 
- - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - else - drill--; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; - - // continue down / right using overlapping locks - // to protect pages being killed or split. - - prevmode = mode; - prevpage = bt->page_no; - page_no = bt_getid(slotptr(bt->page, slot)->id); - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // obtain Parent lock over write lock - - if( bt_lockpage(bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if ( bt_lockpage(bt, right, BtLockWrite, &bt->temp) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as 
deleted and point it to left page - // until we can post updates at higher level. - - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - bt_unlockpage(bt, bt->page_no, BtLockWrite); - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip 
cleanup if nothing to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key and foster children in list - if( cnt < max - page->foster && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to page -// return with page unlocked - -BTERR bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split the root and raise the height 
of the btree - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - root->act--; - root->cnt--; - root->foster--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split already locked full node -// return unlocked. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new page to add - // as foster child - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // insert new foster child at beginning of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys if any - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock on page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix fence key and highest 
foster child on page - - if( bt_lockpage (bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // get our highest foster child key to find in parent node - - if( bt_lockpage (bt, page_no, BtLockRead, &page) ) - return bt->err; - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead) ) - return bt->err; - -try_again: - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for any possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // see if we are still a foster child from another node - - if( bt_getid (slotptr(bt->page, slot)->id) != page_no ) { - bt_unlockpage (bt, bt->page_no, BtLockWrite); -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - goto try_again; - } - - // wait until readers from parent get their locks - - if( bt_lockpage (bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockWrite, &page) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove highest foster child from our page - // add our new fence key to parent - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - if( bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockDelete) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - return bt_unlockpage (bt, page_no, BtLockParent); -} - -// Insert new key into the btree at leaf level. 
- -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - return bt_addkeytopage (bt, slot, key, len, id, tod); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( bt_lockpage(bt, right, BtLockRead, &page) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - 
return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 
'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - bt_lockpage (bt, page_no, BtLockRead, &page); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (bt, page_no, BtLockRead); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreed.c b/fosterbtreed.c deleted file mode 100644 index e98234d..0000000 --- a/fosterbtreed.c +++ /dev/null @@ -1,2130 +0,0 @@ -// foster btree version d -// 24 DEC 2013 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. (set 1) AccessIntent: Sharable. Going to Read the node. 
Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. 
- -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:6; // level of page - unsigned char kill:1; // page is being deleted - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// latch table lock structure - -typedef struct { -#ifdef unix - pthread_rwlock_t lock[1]; -#else - SRWLOCK srw[1]; -#endif -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - uint pin; // mapped page pin counter - uint slot; // slot index in this array - void *hashprev; // previous pool entry for the same hash idx - void *hashnext; // next pool entry for the same hash idx -#ifndef unix - HANDLE hmap; -#endif -// array of page latch sets, one for each page in map segment - BtLatchSet pagelatch[0]; -} BtPool; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - uint poolcnt; // highest page pool node in use - uint poolmax; // highest page pool node allocated - uint poolmask; // total size of pages in mmap segment - 1 - uint hashsize; // size of Hash Table for pool entries - volatile uint evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtLatch *latch; // latches for hash table slots - char 
*nodes; // memory pool page segments -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage alloc; // frame buffer for alloc page ( page 0 ) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 -#define ROOT_page 1 -#define LEAF_page 2 - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. 
- -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. - -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. 
- -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = (BtPool *)(mgr->nodes + slot * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->nodes); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->nodes); - GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last, slot, idx; -BtPage alloc; -int lockmode; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; - -#ifdef unix -pthread_rwlockattr_t rwattr[1]; -#else -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = open ((char*)name, O_RDONLY); - lockmode = 0; - break; - } - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - switch (mode & 0x7fff) - { - case BT_rw: - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - lockmode = 1; - break; - - case BT_ro: - default: - mgr->idx = CreateFile(name, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, attr, NULL); - lockmode = 0; - break; - } - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - alloc = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, alloc, BT_minpage, 0) == BT_minpage ) - bits = alloc->bits; - else - return free(mgr), free(alloc), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - alloc = VirtualAlloc(NULL, BT_maxpage, 
MEM_COMMIT, PAGE_READWRITE); - size = GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)alloc, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->nodes = calloc (poolmax, (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8); -#else - mgr->nodes = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * (sizeof(BtPool) + (mgr->poolmask + 1) * sizeof(BtLatchSet))); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); -#endif - -#ifdef unix - pthread_rwlockattr_init (rwattr); - pthread_rwlockattr_setkind_np (rwattr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); -#endif - - // initialize buffer pool mgr latches - - for( slot = 0; slot < hashsize; slot++ ) { -#ifdef unix - pthread_rwlock_init (mgr->latch[slot].lock, rwattr); -#else - InitializeSRWLock (mgr->latch[slot].srw); -#endif - } - - // initialize buffer pool page latches -#ifdef unix -// pthread_rwlockattr_setpshared (rwattr, PTHREAD_PROCESS_SHARED); -#endif - for( slot = 1; slot < poolmax; slot++ ) { - BtLatchSet *latchset = (BtLatchSet *)(mgr->nodes + slot * (sizeof(BtPool) + 
(mgr->poolmask + 1) * sizeof(BtLatchSet)) + sizeof(BtPool)); - for( idx = 0; idx < mgr->poolmask + 1; idx++ ) { -#ifdef unix - pthread_rwlock_init (latchset[idx].readwr->lock, rwattr); - pthread_rwlock_init (latchset[idx].access->lock, rwattr); - pthread_rwlock_init (latchset[idx].parent->lock, rwattr); -#else - InitializeSRWLock (latchset[idx].readwr->srw); - InitializeSRWLock (latchset[idx].access->srw); - InitializeSRWLock (latchset[idx].parent->srw); -#endif - } - } - - if( size || *amt ) - goto mgrxit; - - // initializes an empty b-tree with root page and page of leaves - - memset (alloc, 0, 1 << bits); - bt_putid(alloc->right, MIN_lvl+1); - alloc->bits = mgr->page_bits; - -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (alloc, 0, 1 << bits); - alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(alloc, 1)->id, lvl ? 
MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - alloc->min = mgr->page_size - 3; - alloc->lvl = lvl; - alloc->cnt = 1; - alloc->act = 1; -#ifdef unix - if( write (mgr->idx, alloc, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // create empty page area by writing last page of first - // segment area (other pages are zeroed by O/S) - - if( mgr->poolmask ) { - memset(alloc, 0, mgr->page_size); - last = mgr->poolmask; - - while( last < MIN_lvl + 1 ) - last += mgr->poolmask + 1; - -#ifdef unix - pwrite(mgr->idx, alloc, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)alloc, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - -mgrxit: -#ifdef unix - free (alloc); -#else - VirtualFree (alloc, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -void bt_readlock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_rdlock (latch->lock); -#else - AcquireSRWLockShared (latch->srw); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_wrlock (latch->lock); -#else - AcquireSRWLockExclusive (latch->srw); -#endif -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write or read locked - -int bt_writetry(BtLatch *latch) -{ -int result = 0; - -#ifdef unix - result = !pthread_rwlock_trywrlock (latch->lock); -#else - result = TryAcquireSRWLockExclusive (latch->srw); -#endif - return result; -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockExclusive (latch->srw); -#endif -} - -// decrement reader count - -void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockShared (latch->srw); -#endif -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( 
slot = bt->mgr->hash[idx] ) { - node = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->hashnext = node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = (BtPool *)(bt->mgr->nodes + hashslot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8) / 8), 0, (bt->mgr->poolmask + 8)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? 
FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpage(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_releaseread (&bt->mgr->latch[idx]); - bt_writelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot = _InterlockedIncrement (&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = (BtPool *)(bt->mgr->nodes + slot * (sizeof(BtPool) + (bt->mgr->poolmask + 1) * sizeof(BtLatchSet))); - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); 
-#else - _InterlockedDecrement (&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_writetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_releasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_releasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement (&pool->pin); -#endif - bt_releasewrite (&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. 
-// pin to buffer pool and return page pointer - -BTERR bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *pageptr) -{ -BtLatchSet *set; -BtPool *pool; -uint subpage; -BtPage page; - - // find/create maping in pool table - // and pin our pool slot - - if( pool = bt_pinpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping - else - return bt->err; - - set = pool->pagelatch + subpage; - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( ~((bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] |= 1 << bit; - } - } -#endif - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - - if( pageptr ) - *pageptr = page; - - return bt->err = 0; -} - -// remove write, read, or parent lock on requested page_no. 
- -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode) -{ -uint subpage, idx; -BtLatchSet *set; -BtPool *pool; - - // since page is pinned - // it should still be in the buffer pool - // and is in no danger of being a victim for reuse - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_readlock (&bt->mgr->latch[idx]); - - if( pool = bt_findpool(bt, page_no, idx) ) - subpage = (uint)(page_no & bt->mgr->poolmask); - else - return bt->err = BTERR_hash; - - bt_releaseread (&bt->mgr->latch[idx]); - set = pool->pagelatch + subpage; - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); -#else - _InterlockedDecrement (&pool->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page -// place on free chain out of allocator page - -BTERR bt_freepage(BtDb *bt, uid page_no) -{ - // obtain delete lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - // obtain write lock on deleted page - - if( bt_lockpage(bt, page_no, BtLockWrite, &bt->temp) ) - return bt->err; - - // lock allocation page - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return bt->err; - - // store chain in second right - bt_putid(bt->temp->right, bt_getid(bt->alloc[1].right)); - bt_putid(bt->alloc[1].right, page_no); - - // unlock page zero - - if( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return bt->err; - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - return 0; -} 
- -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -uid new_page; -BtPage pmap; -int reuse; - - // lock page zero - - if ( bt_lockpage(bt, ALLOC_page, BtLockWrite, &bt->alloc) ) - return 0; - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->alloc[1].right) ) { - if( bt_lockpage (bt, new_page, BtLockWrite, &bt->temp) ) - return 0; - bt_putid(bt->alloc[1].right, bt_getid(bt->temp->right)); - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(bt->alloc->right); - bt_putid(bt->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into pool and copy page. - // this will extend the file into the new pages. - - if( bt_lockpage(bt, new_page, BtLockWrite, &pmap) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite) ) - return 0; -#endif - // unlock page zero - - if ( bt_unlockpage(bt, ALLOC_page, BtLockWrite) ) - return 0; - - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. 
the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) -{ -uid page_no = ROOT_page, prevpage = 0; -uint drill = 0xff, slot; -uint mode, prevmode; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( bt_lockpage(bt, page_no, BtLockAccess, NULL) ) - return 0; - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode) ) - return 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( bt_lockpage(bt, page_no, mode, &bt->page) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode) ) - return 0; - else - continue; - } - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? 
- - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - else - drill--; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; - - // continue down / right using overlapping locks - // to protect pages being killed or split. - - prevmode = mode; - prevpage = bt->page_no; - page_no = bt_getid(slotptr(bt->page, slot)->id); - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite); - - // obtain Parent lock over write lock - - if( bt_lockpage(bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if ( bt_lockpage(bt, right, BtLockWrite, &bt->temp) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as 
deleted and point it to left page - // until we can post updates at higher level. - - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - bt_unlockpage(bt, bt->page_no, BtLockWrite); - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip 
cleanup if nothing to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key and foster children in list - if( cnt < max - page->foster && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to page -// return with page unlocked - -BTERR bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split the root and raise the height 
of the btree - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - root->act--; - root->cnt--; - root->foster--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, bt->page_no, BtLockWrite); -} - -// split already locked full node -// return unlocked. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new page to add - // as foster child - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // insert new foster child at beginning of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys if any - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock on page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix fence key and highest 
foster child on page - - if( bt_lockpage (bt, page_no, BtLockParent, NULL) ) - return bt->err; - - // get our highest foster child key to find in parent node - - if( bt_lockpage (bt, page_no, BtLockRead, &page) ) - return bt->err; - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead) ) - return bt->err; - -try_again: - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for any possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // see if we are still a foster child from another node - - if( bt_getid (slotptr(bt->page, slot)->id) != page_no ) { - bt_unlockpage (bt, bt->page_no, BtLockWrite); -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - goto try_again; - } - - // wait until readers from parent get their locks - - if( bt_lockpage (bt, page_no, BtLockDelete, NULL) ) - return bt->err; - - if( bt_lockpage (bt, page_no, BtLockWrite, &page) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove highest foster child from our page - // add our new fence key to parent - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - if( bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockDelete) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - - return bt_unlockpage (bt, page_no, BtLockParent); -} - -// Insert new key into the btree at leaf level. 
- -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - return bt_addkeytopage (bt, slot, key, len, id, tod); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( bt_lockpage(bt, right, BtLockRead, &page) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - 
return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 
'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - bt_lockpage (bt, page_no, BtLockRead, &page); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (bt, page_no, BtLockRead); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreee1.c b/fosterbtreee1.c deleted file mode 100644 index 2b45ce8..0000000 --- a/fosterbtreee1.c +++ /dev/null @@ -1,2529 +0,0 @@ -// foster btree version e -// 17 JAN 2014 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_latchtable 128 // number of latch manager slots - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. 
(set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent, - BtLockPin -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. 
- -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:6; // level of page - unsigned char kill:1; // page is being deleted - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for hash latch implementation - -enum { - Mutex = 1, - Write = 2, - Pending = 4, - Share = 8 -} LockMode; - -// mutex locks the other fields -// exclusive is set for write access -// share is count of read accessors - -typedef struct { - volatile ushort mutex:1; - volatile ushort exclusive:1; - volatile ushort pending:1; - volatile ushort share:13; -} BtSpinLatch; - -// hash table entries - -typedef struct { - BtSpinLatch latch[1]; - volatile ushort slot; // Latch table entry at head of chain -} BtHashEntry; - -// latch table lock structure -// implements a fair read-write lock - -typedef struct { -#ifdef unix - pthread_rwlock_t lock[1]; -#else - SRWLOCK srw[1]; -#endif -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children - BtSpinLatch busy[1]; // slot is being moved between chains - volatile ushort next; // next entry in hash table chain - volatile ushort prev; // prev entry in hash table chain - volatile ushort pin; // number of outstanding locks - volatile ushort hash; // hash slot entry is under - volatile uid page_no; // latch set page number -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - ushort pin; // mapped page pin counter - ushort slot; // slot index in this array - void *hashprev; // previous pool entry for the 
same hash idx - void *hashnext; // next pool entry for the same hash idx -#ifndef unix - HANDLE hmap; // Windows memory mapping handle -#endif -} BtPool; - -// structure for latch manager on ALLOC_page - -typedef struct { - struct Page alloc[2]; // next & free page_nos in right ptr - BtSpinLatch lock[1]; // allocation area lite latch - ushort latchdeployed; // highest number of latch entries deployed - ushort nlatchpage; // number of latch pages at BT_latch - ushort latchtotal; // number of page latch entries - ushort latchhash; // number of latch hash table slots - ushort latchvictim; // next latch entry to examine - BtHashEntry table[0]; // the hash table -} BtLatchMgr; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - ushort poolcnt; // highest page pool node in use - ushort poolmax; // highest page pool node allocated - ushort poolmask; // total size of pages in mmap segment - 1 - ushort hashsize; // size of Hash Table for pool entries - ushort evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtPool *pool; // memory pool page segments - BtSpinLatch *latch; // latches for pool hash slots - BtLatchMgr *latchmgr; // mapped latch page from allocation page - BtLatchSet *latchsets; // mapped latch set from latch pages -#ifndef unix - HANDLE halloc; // allocation and latch table handle -#endif -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // 
current page number - uid cursor_page; // current cursor page number - BtLatchSet *set; // current page latch set - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash, - BTERR_latch -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 // allocation & lock manager hash table -#define ROOT_page 1 // root of the btree -#define LEAF_page 2 // first page of leaves -#define LATCH_page 3 // pages for lock manager - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. 
- -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. - -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_spinreadlock(BtSpinLatch *latch) -{ -ushort prev; - - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex) & Mutex ) - SwitchToThread(); -#endif - - // see if exclusive request is granted or pending - - if( prev = !(latch->exclusive | latch->pending) ) -#ifdef unix - __sync_fetch_and_add((ushort *)latch, Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, Share); -#endif - -#ifdef unix - 
__sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - if( prev ) - return; -#ifdef unix - } while( sched_yield(), 1 ); -#else - } while( SwitchToThread(), 1 ); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_spinwritelock(BtSpinLatch *latch) -{ -ushort prev; - - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex | Pending) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex | Pending) & Mutex ) - SwitchToThread(); -#endif - if( prev = !(latch->share | latch->exclusive) ) -#ifdef unix - __sync_fetch_and_or((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~(Mutex | Pending)); -#else - _InterlockedAnd16((ushort *)latch, ~(Mutex | Pending)); -#endif - if( prev ) - return; -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 otherwise - -int bt_spinwritetry(BtSpinLatch *latch) -{ -ushort prev; - -#ifdef unix - if( prev = __sync_fetch_and_or((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#else - if( prev = _InterlockedOr16((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#endif - // take write access if all bits are clear - - if( !prev ) -#ifdef unix - __sync_fetch_and_or ((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - return !prev; -} - -// clear write mode - -void bt_spinreleasewrite(BtSpinLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Write); -#else - _InterlockedAnd16((ushort *)latch, ~Write); -#endif -} - -// decrement reader count - -void bt_spinreleaseread(BtSpinLatch *latch) -{ -#ifdef unix - __sync_fetch_and_add((ushort *)latch, -Share); -#else - 
_InterlockedExchangeAdd16 ((ushort *)latch, -Share); -#endif -} - -void bt_initlockset (BtLatchSet *set) -{ -#ifdef unix -pthread_rwlockattr_t rwattr[1]; - - pthread_rwlockattr_init (rwattr); - pthread_rwlockattr_setkind_np (rwattr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); - pthread_rwlockattr_setpshared (rwattr, PTHREAD_PROCESS_SHARED); - - pthread_rwlock_init (set->readwr->lock, rwattr); - pthread_rwlock_init (set->access->lock, rwattr); - pthread_rwlock_init (set->parent->lock, rwattr); - pthread_rwlockattr_destroy (rwattr); -#else - InitializeSRWLock (set->readwr->srw); - InitializeSRWLock (set->access->srw); - InitializeSRWLock (set->parent->srw); -#endif -} - -// link latch table entry into latch hash table - -void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no) -{ -BtLatchSet *set = bt->mgr->latchsets + victim; - - if( set->next = bt->mgr->latchmgr->table[hashidx].slot ) - bt->mgr->latchsets[set->next].prev = victim; - - bt->mgr->latchmgr->table[hashidx].slot = victim; - set->page_no = page_no; - set->hash = hashidx; - set->prev = 0; -} - -// find existing latchset or inspire new one -// return with latchset pinned - -BtLatchSet *bt_bindlatch (BtDb *bt, uid page_no, int incr) -{ -ushort hashidx = page_no % bt->mgr->latchmgr->latchhash; -ushort slot, avail = 0, victim, idx; -BtLatchSet *set; - - // obtain read lock on hash table entry - - bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - } while( slot = set->next ); - - if( slot && incr ) { -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - } - - bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch); - - if( slot ) - return set; - - // try again, this time with write lock - - bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = 
bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - if( !set->pin && !avail ) - avail = slot; - } while( slot = set->next ); - - // found our entry, or take over an unpinned one - - if( slot || (slot = avail) ) { - set = bt->mgr->latchsets + slot; - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - set->page_no = page_no; - bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - - // see if there are any unused entries -#ifdef unix - victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, 1) + 1; -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - - if( victim < bt->mgr->latchmgr->latchtotal ) { - set = bt->mgr->latchsets + victim; - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - bt_initlockset (set); - bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - -#ifdef unix - victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1); -#else - victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - // find and reuse previous lock entry - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1; -#endif - // we don't use slot zero - - if( victim %= bt->mgr->latchmgr->latchtotal ) - set = bt->mgr->latchsets + victim; - else - continue; - - // take control of our slot - // from other threads - - if( set->pin || !bt_spinwritetry (set->busy) ) - continue; - - idx = set->hash; - - // try to get write lock on hash chain - // skip entry if not obtained - // or has outstanding locks - - if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) { - bt_spinreleasewrite 
(set->busy); - continue; - } - - if( set->pin ) { - bt_spinreleasewrite (set->busy); - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); - continue; - } - - // unlink our available victim from its hash chain - - if( set->prev ) - bt->mgr->latchsets[set->prev].next = set->next; - else - bt->mgr->latchmgr->table[idx].slot = set->next; - - if( set->next ) - bt->mgr->latchsets[set->next].prev = set->prev; - - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); - - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - - bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - bt_spinreleasewrite (set->busy); - return set; - } -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = mgr->pool + slot; - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->pool); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->pool); - GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last, slot, idx; -uint nlatchpage, latchhash; -BtLatchMgr *latchmgr; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; -int flag; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - latchmgr = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage ) - bits = latchmgr->alloc->bits; - else - return free(mgr), free(latchmgr), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = latchmgr->alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - 
mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->pool = calloc (poolmax, sizeof(BtPool)); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtSpinLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8); -#else - mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool)); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch)); -#endif - - if( size || *amt ) - goto mgrlatch; - - // initialize an empty b-tree with latch page, root page, page of leaves - // and page(s) of latches - - memset (latchmgr, 0, 1 << bits); - nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1; - bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage); - latchmgr->alloc->bits = mgr->page_bits; - - latchmgr->nlatchpage = nlatchpage; - latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet)); - - // initialize latch manager - - latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry); - - // size of hash table = total number of latchsets - - if( latchhash > latchmgr->latchtotal ) - latchhash = latchmgr->latchtotal; - - latchmgr->latchhash = latchhash; - -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (latchmgr, 0, 1 
<< bits); - latchmgr->alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(latchmgr->alloc, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(latchmgr->alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - latchmgr->alloc->min = mgr->page_size - 3; - latchmgr->alloc->lvl = lvl; - latchmgr->alloc->cnt = 1; - latchmgr->alloc->act = 1; -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // clear out latch manager locks - // and rest of pages to round out segment - - memset(latchmgr, 0, mgr->page_size); - last = MIN_lvl + 1; - - while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) { -#ifdef unix - pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - last++; - } - -mgrlatch: -#ifdef unix - flag = PROT_READ | PROT_WRITE; - mgr->latchmgr = mmap (0, mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size); - if( mgr->latchmgr == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; - mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size); - if( mgr->latchsets == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; -#else - flag = PAGE_READWRITE; - mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, 
NULL); - if( !mgr->halloc ) - return bt_mgrclose (mgr), NULL; - - flag = FILE_MAP_WRITE; - mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size); - if( !mgr->latchmgr ) - return GetLastError(), bt_mgrclose (mgr), NULL; - - mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size); -#endif - -#ifdef unix - free (latchmgr); -#else - VirtualFree (latchmgr, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -void bt_readlock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_rdlock (latch->lock); -#else - AcquireSRWLockShared (latch->srw); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_wrlock (latch->lock); -#else - AcquireSRWLockExclusive (latch->srw); -#endif -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write or read locked - -int bt_writetry(BtLatch *latch) -{ -int result = 0; - -#ifdef unix - result = !pthread_rwlock_trywrlock (latch->lock); -#else - result = TryAcquireSRWLockExclusive (latch->srw); -#endif - return result; -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockExclusive (latch->srw); -#endif -} - -// decrement reader count - -void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockShared (latch->srw); -#endif -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = bt->mgr->pool + slot; - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( slot = bt->mgr->hash[idx] ) { - node = bt->mgr->pool + slot; - pool->hashnext = 
node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = bt->mgr->pool + hashslot; - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8) / 8), 0, (bt->mgr->poolmask + 8)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? 
FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpage(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_spinreleaseread (&bt->mgr->latch[idx]); - bt_spinwritelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot = _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = bt->mgr->pool + slot; - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); -#else - _InterlockedDecrement16 
(&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_spinwritetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_spinreleasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_spinreleasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. 
-// pin to buffer pool and return latchset pointer - -BtLatchSet *bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *pageptr, BtLatchSet *set) -{ -BtPool *pool; -uint subpage; -BtPage page; - - // find/create maping in pool table - // and pin our pool slot - - if( pool = bt_pinpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping - else - return NULL; - - if( set ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - else if( !(set = bt_bindlatch (bt, page_no, 1)) ) - return NULL; - - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); - -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( mode == BtLockRead || mode == BtLockWrite ) - if( ~((bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] |= 1 << bit; - } - } -#endif - - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - case BtLockPin: - break; - default: - return bt->err = BTERR_lock, NULL; - } - - if( pageptr ) - *pageptr = page; - - return set; -} - -// remove write, read, or parent lock on requested page_no. 
- -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode, BtLatchSet *set) -{ -BtPool *pool; -uint idx; - - // since page is pinned - // it should still be in the buffer pool - // and is in no danger of being a victim for reuse - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); - - if( !(pool = bt_findpool(bt, page_no, idx)) ) - return bt->err = BTERR_hash; - - bt_spinreleaseread (&bt->mgr->latch[idx]); - - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - case BtLockPin: - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); - __sync_fetch_and_add (&set->pin, -1); -#else - _InterlockedDecrement16 (&pool->pin); - _InterlockedDecrement16 (&set->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page -// place on free chain out of allocator page -// fence key must already be removed from parent - -BTERR bt_freepage(BtDb *bt, uid page_no, BtLatchSet *set) -{ - // obtain delete lock on deleted page - - if( !bt_lockpage(bt, page_no, BtLockDelete, NULL, set) ) - return bt->err; - - // obtain write lock on deleted page - - if( !bt_lockpage(bt, page_no, BtLockWrite, &bt->temp, set) ) - return bt->err; - - // lock allocation page - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // store free chain in allocation page second right - bt_putid(bt->temp->right, bt_getid(bt->mgr->latchmgr->alloc[1].right)); - bt_putid(bt->mgr->latchmgr->alloc[1].right, page_no); - - // unlock page zero - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite, set) ) - return bt->err; - - // remove delete lock on 
deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete, set) ) - return bt->err; - - return 0; -} - -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -BtLatchSet *set; -uid new_page; -BtPage pmap; -int reuse; - - // lock allocation page - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->mgr->latchmgr->alloc[1].right) ) { - if( !(set = bt_lockpage (bt, new_page, BtLockWrite, &bt->temp, NULL)) ) - return 0; - bt_putid(bt->mgr->latchmgr->alloc[1].right, bt_getid(bt->temp->right)); - if( bt_unlockpage (bt, new_page, BtLockWrite, set) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(bt->mgr->latchmgr->alloc->right); - bt_putid(bt->mgr->latchmgr->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into pool and copy page. - // this will extend the file into the new pages. 
- - if( !(set = bt_lockpage(bt, new_page, BtLockWrite, &pmap, NULL)) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite, set) ) - return 0; -#endif - // unlock allocation latch and return new page no - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, BtLock lock) -{ -uid page_no = ROOT_page, prevpage = 0; -BtLatchSet *set, *prevset; -uint drill = 0xff, slot; -uint mode, prevmode; - - bt->set = NULL; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? 
BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( !(bt->set = bt_lockpage(bt, page_no, BtLockAccess, NULL, NULL)) ) - return 0; - - // now unlock our (possibly foster) parent - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode, prevset) ) - return 0; - else - prevpage = 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( !(bt->set = bt_lockpage(bt, page_no, mode, &bt->page, bt->set)) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess, bt->set) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode, bt->set) ) - return 0; - else - continue; - } - - prevpage = bt->page_no; - prevset = bt->set; - prevmode = mode; - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? - - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - goto slideright; - - if( slot <= bt->page->cnt - bt->page->foster ) - drill--; - - // continue down / right using overlapping locks - // to protect pages being killed or split. 
- - page_no = bt_getid(slotptr(bt->page, slot)->id); - continue; - -slideright: - page_no = bt_getid(bt->page->right); - - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -BtLatchSet *rset, *set; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - set = bt->set; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite, set); - - // obtain Parent lock over write lock - - if( !bt_lockpage(bt, page_no, BtLockParent, NULL, set) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if( !(rset = bt_lockpage(bt, right, BtLockWrite, &bt->temp, NULL)) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as deleted and point it to left page - // until we can post updates at higher level. 
- - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite, rset) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite, set) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - - if( bt_unlockpage(bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right, rset) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent, set) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if( bt_unlockpage (bt, bt->page_no, BtLockRead, bt->set) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip cleanup if nothing 
to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key and foster children in list - if( cnt < max - page->foster && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to current page -// page must already be writelocked - -void bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; -} - -// split the root and raise the height of the btree -// call with current page locked and page no of 
foster child -// return with current page (root) unlocked - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - root->act--; - root->cnt--; - root->foster--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, ROOT_page, BtLockWrite, bt->set); -} - -// split already locked full node -// in current page variables -// return unlocked. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtLatchSet *set = bt->set; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new page to add - // as foster child - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // insert new foster child at beginning of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys if any - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // keep our latch set - // release wr lock on our page - - if( !bt_lockpage (bt, page_no, BtLockPin, NULL, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite, 
set) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix fence key and highest foster child on page - - if( !bt_lockpage (bt, page_no, BtLockParent, NULL, set) ) - return bt->err; - - // get our highest foster child key to find in parent node - - if( !bt_lockpage (bt, page_no, BtLockRead, &page, set) ) - return bt->err; - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead, set) ) - return bt->err; - - // update our parent -try_again: - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for any possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // see if we are still a foster child from another node - - if( bt_getid (slotptr(bt->page, slot)->id) != page_no ) { - if( bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - goto try_again; - } - - // wait until readers from parent get their locks - // on our page - - if( !bt_lockpage (bt, page_no, BtLockDelete, NULL, set) ) - return bt->err; - - // lock our page for writing - - if( !bt_lockpage (bt, page_no, BtLockWrite, &page, set) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove highest foster child from our page - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - // add our new fence key for foster child to our parent - - bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod); - - if( bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, 
BtLockDelete, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockParent, set) ) - return bt->err; - - // release extra latch pin - - return bt_unlockpage (bt, page_no, BtLockPin, set); -} - -// Insert new key into the btree at leaf level. - -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite, bt->set); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - bt_addkeytopage (bt, slot, key, len, id, tod); - - return bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead, bt->set) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtLatchSet *rset; -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; 
- else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( !(bt->set = bt_lockpage(bt, right, BtLockRead, &page, NULL)) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead, bt->set) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - 
line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. 
Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - bt->set = bt_lockpage (bt, page_no, BtLockRead, &page, NULL); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (bt, page_no, BtLockRead, bt->set); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreee2.c b/fosterbtreee2.c deleted file mode 100644 index 86f836d..0000000 --- a/fosterbtreee2.c +++ /dev/null @@ -1,2554 +0,0 @@ -// foster btree version e2 -// 18 JAN 2014 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_latchtable 128 // number of latch manager slots - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. 
(set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. 
- -typedef struct Page { - volatile uint cnt; // count of keys in page - volatile uint act; // count of active keys - volatile uint min; // next key offset - volatile uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:7; // level of page - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for hash latch implementation - -enum { - Mutex = 1, - Write = 2, - Pending = 4, - Share = 8 -} LockMode; - -// mutex locks the other fields -// exclusive is set for write access -// share is count of read accessors - -typedef struct { - volatile ushort mutex:1; - volatile ushort exclusive:1; - volatile ushort pending:1; - volatile ushort share:13; -} BtSpinLatch; - -// hash table entries - -typedef struct { - BtSpinLatch latch[1]; - volatile ushort slot; // Latch table entry at head of chain -} BtHashEntry; - -// latch manager table structure - -typedef struct { -#ifdef unix - pthread_rwlock_t lock[1]; -#else - SRWLOCK srw[1]; -#endif -} BtLatch; - -typedef struct { - BtLatch readwr[1]; // read/write page lock - BtLatch access[1]; // Access Intent/Page delete - BtLatch parent[1]; // adoption of foster children - BtSpinLatch busy[1]; // slot is being moved between chains - volatile ushort next; // next entry in hash table chain - volatile ushort prev; // prev entry in hash table chain - volatile ushort pin; // number of outstanding locks - volatile ushort hash; // hash slot entry is under - volatile uid page_no; // latch set page number -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - ushort pin; // mapped page pin counter - ushort slot; // slot index in this array - void *hashprev; // previous pool entry for the same hash idx - void *hashnext; // next pool 
entry for the same hash idx -#ifndef unix - HANDLE hmap; // Windows memory mapping handle -#endif -} BtPool; - -// structure for latch manager on ALLOC_page - -typedef struct { - struct Page alloc[2]; // next & free page_nos in right ptr - BtSpinLatch lock[1]; // allocation area lite latch - ushort latchdeployed; // highest number of latch entries deployed - ushort nlatchpage; // number of latch pages at BT_latch - ushort latchtotal; // number of page latch entries - ushort latchhash; // number of latch hash table slots - ushort latchvictim; // next latch entry to examine - BtHashEntry table[0]; // the hash table -} BtLatchMgr; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - ushort poolcnt; // highest page pool node in use - ushort poolmax; // highest page pool node allocated - ushort poolmask; // total number of pages in mmap segment - 1 - ushort hashsize; // size of Hash Table for pool entries - ushort evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtPool *pool; // memory pool page segments - BtSpinLatch *latch; // latches for pool hash slots - BtLatchMgr *latchmgr; // mapped latch page from allocation page - BtLatchSet *latchsets; // mapped latch set from latch pages -#ifndef unix - HANDLE halloc; // allocation and latch table handle -#endif -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid page_no; // current page number - uid cursor_page; // current cursor page number - BtLatchSet *set; // current page latch 
set - BtPool *pool; // current page pool - unsigned char *mem; // frame, cursor, page memory buffer - int found; // last delete was found - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash, - BTERR_latch -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod, uint lvl); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 // allocation & lock manager hash table -#define ROOT_page 1 // root of the btree -#define LEAF_page 2 // first page of leaves -#define LATCH_page 3 // pages for lock manager - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. 
- -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. - -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_spinreadlock(BtSpinLatch *latch) -{ -ushort prev; - - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex) & Mutex ) - SwitchToThread(); -#endif - - // see if exclusive request is granted or pending - - if( prev = !(latch->exclusive | latch->pending) ) -#ifdef unix - __sync_fetch_and_add((ushort *)latch, Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, Share); -#endif - -#ifdef unix - 
__sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - if( prev ) - return; -#ifdef unix - } while( sched_yield(), 1 ); -#else - } while( SwitchToThread(), 1 ); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_spinwritelock(BtSpinLatch *latch) -{ - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex | Pending) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex | Pending) & Mutex ) - SwitchToThread(); -#endif - if( !(latch->share | latch->exclusive) ) { -#ifdef unix - __sync_fetch_and_or((ushort *)latch, Write); - __sync_fetch_and_and ((ushort *)latch, ~(Mutex | Pending)); -#else - _InterlockedOr16((ushort *)latch, Write); - _InterlockedAnd16((ushort *)latch, ~(Mutex | Pending)); -#endif - return; - } - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 otherwise - -int bt_spinwritetry(BtSpinLatch *latch) -{ -ushort prev; - -#ifdef unix - if( prev = __sync_fetch_and_or((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#else - if( prev = _InterlockedOr16((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#endif - // take write access if all bits are clear - - if( !prev ) -#ifdef unix - __sync_fetch_and_or ((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - return !prev; -} - -// clear write mode - -void bt_spinreleasewrite(BtSpinLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Write); -#else - _InterlockedAnd16((ushort *)latch, ~Write); -#endif -} - -// decrement reader count - -void bt_spinreleaseread(BtSpinLatch *latch) -{ 
-#ifdef unix - __sync_fetch_and_add((ushort *)latch, -Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, -Share); -#endif -} - -void bt_initlockset (BtLatchSet *set, int reuse) -{ -#ifdef unix -pthread_rwlockattr_t rwattr[1]; - - if( reuse ) { - pthread_rwlock_destroy (set->readwr->lock); - pthread_rwlock_destroy (set->access->lock); - pthread_rwlock_destroy (set->parent->lock); - } - - pthread_rwlockattr_init (rwattr); - pthread_rwlockattr_setkind_np (rwattr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); - pthread_rwlockattr_setpshared (rwattr, PTHREAD_PROCESS_SHARED); - - pthread_rwlock_init (set->readwr->lock, rwattr); - pthread_rwlock_init (set->access->lock, rwattr); - pthread_rwlock_init (set->parent->lock, rwattr); - pthread_rwlockattr_destroy (rwattr); -#else - InitializeSRWLock (set->readwr->srw); - InitializeSRWLock (set->access->srw); - InitializeSRWLock (set->parent->srw); -#endif -} - -// link latch table entry into latch hash table - -void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no) -{ -BtLatchSet *set = bt->mgr->latchsets + victim; - - if( set->next = bt->mgr->latchmgr->table[hashidx].slot ) - bt->mgr->latchsets[set->next].prev = victim; - - bt->mgr->latchmgr->table[hashidx].slot = victim; - set->page_no = page_no; - set->hash = hashidx; - set->prev = 0; -} - -void bt_unpinlatch (BtLatchSet *set) -{ -#ifdef unix - __sync_fetch_and_add(&set->pin, -1); -#else - _InterlockedDecrement16 (&set->pin); -#endif -} - -// find existing latchset or inspire new one -// return with latchset pinned - -BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no) -{ -ushort hashidx = page_no % bt->mgr->latchmgr->latchhash; -ushort slot, avail = 0, victim, idx; -BtLatchSet *set; - - // obtain read lock on hash table entry - - bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - } while( slot = 
set->next ); - - if( slot ) { -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - } - - bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch); - - if( slot ) - return set; - - // try again, this time with write lock - - bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - if( !set->pin && !avail ) - avail = slot; - } while( slot = set->next ); - - // found our entry, or take over an unpinned one - - if( slot || (slot = avail) ) { - set = bt->mgr->latchsets + slot; -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - set->page_no = page_no; - bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - - // see if there are any unused entries -#ifdef unix - victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, 1) + 1; -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - - if( victim < bt->mgr->latchmgr->latchtotal ) { - set = bt->mgr->latchsets + victim; -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - bt_initlockset (set, 0); - bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - -#ifdef unix - victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1); -#else - victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - // find and reuse previous lock entry - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1; -#endif - // we don't use slot zero - - if( victim %= bt->mgr->latchmgr->latchtotal ) - set = bt->mgr->latchsets + victim; - else - continue; - 
- // take control of our slot - // from other threads - - if( set->pin || !bt_spinwritetry (set->busy) ) - continue; - - idx = set->hash; - - // try to get write lock on hash chain - // skip entry if not obtained - // or has outstanding locks - - if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) { - bt_spinreleasewrite (set->busy); - continue; - } - - if( set->pin ) { - bt_spinreleasewrite (set->busy); - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); - continue; - } - - // unlink our available victim from its hash chain - - if( set->prev ) - bt->mgr->latchsets[set->prev].next = set->next; - else - bt->mgr->latchmgr->table[idx].slot = set->next; - - if( set->next ) - bt->mgr->latchsets[set->next].prev = set->prev; - - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - bt_initlockset (set, 1); - bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - bt_spinreleasewrite (set->busy); - return set; - } -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = mgr->pool + slot; - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - munmap (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size); - munmap (mgr->latchmgr, mgr->page_size); -#else - FlushViewOfFile(mgr->latchmgr, 0); - UnmapViewOfFile(mgr->latchmgr); - CloseHandle(mgr->halloc); -#endif -#ifdef unix - close (mgr->idx); - free (mgr->pool); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->pool); - 
GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last, slot, idx; -uint nlatchpage, latchhash; -BtLatchMgr *latchmgr; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; -int flag; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - latchmgr = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage ) - bits = latchmgr->alloc->bits; - else - return free(mgr), free(latchmgr), NULL; - } else if( mode == BT_ro ) - return free(latchmgr), free (mgr), NULL; 
-#else - latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = latchmgr->alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->pool = calloc (poolmax, sizeof(BtPool)); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtSpinLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8); -#else - mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool)); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch)); -#endif - - if( size || *amt ) - goto mgrlatch; - - // initialize an empty b-tree with latch page, root page, page of leaves - // and page(s) of latches - - memset (latchmgr, 0, 1 << bits); - nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1; - bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage); - latchmgr->alloc->bits = mgr->page_bits; - - latchmgr->nlatchpage = nlatchpage; - latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet)); - - // initialize latch manager - - latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry); - - // size of hash table = total number of latchsets - - if( 
latchhash > latchmgr->latchtotal ) - latchhash = latchmgr->latchtotal; - - latchmgr->latchhash = latchhash; - -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return free(latchmgr), bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (latchmgr, 0, 1 << bits); - latchmgr->alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(latchmgr->alloc, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(latchmgr->alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - latchmgr->alloc->min = mgr->page_size - 3; - latchmgr->alloc->lvl = lvl; - latchmgr->alloc->cnt = 1; - latchmgr->alloc->act = 1; -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // clear out latch manager locks - // and rest of pages to round out segment - - memset(latchmgr, 0, mgr->page_size); - last = MIN_lvl + 1; - - while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) { -#ifdef unix - pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - last++; - } - -mgrlatch: -#ifdef unix - flag = PROT_READ | PROT_WRITE; - mgr->latchmgr = mmap (0, mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size); - if( 
mgr->latchmgr == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; - mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size); - if( mgr->latchsets == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; -#else - flag = PAGE_READWRITE; - mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, NULL); - if( !mgr->halloc ) - return bt_mgrclose (mgr), NULL; - - flag = FILE_MAP_WRITE; - mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size); - if( !mgr->latchmgr ) - return GetLastError(), bt_mgrclose (mgr), NULL; - - mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size); -#endif - -#ifdef unix - free (latchmgr); -#else - VirtualFree (latchmgr, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - - memset(bt->zero, 0, mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Latch Manager - -void bt_readlock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_rdlock (latch->lock); -#else - AcquireSRWLockShared (latch->srw); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_writelock(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_wrlock (latch->lock); -#else - AcquireSRWLockExclusive (latch->srw); -#endif -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 if already write or read locked - -int bt_writetry(BtLatch *latch) -{ -int result = 0; - -#ifdef unix - result = !pthread_rwlock_trywrlock (latch->lock); -#else - result = TryAcquireSRWLockExclusive (latch->srw); -#endif - return result; -} - -// clear write mode - -void bt_releasewrite(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockExclusive (latch->srw); -#endif -} - -// decrement reader count - -void bt_releaseread(BtLatch *latch) -{ -#ifdef unix - pthread_rwlock_unlock (latch->lock); -#else - ReleaseSRWLockShared (latch->srw); -#endif -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = bt->mgr->pool + slot; - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( slot = bt->mgr->hash[idx] ) { - node = bt->mgr->pool + slot; - pool->hashnext = 
node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = bt->mgr->pool + hashslot; - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8) / 8), 0, (bt->mgr->poolmask + 8)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? 
FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// calculate page within pool - -BtPage bt_page (BtDb *bt, BtPool *pool, uid page_no) -{ -uint subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping -BtPage page; - - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( ~((bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] |= 1 << bit; - } - } -#endif - return page; -} - -// release pool pin - -void bt_unpinpool (BtPool *pool) -{ -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); -#else - _InterlockedDecrement16 (&pool->pin); -#endif -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpool(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_spinreleaseread (&bt->mgr->latch[idx]); - bt_spinwritelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - 
pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot = _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = bt->mgr->pool + slot; - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); -#else - _InterlockedDecrement16 (&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_spinwritetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_spinreleasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_spinreleasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - 
- bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. -// pin to buffer pool and return latchset pointer - -void bt_lockpage(BtLock mode, BtLatchSet *set) -{ - switch( mode ) { - case BtLockRead: - bt_readlock (set->readwr); - break; - case BtLockWrite: - bt_writelock (set->readwr); - break; - case BtLockAccess: - bt_readlock (set->access); - break; - case BtLockDelete: - bt_writelock (set->access); - break; - case BtLockParent: - bt_writelock (set->parent); - break; - } -} - -// remove write, read, or parent lock on requested page_no. - -void bt_unlockpage(BtLock mode, BtLatchSet *set) -{ - switch( mode ) { - case BtLockRead: - bt_releaseread (set->readwr); - break; - case BtLockWrite: - bt_releasewrite (set->readwr); - break; - case BtLockAccess: - bt_releaseread (set->access); - break; - case BtLockDelete: - bt_releasewrite (set->access); - break; - case BtLockParent: - bt_releasewrite (set->parent); - break; - } -} - -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -BtLatchSet *set; -BtPool *pool; -uid new_page; -BtPage pmap; -int reuse; - - // lock allocation page - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->mgr->latchmgr->alloc[1].right) ) { - if( pool = bt_pinpool (bt, new_page) ) - pmap = bt_page (bt, pool, new_page); - else - return 0; - bt_putid(bt->mgr->latchmgr->alloc[1].right, bt_getid(pmap->right)); - bt_unpinpool (pool); - reuse = 1; - } else { - new_page = bt_getid(bt->mgr->latchmgr->alloc->right); - bt_putid(bt->mgr->latchmgr->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && 
(new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } - - // unlock allocation latch - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - -#else - // unlock allocation latch - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - // bring new page into pool and copy page. - // this will extend the file into the new pages. - // NB -- no latch required - - if( pool = bt_pinpool (bt, new_page) ) - pmap = bt_page (bt, pool, new_page); - else - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - bt_unpinpool (pool); -#endif - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, BtLock lock) -{ -uid page_no = ROOT_page, prevpage = 0; -BtLatchSet *set, *prevset; -uint drill = 0xff, slot; -uint mode, prevmode; -BtPool *prevpool; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? 
BtLockWrite : BtLockRead; - - // obtain latch set for this page - - bt->set = bt_pinlatch (bt, page_no); - bt->page_no = page_no; - - // pin page contents - - if( bt->pool = bt_pinpool (bt, page_no) ) - bt->page = bt_page (bt, bt->pool, page_no); - else - return 0; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - bt_lockpage(BtLockAccess, bt->set); - - // now unlock and unpin our (possibly foster) parent - - if( prevpage ) { - bt_unlockpage(prevmode, prevset); - bt_unpinlatch (prevset); - bt_unpinpool (prevpool); - prevpage = 0; - } - - // obtain read lock using lock chaining - - bt_lockpage(mode, bt->set); - - if( page_no > ROOT_page ) - bt_unlockpage(BtLockAccess, bt->set); - - // re-read and re-lock root after determining actual level of root - - if( page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) { - bt_unlockpage(mode, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - continue; - } - } - - prevpage = bt->page_no; - prevpool = bt->pool; - prevset = bt->set; - prevmode = mode; - - // find key on page at this level - // and either descend to requested level - // or return key slot - - slot = bt_findslot (bt, key, len); - - // is this slot < foster child area - // on the requested level? - - // if so, return actual slot even if dead - - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - - // find next active slot - - // note: foster children are never dead - // nor fence keys for interiour nodes - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; // last key shouldn't be deleted - - // is this slot < foster child area - // if so, drill to next level - - if( slot <= bt->page->cnt - bt->page->foster ) - drill--; - - // continue right onto foster child - // or down to next level. 
- - page_no = bt_getid(slotptr(bt->page, slot)->id); - - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when leaf page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len) -{ -unsigned char leftkey[256]; -BtLatchSet *rset, *set; -BtPool *pool, *rpool; -BtPage rpage, page; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - // note that fence keys of interiour nodes are not deleted. - - if( bt->found = !keycmp (ptr, key, len) ) - if( bt->found = slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - page_no = bt->page_no; - pool = bt->pool; - page = bt->page; - set = bt->set; - - // return if page is not empty or not found - - if( page->act || !bt->found ) { - bt_unlockpage(BtLockWrite, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - return bt->err; - } - - // cache copy of fence key of empty node - - ptr = keyptr(page, page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // release write lock on empty node - // obtain Parent lock - - bt_unlockpage(BtLockWrite, set); - bt_lockpage(BtLockParent, set); - - // load and lock parent to see - // if delete of empty node is OK - // ie, not a fence key of parent - - while( 1 ) { - if( slot = bt_loadpage (bt, leftkey+1, *leftkey, 1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // does parent level contain our fence key yet? - // and is it free of foster children? 
- - if( !bt->page->foster ) - if( !keycmp (ptr, leftkey+1, *leftkey) ) - break; - - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } - - // find our left fence key - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct; // last key shouldn't be deleted - - // now we have both parent and child - - bt_lockpage(BtLockDelete, set); - bt_lockpage(BtLockWrite, set); - - // return if page has no right sibling within parent - // or if empty node is no longer empty - - if( page->act || slot == bt->page->cnt ) { - // unpin parent - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - // unpin empty node - bt_unlockpage(BtLockParent, set); - bt_unlockpage(BtLockDelete, set); - bt_unlockpage(BtLockWrite, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - return bt->err; - } - - // lock and map our right page - // note that it cannot be our foster child - // since the our node is empty - - right = bt_getid(page->right); - - if( rpool = bt_pinpool (bt, right) ) - rpage = bt_page (bt, rpool, right); - else - return bt->err; - - rset = bt_pinlatch (bt, right); - bt_lockpage(BtLockWrite, rset); - bt_lockpage(BtLockDelete, rset); - - // pull contents of right page into empty page - - memcpy (page, rpage, bt->mgr->page_size); - - // delete left parent slot for old empty page - // and redirect right parent slot to it - - bt->page->act--; - bt->page->dirty = 1; - slotptr(bt->page, slot)->dead = 1; - - while( slot++ < bt->page->cnt ) - if( !slotptr(bt->page, slot)->dead ) - break; - - bt_putid(slotptr(bt->page,slot)->id, page_no); - - // release parent level lock - // and our empty node lock - - bt_unlockpage(BtLockWrite, set); - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - - // add killed right block to free chain - // 
lock latch mgr - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // store free chain in allocation page second right - bt_putid(rpage->right, bt_getid(bt->mgr->latchmgr->alloc[1].right)); - bt_putid(bt->mgr->latchmgr->alloc[1].right, right); - - // unlock latch mgr and right page - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - bt_unlockpage(BtLockWrite, rset); - bt_unlockpage(BtLockDelete, rset); - bt_unpinlatch (rset); - bt_unpinpool (rpool); - - // remove ParentModify lock - - bt_unlockpage(BtLockParent, set); - bt_unlockpage(BtLockDelete, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( slot <= bt->page->cnt && !keycmp (ptr, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - bt_unlockpage (BtLockRead, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// >0 new slot value - -uint bt_cleanpage(BtDb *bt, uint amt, uint slot) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -uint newslot; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return slot; - - // skip cleanup if nothing to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - // always leave fence key in the array - // otherwise, remove deleted key - - // note: foster children are never dead - // 
nor are fence keys for interiour nodes - - while( cnt++ < max ) { - if( cnt == slot ) - newslot = idx + 1; - else if( cnt < max && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return newslot; - - return 0; -} - -// add key to current page -// page must already be writelocked - -void bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // find next available dead slot and copy key onto page - // note that foster children on the page are never dead - - // look for next hole, but stay back from the fence key - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - // now insert key into array before slot - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; -} - -// split the root and raise the height of the btree -// call with current page locked and page no of foster child -// return with current page (root) unlocked - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; 
-BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - memset (slotptr(root, root->cnt), 0, sizeof(BtSlot)); - root->dirty = 1; - root->foster--; - root->act--; - root->cnt--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release and unpin root (bt->page) - - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return 0; -} - -// split already locked full node -// in current page variables -// return unlocked and unpinned. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtLatchSet *set = bt->set; -BtPool *pool = bt->pool; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer for right node - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving old foster children in the left node, - // and adding a new foster child there. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - if( !(slotptr(bt->frame, idx)->dead = slotptr(page, cnt)->dead) ) - bt->frame->act++; - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - } - - // transfer right link node to new right node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write right frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new right page to add - // as foster child to the left node - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->dirty = 0; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - // insert new foster child for right page in queue - // before any of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys - // note that none will be dead - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // release wr lock 
on our page - - bt_unlockpage (BtLockWrite, set); - - // obtain ParentModification lock for current page - // to fix new fence key and oldest foster child on page - - bt_lockpage (BtLockParent, set); - - // get our new fence key to insert in parent node - - bt_lockpage (BtLockRead, set); - - key = keyptr(page, page->cnt-1); - memcpy (fencekey, key, key->len+1); - - bt_unlockpage (BtLockRead, set); - - if( bt_insertkey (bt, fencekey + 1, *fencekey, page_no, tod, lvl + 1) ) - return bt->err; - - // lock our page for writing - - bt_lockpage (BtLockRead, set); - - // switch old parent key from us to our oldest foster child - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - new_page = bt_getid (slotptr(page, page->cnt)->id); - bt_unlockpage (BtLockRead, set); - - if( bt_insertkey (bt, fencekey + 1, *fencekey, new_page, tod, lvl + 1) ) - return bt->err; - - // now that it has its own parent pointer, - // remove oldest foster child from our page - - bt_lockpage (BtLockWrite, set); - memset (slotptr(page, page->cnt), 0, sizeof(BtSlot)); - page->dirty = 1; - page->foster--; - page->cnt--; - page->act--; - - // unlock and unpin - - bt_unlockpage (BtLockWrite, set); - bt_unlockpage (BtLockParent, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - return 0; -} - -// Insert new key into the btree at leaf level. 
- -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod, uint lvl) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - if( slotptr(page, slot)->dead ) - page->act++; - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return bt->err; - } - - // check if page has enough space - - if( slot = bt_cleanpage (bt, len, slot) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - bt_addkeytopage (bt, slot, key, len, id, tod); - - bt_unlockpage (BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return 0; -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - - bt->cursor_page = bt->page_no; - - bt_unlockpage(BtLockRead, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtLatchSet *set; -BtPool *pool; -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; - else - break; - - if( !right ) - break; - - bt->cursor_page = right; - if( pool = bt_pinpool (bt, right) ) - page = bt_page (bt, 
pool, right); - else - return 0; - - set = bt_pinlatch (bt, right); - bt_lockpage(BtLockRead, set); - - memcpy (bt->cursor, page, bt->mgr->page_size); - - bt_unlockpage(BtLockRead, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -BtLatchSet *set; -time_t tod[1]; -BtPool *pool; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; 
- if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. 
Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - if( pool = bt_pinpool (bt, page_no) ) - page = bt_page (bt, pool, page_no); - else - break; - set = bt_pinlatch (bt, page_no); - bt_lockpage (BtLockRead, set); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (BtLockRead, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreef1.c b/fosterbtreef1.c deleted file mode 100644 index f2fea9d..0000000 --- a/fosterbtreef1.c +++ /dev/null @@ -1,2447 +0,0 @@ -// foster btree version f -// 17 JAN 2014 - -// author: karl malbrain, malbrain@cal.berkeley.edu - -/* -This work, including the source code, documentation -and related data, is placed into the public domain. - -The orginal author is Karl Malbrain. - -THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY -OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF -MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE, -ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE -RESULTING FROM THE USE, MODIFICATION, OR -REDISTRIBUTION OF THIS SOFTWARE. -*/ - -// Please see the project home page for documentation -// code.google.com/p/high-concurrency-btree - -#define _FILE_OFFSET_BITS 64 -#define _LARGEFILE64_SOURCE - -#ifdef linux -#define _GNU_SOURCE -#endif - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include -#include -#else -#define WIN32_LEAN_AND_MEAN -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include - -typedef unsigned long long uid; - -#ifndef unix -typedef unsigned long long off64_t; -typedef unsigned short ushort; -typedef unsigned int uint; -#endif - -#define BT_ro 0x6f72 // ro -#define BT_rw 0x7772 // rw - -#define BT_latchtable 128 // number of latch manager slots - -#define BT_maxbits 24 // maximum page size in bits -#define BT_minbits 9 // minimum page size in bits -#define BT_minpage (1 << BT_minbits) // minimum page size -#define BT_maxpage (1 << BT_maxbits) // maximum page size - -/* -There are five lock types for each node in three independent sets: -1. 
(set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. -2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. -3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. -4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. -*/ - -typedef enum{ - BtLockAccess, - BtLockDelete, - BtLockRead, - BtLockWrite, - BtLockParent, - BtLockPin -}BtLock; - -// Define the length of the page and key pointers - -#define BtId 6 - -// Page key slot definition. - -// If BT_maxbits is 15 or less, you can save 4 bytes -// for each key stored by making the first two uints -// into ushorts. You can also save 4 bytes by removing -// the tod field from the key. - -// Keys are marked dead, but remain on the page until -// it cleanup is called. The fence key (highest key) for -// the page is always present, even after cleanup. - -typedef struct { - uint off:BT_maxbits; // page offset for key start - uint dead:1; // set for deleted key - uint tod; // time-stamp for key - unsigned char id[BtId]; // id associated with key -} BtSlot; - -// The key structure occupies space at the upper end of -// each page. It's a length byte followed by the value -// bytes. - -typedef struct { - unsigned char len; - unsigned char key[1]; -} *BtKey; - -// The first part of an index page. -// It is immediately followed -// by the BtSlot array of keys. 
- -typedef struct Page { - uint cnt; // count of keys in page - uint act; // count of active keys - uint min; // next key offset - uint foster; // count of foster children - unsigned char bits; // page size in bits - unsigned char lvl:6; // level of page - unsigned char kill:1; // page is being deleted - unsigned char dirty:1; // page needs to be cleaned - unsigned char right[BtId]; // page number to right -} *BtPage; - -// mode & definition for hash latch implementation - -enum { - Mutex = 1, - Write = 2, - Pending = 4, - Share = 8 -} LockMode; - -// mutex locks the other fields -// exclusive is set for write access -// share is count of read accessors - -typedef struct { - volatile ushort mutex:1; - volatile ushort exclusive:1; - volatile ushort pending:1; - volatile ushort share:13; -} BtSpinLatch; - -// hash table entries - -typedef struct { - BtSpinLatch latch[1]; - volatile ushort slot; // Latch table entry at head of chain -} BtHashEntry; - -// latch table lock structure -// implements a fair read-write lock - -typedef struct { -#ifdef unix - pthread_rwlock_t lock[1]; -#else - SRWLOCK srw[1]; -#endif -} BtLatch; - -typedef struct { - BtSpinLatch readwr[1]; // read/write page lock - BtSpinLatch access[1]; // Access Intent/Page delete - BtSpinLatch parent[1]; // adoption of foster children - BtSpinLatch busy[1]; // slot is being moved between chains - volatile ushort next; // next entry in hash table chain - volatile ushort prev; // prev entry in hash table chain - volatile ushort pin; // number of outstanding locks - volatile ushort hash; // hash slot entry is under - volatile uid page_no; // latch set page number -} BtLatchSet; - -// The memory mapping pool table buffer manager entry - -typedef struct { - unsigned long long int lru; // number of times accessed - uid basepage; // mapped base page number - char *map; // mapped memory pointer - ushort pin; // mapped page pin counter - ushort slot; // slot index in this array - void *hashprev; // previous pool 
entry for the same hash idx - void *hashnext; // next pool entry for the same hash idx -#ifndef unix - HANDLE hmap; // Windows memory mapping handle -#endif -} BtPool; - -// structure for latch manager on ALLOC_page - -typedef struct { - struct Page alloc[2]; // next & free page_nos in right ptr - BtSpinLatch lock[1]; // allocation area lite latch - ushort latchdeployed; // highest number of latch entries deployed - ushort nlatchpage; // number of latch pages at BT_latch - ushort latchtotal; // number of page latch entries - ushort latchhash; // number of latch hash table slots - ushort latchvictim; // next latch entry to examine - BtHashEntry table[0]; // the hash table -} BtLatchMgr; - -// The object structure for Btree access - -typedef struct { - uint page_size; // page size - uint page_bits; // page size in bits - uint seg_bits; // seg size in pages in bits - uint mode; // read-write mode -#ifdef unix - int idx; - char *pooladvise; // bit maps for pool page advisements -#else - HANDLE idx; -#endif - ushort poolcnt; // highest page pool node in use - ushort poolmax; // highest page pool node allocated - ushort poolmask; // total size of pages in mmap segment - 1 - ushort hashsize; // size of Hash Table for pool entries - ushort evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries - BtPool *pool; // memory pool page segments - BtSpinLatch *latch; // latches for pool hash slots - BtLatchMgr *latchmgr; // mapped latch page from allocation page - BtLatchSet *latchsets; // mapped latch set from latch pages -#ifndef unix - HANDLE halloc; // allocation and latch table handle -#endif -} BtMgr; - -typedef struct { - BtMgr *mgr; // buffer manager for thread - BtPage temp; // temporary frame buffer (memory mapped/file IO) - BtPage cursor; // cached frame for start/next (never mapped) - BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page - uid 
page_no; // current page number - uid cursor_page; // current cursor page number - BtLatchSet *set; // current page latch set - unsigned char *mem; // frame, cursor, page memory buffer - int err; // last error -} BtDb; - -typedef enum { - BTERR_ok = 0, - BTERR_struct, - BTERR_ovflw, - BTERR_lock, - BTERR_map, - BTERR_wrt, - BTERR_hash, - BTERR_latch -} BTERR; - -// B-Tree functions -extern void bt_close (BtDb *bt); -extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); -extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); -extern uint bt_nextkey (BtDb *bt, uint slot); - -// manager functions -extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, uint segsize, uint hashsize); -void bt_mgrclose (BtMgr *mgr); - -// Helper functions to return cursor slot values - -extern BtKey bt_key (BtDb *bt, uint slot); -extern uid bt_uid (BtDb *bt, uint slot); -extern uint bt_tod (BtDb *bt, uint slot); - -// BTree page number constants -#define ALLOC_page 0 // allocation & lock manager hash table -#define ROOT_page 1 // root of the btree -#define LEAF_page 2 // first page of leaves -#define LATCH_page 3 // pages for lock manager - -// Number of levels to create in a new BTree - -#define MIN_lvl 2 - -// The page is allocated from low and hi ends. -// The key offsets and row-id's are allocated -// from the bottom, while the text of the key -// is allocated from the top. When the two -// areas meet, the page is split into two. - -// A key consists of a length byte, two bytes of -// index number (0 - 65534), and up to 253 bytes -// of key value. Duplicate keys are discarded. -// Associated with each key is a 48 bit row-id. - -// The b-tree root is always located at page 1. -// The first leaf page of level zero is always -// located on page 2. 
- -// When to root page fills, it is split in two and -// the tree height is raised by a new root at page -// one with two keys. - -// Deleted keys are marked with a dead bit until -// page cleanup The fence key for a node is always -// present, even after deletion and cleanup. - -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep -// track of the cached segments. This behaviour is controlled -// by the cache block size parameter to bt_open. - -// To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. - -// Page 0 is dedicated to lock for new page extensions, -// and chains empty pages together for reuse. - -// Empty pages are chained together through the ALLOC page and reused. - -// Access macros to address slot and key values from the page - -#define slotptr(page, slot) (((BtSlot *)(page+1)) + (slot-1)) -#define keyptr(page, slot) ((BtKey)((unsigned char*)(page) + slotptr(page, slot)->off)) - -void bt_putid(unsigned char *dest, uid id) -{ -int i = BtId; - - while( i-- ) - dest[i] = (unsigned char)id, id >>= 8; -} - -uid bt_getid(unsigned char *src) -{ -uid id = 0; -int i; - - for( i = 0; i < BtId; i++ ) - id <<= 8, id |= *src++; - - return id; -} - -// wait until write lock mode is clear -// and add 1 to the share count - -void bt_spinreadlock(BtSpinLatch *latch) -{ -ushort prev; - - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex) & Mutex ) - SwitchToThread(); -#endif - - // see if exclusive request is granted or pending - - if( prev = !(latch->exclusive | latch->pending) ) -#ifdef unix - __sync_fetch_and_add((ushort *)latch, Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, Share); -#endif - -#ifdef unix - 
__sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - if( prev ) - return; -#ifdef unix - } while( sched_yield(), 1 ); -#else - } while( SwitchToThread(), 1 ); -#endif -} - -// wait for other read and write latches to relinquish - -void bt_spinwritelock(BtSpinLatch *latch) -{ -ushort prev; - - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex | Pending) & Mutex ) - sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex | Pending) & Mutex ) - SwitchToThread(); -#endif - if( prev = !(latch->share | latch->exclusive) ) -#ifdef unix - __sync_fetch_and_or((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~(Mutex | Pending)); -#else - _InterlockedAnd16((ushort *)latch, ~(Mutex | Pending)); -#endif - if( prev ) - return; -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } while( 1 ); -} - -// try to obtain write lock - -// return 1 if obtained, -// 0 otherwise - -int bt_spinwritetry(BtSpinLatch *latch) -{ -ushort prev; - -#ifdef unix - if( prev = __sync_fetch_and_or((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#else - if( prev = _InterlockedOr16((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#endif - // take write access if all bits are clear - - if( !prev ) -#ifdef unix - __sync_fetch_and_or ((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif - -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - return !prev; -} - -// clear write mode - -void bt_spinreleasewrite(BtSpinLatch *latch) -{ -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Write); -#else - _InterlockedAnd16((ushort *)latch, ~Write); -#endif -} - -// decrement reader count - -void bt_spinreleaseread(BtSpinLatch *latch) -{ -#ifdef unix - __sync_fetch_and_add((ushort *)latch, -Share); -#else - 
_InterlockedExchangeAdd16 ((ushort *)latch, -Share); -#endif -} - -// link latch table entry into latch hash table - -void bt_latchlink (BtDb *bt, ushort hashidx, ushort victim, uid page_no) -{ -BtLatchSet *set = bt->mgr->latchsets + victim; - - if( set->next = bt->mgr->latchmgr->table[hashidx].slot ) - bt->mgr->latchsets[set->next].prev = victim; - - bt->mgr->latchmgr->table[hashidx].slot = victim; - set->page_no = page_no; - set->hash = hashidx; - set->prev = 0; -} - -// find existing latchset or inspire new one -// return with latchset pinned - -BtLatchSet *bt_bindlatch (BtDb *bt, uid page_no, int incr) -{ -ushort hashidx = page_no % bt->mgr->latchmgr->latchhash; -ushort slot, avail = 0, victim, idx; -BtLatchSet *set; - - // obtain read lock on hash table entry - - bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - } while( slot = set->next ); - - if( slot && incr ) { -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - } - - bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch); - - if( slot ) - return set; - - // try again, this time with write lock - - bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch); - - if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do - { - set = bt->mgr->latchsets + slot; - if( page_no == set->page_no ) - break; - if( !set->pin && !avail ) - avail = slot; - } while( slot = set->next ); - - // found our entry, or take over an unpinned one - - if( slot || (slot = avail) ) { - set = bt->mgr->latchsets + slot; - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - set->page_no = page_no; - bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - - // see if there are any unused entries -#ifdef unix - victim = __sync_fetch_and_add 
(&bt->mgr->latchmgr->latchdeployed, 1) + 1; -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - - if( victim < bt->mgr->latchmgr->latchtotal ) { - set = bt->mgr->latchsets + victim; - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - return set; - } - -#ifdef unix - victim = __sync_fetch_and_add (&bt->mgr->latchmgr->latchdeployed, -1); -#else - victim = _InterlockedDecrement16 (&bt->mgr->latchmgr->latchdeployed); -#endif - // find and reuse previous lock entry - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->latchmgr->latchvictim, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->latchmgr->latchvictim) - 1; -#endif - // we don't use slot zero - - if( victim %= bt->mgr->latchmgr->latchtotal ) - set = bt->mgr->latchsets + victim; - else - continue; - - // take control of our slot - // from other threads - - if( set->pin || !bt_spinwritetry (set->busy) ) - continue; - - idx = set->hash; - - // try to get write lock on hash chain - // skip entry if not obtained - // or has outstanding locks - - if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) { - bt_spinreleasewrite (set->busy); - continue; - } - - if( set->pin ) { - bt_spinreleasewrite (set->busy); - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); - continue; - } - - // unlink our available victim from its hash chain - - if( set->prev ) - bt->mgr->latchsets[set->prev].next = set->next; - else - bt->mgr->latchmgr->table[idx].slot = set->next; - - if( set->next ) - bt->mgr->latchsets[set->next].prev = set->prev; - - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); - - if( incr ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - - bt_latchlink (bt, hashidx, victim, page_no); - 
bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - bt_spinreleasewrite (set->busy); - return set; - } -} - -void bt_mgrclose (BtMgr *mgr) -{ -BtPool *pool; -uint slot; - - // release mapped pages - // note that slot zero is never used - - for( slot = 1; slot < mgr->poolmax; slot++ ) { - pool = mgr->pool + slot; - if( pool->slot ) -#ifdef unix - munmap (pool->map, (mgr->poolmask+1) << mgr->page_bits); -#else - { - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); - } -#endif - } - -#ifdef unix - close (mgr->idx); - free (mgr->pool); - free (mgr->hash); - free (mgr->latch); - free (mgr->pooladvise); - free (mgr); -#else - FlushFileBuffers(mgr->idx); - CloseHandle(mgr->idx); - GlobalFree (mgr->pool); - GlobalFree (mgr->hash); - GlobalFree (mgr->latch); - GlobalFree (mgr); -#endif -} - -// close and release memory - -void bt_close (BtDb *bt) -{ -#ifdef unix - if ( bt->mem ) - free (bt->mem); -#else - if ( bt->mem) - VirtualFree (bt->mem, 0, MEM_RELEASE); -#endif - free (bt); -} - -// open/create new btree buffer manager - -// call with file_name, BT_openmode, bits in page size (e.g. 16), -// size of mapped page pool (e.g. 
8192) - -BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolmax, uint segsize, uint hashsize) -{ -uint lvl, attr, cacheblk, last, slot, idx; -uint nlatchpage, latchhash; -BtLatchMgr *latchmgr; -off64_t size; -uint amt[1]; -BtMgr* mgr; -BtKey key; -int flag; - -#ifndef unix -SYSTEM_INFO sysinfo[1]; -#endif - - // determine sanity of page size and buffer pool - - if( bits > BT_maxbits ) - bits = BT_maxbits; - else if( bits < BT_minbits ) - bits = BT_minbits; - - if( !poolmax ) - return NULL; // must have buffer pool - -#ifdef unix - mgr = calloc (1, sizeof(BtMgr)); - - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); - - if( mgr->idx == -1 ) - return free(mgr), NULL; - - cacheblk = 4096; // minimum mmap segment size for unix - -#else - mgr = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, sizeof(BtMgr)); - attr = FILE_ATTRIBUTE_NORMAL; - mgr->idx = CreateFile(name, GENERIC_READ| GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, attr, NULL); - - if( mgr->idx == INVALID_HANDLE_VALUE ) - return GlobalFree(mgr), NULL; - - // normalize cacheblk to multiple of sysinfo->dwAllocationGranularity - GetSystemInfo(sysinfo); - cacheblk = sysinfo->dwAllocationGranularity; -#endif - -#ifdef unix - latchmgr = malloc (BT_maxpage); - *amt = 0; - - // read minimum page size to get root info - - if( size = lseek (mgr->idx, 0L, 2) ) { - if( pread(mgr->idx, latchmgr, BT_minpage, 0) == BT_minpage ) - bits = latchmgr->alloc->bits; - else - return free(mgr), free(latchmgr), NULL; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#else - latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); - size = GetFileSize(mgr->idx, amt); - - if( size || *amt ) { - if( !ReadFile(mgr->idx, (char *)latchmgr, BT_minpage, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - bits = latchmgr->alloc->bits; - } else if( mode == BT_ro ) - return bt_mgrclose (mgr), NULL; -#endif - - mgr->page_size = 1 << bits; - mgr->page_bits = bits; - - mgr->poolmax = poolmax; - 
mgr->mode = mode; - - if( cacheblk < mgr->page_size ) - cacheblk = mgr->page_size; - - // mask for partial memmaps - - mgr->poolmask = (cacheblk >> bits) - 1; - - // see if requested size of pages per memmap is greater - - if( (1 << segsize) > mgr->poolmask ) - mgr->poolmask = (1 << segsize) - 1; - - mgr->seg_bits = 0; - - while( (1 << mgr->seg_bits) <= mgr->poolmask ) - mgr->seg_bits++; - - mgr->hashsize = hashsize; - -#ifdef unix - mgr->pool = calloc (poolmax, sizeof(BtPool)); - mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtSpinLatch)); - mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8); -#else - mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool)); - mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch)); -#endif - - if( size || *amt ) - goto mgrlatch; - - // initialize an empty b-tree with latch page, root page, page of leaves - // and page(s) of latches - - memset (latchmgr, 0, 1 << bits); - nlatchpage = BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1; - bt_putid(latchmgr->alloc->right, MIN_lvl+1+nlatchpage); - latchmgr->alloc->bits = mgr->page_bits; - - latchmgr->nlatchpage = nlatchpage; - latchmgr->latchtotal = nlatchpage * (mgr->page_size / sizeof(BtLatchSet)); - - // initialize latch manager - - latchhash = (mgr->page_size - sizeof(BtLatchMgr)) / sizeof(BtHashEntry); - - // size of hash table = total number of latchsets - - if( latchhash > latchmgr->latchtotal ) - latchhash = latchmgr->latchtotal; - - latchmgr->latchhash = latchhash; - -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - - memset (latchmgr, 0, 1 
<< bits); - latchmgr->alloc->bits = mgr->page_bits; - - for( lvl=MIN_lvl; lvl--; ) { - slotptr(latchmgr->alloc, 1)->off = mgr->page_size - 3; - bt_putid(slotptr(latchmgr->alloc, 1)->id, lvl ? MIN_lvl - lvl + 1 : 0); // next(lower) page number - key = keyptr(latchmgr->alloc, 1); - key->len = 2; // create stopper key - key->key[0] = 0xff; - key->key[1] = 0xff; - latchmgr->alloc->min = mgr->page_size - 3; - latchmgr->alloc->lvl = lvl; - latchmgr->alloc->cnt = 1; - latchmgr->alloc->act = 1; -#ifdef unix - if( write (mgr->idx, latchmgr, mgr->page_size) < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#else - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - } - - // clear out latch manager locks - // and rest of pages to round out segment - - memset(latchmgr, 0, mgr->page_size); - last = MIN_lvl + 1; - - while( last <= ((MIN_lvl + 1 + nlatchpage) | mgr->poolmask) ) { -#ifdef unix - pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits); -#else - SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); - if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) - return bt_mgrclose (mgr), NULL; - if( *amt < mgr->page_size ) - return bt_mgrclose (mgr), NULL; -#endif - last++; - } - -mgrlatch: -#ifdef unix - flag = PROT_READ | PROT_WRITE; - mgr->latchmgr = mmap (0, mgr->page_size, flag, MAP_SHARED, mgr->idx, ALLOC_page * mgr->page_size); - if( mgr->latchmgr == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; - mgr->latchsets = (BtLatchSet *)mmap (0, mgr->latchmgr->nlatchpage * mgr->page_size, flag, MAP_SHARED, mgr->idx, LATCH_page * mgr->page_size); - if( mgr->latchsets == MAP_FAILED ) - return bt_mgrclose (mgr), NULL; -#else - flag = PAGE_READWRITE; - mgr->halloc = CreateFileMapping(mgr->idx, NULL, flag, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size, 
NULL); - if( !mgr->halloc ) - return bt_mgrclose (mgr), NULL; - - flag = FILE_MAP_WRITE; - mgr->latchmgr = MapViewOfFile(mgr->halloc, flag, 0, 0, (BT_latchtable / (mgr->page_size / sizeof(BtLatchSet)) + 1 + LATCH_page) * mgr->page_size); - if( !mgr->latchmgr ) - return GetLastError(), bt_mgrclose (mgr), NULL; - - mgr->latchsets = (void *)((char *)mgr->latchmgr + LATCH_page * mgr->page_size); -#endif - -#ifdef unix - free (latchmgr); -#else - VirtualFree (latchmgr, 0, MEM_RELEASE); -#endif - return mgr; -} - -// open BTree access method -// based on buffer manager - -BtDb *bt_open (BtMgr *mgr) -{ -BtDb *bt = malloc (sizeof(*bt)); - - memset (bt, 0, sizeof(*bt)); - bt->mgr = mgr; -#ifdef unix - bt->mem = malloc (3 *mgr->page_size); -#else - bt->mem = VirtualAlloc(NULL, 3 * mgr->page_size, MEM_COMMIT, PAGE_READWRITE); -#endif - bt->frame = (BtPage)bt->mem; - bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); - bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - return bt; -} - -// compare two keys, returning > 0, = 0, or < 0 -// as the comparison value - -int keycmp (BtKey key1, unsigned char *key2, uint len2) -{ -uint len1 = key1->len; -int ans; - - if( ans = memcmp (key1->key, key2, len1 > len2 ? 
len2 : len1) ) - return ans; - - if( len1 > len2 ) - return 1; - if( len1 < len2 ) - return -1; - - return 0; -} - -// Buffer Pool mgr - -// find segment in pool -// must be called with hashslot idx locked -// return NULL if not there -// otherwise return node - -BtPool *bt_findpool(BtDb *bt, uid page_no, uint idx) -{ -BtPool *pool; -uint slot; - - // compute start of hash chain in pool - - if( slot = bt->mgr->hash[idx] ) - pool = bt->mgr->pool + slot; - else - return NULL; - - page_no &= ~bt->mgr->poolmask; - - while( pool->basepage != page_no ) - if( pool = pool->hashnext ) - continue; - else - return NULL; - - return pool; -} - -// add segment to hash table - -void bt_linkhash(BtDb *bt, BtPool *pool, uid page_no, int idx) -{ -BtPool *node; -uint slot; - - pool->hashprev = pool->hashnext = NULL; - pool->basepage = page_no & ~bt->mgr->poolmask; - pool->lru = 1; - - if( slot = bt->mgr->hash[idx] ) { - node = bt->mgr->pool + slot; - pool->hashnext = node; - node->hashprev = pool; - } - - bt->mgr->hash[idx] = pool->slot; -} - -// find best segment to evict from buffer pool - -BtPool *bt_findlru (BtDb *bt, uint hashslot) -{ -unsigned long long int target = ~0LL; -BtPool *pool = NULL, *node; - - if( !hashslot ) - return NULL; - - node = bt->mgr->pool + hashslot; - - // scan pool entries under hash table slot - - do { - if( node->pin ) - continue; - if( node->lru > target ) - continue; - target = node->lru; - pool = node; - } while( node = node->hashnext ); - - return pool; -} - -// map new buffer pool segment to virtual memory - -BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no) -{ -off64_t off = (page_no & ~bt->mgr->poolmask) << bt->mgr->page_bits; -off64_t limit = off + ((bt->mgr->poolmask+1) << bt->mgr->page_bits); -int flag; - -#ifdef unix - flag = PROT_READ | ( bt->mgr->mode == BT_ro ? 
0 : PROT_WRITE ); - pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); - if( pool->map == MAP_FAILED ) - return bt->err = BTERR_map; - // clear out madvise issued bits - memset (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8) / 8), 0, (bt->mgr->poolmask + 8)/8); -#else - flag = ( bt->mgr->mode == BT_ro ? PAGE_READONLY : PAGE_READWRITE ); - pool->hmap = CreateFileMapping(bt->mgr->idx, NULL, flag, (DWORD)(limit >> 32), (DWORD)limit, NULL); - if( !pool->hmap ) - return bt->err = BTERR_map; - - flag = ( bt->mgr->mode == BT_ro ? FILE_MAP_READ : FILE_MAP_WRITE ); - pool->map = MapViewOfFile(pool->hmap, flag, (DWORD)(off >> 32), (DWORD)off, (bt->mgr->poolmask+1) << bt->mgr->page_bits); - if( !pool->map ) - return bt->err = BTERR_map; -#endif - return bt->err = 0; -} - -// find or place requested page in segment-pool -// return pool table entry, incrementing pin - -BtPool *bt_pinpage(BtDb *bt, uid page_no) -{ -BtPool *pool, *node, *next; -uint slot, idx, victim; -BtLatchSet *set; - - // lock hash table chain - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); - - // look up in hash table - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleaseread (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // upgrade to write lock - - bt_spinreleaseread (&bt->mgr->latch[idx]); - bt_spinwritelock (&bt->mgr->latch[idx]); - - // try to find page in pool with write lock - - if( pool = bt_findpool(bt, page_no, idx) ) { -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - pool->lru++; - return pool; - } - - // allocate a new pool node - // and add to hash table - -#ifdef unix - slot = __sync_fetch_and_add(&bt->mgr->poolcnt, 1); -#else - slot 
= _InterlockedIncrement16 (&bt->mgr->poolcnt) - 1; -#endif - - if( ++slot < bt->mgr->poolmax ) { - pool = bt->mgr->pool + slot; - pool->slot = slot; - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); - return pool; - } - - // pool table is full - // find best pool entry to evict - -#ifdef unix - __sync_fetch_and_add(&bt->mgr->poolcnt, -1); -#else - _InterlockedDecrement16 (&bt->mgr->poolcnt); -#endif - - while( 1 ) { -#ifdef unix - victim = __sync_fetch_and_add(&bt->mgr->evicted, 1); -#else - victim = _InterlockedIncrement16 (&bt->mgr->evicted) - 1; -#endif - victim %= bt->mgr->hashsize; - - // try to get write lock - // skip entry if not obtained - - if( !bt_spinwritetry (&bt->mgr->latch[victim]) ) - continue; - - // if cache entry is empty - // or no slots are unpinned - // skip this entry - - if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_spinreleasewrite (&bt->mgr->latch[victim]); - continue; - } - - // unlink victim pool node from hash table - - if( node = pool->hashprev ) - node->hashnext = pool->hashnext; - else if( node = pool->hashnext ) - bt->mgr->hash[victim] = node->slot; - else - bt->mgr->hash[victim] = 0; - - if( node = pool->hashnext ) - node->hashprev = pool->hashprev; - - bt_spinreleasewrite (&bt->mgr->latch[victim]); - - // remove old file mapping -#ifdef unix - munmap (pool->map, (bt->mgr->poolmask+1) << bt->mgr->page_bits); -#else - FlushViewOfFile(pool->map, 0); - UnmapViewOfFile(pool->map); - CloseHandle(pool->hmap); -#endif - pool->map = NULL; - - // create new pool mapping - // and link into hash table - - if( bt_mapsegment(bt, pool, page_no) ) - return NULL; - - bt_linkhash(bt, pool, page_no, idx); -#ifdef unix - __sync_fetch_and_add(&pool->pin, 1); -#else - _InterlockedIncrement16 (&pool->pin); -#endif - bt_spinreleasewrite 
(&bt->mgr->latch[idx]); - return pool; - } -} - -// place write, read, or parent lock on requested page_no. -// pin to buffer pool and return latchset pointer - -BtLatchSet *bt_lockpage(BtDb *bt, uid page_no, BtLock mode, BtPage *pageptr, BtLatchSet *set) -{ -BtPool *pool; -uint subpage; -BtPage page; - - // find/create maping in pool table - // and pin our pool slot - - if( pool = bt_pinpage(bt, page_no) ) - subpage = (uint)(page_no & bt->mgr->poolmask); // page within mapping - else - return NULL; - - if( set ) -#ifdef unix - __sync_fetch_and_add(&set->pin, 1); -#else - _InterlockedIncrement16 (&set->pin); -#endif - else if( !(set = bt_bindlatch (bt, page_no, 1)) ) - return NULL; - - page = (BtPage)(pool->map + (subpage << bt->mgr->page_bits)); - -#ifdef unix - { - uint idx = subpage / 8; - uint bit = subpage % 8; - - if( mode == BtLockRead || mode == BtLockWrite ) - if( ~((bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] >> bit) & 1 ) { - madvise (page, bt->mgr->page_size, MADV_WILLNEED); - (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8)/8))[idx] |= 1 << bit; - } - } -#endif - - switch( mode ) { - case BtLockRead: - bt_spinreadlock (set->readwr); - break; - case BtLockWrite: - bt_spinwritelock (set->readwr); - break; - case BtLockAccess: - bt_spinreadlock (set->access); - break; - case BtLockDelete: - bt_spinwritelock (set->access); - break; - case BtLockParent: - bt_spinwritelock (set->parent); - break; - case BtLockPin: - break; - default: - return bt->err = BTERR_lock, NULL; - } - - if( pageptr ) - *pageptr = page; - - return set; -} - -// remove write, read, or parent lock on requested page_no. 
- -BTERR bt_unlockpage(BtDb *bt, uid page_no, BtLock mode, BtLatchSet *set) -{ -BtPool *pool; -uint idx; - - // since page is pinned - // it should still be in the buffer pool - // and is in no danger of being a victim for reuse - - idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); - - if( !(pool = bt_findpool(bt, page_no, idx)) ) - return bt->err = BTERR_hash; - - bt_spinreleaseread (&bt->mgr->latch[idx]); - - switch( mode ) { - case BtLockRead: - bt_spinreleaseread (set->readwr); - break; - case BtLockWrite: - bt_spinreleasewrite (set->readwr); - break; - case BtLockAccess: - bt_spinreleaseread (set->access); - break; - case BtLockDelete: - bt_spinreleasewrite (set->access); - break; - case BtLockParent: - bt_spinreleasewrite (set->parent); - break; - case BtLockPin: - break; - default: - return bt->err = BTERR_lock; - } - -#ifdef unix - __sync_fetch_and_add(&pool->pin, -1); - __sync_fetch_and_add (&set->pin, -1); -#else - _InterlockedDecrement16 (&pool->pin); - _InterlockedDecrement16 (&set->pin); -#endif - return bt->err = 0; -} - -// deallocate a deleted page -// place on free chain out of allocator page -// fence key must already be removed from parent - -BTERR bt_freepage(BtDb *bt, uid page_no, BtLatchSet *set) -{ - // obtain delete lock on deleted page - - if( !bt_lockpage(bt, page_no, BtLockDelete, NULL, set) ) - return bt->err; - - // obtain write lock on deleted page - - if( !bt_lockpage(bt, page_no, BtLockWrite, &bt->temp, set) ) - return bt->err; - - // lock allocation page - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // store free chain in allocation page second right - bt_putid(bt->temp->right, bt_getid(bt->mgr->latchmgr->alloc[1].right)); - bt_putid(bt->mgr->latchmgr->alloc[1].right, page_no); - - // unlock page zero - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - // remove write lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockWrite, set) ) - return bt->err; - - // 
remove delete lock on deleted node - - if( bt_unlockpage(bt, page_no, BtLockDelete, set) ) - return bt->err; - - return 0; -} - -// allocate a new page and write page into it - -uid bt_newpage(BtDb *bt, BtPage page) -{ -BtLatchSet *set; -uid new_page; -BtPage pmap; -int reuse; - - // lock allocation page - - bt_spinwritelock(bt->mgr->latchmgr->lock); - - // use empty chain first - // else allocate empty page - - if( new_page = bt_getid(bt->mgr->latchmgr->alloc[1].right) ) { - if( !(set = bt_lockpage (bt, new_page, BtLockWrite, &bt->temp, NULL)) ) - return 0; - bt_putid(bt->mgr->latchmgr->alloc[1].right, bt_getid(bt->temp->right)); - if( bt_unlockpage (bt, new_page, BtLockWrite, set) ) - return 0; - reuse = 1; - } else { - new_page = bt_getid(bt->mgr->latchmgr->alloc->right); - bt_putid(bt->mgr->latchmgr->alloc->right, new_page+1); - reuse = 0; - } -#ifdef unix - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - - // if writing first page of pool block, zero last page in the block - - if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) - { - // use zero buffer to write zeros - memset(bt->zero, 0, bt->mgr->page_size); - if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - } -#else - // bring new page into pool and copy page. - // this will extend the file into the new pages. 
- - if( !(set = bt_lockpage(bt, new_page, BtLockWrite, &pmap, NULL)) ) - return 0; - - memcpy(pmap, page, bt->mgr->page_size); - - if( bt_unlockpage (bt, new_page, BtLockWrite, set) ) - return 0; -#endif - // unlock allocation latch and return new page no - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - return new_page; -} - -// find slot in page for given key at a given level - -int bt_findslot (BtDb *bt, unsigned char *key, uint len) -{ -uint diff, higher = bt->page->cnt, low = 1, slot; - - // low is the lowest candidate, higher is already - // tested as .ge. the given key, loop ends when they meet - - while( diff = higher - low ) { - slot = low + ( diff >> 1 ); - if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) - low = slot + 1; - else - higher = slot; - } - - return higher; -} - -// find and load page at given level for given key -// leave page rd or wr locked as requested - -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, BtLock lock) -{ -uid page_no = ROOT_page, prevpage = 0; -BtLatchSet *set, *prevset; -uint drill = 0xff, slot; -uint mode, prevmode; - - bt->set = NULL; - - // start at root of btree and drill down - - do { - // determine lock mode of drill level - mode = (lock == BtLockWrite) && (drill == lvl) ? 
BtLockWrite : BtLockRead; - - bt->page_no = page_no; - - // obtain access lock using lock chaining with Access mode - - if( page_no > ROOT_page ) - if( !(bt->set = bt_lockpage(bt, page_no, BtLockAccess, NULL, NULL)) ) - return 0; - - // now unlock our (possibly foster) parent - - if( prevpage ) - if( bt_unlockpage(bt, prevpage, prevmode, prevset) ) - return 0; - else - prevpage = 0; - - // obtain read lock using lock chaining - // and pin page contents - - if( !(bt->set = bt_lockpage(bt, page_no, mode, &bt->page, bt->set)) ) - return 0; - - if( page_no > ROOT_page ) - if( bt_unlockpage(bt, page_no, BtLockAccess, bt->set) ) - return 0; - - // re-read and re-lock root after determining actual level of root - - if( bt->page_no == ROOT_page ) - if( bt->page->lvl != drill) { - drill = bt->page->lvl; - - if( lock == BtLockWrite && drill == lvl ) - if( bt_unlockpage(bt, page_no, mode, bt->set) ) - return 0; - else - continue; - } - - prevpage = bt->page_no; - prevset = bt->set; - prevmode = mode; - - // if page is being deleted, - // move back to preceeding page - - if( bt->page->kill ) { - page_no = bt_getid (bt->page->right); - continue; - } - - // find key on page at this level - // and descend to requested level - - slot = bt_findslot (bt, key, len); - - // is this slot a foster child? - - if( slot <= bt->page->cnt - bt->page->foster ) - if( drill == lvl ) - return slot; - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - goto slideright; - - if( slot <= bt->page->cnt - bt->page->foster ) - drill--; - - // continue down / right using overlapping locks - // to protect pages being killed or split. 
- - page_no = bt_getid(slotptr(bt->page, slot)->id); - continue; - -slideright: - page_no = bt_getid(bt->page->right); - - } while( page_no ); - - // return error on end of chain - - bt->err = BTERR_struct; - return 0; // return error -} - -// find and delete key on page by marking delete flag bit -// when page becomes empty, delete it from the btree - -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) -{ -unsigned char leftkey[256], rightkey[256]; -BtLatchSet *rset, *set; -uid page_no, right; -uint slot, tod; -BtKey ptr; - - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // if key is found delete it, otherwise ignore request - - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } - - // return if page is not empty, or it has no right sibling - - right = bt_getid(bt->page->right); - page_no = bt->page_no; - set = bt->set; - - if( !right || bt->page->act ) - return bt_unlockpage(bt, page_no, BtLockWrite, set); - - // obtain Parent lock over write lock - - if( !bt_lockpage(bt, page_no, BtLockParent, NULL, set) ) - return bt->err; - - // cache copy of key to delete - - ptr = keyptr(bt->page, bt->page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); - - // lock and map right page - - if( !(rset = bt_lockpage(bt, right, BtLockWrite, &bt->temp, NULL)) ) - return bt->err; - - // pull contents of next page into current empty page - memcpy (bt->page, bt->temp, bt->mgr->page_size); - - // cache copy of key to update - ptr = keyptr(bt->temp, bt->temp->cnt); - memcpy(rightkey, ptr, ptr->len + 1); - - // Mark right page as deleted and point it to left page - // until we can post updates at higher level. 
- - bt_putid(bt->temp->right, page_no); - bt->temp->kill = 1; - bt->temp->cnt = 0; - - if( bt_unlockpage(bt, right, BtLockWrite, rset) ) - return bt->err; - if( bt_unlockpage(bt, page_no, BtLockWrite, set) ) - return bt->err; - - // delete old lower key to consolidated node - - if( bt_deletekey (bt, leftkey + 1, *leftkey, lvl + 1) ) - return bt->err; - - // redirect higher key directly to consolidated node - - if( slot = bt_loadpage (bt, rightkey+1, *rightkey, lvl+1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // since key already exists, update id - - if( keycmp (ptr, rightkey+1, *rightkey) ) - return bt->err = BTERR_struct; - - slotptr(bt->page, slot)->dead = 0; - bt_putid(slotptr(bt->page,slot)->id, page_no); - - if( bt_unlockpage(bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; - - // obtain write lock and - // add right block to free chain - - if( bt_freepage (bt, right, rset) ) - return bt->err; - - // remove ParentModify lock - - if( bt_unlockpage(bt, page_no, BtLockParent, set) ) - return bt->err; - - return 0; -} - -// find key in leaf level and return row-id - -uid bt_findkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; -BtKey ptr; -uid id; - - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - ptr = keyptr(bt->page, slot); - else - return 0; - - // if key exists, return row-id - // otherwise return 0 - - if( ptr->len == len && !memcmp (ptr->key, key, len) ) - id = bt_getid(slotptr(bt->page,slot)->id); - else - id = 0; - - if( bt_unlockpage (bt, bt->page_no, BtLockRead, bt->set) ) - return 0; - - return id; -} - -// check page for space available, -// clean if necessary and return -// 0 - page needs splitting -// 1 - go ahead - -uint bt_cleanpage(BtDb *bt, uint amt) -{ -uint nxt = bt->mgr->page_size; -BtPage page = bt->page; -uint cnt = 0, idx = 0; -uint max = page->cnt; -BtKey key; - - if( page->min >= (max+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - // skip cleanup if nothing 
to reclaim - - if( !page->dirty ) - return 0; - - memcpy (bt->frame, page, bt->mgr->page_size); - - // skip page info and set rest of page to zero - - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - page->dirty = 0; - page->act = 0; - - // try cleaning up page first - - while( cnt++ < max ) { - // always leave fence key and foster children in list - if( cnt < max - page->foster && slotptr(bt->frame,cnt)->dead ) - continue; - - // copy key - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - - // copy slot - memcpy(slotptr(page, ++idx)->id, slotptr(bt->frame, cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - page->min = nxt; - page->cnt = idx; - - // see if page has enough space now, or does it need splitting? - - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) - return 1; - - return 0; -} - -// add key to current page -// page must already be writelocked - -void bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // calculate next available slot and copy key into page - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - // now insert key into array before slot - // preserving the fence slot - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; -} - -// split the root and raise the height of the btree -// call with current page locked and page no of 
foster child -// return with current page (root) unlocked - -BTERR bt_splitroot(BtDb *bt, uid right) -{ -uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -BtPage root = bt->page; -uid new_page; -BtKey key; - - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - root->act--; - root->cnt--; - root->foster--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page - - if( !(new_page = bt_newpage(bt, root)) ) - return bt->err; - - // preserve the page info at the bottom - // and set rest of the root to zero - - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); - - // insert left fence key on empty newroot page - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 1)->id, new_page); - slotptr(root, 1)->off = nxt; - - // insert stopper key on newroot page - // and increase the root height - - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); - slotptr(root, 2)->off = nxt; - - bt_putid(root->right, 0); - root->min = nxt; // reset lowest used offset and key count - root->cnt = 2; - root->act = 2; - root->lvl++; - - // release root (bt->page) - - return bt_unlockpage(bt, ROOT_page, BtLockWrite, bt->set); -} - -// split already locked full node -// in current page variables -// return unlocked. 
- -BTERR bt_splitpage (BtDb *bt) -{ -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtLatchSet *set = bt->set; -BtPage page = bt->page; -uint tod = time(NULL); -uint lvl = page->lvl; -uid new_page, right; -BtKey key; - - // initialize frame buffer - - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; - tod = (uint)time(NULL); - cnt = max / 2; - idx = 0; - - // split higher half of keys to bt->frame - // leaving foster children in the left node. - - while( cnt++ < max ) { - key = keyptr(page, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)bt->frame + nxt, key, key->len + 1); - memcpy(slotptr(bt->frame,++idx)->id, slotptr(page,cnt)->id, BtId); - slotptr(bt->frame, idx)->tod = slotptr(page, cnt)->tod; - slotptr(bt->frame, idx)->off = nxt; - bt->frame->act++; - } - - // transfer right link node - - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } - - bt->frame->bits = bt->mgr->page_bits; - bt->frame->min = nxt; - bt->frame->cnt = idx; - bt->frame->lvl = lvl; - - // get new free page and write frame to it. 
- - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; - - // remember fence key for new page to add - // as foster child - - key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); - - // update lower keys and foster children to continue in old page - - memcpy (bt->frame, page, bt->mgr->page_size); - memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); - nxt = bt->mgr->page_size; - page->act = 0; - cnt = 0; - idx = 0; - - // assemble page of smaller keys - // to remain in the old page - - while( cnt++ < max / 2 ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - // insert new foster child at beginning of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys if any - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - page->act++; - } - - page->min = nxt; - page->cnt = idx; - - // link new right page - - bt_putid (page->right, new_page); - - // if current page is the root page, split it - - if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); - - // keep our latch set - // release wr lock on our page - - if( !bt_lockpage (bt, page_no, BtLockPin, NULL, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite, 
set) ) - return bt->err; - - // obtain ParentModification lock for current page - // to fix fence key and highest foster child on page - - if( !bt_lockpage (bt, page_no, BtLockParent, NULL, set) ) - return bt->err; - - // get our highest foster child key to find in parent node - - if( !bt_lockpage (bt, page_no, BtLockRead, &page, set) ) - return bt->err; - - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - if( bt_unlockpage (bt, page_no, BtLockRead, set) ) - return bt->err; - - // update our parent -try_again: - - do { - slot = bt_loadpage (bt, fencekey + 1, *fencekey, lvl + 1, BtLockWrite); - - if( !slot ) - return bt->err; - - // check if parent page has enough space for any possible key - - if( bt_cleanpage (bt, 256) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } while( 1 ); - - // see if we are still a foster child from another node - - if( bt_getid (slotptr(bt->page, slot)->id) != page_no ) { - if( bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - goto try_again; - } - - // wait until readers from parent get their locks - // on our page - - if( !bt_lockpage (bt, page_no, BtLockDelete, NULL, set) ) - return bt->err; - - // lock our page for writing - - if( !bt_lockpage (bt, page_no, BtLockWrite, &page, set) ) - return bt->err; - - // switch parent fence key to foster child - - if( slotptr(page, page->cnt)->dead ) - slotptr(bt->page, slot)->dead = 1; - else - bt_putid (slotptr(bt->page, slot)->id, bt_getid(slotptr(page, page->cnt)->id)); - - // remove highest foster child from our page - - page->cnt--; - page->act--; - page->foster--; - page->dirty = 1; - key = keyptr(page, page->cnt); - - // add our new fence key for foster child to our parent - - bt_addkeytopage (bt, slot, key->key, key->len, page_no, tod); - - if( bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, 
BtLockDelete, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockWrite, set) ) - return bt->err; - - if( bt_unlockpage (bt, page_no, BtLockParent, set) ) - return bt->err; - - // release extra latch pin - - return bt_unlockpage (bt, page_no, BtLockPin, set); -} - -// Insert new key into the btree at leaf level. - -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod) -{ -uint slot, idx; -BtPage page; -BtKey ptr; - - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } - - // if key already exists, update id and return - - page = bt->page; - - if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - return bt_unlockpage(bt, bt->page_no, BtLockWrite, bt->set); - } - - // check if page has enough space - - if( bt_cleanpage (bt, len) ) - break; - - if( bt_splitpage (bt) ) - return bt->err; - } - - bt_addkeytopage (bt, slot, key, len, id, tod); - - return bt_unlockpage (bt, bt->page_no, BtLockWrite, bt->set); -} - -// cache page of keys into cursor and return starting slot for given key - -uint bt_startkey (BtDb *bt, unsigned char *key, uint len) -{ -uint slot; - - // cache page for retrieval - if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - if ( bt_unlockpage(bt, bt->page_no, BtLockRead, bt->set) ) - return 0; - - return slot; -} - -// return next slot for cursor page -// or slide cursor right into next page - -uint bt_nextkey (BtDb *bt, uint slot) -{ -BtLatchSet *rset; -BtPage page; -uid right; - - do { - right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) - if( slotptr(bt->cursor,slot)->dead ) - continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) - return slot; 
- else - break; - - if( !right ) - break; - - bt->cursor_page = right; - - if( !(bt->set = bt_lockpage(bt, right, BtLockRead, &page, NULL)) ) - return 0; - - memcpy (bt->cursor, page, bt->mgr->page_size); - - if ( bt_unlockpage(bt, right, BtLockRead, bt->set) ) - return 0; - - slot = 0; - } while( 1 ); - - return bt->err = 0; -} - -BtKey bt_key(BtDb *bt, uint slot) -{ - return keyptr(bt->cursor, slot); -} - -uid bt_uid(BtDb *bt, uint slot) -{ - return bt_getid(slotptr(bt->cursor,slot)->id); -} - -uint bt_tod(BtDb *bt, uint slot) -{ - return slotptr(bt->cursor,slot)->tod; -} - - -#ifdef STANDALONE - -typedef struct { - char type, idx; - char *infile; - BtMgr *mgr; - int num; -} ThreadArg; - -// standalone program to index file of keys -// then list them onto std-out - -#ifdef unix -void *index_file (void *arg) -#else -uint __stdcall index_file (void *arg) -#endif -{ -int line = 0, found = 0, cnt = 0; -uid next, page_no = LEAF_page; // start on first page of leaves -unsigned char key[256]; -ThreadArg *args = arg; -int ch, len = 0, slot; -time_t tod[1]; -BtPage page; -BtKey ptr; -BtDb *bt; -FILE *in; - - bt = bt_open (args->mgr); - time (tod); - - switch(args->type | 0x20) - { - case 'w': - fprintf(stderr, "started indexing for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_insertkey (bt, key, len, line, *tod) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys\n", args->infile, line); - break; - - case 'd': - fprintf(stderr, "started deleting keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - 
line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_deletekey (bt, key, len, 0) ) - fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for keys, %d \n", args->infile, line); - break; - - case 'f': - fprintf(stderr, "started finding keys for %s\n", args->infile); - if( in = fopen (args->infile, "rb") ) - while( ch = getc(in), ch != EOF ) - if( ch == '\n' ) - { - line++; - if( args->num == 1 ) - sprintf((char *)key+len, "%.9d", 1000000000 - line), len += 9; - - else if( args->num ) - sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - - if( bt_findkey (bt, key, len) ) - found++; - else if( bt->err ) - fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0); - len = 0; - } - else if( len < 255 ) - key[len++] = ch; - fprintf(stderr, "finished %s for %d keys, found %d\n", args->infile, line, found); - break; - - case 's': - len = key[0] = 0; - - fprintf(stderr, "started reading\n"); - - if( slot = bt_startkey (bt, key, len) ) - slot--; - else - fprintf(stderr, "Error %d in StartKey. 
Syserror: %d\n", bt->err, errno), exit(0); - - while( slot = bt_nextkey (bt, slot) ) { - ptr = bt_key(bt, slot); - fwrite (ptr->key, ptr->len, 1, stdout); - fputc ('\n', stdout); - } - - break; - - case 'c': - fprintf(stderr, "started reading\n"); - - do { - bt->set = bt_lockpage (bt, page_no, BtLockRead, &page, NULL); - cnt += page->act; - next = bt_getid (page->right); - bt_unlockpage (bt, page_no, BtLockRead, bt->set); - } while( page_no = next ); - - cnt--; // remove stopper key - fprintf(stderr, " Total keys read %d\n", cnt); - break; - } - - bt_close (bt); -#ifdef unix - return NULL; -#else - return 0; -#endif -} - -typedef struct timeval timer; - -int main (int argc, char **argv) -{ -int idx, cnt, len, slot, err; -int segsize, bits = 16; -#ifdef unix -pthread_t *threads; -timer start, stop; -#else -time_t start[1], stop[1]; -HANDLE *threads; -#endif -double real_time; -ThreadArg *args; -uint poolsize = 0; -int num = 0; -char key[1]; -BtMgr *mgr; -BtKey ptr; -BtDb *bt; - - if( argc < 3 ) { - fprintf (stderr, "Usage: %s idx_file Read/Write/Scan/Delete/Find [page_bits mapped_segments seg_bits line_numbers src_file1 src_file2 ... 
]\n", argv[0]); - fprintf (stderr, " where page_bits is the page size in bits\n"); - fprintf (stderr, " mapped_segments is the number of mmap segments in buffer pool\n"); - fprintf (stderr, " seg_bits is the size of individual segments in buffer pool in pages in bits\n"); - fprintf (stderr, " line_numbers = 1 to append line numbers to keys\n"); - fprintf (stderr, " src_file1 thru src_filen are files of keys separated by newline\n"); - exit(0); - } - -#ifdef unix - gettimeofday(&start, NULL); -#else - time(start); -#endif - - if( argc > 3 ) - bits = atoi(argv[3]); - - if( argc > 4 ) - poolsize = atoi(argv[4]); - - if( !poolsize ) - fprintf (stderr, "Warning: no mapped_pool\n"); - - if( poolsize > 65535 ) - fprintf (stderr, "Warning: mapped_pool > 65535 segments\n"); - - if( argc > 5 ) - segsize = atoi(argv[5]); - else - segsize = 4; // 16 pages per mmap segment - - if( argc > 6 ) - num = atoi(argv[6]); - - cnt = argc - 7; -#ifdef unix - threads = malloc (cnt * sizeof(pthread_t)); -#else - threads = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, cnt * sizeof(HANDLE)); -#endif - args = malloc (cnt * sizeof(ThreadArg)); - - mgr = bt_mgr ((argv[1]), BT_rw, bits, poolsize, segsize, poolsize / 8); - - if( !mgr ) { - fprintf(stderr, "Index Open Error %s\n", argv[1]); - exit (1); - } - - // fire off threads - - for( idx = 0; idx < cnt; idx++ ) { - args[idx].infile = argv[idx + 7]; - args[idx].type = argv[2][0]; - args[idx].mgr = mgr; - args[idx].num = num; - args[idx].idx = idx; -#ifdef unix - if( err = pthread_create (threads + idx, NULL, index_file, args + idx) ) - fprintf(stderr, "Error creating thread %d\n", err); -#else - threads[idx] = (HANDLE)_beginthreadex(NULL, 65536, index_file, args + idx, 0, NULL); -#endif - } - - // wait for termination - -#ifdef unix - for( idx = 0; idx < cnt; idx++ ) - pthread_join (threads[idx], NULL); - gettimeofday(&stop, NULL); - real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec ); -#else - 
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE); - - for( idx = 0; idx < cnt; idx++ ) - CloseHandle(threads[idx]); - - time (stop); - real_time = 1000 * (*stop - *start); -#endif - fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000); - bt_mgrclose (mgr); -} - -#endif //STANDALONE diff --git a/fosterbtreef2.c b/threads2j.c similarity index 73% rename from fosterbtreef2.c rename to threads2j.c index e4592d2..d7cb994 100644 --- a/fosterbtreef2.c +++ b/threads2j.c @@ -1,5 +1,5 @@ -// foster btree version f2 -// 18 JAN 2014 +// btree version threads2j linux futex concurrency version +// 24 JAN 2014 // author: karl malbrain, malbrain@cal.berkeley.edu @@ -25,6 +25,8 @@ REDISTRIBUTION OF THIS SOFTWARE. #ifdef linux #define _GNU_SOURCE +#include +#define SYS_futex 202 #endif #ifdef unix @@ -36,6 +38,7 @@ REDISTRIBUTION OF THIS SOFTWARE. #include #include #include +#include #else #define WIN32_LEAN_AND_MEAN #include @@ -74,7 +77,7 @@ There are five lock types for each node in three independent sets: 2. (set 1) NodeDelete: Exclusive. About to release the node. Incompatible with AccessIntent. 3. (set 2) ReadLock: Sharable. Read the node. Incompatible with WriteLock. 4. (set 2) WriteLock: Exclusive. Modify the node. Incompatible with ReadLock and other WriteLocks. -5. (set 3) ParentLock: Exclusive. Have parent adopt/delete maximum foster child from the node. +5. (set 3) ParentModification: Exclusive. Change the node's parent keys. Incompatible with another ParentModification. 
*/ typedef enum{ @@ -83,7 +86,33 @@ typedef enum{ BtLockRead, BtLockWrite, BtLockParent -}BtLock; +} BtLock; + +// mode & definition for latch implementation + +enum { + Mutex = 1 << 0, // the mutex bit + Write = 1 << 1, // the writers bit + Share = 1 << 2, // reader count + PendRd = 1 << 12, // reader contended count + PendWr = 1 << 22 // writer contended count +} LockMode; + +enum { + QueRd = 1, // reader queue + QueWr = 2 // writer queue +} RWQueue; + +// share is count of read accessors +// grant write lock when share == 0 + +typedef struct { + volatile uint mutex:1; // 1 = busy + volatile uint write:1; // 1 = exclusive + volatile uint share:10; // count of readers holding locks + volatile uint readwait:10; // count of readers waiting + volatile uint writewait:10; // count of writers waiting +} BtLatch; // Define the length of the page and key pointers @@ -121,50 +150,30 @@ typedef struct { // by the BtSlot array of keys. typedef struct Page { - volatile uint cnt; // count of keys in page - volatile uint act; // count of active keys - volatile uint min; // next key offset - volatile uint foster; // count of foster children + uint cnt; // count of keys in page + uint act; // count of active keys + uint min; // next key offset unsigned char bits; // page size in bits - unsigned char lvl:7; // level of page - unsigned char dirty:1; // page needs to be cleaned + unsigned char lvl:6; // level of page + unsigned char kill:1; // page is being deleted + unsigned char dirty:1; // page has deleted keys unsigned char right[BtId]; // page number to right } *BtPage; -// mode & definition for hash latch implementation - -enum { - Mutex = 1, - Write = 2, - Pending = 4, - Share = 8 -} LockMode; - -// mutex locks the other fields -// exclusive is set for write access -// share is count of read accessors - -typedef struct { - volatile ushort mutex:1; - volatile ushort exclusive:1; - volatile ushort pending:1; - volatile ushort share:13; -} BtSpinLatch; - // hash table entries 
typedef struct { - BtSpinLatch latch[1]; + BtLatch latch[1]; volatile ushort slot; // Latch table entry at head of chain } BtHashEntry; // latch manager table structure typedef struct { - BtSpinLatch readwr[1]; // read/write page lock - BtSpinLatch access[1]; // Access Intent/Page delete - BtSpinLatch parent[1]; // adoption of foster children - BtSpinLatch busy[1]; // slot is being moved between chains + BtLatch readwr[1]; // read/write page lock + BtLatch access[1]; // Access Intent/Page delete + BtLatch parent[1]; // adoption of foster children + BtLatch busy[1]; // slot is being moved between chains volatile ushort next; // next entry in hash table chain volatile ushort prev; // prev entry in hash table chain volatile ushort pin; // number of outstanding locks @@ -178,8 +187,8 @@ typedef struct { unsigned long long int lru; // number of times accessed uid basepage; // mapped base page number char *map; // mapped memory pointer - ushort pin; // mapped page pin counter ushort slot; // slot index in this array + ushort pin; // mapped page pin counter void *hashprev; // previous pool entry for the same hash idx void *hashnext; // next pool entry for the same hash idx #ifndef unix @@ -191,7 +200,7 @@ typedef struct { typedef struct { struct Page alloc[2]; // next & free page_nos in right ptr - BtSpinLatch lock[1]; // allocation area lite latch + BtLatch lock[1]; // allocation area lite latch ushort latchdeployed; // highest number of latch entries deployed ushort nlatchpage; // number of latch pages at BT_latch ushort latchtotal; // number of page latch entries @@ -208,19 +217,19 @@ typedef struct { uint seg_bits; // seg size in pages in bits uint mode; // read-write mode #ifdef unix - int idx; char *pooladvise; // bit maps for pool page advisements + int idx; #else HANDLE idx; #endif ushort poolcnt; // highest page pool node in use ushort poolmax; // highest page pool node allocated ushort poolmask; // total number of pages in mmap segment - 1 - ushort hashsize; // 
size of Hash Table for pool entries ushort evicted; // last evicted hash table slot - ushort *hash; // hash table of pool entries + ushort hashsize; // size of Hash Table for pool entries + ushort *hash; // pool index for hash entries BtPool *pool; // memory pool page segments - BtSpinLatch *latch; // latches for pool hash slots + BtLatch *latch; // latches for pool hash slots BtLatchMgr *latchmgr; // mapped latch page from allocation page BtLatchSet *latchsets; // mapped latch set from latch pages #ifndef unix @@ -232,11 +241,11 @@ typedef struct { BtMgr *mgr; // buffer manager for thread BtPage cursor; // cached frame for start/next (never mapped) BtPage frame; // spare frame for the page split (never mapped) - BtPage zero; // page frame for zeroes at end of file - BtPage page; // current page + BtPage zero; // page of zeroes to extend the file (never mapped) + BtPage page; // current page mapped from file uid page_no; // current page number uid cursor_page; // current cursor page number - BtLatchSet *set; // current page latch set + BtLatchSet *set; // current page latchset BtPool *pool; // current page pool unsigned char *mem; // frame, cursor, page memory buffer int found; // last delete was found @@ -257,8 +266,8 @@ typedef enum { // B-Tree functions extern void bt_close (BtDb *bt); extern BtDb *bt_open (BtMgr *mgr); -extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod, uint lvl); -extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len); +extern BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, uid id, uint tod); +extern BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl); extern uid bt_findkey (BtDb *bt, unsigned char *key, uint len); extern uint bt_startkey (BtDb *bt, unsigned char *key, uint len); extern uint bt_nextkey (BtDb *bt, uint slot); @@ -267,7 +276,7 @@ extern uint bt_nextkey (BtDb *bt, uint slot); extern BtMgr *bt_mgr (char *name, uint mode, uint bits, uint poolsize, 
uint segsize, uint hashsize); void bt_mgrclose (BtMgr *mgr); -// Helper functions to return cursor slot values +// Helper functions to return slot values extern BtKey bt_key (BtDb *bt, uint slot); extern uid bt_uid (BtDb *bt, uint slot); @@ -298,6 +307,10 @@ extern uint bt_tod (BtDb *bt, uint slot); // The first leaf page of level zero is always // located on page 2. +// The b-tree pages are linked with next +// pointers to facilitate enumerators, +// and provide for concurrency. + // When to root page fills, it is split in two and // the tree height is raised by a new root at page // one with two keys. @@ -306,20 +319,22 @@ extern uint bt_tod (BtDb *bt, uint slot); // page cleanup The fence key for a node is always // present, even after deletion and cleanup. -// Groups of pages called segments from the btree are -// cached with memory mapping. A hash table is used to keep +// Groups of pages called segments from the btree are optionally +// cached with a memory mapped pool. A hash table is used to keep // track of the cached segments. This behaviour is controlled // by the cache block size parameter to bt_open. // To achieve maximum concurrency one page is locked at a time -// as the tree is traversed to find leaf key in question. - -// An adoption traversal leaves the parent node locked as the -// tree is traversed to the level in quesiton. +// as the tree is traversed to find leaf key in question. The right +// page numbers are used in cases where the page is being split, +// or consolidated. // Page 0 is dedicated to lock for new page extensions, // and chains empty pages together for reuse. +// The ParentModification lock on a node is obtained to prevent resplitting +// or deleting a node before its fence is posted into its upper level. + // Empty pages are chained together through the ALLOC page and reused. 
// Access macros to address slot and key values from the page @@ -346,76 +361,83 @@ int i; return id; } +// Latch Manager + +int sys_futex(void *addr1, int op, int val1, struct timespec *timeout, void *addr2, int val3) +{ + return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3); +} + // wait until write lock mode is clear // and add 1 to the share count -void bt_spinreadlock(BtSpinLatch *latch) +void bt_spinreadlock(BtLatch *latch, int private) { -ushort prev; +uint prev; - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex) & Mutex ) + if( private ) + private = FUTEX_PRIVATE_FLAG; + + while( 1 ) { + // obtain latch mutex + if( __sync_fetch_and_or((uint *)latch, Mutex) & Mutex ) { sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex) & Mutex ) - SwitchToThread(); -#endif + continue; + } - // see if exclusive request is granted or pending + // wait for writers to clear + // increment read waiters and wait - if( prev = !(latch->exclusive | latch->pending) ) -#ifdef unix - __sync_fetch_and_add((ushort *)latch, Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, Share); -#endif + if( latch->write || latch->writewait ) { + __sync_fetch_and_add ((uint *)latch, PendRd); + prev = __sync_fetch_and_and ((uint *)latch, ~Mutex) & ~Mutex; + sys_futex( (uint *)latch, FUTEX_WAIT_BITSET | private, prev, NULL, NULL, QueRd ); + __sync_fetch_and_sub ((uint *)latch, PendRd); + continue; + } + + // increment reader lock count + // and release latch mutex -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - if( prev ) - return; -#ifdef unix - } while( sched_yield(), 1 ); -#else - } while( SwitchToThread(), 1 ); -#endif + __sync_fetch_and_add ((uint *)latch, Share); + __sync_fetch_and_and ((uint *)latch, ~Mutex); + return; + } } // wait for other read and write latches to relinquish -void bt_spinwritelock(BtSpinLatch *latch) +void bt_spinwritelock(BtLatch 
*latch, int private) { - do { -#ifdef unix - while( __sync_fetch_and_or((ushort *)latch, Mutex | Pending) & Mutex ) +uint prev; + + if( private ) + private = FUTEX_PRIVATE_FLAG; + + while( 1 ) { + // obtain latch mutex + if( __sync_fetch_and_or((uint *)latch, Mutex) & Mutex ) { sched_yield(); -#else - while( _InterlockedOr16((ushort *)latch, Mutex | Pending) & Mutex ) - SwitchToThread(); -#endif - if( !(latch->share | latch->exclusive) ) { -#ifdef unix - __sync_fetch_and_or((ushort *)latch, Write); - __sync_fetch_and_and ((ushort *)latch, ~(Mutex | Pending)); -#else - _InterlockedOr16((ushort *)latch, Write); - _InterlockedAnd16((ushort *)latch, ~(Mutex | Pending)); -#endif - return; + continue; } -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); - sched_yield(); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); - SwitchToThread(); -#endif - } while( 1 ); + // wait for write and reader count to clear + + if( latch->write || latch->share ) { + __sync_fetch_and_add ((uint *)latch, PendWr); + prev = __sync_fetch_and_and ((uint *)latch, ~Mutex) & ~Mutex; + sys_futex( (uint *)latch, FUTEX_WAIT_BITSET | private, prev, NULL, NULL, QueWr ); + __sync_fetch_and_sub ((uint *)latch, PendWr); + continue; + } + + // take write mutex + // release latch mutex + + __sync_fetch_and_or ((uint *)latch, Write); + __sync_fetch_and_and ((uint *)latch, ~Mutex); + return; + } } // try to obtain write lock @@ -423,54 +445,81 @@ void bt_spinwritelock(BtSpinLatch *latch) // return 1 if obtained, // 0 otherwise -int bt_spinwritetry(BtSpinLatch *latch) +int bt_spinwritetry(BtLatch *latch) { -ushort prev; +int ans; -#ifdef unix - if( prev = __sync_fetch_and_or((ushort *)latch, Mutex), prev & Mutex ) - return 0; -#else - if( prev = _InterlockedOr16((ushort *)latch, Mutex), prev & Mutex ) + // try for mutex, + // abandon request if not taken + + if( __sync_fetch_and_or((uint *)latch, Mutex) & Mutex ) return 0; -#endif - // take write access if all bits are clear - if( !prev ) 
-#ifdef unix - __sync_fetch_and_or ((ushort *)latch, Write); -#else - _InterlockedOr16((ushort *)latch, Write); -#endif + // see if write mode is available -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Mutex); -#else - _InterlockedAnd16((ushort *)latch, ~Mutex); -#endif - return !prev; + if( !latch->write && !latch->share ) { + __sync_fetch_and_or ((uint *)latch, Write); + ans = 1; + } else + ans = 0; + + // release latch mutex + + __sync_fetch_and_and ((uint *)latch, ~Mutex); + return ans; } -// clear write mode +// clear write lock -void bt_spinreleasewrite(BtSpinLatch *latch) +void bt_spinreleasewrite(BtLatch *latch, int private) { -#ifdef unix - __sync_fetch_and_and ((ushort *)latch, ~Write); -#else - _InterlockedAnd16((ushort *)latch, ~Write); -#endif + if( private ) + private = FUTEX_PRIVATE_FLAG; + + // obtain latch mutex + + while( __sync_fetch_and_or((uint *)latch, Mutex) & Mutex ) + sched_yield(); + + __sync_fetch_and_and ((uint *)latch, ~Write); + + // favor writers + + if( latch->writewait ) + if( sys_futex( (uint *)latch, FUTEX_WAKE_BITSET | private, 1, NULL, NULL, QueWr ) ) + goto wakexit; + + if( latch->readwait ) + sys_futex( (uint *)latch, FUTEX_WAKE_BITSET | private, INT_MAX, NULL, NULL, QueRd ); + + // release latch mutex + +wakexit: + __sync_fetch_and_and ((uint *)latch, ~Mutex); } // decrement reader count -void bt_spinreleaseread(BtSpinLatch *latch) +void bt_spinreleaseread(BtLatch *latch, int private) { -#ifdef unix - __sync_fetch_and_add((ushort *)latch, -Share); -#else - _InterlockedExchangeAdd16 ((ushort *)latch, -Share); -#endif + if( private ) + private = FUTEX_PRIVATE_FLAG; + + // obtain latch mutex + + while( __sync_fetch_and_or((uint *)latch, Mutex) & Mutex ) + sched_yield(); + + __sync_fetch_and_sub ((uint *)latch, Share); + + // wake waiting writers + + if( !latch->share && latch->writewait ) + sys_futex( (uint *)latch, FUTEX_WAKE_BITSET | private, 1, NULL, NULL, QueWr ); + + // release latch mutex + + __sync_fetch_and_and 
((uint *)latch, ~Mutex); } // link latch table entry into latch hash table @@ -510,7 +559,7 @@ BtLatchSet *set; // obtain read lock on hash table entry - bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch); + bt_spinreadlock(bt->mgr->latchmgr->table[hashidx].latch, 0); if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do { @@ -527,14 +576,14 @@ BtLatchSet *set; #endif } - bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch); + bt_spinreleaseread (bt->mgr->latchmgr->table[hashidx].latch, 0); if( slot ) return set; // try again, this time with write lock - bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch); + bt_spinwritelock(bt->mgr->latchmgr->table[hashidx].latch, 0); if( slot = bt->mgr->latchmgr->table[hashidx].slot ) do { @@ -555,7 +604,7 @@ BtLatchSet *set; _InterlockedIncrement16 (&set->pin); #endif set->page_no = page_no; - bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch); + bt_spinreleasewrite(bt->mgr->latchmgr->table[hashidx].latch, 0); return set; } @@ -574,7 +623,7 @@ BtLatchSet *set; _InterlockedIncrement16 (&set->pin); #endif bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); + bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch, 0); return set; } @@ -611,13 +660,13 @@ BtLatchSet *set; // or has outstanding locks if( !bt_spinwritetry (bt->mgr->latchmgr->table[idx].latch) ) { - bt_spinreleasewrite (set->busy); + bt_spinreleasewrite (set->busy, 0); continue; } if( set->pin ) { - bt_spinreleasewrite (set->busy); - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); + bt_spinreleasewrite (set->busy, 0); + bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch, 0); continue; } @@ -631,15 +680,15 @@ BtLatchSet *set; if( set->next ) bt->mgr->latchsets[set->next].prev = set->prev; - bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch); + bt_spinreleasewrite (bt->mgr->latchmgr->table[idx].latch, 0); #ifdef unix __sync_fetch_and_add(&set->pin, 1); 
#else _InterlockedIncrement16 (&set->pin); #endif bt_latchlink (bt, hashidx, victim, page_no); - bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch); - bt_spinreleasewrite (set->busy); + bt_spinreleasewrite (bt->mgr->latchmgr->table[hashidx].latch, 0); + bt_spinreleasewrite (set->busy, 0); return set; } } @@ -668,7 +717,7 @@ uint slot; #ifdef unix munmap (mgr->latchsets, mgr->latchmgr->nlatchpage * mgr->page_size); - munmap (mgr->latchmgr, mgr->page_size); + munmap (mgr->latchmgr, mgr->page_size); #else FlushViewOfFile(mgr->latchmgr, 0); UnmapViewOfFile(mgr->latchmgr); @@ -720,7 +769,6 @@ uint amt[1]; BtMgr* mgr; BtKey key; int flag; - #ifndef unix SYSTEM_INFO sysinfo[1]; #endif @@ -737,7 +785,6 @@ SYSTEM_INFO sysinfo[1]; #ifdef unix mgr = calloc (1, sizeof(BtMgr)); - mgr->idx = open ((char*)name, O_RDWR | O_CREAT, 0666); if( mgr->idx == -1 ) @@ -770,7 +817,7 @@ SYSTEM_INFO sysinfo[1]; else return free(mgr), free(latchmgr), NULL; } else if( mode == BT_ro ) - return free(latchmgr), bt_mgrclose (mgr), NULL; + return free(latchmgr), free (mgr), NULL; #else latchmgr = VirtualAlloc(NULL, BT_maxpage, MEM_COMMIT, PAGE_READWRITE); size = GetFileSize(mgr->idx, amt); @@ -811,12 +858,12 @@ SYSTEM_INFO sysinfo[1]; #ifdef unix mgr->pool = calloc (poolmax, sizeof(BtPool)); mgr->hash = calloc (hashsize, sizeof(ushort)); - mgr->latch = calloc (hashsize, sizeof(BtSpinLatch)); + mgr->latch = calloc (hashsize, sizeof(BtLatch)); mgr->pooladvise = calloc (poolmax, (mgr->poolmask + 8) / 8); #else mgr->pool = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, poolmax * sizeof(BtPool)); mgr->hash = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(ushort)); - mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtSpinLatch)); + mgr->latch = GlobalAlloc (GMEM_FIXED|GMEM_ZEROINIT, hashsize * sizeof(BtLatch)); #endif if( size || *amt ) @@ -900,6 +947,27 @@ SYSTEM_INFO sysinfo[1]; last++; } + // create empty page area by writing last page of first + // segment area 
(other pages are zeroed by O/S) + + if( mgr->poolmask ) { + memset(latchmgr, 0, mgr->page_size); + last = mgr->poolmask; + + while( last < MIN_lvl + 1 ) + last += mgr->poolmask + 1; + +#ifdef unix + pwrite(mgr->idx, latchmgr, mgr->page_size, last << mgr->page_bits); +#else + SetFilePointer (mgr->idx, last << mgr->page_bits, NULL, FILE_BEGIN); + if( !WriteFile (mgr->idx, (char *)latchmgr, mgr->page_size, amt, NULL) ) + return bt_mgrclose (mgr), NULL; + if( *amt < mgr->page_size ) + return bt_mgrclose (mgr), NULL; +#endif + } + mgrlatch: #ifdef unix flag = PROT_READ | PROT_WRITE; @@ -949,7 +1017,7 @@ BtDb *bt = malloc (sizeof(*bt)); bt->zero = (BtPage)(bt->mem + 1 * mgr->page_size); bt->cursor = (BtPage)(bt->mem + 2 * mgr->page_size); - memset(bt->zero, 0, mgr->page_size); + memset (bt->zero, 0, mgr->page_size); return bt; } @@ -1034,7 +1102,7 @@ BtPool *pool = NULL, *node; node = bt->mgr->pool + hashslot; - // scan pool entries under hash table slot + // scan pool entries under hash table slot do { if( node->pin ) @@ -1061,6 +1129,7 @@ int flag; pool->map = mmap (0, (bt->mgr->poolmask+1) << bt->mgr->page_bits, flag, MAP_SHARED, bt->mgr->idx, off); if( pool->map == MAP_FAILED ) return bt->err = BTERR_map; + // clear out madvise issued bits memset (bt->mgr->pooladvise + pool->slot * ((bt->mgr->poolmask + 8) / 8), 0, (bt->mgr->poolmask + 8)/8); #else @@ -1117,12 +1186,11 @@ BtPool *bt_pinpool(BtDb *bt, uid page_no) { BtPool *pool, *node, *next; uint slot, idx, victim; -BtLatchSet *set; // lock hash table chain idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize; - bt_spinreadlock (&bt->mgr->latch[idx]); + bt_spinreadlock (&bt->mgr->latch[idx], 1); // look up in hash table @@ -1132,15 +1200,15 @@ BtLatchSet *set; #else _InterlockedIncrement16 (&pool->pin); #endif - bt_spinreleaseread (&bt->mgr->latch[idx]); + bt_spinreleaseread (&bt->mgr->latch[idx], 1); pool->lru++; return pool; } // upgrade to write lock - bt_spinreleaseread (&bt->mgr->latch[idx]); - 
bt_spinwritelock (&bt->mgr->latch[idx]); + bt_spinreleaseread (&bt->mgr->latch[idx], 1); + bt_spinwritelock (&bt->mgr->latch[idx], 1); // try to find page in pool with write lock @@ -1150,7 +1218,7 @@ BtLatchSet *set; #else _InterlockedIncrement16 (&pool->pin); #endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); + bt_spinreleasewrite (&bt->mgr->latch[idx], 1); pool->lru++; return pool; } @@ -1177,7 +1245,7 @@ BtLatchSet *set; #else _InterlockedIncrement16 (&pool->pin); #endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); + bt_spinreleasewrite (&bt->mgr->latch[idx], 1); return pool; } @@ -1204,12 +1272,12 @@ BtLatchSet *set; if( !bt_spinwritetry (&bt->mgr->latch[victim]) ) continue; - // if cache entry is empty - // or no slots are unpinned + // if pool entry is empty + // or any pages are pinned // skip this entry if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) { - bt_spinreleasewrite (&bt->mgr->latch[victim]); + bt_spinreleasewrite (&bt->mgr->latch[victim], 1); continue; } @@ -1225,7 +1293,7 @@ BtLatchSet *set; if( node = pool->hashnext ) node->hashprev = pool->hashprev; - bt_spinreleasewrite (&bt->mgr->latch[victim]); + bt_spinreleasewrite (&bt->mgr->latch[victim], 1); // remove old file mapping #ifdef unix @@ -1249,54 +1317,54 @@ BtLatchSet *set; #else _InterlockedIncrement16 (&pool->pin); #endif - bt_spinreleasewrite (&bt->mgr->latch[idx]); + bt_spinreleasewrite (&bt->mgr->latch[idx], 1); return pool; } } // place write, read, or parent lock on requested page_no. 
-// pin to buffer pool and return latchset pointer +// pin to buffer pool and return page pointer void bt_lockpage(BtLock mode, BtLatchSet *set) { switch( mode ) { case BtLockRead: - bt_spinreadlock (set->readwr); + bt_spinreadlock (set->readwr, 0); break; case BtLockWrite: - bt_spinwritelock (set->readwr); + bt_spinwritelock (set->readwr, 0); break; case BtLockAccess: - bt_spinreadlock (set->access); + bt_spinreadlock (set->access, 0); break; case BtLockDelete: - bt_spinwritelock (set->access); + bt_spinwritelock (set->access, 0); break; case BtLockParent: - bt_spinwritelock (set->parent); + bt_spinwritelock (set->parent, 0); break; } } -// remove write, read, or parent lock on requested page_no. +// remove write, read, or parent lock on requested page void bt_unlockpage(BtLock mode, BtLatchSet *set) { switch( mode ) { case BtLockRead: - bt_spinreleaseread (set->readwr); + bt_spinreleaseread (set->readwr, 0); break; case BtLockWrite: - bt_spinreleasewrite (set->readwr); + bt_spinreleasewrite (set->readwr, 0); break; case BtLockAccess: - bt_spinreleaseread (set->access); + bt_spinreleaseread (set->access, 0); break; case BtLockDelete: - bt_spinreleasewrite (set->access); + bt_spinreleasewrite (set->access, 0); break; case BtLockParent: - bt_spinreleasewrite (set->parent); + bt_spinreleasewrite (set->parent, 0); break; } } @@ -1313,7 +1381,7 @@ int reuse; // lock allocation page - bt_spinwritelock(bt->mgr->latchmgr->lock); + bt_spinwritelock(bt->mgr->latchmgr->lock, 0); // use empty chain first // else allocate empty page @@ -1332,30 +1400,21 @@ int reuse; reuse = 0; } #ifdef unix + if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) + return bt->err = BTERR_wrt, 0; + // if writing first page of pool block, zero last page in the block if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 ) { // use zero buffer to write zeros + memset(bt->zero, 0, bt->mgr->page_size); if ( 
pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size ) return bt->err = BTERR_wrt, 0; } - - // unlock allocation latch - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - - if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size ) - return bt->err = BTERR_wrt, 0; - #else - // unlock allocation latch - - bt_spinreleasewrite(bt->mgr->latchmgr->lock); - // bring new page into pool and copy page. // this will extend the file into the new pages. - // NB -- no latch required if( pool = bt_pinpool (bt, new_page) ) pmap = bt_page (bt, pool, new_page); @@ -1365,6 +1424,9 @@ int reuse; memcpy(pmap, page, bt->mgr->page_size); bt_unpinpool (pool); #endif + // unlock allocation latch and return new page no + + bt_spinreleasewrite(bt->mgr->latchmgr->lock, 0); return new_page; } @@ -1373,8 +1435,16 @@ int reuse; int bt_findslot (BtDb *bt, unsigned char *key, uint len) { uint diff, higher = bt->page->cnt, low = 1, slot; +uint good = 0; + + // make stopper key an infinite fence value - // low is the lowest candidate, higher is already + if( bt_getid (bt->page->right) ) + higher++; + else + good++; + + // low is the next candidate, higher is already // tested as .ge. the given key, loop ends when they meet while( diff = higher - low ) { @@ -1382,16 +1452,18 @@ uint diff, higher = bt->page->cnt, low = 1, slot; if( keycmp (keyptr(bt->page, slot), key, len) < 0 ) low = slot + 1; else - higher = slot; + higher = slot, good++; } - return higher; + // return zero if key is on right link page + + return good ? 
higher : 0; } // find and load page at given level for given key // leave page rd or wr locked as requested -int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, BtLock lock) +int bt_loadpage (BtDb *bt, unsigned char *key, uint len, uint lvl, uint lock) { uid page_no = ROOT_page, prevpage = 0; BtLatchSet *set, *prevset; @@ -1401,12 +1473,12 @@ BtPool *prevpool; // start at root of btree and drill down + bt->set = NULL; + do { // determine lock mode of drill level mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead; - // obtain latch set for this page - bt->set = bt_pinlatch (bt, page_no); bt->page_no = page_no; @@ -1422,7 +1494,7 @@ BtPool *prevpool; if( page_no > ROOT_page ) bt_lockpage(BtLockAccess, bt->set); - // now unlock and unpin our (possibly foster) parent + // release & unpin parent page if( prevpage ) { bt_unlockpage(prevmode, prevset); @@ -1440,184 +1512,113 @@ BtPool *prevpool; // re-read and re-lock root after determining actual level of root - if( page_no == ROOT_page ) - if( bt->page->lvl != drill) { + if( bt->page->lvl != drill) { + if ( bt->page_no != ROOT_page ) + return bt->err = BTERR_struct, 0; + drill = bt->page->lvl; - if( lock == BtLockWrite && drill == lvl ) { + if( lock == BtLockWrite && drill == lvl ) { bt_unlockpage(mode, bt->set); bt_unpinlatch (bt->set); bt_unpinpool (bt->pool); continue; } - } - - prevpage = bt->page_no; - prevpool = bt->pool; - prevset = bt->set; - prevmode = mode; + } // find key on page at this level - // and either descend to requested level - // or return key slot - - slot = bt_findslot (bt, key, len); + // and descend to requested level - // is this slot < foster child area - // on the requested level? 
- - // if so, return actual slot even if dead - - if( slot <= bt->page->cnt - bt->page->foster ) + if( !bt->page->kill && (slot = bt_findslot (bt, key, len)) ) { if( drill == lvl ) return slot; - // find next active slot - - // note: foster children are never dead - // nor fence keys for interiour nodes - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct, 0; // last key shouldn't be deleted + while( slotptr(bt->page, slot)->dead ) + if( slot++ < bt->page->cnt ) + continue; + else { + page_no = bt_getid(bt->page->right); + goto slideright; + } - // is this slot < foster child area - // if so, drill to next level + page_no = bt_getid(slotptr(bt->page, slot)->id); + drill--; + } - if( slot <= bt->page->cnt - bt->page->foster ) - drill--; + // or slide right into next page + // (slide left from deleted page) - // continue right onto foster child - // or down to next level. + else + page_no = bt_getid(bt->page->right); - page_no = bt_getid(slotptr(bt->page, slot)->id); + // continue down / right using overlapping locks + // to protect pages being killed or split. 
+slideright: + prevpage = bt->page_no; + prevpool = bt->pool; + prevset = bt->set; + prevmode = mode; } while( page_no ); - // return error on end of chain + // return error on end of right chain bt->err = BTERR_struct; return 0; // return error } // find and delete key on page by marking delete flag bit -// when leaf page becomes empty, delete it from the btree +// when page becomes empty, delete it -BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len) +BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) { -unsigned char leftkey[256]; +unsigned char lowerkey[256], higherkey[256]; BtLatchSet *rset, *set; BtPool *pool, *rpool; -BtPage rpage, page; uid page_no, right; uint slot, tod; +BtPage rpage; BtKey ptr; - if( slot = bt_loadpage (bt, key, len, 0, BtLockWrite) ) + if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) ptr = keyptr(bt->page, slot); else return bt->err; // if key is found delete it, otherwise ignore request - // note that fence keys of interiour nodes are not deleted. 
if( bt->found = !keycmp (ptr, key, len) ) if( bt->found = slotptr(bt->page, slot)->dead == 0 ) { - slotptr(bt->page,slot)->dead = 1; + slotptr(bt->page,slot)->dead = 1; if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; + bt->page->dirty = 1; + bt->page->act--; } + // return if page is not empty, or it has no right sibling + + right = bt_getid(bt->page->right); page_no = bt->page_no; pool = bt->pool; - page = bt->page; set = bt->set; - // return if page is not empty or not found - - if( page->act || !bt->found ) { + if( !right || bt->page->act ) { bt_unlockpage(BtLockWrite, set); bt_unpinlatch (set); bt_unpinpool (pool); return bt->err; } - // cache copy of fence key of empty node - - ptr = keyptr(page, page->cnt); - memcpy(leftkey, ptr, ptr->len + 1); + // obtain Parent lock over write lock - // release write lock on empty node - // obtain Parent lock - - bt_unlockpage(BtLockWrite, set); bt_lockpage(BtLockParent, set); - // load and lock parent to see - // if delete of empty node is OK - // ie, not a fence key of parent - - while( 1 ) { - if( slot = bt_loadpage (bt, leftkey+1, *leftkey, 1, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - return bt->err; - - // does parent level contain our fence key yet? - // and is it free of foster children? 
- - if( !bt->page->foster ) - if( !keycmp (ptr, leftkey+1, *leftkey) ) - break; - - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); -#ifdef unix - sched_yield(); -#else - SwitchToThread(); -#endif - } - - // find our left fence key - - while( slotptr(bt->page, slot)->dead ) - if( slot++ < bt->page->cnt ) - continue; - else - return bt->err = BTERR_struct; // last key shouldn't be deleted - - // now we have both parent and child - - bt_lockpage(BtLockDelete, set); - bt_lockpage(BtLockWrite, set); + // keep copy of key to delete - // return if page has no right sibling within parent - // or if empty node is no longer empty - - if( page->act || slot == bt->page->cnt ) { - // unpin parent - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - // unpin empty node - bt_unlockpage(BtLockParent, set); - bt_unlockpage(BtLockDelete, set); - bt_unlockpage(BtLockWrite, set); - bt_unpinlatch (set); - bt_unpinpool (pool); - return bt->err; - } + ptr = keyptr(bt->page, bt->page->cnt); + memcpy(lowerkey, ptr, ptr->len + 1); - // lock and map our right page - // note that it cannot be our foster child - // since the our node is empty - - right = bt_getid(page->right); + // lock and map right page if( rpool = bt_pinpool (bt, right) ) rpage = bt_page (bt, rpool, right); @@ -1626,37 +1627,42 @@ BtKey ptr; rset = bt_pinlatch (bt, right); bt_lockpage(BtLockWrite, rset); - bt_lockpage(BtLockDelete, rset); - - // pull contents of right page into empty page - memcpy (page, rpage, bt->mgr->page_size); + // pull contents of next page into current empty page - // delete left parent slot for old empty page - // and redirect right parent slot to it + memcpy (bt->page, rpage, bt->mgr->page_size); - bt->page->act--; - bt->page->dirty = 1; - slotptr(bt->page, slot)->dead = 1; + // keep copy of key to update - while( slot++ < bt->page->cnt ) - if( !slotptr(bt->page, slot)->dead ) - break; + ptr = keyptr(rpage, 
rpage->cnt); + memcpy(higherkey, ptr, ptr->len + 1); - bt_putid(slotptr(bt->page,slot)->id, page_no); + // Mark right page as deleted and point it to left page + // until we can post updates at higher level. - // release parent level lock - // and our empty node lock + bt_putid(rpage->right, page_no); + rpage->kill = 1; + rpage->cnt = 0; + bt_unlockpage(BtLockWrite, rset); bt_unlockpage(BtLockWrite, set); - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); + + // delete old lower key to consolidated node + + if( bt_deletekey (bt, lowerkey + 1, *lowerkey, lvl + 1) ) + return bt->err; + + // redirect higher key directly to consolidated node + + tod = (uint)time(NULL); + + if( bt_insertkey (bt, higherkey+1, *higherkey, lvl + 1, page_no, tod) ) + return bt->err; // add killed right block to free chain // lock latch mgr - bt_spinwritelock(bt->mgr->latchmgr->lock); + bt_spinwritelock(bt->mgr->latchmgr->lock, 0); // store free chain in allocation page second right bt_putid(rpage->right, bt_getid(bt->mgr->latchmgr->alloc[1].right)); @@ -1664,7 +1670,7 @@ BtKey ptr; // unlock latch mgr and right page - bt_spinreleasewrite(bt->mgr->latchmgr->lock); + bt_spinreleasewrite(bt->mgr->latchmgr->lock, 0); bt_unlockpage(BtLockWrite, rset); bt_unlockpage(BtLockDelete, rset); @@ -1674,11 +1680,10 @@ BtKey ptr; // remove ParentModify lock bt_unlockpage(BtLockParent, set); - bt_unlockpage(BtLockDelete, set); bt_unpinlatch (set); bt_unpinpool (pool); return 0; -} +} // find key in leaf level and return row-id @@ -1696,7 +1701,7 @@ uid id; // if key exists, return row-id // otherwise return 0 - if( slot <= bt->page->cnt && !keycmp (ptr, key, len) ) + if( ptr->len == len && !memcmp (ptr->key, key, len) ) id = bt_getid(slotptr(bt->page,slot)->id); else id = 0; @@ -1709,8 +1714,8 @@ uid id; // check page for space available, // clean if necessary and return -// 0 - page needs splitting -// >0 new slot value +// =0 - page needs splitting +// >0 - go 
ahead at returned slot uint bt_cleanpage(BtDb *bt, uint amt, uint slot) { @@ -1737,13 +1742,7 @@ BtKey key; page->dirty = 0; page->act = 0; - // try cleaning up page first - - // always leave fence key in the array - // otherwise, remove deleted key - - // note: foster children are never dead - // nor are fence keys for interiour nodes + // always leave fence key in list while( cnt++ < max ) { if( cnt == slot ) @@ -1752,7 +1751,6 @@ BtKey key; continue; // copy key - key = keyptr(bt->frame, cnt); nxt -= key->len + 1; memcpy ((unsigned char *)page + nxt, key, key->len + 1); @@ -1764,108 +1762,48 @@ BtKey key; slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; slotptr(page, idx)->off = nxt; } - page->min = nxt; page->cnt = idx; - // see if page has enough space now, or does it need splitting? - if( page->min >= (idx+1) * sizeof(BtSlot) + sizeof(*page) + amt + 1 ) return newslot; return 0; } -// add key to current page -// page must already be writelocked - -void bt_addkeytopage (BtDb *bt, uint slot, unsigned char *key, uint len, uid id, uint tod) -{ -BtPage page = bt->page; -uint idx; - - // find next available dead slot and copy key onto page - // note that foster children on the page are never dead - - // look for next hole, but stay back from the fence key - - for( idx = slot; idx < page->cnt; idx++ ) - if( slotptr(page, idx)->dead ) - break; - - if( idx == page->cnt ) - idx++, page->cnt++; - - page->act++; - - // now insert key into array before slot - - while( idx > slot ) - *slotptr(page, idx) = *slotptr(page, idx -1), idx--; - - page->min -= len + 1; - ((unsigned char *)page)[page->min] = len; - memcpy ((unsigned char *)page + page->min +1, key, len ); - - bt_putid(slotptr(page,slot)->id, id); - slotptr(page, slot)->off = page->min; - slotptr(page, slot)->tod = tod; - slotptr(page, slot)->dead = 0; -} - // split the root and raise the height of the btree -// call with current page locked and page no of foster child -// return with current page (root) 
unlocked -BTERR bt_splitroot(BtDb *bt, uid right) +BTERR bt_splitroot(BtDb *bt, unsigned char *newkey, unsigned char *oldkey, uid page_no2) { uint nxt = bt->mgr->page_size; -unsigned char fencekey[256]; BtPage root = bt->page; uid new_page; -BtKey key; - // Obtain an empty page to use, and copy the left page - // contents into it from the root. Strip foster child key. - // (it's the stopper key) - - memset (slotptr(root, root->cnt), 0, sizeof(BtSlot)); - root->dirty = 1; - root->foster--; - root->act--; - root->cnt--; - - // Save left fence key. - - key = keyptr(root, root->cnt); - memcpy (fencekey, key, key->len + 1); - - // copy the lower keys into a new left page + // Obtain an empty page to use, and copy the current + // root contents into it which is the lower half of + // the old root. if( !(new_page = bt_newpage(bt, root)) ) return bt->err; // preserve the page info at the bottom - // and set rest of the root to zero + // and set rest to zero - memset (root+1, 0, bt->mgr->page_size - sizeof(*root)); + memset(root+1, 0, bt->mgr->page_size - sizeof(*root)); - // insert left fence key on empty newroot page + // insert first key on newroot page - nxt -= *fencekey + 1; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); + nxt -= *newkey + 1; + memcpy ((unsigned char *)root + nxt, newkey, *newkey + 1); bt_putid(slotptr(root, 1)->id, new_page); slotptr(root, 1)->off = nxt; - // insert stopper key on newroot page + // insert second key on newroot page // and increase the root height - nxt -= 3; - fencekey[0] = 2; - fencekey[1] = 0xff; - fencekey[2] = 0xff; - memcpy ((unsigned char *)root + nxt, fencekey, *fencekey + 1); - bt_putid(slotptr(root, 2)->id, right); + nxt -= *oldkey + 1; + memcpy ((unsigned char *)root + nxt, oldkey, *oldkey + 1); + bt_putid(slotptr(root, 2)->id, page_no2); slotptr(root, 2)->off = nxt; bt_putid(root->right, 0); @@ -1883,34 +1821,31 @@ BtKey key; } // split already locked full node -// in current page variables -// return 
unlocked and unpinned. +// return unlocked. BTERR bt_splitpage (BtDb *bt) { -uint slot, cnt, idx, max, nxt = bt->mgr->page_size; -unsigned char fencekey[256]; -uid page_no = bt->page_no; -BtLatchSet *set = bt->set; +uint cnt = 0, idx = 0, max, nxt = bt->mgr->page_size; +unsigned char oldkey[256], lowerkey[256]; +uid page_no = bt->page_no, right; +BtLatchSet *nset, *set = bt->set; BtPool *pool = bt->pool; BtPage page = bt->page; -uint tod = time(NULL); uint lvl = page->lvl; -uid new_page, right; +uid new_page; BtKey key; +uint tod; - // initialize frame buffer for right node + // split higher half of keys to bt->frame + // the last key (fence key) might be dead - memset (bt->frame, 0, bt->mgr->page_size); - max = page->cnt - page->foster; tod = (uint)time(NULL); + + memset (bt->frame, 0, bt->mgr->page_size); + max = (int)page->cnt; cnt = max / 2; idx = 0; - // split higher half of keys to bt->frame - // leaving old foster children in the left node, - // and adding a new foster child there. - while( cnt++ < max ) { key = keyptr(page, cnt); nxt -= key->len + 1; @@ -1922,196 +1857,165 @@ BtKey key; slotptr(bt->frame, idx)->off = nxt; } - // transfer right link node to new right node + // remember existing fence key for new page to the right - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } + memcpy (oldkey, key, key->len + 1); bt->frame->bits = bt->mgr->page_bits; bt->frame->min = nxt; bt->frame->cnt = idx; bt->frame->lvl = lvl; - // get new free page and write right frame to it. + // link right node - if( !(new_page = bt_newpage(bt, bt->frame)) ) - return bt->err; + if( page_no > ROOT_page ) { + right = bt_getid (page->right); + bt_putid(bt->frame->right, right); + } - // remember fence key for new right page to add - // as foster child to the left node + // get new free page and write frame to it. 
- key = keyptr(bt->frame, idx); - memcpy (fencekey, key, key->len + 1); + if( !(new_page = bt_newpage(bt, bt->frame)) ) + return bt->err; - // update lower keys and foster children to continue in old page + // update lower keys to continue in old page memcpy (bt->frame, page, bt->mgr->page_size); memset (page+1, 0, bt->mgr->page_size - sizeof(*page)); nxt = bt->mgr->page_size; - page->dirty = 0; page->act = 0; cnt = 0; idx = 0; // assemble page of smaller keys - // to remain in the old page + // (they're all active keys) while( cnt++ < max / 2 ) { key = keyptr(bt->frame, cnt); nxt -= key->len + 1; memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); - if( !(slotptr(page, idx)->dead = slotptr(bt->frame, cnt)->dead) ) - page->act++; - slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; - slotptr(page, idx)->off = nxt; - } - - // insert new foster child for right page in queue - // before any of the current foster children - - nxt -= *fencekey + 1; - memcpy ((unsigned char *)page + nxt, fencekey, *fencekey + 1); - - bt_putid (slotptr(page,++idx)->id, new_page); - slotptr(page, idx)->tod = tod; - slotptr(page, idx)->off = nxt; - page->foster++; - page->act++; - - // continue with old foster child keys - // note that none will be dead - - cnt = bt->frame->cnt - bt->frame->foster; - - while( cnt++ < bt->frame->cnt ) { - key = keyptr(bt->frame, cnt); - nxt -= key->len + 1; - memcpy ((unsigned char *)page + nxt, key, key->len + 1); - memcpy (slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); + memcpy(slotptr(page,++idx)->id, slotptr(bt->frame,cnt)->id, BtId); slotptr(page, idx)->tod = slotptr(bt->frame, cnt)->tod; slotptr(page, idx)->off = nxt; page->act++; } + // remember fence key for old page + + memcpy(lowerkey, key, key->len + 1); + bt_putid(page->right, new_page); page->min = nxt; page->cnt = idx; - // link new right page - - bt_putid (page->right, new_page); - // if current page 
is the root page, split it if( page_no == ROOT_page ) - return bt_splitroot (bt, new_page); + return bt_splitroot (bt, lowerkey, oldkey, new_page); - // release wr lock on our page + // release wr lock on left page bt_unlockpage (BtLockWrite, set); - // obtain ParentModification lock for current page - // to fix new fence key and oldest foster child on page + // obtain Parent/Write locks + // for left and right node pages - bt_lockpage (BtLockParent, set); + nset = bt_pinlatch (bt, new_page); - // get our new fence key to insert in parent node - - bt_lockpage (BtLockRead, set); - - key = keyptr(page, page->cnt-1); - memcpy (fencekey, key, key->len+1); + bt_lockpage (BtLockParent, nset); + bt_lockpage (BtLockParent, set); - bt_unlockpage (BtLockRead, set); + // insert new fence for reformulated left block - if( bt_insertkey (bt, fencekey + 1, *fencekey, page_no, tod, lvl + 1) ) + if( bt_insertkey (bt, lowerkey+1, *lowerkey, lvl + 1, page_no, tod) ) return bt->err; - // lock our page for writing - - bt_lockpage (BtLockRead, set); - - // switch old parent key from us to our oldest foster child + // fix old fence for newly allocated right block page - key = keyptr(page, page->cnt); - memcpy (fencekey, key, key->len+1); - - new_page = bt_getid (slotptr(page, page->cnt)->id); - bt_unlockpage (BtLockRead, set); - - if( bt_insertkey (bt, fencekey + 1, *fencekey, new_page, tod, lvl + 1) ) + if( bt_insertkey (bt, oldkey+1, *oldkey, lvl + 1, new_page, tod) ) return bt->err; - // now that it has its own parent pointer, - // remove oldest foster child from our page - - bt_lockpage (BtLockWrite, set); - memset (slotptr(page, page->cnt), 0, sizeof(BtSlot)); - page->dirty = 1; - page->foster--; - page->cnt--; - page->act--; + // release Parent locks - // unlock and unpin - - bt_unlockpage (BtLockWrite, set); + bt_unlockpage (BtLockParent, nset); bt_unlockpage (BtLockParent, set); + bt_unpinlatch (nset); bt_unpinlatch (set); bt_unpinpool (pool); return 0; } -// Insert new key into 
the btree at leaf level. +// Insert new key into the btree at requested level. +// Level zero pages are leaf pages. Page is unlocked at exit. -BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uid id, uint tod, uint lvl) +BTERR bt_insertkey (BtDb *bt, unsigned char *key, uint len, uint lvl, uid id, uint tod) { uint slot, idx; BtPage page; BtKey ptr; - while( 1 ) { - if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) - ptr = keyptr(bt->page, slot); - else - { - if ( !bt->err ) - bt->err = BTERR_ovflw; - return bt->err; - } + while( 1 ) { + if( slot = bt_loadpage (bt, key, len, lvl, BtLockWrite) ) + ptr = keyptr(bt->page, slot); + else + { + if ( !bt->err ) + bt->err = BTERR_ovflw; + return bt->err; + } - // if key already exists, update id and return + // if key already exists, update id and return - page = bt->page; + page = bt->page; - if( !keycmp (ptr, key, len) ) { - if( slotptr(page, slot)->dead ) - page->act++; - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - bt_unlockpage(BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return bt->err; - } + if( !keycmp (ptr, key, len) ) { + slotptr(page, slot)->dead = 0; + slotptr(page, slot)->tod = tod; + bt_putid(slotptr(page,slot)->id, id); + bt_unlockpage(BtLockWrite, bt->set); + bt_unpinlatch(bt->set); + bt_unpinpool (bt->pool); + return bt->err; + } - // check if page has enough space + // check if page has enough space - if( slot = bt_cleanpage (bt, len, slot) ) - break; + if( slot = bt_cleanpage (bt, len, slot) ) + break; - if( bt_splitpage (bt) ) - return bt->err; - } + if( bt_splitpage (bt) ) + return bt->err; + } - bt_addkeytopage (bt, slot, key, len, id, tod); + // calculate next available slot and copy key into page - bt_unlockpage (BtLockWrite, bt->set); - bt_unpinlatch (bt->set); - bt_unpinpool (bt->pool); - return 0; + page->min -= len + 1; // reset lowest used offset + ((unsigned char *)page)[page->min] 
= len; + memcpy ((unsigned char *)page + page->min +1, key, len ); + + for( idx = slot; idx < page->cnt; idx++ ) + if( slotptr(page, idx)->dead ) + break; + + // now insert key into array before slot + // preserving the fence slot + + if( idx == page->cnt ) + idx++, page->cnt++; + + page->act++; + + while( idx > slot ) + *slotptr(page, idx) = *slotptr(page, idx -1), idx--; + + bt_putid(slotptr(page,slot)->id, id); + slotptr(page, slot)->off = page->min; + slotptr(page, slot)->tod = tod; + slotptr(page, slot)->dead = 0; + + bt_unlockpage (BtLockWrite, bt->set); + bt_unpinlatch (bt->set); + bt_unpinpool (bt->pool); + return 0; } // cache page of keys into cursor and return starting slot for given key @@ -2123,9 +2027,7 @@ uint slot; // cache page for retrieval if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) memcpy (bt->cursor, bt->page, bt->mgr->page_size); - bt->cursor_page = bt->page_no; - bt_unlockpage(BtLockRead, bt->set); bt_unpinlatch (bt->set); bt_unpinpool (bt->pool); @@ -2137,17 +2039,16 @@ uint slot; uint bt_nextkey (BtDb *bt, uint slot) { -BtLatchSet *set; BtPool *pool; BtPage page; uid right; do { right = bt_getid(bt->cursor->right); - while( slot++ < bt->cursor->cnt - bt->cursor->foster ) + while( slot++ < bt->cursor->cnt ) if( slotptr(bt->cursor,slot)->dead ) continue; - else if( right || (slot < bt->cursor->cnt - bt->cursor->foster) ) + else if( right || (slot < bt->cursor->cnt)) return slot; else break; @@ -2156,18 +2057,19 @@ uid right; break; bt->cursor_page = right; + if( pool = bt_pinpool (bt, right) ) page = bt_page (bt, pool, right); else return 0; - set = bt_pinlatch (bt, right); - bt_lockpage(BtLockRead, set); + bt->set = bt_pinlatch (bt, right); + bt_lockpage(BtLockRead, bt->set); memcpy (bt->cursor, page, bt->mgr->page_size); - bt_unlockpage(BtLockRead, set); - bt_unpinlatch (set); + bt_unlockpage(BtLockRead, bt->set); + bt_unpinlatch (bt->set); bt_unpinpool (pool); slot = 0; } while( 1 ); @@ -2190,7 +2092,6 @@ uint bt_tod(BtDb *bt, 
uint slot) return slotptr(bt->cursor,slot)->tod; } - #ifdef STANDALONE void bt_latchaudit (BtDb *bt) @@ -2217,11 +2118,11 @@ uid page_no; } for( hashidx = 0; hashidx < bt->mgr->latchmgr->latchhash; hashidx++ ) { - if( *(ushort *)bt->mgr->latchmgr->table[hashidx].latch ) + if( *(uint *)bt->mgr->latchmgr->table[hashidx].latch ) fprintf(stderr, "latchmgr locked\n"); if( idx = bt->mgr->latchmgr->table[hashidx].slot ) do { set = bt->mgr->latchsets + idx; - if( *(ushort *)set->readwr || *(ushort *)set->access || *(ushort *)set->parent ) + if( *(uint *)set->readwr || *(ushort *)set->access || *(ushort *)set->parent ) fprintf(stderr, "latchset %d locked\n", idx); if( set->hash != hashidx ) fprintf(stderr, "latchset %d wrong hashidx\n", idx); @@ -2262,7 +2163,6 @@ uid next, page_no = LEAF_page; // start on first page of leaves unsigned char key[256]; ThreadArg *args = arg; int ch, len = 0, slot; -BtLatchSet *set; time_t tod[1]; BtPool *pool; BtPage page; @@ -2280,6 +2180,7 @@ FILE *in; bt_latchaudit (bt); fprintf(stderr, "finished latch mgr audit\n"); break; + case 'w': fprintf(stderr, "started indexing for %s\n", args->infile); if( in = fopen (args->infile, "rb") ) @@ -2294,7 +2195,7 @@ FILE *in; else if( args->num ) sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - if( bt_insertkey (bt, key, len, line, *tod, 0) ) + if( bt_insertkey (bt, key, len, 0, line, *tod) ) fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); len = 0; } @@ -2316,7 +2217,7 @@ FILE *in; else if( args->num ) sprintf((char *)key+len, "%.9d", line + args->idx * args->num), len += 9; - if( bt_deletekey (bt, key, len) ) + if( bt_deletekey (bt, key, len, 0) ) fprintf(stderr, "Error %d Line: %d\n", bt->err, line), exit(0); len = 0; } @@ -2371,17 +2272,17 @@ FILE *in; fprintf(stderr, "started reading\n"); do { - if( pool = bt_pinpool (bt, page_no) ) - page = bt_page (bt, pool, page_no); + if( bt->pool = bt_pinpool (bt, page_no) ) + page = bt_page (bt, bt->pool, page_no); 
else break; - set = bt_pinlatch (bt, page_no); - bt_lockpage (BtLockRead, set); + bt->set = bt_pinlatch (bt, page_no); + bt_lockpage (BtLockRead, bt->set); cnt += page->act; next = bt_getid (page->right); - bt_unlockpage (BtLockRead, set); - bt_unpinlatch (set); - bt_unpinpool (pool); + bt_unlockpage (BtLockRead, bt->set); + bt_unpinlatch (bt->set); + bt_unpinpool (bt->pool); } while( page_no = next ); cnt--; // remove stopper key -- 2.40.0