-// btree version 2u
+// btree version 2u sched_yield locks
// with combined latch & pool manager
-// 26 FEB 2014
+// and phase-fair reader writer lock
+// 12 MAR 2014
// author: karl malbrain, malbrain@cal.berkeley.edu
#define BT_minpage (1 << BT_minbits) // minimum page size
#define BT_maxpage (1 << BT_maxbits) // maximum page size
+// BTree page number constants
+#define ALLOC_page 0
+#define ROOT_page 1
+#define LEAF_page 2
+#define LATCH_page 3
+
+// Number of levels to create in a new BTree (MIN_lvl) and the maximum btree level supported (MAX_lvl)
+
+#define MIN_lvl 2
+#define MAX_lvl 15
+
/*
There are five lock types for each node in three independent sets:
1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete.
// definition for latch implementation
-// exclusive is set for write access
-// share is count of read accessors
-// grant write lock when share == 0
-
+// one-word spin latch replacing the old mutex/flag pair:
+// bit 0 = exclusive (XCL), bit 1 = writer pending (PEND),
+// bits 2..15 = reader share count, in SHARE-sized units
volatile typedef struct {
- unsigned char mutex[1];
- unsigned char exclusive:1;
- unsigned char pending:1;
- ushort share;
+ ushort lock[1];
} BtSpinLatch;
+#define XCL 1 // exclusive writer bit
+#define PEND 2 // writer pending bit
+#define BOTH 3 // XCL | PEND
+#define SHARE 4 // reader increment (count kept in bits 2..15)
+
+// phase-fair reader/writer lock: writers queue FIFO on ticket/serving,
+// readers count themselves in via rin and out via rout; the bottom two
+// bits of rin carry the current writer's presence and phase
+volatile typedef struct {
+ ushort rin[1]; // readers in count
+ ushort rout[1]; // readers out count
+ ushort serving[1]; // writers out count
+ ushort ticket[1]; // writers in count
+} RWLock;
+
+// define bits at bottom of rin
+
+#define PHID 0x1 // writer phase (0/1)
+#define PRES 0x2 // writer present
+#define MASK 0x3 // both write bits
+#define RINC 0x4 // reader increment
+
// Define the length of the page and key pointers
#define BtId 6
+// latch & buffer-pool manager bookkeeping, shared by all processes
typedef struct {
struct BtPage_ alloc[2]; // next & free page_nos in right ptr
BtSpinLatch lock[1]; // allocation area lite latch
- uint latchdeployed; // highest number of latch entries deployed
- uint nlatchpage; // number of latch pages at BT_latch
- uint latchtotal; // number of page latch entries
- uint latchhash; // number of latch hash table slots
- uint latchvictim; // next latch hash entry to examine
+ volatile uint latchdeployed;// highest number of latch entries deployed
+ volatile uint nlatchpage; // number of latch pages at BT_latch
+ volatile uint latchtotal; // number of page latch entries
+ volatile uint latchhash; // number of latch hash table slots
+ volatile uint latchvictim; // next latch hash entry to examine
+ volatile uint safelevel; // safe page level in cache
+ volatile uint cache[MAX_lvl];// cache census counts by btree level
} BtLatchMgr;
// latch hash table entries
// latch manager table structure
+// buffer-pool slot: the clock, level and pin count now share one
+// ushort pin word (see mask defines below) instead of separate fields
typedef struct {
- volatile uid page_no; // latch set page number on disk
- BtSpinLatch readwr[1]; // read/write page lock
- BtSpinLatch access[1]; // Access Intent/Page delete
- BtSpinLatch parent[1]; // Posting of fence key in parent
- volatile ushort clock; // accessed since last clock pass
- volatile uint next; // next entry in hash table chain
- volatile uint prev; // prev entry in hash table chain
- volatile uint pin; // number of outstanding pins
+ volatile uid page_no; // latch set page number on disk
+ RWLock readwr[1]; // read/write page lock
+ RWLock access[1]; // Access Intent/Page delete
+ RWLock parent[1]; // Posting of fence key in parent
+ volatile ushort pin; // number of pins/level/clock bits
+ volatile uint next; // next entry in hash table chain
+ volatile uint prev; // prev entry in hash table chain
} BtLatchSet;
+// layout of the pin word above
+#define CLOCK_mask 0xe000 // bits 13-15: clock reference count
+#define CLOCK_unit 0x2000 // one clock tick
+#define PIN_mask 0x07ff // bits 0-10: outstanding pin count
+#define LVL_mask 0x1800 // bits 11-12: btree level of cached page
+#define LVL_shift 11 // shift to extract level from pin
+
// The object structure for Btree access
typedef struct _BtDb {
extern uint bt_tod (BtDb *bt, uint slot);
#endif
-// BTree page number constants
-#define ALLOC_page 0
-#define ROOT_page 1
-#define LEAF_page 2
-#define LATCH_page 3
-
-// Number of levels to create in a new BTree
-
-#define MIN_lvl 2
-
// The page is allocated from low and hi ends.
// The key offsets and row-id's are allocated
// from the bottom, while the text of the key
return bt->err = err;
}
-// Latch Manager
+// Phase-Fair reader/writer lock implementation
+
+// obtain write access: take a FIFO ticket, wait to be served,
+// then wait for all in-progress readers to drain
+void WriteLock (RWLock *lock)
+{
+ushort w, r, tix;
+
+ // atomically take the next writer ticket
+#ifdef unix
+ tix = __sync_fetch_and_add (lock->ticket, 1);
+#else
+ tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
+#endif
+ // wait for our ticket to come up
+
+ while( tix != lock->serving[0] )
+#ifdef unix
+ sched_yield();
+#else
+ SwitchToThread ();
+#endif
+
+ // announce writer presence and phase in rin's low bits;
+ // r snapshots the reader-in count at that instant
+ w = PRES | (tix & PHID);
+#ifdef unix
+ r = __sync_fetch_and_add (lock->rin, w);
+#else
+ r = _InterlockedExchangeAdd16 (lock->rin, w);
+#endif
+ // wait until the reader-out count catches the snapshot,
+ // i.e. every reader that preceded us has released
+ while( r != *lock->rout )
+#ifdef unix
+ sched_yield();
+#else
+ SwitchToThread();
+#endif
+}
+
+// release write access: clear writer presence/phase bits in rin,
+// then advance serving to admit the next ticketed writer
+void WriteRelease (RWLock *lock)
+{
+#ifdef unix
+ __sync_fetch_and_and (lock->rin, ~MASK);
+#else
+ _InterlockedAnd16 (lock->rin, ~MASK);
+#endif
+ // we are the sole writer being served, so a plain increment suffices
+ lock->serving[0]++;
+}
+
+// obtain read access: register by adding one reader increment to rin;
+// if a writer is present (low bits of rin non-zero) spin until those
+// bits change, meaning that writer released or the write phase advanced
+void ReadLock (RWLock *lock)
+{
+ushort w;
+#ifdef unix
+ w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
+#else
+ w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
+#endif
+ if( w )
+ while( w == (*lock->rin & MASK) )
+#ifdef unix
+ sched_yield ();
+#else
+ SwitchToThread ();
+#endif
+}
+
+// release read access: bump the reader-out count so a waiting
+// writer can observe that this reader has drained
+void ReadRelease (RWLock *lock)
+{
+#ifdef unix
+ __sync_fetch_and_add (lock->rout, RINC);
+#else
+ _InterlockedExchangeAdd16 (lock->rout, RINC);
+#endif
+}
+
+// Spin Latch Manager
// wait until write lock mode is clear
// and add 1 to the share count
ushort prev;
do {
- // obtain latch mutex
#ifdef unix
- if( __sync_lock_test_and_set(latch->mutex, 1) )
- continue;
+ prev = __sync_fetch_and_add (latch->lock, SHARE);
#else
- if( _InterlockedExchange8(latch->mutex, 1) )
- continue;
+ prev = _InterlockedExchangeAdd16(latch->lock, SHARE);
#endif
// see if exclusive request is granted or pending
- if( prev = !(latch->exclusive | latch->pending) )
- latch->share++;
-
+ if( !(prev & BOTH) )
+ return;
#ifdef unix
- *latch->mutex = 0;
+ prev = __sync_fetch_and_add (latch->lock, -SHARE);
#else
- _InterlockedExchange8(latch->mutex, 0);
+ prev = _InterlockedExchangeAdd16(latch->lock, -SHARE);
#endif
-
- if( prev )
- return;
-
#ifdef unix
} while( sched_yield(), 1 );
#else
void bt_spinwritelock(BtSpinLatch *latch)
{
-uint prev;
+ushort prev;
do {
#ifdef unix
- if( __sync_lock_test_and_set(latch->mutex, 1) )
- continue;
+ prev = __sync_fetch_and_or(latch->lock, PEND | XCL);
#else
- if( _InterlockedExchange8(latch->mutex, 1) )
- continue;
+ prev = _InterlockedOr16(latch->lock, PEND | XCL);
#endif
- if( prev = !(latch->share | latch->exclusive) )
- latch->exclusive = 1, latch->pending = 0;
- else
- latch->pending = 1;
+ if( !(prev & XCL) )
+ if( !(prev & ~BOTH) )
+ return;
+ else
#ifdef unix
- *latch->mutex = 0;
+ __sync_fetch_and_and (latch->lock, ~XCL);
#else
- _InterlockedExchange8(latch->mutex, 0);
+ _InterlockedAnd16(latch->lock, ~XCL);
#endif
- if( prev )
- return;
#ifdef unix
} while( sched_yield(), 1 );
#else
+// try to obtain exclusive access without blocking
+// returns 1 on success, 0 if the latch is busy
int bt_spinwritetry(BtSpinLatch *latch)
{
-uint prev;
+ushort prev;
-#ifdef unix
- if( __sync_lock_test_and_set(latch->mutex, 1) )
- return 0;
+#ifdef unix
+ prev = __sync_fetch_and_or(latch->lock, XCL);
#else
- if( _InterlockedExchange8(latch->mutex, 1) )
- return 0;
+ prev = _InterlockedOr16(latch->lock, XCL);
#endif
// take write access if all bits are clear
- if( prev = !(latch->exclusive | latch->share) )
- latch->exclusive = 1;
-
+ // we own XCL only if it was previously clear; succeed when no
+ // readers hold shares either, else back out the XCL bit we set
+ if( !(prev & XCL) )
+ if( !(prev & ~BOTH) )
+ return 1;
+ else
#ifdef unix
- *latch->mutex = 0;
+ __sync_fetch_and_and (latch->lock, ~XCL);
#else
- _InterlockedExchange8(latch->mutex, 0);
+ _InterlockedAnd16(latch->lock, ~XCL);
#endif
- return prev;
+ return 0;
}
// clear write mode
void bt_spinreleasewrite(BtSpinLatch *latch)
{
+ // a single atomic and-out of XCL|PEND replaces the old mutex dance
#ifdef unix
- while( __sync_lock_test_and_set(latch->mutex, 1) )
- sched_yield();
+ __sync_fetch_and_and(latch->lock, ~BOTH);
#else
- while( _InterlockedExchange8(latch->mutex, 1) )
- SwitchToThread();
-#endif
- latch->exclusive = 0;
-#ifdef unix
- *latch->mutex = 0;
-#else
- _InterlockedExchange8(latch->mutex, 0);
+ _InterlockedAnd16(latch->lock, ~BOTH);
#endif
}
void bt_spinreleaseread(BtSpinLatch *latch)
{
+ // atomically subtract one SHARE unit from the reader count field
#ifdef unix
- while( __sync_lock_test_and_set(latch->mutex, 1) )
- sched_yield();
-#else
- while( _InterlockedExchange8(latch->mutex, 1) )
- SwitchToThread();
-#endif
- latch->share--;
-#ifdef unix
- *latch->mutex = 0;
+ __sync_fetch_and_add(latch->lock, -SHARE);
#else
- _InterlockedExchange8(latch->mutex, 0);
+ _InterlockedExchangeAdd16(latch->lock, -SHARE);
#endif
}
{
BtPage page = (BtPage)((uid)slot * bt->page_size + bt->pagepool);
BtLatchSet *latch = bt->latchsets + slot;
+int lvl;
if( latch->next = bt->table[hashidx].slot )
bt->latchsets[latch->next].prev = slot;
bt->table[hashidx].slot = slot;
latch->page_no = page_no;
- latch->clock = 1;
latch->prev = 0;
latch->pin = 1;
- return bt_readpage (bt, page, page_no);
+ if( bt_readpage (bt, page, page_no) )
+ return bt->err;
+
+ lvl = page->lvl << LVL_shift;
+ if( lvl > LVL_mask )
+ lvl = LVL_mask;
+ latch->pin |= lvl; // store lvl
+ latch->pin |= lvl << 3; // initialize clock
+
+#ifdef unix
+ __sync_fetch_and_add (&bt->latchmgr->cache[page->lvl], 1);
+#else
+ _InterlockedExchangeAdd(&bt->latchmgr->cache[page->lvl], 1);
+#endif
+ return bt->err = 0;
}
// release latch pin
#ifdef unix
__sync_fetch_and_add(&latch->pin, -1);
#else
- _InterlockedDecrement (&latch->pin);
+ _InterlockedDecrement16 (&latch->pin);
#endif
}
uint hashidx = page_no % bt->latchmgr->latchhash;
BtLatchSet *latch;
uint slot, idx;
+uint lvl, cnt;
off64_t off;
uint amt[1];
BtPage page;
} while( slot = latch->next );
// found our entry
+ // increment clock
if( slot ) {
latch = bt->latchsets + slot;
- latch->clock = 1;
- latch->pin++;
+ lvl = (latch->pin & LVL_mask) >> LVL_shift;
+ lvl *= CLOCK_unit * 2;
+ lvl |= CLOCK_unit;
+#ifdef unix
+ __sync_fetch_and_add(&latch->pin, 1);
+ __sync_fetch_and_or(&latch->pin, lvl);
+#else
+ _InterlockedIncrement16 (&latch->pin);
+ _InterlockedOr16 (&latch->pin, lvl);
+#endif
bt_spinreleasewrite(bt->table[hashidx].latch);
return latch;
}
// or has outstanding pins
slot %= bt->latchmgr->latchtotal;
- latch = bt->latchsets + slot;
+ // on slot wraparound, check census
+ // count and increment safe level
+
+ cnt = bt->latchmgr->cache[bt->latchmgr->safelevel];
+
+ if( !slot ) {
+ if( cnt < bt->latchmgr->latchtotal / 10 )
+#ifdef unix
+ __sync_fetch_and_add(&bt->latchmgr->safelevel, 1);
+#else
+ _InterlockedIncrement (&bt->latchmgr->safelevel);
+#endif
+ continue;
+ }
+
+ latch = bt->latchsets + slot;
idx = latch->page_no % bt->latchmgr->latchhash;
+ lvl = (latch->pin & LVL_mask) >> LVL_shift;
- if( !slot )
+ // see if we are evicting this level yet
+ // or if we are on same chain as hashidx
+
+ if( idx == hashidx || lvl > bt->latchmgr->safelevel )
continue;
if( !bt_spinwritetry (bt->table[idx].latch) )
continue;
- if( latch->clock ) {
- latch->clock = 0;
- bt_spinreleasewrite (bt->table[idx].latch);
- continue;
+ if( latch->pin & ~LVL_mask ) {
+ if( latch->pin & CLOCK_mask )
+#ifdef unix
+ __sync_fetch_and_add(&latch->pin, -CLOCK_unit);
+#else
+ _InterlockedExchangeAdd16 (&latch->pin, -CLOCK_unit);
+#endif
+ bt_spinreleasewrite (bt->table[idx].latch);
+ continue;
}
// update permanent page area in btree
page = (BtPage)((uid)slot * bt->page_size + bt->pagepool);
#ifdef unix
posix_fadvise (bt->idx, page_no << bt->page_bits, bt->page_size, POSIX_FADV_WILLNEED);
+ __sync_fetch_and_add (&bt->latchmgr->cache[page->lvl], -1);
+#else
+ _InterlockedExchangeAdd(&bt->latchmgr->cache[page->lvl], -1);
#endif
if( page->dirty )
if( bt_writepage (bt, page, latch->page_no) )
{
switch( mode ) {
case BtLockRead:
- bt_spinreadlock (latch->readwr);
+ ReadLock (latch->readwr);
break;
case BtLockWrite:
- bt_spinwritelock (latch->readwr);
+ WriteLock (latch->readwr);
break;
case BtLockAccess:
- bt_spinreadlock (latch->access);
+ ReadLock (latch->access);
break;
case BtLockDelete:
- bt_spinwritelock (latch->access);
+ WriteLock (latch->access);
break;
case BtLockParent:
- bt_spinwritelock (latch->parent);
+ WriteLock (latch->parent);
break;
}
}
{
switch( mode ) {
case BtLockRead:
- bt_spinreleaseread (latch->readwr);
+ ReadRelease (latch->readwr);
break;
case BtLockWrite:
- bt_spinreleasewrite (latch->readwr);
+ WriteRelease (latch->readwr);
break;
case BtLockAccess:
- bt_spinreleaseread (latch->access);
+ ReadRelease (latch->access);
break;
case BtLockDelete:
- bt_spinreleasewrite (latch->access);
+ WriteRelease (latch->access);
break;
case BtLockParent:
- bt_spinreleasewrite (latch->parent);
+ WriteRelease (latch->parent);
break;
}
}
void bt_update (BtDb *bt, BtPage page)
{
#ifdef unix
-// msync (page, bt->page_size, MS_ASYNC);
+ msync (page, bt->page_size, MS_ASYNC);
#else
// FlushViewOfFile (page, bt->page_size);
#endif
uint amt[1];
BtKey ptr;
- memset (blks, 0, sizeof(blks));
-
+#ifdef unix
+ posix_fadvise( bt->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
if( *(ushort *)(bt->latchmgr->lock) )
fprintf(stderr, "Alloc page locked\n");
*(ushort *)(bt->latchmgr->lock) = 0;
+ memset (blks, 0, sizeof(blks));
+
for( idx = 1; idx <= bt->latchmgr->latchdeployed; idx++ ) {
latch = bt->latchsets + idx;
if( *(ushort *)latch->readwr )
fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
*(ushort *)latch->parent = 0;
- if( latch->pin ) {
+ if( latch->pin & PIN_mask ) {
fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
latch->pin = 0;
}
page = (BtPage)((uid)idx * bt->page_size + bt->pagepool);
+ blks[page->lvl]++;
if( page->dirty )
if( bt_writepage (bt, page, latch->page_no) )
fprintf(stderr, "Page %.8x Write Error\n", latch->page_no);
}
+ for( idx = 0; blks[idx]; idx++ )
+ fprintf(stderr, "cache: %d lvl %d blocks\n", blks[idx], idx);
+
for( hashidx = 0; hashidx < bt->latchmgr->latchhash; hashidx++ ) {
if( *(ushort *)(bt->table[hashidx].latch) )
fprintf(stderr, "hash entry %d locked\n", hashidx);
*(ushort *)(bt->table[hashidx].latch) = 0;
-
- if( idx = bt->table[hashidx].slot ) do {
- latch = bt->latchsets + idx;
- if( latch->pin )
- fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
- } while( idx = latch->next );
}
+ memset (blks, 0, sizeof(blks));
+
next = bt->latchmgr->nlatchpage + LATCH_page;
page_no = LEAF_page;
next = page_no + 1;
page_no = next;
}
+
for( idx = 0; blks[idx]; idx++ )
- fprintf(stderr, "%d lvl %d blocks\n", blks[idx], idx);
+ fprintf(stderr, "btree: %d lvl %d blocks\n", blks[idx], idx);
+
return cnt - 1;
}