]> pd.if.org Git - btree/blobdiff - btree2u.c
Fix small bug in main when there is less t han one input file
[btree] / btree2u.c
index 2d2a3d0a716149d87a5ab7378a5b6835d8315e36..aaffee30d65fdafb176738bf864ab368dd3cda8c 100644 (file)
--- a/btree2u.c
+++ b/btree2u.c
@@ -1,6 +1,7 @@
-// btree version 2u
+// btree version 2u  sched_yield locks
 //     with combined latch & pool manager
-// 26 FEB 2014
+//     and phase-fair reader writer lock
+// 12 MAR 2014
 
 // author: karl malbrain, malbrain@cal.berkeley.edu
 
@@ -66,6 +67,17 @@ typedef unsigned int         uint;
 #define BT_minpage             (1 << BT_minbits)       // minimum page size
 #define BT_maxpage             (1 << BT_maxbits)       // maximum page size
 
+//  BTree page number constants
+#define ALLOC_page             0
+#define ROOT_page              1
+#define LEAF_page              2
+#define LATCH_page             3
+
+//     Number of levels to create in a new BTree
+
+#define MIN_lvl                        2
+#define MAX_lvl                        15
+
 /*
 There are five lock types for each node in three independent sets: 
 1. (set 1) AccessIntent: Sharable. Going to Read the node. Incompatible with NodeDelete. 
@@ -85,17 +97,29 @@ typedef enum{
 
 //     definition for latch implementation
 
-// exclusive is set for write access
-// share is count of read accessors
-// grant write lock when share == 0
-
 volatile typedef struct {
-       unsigned char mutex[1];
-       unsigned char exclusive:1;
-       unsigned char pending:1;
-       ushort share;
+       ushort lock[1];
 } BtSpinLatch;
 
+#define XCL 1
+#define PEND 2
+#define BOTH 3
+#define SHARE 4
+
+volatile typedef struct {
+       ushort rin[1];          // readers in count
+       ushort rout[1];         // readers out count
+       ushort serving[1];      // writers out count
+       ushort ticket[1];       // writers in count
+} RWLock;
+
+//     define bits at bottom of rin
+
+#define PHID 0x1       // writer phase (0/1)
+#define PRES 0x2       // writer present
+#define MASK 0x3       // both write bits
+#define RINC 0x4       // reader increment
+
 //     Define the length of the page and key pointers
 
 #define BtId 6
@@ -149,11 +173,13 @@ typedef struct BtPage_ {
 typedef struct {
        struct BtPage_ alloc[2];        // next & free page_nos in right ptr
        BtSpinLatch lock[1];            // allocation area lite latch
-       uint latchdeployed;                     // highest number of latch entries deployed
-       uint nlatchpage;                        // number of latch pages at BT_latch
-       uint latchtotal;                        // number of page latch entries
-       uint latchhash;                         // number of latch hash table slots
-       uint latchvictim;                       // next latch hash entry to examine
+       volatile uint latchdeployed;// highest number of latch entries deployed
+       volatile uint nlatchpage;       // number of latch pages at BT_latch
+       volatile uint latchtotal;       // number of page latch entries
+       volatile uint latchhash;        // number of latch hash table slots
+       volatile uint latchvictim;      // next latch hash entry to examine
+       volatile uint safelevel;        // safe page level in cache
+       volatile uint cache[MAX_lvl];// cache census counts by btree level
 } BtLatchMgr;
 
 //  latch hash table entries
@@ -166,16 +192,21 @@ typedef struct {
 //     latch manager table structure
 
 typedef struct {
-       volatile uid page_no;           // latch set page number on disk
-       BtSpinLatch readwr[1];          // read/write page lock
-       BtSpinLatch access[1];          // Access Intent/Page delete
-       BtSpinLatch parent[1];          // Posting of fence key in parent
-       volatile ushort clock;          // accessed since last clock pass
-       volatile uint next;                     // next entry in hash table chain
-       volatile uint prev;                     // prev entry in hash table chain
-       volatile uint pin;                      // number of outstanding pins
+       volatile uid page_no;   // latch set page number on disk
+       RWLock readwr[1];               // read/write page lock
+       RWLock access[1];               // Access Intent/Page delete
+       RWLock parent[1];               // Posting of fence key in parent
+       volatile ushort pin;    // number of pins/level/clock bits
+       volatile uint next;             // next entry in hash table chain
+       volatile uint prev;             // prev entry in hash table chain
 } BtLatchSet;
 
+#define CLOCK_mask 0xe000
+#define CLOCK_unit 0x2000
+#define PIN_mask 0x07ff
+#define LVL_mask 0x1800
+#define LVL_shift 11
+
 //     The object structure for Btree access
 
 typedef struct _BtDb {
@@ -237,16 +268,6 @@ extern uid bt_uid (BtDb *bt, uint slot);
 extern uint bt_tod (BtDb *bt, uint slot);
 #endif
 
-//  BTree page number constants
-#define ALLOC_page             0
-#define ROOT_page              1
-#define LEAF_page              2
-#define LATCH_page             3
-
-//     Number of levels to create in a new BTree
-
-#define MIN_lvl                        2
-
 //  The page is allocated from low and hi ends.
 //  The key offsets and row-id's are allocated
 //  from the bottom, while the text of the key
@@ -330,7 +351,77 @@ BtKey ptr;
        return bt->err = err;
 }
 
-//     Latch Manager
+//     Phase-Fair reader/writer lock implementation
+
+void WriteLock (RWLock *lock)
+{
+ushort w, r, tix;
+
+#ifdef unix
+       tix = __sync_fetch_and_add (lock->ticket, 1);
+#else
+       tix = _InterlockedExchangeAdd16 (lock->ticket, 1);
+#endif
+       // wait for our ticket to come up
+
+       while( tix != lock->serving[0] )
+#ifdef unix
+               sched_yield();
+#else
+               SwitchToThread ();
+#endif
+
+       w = PRES | (tix & PHID);
+#ifdef  unix
+       r = __sync_fetch_and_add (lock->rin, w);
+#else
+       r = _InterlockedExchangeAdd16 (lock->rin, w);
+#endif
+       while( r != *lock->rout )
+#ifdef unix
+               sched_yield();
+#else
+               SwitchToThread();
+#endif
+}
+
+void WriteRelease (RWLock *lock)
+{
+#ifdef unix
+       __sync_fetch_and_and (lock->rin, ~MASK);
+#else
+       _InterlockedAnd16 (lock->rin, ~MASK);
+#endif
+       lock->serving[0]++;
+}
+
+void ReadLock (RWLock *lock)
+{
+ushort w;
+#ifdef unix
+       w = __sync_fetch_and_add (lock->rin, RINC) & MASK;
+#else
+       w = _InterlockedExchangeAdd16 (lock->rin, RINC) & MASK;
+#endif
+       if( w )
+         while( w == (*lock->rin & MASK) )
+#ifdef unix
+               sched_yield ();
+#else
+               SwitchToThread ();
+#endif
+}
+
+void ReadRelease (RWLock *lock)
+{
+#ifdef unix
+       __sync_fetch_and_add (lock->rout, RINC);
+#else
+       _InterlockedExchangeAdd16 (lock->rout, RINC);
+#endif
+}
+
+//     Spin Latch Manager
 
 //     wait until write lock mode is clear
 //     and add 1 to the share count
@@ -340,28 +431,20 @@ void bt_spinreadlock(BtSpinLatch *latch)
 ushort prev;
 
   do {
-       //      obtain latch mutex
 #ifdef unix
-       if( __sync_lock_test_and_set(latch->mutex, 1) )
-               continue;
+       prev = __sync_fetch_and_add (latch->lock, SHARE);
 #else
-       if( _InterlockedExchange8(latch->mutex, 1) )
-               continue;
+       prev = _InterlockedExchangeAdd16(latch->lock, SHARE);
 #endif
        //  see if exclusive request is granted or pending
 
-       if( prev = !(latch->exclusive | latch->pending) )
-               latch->share++;
-
+       if( !(prev & BOTH) )
+               return;
 #ifdef unix
-       *latch->mutex = 0;
+       prev = __sync_fetch_and_add (latch->lock, -SHARE);
 #else
-       _InterlockedExchange8(latch->mutex, 0);
+       prev = _InterlockedExchangeAdd16(latch->lock, -SHARE);
 #endif
-
-       if( prev )
-               return;
-
 #ifdef  unix
   } while( sched_yield(), 1 );
 #else
@@ -373,27 +456,23 @@ ushort prev;
 
 void bt_spinwritelock(BtSpinLatch *latch)
 {
-uint prev;
+ushort prev;
 
   do {
 #ifdef  unix
-       if( __sync_lock_test_and_set(latch->mutex, 1) )
-               continue;
+       prev = __sync_fetch_and_or(latch->lock, PEND | XCL);
 #else
-       if( _InterlockedExchange8(latch->mutex, 1) )
-               continue;
+       prev = _InterlockedOr16(latch->lock, PEND | XCL);
 #endif
-       if( prev = !(latch->share | latch->exclusive) )
-               latch->exclusive = 1, latch->pending = 0;
-       else
-               latch->pending = 1;
+       if( !(prev & XCL) )
+         if( !(prev & ~BOTH) )
+               return;
+         else
 #ifdef unix
-       *latch->mutex = 0;
+               __sync_fetch_and_and (latch->lock, ~XCL);
 #else
-       _InterlockedExchange8(latch->mutex, 0);
+               _InterlockedAnd16(latch->lock, ~XCL);
 #endif
-       if( prev )
-               return;
 #ifdef  unix
   } while( sched_yield(), 1 );
 #else
@@ -408,26 +487,25 @@ uint prev;
 
 int bt_spinwritetry(BtSpinLatch *latch)
 {
-uint prev;
+ushort prev;
 
-#ifdef unix
-       if( __sync_lock_test_and_set(latch->mutex, 1) )
-               return 0;
+#ifdef  unix
+       prev = __sync_fetch_and_or(latch->lock, XCL);
 #else
-       if( _InterlockedExchange8(latch->mutex, 1) )
-               return 0;
+       prev = _InterlockedOr16(latch->lock, XCL);
 #endif
        //      take write access if all bits are clear
 
-       if( prev = !(latch->exclusive | latch->share) )
-               latch->exclusive = 1;
-
+       if( !(prev & XCL) )
+         if( !(prev & ~BOTH) )
+               return 1;
+         else
 #ifdef unix
-       *latch->mutex = 0;
+               __sync_fetch_and_and (latch->lock, ~XCL);
 #else
-       _InterlockedExchange8(latch->mutex, 0);
+               _InterlockedAnd16(latch->lock, ~XCL);
 #endif
-       return prev;
+       return 0;
 }
 
 //     clear write mode
@@ -435,17 +513,9 @@ uint prev;
 void bt_spinreleasewrite(BtSpinLatch *latch)
 {
 #ifdef unix
-       while( __sync_lock_test_and_set(latch->mutex, 1) )
-               sched_yield();
+       __sync_fetch_and_and(latch->lock, ~BOTH);
 #else
-       while( _InterlockedExchange8(latch->mutex, 1) )
-               SwitchToThread();
-#endif
-       latch->exclusive = 0;
-#ifdef unix
-       *latch->mutex = 0;
-#else
-       _InterlockedExchange8(latch->mutex, 0);
+       _InterlockedAnd16(latch->lock, ~BOTH);
 #endif
 }
 
@@ -454,17 +524,9 @@ void bt_spinreleasewrite(BtSpinLatch *latch)
 void bt_spinreleaseread(BtSpinLatch *latch)
 {
 #ifdef unix
-       while( __sync_lock_test_and_set(latch->mutex, 1) )
-               sched_yield();
-#else
-       while( _InterlockedExchange8(latch->mutex, 1) )
-               SwitchToThread();
-#endif
-       latch->share--;
-#ifdef unix
-       *latch->mutex = 0;
+       __sync_fetch_and_add(latch->lock, -SHARE);
 #else
-       _InterlockedExchange8(latch->mutex, 0);
+       _InterlockedExchangeAdd16(latch->lock, -SHARE);
 #endif
 }
 
@@ -535,17 +597,31 @@ BTERR bt_latchlink (BtDb *bt, uint hashidx, uint slot, uid page_no)
 {
 BtPage page = (BtPage)((uid)slot * bt->page_size + bt->pagepool);
 BtLatchSet *latch = bt->latchsets + slot;
+int lvl;
 
        if( latch->next = bt->table[hashidx].slot )
                bt->latchsets[latch->next].prev = slot;
 
        bt->table[hashidx].slot = slot;
        latch->page_no = page_no;
-       latch->clock = 1;
        latch->prev = 0;
        latch->pin = 1;
 
-       return bt_readpage (bt, page, page_no);
+       if( bt_readpage (bt, page, page_no) )
+               return bt->err;
+
+       lvl = page->lvl << LVL_shift;
+       if( lvl > LVL_mask )
+               lvl = LVL_mask;
+       latch->pin |= lvl;              // store lvl
+       latch->pin |= lvl << 3; // initialize clock
+
+#ifdef unix
+       __sync_fetch_and_add (&bt->latchmgr->cache[page->lvl], 1);
+#else
+       _InterlockedExchangeAdd(&bt->latchmgr->cache[page->lvl], 1);
+#endif
+       return bt->err = 0;
 }
 
 //     release latch pin
@@ -555,7 +631,7 @@ void bt_unpinlatch (BtLatchSet *latch)
 #ifdef unix
        __sync_fetch_and_add(&latch->pin, -1);
 #else
-       _InterlockedDecrement (&latch->pin);
+       _InterlockedDecrement16 (&latch->pin);
 #endif
 }
 
@@ -567,6 +643,7 @@ BtLatchSet *bt_pinlatch (BtDb *bt, uid page_no)
 uint hashidx = page_no % bt->latchmgr->latchhash;
 BtLatchSet *latch;
 uint slot, idx;
+uint lvl, cnt;
 off64_t off;
 uint amt[1];
 BtPage page;
@@ -583,11 +660,20 @@ BtPage page;
   } while( slot = latch->next );
 
   //  found our entry
+  //  increment clock
 
   if( slot ) {
        latch = bt->latchsets + slot;
-       latch->clock = 1;
-       latch->pin++;
+       lvl = (latch->pin & LVL_mask) >> LVL_shift;
+       lvl *= CLOCK_unit * 2;
+       lvl |= CLOCK_unit;
+#ifdef unix
+       __sync_fetch_and_add(&latch->pin, 1);
+       __sync_fetch_and_or(&latch->pin, lvl);
+#else
+       _InterlockedIncrement16 (&latch->pin);
+       _InterlockedOr16 (&latch->pin, lvl);
+#endif
        bt_spinreleasewrite(bt->table[hashidx].latch);
        return latch;
   }
@@ -625,20 +711,44 @@ BtPage page;
        //      or has outstanding pins
 
        slot %= bt->latchmgr->latchtotal;
-       latch = bt->latchsets + slot;
 
+       //      on slot wraparound, check census
+       //      count and increment safe level
+
+       cnt = bt->latchmgr->cache[bt->latchmgr->safelevel];
+
+       if( !slot ) {
+         if( cnt < bt->latchmgr->latchtotal / 10 )
+#ifdef unix
+               __sync_fetch_and_add(&bt->latchmgr->safelevel, 1);
+#else
+               _InterlockedIncrement (&bt->latchmgr->safelevel);
+#endif
+         continue;
+       }
+
+       latch = bt->latchsets + slot;
        idx = latch->page_no % bt->latchmgr->latchhash;
+       lvl = (latch->pin & LVL_mask) >> LVL_shift;
 
-       if( !slot )
+       //      see if we are evicting this level yet
+       //      or if we are on same chain as hashidx
+
+       if( idx == hashidx || lvl > bt->latchmgr->safelevel )
                continue;
 
        if( !bt_spinwritetry (bt->table[idx].latch) )
                continue;
 
-       if( latch->clock ) {
-               latch->clock = 0;
-               bt_spinreleasewrite (bt->table[idx].latch);
-               continue;
+       if( latch->pin & ~LVL_mask ) {
+         if( latch->pin & CLOCK_mask )
+#ifdef unix
+               __sync_fetch_and_add(&latch->pin, -CLOCK_unit);
+#else
+               _InterlockedExchangeAdd16 (&latch->pin, -CLOCK_unit);
+#endif
+         bt_spinreleasewrite (bt->table[idx].latch);
+         continue;
        }
 
        //  update permanent page area in btree
@@ -646,6 +756,9 @@ BtPage page;
        page = (BtPage)((uid)slot * bt->page_size + bt->pagepool);
 #ifdef unix
        posix_fadvise (bt->idx, page_no << bt->page_bits, bt->page_size, POSIX_FADV_WILLNEED);
+       __sync_fetch_and_add (&bt->latchmgr->cache[page->lvl], -1);
+#else
+       _InterlockedExchangeAdd(&bt->latchmgr->cache[page->lvl], -1);
 #endif
        if( page->dirty )
          if( bt_writepage (bt, page, latch->page_no) )
@@ -939,19 +1052,19 @@ void bt_lockpage(BtLock mode, BtLatchSet *latch)
 {
        switch( mode ) {
        case BtLockRead:
-               bt_spinreadlock (latch->readwr);
+               ReadLock (latch->readwr);
                break;
        case BtLockWrite:
-               bt_spinwritelock (latch->readwr);
+               WriteLock (latch->readwr);
                break;
        case BtLockAccess:
-               bt_spinreadlock (latch->access);
+               ReadLock (latch->access);
                break;
        case BtLockDelete:
-               bt_spinwritelock (latch->access);
+               WriteLock (latch->access);
                break;
        case BtLockParent:
-               bt_spinwritelock (latch->parent);
+               WriteLock (latch->parent);
                break;
        }
 }
@@ -962,19 +1075,19 @@ void bt_unlockpage(BtLock mode, BtLatchSet *latch)
 {
        switch( mode ) {
        case BtLockRead:
-               bt_spinreleaseread (latch->readwr);
+               ReadRelease (latch->readwr);
                break;
        case BtLockWrite:
-               bt_spinreleasewrite (latch->readwr);
+               WriteRelease (latch->readwr);
                break;
        case BtLockAccess:
-               bt_spinreleaseread (latch->access);
+               ReadRelease (latch->access);
                break;
        case BtLockDelete:
-               bt_spinreleasewrite (latch->access);
+               WriteRelease (latch->access);
                break;
        case BtLockParent:
-               bt_spinreleasewrite (latch->parent);
+               WriteRelease (latch->parent);
                break;
        }
 }
@@ -1046,7 +1159,7 @@ int ans;
 void bt_update (BtDb *bt, BtPage page)
 {
 #ifdef unix
-//     msync (page, bt->page_size, MS_ASYNC);
+       msync (page, bt->page_size, MS_ASYNC);
 #else
 //     FlushViewOfFile (page, bt->page_size);
 #endif
@@ -1873,12 +1986,15 @@ BtPage page;
 uint amt[1];
 BtKey ptr;
 
-       memset (blks, 0, sizeof(blks));
-
+#ifdef unix
+       posix_fadvise( bt->idx, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
        if( *(ushort *)(bt->latchmgr->lock) )
                fprintf(stderr, "Alloc page locked\n");
        *(ushort *)(bt->latchmgr->lock) = 0;
 
+       memset (blks, 0, sizeof(blks));
+
        for( idx = 1; idx <= bt->latchmgr->latchdeployed; idx++ ) {
                latch = bt->latchsets + idx;
                if( *(ushort *)latch->readwr )
@@ -1893,30 +2009,30 @@ BtKey ptr;
                        fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
                *(ushort *)latch->parent = 0;
 
-               if( latch->pin ) {
+               if( latch->pin & PIN_mask ) {
                        fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
                        latch->pin = 0;
                }
                page = (BtPage)((uid)idx * bt->page_size + bt->pagepool);
+               blks[page->lvl]++;
 
            if( page->dirty )
                 if( bt_writepage (bt, page, latch->page_no) )
                        fprintf(stderr, "Page %.8x Write Error\n", latch->page_no);
        }
 
+       for( idx = 0; blks[idx]; idx++ )
+               fprintf(stderr, "cache: %d lvl %d blocks\n", blks[idx], idx);
+
        for( hashidx = 0; hashidx < bt->latchmgr->latchhash; hashidx++ ) {
          if( *(ushort *)(bt->table[hashidx].latch) )
                        fprintf(stderr, "hash entry %d locked\n", hashidx);
 
          *(ushort *)(bt->table[hashidx].latch) = 0;
-
-         if( idx = bt->table[hashidx].slot ) do {
-               latch = bt->latchsets + idx;
-               if( latch->pin )
-                       fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
-         } while( idx = latch->next );
        }
 
+       memset (blks, 0, sizeof(blks));
+
        next = bt->latchmgr->nlatchpage + LATCH_page;
        page_no = LEAF_page;
 
@@ -1938,8 +2054,10 @@ BtKey ptr;
                        next = page_no + 1;
                page_no = next;
        }
+
        for( idx = 0; blks[idx]; idx++ )
-               fprintf(stderr, "%d lvl %d blocks\n", blks[idx], idx);
+               fprintf(stderr, "btree: %d lvl %d blocks\n", blks[idx], idx);
+
        return cnt - 1;
 }