]> pd.if.org Git - btree/commitdiff
Implement CLOCK buffer pool page replacement method
authorunknown <karl@E04.petzent.com>
Mon, 3 Mar 2014 23:24:59 +0000 (15:24 -0800)
committerunknown <karl@E04.petzent.com>
Mon, 3 Mar 2014 23:24:59 +0000 (15:24 -0800)
btree2u.c
threads2h.c
threads2i.c
threads2j.c

index bd97dd2a609a397e5bae9106b27baccd06033944..2d2a3d0a716149d87a5ab7378a5b6835d8315e36 100644 (file)
--- a/btree2u.c
+++ b/btree2u.c
@@ -43,6 +43,7 @@ REDISTRIBUTION OF THIS SOFTWARE.
 #include <stdlib.h>
 #include <time.h>
 #include <fcntl.h>
+#include <io.h>
 #endif
 
 #include <memory.h>
@@ -169,6 +170,7 @@ typedef struct {
        BtSpinLatch readwr[1];          // read/write page lock
        BtSpinLatch access[1];          // Access Intent/Page delete
        BtSpinLatch parent[1];          // Posting of fence key in parent
+       volatile ushort clock;          // accessed since last clock pass
        volatile uint next;                     // next entry in hash table chain
        volatile uint prev;                     // prev entry in hash table chain
        volatile uint pin;                      // number of outstanding pins
@@ -539,6 +541,7 @@ BtLatchSet *latch = bt->latchsets + slot;
 
        bt->table[hashidx].slot = slot;
        latch->page_no = page_no;
+       latch->clock = 1;
        latch->prev = 0;
        latch->pin = 1;
 
@@ -568,7 +571,7 @@ off64_t off;
 uint amt[1];
 BtPage page;
 
-  //  try to find unpinned entry
+  //  try to find our entry
 
   bt_spinwritelock(bt->table[hashidx].latch);
 
@@ -579,33 +582,12 @@ BtPage page;
                break;
   } while( slot = latch->next );
 
-  //  found our entry, bring to front of hash chain
+  //  found our entry
 
   if( slot ) {
        latch = bt->latchsets + slot;
-#ifdef unix
-       __sync_fetch_and_add(&latch->pin, 1);
-#else
-       _InterlockedIncrement (&latch->pin);
-#endif
-       //  unlink our entry from its hash chain position
-
-       if( latch->prev )
-               bt->latchsets[latch->prev].next = latch->next;
-       else
-               bt->table[hashidx].slot = latch->next;
-
-       if( latch->next )
-               bt->latchsets[latch->next].prev = latch->prev;
-
-       //  now link into head of the hash chain
-
-       if( latch->next = bt->table[hashidx].slot )
-               bt->latchsets[latch->next].prev = slot;
-
-       bt->table[hashidx].slot = slot;
-       latch->prev = 0;
-
+       latch->clock = 1;
+       latch->pin++;
        bt_spinreleasewrite(bt->table[hashidx].latch);
        return latch;
   }
@@ -630,32 +612,31 @@ BtPage page;
 #else
        _InterlockedDecrement (&bt->latchmgr->latchdeployed);
 #endif
-  //  find and reuse previous lru lock entry on victim hash chain
+  //  find and reuse previous entry on victim
 
   while( 1 ) {
 #ifdef unix
-       idx = __sync_fetch_and_add(&bt->latchmgr->latchvictim, 1);
+       slot = __sync_fetch_and_add(&bt->latchmgr->latchvictim, 1);
 #else
-       idx = _InterlockedIncrement (&bt->latchmgr->latchvictim) - 1;
+       slot = _InterlockedIncrement (&bt->latchmgr->latchvictim) - 1;
 #endif
        // try to get write lock on hash chain
        //      skip entry if not obtained
-       //      or has outstanding locks
+       //      or has outstanding pins
+
+       slot %= bt->latchmgr->latchtotal;
+       latch = bt->latchsets + slot;
 
-       idx %= bt->latchmgr->latchhash;
+       idx = latch->page_no % bt->latchmgr->latchhash;
 
-       if( !bt_spinwritetry (bt->table[idx].latch) )
+       if( !slot )
                continue;
 
-       if( slot = bt->table[idx].slot )
-         while( 1 ) {
-               latch = bt->latchsets + slot;
-               if( !latch->next )
-                       break;
-               slot = latch->next;
-         }
+       if( !bt_spinwritetry (bt->table[idx].latch) )
+               continue;
 
-       if( !slot || latch->pin ) {
+       if( latch->clock ) {
+               latch->clock = 0;
                bt_spinreleasewrite (bt->table[idx].latch);
                continue;
        }
@@ -663,7 +644,9 @@ BtPage page;
        //  update permanent page area in btree
 
        page = (BtPage)((uid)slot * bt->page_size + bt->pagepool);
-
+#ifdef unix
+       posix_fadvise (bt->idx, page_no << bt->page_bits, bt->page_size, POSIX_FADV_WILLNEED);
+#endif
        if( page->dirty )
          if( bt_writepage (bt, page, latch->page_no) )
                return NULL;
@@ -750,7 +733,8 @@ struct flock lock[1];
        bt = calloc (1, sizeof(BtDb));
 
        bt->idx = open ((char*)name, O_RDWR | O_CREAT, 0666);
-
+       posix_fadvise( bt->idx, 0, 0, POSIX_FADV_RANDOM);
        if( bt->idx == -1 ) {
                fprintf(stderr, "unable to open %s\n", name);
                return free(bt), NULL;
@@ -789,7 +773,7 @@ struct flock lock[1];
 #endif 
 
 #ifdef unix
-       latchmgr = malloc (BT_maxpage);
+       latchmgr = valloc (BT_maxpage);
        *amt = 0;
 
        // read minimum page size to get root info
@@ -880,13 +864,12 @@ struct flock lock[1];
        // clear out buffer pool pages
 
        memset(latchmgr, 0, bt->page_size);
-       last = MIN_lvl;
+       last = MIN_lvl + nlatchpage;
 
-       while( ++last < ((MIN_lvl + 1 + nlatchpage) ) )
-         if( bt_writepage (bt, latchmgr->alloc, last) ) {
+       if( bt_writepage (bt, latchmgr->alloc, last) ) {
                fprintf (stderr, "Unable to write buffer pool page %.8x\n", last);
                return bt_close (bt), NULL;
-         }
+       }
                
 #ifdef unix
        free (latchmgr);
@@ -919,6 +902,7 @@ btlatch:
                fprintf (stderr, "Unable to mmap buffer pool, errno = %d", errno);
                return bt_close (bt), NULL;
        }
+       madvise (bt->table, (uid)nlatchpage << bt->page_bits, MADV_RANDOM | MADV_WILLNEED);
 #else
        flag = PAGE_READWRITE;
        bt->halloc = CreateFileMapping(bt->idx, NULL, flag, 0, ((uid)nlatchpage + LATCH_page) * bt->page_size, NULL);
@@ -940,7 +924,7 @@ btlatch:
        bt->latchsets = (BtLatchSet *)(bt->pagepool - (uid)bt->latchmgr->latchtotal * sizeof(BtLatchSet));
 
 #ifdef unix
-       bt->mem = malloc (2 * bt->page_size);
+       bt->mem = valloc (2 * bt->page_size);
 #else
        bt->mem = VirtualAlloc(NULL, 2 * bt->page_size, MEM_COMMIT, PAGE_READWRITE);
 #endif
@@ -1062,7 +1046,7 @@ int ans;
 void bt_update (BtDb *bt, BtPage page)
 {
 #ifdef unix
-       msync (page, bt->page_size, MS_ASYNC);
+//     msync (page, bt->page_size, MS_ASYNC);
 #else
 //     FlushViewOfFile (page, bt->page_size);
 #endif
@@ -1883,11 +1867,14 @@ uint bt_audit (BtDb *bt)
 uint idx, hashidx;
 uid next, page_no;
 BtLatchSet *latch;
+uint blks[64];
 uint cnt = 0;
 BtPage page;
 uint amt[1];
 BtKey ptr;
 
+       memset (blks, 0, sizeof(blks));
+
        if( *(ushort *)(bt->latchmgr->lock) )
                fprintf(stderr, "Alloc page locked\n");
        *(ushort *)(bt->latchmgr->lock) = 0;
@@ -1944,12 +1931,15 @@ BtKey ptr;
                 }
                 if( !bt->frame->lvl )
                        cnt += bt->frame->act;
+                blks[bt->frame->lvl]++;
                }
 
                if( page_no > LEAF_page )
                        next = page_no + 1;
                page_no = next;
        }
+       for( idx = 0; blks[idx]; idx++ )
+               fprintf(stderr, "%d lvl %d blocks\n", blks[idx], idx);
        return cnt - 1;
 }
 
@@ -2024,15 +2014,20 @@ int ch, cnt = 0, bits = 12, idx;
 unsigned char key[256];
 double done, start;
 uid next, page_no;
+BtLatchSet *latch;
 float elapsed;
 time_t tod[1];
 uint scan = 0;
 uint len = 0;
 uint map = 0;
+BtPage page;
 BtKey ptr;
 BtDb *bt;
 FILE *in;
 
+#ifdef WIN32
+       _setmode (1, _O_BINARY);
+#endif
        if( argc < 4 ) {
                fprintf (stderr, "Usage: %s idx_file src_file Read/Write/Scan/Delete/Find/Count [page_bits mapped_pool_pages start_line_number]\n", argv[0]);
                fprintf (stderr, "  page_bits: size of btree page in bits\n");
@@ -2061,15 +2056,27 @@ FILE *in;
 
        switch(argv[3][0]| 0x20)
        {
-       case 'a':
-               fprintf(stderr, "started audit for %s\n", argv[2]);
+       case 'p':       // display page
+               if( latch = bt_pinlatch (bt, off) )
+                       page = bt_mappage (bt, latch);
+               else
+                       fprintf(stderr, "unable to read page %.8x\n", off);
+
+               write (1, page, bt->page_size);
+               break;
+
+       case 'a':       // buffer pool audit
+               fprintf(stderr, "started audit for %s\n", argv[1]);
                cnt = bt_audit (bt);
-               fprintf(stderr, "finished audit for %s, %d keys\n", argv[2], cnt);
+               fprintf(stderr, "finished audit for %s, %d keys\n", argv[1], cnt);
                break;
 
-       case 'w':
+       case 'w':       // write keys
                fprintf(stderr, "started indexing for %s\n", argv[2]);
-               if( argc > 2 && (in = fopen (argv[2], "rb")) )
+               if( argc > 2 && (in = fopen (argv[2], "rb")) ) {
+#ifdef unix
+                 posix_fadvise( fileno(in), 0, 0, POSIX_FADV_NOREUSE);
+#endif
                  while( ch = getc(in), ch != EOF )
                        if( ch == '\n' )
                        {
@@ -2082,12 +2089,16 @@ FILE *in;
                        }
                        else if( len < 245 )
                                key[len++] = ch;
+                 }
                fprintf(stderr, "finished adding keys for %s, %d \n", argv[2], line);
                break;
 
-       case 'd':
+       case 'd':       // delete keys
                fprintf(stderr, "started deleting keys for %s\n", argv[2]);
-               if( argc > 2 && (in = fopen (argv[2], "rb")) )
+               if( argc > 2 && (in = fopen (argv[2], "rb")) ) {
+#ifdef unix
+                 posix_fadvise( fileno(in), 0, 0, POSIX_FADV_NOREUSE);
+#endif
                  while( ch = getc(in), ch != EOF )
                        if( ch == '\n' )
                        {
@@ -2100,12 +2111,16 @@ FILE *in;
                        }
                        else if( len < 245 )
                                key[len++] = ch;
+                 }
                fprintf(stderr, "finished deleting keys for %s, %d \n", argv[2], line);
                break;
 
-       case 'f':
+       case 'f':       // find keys
                fprintf(stderr, "started finding keys for %s\n", argv[2]);
-               if( argc > 2 && (in = fopen (argv[2], "rb")) )
+               if( argc > 2 && (in = fopen (argv[2], "rb")) ) {
+#ifdef unix
+                 posix_fadvise( fileno(in), 0, 0, POSIX_FADV_NOREUSE);
+#endif
                  while( ch = getc(in), ch != EOF )
                        if( ch == '\n' )
                        {
@@ -2120,10 +2135,11 @@ FILE *in;
                        }
                        else if( len < 245 )
                                key[len++] = ch;
+                 }
                fprintf(stderr, "finished search of %d keys for %s, found %d\n", line, argv[2], found);
                break;
 
-       case 's':
+       case 's':       // scan and print keys
                fprintf(stderr, "started scaning\n");
                cnt = len = key[0] = 0;
 
@@ -2142,7 +2158,7 @@ FILE *in;
                fprintf(stderr, " Total keys read %d\n", cnt - 1);
                break;
 
-       case 'c':
+       case 'c':       // count keys
          fprintf(stderr, "started counting\n");
          cnt = 0;
 
@@ -2150,8 +2166,6 @@ FILE *in;
          page_no = LEAF_page;
 
          while( page_no < bt_getid(bt->latchmgr->alloc->right) ) {
-         BtLatchSet *latch;
-         BtPage page;
                if( latch = bt_pinlatch (bt, page_no) )
                        page = bt_mappage (bt, latch);
                if( !page->free && !page->lvl )
index 7602b1179d54676363846162a930dcf1822fcc4a..500e8b3d470af09006dcd851e1aaeab26941a09d 100644 (file)
@@ -179,7 +179,6 @@ typedef struct BtPage_ {
 //     The memory mapping pool table buffer manager entry
 
 typedef struct {
-       unsigned long long int lru;     // number of times accessed
        uid  basepage;                          // mapped base page number
        char *map;                                      // mapped memory pointer
        ushort slot;                            // slot index in this array
@@ -191,6 +190,8 @@ typedef struct {
 #endif
 } BtPool;
 
+#define CLOCK_bit 0x8000               // bit in pool->pin
+
 //  The loadpage interface object
 
 typedef struct {
@@ -229,7 +230,7 @@ typedef struct {
        ushort poolmax;                         // highest page pool node allocated
        ushort poolmask;                        // total number of pages in mmap segment - 1
        ushort hashsize;                        // size of Hash Table for pool entries
-       volatile uint evicted;          // last evicted hash table slot
+       volatile uint evicted;          // last evicted pool table slot
        ushort *hash;                           // pool index for hash entries
        BtSpinLatch *latch;                     // latches for hash table slots
        BtLatchMgr *latchmgr;           // mapped latch page from allocation page
@@ -1116,7 +1117,7 @@ uint slot;
 
        pool->hashprev = pool->hashnext = NULL;
        pool->basepage = page_no & ~bt->mgr->poolmask;
-       pool->lru = 1;
+       pool->pin = CLOCK_bit + 1;
 
        if( slot = bt->mgr->hash[idx] ) {
                node = bt->mgr->pool + slot;
@@ -1127,32 +1128,6 @@ uint slot;
        bt->mgr->hash[idx] = pool->slot;
 }
 
-//     find best segment to evict from buffer pool
-
-BtPool *bt_findlru (BtDb *bt, uint hashslot)
-{
-unsigned long long int target = ~0LL;
-BtPool *pool = NULL, *node;
-
-       if( !hashslot )
-               return NULL;
-
-       node = bt->mgr->pool + hashslot;
-
-       //  scan pool entries under hash table slot
-
-       do {
-         if( node->pin )
-               continue;
-         if( node->lru > target )
-               continue;
-         target = node->lru;
-         pool = node;
-       } while( node = node->hashnext );
-
-       return pool;
-}
-
 //  map new buffer pool segment to virtual memory
 
 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
@@ -1207,42 +1182,25 @@ void bt_unpinpool (BtPool *pool)
 
 BtPool *bt_pinpool(BtDb *bt, uid page_no)
 {
+uint slot, hashidx, idx, victim;
 BtPool *pool, *node, *next;
-uint slot, idx, victim;
 
        //      lock hash table chain
 
-       idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
-       bt_spinreadlock (&bt->mgr->latch[idx]);
+       hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+       bt_spinwritelock (&bt->mgr->latch[hashidx]);
 
        //      look up in hash table
 
-       if( pool = bt_findpool(bt, page_no, idx) ) {
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleaseread (&bt->mgr->latch[idx]);
-               pool->lru++;
-               return pool;
-       }
-
-       // upgrade to write lock
-
-       bt_spinreleaseread (&bt->mgr->latch[idx]);
-       bt_spinwritelock (&bt->mgr->latch[idx]);
-
-       // try to find page in pool with write lock
-
-       if( pool = bt_findpool(bt, page_no, idx) ) {
+       if( pool = bt_findpool(bt, page_no, hashidx) ) {
 #ifdef unix
+               __sync_fetch_and_or(&pool->pin, CLOCK_bit);
                __sync_fetch_and_add(&pool->pin, 1);
 #else
+               _InterlockedOr16 (&pool->pin, CLOCK_bit);
                _InterlockedIncrement16 (&pool->pin);
 #endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
-               pool->lru++;
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 
@@ -1262,13 +1220,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 
@@ -1287,20 +1240,30 @@ uint slot, idx, victim;
 #else
                victim = _InterlockedIncrement (&bt->mgr->evicted) - 1;
 #endif
-               victim %= bt->mgr->hashsize;
+               victim %= bt->mgr->poolmax;
+               pool = bt->mgr->pool + victim;
+               idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+
+               if( !victim )
+                       continue;
 
                // try to get write lock
                //      skip entry if not obtained
 
-               if( !bt_spinwritetry (&bt->mgr->latch[victim]) )
+               if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
                        continue;
 
-               //  if pool entry is empty
-               //      or any pages are pinned
-               //      skip this entry
+               //      skip this entry if
+               //      page is pinned
+               //  or clock bit is set
 
-               if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) {
-                       bt_spinreleasewrite (&bt->mgr->latch[victim]);
+               if( pool->pin ) {
+#ifdef unix
+                       __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
+#else
+                       _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
+#endif
+                       bt_spinreleasewrite (&bt->mgr->latch[idx]);
                        continue;
                }
 
@@ -1309,14 +1272,14 @@ uint slot, idx, victim;
                if( node = pool->hashprev )
                        node->hashnext = pool->hashnext;
                else if( node = pool->hashnext )
-                       bt->mgr->hash[victim] = node->slot;
+                       bt->mgr->hash[idx] = node->slot;
                else
-                       bt->mgr->hash[victim] = 0;
+                       bt->mgr->hash[idx] = 0;
 
                if( node = pool->hashnext )
                        node->hashprev = pool->hashprev;
 
-               bt_spinreleasewrite (&bt->mgr->latch[victim]);
+               bt_spinreleasewrite (&bt->mgr->latch[idx]);
 
                //      remove old file mapping
 #ifdef unix
@@ -1334,13 +1297,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 }
@@ -2296,7 +2254,7 @@ BtPageSet set[1];
 BtKey ptr;
 
 #ifdef unix
-       for( idx = 1; idx < bt->mgr->latchmgr->latchdeployed; idx++ ) {
+       for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
                set->latch = bt->mgr->latchsets + idx;
                if( set->latch->pin ) {
                        fprintf(stderr, "latchset %d pinned for page %.6x\n", idx, set->latch->page_no);
@@ -2433,8 +2391,6 @@ FILE *in;
                                found++;
                          else if( bt->err )
                                fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
-                         else
-                               fprintf(stderr, "Unable to find key %.*s line %d\n", len, key, line);
                          len = 0;
                        }
                        else if( len < 255 )
index b67940e65e7b5b85d067e05786f8f7938e7f3382..f1d0269b89776928b209bef823aa65c40c4f91ae 100644 (file)
@@ -171,7 +171,6 @@ typedef struct BtPage_ {
 //     The memory mapping pool table buffer manager entry
 
 typedef struct {
-       unsigned long long int lru;     // number of times accessed
        uid  basepage;                          // mapped base page number
        char *map;                                      // mapped memory pointer
        ushort slot;                            // slot index in this array
@@ -183,6 +182,8 @@ typedef struct {
 #endif
 } BtPool;
 
+#define CLOCK_bit 0x8000               // bit in pool->pin
+
 //  The loadpage interface object
 
 typedef struct {
@@ -1027,7 +1028,7 @@ uint slot;
 
        pool->hashprev = pool->hashnext = NULL;
        pool->basepage = page_no & ~bt->mgr->poolmask;
-       pool->lru = 1;
+       pool->pin = CLOCK_bit + 1;
 
        if( slot = bt->mgr->hash[idx] ) {
                node = bt->mgr->pool + slot;
@@ -1038,32 +1039,6 @@ uint slot;
        bt->mgr->hash[idx] = pool->slot;
 }
 
-//     find best segment to evict from buffer pool
-
-BtPool *bt_findlru (BtDb *bt, uint hashslot)
-{
-unsigned long long int target = ~0LL;
-BtPool *pool = NULL, *node;
-
-       if( !hashslot )
-               return NULL;
-
-       node = bt->mgr->pool + hashslot;
-
-       //  scan pool entries under hash table slot
-
-       do {
-         if( node->pin )
-               continue;
-         if( node->lru > target )
-               continue;
-         target = node->lru;
-         pool = node;
-       } while( node = node->hashnext );
-
-       return pool;
-}
-
 //  map new buffer pool segment to virtual memory
 
 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
@@ -1119,42 +1094,25 @@ void bt_unpinpool (BtPool *pool)
 
 BtPool *bt_pinpool(BtDb *bt, uid page_no)
 {
+uint slot, hashidx, idx, victim;
 BtPool *pool, *node, *next;
-uint slot, idx, victim;
 
        //      lock hash table chain
 
-       idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
-       bt_spinreadlock (&bt->mgr->latch[idx]);
+       hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+       bt_spinwritelock (&bt->mgr->latch[hashidx]);
 
        //      look up in hash table
 
-       if( pool = bt_findpool(bt, page_no, idx) ) {
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleaseread (&bt->mgr->latch[idx]);
-               pool->lru++;
-               return pool;
-       }
-
-       //      upgrade to write lock
-
-       bt_spinreleaseread (&bt->mgr->latch[idx]);
-       bt_spinwritelock (&bt->mgr->latch[idx]);
-
-       // try to find page in pool with write lock
-
-       if( pool = bt_findpool(bt, page_no, idx) ) {
+       if( pool = bt_findpool(bt, page_no, hashidx) ) {
 #ifdef unix
+               __sync_fetch_and_or(&pool->pin, CLOCK_bit);
                __sync_fetch_and_add(&pool->pin, 1);
 #else
+               _InterlockedOr16 (&pool->pin, CLOCK_bit);
                _InterlockedIncrement16 (&pool->pin);
 #endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
-               pool->lru++;
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 
@@ -1174,13 +1132,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 
@@ -1199,20 +1152,30 @@ uint slot, idx, victim;
 #else
                victim = _InterlockedIncrement (&bt->mgr->evicted) - 1;
 #endif
-               victim %= bt->mgr->hashsize;
+               victim %= bt->mgr->poolmax;
+               pool = bt->mgr->pool + victim;
+               idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+
+               if( !victim )
+                       continue;
 
                // try to get write lock
                //      skip entry if not obtained
 
-               if( !bt_spinwritetry (&bt->mgr->latch[victim]) )
+               if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
                        continue;
 
-               //  if pool entry is empty
-               //      or any pages are pinned
-               //      skip this entry
+               //      skip this entry if
+               //      page is pinned
+               //  or clock bit is set
 
-               if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) {
-                       bt_spinreleasewrite (&bt->mgr->latch[victim]);
+               if( pool->pin ) {
+#ifdef unix
+                       __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
+#else
+                       _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
+#endif
+                       bt_spinreleasewrite (&bt->mgr->latch[idx]);
                        continue;
                }
 
@@ -1221,14 +1184,14 @@ uint slot, idx, victim;
                if( node = pool->hashprev )
                        node->hashnext = pool->hashnext;
                else if( node = pool->hashnext )
-                       bt->mgr->hash[victim] = node->slot;
+                       bt->mgr->hash[idx] = node->slot;
                else
-                       bt->mgr->hash[victim] = 0;
+                       bt->mgr->hash[idx] = 0;
 
                if( node = pool->hashnext )
                        node->hashprev = pool->hashprev;
 
-               bt_spinreleasewrite (&bt->mgr->latch[victim]);
+               bt_spinreleasewrite (&bt->mgr->latch[idx]);
 
                //      remove old file mapping
 #ifdef unix
@@ -1246,13 +1209,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx]);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx]);
                return pool;
        }
 }
@@ -2210,7 +2168,7 @@ BtKey ptr;
                fprintf(stderr, "Alloc page locked\n");
        *(uint *)(bt->mgr->latchmgr->lock) = 0;
 
-       for( idx = 1; idx < bt->mgr->latchmgr->latchdeployed; idx++ ) {
+       for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
                latch = bt->mgr->latchsets + idx;
                if( *(uint *)latch->readwr )
                        fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
@@ -2367,8 +2325,6 @@ FILE *in;
                                found++;
                          else if( bt->err )
                                fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
-                         else
-                               fprintf(stderr, "Unable to find key %.*s line %d\n", len, key, line);
                          len = 0;
                        }
                        else if( len < 255 )
index ca156180799bb11cababbd6b43c292651a71ca01..89ea6d835b37528ed1dbfc04a3ae2512cc58165a 100644 (file)
@@ -183,7 +183,6 @@ typedef struct {
 //     The memory mapping pool table buffer manager entry
 
 typedef struct {
-       unsigned long long int lru;     // number of times accessed
        uid  basepage;                          // mapped base page number
        char *map;                                      // mapped memory pointer
        ushort slot;                            // slot index in this array
@@ -195,6 +194,8 @@ typedef struct {
 #endif
 } BtPool;
 
+#define CLOCK_bit 0x8000               // bit in pool->pin
+
 //  The loadpage interface object
 
 typedef struct {
@@ -1070,7 +1071,7 @@ uint slot;
 
        pool->hashprev = pool->hashnext = NULL;
        pool->basepage = page_no & ~bt->mgr->poolmask;
-       pool->lru = 1;
+       pool->pin = CLOCK_bit + 1;
 
        if( slot = bt->mgr->hash[idx] ) {
                node = bt->mgr->pool + slot;
@@ -1081,32 +1082,6 @@ uint slot;
        bt->mgr->hash[idx] = pool->slot;
 }
 
-//     find best segment to evict from buffer pool
-
-BtPool *bt_findlru (BtDb *bt, uint hashslot)
-{
-unsigned long long int target = ~0LL;
-BtPool *pool = NULL, *node;
-
-       if( !hashslot )
-               return NULL;
-
-       node = bt->mgr->pool + hashslot;
-
-       //  scan pool entries under hash table slot
-
-       do {
-         if( node->pin )
-               continue;
-         if( node->lru > target )
-               continue;
-         target = node->lru;
-         pool = node;
-       } while( node = node->hashnext );
-
-       return pool;
-}
-
 //  map new buffer pool segment to virtual memory
 
 BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
@@ -1162,42 +1137,25 @@ void bt_unpinpool (BtPool *pool)
 
 BtPool *bt_pinpool(BtDb *bt, uid page_no)
 {
+uint slot, hashidx, idx, victim;
 BtPool *pool, *node, *next;
-uint slot, idx, victim;
 
        //      lock hash table chain
 
-       idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
-       bt_spinreadlock (&bt->mgr->latch[idx], 1);
+       hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+       bt_spinreadlock (&bt->mgr->latch[hashidx], 1);
 
        //      look up in hash table
 
-       if( pool = bt_findpool(bt, page_no, idx) ) {
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleaseread (&bt->mgr->latch[idx], 1);
-               pool->lru++;
-               return pool;
-       }
-
-       //      upgrade to write lock
-
-       bt_spinreleaseread (&bt->mgr->latch[idx], 1);
-       bt_spinwritelock (&bt->mgr->latch[idx], 1);
-
-       // try to find page in pool with write lock
-
-       if( pool = bt_findpool(bt, page_no, idx) ) {
+       if( pool = bt_findpool(bt, page_no, hashidx) ) {
 #ifdef unix
+               __sync_fetch_and_or(&pool->pin, CLOCK_bit);
                __sync_fetch_and_add(&pool->pin, 1);
 #else
+               _InterlockedOr16 (&pool->pin, CLOCK_bit);
                _InterlockedIncrement16 (&pool->pin);
 #endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
-               pool->lru++;
+               bt_spinreleaseread (&bt->mgr->latch[hashidx], 1);
                return pool;
        }
 
@@ -1217,13 +1175,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx], 1);
                return pool;
        }
 
@@ -1242,20 +1195,30 @@ uint slot, idx, victim;
 #else
                victim = _InterlockedIncrement16 (&bt->mgr->evicted) - 1;
 #endif
-               victim %= bt->mgr->hashsize;
+               victim %= bt->mgr->poolmax;
+               pool = bt->mgr->pool + victim;
+               idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+
+               if( !victim )
+                       continue;
 
                // try to get write lock
                //      skip entry if not obtained
 
-               if( !bt_spinwritetry (&bt->mgr->latch[victim]) )
+               if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
                        continue;
 
-               //  if pool entry is empty
-               //      or any pages are pinned
-               //      skip this entry
+               //      skip this entry if
+               //      page is pinned
+               //  or clock bit is set
 
-               if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) {
-                       bt_spinreleasewrite (&bt->mgr->latch[victim], 1);
+               if( pool->pin ) {
+#ifdef unix
+                       __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
+#else
+                       _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
+#endif
+                       bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
                        continue;
                }
 
@@ -1264,14 +1227,14 @@ uint slot, idx, victim;
                if( node = pool->hashprev )
                        node->hashnext = pool->hashnext;
                else if( node = pool->hashnext )
-                       bt->mgr->hash[victim] = node->slot;
+                       bt->mgr->hash[idx] = node->slot;
                else
-                       bt->mgr->hash[victim] = 0;
+                       bt->mgr->hash[idx] = 0;
 
                if( node = pool->hashnext )
                        node->hashprev = pool->hashprev;
 
-               bt_spinreleasewrite (&bt->mgr->latch[victim], 1);
+               bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
 
                //      remove old file mapping
 #ifdef unix
@@ -1289,13 +1252,8 @@ uint slot, idx, victim;
                if( bt_mapsegment(bt, pool, page_no) )
                        return NULL;
 
-               bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
-               __sync_fetch_and_add(&pool->pin, 1);
-#else
-               _InterlockedIncrement16 (&pool->pin);
-#endif
-               bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
+               bt_linkhash(bt, pool, page_no, hashidx);
+               bt_spinreleasewrite (&bt->mgr->latch[hashidx], 1);
                return pool;
        }
 }
@@ -2258,7 +2216,7 @@ BtKey ptr;
                fprintf(stderr, "Alloc page locked\n");
        *(uint *)(bt->mgr->latchmgr->lock) = 0;
 
-       for( idx = 1; idx < bt->mgr->latchmgr->latchdeployed; idx++ ) {
+       for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
                latch = bt->mgr->latchsets + idx;
                if( *(uint *)latch->readwr )
                        fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
@@ -2415,8 +2373,6 @@ FILE *in;
                                found++;
                          else if( bt->err )
                                fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
-                         else
-                               fprintf(stderr, "Unable to find key %.*s line %d\n", len, key, line);
                          len = 0;
                        }
                        else if( len < 255 )