// share is count of read accessors
// grant write lock when share == 0
-typedef struct {
- volatile unsigned char mutex; // 1 = busy
- volatile unsigned char write:1; // 1 = exclusive
- volatile unsigned char readwait:1; // readers are waiting
- volatile unsigned char writewait:1; // writers are waiting
- volatile unsigned char filler:5;
- volatile ushort share; // count of readers holding locks
- volatile ushort rcnt; // count of waiting readers
- volatile ushort wcnt; // count of waiting writers
+// The whole latch is volatile through the typedef, so the members
+// need no individual qualifier. The typedef keyword is kept first:
+// placing a storage-class specifier later is obsolescent in C11.
+typedef volatile struct {
+ unsigned char mutex[1]; // 1 = busy; an array so latch->mutex is already the address handed to the __sync builtins
+ unsigned char write:1; // 1 = exclusive
+ unsigned char readwait:1; // readers are waiting
+ unsigned char writewait:1; // writers are waiting
+ unsigned char filler:5; // pads the flag bits out to a full byte
+ ushort share; // count of readers holding locks
+ ushort rcnt; // count of waiting readers
+ ushort wcnt; // count of waiting writers
} BtLatch;
// Define the length of the page and key pointers
uint min; // next key offset
unsigned char bits:7; // page size in bits
unsigned char free:1; // page is on free list
- unsigned char lvl:5; // level of page
+ unsigned char lvl:6; // level of page
unsigned char kill:1; // page is being deleted
unsigned char dirty:1; // page has deleted keys
- unsigned char posted:1; // page fence has posted
unsigned char right[BtId]; // page number to right
} *BtPage;
// The memory mapping pool table buffer manager entry
typedef struct {
- unsigned long long int lru; // number of times accessed
uid basepage; // mapped base page number
char *map; // mapped memory pointer
ushort slot; // slot index in this array
#endif
} BtPool;
+#define CLOCK_bit 0x8000 // bit in pool->pin
+
// The loadpage interface object
typedef struct {
while( 1 ) {
// obtain latch mutex
- while( __sync_lock_test_and_set(&latch->mutex, 1) )
+ while( __sync_lock_test_and_set(latch->mutex, 1) )
sched_yield();
if( decr )
latch->readwait = 1;
latch->rcnt++;
prev = *(uint *)latch & ~1;
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
sys_futex( (uint *)latch, FUTEX_WAIT_BITSET | private, prev, NULL, NULL, QueRd );
decr = 1;
continue;
latch->readwait = 0;
latch->share++;
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
return;
}
}
while( 1 ) {
// obtain latch mutex
- while( __sync_lock_test_and_set(&latch->mutex, 1) )
+ while( __sync_lock_test_and_set(latch->mutex, 1) )
sched_yield();
if( decr )
latch->writewait = 1;
latch->wcnt++;
prev = *(uint *)latch & ~1;
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
sys_futex( (uint *)latch, FUTEX_WAIT_BITSET | private, prev, NULL, NULL, QueWr );
decr = 1;
continue;
latch->writewait = 0;
latch->write = 1;
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
return;
}
}
// try for mutex,
// abandon request if not taken
- if( __sync_lock_test_and_set(&latch->mutex, 1) )
+ if( __sync_lock_test_and_set(latch->mutex, 1) )
return 0;
// see if write mode is available
// release latch mutex
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
return ans;
}
// obtain latch mutex
- while( __sync_lock_test_and_set(&latch->mutex, 1) )
+ while( __sync_lock_test_and_set(latch->mutex, 1) )
sched_yield();
latch->write = 0;
// release latch mutex
wakexit:
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
}
// decrement reader count
// obtain latch mutex
- while( __sync_lock_test_and_set(&latch->mutex, 1) )
+ while( __sync_lock_test_and_set(latch->mutex, 1) )
sched_yield();
latch->share--;
- // wake waiting writers
+ // wake one waiting writer
if( !latch->share && latch->wcnt )
sys_futex( (uint *)latch, FUTEX_WAKE_BITSET | private, 1, NULL, NULL, QueWr );
// release latch mutex
- __sync_lock_release (&latch->mutex);
+ __sync_lock_release (latch->mutex);
}
// link latch table entry into latch hash table
close (mgr->idx);
free (mgr->pool);
free (mgr->hash);
- free (mgr->latch);
+ free ((void *)mgr->latch);
free (mgr);
#else
FlushFileBuffers(mgr->idx);
CloseHandle(mgr->idx);
GlobalFree (mgr->pool);
GlobalFree (mgr->hash);
- GlobalFree (mgr->latch);
+ GlobalFree ((void *)mgr->latch);
GlobalFree (mgr);
#endif
}
void bt_close (BtDb *bt)
{
#ifdef unix
- if ( bt->mem )
+ if( bt->mem )
free (bt->mem);
#else
- if ( bt->mem)
+ if( bt->mem)
VirtualFree (bt->mem, 0, MEM_RELEASE);
#endif
free (bt);
pool->hashprev = pool->hashnext = NULL;
pool->basepage = page_no & ~bt->mgr->poolmask;
- pool->lru = 1;
+ pool->pin = CLOCK_bit + 1;
if( slot = bt->mgr->hash[idx] ) {
node = bt->mgr->pool + slot;
bt->mgr->hash[idx] = pool->slot;
}
-// find best segment to evict from buffer pool
-
-BtPool *bt_findlru (BtDb *bt, uint hashslot)
-{
-unsigned long long int target = ~0LL;
-BtPool *pool = NULL, *node;
-
- if( !hashslot )
- return NULL;
-
- node = bt->mgr->pool + hashslot;
-
- // scan pool entries under hash table slot
-
- do {
- if( node->pin )
- continue;
- if( node->lru > target )
- continue;
- target = node->lru;
- pool = node;
- } while( node = node->hashnext );
-
- return pool;
-}
-
// map new buffer pool segment to virtual memory
BTERR bt_mapsegment(BtDb *bt, BtPool *pool, uid page_no)
BtPool *bt_pinpool(BtDb *bt, uid page_no)
{
+uint slot, hashidx, idx, victim;
BtPool *pool, *node, *next;
-uint slot, idx, victim;
// lock hash table chain
- idx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
- bt_spinreadlock (&bt->mgr->latch[idx], 1);
+ hashidx = (uint)(page_no >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+ bt_spinreadlock (&bt->mgr->latch[hashidx], 1);
// look up in hash table
- if( pool = bt_findpool(bt, page_no, idx) ) {
-#ifdef unix
- __sync_fetch_and_add(&pool->pin, 1);
-#else
- _InterlockedIncrement16 (&pool->pin);
-#endif
- bt_spinreleaseread (&bt->mgr->latch[idx], 1);
- pool->lru++;
- return pool;
- }
-
- // upgrade to write lock
-
- bt_spinreleaseread (&bt->mgr->latch[idx], 1);
- bt_spinwritelock (&bt->mgr->latch[idx], 1);
-
- // try to find page in pool with write lock
-
- if( pool = bt_findpool(bt, page_no, idx) ) {
+ if( pool = bt_findpool(bt, page_no, hashidx) ) {
#ifdef unix
+ __sync_fetch_and_or(&pool->pin, CLOCK_bit);
__sync_fetch_and_add(&pool->pin, 1);
#else
+ _InterlockedOr16 (&pool->pin, CLOCK_bit);
_InterlockedIncrement16 (&pool->pin);
#endif
- bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
- pool->lru++;
+ bt_spinreleaseread (&bt->mgr->latch[hashidx], 1);
return pool;
}
if( bt_mapsegment(bt, pool, page_no) )
return NULL;
- bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
- __sync_fetch_and_add(&pool->pin, 1);
-#else
- _InterlockedIncrement16 (&pool->pin);
-#endif
- bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
+ bt_linkhash(bt, pool, page_no, hashidx);
+ bt_spinreleasewrite (&bt->mgr->latch[hashidx], 1);
return pool;
}
#else
victim = _InterlockedIncrement16 (&bt->mgr->evicted) - 1;
#endif
- victim %= bt->mgr->hashsize;
+ victim %= bt->mgr->poolmax;
+ pool = bt->mgr->pool + victim;
+ idx = (uint)(pool->basepage >> bt->mgr->seg_bits) % bt->mgr->hashsize;
+
+ if( !victim )
+ continue;
// try to get write lock
// skip entry if not obtained
- if( !bt_spinwritetry (&bt->mgr->latch[victim]) )
+ if( !bt_spinwritetry (&bt->mgr->latch[idx]) )
continue;
- // if pool entry is empty
- // or any pages are pinned
- // skip this entry
+ // skip this entry if
+ // page is pinned
+ // or clock bit is set
- if( !(pool = bt_findlru(bt, bt->mgr->hash[victim])) ) {
- bt_spinreleasewrite (&bt->mgr->latch[victim], 1);
+ if( pool->pin ) {
+#ifdef unix
+ __sync_fetch_and_and(&pool->pin, ~CLOCK_bit);
+#else
+ _InterlockedAnd16 (&pool->pin, ~CLOCK_bit);
+#endif
+ bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
continue;
}
if( node = pool->hashprev )
node->hashnext = pool->hashnext;
else if( node = pool->hashnext )
- bt->mgr->hash[victim] = node->slot;
+ bt->mgr->hash[idx] = node->slot;
else
- bt->mgr->hash[victim] = 0;
+ bt->mgr->hash[idx] = 0;
if( node = pool->hashnext )
node->hashprev = pool->hashprev;
- bt_spinreleasewrite (&bt->mgr->latch[victim], 1);
+ bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
// remove old file mapping
#ifdef unix
if( bt_mapsegment(bt, pool, page_no) )
return NULL;
- bt_linkhash(bt, pool, page_no, idx);
-#ifdef unix
- __sync_fetch_and_add(&pool->pin, 1);
-#else
- _InterlockedIncrement16 (&pool->pin);
-#endif
- bt_spinreleasewrite (&bt->mgr->latch[idx], 1);
+ bt_linkhash(bt, pool, page_no, hashidx);
+ bt_spinreleasewrite (&bt->mgr->latch[hashidx], 1);
return pool;
}
}
reuse = 0;
}
#ifdef unix
- if ( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size )
+ if( pwrite(bt->mgr->idx, page, bt->mgr->page_size, new_page << bt->mgr->page_bits) < bt->mgr->page_size )
return bt->err = BTERR_wrt, 0;
// if writing first page of pool block, zero last page in the block
- if ( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 )
+ if( !reuse && bt->mgr->poolmask > 0 && (new_page & bt->mgr->poolmask) == 0 )
{
// use zero buffer to write zeros
- if ( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size )
+ if( pwrite(bt->mgr->idx,bt->zero, bt->mgr->page_size, (new_page | bt->mgr->poolmask) << bt->mgr->page_bits) < bt->mgr->page_size )
return bt->err = BTERR_wrt, 0;
}
#else
// re-read and re-lock root after determining actual level of root
if( set->page->lvl != drill) {
- if ( set->page_no != ROOT_page )
+ if( set->page_no != ROOT_page )
return bt->err = BTERR_struct, 0;
drill = set->page->lvl;
ptr = keyptr(set->page, slot);
else
{
- if ( !bt->err )
+ if( !bt->err )
bt->err = BTERR_ovflw;
return bt->err;
}
return slotptr(bt->cursor,slot)->tod;
}
-
#ifdef STANDALONE
+#ifndef unix
+// Return a timing value in seconds (Windows version).
+//	type 0: wall clock time; only differences between two calls are meaningful
+//	type 1: user-mode CPU time consumed by this process
+//	type 2: kernel-mode CPU time consumed by this process
+//	any other type, or a failed GetProcessTimes call, yields 0
+//
+// A FILETIME holds a 64 bit count of 100 nanosecond ticks.  The count is
+// converted directly instead of being decoded as a calendar date, so CPU
+// times of a day or more are not truncated and the wall clock value does
+// not jump backwards at a week boundary.
+
+double getCpuTime(int type)
+{
+FILETIME crtime[1];
+FILETIME xittime[1];
+FILETIME systime[1];
+FILETIME usrtime[1];
+ULARGE_INTEGER ticks;
+FILETIME *src;
+
+	switch( type ) {
+	case 0:
+		GetSystemTimeAsFileTime (xittime);
+		src = xittime;
+		break;
+	case 1:
+	case 2:
+		if( !GetProcessTimes (GetCurrentProcess(), crtime, xittime, systime, usrtime) )
+			return 0;
+		src = type == 1 ? usrtime : systime;
+		break;
+	default:
+		return 0;
+	}
+
+	// assemble the two 32 bit halves; casting a FILETIME pointer
+	// to a 64 bit integer pointer could be misaligned
+	ticks.LowPart = src->dwLowDateTime;
+	ticks.HighPart = src->dwHighDateTime;
+
+	return (double)ticks.QuadPart / 10000000.0;
+}
+#else
+#include <time.h>
+#include <sys/resource.h>
+
+// Return a timing value in seconds (unix version).
+//	type 0: wall clock time from gettimeofday
+//	type 1: user CPU time of this process from getrusage
+//	type 2: system CPU time of this process from getrusage
+//	any other type yields 0
+
+double getCpuTime(int type)
+{
+struct rusage usage[1];
+struct timeval now[1];
+const struct timeval *src;
+
+	if( type == 0 ) {
+		gettimeofday(now, NULL);
+		src = now;
+	} else if( type == 1 || type == 2 ) {
+		getrusage(RUSAGE_SELF, usage);
+		src = type == 1 ? &usage->ru_utime : &usage->ru_stime;
+	} else
+		return 0;
+
+	// whole seconds plus the microsecond remainder
+	return (double)src->tv_sec + (double)src->tv_usec / 1000000;
+}
+#endif
+
void bt_latchaudit (BtDb *bt)
{
ushort idx, hashidx;
uid next, page_no;
BtLatchSet *latch;
-BtPool *pool;
-BtPage page;
BtKey ptr;
#ifdef unix
- for( idx = 1; idx < bt->mgr->latchmgr->latchdeployed; idx++ ) {
+ if( *(uint *)(bt->mgr->latchmgr->lock) )
+ fprintf(stderr, "Alloc page locked\n");
+ *(uint *)(bt->mgr->latchmgr->lock) = 0;
+
+ for( idx = 1; idx <= bt->mgr->latchmgr->latchdeployed; idx++ ) {
latch = bt->mgr->latchsets + idx;
- if( *(uint *)latch->readwr ) {
- fprintf(stderr, "latchset %d r/w locked for page %.8x\n", idx, latch->page_no);
- *(uint *)latch->readwr = 0;
- }
- if( *(uint *)latch->access ) {
- fprintf(stderr, "latchset %d access locked for page %.8x\n", idx, latch->page_no);
- *(uint *)latch->access = 0;
- }
- if( *(uint *)latch->parent ) {
- fprintf(stderr, "latchset %d parent locked for page %.8x\n", idx, latch->page_no);
- *(uint *)latch->parent = 0;
- }
- if( *(uint *)latch->busy ) {
- fprintf(stderr, "latchset %d busy locked for page %.8x\n", idx, latch->page_no);
- *(uint *)latch->parent = 0;
- }
+ if( *(uint *)latch->readwr )
+ fprintf(stderr, "latchset %d rwlocked for page %.8x\n", idx, latch->page_no);
+ *(uint *)latch->readwr = 0;
+
+ if( *(uint *)latch->access )
+ fprintf(stderr, "latchset %d accesslocked for page %.8x\n", idx, latch->page_no);
+ *(uint *)latch->access = 0;
+
+ if( *(uint *)latch->parent )
+ fprintf(stderr, "latchset %d parentlocked for page %.8x\n", idx, latch->page_no);
+ *(uint *)latch->parent = 0;
+
if( latch->pin ) {
fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
latch->pin = 0;
}
for( hashidx = 0; hashidx < bt->mgr->latchmgr->latchhash; hashidx++ ) {
+ if( *(uint *)(bt->mgr->latchmgr->table[hashidx].latch) )
+ fprintf(stderr, "hash entry %d locked\n", hashidx);
+
+ *(uint *)(bt->mgr->latchmgr->table[hashidx].latch) = 0;
+
if( idx = bt->mgr->latchmgr->table[hashidx].slot ) do {
latch = bt->mgr->latchsets + idx;
- if( latch->hash != hashidx ) {
+ if( *(uint *)latch->busy )
+ fprintf(stderr, "latchset %d busylocked for page %.8x\n", idx, latch->page_no);
+ *(uint *)latch->busy = 0;
+ if( latch->hash != hashidx )
fprintf(stderr, "latchset %d wrong hashidx\n", idx);
- latch->hash = hashidx;
- }
+ if( latch->pin )
+ fprintf(stderr, "latchset %d pinned for page %.8x\n", idx, latch->page_no);
} while( idx = latch->next );
}
found++;
else if( bt->err )
fprintf(stderr, "Error %d Syserr %d Line: %d\n", bt->err, errno, line), exit(0);
- else
- fprintf(stderr, "Unable to find key %.*s line %d\n", len, key, line);
len = 0;
}
else if( len < 255 )
page_no = LEAF_page;
while( page_no < bt_getid(bt->mgr->latchmgr->alloc->right) ) {
- pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, page_no << bt->mgr->page_bits);
+ uid off = page_no << bt->mgr->page_bits;
+#ifdef unix
+ pread (bt->mgr->idx, bt->frame, bt->mgr->page_size, off);
+#else
+ DWORD amt[1];
+
+ SetFilePointer (bt->mgr->idx, (long)off, (long*)(&off)+1, FILE_BEGIN);
+
+ if( !ReadFile(bt->mgr->idx, bt->frame, bt->mgr->page_size, amt, NULL))
+ return bt->err = BTERR_map;
+
+ if( *amt < bt->mgr->page_size )
+ return bt->err = BTERR_map;
+#endif
if( !bt->frame->free && !bt->frame->lvl )
cnt += bt->frame->act;
if( page_no > LEAF_page )
{
int idx, cnt, len, slot, err;
int segsize, bits = 16;
+double start, stop;
#ifdef unix
pthread_t *threads;
-timer start, stop;
#else
-time_t start[1], stop[1];
HANDLE *threads;
#endif
-double real_time;
ThreadArg *args;
uint poolsize = 0;
+float elapsed;
int num = 0;
char key[1];
BtMgr *mgr;
exit(0);
}
-#ifdef unix
- gettimeofday(&start, NULL);
-#else
- time(start);
-#endif
+ start = getCpuTime(0);
if( argc > 3 )
bits = atoi(argv[3]);
#ifdef unix
for( idx = 0; idx < cnt; idx++ )
pthread_join (threads[idx], NULL);
- gettimeofday(&stop, NULL);
- real_time = 1000.0 * ( stop.tv_sec - start.tv_sec ) + 0.001 * (stop.tv_usec - start.tv_usec );
#else
WaitForMultipleObjects (cnt, threads, TRUE, INFINITE);
for( idx = 0; idx < cnt; idx++ )
CloseHandle(threads[idx]);
- time (stop);
- real_time = 1000 * (*stop - *start);
#endif
- fprintf(stderr, " Time to complete: %.2f seconds\n", real_time/1000);
+ elapsed = getCpuTime(0) - start;
+ fprintf(stderr, " real %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
+ elapsed = getCpuTime(1);
+ fprintf(stderr, " user %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
+ elapsed = getCpuTime(2);
+ fprintf(stderr, " sys %dm%.3fs\n", (int)(elapsed/60), elapsed - (int)(elapsed/60)*60);
+
bt_mgrclose (mgr);
}