X-Git-Url: https://pd.if.org/git/?p=btree;a=blobdiff_plain;f=btree2s.c;h=49b910961e8240139eabe622717f61a2f2daab0b;hp=ac7673e75cca8a0c1a9debe608f938163dfa99d0;hb=0756819e460acb958eeb073f4d5ca9ddf66ec6c7;hpb=c0d1fd8ff9f39147bdda132d5e8725b9bcfe2a9a diff --git a/btree2s.c b/btree2s.c index ac7673e..49b9109 100644 --- a/btree2s.c +++ b/btree2s.c @@ -1,5 +1,5 @@ // btree version 2s -// 02 FEB 2014 +// 18 FEB 2014 // author: karl malbrain, malbrain@cal.berkeley.edu @@ -119,8 +119,10 @@ typedef struct { uint cnt; // count of keys in page uint act; // count of active keys uint min; // next key offset - unsigned char bits; // page size in bits - unsigned char lvl:7; // level of page + unsigned char bits:7; // page size in bits + unsigned char free:1; // page is on free list + unsigned char lvl:6; // level of page + unsigned char kill:1; // page is being deleted unsigned char dirty:1; // page is dirty unsigned char right[BtId]; // page number to right } *BtPage; @@ -166,9 +168,7 @@ typedef struct _BtDb { int nodemax; // highest page cache segment allocated int hashmask; // number of pages in segments - 1 int hashsize; // size of hash table - int posted; // last loadpage found posted key int found; // last deletekey found key - int fence; // last load page used fence position BtHash *lrufirst; // lru list head BtHash *lrulast; // lru list tail ushort *cache; // hash table for cached segments @@ -893,16 +893,6 @@ int amt[1]; BTERR bt_freepage(BtDb *bt, uid page_no) { - // obtain delete lock on deleted node - - if( bt_lockpage(bt, page_no, BtLockDelete) ) - return bt->err; - - // obtain write lock on deleted node - - if( bt_lockpage(bt, page_no, BtLockWrite) ) - return bt->err; - if( bt_mappage (bt, &bt->temp, page_no) ) return bt->err; @@ -917,6 +907,7 @@ BTERR bt_freepage(BtDb *bt, uid page_no) // store chain in second right bt_putid(bt->temp->right, bt_getid(bt->alloc[1].right)); bt_putid(bt->alloc[1].right, page_no); + bt->temp->free = 1; if( bt_update(bt, bt->alloc, ALLOC_page) ) return bt->err; @@ -1024,16 +1015,6 @@ int bt_findslot (BtDb *bt, unsigned char *key, uint len) uint diff, higher = bt->page->cnt, low = 1, slot; uint good = 0; - // if page is being deleted, send to right - - if( !bt->page->cnt ) - return 0; - - // if page is an empty fence holder - - if( !bt->page->act ) - return bt->page->cnt; - // make stopper key an infinite fence value if( bt_getid (bt->page->right) ) @@ -1041,7 +1022,7 @@ uint good = 0; else good++; - // low is the next candidate, higher is already + // low is the lowest candidate, higher is already // tested as .ge. the given key, loop ends when they meet while( diff = higher - low ) { @@ -1068,8 +1049,6 @@ uint mode, prevmode; // start at root of btree and drill down - bt->posted = 1; - do { // determine lock mode of drill level mode = (lock == BtLockWrite) && (drill == lvl) ? BtLockWrite : BtLockRead; @@ -1108,17 +1087,21 @@ uint mode, prevmode; drill = bt->page->lvl; - if( lock == BtLockWrite && drill == lvl ) + if( lock != BtLockRead && drill == lvl ) if( bt_unlockpage(bt, page_no, mode) ) return 0; else continue; } + prevpage = bt->page_no; + prevmode = mode; + // find key on page at this level // and descend to requested level - if( slot = bt_findslot (bt, key, len) ) { + if( !bt->page->kill ) + if( slot = bt_findslot (bt, key, len) ) { if( drill == lvl ) return slot; @@ -1126,27 +1109,18 @@ uint mode, prevmode; if( slot++ < bt->page->cnt ) continue; else - return bt->err = BTERR_struct, 0; + goto slideright; page_no = bt_getid(slotptr(bt->page, slot)->id); - bt->fence = slot == bt->page->cnt; - bt->posted = 1; drill--; - } + continue; + } // or slide right into next page - // (slide left from deleted page) - else { - page_no = bt_getid(bt->page->right); - bt->posted = 0; - } - - // continue down / right using overlapping locks - // to protect pages being killed or split. +slideright: + page_no = bt_getid(bt->page->right); - prevpage = bt->page_no; - prevmode = mode; } while( page_no ); // return error on end of right chain @@ -1155,13 +1129,95 @@ uint mode, prevmode; return 0; // return error } +// a fence key was deleted from a page +// push new fence value upwards + +BTERR bt_fixfence (BtDb *bt, uid page_no, uint lvl) +{ +unsigned char leftkey[256], rightkey[256]; +BtKey ptr; + + // remove deleted key, the old fence value + + ptr = keyptr(bt->page, bt->page->cnt); + memcpy(rightkey, ptr, ptr->len + 1); + + memset (slotptr(bt->page, bt->page->cnt--), 0, sizeof(BtSlot)); + bt->page->dirty = 1; + + ptr = keyptr(bt->page, bt->page->cnt); + memcpy(leftkey, ptr, ptr->len + 1); + + if( bt_update (bt, bt->page, page_no) ) + return bt->err; + + if( bt_lockpage (bt, page_no, BtLockParent) ) + return bt->err; + + if( bt_unlockpage (bt, page_no, BtLockWrite) ) + return bt->err; + + // insert new (now smaller) fence key + + if( bt_insertkey (bt, leftkey+1, *leftkey, lvl + 1, page_no, time(NULL)) ) + return bt->err; + + // remove old (larger) fence key + + if( bt_deletekey (bt, rightkey+1, *rightkey, lvl + 1) ) + return bt->err; + + return bt_unlockpage (bt, page_no, BtLockParent); +} + +// root has a single child +// collapse a level from the btree +// call with root locked in bt->page + +BTERR bt_collapseroot (BtDb *bt, BtPage root) +{ +uid child; +uint idx; + + // find the child entry + // and promote to new root + + do { + for( idx = 0; idx++ < root->cnt; ) + if( !slotptr(root, idx)->dead ) + break; + + child = bt_getid (slotptr(root, idx)->id); + + if( bt_lockpage (bt, child, BtLockDelete) ) + return bt->err; + + if( bt_lockpage (bt, child, BtLockWrite) ) + return bt->err; + + if( bt_mappage (bt, &bt->temp, child) ) + return bt->err; + + memcpy (root, bt->temp, bt->page_size); + + if( bt_update (bt, root, ROOT_page) ) + return bt->err; + + if( bt_freepage (bt, child) ) + return bt->err; + + } while( root->lvl > 1 && root->act == 1 ); + + return bt_unlockpage (bt, ROOT_page, BtLockWrite); +} + // find and delete key on page by marking delete flag bit // when page becomes empty, delete it BTERR bt_deletekey (BtDb *bt, unsigned char *key, uint len, uint lvl) { unsigned char lowerkey[256], higherkey[256]; -uint slot, tod, dirty = 0; +uint slot, dirty = 0, idx, fence, found; uid page_no, right; BtKey ptr; @@ -1170,32 +1226,56 @@ BtKey ptr; else return bt->err; + // are we deleting a fence slot? + + fence = slot == bt->page->cnt; + // if key is found delete it, otherwise ignore request - if( !keycmp (ptr, key, len) ) - if( slotptr(bt->page, slot)->dead == 0 ) { - dirty = slotptr(bt->page,slot)->dead = 1; - if( slot < bt->page->cnt ) - bt->page->dirty = 1; - bt->page->act--; - } + if( found = !keycmp (ptr, key, len) ) + if( found = slotptr(bt->page, slot)->dead == 0 ) { + dirty = slotptr(bt->page,slot)->dead = 1; + bt->page->dirty = 1; + bt->page->act--; - // return if page is not empty, or - // if non-leaf level or fence key + // collapse empty slots + + while( idx = bt->page->cnt - 1 ) + if( slotptr(bt->page, idx)->dead ) { + *slotptr(bt->page, idx) = *slotptr(bt->page, idx + 1); + memset (slotptr(bt->page, bt->page->cnt--), 0, sizeof(BtSlot)); + } else + break; + } right = bt_getid(bt->page->right); page_no = bt->page_no; - if( lvl || bt->page->act || bt->fence ) - if ( dirty && bt_update(bt, bt->page, page_no) ) + // did we delete a fence key in an upper level? + + if( dirty && lvl && bt->page->act && fence ) + if( bt_fixfence (bt, page_no, lvl) ) return bt->err; else - return bt_unlockpage(bt, page_no, BtLockWrite); + return bt->found = found, 0; - // obtain Parent lock over write lock + // is this a collapsed root? - if( bt_lockpage(bt, page_no, BtLockParent) ) + if( lvl > 1 && page_no == ROOT_page && bt->page->act == 1 ) + if( bt_collapseroot (bt, bt->page) ) + return bt->err; + else + return bt->found = found, 0; + + // return if page is not empty + + if( bt->page->act ) { + if( dirty && bt_update(bt, bt->page, page_no) ) return bt->err; + if( bt_unlockpage(bt, page_no, BtLockWrite) ) + return bt->err; + return bt->found = found, 0; + } // cache copy of fence key // in order to find parent @@ -1203,7 +1283,7 @@ BtKey ptr; ptr = keyptr(bt->page, bt->page->cnt); memcpy(lowerkey, ptr, ptr->len + 1); - // lock and map right page + // obtain lock on right page if ( bt_lockpage(bt, right, BtLockWrite) ) return bt->err; @@ -1211,6 +1291,9 @@ BtKey ptr; if( bt_mappage (bt, &bt->temp, right) ) return bt->err; + if( bt->temp->kill ) + return bt->err = BTERR_struct; + // pull contents of next page into current empty page memcpy (bt->page, bt->temp, bt->page_size); @@ -1224,7 +1307,7 @@ BtKey ptr; // until we can post updates at higher level. bt_putid(bt->temp->right, page_no); - bt->temp->cnt = 0; + bt->temp->kill = 1; if( bt_update(bt, bt->page, page_no) ) return bt->err; @@ -1232,36 +1315,46 @@ BtKey ptr; if( bt_update(bt, bt->temp, right) ) return bt->err; - if( bt_unlockpage(bt, right, BtLockWrite) ) + if( bt_lockpage(bt, page_no, BtLockParent) ) return bt->err; if( bt_unlockpage(bt, page_no, BtLockWrite) ) return bt->err; - // delete old lower key to consolidated node + if( bt_lockpage(bt, right, BtLockParent) ) + return bt->err; - if( bt_deletekey (bt, lowerkey + 1, *lowerkey, lvl + 1) ) + if( bt_unlockpage(bt, right, BtLockWrite) ) return bt->err; // redirect higher key directly to consolidated node - tod = (uint)time(NULL); + if( bt_insertkey (bt, higherkey+1, *higherkey, lvl+1, page_no, time(NULL)) ) + return bt->err; + + // delete old lower key to consolidated node - if( bt_insertkey (bt, higherkey+1, *higherkey, lvl + 1, page_no, tod) ) + if( bt_deletekey (bt, lowerkey + 1, *lowerkey, lvl + 1) ) return bt->err; - // obtain write lock and + // obtain write & delete lock on deleted node // add right block to free chain + if( bt_lockpage(bt, right, BtLockDelete) ) + return bt->err; + + if( bt_lockpage(bt, right, BtLockWrite) ) + return bt->err; + if( bt_freepage (bt, right) ) return bt->err; - // remove ParentModify lock + // remove ParentModify locks - if( bt_unlockpage(bt, page_no, BtLockParent) ) + if( bt_unlockpage(bt, right, BtLockParent) ) return bt->err; - - return 0; + + return bt_unlockpage(bt, page_no, BtLockParent); } // find key in leaf level and return row-id @@ -1352,16 +1445,16 @@ int ret; // split the root and raise the height of the btree -BTERR bt_splitroot(BtDb *bt, unsigned char *newkey, unsigned char *oldkey, uid page_no2) +BTERR bt_splitroot(BtDb *bt, unsigned char *leftkey, uid page_no2) { uint nxt = bt->page_size; BtPage root = bt->page; -uid new_page; +uid right; // Obtain an empty page to use, and copy the current // root contents into it - if( !(new_page = bt_newpage(bt, root)) ) + if( !(right = bt_newpage(bt, root)) ) return bt->err; // preserve the page info at the bottom @@ -1371,16 +1464,18 @@ uid new_page; // insert first key on newroot page - nxt -= *newkey + 1; - memcpy ((unsigned char *)root + nxt, newkey, *newkey + 1); - bt_putid(slotptr(root, 1)->id, new_page); + nxt -= *leftkey + 1; + memcpy ((unsigned char *)root + nxt, leftkey, *leftkey + 1); + bt_putid(slotptr(root, 1)->id, right); slotptr(root, 1)->off = nxt; // insert second key on newroot page // and increase the root height - nxt -= *oldkey + 1; - memcpy ((unsigned char *)root + nxt, oldkey, *oldkey + 1); + nxt -= 3; + ((unsigned char *)root)[nxt] = 2; + ((unsigned char *)root)[nxt+1] = 0xff; + ((unsigned char *)root)[nxt+2] = 0xff; bt_putid(slotptr(root, 2)->id, page_no2); slotptr(root, 2)->off = nxt; @@ -1404,21 +1499,17 @@ uid new_page; BTERR bt_splitpage (BtDb *bt) { uint cnt = 0, idx = 0, max, nxt = bt->page_size; -unsigned char oldkey[256], lowerkey[256]; +unsigned char fencekey[256], rightkey[256]; uid page_no = bt->page_no, right; BtPage page = bt->page; uint lvl = page->lvl; -uid new_page; BtKey key; -uint tod; // split higher half of keys to bt->frame // the last key (fence key) might be dead - tod = (uint)time(NULL); - memset (bt->frame, 0, bt->page_size); - max = (int)page->cnt; + max = page->cnt; cnt = max / 2; idx = 0; @@ -1433,9 +1524,9 @@ uint tod; slotptr(bt->frame, idx)->off = nxt; } - // remember existing fence key for new page to the right + // remember fence key for new right page - memcpy (oldkey, key, key->len + 1); + memcpy (rightkey, key, key->len + 1); bt->frame->bits = bt->page_bits; bt->frame->min = nxt; @@ -1444,14 +1535,12 @@ uint tod; // link right node - if( page_no > ROOT_page ) { - right = bt_getid (page->right); - bt_putid(bt->frame->right, right); - } + if( page_no > ROOT_page ) + memcpy (bt->frame->right, page->right, BtId); // get new free page and write frame to it. - if( !(new_page = bt_newpage(bt, bt->frame)) ) + if( !(right = bt_newpage(bt, bt->frame)) ) return bt->err; // update lower keys to continue in old page @@ -1459,6 +1548,7 @@ uint tod; memcpy (bt->frame, page, bt->page_size); memset (page+1, 0, bt->page_size - sizeof(*page)); nxt = bt->page_size; + page->dirty = 0; page->act = 0; cnt = 0; idx = 0; @@ -1476,56 +1566,46 @@ uint tod; page->act++; } - // remember fence key for old page + // remember fence key for smaller page + + memcpy (fencekey, key, key->len + 1); - memcpy(lowerkey, key, key->len + 1); - bt_putid(page->right, new_page); + bt_putid(page->right, right); page->min = nxt; page->cnt = idx; // if current page is the root page, split it if( page_no == ROOT_page ) - return bt_splitroot (bt, lowerkey, oldkey, new_page); + return bt_splitroot (bt, fencekey, right); - // update left (containing) node + // lock right page - if( bt_update(bt, page, page_no) ) + if( bt_lockpage (bt, right, BtLockParent) ) return bt->err; - // obtain Parent/Write locks - // for left and right node pages + // update left (containing) node - if( bt_lockpage (bt, new_page, BtLockParent) ) + if( bt_update(bt, page, page_no) ) return bt->err; if( bt_lockpage (bt, page_no, BtLockParent) ) return bt->err; - // release wr lock on left page - - if( bt_unlockpage (bt, page_no, BtLockWrite) ) - return bt->err; - // insert new fence for reformulated left block - if( bt_insertkey (bt, lowerkey+1, *lowerkey, lvl + 1, page_no, tod) ) + if( bt_insertkey (bt, fencekey+1, *fencekey, lvl+1, page_no, time(NULL)) ) return bt->err; - // fix old fence for newly allocated right block page + // switch fence for right block of larger keys to new right page - if( bt_insertkey (bt, oldkey+1, *oldkey, lvl + 1, new_page, tod) ) - return bt->err; - - // release Parent & Write locks - - if( bt_unlockpage (bt, new_page, BtLockParent) ) + if( bt_insertkey (bt, rightkey+1, *rightkey, lvl+1, right, time(NULL)) ) return bt->err; if( bt_unlockpage (bt, page_no, BtLockParent) ) return bt->err; - return 0; + return bt_unlockpage (bt, right, BtLockParent); } // Insert new key into the btree at requested level. @@ -1553,12 +1633,14 @@ BtKey ptr; page = bt->page; if( !keycmp (ptr, key, len) ) { - slotptr(page, slot)->dead = 0; - slotptr(page, slot)->tod = tod; - bt_putid(slotptr(page,slot)->id, id); - if ( bt_update(bt, bt->page, bt->page_no) ) - return bt->err; - return bt_unlockpage(bt, bt->page_no, BtLockWrite); + if( slotptr(page, slot)->dead ) + page->act++; + slotptr(page, slot)->dead = 0; + slotptr(page, slot)->tod = tod; + bt_putid(slotptr(page,slot)->id, id); + if ( bt_update(bt, bt->page, bt->page_no) ) + return bt->err; + return bt_unlockpage(bt, bt->page_no, BtLockWrite); } // check if page has enough space @@ -1596,7 +1678,7 @@ BtKey ptr; slotptr(page, slot)->tod = tod; slotptr(page, slot)->dead = 0; - if ( bt_update(bt, bt->page, bt->page_no) ) + if( bt_update(bt, bt->page, bt->page_no) ) return bt->err; return bt_unlockpage(bt, bt->page_no, BtLockWrite); @@ -1609,11 +1691,16 @@ uint bt_startkey (BtDb *bt, unsigned char *key, uint len) uint slot; // cache page for retrieval + if( slot = bt_loadpage (bt, key, len, 0, BtLockRead) ) - memcpy (bt->cursor, bt->page, bt->page_size); + memcpy (bt->cursor, bt->page, bt->page_size); + else + return 0; + bt->cursor_page = bt->page_no; + if ( bt_unlockpage(bt, bt->page_no, BtLockRead) ) - return 0; + return 0; return slot; } @@ -1627,6 +1714,7 @@ off64_t right; do { right = bt_getid(bt->cursor->right); + while( slot++ < bt->cursor->cnt ) if( slotptr(bt->cursor,slot)->dead ) continue; @@ -1640,17 +1728,19 @@ off64_t right; bt->cursor_page = right; - if( bt_lockpage(bt, right,BtLockRead) ) + if( bt_lockpage(bt, right, BtLockRead) ) return 0; if( bt_mappage (bt, &bt->page, right) ) - break; + return 0; memcpy (bt->cursor, bt->page, bt->page_size); + if ( bt_unlockpage(bt, right, BtLockRead) ) return 0; slot = 0; + } while( 1 ); return bt->err = 0;