From dbcd4739e02b8e774e28b752c412d7e2f242cd47 Mon Sep 17 00:00:00 2001 From: jdybnis Date: Mon, 8 Dec 2008 04:40:39 +0000 Subject: [PATCH] in txn, use a "naked" value without an update record whenever possible --- include/common.h | 9 ++-- makefile | 2 +- map/hashtable.c | 18 +++---- map/list.c | 26 ++++----- map/skiplist.c | 56 ++++++++++---------- txn/txn.c | 134 +++++++++++++++++++++++++++++++---------------- 6 files changed, 144 insertions(+), 101 deletions(-) diff --git a/include/common.h b/include/common.h index bf3d04e..faebe4c 100644 --- a/include/common.h +++ b/include/common.h @@ -31,10 +31,11 @@ #define TRUE 1 #define FALSE 0 -#define TAG (1ULL << 63) -#define TAG_VALUE(v) ((uint64_t)(v) | TAG) -#define IS_TAGGED(v) ((uint64_t)(v) & TAG) -#define STRIP_TAG(v) ((uint64_t)(v) & ~TAG) +#define TAG1 (1ULL << 63) +#define TAG2 (1ULL << 62) +#define TAG_VALUE(v, tag) ((uint64_t)(v) | tag) +#define IS_TAGGED(v, tag) ((uint64_t)(v) & tag) +#define STRIP_TAG(v, tag) ((uint64_t)(v) & ~tag) #define DOES_NOT_EXIST 0 #define ERROR_INVALID_OPTION (-1) diff --git a/makefile b/makefile index ba6ad31..82d921e 100644 --- a/makefile +++ b/makefile @@ -7,7 +7,7 @@ OPT := -fwhole-program -combine -03 #-DNDEBUG CFLAGS := -g -Wall -Werror -std=c99 -m64 $(OPT) #-DENABLE_TRACE INCS := $(addprefix -I, include) -TESTS := output/map_test1 output/map_test2 output/rcu_test output/txn_test +TESTS := output/map_test1 output/map_test2 output/txn_test EXES := $(TESTS) RUNTIME_SRCS := runtime/runtime.c runtime/rcu.c runtime/lwt.c runtime/mem.c datatype/nstring.c diff --git a/map/hashtable.c b/map/hashtable.c index 1733941..f9bf715 100644 --- a/map/hashtable.c +++ b/map/hashtable.c @@ -42,7 +42,7 @@ struct ht { }; static const uint64_t COPIED_VALUE = -1; -static const uint64_t TOMBSTONE = STRIP_TAG(-1); +static const uint64_t TOMBSTONE = STRIP_TAG(-1, TAG1); static const unsigned ENTRIES_PER_BUCKET = CACHE_LINE_SIZE/sizeof(entry_t); static const unsigned ENTRIES_PER_COPY_CHUNK = CACHE_LINE_SIZE/sizeof(entry_t)*2; @@ -201,7 +201,7 @@ static int hti_copy_entry (hti_t *ht1, volatile entry_t *ht1_ent, uint32_t key_h } // Tag the value in the old entry to indicate a copy is in progress. - ht1_ent_val = SYNC_FETCH_AND_OR(&ht1_ent->val, TAG_VALUE(0)); + ht1_ent_val = SYNC_FETCH_AND_OR(&ht1_ent->val, TAG_VALUE(0, TAG1)); TRACE("h2", "hti_copy_entry: tagged the value %p in old entry %p", ht1_ent_val, ht1_ent); if (ht1_ent_val == COPIED_VALUE) { TRACE("h1", "hti_copy_entry: entry %p already copied to table %p", ht1_ent, ht2); @@ -213,7 +213,7 @@ static int hti_copy_entry (hti_t *ht1, volatile entry_t *ht1_ent, uint32_t key_h void *key = (ht1->ht->key_type == NULL) ? (void *)ht1_ent_key : GET_PTR(ht1_ent_key); // The old table's dead entries don't need to be copied to the new table, but their keys need to be freed. - assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE)); + assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE, TAG1)); if (ht1_ent_val == TOMBSTONE) { TRACE("h1", "hti_copy_entry: entry %p old value was deleted, now freeing key %p", ht1_ent, key); if (EXPECT_FALSE(ht1->ht->key_type != NULL)) { @@ -251,7 +251,7 @@ static int hti_copy_entry (hti_t *ht1, volatile entry_t *ht1_ent, uint32_t key_h } // Copy the value to the entry in the new table. - ht1_ent_val = STRIP_TAG(ht1_ent_val); + ht1_ent_val = STRIP_TAG(ht1_ent_val, TAG1); uint64_t old_ht2_ent_val = SYNC_CAS(&ht2_ent->val, DOES_NOT_EXIST, ht1_ent_val); // If there is a nested copy in progress, we might have installed the key into a dead entry. @@ -294,7 +294,7 @@ static uint64_t hti_cas (hti_t *hti, void *key, uint32_t key_hash, uint64_t expe TRACE("h1", "hti_cas: hti %p key %p", hti, key); TRACE("h1", "hti_cas: value %p expect %p", new, expected); assert(hti); - assert(!IS_TAGGED(new)); + assert(!IS_TAGGED(new, TAG1)); assert(key); int is_empty; @@ -346,7 +346,7 @@ static uint64_t hti_cas (hti_t *hti, void *key, uint32_t key_hash, uint64_t expe // If the entry is in the middle of a copy, the copy must be completed first. uint64_t ent_val = ent->val; - if (EXPECT_FALSE(IS_TAGGED(ent_val))) { + if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) { if (ent_val != COPIED_VALUE) { int did_copy = hti_copy_entry(hti, ent, key_hash, ((volatile hti_t *)hti)->next); if (did_copy) { @@ -413,7 +413,7 @@ static uint64_t hti_get (hti_t *hti, void *key, uint32_t key_hash) { // If the entry is being copied, finish the copy and retry on the next table. uint64_t ent_val = ent->val; - if (EXPECT_FALSE(IS_TAGGED(ent_val))) { + if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) { if (EXPECT_FALSE(ent_val != COPIED_VALUE)) { int did_copy = hti_copy_entry(hti, ent, key_hash, ((volatile hti_t *)hti)->next); if (did_copy) { @@ -438,7 +438,7 @@ uint64_t ht_cas (hashtable_t *ht, void *key, uint64_t expected_val, uint64_t new TRACE("h2", "ht_cas: key %p ht %p", key, ht); TRACE("h2", "ht_cas: expected val %p new val %p", expected_val, new_val); assert(key != DOES_NOT_EXIST); - assert(!IS_TAGGED(new_val) && new_val != DOES_NOT_EXIST && new_val != TOMBSTONE); + assert(!IS_TAGGED(new_val, TAG1) && new_val != DOES_NOT_EXIST && new_val != TOMBSTONE); hti_t *hti = ht->hti; @@ -538,7 +538,7 @@ void ht_free (hashtable_t *ht) { hti_t *hti = ht->hti; do { for (uint32_t i = 0; i < (1 << hti->scale); ++i) { - assert(hti->table[i].val == COPIED_VALUE || !IS_TAGGED(hti->table[i].val)); + assert(hti->table[i].val == COPIED_VALUE || !IS_TAGGED(hti->table[i].val, TAG1)); if (ht->key_type != NULL && hti->table[i].key != DOES_NOT_EXIST) { nbd_free(GET_PTR(hti->table[i].key)); } diff --git a/map/list.c b/map/list.c index 0982e59..a94d2d0 100644 --- a/map/list.c +++ b/map/list.c @@ -42,7 +42,7 @@ list_t *ll_alloc (const datatype_t *key_type) { void ll_free (list_t *ll) { node_t *item = ll->head->next; while (item) { - node_t *next = (node_t *)STRIP_TAG(item->next); + node_t *next = (node_t *)STRIP_TAG(item->next, TAG1); nbd_free(item); item = next; } @@ -52,10 +52,10 @@ uint64_t ll_count (list_t *ll) { uint64_t count = 0; node_t *item = ll->head->next; while (item) { - if (!IS_TAGGED(item->next)) { + if (!IS_TAGGED(item->next, TAG1)) { count++; } - item = (node_t *)STRIP_TAG(item->next); + item = (node_t *)STRIP_TAG(item->next, TAG1); } return count; } @@ -69,11 +69,11 @@ static int find_pred (node_t **pred_ptr, node_t **item_ptr, list_t *ll, void *ke node_t *next = item->next; // A tag means an item is logically removed but not physically unlinked yet. - while (EXPECT_FALSE(IS_TAGGED(next))) { + while (EXPECT_FALSE(IS_TAGGED(next, TAG1))) { // Skip over logically removed items. if (!help_remove) { - item = (node_t *)STRIP_TAG(item->next); + item = (node_t *)STRIP_TAG(item->next, TAG1); if (EXPECT_FALSE(item == NULL)) break; TRACE("l3", "find_pred: skipping marked item %p (next is %p)", item, next); @@ -84,9 +84,9 @@ static int find_pred (node_t **pred_ptr, node_t **item_ptr, list_t *ll, void *ke // Unlink logically removed items. node_t *other; TRACE("l3", "find_pred: unlinking marked item %p next is %p", item, next); - if ((other = SYNC_CAS(&pred->next, item, STRIP_TAG(next))) == item) { + if ((other = SYNC_CAS(&pred->next, item, STRIP_TAG(next, TAG1))) == item) { TRACE("l2", "find_pred: unlinked item %p from pred %p", item, pred); - item = (node_t *)STRIP_TAG(next); + item = (node_t *)STRIP_TAG(next, TAG1); if (EXPECT_FALSE(item == NULL)) break; next = item->next; @@ -100,7 +100,7 @@ static int find_pred (node_t **pred_ptr, node_t **item_ptr, list_t *ll, void *ke } else { TRACE("l2", "find_pred: lost a race to unlink item %p from pred %p", item, pred); TRACE("l2", "find_pred: pred's link changed to %p", other, 0); - if (IS_TAGGED(other)) + if (IS_TAGGED(other, TAG1)) return find_pred(pred_ptr, item_ptr, ll, key, help_remove); // retry item = other; if (EXPECT_FALSE(item == NULL)) @@ -252,14 +252,14 @@ uint64_t ll_remove (list_t *ll, void *key) { node_t *old_next = item->next; do { next = old_next; - old_next = SYNC_CAS(&item->next, next, TAG_VALUE(next)); - if (IS_TAGGED(old_next)) { + old_next = SYNC_CAS(&item->next, next, TAG_VALUE(next, TAG1)); + if (IS_TAGGED(old_next, TAG1)) { TRACE("l1", "ll_remove: lost a race -- %p is already marked for removal by another thread", item, 0); return DOES_NOT_EXIST; } } while (next != old_next); TRACE("l2", "ll_remove: logically removed item %p", item, 0); - ASSERT(IS_TAGGED(item->next)); + ASSERT(IS_TAGGED(item->next, TAG1)); // Atomically swap out the item's value in case another thread is updating the item while we are // removing it. This establishes which operation occurs first logically, the update or the remove. @@ -291,12 +291,12 @@ void ll_print (list_t *ll) { int i = 0; while (item) { node_t *next = item->next; - if (IS_TAGGED(item)) { + if (IS_TAGGED(item, TAG1)) { printf("*"); } printf("%p:%p ", item, item->key); fflush(stdout); - item = (node_t *)STRIP_TAG(next); + item = (node_t *)STRIP_TAG(next, TAG1); if (i++ > 30) { printf("..."); break; diff --git a/map/skiplist.c b/map/skiplist.c index 49ef21b..62506e1 100644 --- a/map/skiplist.c +++ b/map/skiplist.c @@ -74,7 +74,7 @@ skiplist_t *sl_alloc (const datatype_t *key_type) { void sl_free (skiplist_t *sl) { node_t *item = sl->head->next[0]; while (item) { - node_t *next = (node_t *)STRIP_TAG(item->next[0]); + node_t *next = (node_t *)STRIP_TAG(item->next[0], TAG1); nbd_free(item); item = next; } @@ -84,10 +84,10 @@ uint64_t sl_count (skiplist_t *sl) { uint64_t count = 0; node_t *item = sl->head->next[0]; while (item) { - if (!IS_TAGGED(item->next[0])) { + if (!IS_TAGGED(item->next[0], TAG1)) { count++; } - item = (node_t *)STRIP_TAG(item->next[0]); + item = (node_t *)STRIP_TAG(item->next[0], TAG1); } return count; } @@ -117,7 +117,7 @@ static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl for (int level = start_level; level >= 0; --level) { TRACE("s3", "find_preds: level %llu", level, 0); item = pred->next[level]; - if (EXPECT_FALSE(IS_TAGGED(item))) { + if (EXPECT_FALSE(IS_TAGGED(item, TAG1))) { TRACE("s2", "find_preds: pred %p is marked for removal (item %p); retry", pred, item); return find_preds(preds, succs, n, sl, key, help_remove); // retry } @@ -125,11 +125,11 @@ static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl node_t *next = item->next[level]; // A tag means an item is logically removed but not physically unlinked yet. - while (EXPECT_FALSE(IS_TAGGED(next))) { + while (EXPECT_FALSE(IS_TAGGED(next, TAG1))) { // Skip over logically removed items. if (!help_remove) { - item = (node_t *)STRIP_TAG(item->next); + item = (node_t *)STRIP_TAG(item->next, TAG1); if (EXPECT_FALSE(item == NULL)) break; TRACE("s3", "find_preds: skipping marked item %p (next is %p)", item, next); @@ -140,8 +140,8 @@ static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl // Unlink logically removed items. node_t *other; TRACE("s3", "find_preds: unlinking marked item %p; next is %p", item, next); - if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next))) == item) { - item = (node_t *)STRIP_TAG(next); + if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next, TAG1))) == item) { + item = (node_t *)STRIP_TAG(next, TAG1); if (EXPECT_FALSE(item == NULL)) break; next = item->next[level]; @@ -157,7 +157,7 @@ static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl } else { TRACE("s3", "find_preds: lost race to unlink item %p from pred %p", item, pred); TRACE("s3", "find_preds: pred's link changed to %p", other, 0); - if (IS_TAGGED(other)) + if (IS_TAGGED(other, TAG1)) return find_preds(preds, succs, n, sl, key, help_remove); // retry item = other; if (EXPECT_FALSE(item == NULL)) @@ -170,7 +170,7 @@ static node_t *find_preds (node_t **preds, node_t **succs, int n, skiplist_t *sl break; TRACE("s4", "find_preds: visiting item %p (next is %p)", item, next); - TRACE("s4", "find_preds: key %p val %p", STRIP_TAG(item->key), item->val); + TRACE("s4", "find_preds: key %p val %p", STRIP_TAG(item->key, TAG1), item->val); if (EXPECT_TRUE(sl->key_type == NULL)) { d = (uint64_t)item->key - (uint64_t)key; @@ -235,9 +235,9 @@ void *sl_min_key (skiplist_t *sl) { node_t *item = sl->head->next[0]; while (item != NULL) { node_t *next = item->next[0]; - if (!IS_TAGGED(next)) + if (!IS_TAGGED(next, TAG1)) return item->key; - item = (node_t *)STRIP_TAG(next); + item = (node_t *)STRIP_TAG(next, TAG1); } return DOES_NOT_EXIST; } @@ -335,7 +335,7 @@ uint64_t sl_cas (skiplist_t *sl, void *key, uint64_t expectation, uint64_t new_v do { // There in no need to continue linking in the item if another thread removed it. node_t *old_next = ((volatile node_t *)new_item)->next[level]; - if (IS_TAGGED(old_next)) + if (IS_TAGGED(old_next, TAG1)) return DOES_NOT_EXIST; // success // Use a CAS so we do not inadvertantly stomp on a mark another thread placed on the item. @@ -363,17 +363,17 @@ uint64_t sl_remove (skiplist_t *sl, void *key) { node_t *old_next = item->next[level]; do { next = old_next; - old_next = SYNC_CAS(&item->next[level], next, TAG_VALUE(next)); - if (IS_TAGGED(old_next)) { + old_next = SYNC_CAS(&item->next[level], next, TAG_VALUE(next, TAG1)); + if (IS_TAGGED(old_next, TAG1)) { TRACE("s2", "sl_remove: %p is already marked for removal by another thread at level %llu", item, level); break; } } while (next != old_next); node_t *pred = preds[level]; - TRACE("s2", "sl_remove: linking the item's pred %p to the item's successor %p", pred, STRIP_TAG(next)); + TRACE("s2", "sl_remove: linking the item's pred %p to the item's successor %p", pred, STRIP_TAG(next, TAG1)); node_t *other = NULL; - if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next))) != item) { + if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next, TAG1))) != item) { TRACE("s1", "sl_remove: unlink failed; pred's link changed from %p to %p", item, other); // If our former predecessor now points past us we know another thread unlinked us. Otherwise, we need // to search for a new set of preds. @@ -381,7 +381,7 @@ uint64_t sl_remove (skiplist_t *sl, void *key) { continue; // points past to the end of the list; go on to the next level. int d = -1; - if (!IS_TAGGED(other)) { + if (!IS_TAGGED(other, TAG1)) { if (EXPECT_TRUE(sl->key_type == NULL)) { d = (uint64_t)item->key - (uint64_t)other->key; } else { @@ -401,8 +401,8 @@ uint64_t sl_remove (skiplist_t *sl, void *key) { node_t *old_next = item->next[0]; do { next = old_next; - old_next = SYNC_CAS(&item->next[0], next, TAG_VALUE(next)); - if (IS_TAGGED(old_next)) { + old_next = SYNC_CAS(&item->next[0], next, TAG_VALUE(next, TAG1)); + if (IS_TAGGED(old_next, TAG1)) { TRACE("s2", "sl_remove: %p is already marked for removal by another thread at level 0", item, 0); return DOES_NOT_EXIST; } @@ -415,8 +415,8 @@ uint64_t sl_remove (skiplist_t *sl, void *key) { TRACE("s2", "sl_remove: replaced item %p's value with DOES_NOT_EXIT", item, 0); node_t *pred = preds[0]; - TRACE("s2", "sl_remove: linking the item's pred %p to the item's successor %p", pred, STRIP_TAG(next)); - if (SYNC_CAS(&pred->next[0], item, STRIP_TAG(next))) { + TRACE("s2", "sl_remove: linking the item's pred %p to the item's successor %p", pred, STRIP_TAG(next, TAG1)); + if (SYNC_CAS(&pred->next[0], item, STRIP_TAG(next, TAG1))) { TRACE("s2", "sl_remove: unlinked item %p from the skiplist at level 0", item, 0); // The thread that completes the unlink should free the memory. if (sl->key_type != NULL) { @@ -436,8 +436,8 @@ void sl_print (skiplist_t *sl) { int i = 0; while (item) { node_t *next = item->next[level]; - printf("%s%p ", IS_TAGGED(next) ? "*" : "", item); - item = (node_t *)STRIP_TAG(next); + printf("%s%p ", IS_TAGGED(next, TAG1) ? "*" : "", item); + item = (node_t *)STRIP_TAG(next, TAG1); if (i++ > 30) { printf("..."); break; @@ -449,7 +449,7 @@ void sl_print (skiplist_t *sl) { node_t *item = sl->head; int i = 0; while (item) { - int is_marked = IS_TAGGED(item->next[0]); + int is_marked = IS_TAGGED(item->next[0], TAG1); printf("%s%p:%p ", is_marked ? "*" : "", item, item->key); if (item != sl->head) { printf("[%d]", item->top_level); @@ -457,15 +457,15 @@ void sl_print (skiplist_t *sl) { printf("[HEAD]"); } for (int level = 1; level <= item->top_level; ++level) { - node_t *next = (node_t *)STRIP_TAG(item->next[level]); - is_marked = IS_TAGGED(item->next[0]); + node_t *next = (node_t *)STRIP_TAG(item->next[level], TAG1); + is_marked = IS_TAGGED(item->next[0], TAG1); printf(" %p%s", next, is_marked ? "*" : ""); if (item == sl->head && item->next[level] == NULL) break; } printf("\n"); fflush(stdout); - item = (node_t *)STRIP_TAG(item->next[0]); + item = (node_t *)STRIP_TAG(item->next[0], TAG1); if (i++ > 30) { printf("...\n"); break; diff --git a/txn/txn.c b/txn/txn.c index 991c22c..ea3d6df 100644 --- a/txn/txn.c +++ b/txn/txn.c @@ -8,23 +8,20 @@ #include "skiplist.h" #define UNDETERMINED_VERSION 0 -#define ABORTED_VERSION TAG_VALUE(0) +#define ABORTED_VERSION TAG_VALUE(0, TAG1) #define INITIAL_WRITES_SIZE 4 -typedef enum { UPDATE_TYPE_PUT, UPDATE_TYPE_DELETE } update_type_t; - -typedef struct update_rec update_rec_t; +typedef struct update_rec update_t; struct update_rec { - update_type_t type; - uint64_t value; + update_t *next; // an earlier update uint64_t version; - update_rec_t *next; // an earlier update + uint64_t value; }; typedef struct write_rec { void *key; - update_rec_t *rec; + update_t *rec; } write_rec_t; struct txn { @@ -57,15 +54,18 @@ void txn_init (void) { // static txn_state_e tm_validate_key (txn_t *txn, void *key) { - update_rec_t *update = (update_rec_t *) map_get(txn->map, key); + update_t *update = (update_t *) map_get(txn->map, key); for (; update != NULL; update = update->next) { - // If the update's version is not tagged it means the update is committed. + // If the update or its version is not tagged it means the update is committed. // // We can stop at the first committed record we find that is at least as old as our read version. All // the other committed records following it will be older. And all the uncommitted records following it // will eventually conflict with it and abort. - if (!IS_TAGGED(update->version)) + if (!IS_TAGGED(update, TAG2)) + return TXN_VALIDATED; + update = (update_t *)STRIP_TAG(update, TAG2); + if (!IS_TAGGED(update->version, TAG1)) return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED; // If the update's version is tagged then either the update was aborted or the the version number is @@ -76,12 +76,12 @@ static txn_state_e tm_validate_key (txn_t *txn, void *key) { continue; // The update's transaction is still in progress. Access its txn_t. - txn_t *writer = (txn_t *)STRIP_TAG(update->version); + txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1); if (writer == txn) continue; // Skip our own updates. txn_state_e writer_state = writer->state; - // Any running transaction will only be able to aquire a wv greater than ours. A transaction changes its + // Any running transaction will only be able to acquire a wv greater than ours. A transaction changes its // state to validating before aquiring a wv. We can ignore an unvalidated transaction if its version is // greater than ours. See next comment below for why. if (writer_state == TXN_RUNNING) @@ -143,9 +143,9 @@ static txn_state_e txn_validate (txn_t *txn) { return txn->state; } -static update_rec_t *alloc_update_rec (void) { - update_rec_t *u = (update_rec_t *)nbd_malloc(sizeof(update_rec_t)); - memset(u, 0, sizeof(update_rec_t)); +static update_t *alloc_update_rec (void) { + update_t *u = (update_t *)nbd_malloc(sizeof(update_t)); + memset(u, 0, sizeof(update_t)); return u; } @@ -161,7 +161,7 @@ txn_t *txn_begin (txn_type_e type, map_t *map) { txn->writes_size = INITIAL_WRITES_SIZE; } - // aquire the read version for txn. must be careful to avoid a race + // acquire the read version for txn. must be careful to avoid a race do { txn->rv = version_; @@ -189,7 +189,7 @@ void txn_abort (txn_t *txn) { int i; for (i = 0; i < txn->writes_count; ++i) { - update_rec_t *update = (update_rec_t *)txn->writes[i].rec; + update_t *update = (update_t *)txn->writes[i].rec; update->version = ABORTED_VERSION; } @@ -207,7 +207,7 @@ txn_state_e txn_commit (txn_t *txn) { uint64_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv; int i; for (i = 0; i < txn->writes_count; ++i) { - update_rec_t *update = (update_rec_t *)txn->writes[i].rec; + update_t *update = (update_t *)txn->writes[i].rec; update->version = wv; } @@ -232,13 +232,22 @@ txn_state_e txn_commit (txn_t *txn) { // Get most recent committed version prior to our read version. uint64_t tm_get (txn_t *txn, void *key) { - // Iterate through update records associated with to find the latest committed version prior to our - // read version. - update_rec_t *update = (update_rec_t *) map_get(txn->map, key); - for (; update != NULL; update = update->next) { + update_t *newest_update = (update_t *) map_get(txn->map, key); + if (!IS_TAGGED(newest_update, TAG2)) + return (uint64_t)newest_update; + + // Iterate through the update records to find the latest committed version prior to our read version. + update_t *update; + for (update = newest_update; ; update = update->next) { + + if (!IS_TAGGED(update, TAG2)) + return (uint64_t)update; + + update = (update_t *)STRIP_TAG(update, TAG2); + assert(update != NULL); // If the update's version is not tagged it means the update is committed. - if (!IS_TAGGED(update->version)) { + if (!IS_TAGGED(update->version, TAG1)) { if (update->version <= txn->rv) break; // success continue; @@ -252,7 +261,7 @@ uint64_t tm_get (txn_t *txn, void *key) { continue; // The update's transaction is still in progress. Access its txn_t. - txn_t *writer = (txn_t *)STRIP_TAG(update->version); + txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1); if (writer == txn) // found our own update break; // success @@ -276,49 +285,82 @@ uint64_t tm_get (txn_t *txn, void *key) { break; // success } - if (EXPECT_FALSE(update == NULL)) - return DOES_NOT_EXIST; + uint64_t value = update->value; // collect some garbage - update_rec_t *next = update->next; - if (next != NULL) { - uint64_t min_active_version = (uint64_t)sl_min_key(active_); - if (next->version < min_active_version) { - next = SYNC_SWAP(&update->next, NULL); - while (next != NULL) { + update_t *last = update; + update_t *next = update->next; + uint64_t min_active = 0; + if (IS_TAGGED(next, TAG2)) { + next = (update_t *)STRIP_TAG(next, TAG2); + min_active = (uint64_t)sl_min_key(active_); + if (next->version < min_active) { + + // Skip over aborted versions to verify the chain of updates is old enough for collection + update_t *temp = next; + while (temp->version == ABORTED_VERSION) { + assert(!IS_TAGGED(temp->version, TAG1)); + update_t *temp = next->next; + if (!IS_TAGGED(temp, TAG2)) + break; + temp = (update_t *)STRIP_TAG(temp, TAG2); + if (temp->version >= min_active) + return value; + temp = temp->next; + } + + // collect and all the update records following it + do { + next = SYNC_SWAP(&update->next, NULL); + + // if we find ourself in a race just back off and let the other thread take care of it + if (next == NULL) + return value; + update = next; - next = NULL; - if (update->next != NULL) { - next = SYNC_SWAP(&update->next, NULL); - } + next = next->next; nbd_free(update); - } + } while (IS_TAGGED(next, TAG2)); } } + + // If there is one item left and it is visible by all active transactions we can merge it into the map itself. + // There is no need for an update record. + if (next == NULL && last == (update_t *)STRIP_TAG(newest_update, TAG2)) { + if (min_active == UNDETERMINED_VERSION) { + min_active = (uint64_t)sl_min_key(active_); + } + if (last->version <= min_active) { + if (map_cas(txn->map, key, TAG_VALUE(last, TAG2), value) == TAG_VALUE(last, TAG2)) { + nbd_defer_free(last); + } + } + } - return update->value; + return value; } void tm_set (txn_t *txn, void *key, uint64_t value) { // create a new update record - update_rec_t *update = alloc_update_rec(); - update->type = UPDATE_TYPE_PUT; + update_t *update = alloc_update_rec(); update->value = value; - update->version = TAG_VALUE((uint64_t)txn); + update->version = TAG_VALUE(txn, TAG1); // push the new update record onto 's update list - uint64_t update_prev; + uint64_t old_update; do { - update->next = (update_rec_t *) map_get(txn->map, key); - update_prev = (uint64_t)update->next; - } while (map_cas(txn->map, key, update_prev, (uint64_t)update) != update_prev); + old_update = map_get(txn->map, key); + update->next = (update_t *)old_update; + } while (map_cas(txn->map, key, old_update, TAG_VALUE(update, TAG2)) != old_update); // add to the write set for commit-time validation if (txn->writes_count == txn->writes_size) { write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2); memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t)); txn->writes_size *= 2; + nbd_free(txn->writes); + txn->writes = w; } int i = txn->writes_count++; txn->writes[i].key = key; -- 2.40.0