2 * Written by Josh Dybnis and released to the public domain, as explained at
3 * http://creativecommons.org/licenses/publicdomain
// Value of txn->wv until a write version has been assigned to the transaction.
12 #define UNDETERMINED_VERSION 0
// Version stamped on the update records of a transaction that aborted.
13 #define ABORTED_VERSION TAG_VALUE(0, TAG1)
// Initial capacity (in entries) of a transaction's write set.
14 #define INITIAL_WRITES_SIZE 4
// Pointers are stored shifted right by two bits and shifted back when read. Presumably this frees the bits
// used by TAG1/TAG2 -- TODO confirm against the TAG_VALUE definition.
15 #define PTR_TO_VAL(x) ((size_t)(x) >> 2)
16 #define VAL_TO_PTR(x) ((update_t *)((x) << 2))
18 typedef struct update_rec update_t;
19 typedef map_key_t version_t;
22 version_t version; // tagged versions are txn_t pointers, untagged are actual version numbers
24 map_val_t next; // an earlier update
27 typedef struct write_rec {
// Forward declaration: validate_key() and txn_validate() call each other.
43 static txn_state_e txn_validate (txn_t *txn);
// Skiplist keyed by read version. txn_begin() and txn_commit() adjust the count stored under a transaction's
// read version with sl_cas(); txn_map_get() uses its minimum key to decide what can be garbage collected.
45 static skiplist_t *active_ = NULL;
// Global version counter. txn_validate() advances it to obtain a write version.
47 static version_t version_ = 1;
49 static inline skiplist_t *get_active (void) {
54 // Validate the updates for <key>. Validation fails if there is a write-write conflict. That is, if after our
55 // read version another transaction committed a change to an entry we are also trying to change.
57 // If we encounter a potential conflict with a transaction that is in the process of validating, we help it
58 // complete validating. It must be finished before we can decide to rollback or commit.
//
// Walks <key>'s chain of update records, newest first, and yields TXN_VALIDATED or TXN_ABORTED.
// <txn> must already have left the TXN_RUNNING state (asserted).
60 static txn_state_e validate_key (txn_t *txn, map_key_t key) {
61 assert(txn->state != TXN_RUNNING);
63 map_val_t val = map_get(txn->map, key);
64 update_t *update = NULL;
65 for (; val != DOES_NOT_EXIST; val = update->next) {
67 // If the update or its version is not tagged it means the update is committed.
69 // We can stop at the first committed record we find that is at least as old as our read version. All
70 // the other committed records following it will be older. And all the uncommitted records following it
71 // will eventually conflict with it and abort.
72 if (!IS_TAGGED(val, TAG2))
74 update = VAL_TO_PTR(val);
75 if (!IS_TAGGED(update->version, TAG1))
76 return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
78 // If the update's version is tagged then either the update was aborted or the version number is
79 // actually a pointer to a running transaction's txn_t.
81 // Skip aborted transactions.
82 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
85 // The update's transaction is still in progress. Access its txn_t.
86 txn_t *writer = (txn_t *)VAL_TO_PTR(update->version);
88 continue; // Skip our own updates.
// Take a single snapshot of the writer's state; the checks below all work from this local copy.
89 txn_state_e writer_state = writer->state;
91 // Any running transaction will only be able to acquire a wv greater than ours. A transaction changes its
92 // state to validating before acquiring a wv. We can ignore an unvalidated transaction if its version is
93 // greater than ours. See the next comment below for the explanation why.
94 if (writer_state == TXN_RUNNING)
97 // If <writer> has a later version than us we can safely ignore its updates. It will not commit until
98 // we have completed validation (in order to remain non-blocking it will help us validate if necessary).
99 // This protocol ensures a deterministic resolution to every conflict and avoids infinite ping-ponging
100 // between validating two conflicting transactions.
101 if (writer_state == TXN_VALIDATING) {
102 if (writer->wv > txn->wv)
104 // Help <writer> commit. We need to know if <writer> aborts or commits before we can decide what to
105 // do. But we don't want to block, so we assist.
106 writer_state = txn_validate(writer);
109 // Skip updates from aborted transactions.
110 if (writer_state == TXN_ABORTED)
// <writer> validated successfully: it conflicts with us only if its write version is newer than our read version.
113 assert(writer_state == TXN_VALIDATED);
114 return (writer->wv <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
// Reached the end of the chain without finding a conflicting record.
117 return TXN_VALIDATED;
// Run validation for <txn>: make sure it has a write version, then validate every key in its write set.
// Besides txn_commit() on the owning transaction, validate_key() and txn_map_get() call this on other
// transactions' txn_t in order to help them finish. <txn> must not be in the TXN_RUNNING state (asserted).
120 static txn_state_e txn_validate (txn_t *txn) {
121 assert(txn->state != TXN_RUNNING);
123 switch (txn->state) {
// Assign a write version if none is set yet. The CAS only replaces UNDETERMINED_VERSION, so a wv that was
// already installed is never overwritten; its result is deliberately ignored.
126 if (txn->wv == UNDETERMINED_VERSION) {
127 version_t wv = SYNC_ADD(&version_, 1);
128 (void)SYNC_CAS(&txn->wv, UNDETERMINED_VERSION, wv);
// Validate each written key; a single conflicting key aborts the whole transaction.
131 for (i = 0; i < txn->writes_count; ++i) {
132 txn_state_e s = validate_key(txn, txn->writes[i].key);
133 if (s == TXN_ABORTED) {
134 txn->state = TXN_ABORTED;
137 assert(s == TXN_VALIDATED);
// Only promote to TXN_VALIDATED if the state is still TXN_VALIDATING at this point.
139 if (txn->state == TXN_VALIDATING) {
140 txn->state = TXN_VALIDATED;
// Allocate a new update record with nbd_malloc(). The record's <next> link starts out as DOES_NOT_EXIST;
// txn_map_set() later points it at the previous head of the key's update list.
155 static update_t *alloc_update_rec (version_t ver, map_val_t val) {
156 update_t *u = (update_t *)nbd_malloc(sizeof(update_t));
159 u->next = DOES_NOT_EXIST;
// Begin a new transaction on <map>. Allocates and zeroes a txn_t, marks it TXN_RUNNING with an undetermined
// write version, allocates an initial write set of INITIAL_WRITES_SIZE entries, creates the shared active_
// skiplist on first use, and acquires a read version for the transaction.
163 txn_t *txn_begin (map_t *map) {
164 TRACE("x1", "txn_begin: map %p", map, 0);
165 txn_t *txn = (txn_t *)nbd_malloc(sizeof(txn_t));
166 memset(txn, 0, sizeof(txn_t));
167 txn->wv = UNDETERMINED_VERSION;
168 txn->state = TXN_RUNNING;
170 txn->writes = nbd_malloc(sizeof(*txn->writes) * INITIAL_WRITES_SIZE);
171 txn->writes_size = INITIAL_WRITES_SIZE;
// Lazily create the shared skiplist. The CAS installs our allocation only if active_ is still NULL; a
// non-NULL result means another thread installed one first.
172 if (EXPECT_FALSE(active_ == NULL)) {
173 skiplist_t *a = sl_alloc(NULL);
174 if (SYNC_CAS(&active_, NULL, a) != NULL) {
179 // acquire the read version for txn. must be careful to avoid a race
// Increment the count stored under txn->rv in active_, retrying until the CAS succeeds.
187 temp = sl_cas(active_, txn->rv, old_count, old_count + 1);
188 } while (temp != old_count);
// Re-check the read version against the global version counter after registering it.
190 if (txn->rv == version_)
// Decrement the count for txn->rv again, retrying until the CAS succeeds.
196 temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
197 } while (temp != old_count);
200 TRACE("x1", "txn_begin: returning new transaction %p (read version %p)", txn, txn->rv);
// Abort <txn>. Every update record in its write set is stamped with ABORTED_VERSION, which readers and
// validators skip, and the write-set array is handed to rcu_defer_free().
// The state check at the top singles out transactions that are not TXN_RUNNING; presumably they are left
// untouched -- TODO confirm.
204 void txn_abort (txn_t *txn) {
205 if (txn->state != TXN_RUNNING)
209 for (i = 0; i < txn->writes_count; ++i) {
210 update_t *update = (update_t *)txn->writes[i].rec;
211 update->version = ABORTED_VERSION;
214 rcu_defer_free(txn->writes);
// Try to commit <txn>. Moves it from TXN_RUNNING to TXN_VALIDATING, runs txn_validate(), then detaches the
// transaction from its update records by overwriting each record's version with either the transaction's
// write version or ABORTED_VERSION. Finally lowers the count for its read version in active_ and hands the
// write-set array to rcu_defer_free().
218 txn_state_e txn_commit (txn_t *txn) {
219 if (txn->state != TXN_RUNNING)
222 assert(txn->state == TXN_RUNNING);
223 txn->state = TXN_VALIDATING;
224 txn_state_e state = txn_validate(txn);
226 // Detach <txn> from its updates.
// After this loop the records no longer hold a tagged pointer to <txn>; they carry a plain version
// number, or ABORTED_VERSION if validation failed.
227 version_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv;
229 for (i = 0; i < txn->writes_count; ++i) {
230 update_t *update = txn->writes[i].rec;
231 update->version = wv;
234 // Lower the reference count for <txn>'s read version
239 temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
// Presumably <temp> is the count found before the decrement, so 1 means we were the last transaction
// using this read version; its entry is removed unless it is the current global version.
240 if (temp == 1 && txn->rv != version_) {
241 sl_remove(active_, (map_key_t)txn->rv);
244 } while (old_count != temp);
246 rcu_defer_free(txn->writes);
252 // Get most recent committed version prior to our read version.
//
// Reports ERROR_TXN_NOT_RUNNING if <txn> is not in the TXN_RUNNING state, and DOES_NOT_EXIST if no update
// visible to <txn> is found for <key>. After selecting an update record it also opportunistically frees
// older update records that no active transaction can see, and may merge a lone record into the map.
253 map_val_t txn_map_get (txn_t *txn, map_key_t key) {
254 TRACE("x1", "txn_map_get: txn %p map %p", txn, txn->map);
255 TRACE("x1", "txn_map_get: key %p", key, 0);
257 if (txn->state != TXN_RUNNING) {
258 TRACE("x1", "txn_map_get: error txn not running (state %p)", txn->state, 0);
259 return ERROR_TXN_NOT_RUNNING;
262 // Iterate through the update records to find the latest committed version prior to our read version.
263 map_val_t newest_val = map_get(txn->map, key);
264 map_val_t val = newest_val;
266 for ( ; (update = VAL_TO_PTR(val)) != NULL ; val = update->next) {
268 // If TAG2 is set in <val> it indicates that <val> is an update record. Otherwise all the following are
269 // true: <val> is a literal value, it is older than any currently active transaction, and it is the most
270 // recently set value for its key. Therefore it is visible to <txn>.
271 if (!IS_TAGGED(val, TAG2)) {
272 TRACE("x1", "txn_map_get: found untagged value; returning %p", val, 0);
276 // If the update's version is not tagged it means the update is committed.
277 if (!IS_TAGGED(update->version, TAG1)) {
278 if (update->version <= txn->rv) {
279 TRACE("x2", "txn_map_get: found committed update %p (version %p)", update, update->version);
282 TRACE("x2", "txn_map_get: skipping update %p (version %p)", update, update->version);
286 // If the update's version is tagged then either the update was aborted or the version number is
287 // actually a pointer to a running transaction's txn_t.
289 // Skip updates from aborted transactions.
290 if (EXPECT_FALSE(update->version == ABORTED_VERSION)) {
291 TRACE("x2", "txn_map_get: skipping aborted update %p", update, 0);
295 // The update's transaction is still in progress. Access its txn_t.
296 txn_t *writer = (txn_t *)VAL_TO_PTR(update->version);
298 TRACE("x2", "txn_map_get: found txn's own update %p", update, 0);
// Take a single snapshot of the writer's state; the checks below all work from this local copy.
302 txn_state_e writer_state = writer->state;
303 if (writer_state == TXN_RUNNING) {
304 TRACE("x2", "txn_map_get: skipping update %p of in-progress transaction %p", update, writer);
// A validating writer whose write version is newer than our read version cannot be visible to us, so
// helping it validate is only needed when its wv is not greater than our rv.
308 if (writer_state == TXN_VALIDATING) {
309 TRACE("x2", "txn_map_get: update %p transaction %p validating", update, writer);
310 if (writer->wv > txn->rv)
312 writer_state = txn_validate(writer);
315 // Skip updates from aborted transactions.
316 if (writer_state == TXN_ABORTED) {
317 TRACE("x2", "txn_map_get: skipping aborted update %p", update, 0);
321 assert(writer_state == TXN_VALIDATED);
322 if (writer->wv > txn->rv) {
323 TRACE("x2", "txn_map_get: skipping update %p (version %p)", update, update->version);
// The chain was exhausted without finding an update visible to <txn>.
329 if (update == NULL) {
330 TRACE("x1", "txn_map_get: key does not exist in map", key, 0);
331 return DOES_NOT_EXIST;
334 map_val_t value = update->value;
335 TRACE("x1", "txn_map_get: key found returning value %p", value, 0);
338 // collect some garbage
// UNDETERMINED_VERSION doubles here as "minimum active version not looked up yet".
339 version_t min_active_version = UNDETERMINED_VERSION;
340 update_t *next_update = NULL;
341 if (IS_TAGGED(update->next, TAG2)) {
342 next_update = VAL_TO_PTR(update->next);
344 // If <next_update> (and all update records following it [except if it is aborted]) is old enough
345 // that it is not visible to any active transaction we can safely free it.
346 min_active_version = (version_t)sl_min_key(active_);
347 if (next_update->version < min_active_version) {
349 // If the <next_update> is aborted, skip over it to look for more recent ones that may follow
350 update_t *temp = next_update;
351 while (temp->version == ABORTED_VERSION) {
// NOTE(review): ABORTED_VERSION is defined as TAG_VALUE(0, TAG1), so this assert appears to contradict
// the loop condition directly above -- confirm the intended check.
352 assert(!IS_TAGGED(temp->version, TAG1));
353 map_val_t next = temp->next;
354 if (!IS_TAGGED(next, TAG2))
357 // Bail out of garbage collection if we find a record that might still be accessed by an
358 // ongoing transaction.
359 if (VAL_TO_PTR(next)->version >= min_active_version)
362 temp = VAL_TO_PTR(next);
365 // free the next update record and all the ones following it
// Each link is claimed by atomically swapping it to DOES_NOT_EXIST before moving on.
369 next = SYNC_SWAP(&temp->next, DOES_NOT_EXIST);
371 // if we find ourselves in a race just back off and let the other thread take care of it
372 if (next == DOES_NOT_EXIST)
377 temp = VAL_TO_PTR(next);
379 } while (IS_TAGGED(next, TAG2));
383 // If there is one item left and it is visible by all active transactions we can merge it into the map itself.
384 // There is no need for an update record.
385 if (next_update == NULL && val == newest_val) {
386 if (min_active_version == UNDETERMINED_VERSION) {
387 min_active_version = (version_t)sl_min_key(active_);
389 if (update->version <= min_active_version) {
// Replace the tagged pointer to the update record with the literal value; the record is only released
// if the CAS shows the map entry had not changed in the meantime.
390 if (map_cas(txn->map, key, TAG_VALUE(val, TAG2), value) == TAG_VALUE(val, TAG2)) {
391 rcu_defer_free(update);
// Record a write of <value> to <key> on behalf of <txn>. <value> must not carry TAG1 or TAG2 (asserted).
// Allocates an update record whose version is the TAG1-tagged pointer to <txn>, pushes it onto the head of
// <key>'s update list with map_cas(), and appends (key, record) to the transaction's write set for
// commit-time validation. The write-set array is doubled when it is full.
399 void txn_map_set (txn_t *txn, map_key_t key, map_val_t value) {
400 TRACE("x1", "txn_map_set: txn %p map %p", txn, txn->map);
401 TRACE("x1", "txn_map_set: key %p value %p", key, value);
402 assert(!IS_TAGGED(value, TAG1) && !IS_TAGGED(value, TAG2));
404 if (txn->state != TXN_RUNNING) {
405 TRACE("x1", "txn_map_set: error txn not running (state %p)", txn->state, 0);
409 // create a new update record
410 version_t ver = TAG_VALUE(PTR_TO_VAL(txn), TAG1); // tagged versions are txn_t pointers
411 update_t *update = alloc_update_rec(ver, value);
413 // push the new update record onto <key>'s update list
414 map_val_t old_update = map_get(txn->map, key);
415 TRACE("x2", "txn_map_set: old update %p new update record %p", old_update, update);
// Link the new record in front of the current head, then publish it as a TAG2-tagged pointer. The CAS
// succeeds only if the head is still <old_update>.
417 update->next = old_update;
418 map_val_t temp = map_cas(txn->map, key, old_update, TAG_VALUE(PTR_TO_VAL(update), TAG2));
419 if (temp == old_update)
422 TRACE("x1", "txn_map_set: cas failed; found %p expected %p", temp, old_update);
426 // add <key> to the write set for commit-time validation
// Grow the write set by doubling: copy the existing entries into the new array, then release the old one.
427 if (txn->writes_count == txn->writes_size) {
428 write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2);
429 memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t));
430 txn->writes_size *= 2;
431 nbd_free(txn->writes);
434 int i = txn->writes_count++;
435 txn->writes[i].key = key;
436 txn->writes[i].rec = update;