2 * Written by Josh Dybnis and released to the public domain, as explained at
3 * http://creativecommons.org/licenses/publicdomain
// Value held in a transaction's <wv> until a write version has been assigned: txn_begin stores it and
// txn_validate tests for it before taking a new version number.
10 #define UNDETERMINED_VERSION 0
// Version written into the update records of an aborted transaction (zero carrying the TAG1 tag).
11 #define ABORTED_VERSION TAG_VALUE(0, TAG1)
// Number of write_rec slots allocated for the write set of a new transaction that is not read-only.
12 #define INITIAL_WRITES_SIZE 4
// NOTE(review): only fragments of the struct definitions are visible in this view; the comments below
// describe how the visible fields are used by the functions in this file.
14 typedef struct update_rec update_t;
// Link to the record that was at the head of the key's update list when this one was pushed (see tm_set).
// The readers in this file test it for TAG2 before following it.
17 update_t *next; // an earlier update
// One write-set entry; tm_set fills in its <key> and <rec> members.
22 typedef struct write_rec {
// Number of write-set entries in use; tm_set increments it and the validate/commit/abort loops iterate up to it.
33 uint32_t writes_count;
// Global version counter. txn_validate draws write versions from it with SYNC_ADD; txn_begin and txn_commit
// compare a transaction's read version against it.
39 static uint64_t version_ = 1;
// Forward declaration: tm_validate_key calls txn_validate before its definition appears.
41 static txn_state_e txn_validate (txn_t *txn);
// Skiplist allocated by txn_init. txn_begin/txn_commit use a transaction's read version as the key and
// adjust the stored value by +1/-1 (apparently a count of transactions using that read version — TODO confirm).
// tm_get uses its minimum key as the threshold for garbage collection.
43 static skiplist_t *active_ = NULL;
// Allocate the <active_> skiplist used to track the read versions of transactions.
// Presumably must be called once before any transaction is started, since txn_begin and txn_commit
// operate on <active_> — TODO confirm.
45 void txn_init (void) {
46 active_ = sl_alloc(NULL);
49 // Validate the updates for <key>. Validation fails if there is a write-write conflict. That is, if after our
50 // read version another transaction committed a change to an entry we are also trying to change.
52 // If we encounter a potential conflict with a transaction that is in the process of validating, we help it
53 // complete validating. It must be finished before we can decide to rollback or commit.
//
// <txn> must already have left the TXN_RUNNING state (asserted below). Every visible return statement
// yields either TXN_VALIDATED or TXN_ABORTED.
55 static txn_state_e tm_validate_key (txn_t *txn, void *key) {
56 assert(txn->state != TXN_RUNNING);
// Start from the current map entry for <key> and follow the <next> links.
58 update_t *update = (update_t *) map_get(txn->map, key);
59 for (; update != NULL; update = update->next) {
61 // If the update or its version is not tagged it means the update is committed.
63 // We can stop at the first committed record we find that is at least as old as our read version. All
64 // the other committed records following it will be older. And all the uncommitted records following it
65 // will eventually conflict with it and abort.
66 if (!IS_TAGGED(update, TAG2))
// Remove the TAG2 marker to obtain a dereferenceable update record pointer.
68 update = (update_t *)STRIP_TAG(update, TAG2);
// Committed record: we pass only if it was committed at or before our read version.
69 if (!IS_TAGGED(update->version, TAG1))
70 return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
72 // If the update's version is tagged then either the update was aborted or the version number is
73 // actually a pointer to a running transaction's txn_t.
75 // Skip aborted transactions.
76 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
79 // The update's transaction is still in progress. Access its txn_t.
80 txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1);
82 continue; // Skip our own updates.
// Read the writer's state once; the decisions below are made on this snapshot.
83 txn_state_e writer_state = writer->state;
85 // Any running transaction will only be able to acquire a wv greater than ours. A transaction changes its
86 // state to validating before acquiring a wv. We can ignore an unvalidated transaction if its version is
87 // greater than ours. See next comment below for why.
88 if (writer_state == TXN_RUNNING)
91 // If <writer> has a later version than us we can safely ignore its updates. It will not commit until
92 // we have completed validation (in order to remain non-blocking it will help us validate if necessary).
93 // This protocol ensures a deterministic resolution to every conflict and avoids infinite ping-ponging
94 // between validating two conflicting transactions.
95 if (writer_state == TXN_VALIDATING) {
96 if (writer->wv > txn->wv)
98 // Help <writer> commit. We need to know if <writer> aborts or commits before we can decide what to
99 // do. But we don't want to block, so we assist.
100 writer_state = txn_validate(writer);
103 // Skip updates from aborted transactions.
104 if (writer_state == TXN_ABORTED)
// The writer finished validating: treat its write version like a committed record's version.
107 assert(writer_state == TXN_VALIDATED);
108 return (writer->wv <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
// Reached the end of the update list without returning from inside the loop.
111 return TXN_VALIDATED;
// Validate every key in <txn>'s write set and record the outcome in <txn>->state.
// Called by txn_commit for the caller's own transaction, and by tm_validate_key and tm_get on another
// transaction (<writer>) in order to help it finish. <txn> must not be in the TXN_RUNNING state.
114 static txn_state_e txn_validate (txn_t *txn) {
115 assert(txn->state != TXN_RUNNING);
117 switch (txn->state) {
// Assign a write version if none has been set yet. The new number is installed with SYNC_CAS against
// UNDETERMINED_VERSION (presumably a compare-and-swap, so an already-assigned wv is left alone — TODO confirm).
120 if (txn->wv == UNDETERMINED_VERSION) {
121 uint64_t wv = SYNC_ADD(&version_, 1);
122 SYNC_CAS(&txn->wv, UNDETERMINED_VERSION, wv);
// Check each written key for a write-write conflict.
125 for (i = 0; i < txn->writes_count; ++i) {
126 txn_state_e s = tm_validate_key(txn, txn->writes[i].key);
127 if (s == TXN_ABORTED) {
// NOTE(review): <state> is written with a plain store here and below although other transactions may
// run this function on the same <txn> — confirm this is benign.
128 txn->state = TXN_ABORTED;
131 assert(s == TXN_VALIDATED);
// Only advance to TXN_VALIDATED if the state is still TXN_VALIDATING.
133 if (txn->state == TXN_VALIDATING) {
134 txn->state = TXN_VALIDATED;
// Allocate an update record with nbd_malloc and zero-fill it.
// NOTE(review): no NULL check is visible before the memset; presumably nbd_malloc does not return NULL — confirm.
149 static update_t *alloc_update_rec (void) {
150 update_t *u = (update_t *)nbd_malloc(sizeof(update_t));
151 memset(u, 0, sizeof(update_t));
// Create a new transaction in the TXN_RUNNING state with no write version assigned.
// <type>: when it is not TXN_READ_ONLY, a write set of INITIAL_WRITES_SIZE entries is allocated.
// <map>:  NOTE(review): its use is not visible in this view; other functions read txn->map, so it is
//         presumably stored there — confirm.
155 txn_t *txn_begin (txn_type_e type, map_t *map) {
// Zero-filled allocation, so fields not set explicitly below start out as 0/NULL.
156 txn_t *txn = (txn_t *)nbd_malloc(sizeof(txn_t));
157 memset(txn, 0, sizeof(txn_t));
159 txn->wv = UNDETERMINED_VERSION;
160 txn->state = TXN_RUNNING;
162 if (type != TXN_READ_ONLY) {
163 txn->writes = nbd_malloc(sizeof(*txn->writes) * INITIAL_WRITES_SIZE);
164 txn->writes_size = INITIAL_WRITES_SIZE;
167 // acquire the read version for txn. must be careful to avoid a race
// Retry until the value stored under <rv> in <active_> has been replaced by old_count + 1.
175 temp = (uint64_t)sl_cas(active_, (void *)txn->rv, old_count, old_count + 1);
176 } while (temp != old_count);
// Re-check the chosen read version against the global counter.
178 if (txn->rv == version_)
// Retry loop that takes the increment back (old_count - 1).
// NOTE(review): unlike the call above, the sl_cas result is assigned here without a uint64_t cast.
184 temp = sl_cas(active_, (void *)txn->rv, old_count, old_count - 1);
185 } while (temp != old_count);
// Abort a running transaction: stamp every update record in its write set with ABORTED_VERSION so that
// readers and validators skip them, then hand the write-set array to nbd_defer_free.
// Does nothing if <txn> is not in the TXN_RUNNING state.
// NOTE(review): in the visible lines the value stored under <txn>->rv in <active_> is not decremented
// here, whereas txn_commit does so — confirm whether aborted transactions release their read version.
191 void txn_abort (txn_t *txn) {
192 if (txn->state != TXN_RUNNING)
193 return; // TODO: return some sort of error code
196 for (i = 0; i < txn->writes_count; ++i) {
197 update_t *update = (update_t *)txn->writes[i].rec;
198 update->version = ABORTED_VERSION;
201 nbd_defer_free(txn->writes);
// Try to commit <txn>: move it to TXN_VALIDATING, validate its write set, then replace the transaction
// pointer stored in each of its update records with the final version.
// If <txn> is not in the TXN_RUNNING state, its current state is returned and nothing else is done.
205 txn_state_e txn_commit (txn_t *txn) {
206 if (txn->state != TXN_RUNNING)
207 return txn->state; // TODO: return some sort of error code
// Always true after the guard above.
209 assert(txn->state == TXN_RUNNING);
210 txn->state = TXN_VALIDATING;
// NOTE(review): the local <state> is not used again in the visible lines; the decision below reads txn->state.
211 txn_state_e state = txn_validate(txn);
213 // Detach <txn> from its updates.
// Aborted transactions publish ABORTED_VERSION; otherwise the write version becomes the records' version.
214 uint64_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv;
216 for (i = 0; i < txn->writes_count; ++i) {
217 update_t *update = (update_t *)txn->writes[i].rec;
218 update->version = wv;
221 // Lower the reference count for <txn>'s read version
226 temp = sl_cas(active_, (void *)txn->rv, old_count, old_count - 1);
// If sl_cas reported 1 and this read version is no longer the current global version, drop the entry.
// (Presumably sl_cas returns the previous value, as the retry condition below suggests — TODO confirm.)
227 if (temp == 1 && txn->rv != version_) {
228 sl_remove(active_, (void *)txn->rv);
231 } while (old_count != temp);
233 nbd_defer_free(txn->writes);
239 // Get most recent committed version prior to our read version.
//
// Returns ERROR_TXN_NOT_RUNNING if <txn> is not in the TXN_RUNNING state. If the map entry for <key> does
// not carry TAG2 it is returned directly as the value. Otherwise the update list is searched, and after a
// value has been chosen the function opportunistically reclaims old update records.
240 uint64_t tm_get (txn_t *txn, void *key) {
241 if (txn->state != TXN_RUNNING)
242 return ERROR_TXN_NOT_RUNNING;
244 update_t *newest_update = (update_t *) map_get(txn->map, key);
// No TAG2: the entry is a plain value rather than a pointer to an update record.
245 if (!IS_TAGGED(newest_update, TAG2))
246 return (uint64_t)newest_update;
248 // Iterate through the update records to find the latest committed version prior to our read version.
250 for (update = newest_update; ; update = update->next) {
// An untagged link at the tail of the list is likewise returned as the value.
252 if (!IS_TAGGED(update, TAG2))
253 return (uint64_t)update;
255 update = (update_t *)STRIP_TAG(update, TAG2);
256 assert(update != NULL);
258 // If the update's version is not tagged it means the update is committed.
259 if (!IS_TAGGED(update->version, TAG1)) {
260 if (update->version <= txn->rv)
265 // If the update's version is tagged then either the update was aborted or the version number is
266 // actually a pointer to a running transaction's txn_t.
268 // Skip updates from aborted transactions.
269 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
272 // The update's transaction is still in progress. Access its txn_t.
273 txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1);
274 if (writer == txn) // found our own update
// Read the writer's state once; the decisions below are made on this snapshot.
277 txn_state_e writer_state = writer->state;
278 if (writer_state == TXN_RUNNING)
281 if (writer_state == TXN_VALIDATING) {
// Compare the writer's write version with our read version before helping it validate.
282 if (writer->wv > txn->rv)
// Help the writer finish validating so that its outcome is known.
284 writer_state = txn_validate(writer);
287 // Skip updates from aborted transactions.
288 if (writer_state == TXN_ABORTED)
291 assert(writer_state == TXN_VALIDATED);
292 if (writer->wv > txn->rv)
// <update> is the record whose value will be returned; remember it before collecting garbage.
297 uint64_t value = update->value;
299 // collect some garbage
300 update_t *last = update;
301 update_t *next = update->next;
// 0 equals UNDETERMINED_VERSION, which the merge step below uses to detect "not looked up yet".
302 uint64_t min_active = 0;
303 if (IS_TAGGED(next, TAG2)) {
304 next = (update_t *)STRIP_TAG(next, TAG2);
// Smallest key in <active_>; records with versions below it are treated as candidates for collection.
305 min_active = (uint64_t)sl_min_key(active_);
306 if (next->version < min_active) {
308 // Skip over aborted versions to verify the chain of updates is old enough for collection
309 update_t *temp = next;
310 while (temp->version == ABORTED_VERSION) {
// NOTE(review): ABORTED_VERSION is defined as TAG_VALUE(0, TAG1), so this assert appears to contradict
// the loop condition that has just held for the same <temp> — confirm the intended check.
311 assert(!IS_TAGGED(temp->version, TAG1));
// NOTE(review): this declares a new <temp> that shadows the one tested by the while condition, and it
// reads next->next rather than temp->next; in the visible lines the outer <temp> is never advanced.
312 update_t *temp = next->next;
313 if (!IS_TAGGED(temp, TAG2))
315 temp = (update_t *)STRIP_TAG(temp, TAG2);
316 if (temp->version >= min_active)
321 // collect <next> and all the update records following it
// Cut the list after <update>; SYNC_SWAP presumably returns the previous link — TODO confirm.
323 next = SYNC_SWAP(&update->next, NULL);
325 // if we find ourself in a race just back off and let the other thread take care of it
332 } while (IS_TAGGED(next, TAG2));
336 // If there is one item left and it is visible by all active transactions we can merge it into the map itself.
337 // There is no need for an update record.
338 if (next == NULL && last == (update_t *)STRIP_TAG(newest_update, TAG2)) {
// Look up the minimum active version now if the collection step above did not.
339 if (min_active == UNDETERMINED_VERSION) {
340 min_active = (uint64_t)sl_min_key(active_);
342 if (last->version <= min_active) {
// Replace the tagged record pointer in the map with the bare value; free the record only if the swap succeeded.
343 if (map_cas(txn->map, key, TAG_VALUE(last, TAG2), value) == TAG_VALUE(last, TAG2)) {
344 nbd_defer_free(last);
// Record a write of <value> to <key> on behalf of <txn>: push a new update record onto the key's update
// list in the map and add the key to the transaction's write set.
// Does nothing if <txn> is not in the TXN_RUNNING state.
352 void tm_set (txn_t *txn, void *key, uint64_t value) {
353 if (txn->state != TXN_RUNNING)
354 return; // TODO: return some sort of error code
356 // create a new update record
357 update_t *update = alloc_update_rec();
358 update->value = value;
// Until commit or abort, the version field holds a TAG1-tagged pointer to the owning transaction.
359 update->version = TAG_VALUE(txn, TAG1);
361 // push the new update record onto <key>'s update list
// Retry until map_cas reports that the entry was still <old_update> when the new head was installed.
364 old_update = map_get(txn->map, key);
365 update->next = (update_t *)old_update;
366 } while (map_cas(txn->map, key, old_update, TAG_VALUE(update, TAG2)) != old_update);
368 // add <key> to the write set for commit-time validation
// The write set is full: copy it into an array of twice the size.
369 if (txn->writes_count == txn->writes_size) {
370 write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2);
371 memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t));
372 txn->writes_size *= 2;
// NOTE(review): the old array is released with nbd_free here, while txn_commit/txn_abort use
// nbd_defer_free for the same pointer; the assignment of <w> to txn->writes is not visible in this view — confirm.
373 nbd_free(txn->writes);
// Append the (key, record) pair.
376 int i = txn->writes_count++;
377 txn->writes[i].key = key;
378 txn->writes[i].rec = update;