2 * Written by Josh Dybnis and released to the public domain, as explained at
3 * http://creativecommons.org/licenses/publicdomain
// Write version of a transaction that has not been assigned one yet (txn_begin stores it in txn->wv).
11 #define UNDETERMINED_VERSION 0
// Version written into an update record when its transaction aborts; the value 0 carrying TAG1.
12 #define ABORTED_VERSION TAG_VALUE(0, TAG1)
// Initial capacity, in entries, of a transaction's <writes> array.
13 #define INITIAL_WRITES_SIZE 4
15 typedef struct update_rec update_t;
// Versions use the map key type; read versions are also used as keys into the active_ skiplist.
16 typedef map_key_t version_t;
21 map_val_t next; // an earlier update
24 typedef struct write_rec {
// Forward declaration: validate_key() calls txn_validate() to help another transaction finish validating.
40 static txn_state_e txn_validate (txn_t *txn);
// Global version counter. txn_validate() draws write versions from it with SYNC_ADD.
42 static version_t version_ = 1;
// Skiplist keyed by read version whose values are counts adjusted with sl_cas in txn_begin() and txn_commit().
// Allocated by txn_init().
44 static skiplist_t *active_ = NULL;
// Initialize module state by allocating the skiplist stored in active_.
// NOTE(review): presumably this must run before the first txn_begin(), which passes active_ to sl_cas -- confirm.
46 void txn_init (void) {
47 active_ = sl_alloc(NULL);
50 // Validate the updates for <key>. Validation fails if there is a write-write conflict. That is, if after our
51 // read version another transaction committed a change to an entry we are also trying to change.
53 // If we encounter a potential conflict with a transaction that is in the process of validating, we help it
54 // complete validating. It must be finished before we can decide to rollback or commit.
//
// <txn> must already have left the TXN_RUNNING state (asserted below). The return statements in this function
// yield either TXN_VALIDATED or TXN_ABORTED.
56 static txn_state_e validate_key (txn_t *txn, map_key_t key) {
57 assert(txn->state != TXN_RUNNING);
// Start from the map's current value for <key> and follow each update record's <next> link.
59 map_val_t val = map_get(txn->map, key);
60 update_t *update = NULL;
61 for (; val != DOES_NOT_EXIST; val = update->next) {
63 // If the update or its version is not tagged it means the update is committed.
65 // We can stop at the first committed record we find that is at least as old as our read version. All
66 // the other committed records following it will be older. And all the uncommitted records following it
67 // will eventually conflict with it and abort.
68 if (!IS_TAGGED(val, TAG2))
// A value carrying TAG2 is a pointer to an update record; strip the tag before dereferencing it.
70 update = (update_t *)STRIP_TAG(val, TAG2);
// Untagged version: committed. No conflict if it is no newer than our read version, otherwise we must abort.
71 if (!IS_TAGGED(update->version, TAG1))
72 return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
74 // If the update's version is tagged then either the update was aborted or the version number is
75 // actually a pointer to a running transaction's txn_t.
77 // Skip aborted transactions.
78 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
81 // The update's transaction is still in progress. Access its txn_t.
82 txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1);
84 continue; // Skip our own updates.
// Read the writer's state once; the decisions below are made on this snapshot (txn_validate() may refresh it).
85 txn_state_e writer_state = writer->state;
87 // Any running transaction will only be able to acquire a wv greater than ours. A transaction changes its
88 // state to validating before acquiring a wv. We can ignore an unvalidated transaction if its version is
89 // greater than ours. See next comment below for why.
90 if (writer_state == TXN_RUNNING)
93 // If <writer> has a later version than us we can safely ignore its updates. It will not commit until
94 // we have completed validation (in order to remain non-blocking it will help us validate if necessary).
95 // This protocol ensures a deterministic resolution to every conflict and avoids infinite ping-ponging
96 // between validating two conflicting transactions.
97 if (writer_state == TXN_VALIDATING) {
98 if (writer->wv > txn->wv)
100 // Help <writer> commit. We need to know if <writer> aborts or commits before we can decide what to
101 // do. But we don't want to block, so we assist.
102 writer_state = txn_validate(writer);
105 // Skip updates from aborted transactions.
106 if (writer_state == TXN_ABORTED)
// <writer> is validated: judge its update by its write version, the same test applied to committed records.
109 assert(writer_state == TXN_VALIDATED);
110 return (writer->wv <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
// Reached the end of the update list without finding a conflicting record.
113 return TXN_VALIDATED;
// Validate <txn>, which must not be in the TXN_RUNNING state (asserted below).
//
// If <txn> has no write version yet, one is drawn from the global counter and installed with a CAS against
// UNDETERMINED_VERSION. validate_key() also calls this function on other transactions to help them, so the CAS
// keeps a second caller from overwriting a write version that is already set. Each key in the write set is then
// checked with validate_key(): a failing key marks <txn> TXN_ABORTED, and a transaction that is still
// TXN_VALIDATING after the loop is marked TXN_VALIDATED.
116 static txn_state_e txn_validate (txn_t *txn) {
117 assert(txn->state != TXN_RUNNING);
119 switch (txn->state) {
122 if (txn->wv == UNDETERMINED_VERSION) {
123 version_t wv = SYNC_ADD(&version_, 1);
124 SYNC_CAS(&txn->wv, UNDETERMINED_VERSION, wv);
127 for (i = 0; i < txn->writes_count; ++i) {
128 txn_state_e s = validate_key(txn, txn->writes[i].key);
129 if (s == TXN_ABORTED) {
130 txn->state = TXN_ABORTED;
133 assert(s == TXN_VALIDATED);
// Only promote to TXN_VALIDATED if the state was not changed in the meantime.
135 if (txn->state == TXN_VALIDATING) {
136 txn->state = TXN_VALIDATED;
// Allocate an update record with nbd_malloc and zero all of its bytes.
151 static update_t *alloc_update_rec (void) {
152 update_t *u = (update_t *)nbd_malloc(sizeof(update_t));
153 memset(u, 0, sizeof(update_t));
// Begin a transaction on <map>.
//
// Allocates a zeroed txn_t in the TXN_RUNNING state with an undetermined write version and a <writes> array of
// INITIAL_WRITES_SIZE entries, then acquires a read version and registers it in the active_ skiplist.
157 txn_t *txn_begin (map_t *map) {
158 txn_t *txn = (txn_t *)nbd_malloc(sizeof(txn_t));
159 memset(txn, 0, sizeof(txn_t));
160 txn->wv = UNDETERMINED_VERSION;
161 txn->state = TXN_RUNNING;
163 txn->writes = nbd_malloc(sizeof(*txn->writes) * INITIAL_WRITES_SIZE);
164 txn->writes_size = INITIAL_WRITES_SIZE;
166 // acquire the read version for txn. must be careful to avoid a race
// CAS loop: raise the count stored under txn->rv in active_, retrying until the CAS sees the expected old count.
174 temp = sl_cas(active_, txn->rv, old_count, old_count + 1);
175 } while (temp != old_count);
// NOTE(review): presumably this checks that version_ did not advance while the count was being registered,
// and the CAS loop below takes the registration back when it did -- confirm against the full retry structure.
177 if (txn->rv == version_)
183 temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
184 } while (temp != old_count);
// Abort <txn>. Does nothing unless <txn> is in the TXN_RUNNING state.
//
// Every update record in the write set has its version overwritten with ABORTED_VERSION, which is the value
// readers and validators test for when skipping aborted updates. The <writes> array is then handed to
// rcu_defer_free.
// NOTE(review): txn_commit() lowers the active_ count for txn->rv; verify that the abort path releases it too.
190 void txn_abort (txn_t *txn) {
191 if (txn->state != TXN_RUNNING)
192 return; // TODO: return some sort of error code
195 for (i = 0; i < txn->writes_count; ++i) {
196 update_t *update = (update_t *)txn->writes[i].rec;
197 update->version = ABORTED_VERSION;
200 rcu_defer_free(txn->writes);
// Try to commit <txn>.
//
// If <txn> is not TXN_RUNNING its current state is returned unchanged. Otherwise the transaction moves to
// TXN_VALIDATING and is validated. Its update records are then stamped with the final version (the write version,
// or ABORTED_VERSION if validation aborted it), the active_ count for its read version is lowered, and the
// <writes> array is handed to rcu_defer_free.
204 txn_state_e txn_commit (txn_t *txn) {
205 if (txn->state != TXN_RUNNING)
206 return txn->state; // TODO: return some sort of error code
208 assert(txn->state == TXN_RUNNING);
209 txn->state = TXN_VALIDATING;
210 txn_state_e state = txn_validate(txn);
212 // Detach <txn> from its updates.
// Until this point each update's version held a TAG1-tagged pointer to <txn> (set in txn_map_set).
213 version_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv;
215 for (i = 0; i < txn->writes_count; ++i) {
216 update_t *update = (update_t *)txn->writes[i].rec;
217 update->version = wv;
220 // Lower the reference count for <txn>'s read version
225 temp = sl_cas(active_, (map_key_t)txn->rv, old_count, old_count - 1);
// Presumably sl_cas returns the previous count (the loop exits when it equals old_count), so temp == 1 means
// this was the last reference. The entry is removed unless txn->rv equals version_.
226 if (temp == 1 && txn->rv != version_) {
227 sl_remove(active_, (map_key_t)txn->rv);
230 } while (old_count != temp);
232 rcu_defer_free(txn->writes);
238 // Get most recent committed version prior to our read version.
//
// Returns ERROR_TXN_NOT_RUNNING if <txn> is not in the TXN_RUNNING state. After locating the visible update
// record, the function also tries to free update records that are older than every active read version and to
// fold a lone, universally visible record back into the map.
239 map_val_t txn_map_get (txn_t *txn, map_key_t key) {
240 if (txn->state != TXN_RUNNING)
241 return ERROR_TXN_NOT_RUNNING;
243 // Iterate through the update records to find the latest committed version prior to our read version.
244 map_val_t newest_val = map_get(txn->map, key);
245 map_val_t val = newest_val;
246 update_t *update = NULL;
247 for ( ; ; val = update->next) {
249 if (!IS_TAGGED(val, TAG2))
// A value carrying TAG2 is a pointer to an update record; strip the tag before dereferencing it.
252 update = (update_t *)STRIP_TAG(val, TAG2);
253 assert(update != NULL);
255 // If the update's version is not tagged it means the update is committed.
256 if (!IS_TAGGED(update->version, TAG1)) {
257 if (update->version <= txn->rv)
262 // If the update's version is tagged then either the update was aborted or the version number is
263 // actually a pointer to a running transaction's txn_t.
265 // Skip updates from aborted transactions.
266 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
269 // The update's transaction is still in progress. Access its txn_t.
270 txn_t *writer = (txn_t *)STRIP_TAG(update->version, TAG1);
271 if (writer == txn) // found our own update
274 txn_state_e writer_state = writer->state;
275 if (writer_state == TXN_RUNNING)
278 if (writer_state == TXN_VALIDATING) {
// The writer's version is compared with our read version here (validate_key compares write versions instead).
279 if (writer->wv > txn->rv)
// Help the writer finish validating so that its final state is known.
281 writer_state = txn_validate(writer);
284 // Skip updates from aborted transactions.
285 if (writer_state == TXN_ABORTED)
288 assert(writer_state == TXN_VALIDATED);
289 if (writer->wv > txn->rv)
// Capture the visible value before the cleanup below, which may hand <update> to rcu_defer_free.
294 map_val_t value = update->value;
296 // collect some garbage
297 version_t min_active_version = UNDETERMINED_VERSION;
298 update_t *next_update = NULL;
299 if (IS_TAGGED(update->next, TAG2)) {
300 next_update = (update_t *)STRIP_TAG(update->next, TAG2);
301 min_active_version = (version_t)sl_min_key(active_);
302 if (next_update->version < min_active_version) {
303 // <next_update> (and all update records following it [except if it is aborted]) is old enough that it is
304 // not visible to any active transaction. We can safely free it.
306 // Skip over aborted versions to look for more recent updates
307 update_t *temp = next_update;
308 while (temp->version == ABORTED_VERSION) {
// NOTE(review): ABORTED_VERSION is TAG_VALUE(0, TAG1), so this assert looks contradictory to the loop
// condition above -- confirm the intended check.
309 assert(!IS_TAGGED(temp->version, TAG1));
// NOTE(review): this reads next_update->next, not temp->next, so the walk would not advance past the
// first record -- confirm whether temp->next was intended.
310 map_val_t next = next_update->next;
311 if (!IS_TAGGED(next, TAG2))
314 temp = (update_t *)STRIP_TAG(next, TAG2);
315 if (temp->version >= min_active_version)
319 // free <next> and all the update records following it
// Atomically detach the tail by swapping DOES_NOT_EXIST into <next>; the previous value is the tail to free.
322 map_val_t next = SYNC_SWAP(&temp->next, DOES_NOT_EXIST);
324 // if we find ourself in a race just back off and let the other thread take care of it
325 if (next == DOES_NOT_EXIST)
328 if (!IS_TAGGED(next, TAG2))
331 temp = (update_t *)STRIP_TAG(next, TAG2);
337 // If there is one item left and it is visible by all active transactions we can merge it into the map itself.
338 // There is no need for an update record.
339 if (next_update == NULL && val == newest_val) {
// min_active_version is still the sentinel when the garbage-collection branch above did not run.
340 if (min_active_version == UNDETERMINED_VERSION) {
341 min_active_version = (version_t)sl_min_key(active_);
343 if (update->version <= min_active_version) {
// Replace the tagged pointer with the plain value; the record is released only if the CAS succeeds.
344 if (map_cas(txn->map, key, TAG_VALUE(val, TAG2), value) == TAG_VALUE(val, TAG2)) {
345 rcu_defer_free(update);
// Record a write of <value> to <key> on behalf of <txn>. Does nothing unless <txn> is in the TXN_RUNNING state.
//
// A new update record is pushed onto the front of <key>'s update list in the map, and the key and record are
// appended to the transaction's write set for validation at commit time.
353 void txn_map_set (txn_t *txn, map_key_t key, map_val_t value) {
354 if (txn->state != TXN_RUNNING)
355 return; // TODO: return some sort of error code
357 // create a new update record
358 update_t *update = alloc_update_rec();
359 update->value = value;
// While the transaction is in flight the version field holds a TAG1-tagged pointer to <txn>.
360 update->version = TAG_VALUE((version_t)txn, TAG1);
362 // push the new update record onto <key>'s update list
363 map_val_t old_update;
// CAS loop: link the current head behind the new record, then swing the map entry to the new record (tagged
// with TAG2); retry if another thread changed the entry in between.
365 old_update = map_get(txn->map, key);
366 update->next = old_update;
367 } while (map_cas(txn->map, key, old_update, TAG_VALUE((map_val_t)update, TAG2)) != old_update);
369 // add <key> to the write set for commit-time validation
// The write set is full: allocate an array of twice the capacity and copy the existing entries into it.
370 if (txn->writes_count == txn->writes_size) {
371 write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2);
372 memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t));
373 txn->writes_size *= 2;
374 nbd_free(txn->writes);
377 int i = txn->writes_count++;
378 txn->writes[i].key = key;
379 txn->writes[i].rec = update;