2 * Written by Josh Dybnis and released to the public domain, as explained at
3 * http://creativecommons.org/licenses/publicdomain
// Value held in txn->wv until txn_validate assigns the transaction a write version.
10 #define UNDETERMINED_VERSION 0
// Version written into the update records of an aborted transaction; tm_get and tm_validate_key skip such records.
11 #define ABORTED_VERSION TAG_VALUE(0)
// Initial capacity of a transaction's write set; tm_set doubles the capacity when it is full.
12 #define INITIAL_WRITES_SIZE 4
// Kind of change an update record carries. tm_set records UPDATE_TYPE_PUT; tm_get reports a transaction's own
// UPDATE_TYPE_DELETE as DOES_NOT_EXIST.
14 typedef enum { UPDATE_TYPE_PUT, UPDATE_TYPE_DELETE } update_type_t;
16 typedef struct update_rec update_rec_t;
22 update_rec_t *next; // an earlier update
// Write-set entry; tm_set fills in its <key> and <rec> members.
25 typedef struct write_rec {
// Number of write-set entries currently in use (compared against writes_size in tm_set).
36 uint32_t writes_count;
// Global version counter. txn_validate advances it with SYNC_ADD when handing out a write version.
42 static uint64_t version_ = 1;
// Forward declaration: tm_validate_key calls txn_validate before its definition appears.
44 static txn_state_e txn_validate (txn_t *txn);
// Map created by txn_init. txn_begin and txn_commit adjust a count stored under the key txn->rv.
46 static map_t *active_ = NULL;
// Set up the module's global state: allocate the <active_> map using the sl_map_impl implementation.
48 void txn_init (void) {
49 active_ = map_alloc(&sl_map_impl, NULL);
52 // Validate the updates for <key>. Validation fails if there is a write-write conflict. That is, if after our
53 // read version another transaction committed a change to an entry we are also trying to change.
55 // If we encounter a potential conflict with a transaction that is in the process of validating, we help it
56 // complete validating. It must be finished before we can decide to roll back or commit.
//
// Returns TXN_ABORTED when a committed (or validated) update to <key> is newer than <txn>'s read version, and
// TXN_VALIDATED otherwise, including when the whole update list is walked without finding a deciding record.
58 static txn_state_e tm_validate_key (txn_t *txn, void *key) {
60 update_rec_t *update = (update_rec_t *) map_get(txn->map, key);
61 for (; update != NULL; update = update->next) {
63 // If the update's version is not tagged it means the update is committed.
65 // We can stop at the first committed record we find that is at least as old as our read version. All
66 // the other committed records following it will be older. And all the uncommitted records following it
67 // will eventually conflict with it and abort.
68 if (!IS_TAGGED(update->version))
69 return (update->version <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
71 // If the update's version is tagged then either the update was aborted or the version number is
72 // actually a pointer to a running transaction's txn_t.
74 // Skip aborted transactions.
75 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
78 // The update's transaction is still in progress. Access its txn_t.
79 txn_t *writer = (txn_t *)STRIP_TAG(update->version);
81 continue; // Skip our own updates.
82 txn_state_e writer_state = writer->state;
84 // Any running transaction will only be able to acquire a wv greater than ours. A transaction changes its
85 // state to validating before acquiring a wv. We can ignore an unvalidated transaction if its version is
86 // greater than ours. See next comment below for why.
87 if (writer_state == TXN_RUNNING)
90 // If <writer> has a later version than us we can safely ignore its updates. It will not commit until
91 // we have completed validation (in order to remain non-blocking it will help us validate if necessary).
92 // This protocol ensures a deterministic resolution to every conflict and avoids infinite ping-ponging
93 // between validating two conflicting transactions.
94 if (writer_state == TXN_VALIDATING) {
95 if (writer->wv > txn->wv)
97 // Help <writer> commit. We need to know if <writer> aborts or commits before we can decide what to
98 // do. But we don't want to block, so we assist.
99 writer_state = txn_validate(writer);
102 // Skip updates from aborted transactions.
103 if (writer_state == TXN_ABORTED)
// <writer> has validated, so its write version decides the outcome the same way a committed record's version does.
106 assert(writer_state == TXN_VALIDATED);
107 return (writer->wv <= txn->rv) ? TXN_VALIDATED : TXN_ABORTED;
// No record on <key>'s update list forced a decision, so there is no conflict on this key.
110 return TXN_VALIDATED;
// Validate <txn>: make sure it has a write version, then check every key in its write set with
// tm_validate_key. A TXN_ABORTED result for any key marks <txn> aborted; if <txn> is still TXN_VALIDATING
// after the keys have been checked it becomes TXN_VALIDATED. tm_validate_key and tm_get also call this on
// other transactions' txn_t to help them finish, so more than one thread may run it for the same <txn>.
113 static txn_state_e txn_validate (txn_t *txn) {
115 switch (txn->state) {
// Take a fresh version from the global counter, but install it only if <txn>'s wv is still undetermined
// (SYNC_CAS is presumably an atomic compare-and-swap, so an already-assigned wv is left alone -- confirm).
118 if (txn->wv == UNDETERMINED_VERSION) {
119 uint64_t wv = SYNC_ADD(&version_, 1);
120 SYNC_CAS(&txn->wv, UNDETERMINED_VERSION, wv);
123 for (i = 0; i < txn->writes_count; ++i) {
124 txn_state_e s = tm_validate_key(txn, txn->writes[i].key);
125 if (s == TXN_ABORTED) {
126 txn->state = TXN_ABORTED;
// Only promote to validated if no key validation (above) moved the state away from TXN_VALIDATING.
130 if (txn->state == TXN_VALIDATING) {
131 txn->state = TXN_VALIDATED;
// Allocate an update_rec_t with nbd_malloc and zero-fill it.
146 static update_rec_t *alloc_update_rec (void) {
147 update_rec_t *u = (update_rec_t *)nbd_malloc(sizeof(update_rec_t));
148 memset(u, 0, sizeof(update_rec_t));
// Create a transaction of the given <type> on <map>. The txn_t is zero-filled, starts in state TXN_RUNNING and
// has no write version yet. Unless <type> is TXN_READ_ONLY it also gets a write set with room for
// INITIAL_WRITES_SIZE entries.
152 txn_t *txn_begin (txn_type_e type, map_t *map) {
// NOTE(review): the result of nbd_malloc is used without a NULL check -- confirm nbd_malloc cannot fail.
153 txn_t *txn = (txn_t *)nbd_malloc(sizeof(txn_t));
154 memset(txn, 0, sizeof(txn_t));
156 txn->wv = UNDETERMINED_VERSION;
157 txn->state = TXN_RUNNING;
159 if (type != TXN_READ_ONLY) {
160 txn->writes = nbd_malloc(sizeof(*txn->writes) * INITIAL_WRITES_SIZE);
161 txn->writes_size = INITIAL_WRITES_SIZE;
164 // acquire the read version for txn. must be careful to avoid a race
// Raise the count stored in <active_> under this read version, retrying until the CAS sees the expected count.
172 temp = (uint64_t)map_cas(active_, (void *)txn->rv, old_count, old_count + 1);
173 } while (temp != old_count);
// Compare the chosen read version against the current global version.
175 if (txn->rv == version_)
// Lower the count for this read version again, with the same CAS retry scheme.
181 temp = map_cas(active_, (void *)txn->rv, old_count, old_count - 1);
182 } while (temp != old_count);
// Abort <txn>: stamp every update record in its write set with ABORTED_VERSION, which tm_get and
// tm_validate_key skip, then pass the write set to nbd_defer_free.
188 void txn_abort (txn_t *txn) {
191 for (i = 0; i < txn->writes_count; ++i) {
192 update_rec_t *update = (update_rec_t *)txn->writes[i].rec;
193 update->version = ABORTED_VERSION;
196 nbd_defer_free(txn->writes);
// Try to commit <txn>, which must be in state TXN_RUNNING. The transaction moves to TXN_VALIDATING and is
// validated. Each update record in its write set is then stamped with the final version: the write version,
// or ABORTED_VERSION if validation left <txn> aborted. Finally the count for <txn>'s read version in
// <active_> is lowered and the write set is passed to nbd_defer_free.
200 txn_state_e txn_commit (txn_t *txn) {
202 assert(txn->state == TXN_RUNNING);
203 txn->state = TXN_VALIDATING;
204 txn_state_e state = txn_validate(txn);
206 // Detach <txn> from its updates.
// Until now each record's version was a tagged pointer to <txn> (set in tm_set); replace it with a plain
// version number or the aborted marker.
207 uint64_t wv = (txn->state == TXN_ABORTED) ? ABORTED_VERSION : txn->wv;
209 for (i = 0; i < txn->writes_count; ++i) {
210 update_rec_t *update = (update_rec_t *)txn->writes[i].rec;
211 update->version = wv;
215 // Lower the reference count for <txn>'s read version
220 temp = map_cas(active_, (void *)txn->rv, old_count, old_count - 1);
221 } while (old_count != temp);
// NOTE(review): when the loop exits, old_count appears to be the count the CAS replaced, i.e. the value
// before the decrement -- confirm that 0 (rather than 1) is the intended "last reference" test.
222 if (old_count == 0 && version_ != txn->rv) {
223 map_remove(active_, (void *)txn->rv);
227 nbd_defer_free(txn->writes);
233 // Get most recent committed version prior to our read version.
// Returns the value of the first update on <key>'s list that is visible to <txn>: one of <txn>'s own updates, or
// a committed/validated update whose version is not later than <txn>'s read version. Returns DOES_NOT_EXIST when
// the list holds no such update, or when <txn>'s own newest update to <key> is a delete.
234 uint64_t tm_get (txn_t *txn, void *key) {
236 // Iterate through update records associated with <key> to find the latest committed version prior to our read version.
238 update_rec_t *update = (update_rec_t *) map_get(txn->map, key);
239 for (; update != NULL; update = update->next) {
241 // If the update's version is not tagged it means the update is committed.
242 if (!IS_TAGGED(update->version)) {
243 if (update->version <= txn->rv)
244 return update->value;
248 // If the update's version is tagged then either the update was aborted or the version number is
249 // actually a pointer to a running transaction's txn_t.
251 // Skip updates from aborted transactions.
252 if (EXPECT_FALSE(update->version == ABORTED_VERSION))
255 // The update's transaction is still in progress. Access its txn_t.
256 txn_t *writer = (txn_t *)STRIP_TAG(update->version);
257 if (writer == txn) // found our own update
258 return update->type == UPDATE_TYPE_DELETE ? DOES_NOT_EXIST : update->value;
260 txn_state_e writer_state = writer->state;
261 if (writer_state == TXN_RUNNING)
264 if (writer_state == TXN_VALIDATING) {
265 if (writer->wv > txn->rv)
// Help <writer> finish validating so we learn whether its update commits or aborts.
267 writer_state = txn_validate(writer);
270 // Skip updates from aborted transactions.
271 if (writer_state == TXN_ABORTED)
// <writer> has validated; its update is returned only when its write version is not later than our read version.
274 assert(writer_state == TXN_VALIDATED);
275 if (writer->wv > txn->rv)
277 return update->value;
// Reached the end of the update list without finding a visible update.
279 return DOES_NOT_EXIST;
// Record that <txn> sets <key> to <value>. A new update record, tagged with a pointer to <txn>, is pushed onto
// the front of <key>'s update list in txn->map, and the key/record pair is appended to <txn>'s write set for
// use by txn_validate, txn_commit and txn_abort.
282 void tm_set (txn_t *txn, void *key, uint64_t value) {
284 // create a new update record
285 update_rec_t *update = alloc_update_rec();
286 update->type = UPDATE_TYPE_PUT;
287 update->value = value;
// A tagged version marks the record as uncommitted and lets other threads find the owning txn_t via STRIP_TAG.
288 update->version = TAG_VALUE((uint64_t)txn);
290 // push the new update record onto <key>'s update list
291 uint64_t update_prev;
// Link to the current list head, then CAS the map entry from that head to the new record; retry if the head
// changed in between.
293 update->next = (update_rec_t *) map_get(txn->map, key);
294 update_prev = (uint64_t)update->next;
295 } while (map_cas(txn->map, key, update_prev, (uint64_t)update) != update_prev);
297 // add <key> to the write set for commit-time validation
// Write set is full: allocate an array of twice the capacity and copy the existing entries into it.
298 if (txn->writes_count == txn->writes_size) {
299 write_rec_t *w = nbd_malloc(sizeof(write_rec_t) * txn->writes_size * 2);
300 memcpy(w, txn->writes, txn->writes_size * sizeof(write_rec_t));
301 txn->writes_size *= 2;
// Claim the next free slot and store the key together with the record just pushed.
303 int i = txn->writes_count++;
304 txn->writes[i].key = key;
305 txn->writes[i].rec = update;