2 * Written by Josh Dybnis and released to the public domain, as explained at
3 * http://creativecommons.org/licenses/publicdomain
5 * Implementation of the lock-free skiplist data-structure created by Maurice Herlihy, Yossi Lev,
6 * and Nir Shavit. See Herlihy and Shavit's book "The Art of Multiprocessor Programming".
7 * http://www.amazon.com/Art-Multiprocessor-Programming-Maurice-Herlihy/dp/0123705916/
9 * See also Keir Fraser's dissertation "Practical Lock Freedom".
10 * www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.pdf
12 * This code is written for the x86 memory-model. The algorithm depends on certain stores and
13 * loads being ordered. Be careful, this code probably won't work correctly on platforms with
14 * weaker memory models if you don't add memory barriers in the right places.
25 // Setting MAX_LEVEL to 0 essentially makes this data structure the Harris-Michael lock-free list
40 static int random_level (void) {
41 unsigned r = nbd_rand();
44 int n = __builtin_ctz(r)-1;
49 assert(n <= MAX_LEVEL);
53 node_t *node_alloc (int level, uint64_t key, uint64_t value) {
54 assert(level >= 0 && level <= MAX_LEVEL);
55 size_t sz = sizeof(node_t) + (level + 1) * sizeof(node_t *);
56 node_t *item = (node_t *)nbd_malloc(sz);
60 item->top_level = level;
64 skiplist_t *sl_alloc (void) {
65 skiplist_t *sl = (skiplist_t *)nbd_malloc(sizeof(skiplist_t));
66 sl->head = node_alloc(MAX_LEVEL, 0, 0);
67 memset(sl->head->next, 0, (MAX_LEVEL+1) * sizeof(skiplist_t *));
71 static node_t *find_preds (node_t *preds[MAX_LEVEL+1], int n, skiplist_t *sl, uint64_t key, int help_remove) {
72 node_t *pred = sl->head;
74 TRACE("s3", "find_preds: searching for key %p in sl (head is %p)", key, pred);
76 int start_level = MAX_LEVEL;
78 // Optimization for small lists. No need to traverse empty higher levels.
80 while (pred->next[start_level+1] != NULL) {
81 start_level += start_level - 1;
82 if (EXPECT_FALSE(start_level >= MAX_LEVEL)) {
83 start_level = MAX_LEVEL;
87 if (EXPECT_FALSE(start_level < n)) {
92 // Traverse the levels of <sl> from the top level to the bottom
93 for (int level = start_level; level >= 0; --level) {
94 TRACE("s3", "find_preds: level %llu", level, 0);
95 item = pred->next[level];
96 if (EXPECT_FALSE(IS_TAGGED(item))) {
97 TRACE("s3", "find_preds: pred %p is marked for removal (item %p); retry", pred, item);
98 return find_preds(preds, n, sl, key, help_remove); // retry
100 while (item != NULL) {
101 node_t *next = item->next[level];
102 TRACE("s3", "find_preds: visiting item %p (next %p)", item, next);
103 TRACE("s3", "find_preds: key %p", item->key, 0);
105 // Marked items are logically removed, but not fully unlinked yet.
106 while (EXPECT_FALSE(IS_TAGGED(next))) {
108 // Skip over partially removed items.
110 item = (node_t *)STRIP_TAG(item->next);
111 if (EXPECT_FALSE(item == NULL))
113 next = item->next[level];
117 // Unlink partially removed items.
119 if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next))) == item) {
120 item = (node_t *)STRIP_TAG(next);
121 if (EXPECT_FALSE(item == NULL))
123 next = item->next[level];
124 TRACE("s3", "find_preds: unlinked item %p from pred %p", item, pred);
125 TRACE("s3", "find_preds: now item is %p next is %p", item, next);
127 // The thread that completes the unlink should free the memory.
128 if (level == 0) { nbd_defer_free(other); }
130 TRACE("s3", "find_preds: lost race to unlink from pred %p; its link changed to %p", pred, other);
131 if (IS_TAGGED(other))
132 return find_preds(preds, n, sl, key, help_remove); // retry
134 if (EXPECT_FALSE(item == NULL))
136 next = item->next[level];
140 if (EXPECT_FALSE(item == NULL))
143 // If we reached the key (or passed where it should be), we found a pred. Save it and continue down.
144 if (item->key >= key) {
145 TRACE("s3", "find_preds: found pred %p item %p", pred, item);
156 if (n == -1 && item != NULL) {
157 assert(preds != NULL);
158 for (int level = start_level + 1; level <= item->top_level; ++level) {
159 preds[level] = sl->head;
165 // Fast find that does not help unlink partially removed nodes and does not return the node's predecessors.
166 uint64_t sl_lookup (skiplist_t *sl, uint64_t key) {
167 TRACE("s3", "sl_lookup: searching for key %p in sl %p", key, sl);
168 node_t *item = find_preds(NULL, 0, sl, key, FALSE);
170 // If we found an <item> matching the <key> return its value.
171 return (item && item->key == key) ? item->value : DOES_NOT_EXIST;
174 // Insert the <key> if it doesn't already exist in <sl>
175 uint64_t sl_add (skiplist_t *sl, uint64_t key, uint64_t value) {
176 TRACE("s3", "sl_add: inserting key %p value %p", key, value);
177 node_t *preds[MAX_LEVEL+1];
180 int n = random_level();
181 node_t *next = find_preds(preds, n, sl, key, TRUE);
183 // If a node matching <key> already exists in <sl>, return its value.
184 if (next != NULL && next->key == key) {
185 TRACE("s3", "sl_add: there is already an item %p (value %p) with the same key", next, next->value);
186 if (EXPECT_FALSE(item != NULL)) { nbd_free(item); }
190 // First insert <item> into the bottom level.
191 if (EXPECT_TRUE(item == NULL)) { item = node_alloc(n, key, value); }
192 TRACE("s3", "sl_add: attempting to insert item between %p and %p", preds[0], next);
193 item->next[0] = next;
194 for (int level = 1; level <= item->top_level; ++level) {
195 node_t *pred = preds[level];
196 item->next[level] = pred->next[level];
198 node_t *pred = preds[0];
199 node_t *other = SYNC_CAS(&pred->next[0], next, item);
201 TRACE("s3", "sl_add: successfully inserted item %p at level 0", item, 0);
204 TRACE("s3", "sl_add: failed to change pred's link: expected %p found %p", next, other);
208 // Insert <item> into <sl> from the bottom level up.
209 for (int level = 1; level <= item->top_level; ++level) {
215 next = pred->next[level];
216 if (next == NULL) // item goes at the end of the list
218 if (!IS_TAGGED(next) && next->key > key) // pred's link changed
220 find_preds(preds, item->top_level, sl, key, TRUE);
224 // There is no need to continue linking in the item if another thread removed it.
225 node_t *old_next = ((volatile node_t *)item)->next[level];
226 if (IS_TAGGED(old_next))
227 return DOES_NOT_EXIST; // success
229 // Use a CAS so we do not inadvertently remove a mark another thread placed on the item.
230 if (next == old_next || SYNC_CAS(&item->next[level], old_next, next) == old_next)
234 TRACE("s3", "sl_add: attempting to insert item between %p and %p", pred, next);
235 node_t *other = SYNC_CAS(&pred->next[level], next, item);
237 TRACE("s3", "sl_add: successfully inserted item %p at level %llu", item, level);
240 TRACE("s3", "sl_add: failed to change pred's link: expected %p found %p", next, other);
247 uint64_t sl_remove (skiplist_t *sl, uint64_t key) {
248 TRACE("s3", "sl_remove: removing item with key %p from sl %p", key, sl);
249 node_t *preds[MAX_LEVEL+1];
250 node_t *item = find_preds(preds, -1, sl, key, TRUE);
251 if (item == NULL || item->key != key) {
252 TRACE("s3", "sl_remove: remove failed, an item with a matching key does not exist in the sl", 0, 0);
253 return DOES_NOT_EXIST;
256 // Mark <item> removed at each level of <sl> from the top down. This must be atomic. If multiple threads
257 // try to remove the same item only one of them should succeed. Marking the bottom level establishes which of
259 for (int level = item->top_level; level >= 0; --level) {
260 if (EXPECT_FALSE(IS_TAGGED(item->next[level]))) {
261 TRACE("s3", "sl_remove: %p is already marked for removal by another thread", item, 0);
263 return DOES_NOT_EXIST;
266 node_t *next = SYNC_FETCH_AND_OR(&item->next[level], TAG);
267 if (EXPECT_FALSE(IS_TAGGED(next))) {
268 TRACE("s3", "sl_remove: lost race -- %p is already marked for removal by another thread", item, 0);
270 return DOES_NOT_EXIST;
275 uint64_t value = item->value;
277 // Unlink <item> from the top down.
278 int level = item->top_level;
280 node_t *pred = preds[level];
281 node_t *next = item->next[level];
282 TRACE("s3", "sl_remove: link item's pred %p to it's successor %p", pred, STRIP_TAG(next));
283 node_t *other = NULL;
284 if ((other = SYNC_CAS(&pred->next[level], item, STRIP_TAG(next))) != item) {
285 TRACE("s3", "sl_remove: unlink failed; pred's link changed from %p to %p", item, other);
286 // By marking the item earlier, we logically removed it. It is safe to leave the item partially
287 // unlinked. Another thread will finish physically removing it from <sl>.
293 // The thread that completes the unlink should free the memory.
294 nbd_defer_free(item);
298 void sl_print (skiplist_t *sl) {
299 for (int level = MAX_LEVEL; level >= 0; --level) {
300 node_t *item = sl->head;
301 if (item->next[level] == NULL)
303 printf("(%d) ", level);
305 node_t *next = item->next[level];
306 printf("%s%p ", IS_TAGGED(next) ? "*" : "", item);
307 item = (node_t *)STRIP_TAG(next);
314 node_t *item = sl->head;
316 int is_marked = IS_TAGGED(item->next[0]);
317 printf("%s%p:0x%llx ", is_marked ? "*" : "", item, item->key);
318 if (item != sl->head) {
319 printf("[%d]", item->top_level);
323 for (int level = 1; level <= item->top_level; ++level) {
324 node_t *next = (node_t *)STRIP_TAG(item->next[level]);
325 is_marked = IS_TAGGED(item->next[0]);
326 printf(" %p%s", next, is_marked ? "*" : "");
327 if (item == sl->head && item->next[level] == NULL)
332 item = (node_t *)STRIP_TAG(item->next[0]);
336 #ifdef MAKE_skiplist_test
339 #include <sys/time.h>
343 #define NUM_ITERATIONS 10000000
345 static volatile int wait_;
346 static long num_threads_;
347 static skiplist_t *sl_;
349 void *worker (void *arg) {
351 // Wait for all the worker threads to be ready.
352 SYNC_ADD(&wait_, -1);
355 for (int i = 0; i < NUM_ITERATIONS/num_threads_; ++i) {
356 unsigned r = nbd_rand();
370 int main (int argc, char **argv) {
372 lwt_set_trace_level("s3");
374 char* program_name = argv[0];
375 pthread_t thread[MAX_NUM_THREADS];
378 fprintf(stderr, "Usage: %s num_threads\n", program_name);
386 num_threads_ = strtol(argv[1], NULL, 10);
388 fprintf(stderr, "%s: Invalid argument for number of threads\n", program_name);
391 if (num_threads_ <= 0) {
392 fprintf(stderr, "%s: Number of threads must be at least 1\n", program_name);
395 if (num_threads_ > MAX_NUM_THREADS) {
396 fprintf(stderr, "%s: Number of threads cannot be more than %d\n", program_name, MAX_NUM_THREADS);
403 struct timeval tv1, tv2;
404 gettimeofday(&tv1, NULL);
406 wait_ = num_threads_;
408 for (int i = 0; i < num_threads_; ++i) {
409 int rc = nbd_thread_create(thread + i, i, worker, (void*)(size_t)i);
410 if (rc != 0) { perror("pthread_create"); return rc; }
413 for (int i = 0; i < num_threads_; ++i) {
414 pthread_join(thread[i], NULL);
417 gettimeofday(&tv2, NULL);
418 int ms = (int)(1000000*(tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec) / 1000;
420 printf("Th:%ld Time:%dms\n", num_threads_, ms);
424 #endif//skiplist_test