*
 * Note: This code uses synchronous atomic operations because that is all that x86 provides.
* Every atomic operation is also an implicit full memory barrier. The upshot is that it simplifies
- * the code a bit, but it won't be as fast as it could be on platforms like SPARC that provide
- * weaker operations which would still do the job.
+ * the code a bit, but it won't be as fast as it could be on platforms that provide weaker
+ * operations like an unfenced CAS which would still do the job.
*/
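+/*
+ * A minimal sketch, assuming a GCC-compatible toolchain, of how the SYNC_* wrappers
+ * used below could map onto the GCC __sync builtins (the real definitions live in
+ * the project headers; treat these as illustrative, not authoritative):
+ *
+ *     #define SYNC_CAS(addr, old, new)   __sync_val_compare_and_swap(addr, old, new)
+ *     #define SYNC_ADD(addr, n)          __sync_add_and_fetch(addr, n)
+ *     #define SYNC_FETCH_AND_OR(addr, x) __sync_fetch_and_or(addr, x)
+ *
+ * Each of these builtins acts as a full memory barrier, which matches the note above.
+ */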
#include <stdio.h>
#include "mem.h"
#include "hashtable.h"
-#define GET_PTR(x) ((void *)((x) & MASK(48))) // low-order 48 bits is a pointer to a nstring_t
+#define GET_PTR(x) ((void *)(size_t)((x) & MASK(48))) // the low-order 48 bits are a pointer to an nstring_t
-typedef struct ht_entry {
- uint64_t key;
- uint64_t val;
+typedef struct entry {
+ uint64_t key;
+ map_val_t val;
} entry_t;
typedef struct hti {
volatile entry_t *table;
hashtable_t *ht; // parent ht;
struct hti *next;
- unsigned int scale;
+ unsigned scale;
int max_probe;
+ int references;
int count; // TODO: make these counters distributed
int num_entries_copied;
- int scan;
+ int copy_scan;
} hti_t;
+struct ht_iter {
+ hti_t * hti;
+ int64_t idx;
+};
+
struct ht {
hti_t *hti;
const datatype_t *key_type;
};
-static const uint64_t COPIED_VALUE = -1;
-static const uint64_t TOMBSTONE = STRIP_TAG(-1);
+static const map_val_t COPIED_VALUE = -1;
+static const map_val_t TOMBSTONE = STRIP_TAG(-1, TAG1);
static const unsigned ENTRIES_PER_BUCKET = CACHE_LINE_SIZE/sizeof(entry_t);
static const unsigned ENTRIES_PER_COPY_CHUNK = CACHE_LINE_SIZE/sizeof(entry_t)*2;
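+// For example, assuming a 64-byte cache line and a 16-byte entry_t (two 64-bit words),
+// ENTRIES_PER_BUCKET works out to 64/16 = 4 and ENTRIES_PER_COPY_CHUNK to 4*2 = 8;
+// the real values depend on CACHE_LINE_SIZE from the project headers.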
//
// Record if the entry being returned is empty. Otherwise the caller will have to waste time
// re-comparing the keys to confirm that it did not lose a race to fill an empty entry.
-static volatile entry_t *hti_lookup (hti_t *hti, void *key, uint32_t key_hash, int *is_empty) {
+static volatile entry_t *hti_lookup (hti_t *hti, map_key_t key, uint32_t key_hash, int *is_empty) {
TRACE("h2", "hti_lookup(key %p in hti %p)", key, hti);
*is_empty = 0;
// The key in <ent> is made up of two parts. The 48 low-order bits are a pointer. The
// high-order 16 bits are taken from the hash. The bits from the hash are used as a
// quick check to rule out non-equal keys without doing a complete compare.
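+    // A worked example of the layout, assuming a pointer p that fits in 48 bits and a
+    // 32-bit hash h (this mirrors the packing done in hti_cas below):
+    //     ent->key          == ((uint64_t)(h >> 16) << 48) | (uint64_t)p
+    //     GET_PTR(ent->key) == p          // low-order 48 bits recover the pointer
+    //     (ent->key >> 48)  == (h >> 16)  // high-order 16 bits give the quick check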
- if ((key_hash >> 16) == (ent_key >> 48) && hti->ht->key_type->cmp(GET_PTR(ent_key), key) == 0) {
- TRACE("h1", "hti_lookup: found entry %p with key %p", ent, GET_PTR(ent_key));
- return ent;
+ if ((key_hash >> 16) == (ent_key >> 48)) {
+ if (hti->ht->key_type->cmp(GET_PTR(ent_key), (void *)(size_t)key) == 0) {
+ TRACE("h1", "hti_lookup: found entry %p with key %p", ent, GET_PTR(ent_key));
+ return ent;
+ }
}
}
}
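+// A hedged sketch of the <is_empty> contract from the caller's side; hti_cas uses
+// this pattern below to claim an empty entry without re-comparing keys:
+//
+//     int is_empty;
+//     volatile entry_t *ent = hti_lookup(hti, key, key_hash, &is_empty);
+//     if (ent != NULL && is_empty) {
+//         // <ent> was empty at lookup time; try to claim it by CASing ent->key from
+//         // DOES_NOT_EXIST, and redo the lookup if another thread wins the race.
+//     }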
// Allocate and initialize a hti_t with 2^<scale> entries.
static hti_t *hti_alloc (hashtable_t *parent, int scale) {
- // Include enough slop to align the actual table on a cache line boundry
- size_t n = sizeof(hti_t)
- + sizeof(entry_t) * (1 << scale)
- + (CACHE_LINE_SIZE - 1);
- hti_t *hti = (hti_t *)calloc(n, 1);
+ hti_t *hti = (hti_t *)nbd_malloc(sizeof(hti_t));
+ memset(hti, 0, sizeof(hti_t));
- // Align the table of hash entries on a cache line boundry.
- hti->table = (entry_t *)(((uint64_t)hti + sizeof(hti_t) + (CACHE_LINE_SIZE-1))
- & ~(CACHE_LINE_SIZE-1));
+ size_t sz = sizeof(entry_t) * (1 << scale);
+ entry_t *table = nbd_malloc(sz);
+ memset(table, 0, sz);
+ hti->table = table;
hti->scale = scale;
assert(ht1_ent >= ht1->table && ht1_ent < ht1->table + (1 << ht1->scale));
assert(key_hash == 0 || ht1->ht->key_type == NULL || (key_hash >> 16) == (ht1_ent->key >> 48));
- uint64_t ht1_ent_val = ht1_ent->val;
+ map_val_t ht1_ent_val = ht1_ent->val;
if (EXPECT_FALSE(ht1_ent_val == COPIED_VALUE)) {
TRACE("h1", "hti_copy_entry: entry %p already copied to table %p", ht1_ent, ht2);
return FALSE; // already copied
// Kill empty entries.
if (EXPECT_FALSE(ht1_ent_val == DOES_NOT_EXIST)) {
- uint64_t ht1_ent_val = SYNC_CAS(&ht1_ent->val, DOES_NOT_EXIST, COPIED_VALUE);
+ map_val_t ht1_ent_val = SYNC_CAS(&ht1_ent->val, DOES_NOT_EXIST, COPIED_VALUE);
if (ht1_ent_val == DOES_NOT_EXIST) {
TRACE("h1", "hti_copy_entry: empty entry %p killed", ht1_ent, 0);
return TRUE;
}
// Tag the value in the old entry to indicate a copy is in progress.
- ht1_ent_val = SYNC_FETCH_AND_OR(&ht1_ent->val, TAG_VALUE(0));
+ ht1_ent_val = SYNC_FETCH_AND_OR(&ht1_ent->val, TAG_VALUE(0, TAG1));
TRACE("h2", "hti_copy_entry: tagged the value %p in old entry %p", ht1_ent_val, ht1_ent);
if (ht1_ent_val == COPIED_VALUE) {
TRACE("h1", "hti_copy_entry: entry %p already copied to table %p", ht1_ent, ht2);
// Install the key in the new table.
uint64_t ht1_ent_key = ht1_ent->key;
- void *key = (ht1->ht->key_type == NULL) ? (void *)ht1_ent_key : GET_PTR(ht1_ent_key);
+    map_key_t key = (ht1->ht->key_type == NULL)
+        ? (map_key_t)ht1_ent_key
+        : (map_key_t)(size_t)GET_PTR(ht1_ent_key);
// The old table's dead entries don't need to be copied to the new table, but their keys need to be freed.
- assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE));
+ assert(COPIED_VALUE == TAG_VALUE(TOMBSTONE, TAG1));
if (ht1_ent_val == TOMBSTONE) {
TRACE("h1", "hti_copy_entry: entry %p old value was deleted, now freeing key %p", ht1_ent, key);
if (EXPECT_FALSE(ht1->ht->key_type != NULL)) {
- nbd_defer_free(key);
+ nbd_defer_free((void *)(size_t)key);
}
return TRUE;
}
// We use 0 to indicate that <key_hash> is uninitialized. Occasionally the key's hash will really be 0 and we
// waste time recomputing it every time. It is rare enough (1 in 65k) that it won't hurt performance.
if (key_hash == 0) {
- key_hash = (ht1->ht->key_type == NULL) ? murmur32_8b(ht1_ent_key) : ht1->ht->key_type->hash(key);
+ key_hash = (ht1->ht->key_type == NULL)
+ ? murmur32_8b(ht1_ent_key)
+ : ht1->ht->key_type->hash((void *)(size_t)key);
}
int ht2_ent_is_empty;
}
// Copy the value to the entry in the new table.
- ht1_ent_val = STRIP_TAG(ht1_ent_val);
- uint64_t old_ht2_ent_val = SYNC_CAS(&ht2_ent->val, DOES_NOT_EXIST, ht1_ent_val);
+ ht1_ent_val = STRIP_TAG(ht1_ent_val, TAG1);
+ map_val_t old_ht2_ent_val = SYNC_CAS(&ht2_ent->val, DOES_NOT_EXIST, ht1_ent_val);
// If there is a nested copy in progress, we might have installed the key into a dead entry.
if (old_ht2_ent_val == COPIED_VALUE) {
// real value matches (i.e. not a TOMBSTONE or DOES_NOT_EXIST) as long as <key> is in the table. If
// <expected> is CAS_EXPECT_WHATEVER then skip the test entirely.
//
-static uint64_t hti_cas (hti_t *hti, void *key, uint32_t key_hash, uint64_t expected, uint64_t new) {
+static map_val_t hti_cas (hti_t *hti, map_key_t key, uint32_t key_hash, map_val_t expected, map_val_t new) {
TRACE("h1", "hti_cas: hti %p key %p", hti, key);
TRACE("h1", "hti_cas: value %p expect %p", new, expected);
assert(hti);
- assert(!IS_TAGGED(new));
+ assert(!IS_TAGGED(new, TAG1));
assert(key);
int is_empty;
return DOES_NOT_EXIST;
// Allocate <new_key>.
- uint64_t new_key = (uint64_t)((hti->ht->key_type == NULL) ? key : hti->ht->key_type->clone(key));
+    uint64_t new_key = (uint64_t)((hti->ht->key_type == NULL)
+        ? key
+        : (map_key_t)(size_t)hti->ht->key_type->clone((void *)(size_t)key));
if (EXPECT_FALSE(hti->ht->key_type != NULL)) {
// Combine <new_key> pointer with bits from its hash
new_key = ((uint64_t)(key_hash >> 16) << 48) | new_key;
(hti->ht->key_type == NULL) ? (void *)ent->key : GET_PTR(ent->key), ent);
// If the entry is in the middle of a copy, the copy must be completed first.
- uint64_t ent_val = ent->val;
- if (EXPECT_FALSE(IS_TAGGED(ent_val))) {
+ map_val_t ent_val = ent->val;
+ if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) {
if (ent_val != COPIED_VALUE) {
int did_copy = hti_copy_entry(hti, ent, key_hash, ((volatile hti_t *)hti)->next);
if (did_copy) {
}
// CAS the value into the entry. Retry if it fails.
- uint64_t v = SYNC_CAS(&ent->val, ent_val, new == DOES_NOT_EXIST ? TOMBSTONE : new);
+ map_val_t v = SYNC_CAS(&ent->val, ent_val, new == DOES_NOT_EXIST ? TOMBSTONE : new);
if (EXPECT_FALSE(v != ent_val)) {
TRACE("h0", "hti_cas: value CAS failed; expected %p found %p", ent_val, v);
return hti_cas(hti, key, key_hash, expected, new); // recursive tail-call
}
//
-static uint64_t hti_get (hti_t *hti, void *key, uint32_t key_hash) {
+static map_val_t hti_get (hti_t *hti, map_key_t key, uint32_t key_hash) {
int is_empty;
volatile entry_t *ent = hti_lookup(hti, key, key_hash, &is_empty);
return DOES_NOT_EXIST;
// If the entry is being copied, finish the copy and retry on the next table.
- uint64_t ent_val = ent->val;
- if (EXPECT_FALSE(IS_TAGGED(ent_val))) {
+ map_val_t ent_val = ent->val;
+ if (EXPECT_FALSE(IS_TAGGED(ent_val, TAG1))) {
if (EXPECT_FALSE(ent_val != COPIED_VALUE)) {
int did_copy = hti_copy_entry(hti, ent, key_hash, ((volatile hti_t *)hti)->next);
if (did_copy) {
}
//
-uint64_t ht_get (hashtable_t *ht, void *key) {
- uint32_t hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash(key);
+map_val_t ht_get (hashtable_t *ht, map_key_t key) {
+ uint32_t hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)(size_t)key);
return hti_get(ht->hti, key, hash);
}
-//
-uint64_t ht_cas (hashtable_t *ht, void *key, uint64_t expected_val, uint64_t new_val) {
-
- TRACE("h2", "ht_cas: key %p ht %p", key, ht);
- TRACE("h2", "ht_cas: expected val %p new val %p", expected_val, new_val);
- assert(key != DOES_NOT_EXIST);
- assert(!IS_TAGGED(new_val) && new_val != DOES_NOT_EXIST && new_val != TOMBSTONE);
-
- hti_t *hti = ht->hti;
-
- // Help with an ongoing copy.
- if (EXPECT_FALSE(hti->next != NULL)) {
- volatile entry_t *ent;
- uint64_t limit;
- int num_copied = 0;
- int x = hti->scan;
+// returns TRUE if copy is done
+int hti_help_copy (hti_t *hti) {
+ volatile entry_t *ent;
+ uint64_t limit;
+ uint64_t total_copied = hti->num_entries_copied;
+ uint64_t num_copied = 0;
+ uint64_t x = hti->copy_scan;
- TRACE("h1", "ht_cas: help copy. scan is %llu, size is %llu", x, 1<<hti->scale);
+ TRACE("h1", "ht_cas: help copy. scan is %llu, size is %llu", x, 1<<hti->scale);
+ if (total_copied != (1 << hti->scale)) {
// Panic if we've been around the array twice and still haven't finished the copy.
int panic = (x >= (1 << (hti->scale + 1)));
if (!panic) {
// Reserve some entries for this thread to copy. There is a race condition here because the
// fetch and add isn't atomic, but that is ok.
- hti->scan = x + ENTRIES_PER_COPY_CHUNK;
+ hti->copy_scan = x + ENTRIES_PER_COPY_CHUNK;
- // <hti->scan> might be larger than the size of the table, if some thread stalls while
+ // <copy_scan> might be larger than the size of the table, if some thread stalls while
// copying. In that case we just wrap around to the beginning and make another pass through
// the table.
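+        // Worked example of the wrap-around, assuming MASK(n) == ((1ULL << n) - 1):
+        // with scale == 4 the table holds 16 entries, so a reservation starting at
+        // x == 21 begins at index (21 & MASK(4)) == 5, i.e. a second pass over the table.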
ent = hti->table + (x & MASK(hti->scale));
} else {
TRACE("h1", "ht_cas: help copy panic", 0, 0);
// scan the whole table
- limit = (1 << hti->scale);
ent = hti->table;
+ limit = (1 << hti->scale);
}
// Copy the entries
assert(ent <= hti->table + (1 << hti->scale));
}
if (num_copied != 0) {
- SYNC_ADD(&hti->num_entries_copied, num_copied);
+ total_copied = SYNC_ADD(&hti->num_entries_copied, num_copied);
}
+ }
+
+ return (total_copied == (1 << hti->scale));
+}
+
+//
+map_val_t ht_cas (hashtable_t *ht, map_key_t key, map_val_t expected_val, map_val_t new_val) {
+
+ TRACE("h2", "ht_cas: key %p ht %p", key, ht);
+ TRACE("h2", "ht_cas: expected val %p new val %p", expected_val, new_val);
+ assert(key != DOES_NOT_EXIST);
+ assert(!IS_TAGGED(new_val, TAG1) && new_val != DOES_NOT_EXIST && new_val != TOMBSTONE);
+
+ hti_t *hti = ht->hti;
+
+ // Help with an ongoing copy.
+ if (EXPECT_FALSE(hti->next != NULL)) {
+ int done = hti_help_copy(hti);
// Dispose of fully copied tables.
- if (hti->num_entries_copied == (1 << hti->scale) || panic) {
- assert(hti->next);
- if (SYNC_CAS(&ht->hti, hti, hti->next) == hti) {
- nbd_defer_free(hti);
+ if (done && hti->references == 0) {
+
+ int r = SYNC_CAS(&hti->references, 0, -1);
+ if (r == 0) {
+ assert(hti->next);
+ if (SYNC_CAS(&ht->hti, hti, hti->next) == hti) {
+ nbd_defer_free((void *)hti->table);
+ nbd_defer_free(hti);
+ }
}
}
}
- uint64_t old_val;
- uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash(key);
+ map_val_t old_val;
+ uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)(size_t)key);
while ((old_val = hti_cas(hti, key, key_hash, expected_val, new_val)) == COPIED_VALUE) {
assert(hti->next);
hti = hti->next;
// Remove the value in <ht> associated with <key>. Returns the value removed, or DOES_NOT_EXIST if there was
// no value for that key.
-uint64_t ht_remove (hashtable_t *ht, void *key) {
+map_val_t ht_remove (hashtable_t *ht, map_key_t key) {
hti_t *hti = ht->hti;
- uint64_t val;
- uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash(key);
+ map_val_t val;
+ uint32_t key_hash = (ht->key_type == NULL) ? murmur32_8b((uint64_t)key) : ht->key_type->hash((void *)(size_t)key);
do {
val = hti_cas(hti, key, key_hash, CAS_EXPECT_WHATEVER, DOES_NOT_EXIST);
if (val != COPIED_VALUE)
hti_t *hti = ht->hti;
do {
for (uint32_t i = 0; i < (1 << hti->scale); ++i) {
- assert(hti->table[i].val == COPIED_VALUE || !IS_TAGGED(hti->table[i].val));
+ assert(hti->table[i].val == COPIED_VALUE || !IS_TAGGED(hti->table[i].val, TAG1));
if (ht->key_type != NULL && hti->table[i].key != DOES_NOT_EXIST) {
nbd_free(GET_PTR(hti->table[i].key));
}
}
hti_t *next = hti->next;
+ nbd_free((void *)hti->table);
nbd_free(hti);
hti = next;
} while (hti);
printf("hti:%p scale:%u count:%d copied:%d\n", hti, hti->scale, hti->count, hti->num_entries_copied);
for (int i = 0; i < (1 << hti->scale); ++i) {
volatile entry_t *ent = hti->table + i;
- printf("[0x%x] %p:%p\n", i, (void *)ent->key, (void *)ent->val);
+ printf("[0x%x] 0x%llx:0x%llx\n", i, (uint64_t)ent->key, ent->val);
if (i > 30) {
printf("...\n");
break;
hti = hti->next;
}
}
+
+ht_iter_t *ht_iter_begin (hashtable_t *ht, map_key_t key) {
+ hti_t *hti = ht->hti;
+ int rcount;
+ do {
+ while (((volatile hti_t *)hti)->next != NULL) {
+ do { } while (hti_help_copy(hti) != TRUE);
+ hti = hti->next;
+ }
+
+ int old = hti->references;
+ do {
+ rcount = old;
+ if (rcount != -1) {
+ old = SYNC_CAS(&hti->references, rcount, rcount + 1);
+ }
+ } while (rcount != old);
+ } while (rcount == -1);
+
+ ht_iter_t *iter = nbd_malloc(sizeof(ht_iter_t));
+ iter->hti = hti;
+ iter->idx = -1;
+
+ return iter;
+}
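+
+// Note on the <references> field: it acts as a small hand-rolled reference count on the
+// table. ht_iter_begin pins a table by incrementing it in a CAS loop (retrying if the
+// table has already been retired by a CAS to -1), ht_cas only retires and frees a fully
+// copied table once it observes references == 0 and wins the CAS from 0 to -1, and
+// ht_iter_free unpins with an atomic decrement.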
+
+map_val_t ht_iter_next (ht_iter_t *iter, map_key_t *key_ptr) {
+ volatile entry_t *ent;
+ map_key_t key;
+ map_val_t val;
+ uint64_t table_size = (1 << iter->hti->scale);
+ do {
+ iter->idx++;
+ if (iter->idx == table_size) {
+ return DOES_NOT_EXIST;
+ }
+ ent = &iter->hti->table[iter->idx];
+ key = (iter->hti->ht->key_type == NULL) ? (map_key_t)ent->key : (map_key_t)(size_t)GET_PTR(ent->key);
+ val = ent->val;
+
+ } while (key == DOES_NOT_EXIST || val == DOES_NOT_EXIST || val == TOMBSTONE);
+
+ if (key_ptr) {
+ *key_ptr = key;
+ }
+ if (val == COPIED_VALUE) {
+ uint32_t hash = (iter->hti->ht->key_type == NULL)
+ ? murmur32_8b((uint64_t)key)
+ : iter->hti->ht->key_type->hash((void *)(size_t)key);
+ val = hti_get(iter->hti->next, (map_key_t)ent->key, hash);
+ }
+
+ return val;
+}
+
+void ht_iter_free (ht_iter_t *iter) {
+ SYNC_ADD(&iter->hti->references, -1);
+ nbd_free(iter);
+}
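+
+// A hedged usage sketch of the iterator API; process() is a placeholder, not part of
+// this module. ht_iter_next returns DOES_NOT_EXIST once the scan reaches the end:
+//
+//     ht_iter_t *iter = ht_iter_begin(ht, 0);
+//     map_key_t key;
+//     map_val_t val;
+//     while ((val = ht_iter_next(iter, &key)) != DOES_NOT_EXIST) {
+//         process(key, val);
+//     }
+//     ht_iter_free(iter);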