From 4a849fd5bc44c88d747808907c148c35694f3ff1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 16 Sep 2011 11:22:39 -0400 Subject: [PATCH] bugfix: resize() cannot shrink; added comments --- khash.h | 62 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/khash.h b/khash.h index 4d22762..9b877e9 100644 --- a/khash.h +++ b/khash.h @@ -100,8 +100,6 @@ int main() { @header Generic hash table library. - - @copyright Heng Li */ #define AC_VERSION_KHASH_H "0.2.6" @@ -142,6 +140,10 @@ typedef khint_t khiter_t; #define __ac_inc(k) ((k) | 1) #define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + static const double __ac_HASH_UPPER = 0.77; #define KHASH_DECLARE(name, khkey_t, khval_t) \ @@ -190,7 +192,7 @@ static const double __ac_HASH_UPPER = 0.77; khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k) & mask; last = i; \ + inc = __ac_inc(k) & mask; last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ @@ -199,42 +201,43 @@ static const double __ac_HASH_UPPER = 0.77; } else return 0; \ } \ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ - new_n_buckets = h->n_buckets? h->n_buckets<<1 : 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } /* otherwise shrink */ \ } \ } \ - if (j) { \ + if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ - while (1) { \ - khint_t inc, k, i, new_mask; \ - new_mask = new_n_buckets - 1; \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ inc = __ac_inc(k) & new_mask; \ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ @@ -242,12 +245,11 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (h->n_buckets > new_n_buckets) { \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); \ + free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ @@ -257,14 +259,14 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - } \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ inc = __ac_inc(k) & mask; last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ @@ -278,17 +280,17 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (__ac_isempty(h->flags, x)) { \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ - } else *ret = 0; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ -- 2.47.3