]> git.kaiwu.me - klib.git/commitdiff
bugfix: resize() cannot shrink; added comments
authorHeng Li <lh3@live.co.uk>
Fri, 16 Sep 2011 15:22:39 +0000 (11:22 -0400)
committerHeng Li <lh3@live.co.uk>
Fri, 16 Sep 2011 15:22:39 +0000 (11:22 -0400)
khash.h

diff --git a/khash.h b/khash.h
index 4d22762a81764c96dcc7aacba93d8522065bc20c..9b877e98b724b44c8f4b3065e73501b735fa43cf 100644 (file)
--- a/khash.h
+++ b/khash.h
@@ -100,8 +100,6 @@ int main() {
   @header
 
   Generic hash table library.
-
-  @copyright Heng Li
  */
 
 #define AC_VERSION_KHASH_H "0.2.6"
@@ -142,6 +140,10 @@ typedef khint_t khiter_t;
 #define __ac_inc(k) ((k) | 1)
 #define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
 
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
 static const double __ac_HASH_UPPER = 0.77;
 
 #define KHASH_DECLARE(name, khkey_t, khval_t)                                                  \
@@ -190,7 +192,7 @@ static const double __ac_HASH_UPPER = 0.77;
                        khint_t inc, k, i, last, mask;                                                          \
                        mask = h->n_buckets - 1;                                                                        \
                        k = __hash_func(key); i = k & mask;                                                     \
-                       inc = __ac_inc(k) & mask; last = i;                                                     \
+                       inc = __ac_inc(k) & mask; last = i; /* inc==1 for linear probing */ \
                        while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                                i = (i + inc) & mask;                                                                   \
                                if (i == last) return h->n_buckets;                                             \
@@ -199,42 +201,43 @@ static const double __ac_HASH_UPPER = 0.77;
                } else return 0;                                                                                                \
        }                                                                                                                                       \
        SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
-       {                                                                                                                                       \
+       { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
                khint32_t *new_flags = 0;                                                                               \
                khint_t j = 1;                                                                                                  \
                {                                                                                                                               \
-                       new_n_buckets = h->n_buckets? h->n_buckets<<1 : 4;                      \
-                       if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
-                       else {                                                                                                          \
+                       kroundup32(new_n_buckets);                                                                      \
+                       if (new_n_buckets < 4) new_n_buckets = 4;                                       \
+                       if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+                       else { /* hash table size to be changed (shrink or expand); rehash */ \
                                new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));  \
                                memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
-                               if (h->n_buckets < new_n_buckets) {                                             \
+                               if (h->n_buckets < new_n_buckets) {     /* expand */            \
                                        h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-                                       if (kh_is_map)                                                                          \
-                                               h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
-                               }                                                                                                               \
+                                       if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                               } /* otherwise shrink */                                                                \
                        }                                                                                                                       \
                }                                                                                                                               \
-               if (j) {                                                                                                                \
+               if (j) { /* rehashing is needed */                                                              \
                        for (j = 0; j != h->n_buckets; ++j) {                                           \
                                if (__ac_iseither(h->flags, j) == 0) {                                  \
                                        khkey_t key = h->keys[j];                                                       \
                                        khval_t val;                                                                            \
+                                       khint_t new_mask;                                                                       \
+                                       new_mask = new_n_buckets - 1;                                           \
                                        if (kh_is_map) val = h->vals[j];                                        \
                                        __ac_set_isdel_true(h->flags, j);                                       \
-                                       while (1) {                                                                                     \
-                                               khint_t inc, k, i, new_mask;                                    \
-                                               new_mask = new_n_buckets - 1;                                   \
+                                       while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+                                               khint_t inc, k, i;                                                              \
                                                k = __hash_func(key);                                                   \
                                                i = k & new_mask;                                                               \
                                                inc = __ac_inc(k) & new_mask;                                   \
                                                while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
                                                __ac_set_isempty_false(new_flags, i);                   \
-                                               if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+                                               if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
                                                        { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
                                                        if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
-                                                       __ac_set_isdel_true(h->flags, i);                       \
-                                               } else {                                                                                \
+                                                       __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+                                               } else { /* write the element and jump out of the loop */ \
                                                        h->keys[i] = key;                                                       \
                                                        if (kh_is_map) h->vals[i] = val;                        \
                                                        break;                                                                          \
@@ -242,12 +245,11 @@ static const double __ac_HASH_UPPER = 0.77;
                                        }                                                                                                       \
                                }                                                                                                               \
                        }                                                                                                                       \
-                       if (h->n_buckets > new_n_buckets) {                                                     \
+                       if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
                                h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-                               if (kh_is_map)                                                                                  \
-                                       h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                               if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
                        }                                                                                                                       \
-                       free(h->flags);                                                                                         \
+                       free(h->flags); /* free the working space */                            \
                        h->flags = new_flags;                                                                           \
                        h->n_buckets = new_n_buckets;                                                           \
                        h->n_occupied = h->size;                                                                        \
@@ -257,14 +259,14 @@ static const double __ac_HASH_UPPER = 0.77;
        SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
        {                                                                                                                                       \
                khint_t x;                                                                                                              \
-               if (h->n_occupied >= h->upper_bound) {                                                  \
-                       if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
-                       else kh_resize_##name(h, h->n_buckets + 1);                                     \
-               }                                                                                                                               \
+               if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+                       if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
+                       else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
+               } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
                {                                                                                                                               \
                        khint_t inc, k, i, site, last, mask = h->n_buckets - 1;         \
                        x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
-                       if (__ac_isempty(h->flags, i)) x = i;                                           \
+                       if (__ac_isempty(h->flags, i)) x = i; /* for speed up */        \
                        else {                                                                                                          \
                                inc = __ac_inc(k) & mask; last = i;                                             \
                                while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
@@ -278,17 +280,17 @@ static const double __ac_HASH_UPPER = 0.77;
                                }                                                                                                               \
                        }                                                                                                                       \
                }                                                                                                                               \
-               if (__ac_isempty(h->flags, x)) {                                                                \
+               if (__ac_isempty(h->flags, x)) { /* not present at all */               \
                        h->keys[x] = key;                                                                                       \
                        __ac_set_isboth_false(h->flags, x);                                                     \
                        ++h->size; ++h->n_occupied;                                                                     \
                        *ret = 1;                                                                                                       \
-               } else if (__ac_isdel(h->flags, x)) {                                                   \
+               } else if (__ac_isdel(h->flags, x)) { /* deleted */                             \
                        h->keys[x] = key;                                                                                       \
                        __ac_set_isboth_false(h->flags, x);                                                     \
                        ++h->size;                                                                                                      \
                        *ret = 2;                                                                                                       \
-               } else *ret = 0;                                                                                                \
+               } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
                return x;                                                                                                               \
        }                                                                                                                                       \
        SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)                           \