From 3cd7139ba4ce0d104059a7484c4ea84224c07b58 Mon Sep 17 00:00:00 2001 From: Attractive Chaos Date: Thu, 13 Jan 2011 12:28:46 -0500 Subject: [PATCH] Added the kstring library --- kstring.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++ kstring.h | 116 +++++++++++++++++++++++++++ kstring_bench.c | 51 ++++++++++++ kstring_bench2.c | 131 ++++++++++++++++++++++++++++++ 4 files changed, 502 insertions(+) create mode 100644 kstring.c create mode 100644 kstring.h create mode 100644 kstring_bench.c create mode 100644 kstring_bench2.c diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..43d524c --- /dev/null +++ b/kstring.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +{ + const char *p, *start; + if (sep) { // set up the table + if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) + aux->tab[*p/64] |= 1ull<<(*p%64); + } + if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; + else if (aux->tab[0]&1) return 0; + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p/64]>>(*p%64)&1) break; + aux->p = p; // end of token + if (*p == 0) aux->tab[0] |= 1; // no more tokens + return (char*)start; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +typedef unsigned char ubyte_t; + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +static int *ksBM_prep(const ubyte_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, sizeof(int)); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) +{ + int i, j, *prep = 0, *bmGs, *bmBc; + const ubyte_t *str, *pat; + str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; + prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (_prep && *_prep == 0) *_prep = prep; + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i >= 0) { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } else return (void*)(str + j); + } + if (_prep == 0) free(prep); + return 0; +} + +char *kstrstr(const char *str, const char *pat, int **_prep) +{ + return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); +} + +char *kstrnstr(const char *str, const char *pat, int n, int **_prep) +{ + return (char*)kmemmem(str, n, pat, strlen(pat), _prep); +} + +/*********************** + * The main() function * + ***********************/ + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + ks_tokaux_t aux; + char *p; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + // test kstrtok() + s->l = 0; + for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { + kputsn(p, aux.p - p, s); + kputc('\n', s); + } + printf("%s", s->s); + // free + free(s->s); free(s); free(fields); + + { + static char *str = "abcdefgcdgcagtcakcdcd"; + static char *pat = "cd"; + char *ret, *s = str; + int *prep = 0; + while ((ret = kstrstr(s, pat, &prep)) != 0) { + printf("match: %s\n", ret); + s = ret + prep[0]; + } + free(prep); + } + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..c46a62b --- /dev/null +++ b/kstring.h @@ -0,0 +1,116 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +typedef struct { + uint64_t tab[4]; + const char *p; // end of the current token +} ks_tokaux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + int ksprintf(kstring_t *s, const char *fmt, ...); + int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + char *kstrstr(const char *str, const char *pat, int **_prep); + char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); + + /* kstrtok() is similar to strtok_r() except that str is not + * modified and both str and sep can be NULL. For efficiency, it is + * actually recommended to set both to NULL in the subsequent calls + * if sep is not changed. */ + char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); + +#ifdef __cplusplus +} +#endif + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/kstring_bench.c b/kstring_bench.c new file mode 100644 index 0000000..82598e8 --- /dev/null +++ b/kstring_bench.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include "kstring.h" + +#define N 10000000 + +int main() +{ + int i; + clock_t t; + kstring_t s, s2; + srand48(11); + s.l = s.m = 0; s.s = 0; + t = clock(); + for (i = 0; i < N; ++i) { + int x = lrand48(); + s.l = 0; + kputw(x, &s); + } + fprintf(stderr, "kputw: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC); + srand48(11); + t = clock(); + for (i = 0; i < N; ++i) { + int x = lrand48(); + s.l = 0; + ksprintf(&s, "%d", x); + } + fprintf(stderr, "ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC); + + srand48(11); + s2.l = s2.m = 0; s2.s = 0; + t = clock(); + for (i = 0; i < N; ++i) { + int x = lrand48(); + s2.l = s.l = 0; + kputw(x, &s2); + kputs(s2.s, &s); + } + fprintf(stderr, "kputw+kputs: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC); + srand48(11); + t = clock(); + for (i = 0; i < N; ++i) { + int x = lrand48(); + s2.l = s.l = 0; + kputw(x, &s2); + ksprintf(&s, "%s", s2.s); + } + fprintf(stderr, "kputw+ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC); + return 0; +} diff --git a/kstring_bench2.c b/kstring_bench2.c new file mode 100644 index 0000000..b7707a8 --- /dev/null +++ b/kstring_bench2.c @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include "kstring.h" + +#ifdef __APPLE__ +#define HAVE_STRNSTR +#endif + +#ifdef __linux__ +#define HAVE_MEMMEM +#endif + +static int str_len = 1024*1024*128; +static int pat_len = 30; +static int alphabet = 2; +static int repeat = 50; + +char *gen_data(int len, int a) +{ + char *data; + int i; + long x; + srand48(11); + data = malloc(len); + for (i = 0; i < len; ++i) + data[i] = (int)(a * drand48()) + '!'; + data[str_len - 1] = 0; + return data; +} +// http://srcvault.scali.eu.org/cgi-bin/Syntax/c/BoyerMoore.c +char *BoyerMoore( unsigned char *data, unsigned int dataLength, unsigned char *string, unsigned int strLength ) +{ + unsigned int skipTable[256], i; + unsigned char *search; + register unsigned char lastChar; + + if (strLength == 0) + return NULL; + + for (i = 0; i < 256; i++) + skipTable[i] = strLength; + search = string; + i = --strLength; + do { + skipTable[*search++] = i; + } while (i--); + lastChar = *--search; + search = data + strLength; + dataLength -= strLength+(strLength-1); + while ((int)dataLength > 0 ) { + unsigned int skip; + skip = skipTable[*search]; + search += skip; + dataLength -= skip; + skip = skipTable[*search]; + search += skip; + dataLength -= skip; + skip = skipTable[*search]; + if (*search != lastChar) { + search += skip; + dataLength -= skip; + continue; + } + i = strLength; + do { + if (i-- == 0) return search; + } while (*--search == string[i]); + search += (strLength - i + 1); + dataLength--; + } + return NULL; +} + +int main() +{ + char *data; + int i; + clock_t t; + t = clock(); + data = gen_data(str_len, alphabet); + fprintf(stderr, "Generate data in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + { + t = clock(); srand48(1331); + for (i = 0; i < repeat; ++i) { + int y = lrand48() % (str_len - pat_len); + char *ret; + ret = kmemmem(data, str_len, data + y, pat_len, 0); +// printf("%d, %d\n", (int)(ret - data), y); + } + fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + if (1) { + t = clock(); srand48(1331); + for (i = 0; i < repeat; ++i) { + int y = lrand48() % (str_len - pat_len); + char *ret; + ret = BoyerMoore(data, str_len, data + y, pat_len); +// printf("%d, %d\n", (int)(ret - data), y); + } + fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } +#ifdef HAVE_STRNSTR + if (1) { + char *tmp; + t = clock(); srand48(1331); + tmp = calloc(pat_len+1, 1); + for (i = 0; i < repeat; ++i) { + int y = lrand48() % (str_len - pat_len); + char *ret; + memcpy(tmp, data + y, pat_len); + ret = strnstr(data, tmp, str_len); + } + fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } +#endif +#ifdef HAVE_MEMMEM + if (1) { + t = clock(); srand48(1331); + for (i = 0; i < repeat; ++i) { + int y = lrand48() % (str_len - pat_len); + char *ret; + ret = memmem(data, str_len, data + y, pat_len); +// printf("%d, %d\n", (int)(ret - data), y); + } + fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } +#endif + return 0; +} -- 2.47.3