Added the khmm library

author Attractive Chaos <attractor@live.co.uk>

Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)

committer Attractive Chaos <attractor@live.co.uk>

Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)
author Attractive Chaos <attractor@live.co.uk>
Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)
committer Attractive Chaos <attractor@live.co.uk>
Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)
diff --git a/khmm.c b/khmm.c

new file mode 100644 (file)

index 0000000..711ade5
--- /dev/null
+++ b/khmm.c
@@ -0,0 +1,423 @@
+#include <math.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include "khmm.h"
+
+// new/delete hmm_par_t
+
+hmm_par_t *hmm_new_par(int m, int n)
+{
+       hmm_par_t *hp;
+       int i;
+       assert(m > 0 && n > 0);
+       hp = (hmm_par_t*)calloc(1, sizeof(hmm_par_t));
+       hp->m = m; hp->n = n;
+       hp->a0 = (FLOAT*)calloc(n, sizeof(FLOAT));
+       hp->a = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
+       hp->e = (FLOAT**)calloc2(m + 1, n, sizeof(FLOAT));
+       hp->ae = (FLOAT**)calloc2((m + 1) * n, n, sizeof(FLOAT));
+       for (i = 0; i != n; ++i) hp->e[m][i] = 1.0;
+       return hp;
+}
+void hmm_delete_par(hmm_par_t *hp)
+{
+       int i;
+       if (hp == 0) return;
+       for (i = 0; i != hp->n; ++i) free(hp->a[i]);
+       for (i = 0; i <= hp->m; ++i) free(hp->e[i]);
+       for (i = 0; i < (hp->m + 1) * hp->n; ++i) free(hp->ae[i]);
+       free(hp->a); free(hp->e); free(hp->a0); free(hp->ae);
+       free(hp);
+}
+
+// new/delete hmm_data_t
+
+hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp)
+{
+       hmm_data_t *hd;
+       hd = (hmm_data_t*)calloc(1, sizeof(hmm_data_t));
+       hd->L = L;
+       hd->seq = (char*)malloc(L + 1);
+       memcpy(hd->seq + 1, seq, L);
+       return hd;
+}
+void hmm_delete_data(hmm_data_t *hd)
+{
+       int i;
+       if (hd == 0) return;
+       for (i = 0; i <= hd->L; ++i) {
+               if (hd->f) free(hd->f[i]);
+               if (hd->b) free(hd->b[i]);
+       }
+       free(hd->f); free(hd->b); free(hd->s); free(hd->v); free(hd->p); free(hd->seq);
+       free(hd);
+}
+
+// new/delete hmm_exp_t
+
+hmm_exp_t *hmm_new_exp(const hmm_par_t *hp)
+{
+       hmm_exp_t *he;
+       assert(hp);
+       he = (hmm_exp_t*)calloc(1, sizeof(hmm_exp_t));
+       he->m = hp->m; he->n = hp->n;
+       he->A0 = (FLOAT*)calloc(hp->n, sizeof(FLOAT));
+       he->A = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
+       he->E = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
+       return he;
+}
+void hmm_delete_exp(hmm_exp_t *he)
+{
+       int i;
+       if (he == 0) return;
+       for (i = 0; i != he->n; ++i) free(he->A[i]);
+       for (i = 0; i <= he->m; ++i) free(he->E[i]);
+       free(he->A); free(he->E); free(he->A0);
+       free(he);
+}
+
+// Viterbi algorithm
+
+FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd)
+{
+       FLOAT **la, **le, *preV, *curV, max;
+       int **Vmax, max_l; // backtrace matrix
+       int k, l, b, u;
+       
+       if (hd->v) free(hd->v);
+       hd->v = (int*)calloc(hd->L+1, sizeof(int));
+       la = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
+       le = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
+       Vmax = (int**)calloc2(hd->L+1, hp->n, sizeof(int));
+       preV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
+       curV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
+       for (k = 0; k != hp->n; ++k)
+               for (l = 0; l != hp->n; ++l)
+                       la[k][l] = log(hp->a[l][k]); // this is not a bug
+       for (b = 0; b != hp->m; ++b)
+               for (k = 0; k != hp->n; ++k)
+                       le[b][k] = log(hp->e[b][k]);
+       for (k = 0; k != hp->n; ++k) le[hp->m][k] = 0.0;
+       // V_k(1)
+       for (k = 0; k != hp->n; ++k) {
+               preV[k] = le[(int)hd->seq[1]][k] + log(hp->a0[k]);
+               Vmax[1][k] = 0;
+       }
+       // all the rest
+       for (u = 2; u <= hd->L; ++u) {
+               FLOAT *tmp, *leu = le[(int)hd->seq[u]];
+               for (k = 0; k != hp->n; ++k) {
+                       FLOAT *laa = la[k];
+                       for (l = 0, max = -HMM_INF, max_l = -1; l != hp->n; ++l) {
+                               if (max < preV[l] + laa[l]) {
+                                       max = preV[l] + laa[l];
+                                       max_l = l;
+                               }
+                       }
+                       assert(max_l >= 0); // cannot be zero
+                       curV[k] = leu[k] + max;
+                       Vmax[u][k] = max_l;
+               }
+               tmp = curV; curV = preV; preV = tmp; // swap
+       }
+       // backtrace
+       for (k = 0, max_l = -1, max = -HMM_INF; k != hp->n; ++k) {
+               if (max < preV[k]) {
+                       max = preV[k]; max_l = k;
+               }
+       }
+       assert(max_l >= 0); // cannot be zero
+       hd->v[hd->L] = max_l;
+       for (u = hd->L; u >= 1; --u)
+               hd->v[u-1] = Vmax[u][hd->v[u]];
+       for (k = 0; k != hp->n; ++k) free(la[k]);
+       for (b = 0; b < hp->m; ++b) free(le[b]);
+       for (u = 0; u <= hd->L; ++u) free(Vmax[u]);
+       free(la); free(le); free(Vmax); free(preV); free(curV);
+       hd->status |= HMM_VITERBI;
+       return max;
+}
+
+// forward algorithm
+
+void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd)
+{
+       FLOAT sum, tmp, **at;
+       int u, k, l;
+       int n, m, L;
+       assert(hp && hd);
+       // allocate memory for hd->f and hd->s
+       n = hp->n; m = hp->m; L = hd->L;
+       if (hd->s) free(hd->s);
+       if (hd->f) { 
+               for (k = 0; k <= hd->L; ++k) free(hd->f[k]);
+               free(hd->f);
+       }
+       hd->f = (FLOAT**)calloc2(hd->L+1, hp->n, sizeof(FLOAT));
+       hd->s = (FLOAT*)calloc(hd->L+1, sizeof(FLOAT));
+       hd->status &= ~(unsigned)HMM_FORWARD;
+       // at[][] array helps to improve the cache efficiency
+       at = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
+       // transpose a[][]
+       for (k = 0; k != n; ++k)
+               for (l = 0; l != n; ++l)
+                       at[k][l] = hp->a[l][k];
+       // f[0], but it should never be used
+       hd->s[0] = 1.0;
+       for (k = 0; k != n; ++k) hd->f[0][k] = 0.0;
+       // f[1]
+       for (k = 0, sum = 0.0; k != n; ++k)
+               sum += (hd->f[1][k] = hp->a0[k] * hp->e[(int)hd->seq[1]][k]);
+       for (k = 0; k != n; ++k) hd->f[1][k] /= sum;
+       hd->s[1] = sum;
+       // f[2..hmmL], the core loop
+       for (u = 2; u <= L; ++u) {
+               FLOAT *fu = hd->f[u], *fu1 = hd->f[u-1], *eu = hp->e[(int)hd->seq[u]];
+               for (k = 0, sum = 0.0; k != n; ++k) {
+                       FLOAT *aa = at[k];
+                       for (l = 0, tmp = 0.0; l != n; ++l) tmp += fu1[l] * aa[l];
+                       sum += (fu[k] = eu[k] * tmp);
+               }
+               for (k = 0; k != n; ++k) fu[k] /= sum;
+               hd->s[u] = sum;
+       }
+       // free at array
+       for (k = 0; k != hp->n; ++k) free(at[k]);
+       free(at);
+       hd->status |= HMM_FORWARD;
+}
+
+//  precalculate hp->ae
+
+void hmm_pre_backward(hmm_par_t *hp)
+{
+       int m, n, b, k, l;
+       assert(hp);
+       m = hp->m; n = hp->n;
+       for (b = 0; b <= m; ++b) {
+               for (k = 0; k != n; ++k) {
+                       FLOAT *p = hp->ae[b * hp->n + k];
+                       for (l = 0; l != n; ++l)
+                               p[l] = hp->e[b][l] * hp->a[k][l];
+               }
+       }
+}
+
+// backward algorithm
+
+void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd)
+{
+       FLOAT tmp;
+       int k, l, u;
+       int m, n, L;
+       assert(hp && hd);
+       assert(hd->status & HMM_FORWARD);
+       // allocate memory for hd->b
+       m = hp->m; n = hp->n; L = hd->L;
+       if (hd->b) { 
+               for (k = 0; k <= hd->L; ++k) free(hd->b[k]);
+               free(hd->b);
+       }
+       hd->status &= ~(unsigned)HMM_BACKWARD;
+       hd->b = (FLOAT**)calloc2(L+1, hp->n, sizeof(FLOAT));
+       // b[L]
+       for (k = 0; k != hp->n; ++k) hd->b[L][k] = 1.0 / hd->s[L];
+       // b[1..L-1], the core loop
+       for (u = L-1; u >= 1; --u) {
+               FLOAT *bu1 = hd->b[u+1], **p = hp->ae + (int)hd->seq[u+1] * n;
+               for (k = 0; k != n; ++k) {
+                       FLOAT *q = p[k];
+                       for (l = 0, tmp = 0.0; l != n; ++l) tmp += q[l] * bu1[l];
+                       hd->b[u][k] = tmp / hd->s[u];
+               }
+       }
+       hd->status |= HMM_BACKWARD;
+       for (l = 0, tmp = 0.0; l != n; ++l)
+               tmp += hp->a0[l] * hd->b[1][l] * hp->e[(int)hd->seq[1]][l];
+       if (tmp > 1.0 + 1e-6 || tmp < 1.0 - 1e-6) // in theory, tmp should always equal to 1
+               fprintf(stderr, "++ Underflow may have happened (%lg).\n", tmp);
+}
+
+// log-likelihood of the observation
+
+FLOAT hmm_lk(const hmm_data_t *hd)
+{
+    FLOAT sum = 0.0, prod = 1.0;
+       int u, L;
+       L = hd->L;
+       assert(hd->status & HMM_FORWARD);
+       for (u = 1; u <= L; ++u) {
+               prod *= hd->s[u];
+               if (prod < HMM_TINY || prod >= 1.0/HMM_TINY) { // reset
+                       sum += log(prod);
+                       prod = 1.0;
+               }
+       }
+       sum += log(prod);
+       return sum;
+}
+
+// posterior decoding
+
+void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd)
+{
+       int u, k;
+       assert(hd->status && HMM_BACKWARD);
+       if (hd->p) free(hd->p);
+       hd->p = (int*)calloc(hd->L + 1, sizeof(int));
+       for (u = 1; u <= hd->L; ++u) {
+               FLOAT prob, max, *fu = hd->f[u], *bu = hd->b[u], su = hd->s[u];
+               int max_k;
+               for (k = 0, max = -1.0, max_k = -1; k != hp->n; ++k) {
+                       if (max < (prob = fu[k] * bu[k] * su)) {
+                               max = prob; max_k = k;
+                       }
+               }
+               assert(max_k >= 0);
+               hd->p[u] = max_k;
+       }
+       hd->status |= HMM_POSTDEC;
+}
+
+// posterior probability of states
+
+FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob)
+{
+       FLOAT sum = 0.0, ss = hd->s[u], *fu = hd->f[u], *bu = hd->b[u];
+       int k;
+       for (k = 0; k != hp->n; ++k)
+               sum += (prob[k] = fu[k] * bu[k] * ss);
+       return sum; // in theory, this should always equal to 1.0
+}
+
+// expected counts
+
+hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd)
+{
+       int k, l, u, b, m, n;
+       hmm_exp_t *he;
+       assert(hd->status & HMM_BACKWARD);
+       he = hmm_new_exp(hp);
+       // initialization
+       m = hp->m; n = hp->n;
+       for (k = 0; k != n; ++k)
+               for (l = 0; l != n; ++l) he->A[k][l] = HMM_TINY;
+       for (b = 0; b <= m; ++b)
+               for (l = 0; l != n; ++l) he->E[b][l] = HMM_TINY;
+       // calculate A_{kl} and E_k(b), k,l\in[0,n)
+       for (u = 1; u < hd->L; ++u) {
+               FLOAT *fu = hd->f[u], *bu = hd->b[u], *bu1 = hd->b[u+1], ss = hd->s[u];
+               FLOAT *Ec = he->E[(int)hd->seq[u]], **p = hp->ae + (int)hd->seq[u+1] * n;
+               for (k = 0; k != n; ++k) {
+                       FLOAT *q = p[k], *AA = he->A[k], fuk = fu[k];
+                       for (l = 0; l != n; ++l) // this is cache-efficient
+                               AA[l] += fuk * q[l] * bu1[l];
+                       Ec[k] += fuk * bu[k] * ss;
+               }
+       }
+       // calculate A0_l
+       for (l = 0; l != n; ++l)
+               he->A0[l] += hp->a0[l] * hp->e[(int)hd->seq[1]][l] * hd->b[1][l];
+       return he;
+}
+
+FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he)
+{
+       int k, l, b;
+       FLOAT sum = 0.0;
+       for (k = 0; k != hp->n; ++k) {
+               FLOAT tmp;
+               for (b = 0, tmp = 0.0; b != hp->m; ++b) tmp += he->E[b][k];
+               for (b = 0; b != hp->m; ++b)
+                       sum += he->E[b][k] * log(he->E[b][k] / tmp);
+       }
+       for (k = 0; k != hp->n; ++k) {
+               FLOAT tmp, *A = he->A[k];
+               for (l = 0, tmp = 0.0; l != hp->n; ++l) tmp += A[l];
+               for (l = 0; l != hp->n; ++l) sum += A[l] * log(A[l] / tmp);
+       }
+       return (he->Q0 = sum);
+}
+
+// add he0 to he1
+
+void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1)
+{
+       int b, k, l;
+       assert(he0->m == he1->m && he0->n == he1->n);
+       for (k = 0; k != he1->n; ++k) {
+               he1->A0[k] += he0->A0[k];
+               for (l = 0; l != he1->n; ++l)
+                       he1->A[k][l] += he0->A[k][l];
+       }
+       for (b = 0; b != he1->m; ++b) {
+               for (l = 0; l != he1->n; ++l)
+                       he1->E[b][l] += he0->E[b][l];
+       }
+}
+
+// the EM-Q function
+
+FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he)
+{
+       FLOAT sum = 0.0;
+       int bb, k, l;
+       for (bb = 0; bb != he->m; ++bb) {
+               FLOAT *eb = hp->e[bb], *Eb = he->E[bb];
+               for (k = 0; k != hp->n; ++k) {
+                       if (eb[k] <= 0.0) return -HMM_INF;
+                       sum += Eb[k] * log(eb[k]);
+               }
+       }
+       for (k = 0; k != he->n; ++k) {
+               FLOAT *Ak = he->A[k], *ak = hp->a[k];
+               for (l = 0; l != he->n; ++l) {
+                       if (ak[l] <= 0.0) return -HMM_INF;
+                       sum += Ak[l] * log(ak[l]);
+               }
+       }
+       return (sum -= he->Q0);
+}
+
+// simulate sequence
+
+char *hmm_simulate(const hmm_par_t *hp, int L)
+{
+       int i, k, l, b;
+       FLOAT x, y, **et;
+       char *seq;
+       seq = (char*)calloc(L+1, 1);
+       // calculate the transpose of hp->e[][]
+       et = (FLOAT**)calloc2(hp->n, hp->m, sizeof(FLOAT));
+       for (k = 0; k != hp->n; ++k)
+               for (b = 0; b != hp->m; ++b)
+                       et[k][b] = hp->e[b][k];
+       // the initial state, drawn from a0[]
+       x = drand48();
+       for (k = 0, y = 0.0; k != hp->n; ++k) {
+               y += hp->a0[k];
+               if (y >= x) break;
+       }
+       // main loop
+       for (i = 0; i != L; ++i) {
+               FLOAT *el, *ak = hp->a[k];
+               x = drand48();
+               for (l = 0, y = 0.0; l != hp->n; ++l) {
+                       y += ak[l];
+                       if (y >= x) break;
+               }
+               el = et[l];
+               x = drand48();
+               for (b = 0, y = 0.0; b != hp->m; ++b) {
+                       y += el[b];
+                       if (y >= x) break;
+               } 
+               seq[i] = b;
+               k = l;
+       }
+       for (k = 0; k != hp->n; ++k) free(et[k]);
+       free(et);
+       return seq;
+}
diff --git a/khmm.h b/khmm.h

new file mode 100644 (file)

index 0000000..d87673b
--- /dev/null
+++ b/khmm.h
@@ -0,0 +1,107 @@
+#ifndef AC_SCHMM_H_
+#define AC_SCHMM_H_
+
+/*
+ * Last Modified: 2008-03-10
+ * Version: 0.1.0-8
+ *
+ * 2008-03-10, 0.1.0-8: make icc report two more "VECTORIZED"
+ * 2008-03-10, 0.1.0-7: accelerate for some CPU
+ * 2008-02-07, 0.1.0-6: simulate sequences
+ * 2008-01-15, 0.1.0-5: goodness of fit
+ * 2007-11-20, 0.1.0-4: add function declaration of hmm_post_decode()
+ * 2007-11-09: fix a memory leak
+ */
+
+#include <stdlib.h>
+
+#define HMM_VERSION "0.1.0-7"
+
+#define HMM_FORWARD  0x02
+#define HMM_BACKWARD 0x04
+#define HMM_VITERBI  0x40
+#define HMM_POSTDEC  0x80
+
+#ifndef FLOAT
+#define FLOAT double
+#endif
+#define HMM_TINY     1e-25
+#define HMM_INF      1e300
+
+typedef struct
+{
+       int m, n; // number of symbols, number of states
+       FLOAT **a, **e; // transition matrix and emitting probilities
+       FLOAT **ae; // auxiliary array for acceleration, should be calculated by hmm_pre_backward()
+       FLOAT *a0; // trasition matrix from the start state
+} hmm_par_t;
+
+typedef struct
+{
+       int L;
+       unsigned status;
+       char *seq;
+       FLOAT **f, **b, *s;
+       int *v; // Viterbi path
+       int *p; // posterior decoding
+} hmm_data_t;
+
+typedef struct
+{
+       int m, n;
+       FLOAT Q0, **A, **E, *A0;
+} hmm_exp_t;
+
+typedef struct
+{
+       int l, *obs;
+       FLOAT *thr;
+} hmm_gof_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+       /* initialize and destroy hmm_par_t */
+       hmm_par_t *hmm_new_par(int m, int n);
+       void hmm_delete_par(hmm_par_t *hp);
+       /* initialize and destroy hmm_data_t */
+       hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp);
+       void hmm_delete_data(hmm_data_t *hd);
+       /* initialize and destroy hmm_exp_t */
+       hmm_exp_t *hmm_new_exp(const hmm_par_t *hp);
+       void hmm_delete_exp(hmm_exp_t *he);
+       /* Viterbi, forward and backward algorithms */
+       FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd);
+       void hmm_pre_backward(hmm_par_t *hp);
+       void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd);
+       void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd);
+       /* log-likelihood of the observations (natural based) */
+       FLOAT hmm_lk(const hmm_data_t *hd);
+       /* posterior probability at the position on the sequence */
+       FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob);
+       /* posterior decoding */
+       void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd);
+       /* expected counts of transitions and emissions */
+       hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd);
+       /* add he0 counts to he1 counts*/
+       void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1);
+       /* the Q function that should be maximized in EM */
+       FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he);
+       FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he);
+       /* simulate sequences */
+       char *hmm_simulate(const hmm_par_t *hp, int L);
+#ifdef __cplusplus
+}
+#endif
+
+static inline void **calloc2(int n_row, int n_col, int size)
+{
+       char **p;
+       int k;
+       p = (char**)malloc(sizeof(char*) * n_row);
+       for (k = 0; k != n_row; ++k)
+               p[k] = (char*)calloc(n_col, size);
+       return (void**)p;
+}
+
+#endif
author	Attractive Chaos <attractor@live.co.uk>
	Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)
committer	Attractive Chaos <attractor@live.co.uk>
	Thu, 13 Jan 2011 17:55:01 +0000 (12:55 -0500)
khmm.c	[new file with mode: 0644]	patch \| blob
khmm.h	[new file with mode: 0644]	patch \| blob