From: Heng Li <lh3@live.co.uk>
Date: Fri, 6 May 2011 02:21:29 +0000 (-0400)
Subject: a heuristic to relax the 255 max score limit
X-Git-Tag: ksprintf-final~35
X-Git-Url: http://www.kaiwu.me/postgresql/commit/static/gitweb.js?a=commitdiff_plain;h=3a2145ef86b4c20e7a724b6fc9a92bd4326a4a2e;p=klib.git

a heuristic to relax the 255 max score limit
---

diff --git a/ksw.c b/ksw.c
index fe83a11..7cb5d89 100644
--- a/ksw.c
+++ b/ksw.c
@@ -79,8 +79,8 @@ ksw_query_t *ksw_qinit(int p, int qlen, const uint8_t *query, int m, const int8_
 
 int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, unsigned _o, unsigned _e) // the first gap costs -(_o+_e)
 {
-	int slen, i, score;
-	__m128i zero, gapo, gape, shift, gmax, *H0, *H1, *E;
+	int slen, i, sum, score;
+	__m128i zero, gapo, gape, shift, gmax, reduce, thres, *H0, *H1, *E;
 
 #define __max_16(ret, xx) do { \
 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
@@ -95,6 +95,8 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, unsigned _o, un
 	gapo = _mm_set1_epi8(_o + _e);
 	gape = _mm_set1_epi8(_e);
 	shift = _mm_set1_epi8(q->shift);
+	thres = _mm_set1_epi8(254 - q->mdiff); // when max score exceeds this, shift all scores by "reduce" below
+	reduce = _mm_set1_epi8(127);
 	H0 = q->H0; H1 = q->H1; E = q->E;
 	slen = q->slen;
 	for (i = 0; i < slen; ++i) {
@@ -102,7 +104,7 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, unsigned _o, un
 		_mm_store_si128(H0 + i, zero);
 	}
 	// the core loop
-	for (i = 0; i < tlen; ++i) {
+	for (i = 0, sum = 0; i < tlen; ++i) {
 		int j, k, cmp;
 		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
@@ -133,7 +135,6 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, unsigned _o, un
 			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
 		}
 		gmax = _mm_max_epu8(gmax, max); // NB: H(i,j) updated in the lazy-F loop cannot exceed max
-		//for (score=0;score<16;++score)printf("%d ", ((int8_t*)&gmax)[score]);printf("\n");
 		// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
 		for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3
 			f = _mm_slli_si128(f, 1);
@@ -148,10 +149,23 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, unsigned _o, un
 			}
 		}
 end_loop:
+		if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(gmax, thres), zero)) != 0xffff) { // reduce
+			/* When overflow is going to happen, subtract 127 from all scores. This heuristic
+			 * may miss the best alignment, but in practice this should happen very rarely. */
+			sum += 127;
+			gmax = _mm_subs_epu8(gmax, reduce);
+			//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&thres)[k]);printf("\n");
+			for (j = 0; LIKELY(j < slen); ++j) {
+				h = _mm_subs_epu8(_mm_load_si128(H1 + j), reduce);
+				_mm_store_si128(H1 + j, h);
+				e = _mm_subs_epu8(_mm_load_si128(E + j), reduce);
+				_mm_store_si128(E + j, e);
+			}
+		}
 		S = H1; H1 = H0; H0 = S; // swap H0 and H1
 	}
 	__max_16(score, gmax);
-	return score;
+	return score + sum;
 }
 
 /*******************************************