]> git.kaiwu.me - nginx.git/commitdiff
Better recheck of dead upstream servers.
author Maxim Dounin <mdounin@mdounin.ru>
Wed, 12 Oct 2011 14:22:48 +0000 (14:22 +0000)
committer Maxim Dounin <mdounin@mdounin.ru>
Wed, 12 Oct 2011 14:22:48 +0000 (14:22 +0000)
Previously, nginx marked a backend as live again as soon as fail_timeout
(10s by default) had passed since the last failure.  On the other hand,
detecting a dead backend takes up to 60s (proxy_connect_timeout) in the
typical "backend is down and doesn't respond to any packets" situation.
This resulted in suboptimal behaviour in that situation (up to 23% of
requests were directed to the dead backend with default settings).

More detailed description of the problem may be found here (in Russian):
http://mailman.nginx.org/pipermail/nginx-ru/2011-August/042172.html

The fix is to allow only one request after fail_timeout passes, and to
mark the backend as "live" only if this request succeeds.

Note that with the new code a backend will not be marked "live" until the
"check" request has completed, and this may take a while in some specific
workloads (e.g. streaming).  This is believed to be acceptable.

src/http/modules/ngx_http_upstream_ip_hash_module.c
src/http/ngx_http_upstream_round_robin.c
src/http/ngx_http_upstream_round_robin.h

index dffbf22b28f2970533ca465a09631433974cbbe1..4c031eb4798067088d413a2693126dd366169435 100644 (file)
@@ -185,8 +185,8 @@ ngx_http_upstream_get_ip_hash_peer(ngx_peer_connection_t *pc, void *data)
                     break;
                 }
 
-                if (now - peer->accessed > peer->fail_timeout) {
-                    peer->fails = 0;
+                if (now - peer->checked > peer->fail_timeout) {
+                    peer->checked = now;
                     break;
                 }
             }
index bb9a704b8959f3707adc7d5a25e0d8cb7620bfd0..138872c5fdfcc0e324edc700bf94c2e4a7f69d7f 100644 (file)
@@ -443,8 +443,8 @@ ngx_http_upstream_get_round_robin_peer(ngx_peer_connection_t *pc, void *data)
                             break;
                         }
 
-                        if (now - peer->accessed > peer->fail_timeout) {
-                            peer->fails = 0;
+                        if (now - peer->checked > peer->fail_timeout) {
+                            peer->checked = now;
                             break;
                         }
 
@@ -491,8 +491,8 @@ ngx_http_upstream_get_round_robin_peer(ngx_peer_connection_t *pc, void *data)
                             break;
                         }
 
-                        if (now - peer->accessed > peer->fail_timeout) {
-                            peer->fails = 0;
+                        if (now - peer->checked > peer->fail_timeout) {
+                            peer->checked = now;
                             break;
                         }
 
@@ -663,15 +663,16 @@ ngx_http_upstream_free_round_robin_peer(ngx_peer_connection_t *pc, void *data,
         return;
     }
 
+    peer = &rrp->peers->peer[rrp->current];
+
     if (state & NGX_PEER_FAILED) {
         now = ngx_time();
 
-        peer = &rrp->peers->peer[rrp->current];
-
         /* ngx_lock_mutex(rrp->peers->mutex); */
 
         peer->fails++;
         peer->accessed = now;
+        peer->checked = now;
 
         if (peer->max_fails) {
             peer->current_weight -= peer->weight / peer->max_fails;
@@ -686,6 +687,14 @@ ngx_http_upstream_free_round_robin_peer(ngx_peer_connection_t *pc, void *data,
         }
 
         /* ngx_unlock_mutex(rrp->peers->mutex); */
+
+    } else {
+
+        /* mark peer live if check passed */
+
+        if (peer->accessed < peer->checked) {
+            peer->fails = 0;
+        }
     }
 
     rrp->current++;
index a9cb257c74f06d85f4313d0849304f6112c65433..195f4d8cad567315423f538cc9ede5605f890f45 100644 (file)
@@ -23,6 +23,7 @@ typedef struct {
 
     ngx_uint_t                      fails;
     time_t                          accessed;
+    time_t                          checked;
 
     ngx_uint_t                      max_fails;
     time_t                          fail_timeout;