commit da767eaaf6128eccd349a54ec6eac2a68dcacacb
Author: Willy Tarreau <w@1wt.eu>
Date:   Wed Jul 31 19:15:45 2019 +0200

    BUG/MINOR: debug: fix a small race in the thread dumping code
    
    If a thread dump is requested from a signal handler, it may interrupt
    a thread already waiting for a dump to complete, and may see the
    threads_to_dump variable go to zero while others are waiting, steal
    the lock and prevent other threads from ever completing. This tends
    to happen when dumping many threads upon a watchdog timeout, and it
    affects the threads that are still waiting for their turn.
    
    Instead now we proceed in two steps :
      1) the last dumped thread sets all bits again
      2) all threads only wait for their own bit to appear, then clear it
         and quit
    
    This way there's no risk that a bit performs a double flip in the same
    loop and threads cannot get stuck here anymore.
    
    This should be backported to 2.0 as it clarifies stack traces.
    
    (cherry picked from commit c07736209db764fb2aef6f18ed3687a504c35771)
    Signed-off-by: Willy Tarreau <w@1wt.eu>

diff --git a/src/debug.c b/src/debug.c
index 059bc6b9..07624ca5 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -440,8 +440,8 @@ void debug_handler(int sig, siginfo_t *si, void *arg)
 	 *   1- wait for our turn, i.e. when all lower bits are gone.
 	 *   2- perform the action if our bit is set
 	 *   3- remove our bit to let the next one go, unless we're
-	 *      the last one and have to put them all but ours
-	 *   4- wait for zero and clear our bit if it's set
+	 *      the last one and have to set them all again as a signal
+	 *   4- wait for our bit to re-appear, then clear it and quit.
 	 */
 
 	/* wait for all previous threads to finish first */
@@ -454,7 +454,7 @@ void debug_handler(int sig, siginfo_t *si, void *arg)
 			ha_thread_dump(thread_dump_buffer, tid, thread_dump_tid);
 		if ((threads_to_dump & all_threads_mask) == tid_bit) {
 			/* last one */
-			HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask & ~tid_bit);
+			HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask);
 			thread_dump_buffer = NULL;
 		}
 		else
@@ -462,14 +462,13 @@ void debug_handler(int sig, siginfo_t *si, void *arg)
 	}
 
 	/* now wait for all others to finish dumping. The last one will set all
-	 * bits again to broadcast the leaving condition.
+	 * bits again to broadcast the leaving condition so we'll see ourselves
+	 * present again. This way the threads_to_dump variable never passes to
+	 * zero until all visitors have stopped waiting.
 	 */
-	while (threads_to_dump & all_threads_mask) {
-		if (threads_to_dump & tid_bit)
-			HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
-		else
-			ha_thread_relax();
-	}
+	while (!(threads_to_dump & tid_bit))
+		ha_thread_relax();
+	HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
 
 	/* mark the current thread as stuck to detect it upon next invocation
 	 * if it didn't move.