diff mbox

[OSSTEST,7/7] ms-ownerdaemon: Cope with db restart. Retry recording dead tasks.

Message ID 1452195496-16016-8-git-send-email-ian.jackson@eu.citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ian Jackson Jan. 7, 2016, 7:38 p.m. UTC
In chan-destroy-stuff, instead of accessing the db directly, add the
dead task(s) to a queue, and arrange (via `after idle') for that queue
to be processed by the new proc record-dead-tasks.

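Errors are handled by setting an `after' handler before starting the
db transaction; we cancel it if we are successful.  The handler fires
after OwnerDaemonDbRetry seconds (a new config setting, which defaults
to QueueDaemonRetry).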

The after handler first queues another attempt to run the queue (which
ensures that a further retry will occur if things are still broken)
and then attempts to reconnect to the database.

I have tested this with a test instance by renaming the `tasks' table
under its feet, and it functions as expected.

DEPLOYMENT NOTE: The owner daemon cannot be restarted without shutting
everything down.  So this update should first be deployed in
Cambridge, probably, to see how it goes.  Also, it is less critical in
the main Xen production test lab because there the db and the owner
daemon are co-hosted on the same VM.

Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
---
 Osstest/Executive.pm |    1 +
 ms-ownerdaemon       |   37 +++++++++++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)
diff mbox

Patch

diff --git a/Osstest/Executive.pm b/Osstest/Executive.pm
index 2314577..d31fafb 100644
--- a/Osstest/Executive.pm
+++ b/Osstest/Executive.pm
@@ -113,6 +113,7 @@  augmentconfigdefaults(
 augmentconfigdefaults(
     OwnerDaemonHost => $c{ControlDaemonHost},
     QueueDaemonHost => $c{ControlDaemonHost},
+    OwnerDaemonDbRetry => $c{QueueDaemonRetry},
 );
 
 #---------- configuration reader etc. ----------
diff --git a/ms-ownerdaemon b/ms-ownerdaemon
index 502dcfe..318549a 100755
--- a/ms-ownerdaemon
+++ b/ms-ownerdaemon
@@ -22,16 +22,37 @@ 
 source ./tcl/daemonlib.tcl
 
 
+set dead_tasks {}
+
 proc chan-destroy-stuff {chan} {
+    global dead_tasks
+
     upvar #0 chanawait($chan) await
     catch { unset await }
 
     upvar #0 chantasks($chan) tasks
     if {![info exists tasks]} return
 
+    puts-chan-desc $chan "-- $tasks"
+
+    foreach task $tasks {
+	lappend dead_tasks $task
+    }
+    after idle record-dead-tasks
+}
+
+proc record-dead-tasks {} {
+    global c dead_tasks
+
+    if {![llength $dead_tasks]} return
+
+    puts "record-dead-tasks ... $dead_tasks"
+
+    set retry [expr {$c(OwnerDaemonDbRetry) * 1000}]
+    set eafter [after $retry record-dead-tasks-retry]
+
     jobdb::transaction resources {
-        puts-chan-desc $chan "-- $tasks"
-        foreach task $tasks {
+        foreach task $dead_tasks {
             jobdb::db-execute "
                 UPDATE tasks
                    SET live = 'f'
@@ -39,12 +60,20 @@  proc chan-destroy-stuff {chan} {
             "
         }
     }
-    puts-chan-desc $chan "== $tasks"
-    unset tasks
 
+    after cancel $eafter
+    puts "record-dead-tasks OK. $dead_tasks"
+    set dead_tasks {}
     after idle await-endings-notify
 }
 
+proc record-dead-tasks-retry {} {
+    after idle record-dead-tasks
+    puts "** reconnecting/retrying **"
+    catch { jobdb::db-close }
+    jobdb::db-open
+}
+
 proc await-endings-notify {} {
     global chanawait
     foreach chan [array names chanawait] {