Daemon crash when restarting with in-progress jobs and many workers

2014-07-31 00:55:56 +01:00
parent 8e35a7158e
commit 2402855797
4 changed files with 19 additions and 4 deletions
--- a/Netdisco/lib/App/Netdisco/Daemon/LocalQueue.pm
+++ b/Netdisco/lib/App/Netdisco/Daemon/LocalQueue.pm
@@ -5,7 +5,7 @@ use Dancer::Plugin::DBIC 'schema';

 use base 'Exporter';
 our @EXPORT = ();
-our @EXPORT_OK = qw/ add_jobs capacity_for take_jobs reset_jobs/;
+our @EXPORT_OK = qw/ add_jobs capacity_for take_jobs reset_jobs release_jobs /;
 our %EXPORT_TAGS = ( all => \@EXPORT_OK );

 schema('daemon')->deploy;
@@ -59,4 +59,11 @@ sub reset_jobs {
        ->update({wid => 0});
 }

+# not used by workers, only the daemon when reinitializing a worker
+sub release_jobs {                        # delete the local queue entry (or entries) for one job ID
+  my ($jid) = @_;                         # $jid: the job ID to match against the "job" column
+  debug "releasing local job ID $jid";
+  $queue->search({job => $jid})->delete;  # $queue is defined elsewhere in this file; every matching row is removed
+}
+
 1;
--- a/Netdisco/lib/App/Netdisco/JobQueue/PostgreSQL.pm
+++ b/Netdisco/lib/App/Netdisco/JobQueue/PostgreSQL.pm
@@ -143,12 +143,19 @@ sub jq_lock {
  return $happy;
 }

+# The PostgreSQL engine depends on LocalQueue, which is accessed synchronously
+# via the main daemon process. This is only used by daemon workers, which can
+# use the MCE->do() method.
 sub jq_defer {
  my $job = shift;
  my $happy = false;

-  # lock db row and update to show job is available
  try {
+    # other local workers are polling the central queue, so
+    # to prevent a race, first delete the job in our local queue
+    MCE->do('release_jobs', $job->id);
+
+    # lock db row and update to show job is available
    schema('netdisco')->txn_do(sub {
      schema('netdisco')->resultset('Admin')
        ->find($job->id, {for => 'update'})