This commit adds a table 'device_skip' that is used to restrict job queue searches, avoiding jobs that are not permitted on this backend via *_no ACLs and jobs on devices that have previously encountered multiple SNMP timeouts. When the backend loads or a device is added, a row is added to the table if that device should not be polled on this backend (together with the job actions which are to be skipped/denied). When a device SNMP connect fails, a counter in the same row (or a new row) is incremented. There is also a new report 'SNMP Connect Failures' to show the devices with non-zero SNMP connect failure counters. The configurable setting 'max_deferrals' sets the threshold after which the device is no longer polled. To reset the deferrals/failures count, restart the Netdisco backend (which regenerates 'device_skip' cache entries). Squashed commit of the following: commit b5e32c219d Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:55:14 2017 +0100 show all failed connections in report commit ffce3cee84 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:12:39 2017 +0100 only resolve fqdn once commit cc4f680f01 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:10:20 2017 +0100 Revert "only resolve fqdn once" This reverts commit 3d136a54de.
commit d8d082b30e Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:09:05 2017 +0100 a report to show SNMP failures commit 3d136a54de Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 19:37:58 2017 +0100 only resolve fqdn once commit 4550b8a84c Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 17:27:43 2017 +0100 skipover now implicit from deferrals/actionset; fix sql where logic with better correlation commit b51edbccd2 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 16:11:29 2017 +0100 only abort lock if action matches badactions commit 415559b24f Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:56:42 2017 +0100 set skipover true when adding to actionset commit 1086f2c467 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:50:56 2017 +0100 fix empty actionset commit 31962580b8 Merge: 9b2e993e 6808133b Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:25:08 2017 +0100 Merge branch 'og-device_skip' of github.com:netdisco/netdisco into og-device_skip commit 6808133bdb Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:19:54 2017 +0100 in-job checks for acls are required for netdisco-do foreground actions commit 3944dd7813 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:18:30 2017 +0100 avoid extra device lookup commit 9b2e993e0f Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 12:31:36 2017 +0100 also delete device_skip rows when deleting device commit b55854e91d Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 11:34:27 2017 +0100 actions in device_skip table are now an array/set commit 5e126eef07 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 09:36:33 2017 +0100 typo commit 44266f2767 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 09:14:25 2017 +0100 *able checks within jobs should not be necessary with skiplist commit e7c22e7d11 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:58:57 2017 +0100 increment deferrals field when job is deferred
commit 88ae9c00ba Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:40:27 2017 +0100 turn connect fail into defer commit eac1857043 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:26:59 2017 +0100 rename failures column to be deferrals commit 96ed444bbb Author: Oliver Gorwits <oliver@cpan.org> Date: Mon May 22 22:52:51 2017 +0100 set up list of jobs the backend instance should skip commit 3a0019296d Author: Oliver Gorwits <oliver@cpan.org> Date: Mon May 22 22:01:50 2017 +0100 separate out is_*able last_* checks commit cf8589aba2 Author: Oliver Gorwits <oliver@cpan.org> Date: Sun May 21 22:35:38 2017 +0100 change from ignore to skip name commit ed193356f8 Author: Oliver Gorwits <oliver@cpan.org> Date: Sun May 21 14:52:33 2017 +0100 device_ignore table to track devices to skip in polling
91 lines
2.6 KiB
Perl
91 lines
2.6 KiB
Perl
package App::Netdisco::Backend::Worker::Manager;
|
|
|
|
use Dancer qw/:moose :syntax :script/;
|
|
|
|
use List::Util 'sum';
|
|
use App::Netdisco::Util::Backend;
|
|
|
|
use Role::Tiny;
|
|
use namespace::clean;
|
|
|
|
use App::Netdisco::JobQueue
|
|
qw/jq_locked jq_getsome jq_getsomep jq_lock jq_prime_skiplist/;
|
|
|
|
# Start-up hook for the manager worker.
#
# Unless the 'no_manager' worker setting is enabled, this primes the device
# skip list and then pushes every job returned by jq_locked() back onto this
# worker's local queue.
sub worker_begin {
    my ($self) = @_;
    my $wid = $self->wid;

    # nothing to prepare when the manager role is switched off
    if (setting('workers')->{'no_manager'}) {
        return debug "mgr ($wid): no need for manager... skip begin";
    }

    debug "entering Manager ($wid) worker_begin()";

    # rebuild device skip hints
    jq_prime_skiplist();

    # requeue jobs locally
    debug "mgr ($wid): searching for jobs booked to this processing node";
    my @booked = jq_locked();
    my $count  = scalar @booked;

    if ($count) {
        info sprintf "mgr (%s): found %s jobs booked to this processing node",
            $wid, $count;

        # same enqueuep(100, ...) call that worker_body uses for
        # high priority jobs
        $self->{queue}->enqueuep(100, @booked);
    }
}
|
|
|
|
# Main loop of the manager worker.
#
# When the 'no_manager' worker setting is enabled the sub returns straight
# away. Otherwise it loops forever: on each pass it works out how many task
# slots are free, fetches candidate jobs (high priority first, then normal
# priority), locks each one via jq_lock() and copies the successfully locked
# jobs to the local queue, then sleeps before the next pass.
sub worker_body {
    my ($self) = @_;
    my $wid = $self->wid;

    if (setting('workers')->{'no_manager'}) {
        prctl sprintf('netdisco-backend: worker #%s manager: inactive', $wid);
        return debug "mgr ($wid): no need for manager... quitting";
    }

    # configured task worker count minus what is already pending locally;
    # evaluated afresh on every call so both passes see current queue state
    my $free_slots = sub {
        return parse_max_workers( setting('workers')->{tasks} )
            - $self->{queue}->pending();
    };

    while (1) {
        prctl sprintf('netdisco-backend: worker #%s manager: gathering', $wid);

        my $num_slots = $free_slots->();
        debug "mgr ($wid): getting potential jobs for $num_slots workers (HP)";

        # get some high priority jobs
        # TODO also check for stale jobs in Netdisco DB
        for my $candidate ( jq_getsomep($num_slots) ) {

            # mark job as running; skip it if the lock was not obtained
            jq_lock($candidate) or next;
            info sprintf "mgr (%s): job %s booked out for this processing node",
                $wid, $candidate->job;

            # copy job to local queue
            $self->{queue}->enqueuep(100, $candidate);
        }

        # recompute, as the high priority pass may have filled some slots
        $num_slots = $free_slots->();
        debug "mgr ($wid): getting potential jobs for $num_slots workers (NP)";

        # get some normal priority jobs
        # TODO also check for stale jobs in Netdisco DB
        for my $candidate ( jq_getsome($num_slots) ) {

            # mark job as running; skip it if the lock was not obtained
            jq_lock($candidate) or next;
            info sprintf "mgr (%s): job %s booked out for this processing node",
                $wid, $candidate->job;

            # copy job to local queue
            $self->{queue}->enqueue($candidate);
        }

        debug "mgr ($wid): sleeping now...";
        prctl sprintf('netdisco-backend: worker #%s manager: idle', $wid);

        # fall back to one second when sleep_time is unset or zero
        sleep( setting('workers')->{sleep_time} || 1 );
    }
}
|
|
|
|
1;
|