This commit adds a table 'device_skip' that is used to restrict job queue searches to avoid jobs that are not permitted on this backend via *_no ACLs, or jobs on devices that have previously encountered multiple SNMP timeouts. When the backend loads or a device is added, a row is added to the table if that device should not be polled on this backend (together with the job actions which are to be skipped/denied). When a device SNMP connect fails, a counter in the same row (or a new row) is incremented. There is also a new report 'SNMP Connect Failures' to show the devices with non-zero SNMP connect failure counters. A configurable limit in the setting 'max_deferrals' sets the threshold beyond which the device is no longer polled. To reset the deferrals/failures count, restart the Netdisco backend (which regenerates 'device_skip' cache entries). Squashed commit of the following: commit b5e32c219d Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:55:14 2017 +0100 show all failed connections in report commit ffce3cee84 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:12:39 2017 +0100 only resolve fqdn once commit cc4f680f01 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:10:20 2017 +0100 Revert "only resolve fqdn once" This reverts commit 3d136a54de.
commit d8d082b30e Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 20:09:05 2017 +0100 a report to show SNMP failures commit 3d136a54de Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 19:37:58 2017 +0100 only resolve fqdn once commit 4550b8a84c Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 17:27:43 2017 +0100 skipover now implicit from deferrals/actionset; fix sql where logic with better correlation commit b51edbccd2 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 16:11:29 2017 +0100 only abort lock if action matches badactions commit 415559b24f Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:56:42 2017 +0100 set skipover true when adding to actionset commit 1086f2c467 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:50:56 2017 +0100 fix empty actionset commit 31962580b8 Merge: 9b2e993e 6808133b Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:25:08 2017 +0100 Merge branch 'og-device_skip' of github.com:netdisco/netdisco into og-device_skip commit 6808133bdb Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:19:54 2017 +0100 in-job checks for acls are required for netdisco-do foreground actions commit 3944dd7813 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 13:18:30 2017 +0100 avoid extra device lookup commit 9b2e993e0f Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 12:31:36 2017 +0100 also delete device_skip rows when deleting device commit b55854e91d Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 11:34:27 2017 +0100 actions in device_skip table are now an array/set commit 5e126eef07 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 09:36:33 2017 +0100 typo commit 44266f2767 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 09:14:25 2017 +0100 *able checks within jobs should not be necessary with skiplist commit e7c22e7d11 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:58:57 2017 +0100 increment deferrals field when job is deferred
commit 88ae9c00ba Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:40:27 2017 +0100 turn connect fail into defer commit eac1857043 Author: Oliver Gorwits <oliver@cpan.org> Date: Tue May 23 08:26:59 2017 +0100 rename failures column to be deferrals commit 96ed444bbb Author: Oliver Gorwits <oliver@cpan.org> Date: Mon May 22 22:52:51 2017 +0100 set up list of jobs the backend instance should skip commit 3a0019296d Author: Oliver Gorwits <oliver@cpan.org> Date: Mon May 22 22:01:50 2017 +0100 separate out is_*able last_* checks commit cf8589aba2 Author: Oliver Gorwits <oliver@cpan.org> Date: Sun May 21 22:35:38 2017 +0100 change from ignore to skip name commit ed193356f8 Author: Oliver Gorwits <oliver@cpan.org> Date: Sun May 21 14:52:33 2017 +0100 device_ignore table to track devices to skip in polling
304 lines
7.4 KiB
Perl
304 lines
7.4 KiB
Perl
package App::Netdisco::JobQueue::PostgreSQL;
|
||
|
||
use Dancer qw/:moose :syntax :script/;
|
||
use Dancer::Plugin::DBIC 'schema';
|
||
|
||
use App::Netdisco::Util::Device
|
||
qw/is_discoverable is_macsuckable is_arpnipable/;
|
||
use App::Netdisco::Backend::Job;
|
||
|
||
|
||
use Net::Domain 'hostfqdn';
|
||
use Module::Load ();
|
||
use Try::Tiny;
|
||
|
||
# Nothing is exported by default: callers import the queue functions they
# need by name, or all of them at once through the ':all' tag.
use base 'Exporter';
our @EXPORT      = ();
our @EXPORT_OK   = qw(
    jq_getsome
    jq_getsomep
    jq_locked
    jq_queued
    jq_prime_skiplist
    jq_lock
    jq_defer
    jq_complete
    jq_log
    jq_userlog
    jq_insert
    jq_delete
);
our %EXPORT_TAGS = ( all => \@EXPORT_OK );

# this can take a few seconds - only do it once
# (package-level slot for this backend's fully qualified host name)
our $fqdn = undef;
|
||
|
||
# Fetch up to $num_slots randomly ordered jobs with status 'queued'.
#
#   $num_slots - maximum number of jobs to return; undef or a value below 1
#                yields an empty list
#   $where     - plain HASH ref of extra search conditions; it is merged in
#                after the built-in status/device conditions, so its keys
#                take precedence. Anything other than a plain HASH ref
#                yields an empty list
#
# Jobs are excluded when their device appears in the Admin resultset's
# skipped() query, which is given this backend's host name and the
# workers.max_deferrals setting.
#
# Returns a list of App::Netdisco::Backend::Job objects.
sub _getsome {
    my ($num_slots, $where) = @_;
    return () if ((!defined $num_slots) or ($num_slots < 1));
    return () if ((!defined $where) or (ref {} ne ref $where));

    # Fill the package-level $fqdn cache (declared with "our" at file scope).
    # Declaring a new lexical here with "my" would always start out undefined
    # and force the slow hostfqdn lookup on every single call.
    $fqdn ||= (hostfqdn || 'localhost');
    my $jobs = schema('netdisco')->resultset('Admin');

    my $rs = $jobs->search({
        status => 'queued',
        device => { '-not_in' =>
            $jobs->skipped($fqdn, setting('workers')->{'max_deferrals'})
                 ->columns('device')->as_query },
        %$where,
    }, { order_by => 'random()', rows => $num_slots });

    my @returned = ();
    while (my $job = $rs->next) {
        push @returned, App::Netdisco::Backend::Job->new({ $job->get_columns });
    }
    return @returned;
}
|
||
|
||
# Return up to the requested number of queued jobs whose action is listed
# in the job_prio.normal setting. The only argument is the number of slots.
sub jq_getsome {
    my $num_slots = shift;
    my %where = ( action => { -in => setting('job_prio')->{'normal'} } );
    return _getsome($num_slots, \%where);
}
|
||
|
||
# Return up to the requested number of queued priority jobs. A job
# qualifies when either:
#   - it has a username set and its action is in job_prio.normal, or
#   - its action is in job_prio.high.
# The only argument is the number of slots.
sub jq_getsomep {
    my $num_slots = shift;

    my $user_submitted = {
        username => { '!=' => undef },
        action   => { -in => setting('job_prio')->{'normal'} },
    };
    my $high_priority = {
        action => { -in => setting('job_prio')->{'high'} },
    };

    return _getsome($num_slots, { -or => [ $user_submitted, $high_priority ] });
}
|
||
|
||
# Return the jobs currently claimed by this backend, i.e. Admin rows whose
# status is "queued-<this host's fqdn>", as a list of
# App::Netdisco::Backend::Job objects.
sub jq_locked {
    # Fill the package-level $fqdn cache (declared with "our" at file scope).
    # A "my" declaration here would shadow it with a fresh undefined lexical
    # and repeat the slow hostfqdn lookup on every call.
    $fqdn ||= (hostfqdn || 'localhost');
    my @returned = ();

    my $rs = schema('netdisco')->resultset('Admin')
      ->search({status => "queued-$fqdn"});

    while (my $job = $rs->next) {
        push @returned, App::Netdisco::Backend::Job->new({ $job->get_columns });
    }
    return @returned;
}
|
||
|
||
# Return the 'device' column values of every Admin row for the given
# action that has a device set and a status beginning with 'queued'.
sub jq_queued {
    my $job_type = shift;

    my %criteria = (
        device => { '!=' => undef},
        action => $job_type,
        status => { -like => 'queued%' },
    );

    my $pending = schema('netdisco')->resultset('Admin')->search(\%criteria);
    return $pending->get_column('device')->all;
}
|
||
|
||
# Given a device, check it against each of the primary ACL tests and
# return the list of job actions to be denied/skipped on this host:
#   - not discoverable: 'discover' plus everything in job_prio.high
#   - not macsuckable:  'macsuck' and 'nbtstat'
#   - not arpnipable:   'arpnip'
# The list is empty when every test passes.
sub _get_denied_actions {
    my $device = shift;
    my @denied;

    unless (is_discoverable($device)) {
        push @denied, 'discover', @{ setting('job_prio')->{high} };
    }

    unless (is_macsuckable($device)) {
        push @denied, qw/macsuck nbtstat/;
    }

    unless (is_arpnipable($device)) {
        push @denied, 'arpnip';
    }

    return @denied;
}
|
||
|
||
# Rebuild this backend's rows in the DeviceSkip resultset.
#
# Every known device is run through _get_denied_actions(); devices with at
# least one denied action get a row (backend, device ip, actionset). Inside
# one transaction all existing rows for this backend are deleted first and
# the fresh rows inserted, so any previous rows for this backend are lost.
sub jq_prime_skiplist {
    # Fill the package-level $fqdn cache (declared with "our" at file scope).
    # A "my" declaration here would shadow it with a fresh undefined lexical
    # and repeat the slow hostfqdn lookup on every call.
    $fqdn ||= (hostfqdn || 'localhost');
    my @devices = schema('netdisco')->resultset('Device')->all;
    my $rs = schema('netdisco')->resultset('DeviceSkip');
    my %actionset = ();

    foreach my $d (@devices) {
        my @badactions = _get_denied_actions($d);
        $actionset{$d->ip} = \@badactions if scalar @badactions;
    }

    schema('netdisco')->txn_do(sub {
        $rs->search({ backend => $fqdn })->delete;
        $rs->populate([
            # the unary plus makes it unambiguous that the inner braces
            # are an anonymous hash, not a code block
            map { +{
                backend   => $fqdn,
                device    => $_,
                actionset => $actionset{$_},
            } } keys %actionset
        ]);
    });
}
|
||
|
||
# Try to claim $job for this backend.
#
# The job's device is first checked with _get_denied_actions(); any denied
# actions are recorded in this backend's DeviceSkip row, and if the job's
# own action is among them the job is refused. Otherwise, in a transaction,
# the Admin row is marked "queued-<fqdn>" and other still-'queued' rows with
# the same device/port/action/subaction are deleted.
#
# Returns true when the job was claimed; false when the action is denied,
# when the row does not show the new status afterwards, or when an
# exception was raised (it is logged via error()).
sub jq_lock {
    my $job = shift;
    # Fill the package-level $fqdn cache (declared with "our" at file scope).
    # A "my" declaration here would shadow it with a fresh undefined lexical
    # and repeat the slow hostfqdn lookup on every call.
    $fqdn ||= (hostfqdn || 'localhost');
    my $happy = false;

    # need to handle device discovered since backend daemon started
    # and the skiplist was primed. these should be checked against
    # the various acls and have device_skip entry added if needed,
    # and return false if it should have been skipped.
    my @badactions = _get_denied_actions($job->device);
    if (scalar @badactions) {
        schema('netdisco')->resultset('DeviceSkip')->find_or_create({
            backend => $fqdn, device => $job->device,
        },{ key => 'device_skip_pkey' })->add_to_actionset(@badactions);

        return false if scalar grep {$_ eq $job->action} @badactions;
    }

    # lock db row and update to show job has been picked
    try {
        schema('netdisco')->txn_do(sub {
            schema('netdisco')->resultset('Admin')
              ->search({job => $job->job}, {for => 'update'})
              ->update({ status => "queued-$fqdn" });

            # leaves only the transaction callback; $happy stays false
            return unless
              schema('netdisco')->resultset('Admin')
                ->count({job => $job->job, status => "queued-$fqdn"});

            # remove any duplicate jobs, needed because we have race conditions
            # when queueing jobs of a type for all devices
            schema('netdisco')->resultset('Admin')->search({
                status    => 'queued',
                device    => $job->device,
                port      => $job->port,
                action    => $job->action,
                subaction => $job->subaction,
            }, {for => 'update'})->delete();

            $happy = true;
        });
    }
    catch {
        error $_;
    };

    return $happy;
}
|
||
|
||
# Put $job back on the queue and count a deferral against its device.
#
# In one transaction, this backend's DeviceSkip row for the device is found
# or created and its deferrals incremented, then the Admin row is reset to
# status 'queued' with 'started' cleared.
#
# Returns true on success, false if an exception was raised (it is logged
# via error()).
sub jq_defer {
    my $job = shift;
    # Fill the package-level $fqdn cache (declared with "our" at file scope).
    # A "my" declaration here would shadow it with a fresh undefined lexical
    # and repeat the slow hostfqdn lookup on every call.
    $fqdn ||= (hostfqdn || 'localhost');
    my $happy = false;

    # note this taints all actions on the device. for example if both
    # macsuck and arpnip are allowed, but macsuck fails 10 times, then
    # arpnip (and every other action) will be prevented on the device.

    # seeing as defer is only triggered by an SNMP connect failure, this
    # behaviour seems reasonable, to me (or desirable, perhaps).

    try {
        schema('netdisco')->txn_do(sub {
            schema('netdisco')->resultset('DeviceSkip')->find_or_create({
                backend => $fqdn, device => $job->device,
            },{ key => 'device_skip_pkey' })->increment_deferrals;

            # lock db row and update to show job is available
            schema('netdisco')->resultset('Admin')
              ->find($job->job, {for => 'update'})
              ->update({ status => 'queued', started => undef });
        });
        $happy = true;
    }
    catch {
        error $_;
    };

    return $happy;
}
|
||
|
||
# Write the outcome of $job back to its Admin row: status, log, started and
# finished are copied from the job object inside a transaction, with the row
# selected "for update".
#
# Returns true on success, false if an exception was raised (it is logged
# via error()).
sub jq_complete {
    my $job = shift;
    my $ok  = false;

    # lock db row and update to show job is done/error
    try {
        schema('netdisco')->txn_do(sub {
            my $row = schema('netdisco')->resultset('Admin')
              ->find($job->job, {for => 'update'});

            $row->update({
                status   => $job->status,
                log      => $job->log,
                started  => $job->started,
                finished => $job->finished,
            });
        });
        $ok = true;
    }
    catch {
        error $_;
    };

    return $ok;
}
|
||
|
||
# Return at most 50 Admin rows, ordered descending by entered, device and
# action, passed through the resultset's with_times and hri methods.
sub jq_log {
    my %attrs = (
        order_by => { -desc => [qw/entered device action/] },
        rows     => 50,
    );

    return schema('netdisco')->resultset('Admin')
      ->search({}, \%attrs)
      ->with_times->hri->all;
}
|
||
|
||
# Return the Admin rows belonging to the given username whose 'finished'
# time lies within the last five seconds (evaluated by the database),
# passed through the resultset's with_times method.
sub jq_userlog {
    my $user = shift;

    # literal SQL, handed to the query builder by reference
    my $cutoff = \"(now() - interval '5 seconds')";

    my $recent = schema('netdisco')->resultset('Admin')->search({
        username => $user,
        finished => { '>' => $cutoff },
    });

    return $recent->with_times->all;
}
|
||
|
||
# Queue one or more jobs. Accepts either a single job spec (HASH ref) or an
# ARRAY ref of them; each spec supplies device, port, action, subaction (or
# 'extra', which takes precedence when true), username and userip. All rows
# are inserted with status 'queued' inside one transaction.
#
# Returns true on success, false if an exception was raised (it is logged
# via error()).
sub jq_insert {
    my $jobs = shift;
    $jobs = [$jobs] unless ref $jobs eq 'ARRAY';
    my $ok = false;

    try {
        schema('netdisco')->txn_do(sub {
            my @rows;
            for my $spec (@$jobs) {
                push @rows, {
                    device    => $spec->{device},
                    port      => $spec->{port},
                    action    => $spec->{action},
                    subaction => ($spec->{extra} || $spec->{subaction}),
                    username  => $spec->{username},
                    userip    => $spec->{userip},
                    status    => 'queued',
                };
            }
            schema('netdisco')->resultset('Admin')->populate(\@rows);
        });
        $ok = true;
    }
    catch {
        error $_;
    };

    return $ok;
}
|
||
|
||
# Delete jobs from the Admin resultset inside a transaction. With a true
# $id only the row found for that id is deleted; otherwise every row in the
# resultset is deleted.
sub jq_delete {
    my $id = shift;

    # choose the work up front, then run it in a single transaction
    my $removal = $id
      ? sub { schema('netdisco')->resultset('Admin')->find($id)->delete() }
      : sub { schema('netdisco')->resultset('Admin')->delete() };

    schema('netdisco')->txn_do($removal);
}
|
||
|
||
true;
|