Avoid lock/defer of jobs deined via ACL

This commit adds a table 'device_skip' that is used to restrict job queue
searches to avoid jobs that are not permitted on this backend via *_no ACLs,
or jobs on devices that have previously encountered multiple SNMP timeouts.

When the backend loads or a device is added, a row is added to the table if
that device should not be polled on this backend (together with the job
actions which are to be skipped/denied). When a device SNMP connect fails a
counter in the same row (or a new row) is incremented.

There is also a new report 'SNMP Connect Failures' to show the devices with
non-zero SNMP connect failure counters. A configurable limit in the setting
'max_deferrals' is used to set the threshold of no longer polling the device.

To reset the deferrals/failures count, restart the Netdisco backend (which
regenerates 'device_skip' cache entries).

Squashed commit of the following:

commit b5e32c219d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:55:14 2017 +0100

    show all failed connections in report

commit ffce3cee84
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:12:39 2017 +0100

    only resolve fqdn once

commit cc4f680f01
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:10:20 2017 +0100

    Revert "only resolve fqdn once"

    This reverts commit 3d136a54de.

commit d8d082b30e
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:09:05 2017 +0100

    a report to show SNMP failures

commit 3d136a54de
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 19:37:58 2017 +0100

    only resolve fqdn once

commit 4550b8a84c
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 17:27:43 2017 +0100

    skipover now implicit from deferrals/actionset; fix sql where logic with better correlation

commit b51edbccd2
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 16:11:29 2017 +0100

    only abort lock if action matches badactions

commit 415559b24f
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:56:42 2017 +0100

    set skipover true when adding to actionset

commit 1086f2c467
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:50:56 2017 +0100

    fix empty actionset

commit 31962580b8
Merge: 9b2e993e 6808133b
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:25:08 2017 +0100

    Merge branch 'og-device_skip' of github.com:netdisco/netdisco into og-device_skip

commit 6808133bdb
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:19:54 2017 +0100

    in-job checks for acls are required for netdisco-do foreground actions

commit 3944dd7813
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:18:30 2017 +0100

    avoid extra device lookup

commit 9b2e993e0f
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 12:31:36 2017 +0100

    also delete device_skip rows when deleting device

commit b55854e91d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 11:34:27 2017 +0100

    actions in device_skip table are now an array/set

commit 5e126eef07
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 09:36:33 2017 +0100

    typo

commit 44266f2767
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 09:14:25 2017 +0100

    *able checks within jobs should not be necessary with skiplist

commit e7c22e7d11
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:58:57 2017 +0100

    increment deferrals field when job is deferred

commit 88ae9c00ba
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:40:27 2017 +0100

    turn connect fail into defer

commit eac1857043
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:26:59 2017 +0100

    rename failures column to be deferrals

commit 96ed444bbb
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Mon May 22 22:52:51 2017 +0100

    set up list of jobs the backend instance should skip

commit 3a0019296d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Mon May 22 22:01:50 2017 +0100

    separate out is_*able last_* checks

commit cf8589aba2
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Sun May 21 22:35:38 2017 +0100

    change from ignore to skip name

commit ed193356f8
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Sun May 21 14:52:33 2017 +0100

    device_ignore table to track devices to skip in polling
This commit is contained in:
Oliver Gorwits
2017-05-27 08:50:08 +01:00
parent 47a5f40efe
commit 9a72d7e74a
21 changed files with 382 additions and 55 deletions

View File

@@ -3,7 +3,11 @@ package App::Netdisco::JobQueue::PostgreSQL;
use Dancer qw/:moose :syntax :script/;
use Dancer::Plugin::DBIC 'schema';
use App::Netdisco::Util::Device
qw/is_discoverable is_macsuckable is_arpnipable/;
use App::Netdisco::Backend::Job;
use Net::Domain 'hostfqdn';
use Module::Load ();
use Try::Tiny;
@@ -15,26 +19,35 @@ our @EXPORT_OK = qw/
jq_getsomep
jq_locked
jq_queued
jq_log
jq_userlog
jq_prime_skiplist
jq_lock
jq_defer
jq_complete
jq_log
jq_userlog
jq_insert
jq_delete
/;
our %EXPORT_TAGS = ( all => \@EXPORT_OK );
# this can take a few seconds - only do it once
our $fqdn = undef;
sub _getsome {
my ($num_slots, $where) = @_;
return () if ((!defined $num_slots) or ($num_slots < 1));
return () if ((!defined $where) or (ref {} ne ref $where));
my $rs = schema('netdisco')->resultset('Admin')
->search(
{ status => 'queued', %$where },
{ order_by => 'random()', rows => $num_slots },
);
my $fqdn ||= (hostfqdn || 'localhost');
my $jobs = schema('netdisco')->resultset('Admin');
my $rs = $jobs->search({
status => 'queued',
device => { '-not_in' =>
$jobs->skipped($fqdn, setting('workers')->{'max_deferrals'})
->columns('device')->as_query },
%$where,
}, { order_by => 'random()', rows => $num_slots });
my @returned = ();
while (my $job = $rs->next) {
@@ -61,7 +74,7 @@ sub jq_getsomep {
}
sub jq_locked {
my $fqdn = hostfqdn || 'localhost';
my $fqdn ||= (hostfqdn || 'localhost');
my @returned = ();
my $rs = schema('netdisco')->resultset('Admin')
@@ -83,26 +96,65 @@ sub jq_queued {
})->get_column('device')->all;
}
sub jq_log {
return schema('netdisco')->resultset('Admin')->search({}, {
order_by => { -desc => [qw/entered device action/] },
rows => 50,
})->with_times->hri->all;
# given a device, tests if any of the primary acls applies
# returns a list of job actions to be denied/skipped on this host.
sub _get_denied_actions {
my $device = shift;
my @badactions = ();
push @badactions, ('discover', @{ setting('job_prio')->{high} })
if not is_discoverable($device);
push @badactions, (qw/macsuck nbtstat/)
if not is_macsuckable($device);
push @badactions, 'arpnip'
if not is_arpnipable($device);
return @badactions;
}
sub jq_userlog {
my $user = shift;
return schema('netdisco')->resultset('Admin')->search({
username => $user,
finished => { '>' => \"(now() - interval '5 seconds')" },
})->with_times->all;
sub jq_prime_skiplist {
my $fqdn ||= (hostfqdn || 'localhost');
my @devices = schema('netdisco')->resultset('Device')->all;
my $rs = schema('netdisco')->resultset('DeviceSkip');
my %actionset = ();
foreach my $d (@devices) {
my @badactions = _get_denied_actions($d);
$actionset{$d->ip} = \@badactions if scalar @badactions;
}
schema('netdisco')->txn_do(sub {
$rs->search({ backend => $fqdn })->delete;
$rs->populate([
map {{
backend => $fqdn,
device => $_,
actionset => $actionset{$_},
}} keys %actionset
]);
});
}
sub jq_lock {
my $job = shift;
my $fqdn = hostfqdn || 'localhost';
my $fqdn ||= (hostfqdn || 'localhost');
my $happy = false;
# need to handle device discovered since backend daemon started
# and the skiplist was primed. these should be checked against
# the various acls and have device_skip entry added if needed,
# and return false if it should have been skipped.
my @badactions = _get_denied_actions($job->device);
if (scalar @badactions) {
schema('netdisco')->resultset('DeviceSkip')->find_or_create({
backend => $fqdn, device => $job->device,
},{ key => 'device_skip_pkey' })->add_to_actionset(@badactions);
return false if scalar grep {$_ eq $job->action} @badactions;
}
# lock db row and update to show job has been picked
try {
schema('netdisco')->txn_do(sub {
@@ -136,11 +188,23 @@ sub jq_lock {
sub jq_defer {
my $job = shift;
my $fqdn ||= (hostfqdn || 'localhost');
my $happy = false;
# note this taints all actions on the device. for example if both
# macsuck and arpnip are allowed, but macsuck fails 10 times, then
# arpnip (and every other action) will be prevented on the device.
# seeing as defer is only triggered by an SNMP connect failure, this
# behaviour seems reasonable, to me (or desirable, perhaps).
try {
# lock db row and update to show job is available
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('DeviceSkip')->find_or_create({
backend => $fqdn, device => $job->device,
},{ key => 'device_skip_pkey' })->increment_deferrals;
# lock db row and update to show job is available
schema('netdisco')->resultset('Admin')
->find($job->job, {for => 'update'})
->update({ status => 'queued', started => undef });
@@ -178,6 +242,21 @@ sub jq_complete {
return $happy;
}
sub jq_log {
return schema('netdisco')->resultset('Admin')->search({}, {
order_by => { -desc => [qw/entered device action/] },
rows => 50,
})->with_times->hri->all;
}
sub jq_userlog {
my $user = shift;
return schema('netdisco')->resultset('Admin')->search({
username => $user,
finished => { '>' => \"(now() - interval '5 seconds')" },
})->with_times->all;
}
sub jq_insert {
my $jobs = shift;
$jobs = [$jobs] if ref [] ne ref $jobs;