Files
netdisco/lib/App/Netdisco/JobQueue/PostgreSQL.pm
Oliver Gorwits 9a72d7e74a Avoid lock/defer of jobs deined via ACL
This commit adds a table 'device_skip' that is used to restrict job queue
searches to avoid jobs that are not permitted on this backend via *_no ACLs,
or jobs on devices that have previously encountered multiple SNMP timeouts.

When the backend loads or a device is added, a row is added to the table if
that device should not be polled on this backend (together with the job
actions which are to be skipped/denied). When a device SNMP connect fails a
counter in the same row (or a new row) is incremented.

There is also a new report 'SNMP Connect Failures' to show the devices with
non-zero SNMP connect failure counters. A configurable limit in the setting
'max_deferrals' is used to set the threshold of no longer polling the device.

To reset the deferrals/failures count, restart the Netdisco backend (which
regenerates 'device_skip' cache entries).

Squashed commit of the following:

commit b5e32c219d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:55:14 2017 +0100

    show all failed connections in report

commit ffce3cee84
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:12:39 2017 +0100

    only resolve fqdn once

commit cc4f680f01
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:10:20 2017 +0100

    Revert "only resolve fqdn once"

    This reverts commit 3d136a54de.

commit d8d082b30e
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 20:09:05 2017 +0100

    a report to show SNMP failures

commit 3d136a54de
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 19:37:58 2017 +0100

    only resolve fqdn once

commit 4550b8a84c
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 17:27:43 2017 +0100

    skipover now implicit from deferrals/actionset; fix sql where logic with better correlation

commit b51edbccd2
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 16:11:29 2017 +0100

    only abort lock if action matches badactions

commit 415559b24f
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:56:42 2017 +0100

    set skipover true when adding to actionset

commit 1086f2c467
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:50:56 2017 +0100

    fix empty actionset

commit 31962580b8
Merge: 9b2e993e 6808133b
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:25:08 2017 +0100

    Merge branch 'og-device_skip' of github.com:netdisco/netdisco into og-device_skip

commit 6808133bdb
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:19:54 2017 +0100

    in-job checks for acls are required for netdisco-do foreground actions

commit 3944dd7813
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 13:18:30 2017 +0100

    avoid extra device lookup

commit 9b2e993e0f
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 12:31:36 2017 +0100

    also delete device_skip rows when deleting device

commit b55854e91d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 11:34:27 2017 +0100

    actions in device_skip table are now an array/set

commit 5e126eef07
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 09:36:33 2017 +0100

    typo

commit 44266f2767
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 09:14:25 2017 +0100

    *able checks within jobs should not be necessary with skiplist

commit e7c22e7d11
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:58:57 2017 +0100

    increment deferrals field when job is deferred

commit 88ae9c00ba
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:40:27 2017 +0100

    turn connect fail into defer

commit eac1857043
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Tue May 23 08:26:59 2017 +0100

    rename failures column to be deferrals

commit 96ed444bbb
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Mon May 22 22:52:51 2017 +0100

    set up list of jobs the backend instance should skip

commit 3a0019296d
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Mon May 22 22:01:50 2017 +0100

    separate out is_*able last_* checks

commit cf8589aba2
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Sun May 21 22:35:38 2017 +0100

    change from ignore to skip name

commit ed193356f8
Author: Oliver Gorwits <oliver@cpan.org>
Date:   Sun May 21 14:52:33 2017 +0100

    device_ignore table to track devices to skip in polling
2017-05-27 08:50:08 +01:00

304 lines
7.4 KiB
Perl
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package App::Netdisco::JobQueue::PostgreSQL;
use Dancer qw/:moose :syntax :script/;
use Dancer::Plugin::DBIC 'schema';
use App::Netdisco::Util::Device
qw/is_discoverable is_macsuckable is_arpnipable/;
use App::Netdisco::Backend::Job;
use Net::Domain 'hostfqdn';
use Module::Load ();
use Try::Tiny;
use base 'Exporter';
our @EXPORT = ();
our @EXPORT_OK = qw/
jq_getsome
jq_getsomep
jq_locked
jq_queued
jq_prime_skiplist
jq_lock
jq_defer
jq_complete
jq_log
jq_userlog
jq_insert
jq_delete
/;
our %EXPORT_TAGS = ( all => \@EXPORT_OK );
# this can take a few seconds - only do it once
our $fqdn = undef;
sub _getsome {
my ($num_slots, $where) = @_;
return () if ((!defined $num_slots) or ($num_slots < 1));
return () if ((!defined $where) or (ref {} ne ref $where));
my $fqdn ||= (hostfqdn || 'localhost');
my $jobs = schema('netdisco')->resultset('Admin');
my $rs = $jobs->search({
status => 'queued',
device => { '-not_in' =>
$jobs->skipped($fqdn, setting('workers')->{'max_deferrals'})
->columns('device')->as_query },
%$where,
}, { order_by => 'random()', rows => $num_slots });
my @returned = ();
while (my $job = $rs->next) {
push @returned, App::Netdisco::Backend::Job->new({ $job->get_columns });
}
return @returned;
}
sub jq_getsome {
return _getsome(shift,
{ action => { -in => setting('job_prio')->{'normal'} } }
);
}
sub jq_getsomep {
return _getsome(shift, {
-or => [{
username => { '!=' => undef },
action => { -in => setting('job_prio')->{'normal'} },
},{
action => { -in => setting('job_prio')->{'high'} },
}],
});
}
sub jq_locked {
my $fqdn ||= (hostfqdn || 'localhost');
my @returned = ();
my $rs = schema('netdisco')->resultset('Admin')
->search({status => "queued-$fqdn"});
while (my $job = $rs->next) {
push @returned, App::Netdisco::Backend::Job->new({ $job->get_columns });
}
return @returned;
}
sub jq_queued {
my $job_type = shift;
return schema('netdisco')->resultset('Admin')->search({
device => { '!=' => undef},
action => $job_type,
status => { -like => 'queued%' },
})->get_column('device')->all;
}
# given a device, tests if any of the primary acls applies
# returns a list of job actions to be denied/skipped on this host.
sub _get_denied_actions {
my $device = shift;
my @badactions = ();
push @badactions, ('discover', @{ setting('job_prio')->{high} })
if not is_discoverable($device);
push @badactions, (qw/macsuck nbtstat/)
if not is_macsuckable($device);
push @badactions, 'arpnip'
if not is_arpnipable($device);
return @badactions;
}
sub jq_prime_skiplist {
my $fqdn ||= (hostfqdn || 'localhost');
my @devices = schema('netdisco')->resultset('Device')->all;
my $rs = schema('netdisco')->resultset('DeviceSkip');
my %actionset = ();
foreach my $d (@devices) {
my @badactions = _get_denied_actions($d);
$actionset{$d->ip} = \@badactions if scalar @badactions;
}
schema('netdisco')->txn_do(sub {
$rs->search({ backend => $fqdn })->delete;
$rs->populate([
map {{
backend => $fqdn,
device => $_,
actionset => $actionset{$_},
}} keys %actionset
]);
});
}
sub jq_lock {
my $job = shift;
my $fqdn ||= (hostfqdn || 'localhost');
my $happy = false;
# need to handle device discovered since backend daemon started
# and the skiplist was primed. these should be checked against
# the various acls and have device_skip entry added if needed,
# and return false if it should have been skipped.
my @badactions = _get_denied_actions($job->device);
if (scalar @badactions) {
schema('netdisco')->resultset('DeviceSkip')->find_or_create({
backend => $fqdn, device => $job->device,
},{ key => 'device_skip_pkey' })->add_to_actionset(@badactions);
return false if scalar grep {$_ eq $job->action} @badactions;
}
# lock db row and update to show job has been picked
try {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('Admin')
->search({job => $job->job}, {for => 'update'})
->update({ status => "queued-$fqdn" });
return unless
schema('netdisco')->resultset('Admin')
->count({job => $job->job, status => "queued-$fqdn"});
# remove any duplicate jobs, needed because we have race conditions
# when queueing jobs of a type for all devices
schema('netdisco')->resultset('Admin')->search({
status => 'queued',
device => $job->device,
port => $job->port,
action => $job->action,
subaction => $job->subaction,
}, {for => 'update'})->delete();
$happy = true;
});
}
catch {
error $_;
};
return $happy;
}
sub jq_defer {
my $job = shift;
my $fqdn ||= (hostfqdn || 'localhost');
my $happy = false;
# note this taints all actions on the device. for example if both
# macsuck and arpnip are allowed, but macsuck fails 10 times, then
# arpnip (and every other action) will be prevented on the device.
# seeing as defer is only triggered by an SNMP connect failure, this
# behaviour seems reasonable, to me (or desirable, perhaps).
try {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('DeviceSkip')->find_or_create({
backend => $fqdn, device => $job->device,
},{ key => 'device_skip_pkey' })->increment_deferrals;
# lock db row and update to show job is available
schema('netdisco')->resultset('Admin')
->find($job->job, {for => 'update'})
->update({ status => 'queued', started => undef });
});
$happy = true;
}
catch {
error $_;
};
return $happy;
}
sub jq_complete {
my $job = shift;
my $happy = false;
# lock db row and update to show job is done/error
try {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('Admin')
->find($job->job, {for => 'update'})->update({
status => $job->status,
log => $job->log,
started => $job->started,
finished => $job->finished,
});
});
$happy = true;
}
catch {
error $_;
};
return $happy;
}
sub jq_log {
return schema('netdisco')->resultset('Admin')->search({}, {
order_by => { -desc => [qw/entered device action/] },
rows => 50,
})->with_times->hri->all;
}
sub jq_userlog {
my $user = shift;
return schema('netdisco')->resultset('Admin')->search({
username => $user,
finished => { '>' => \"(now() - interval '5 seconds')" },
})->with_times->all;
}
sub jq_insert {
my $jobs = shift;
$jobs = [$jobs] if ref [] ne ref $jobs;
my $happy = false;
try {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('Admin')->populate([
map {{
device => $_->{device},
port => $_->{port},
action => $_->{action},
subaction => ($_->{extra} || $_->{subaction}),
username => $_->{username},
userip => $_->{userip},
status => 'queued',
}} @$jobs
]);
});
$happy = true;
}
catch {
error $_;
};
return $happy;
}
sub jq_delete {
my $id = shift;
if ($id) {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('Admin')->find($id)->delete();
});
}
else {
schema('netdisco')->txn_do(sub {
schema('netdisco')->resultset('Admin')->delete();
});
}
}
true;