we had situations where the manager would start workers on the same job, either because of race conditions or because at the time of queueing it wasn't known that the jobs were targeting the same device (due to device aliases). this commit removes duplicate jobs, reduces the need for locking on the job queue, and makes use of lldpRemChassisId to try to deduplicate jobs before they are started. in effect we have several chances to prevent duplicate jobs: 1. at neighbor discovery time we try to skip queueing the same lldpRemChassisId 2. at job selection we 'error out' jobs with the same profile as the job selected 3. at job selection we check for a running job with the same profile as the one selected 4. the job manager process also checks for duplicate job profiles 5. at job lock we abort if the job was 'errored out' altogether, this seems to work well. a test on a large university network of 303 devices (four core routers and the rest edge routers, running VRF with many duplicate identities), ~1200 subnets, ~50k hosts, resulted in no DB deadlock or contention and a complete discover+arpnip+macsuck (909 jobs) in ~3 minutes (with ~150 duplicate jobs identified and skipped).
109 lines
3.2 KiB
Perl
109 lines
3.2 KiB
Perl
package App::Netdisco::Backend::Role::Manager;
|
|
|
|
use Dancer qw/:moose :syntax :script/;
|
|
|
|
use List::Util 'sum';
|
|
use App::Netdisco::Util::MCE;
|
|
|
|
use App::Netdisco::JobQueue
|
|
qw/jq_locked jq_getsome jq_getsomep jq_lock jq_warm_thrusters/;
|
|
|
|
use Role::Tiny;
|
|
use namespace::clean;
|
|
|
|
# One-time manager start-up: warm the job queue backend, then reclaim
# any jobs already booked to this processing node (e.g. after a restart)
# and requeue them on the local MCE queue at high priority.
sub worker_begin {
  my $self = shift;
  my $wid  = $self->wid;

  # a node configured without a manager has nothing to do here
  if (setting('workers')->{'no_manager'}) {
      return debug "mgr ($wid): no need for manager... skip begin";
  }

  debug "entering Manager ($wid) worker_begin()";

  # job queue initialisation
  jq_warm_thrusters;

  # requeue jobs locally: anything still locked to this node was
  # in-flight when we last stopped
  debug "mgr ($wid): searching for jobs booked to this processing node";
  my @booked = jq_locked;
  return unless scalar @booked;

  info sprintf "mgr (%s): found %s jobs booked to this processing node",
    $wid, scalar @booked;

  # priority 100 so reclaimed work runs ahead of newly fetched jobs
  $self->{queue}->enqueuep(100, @booked);
}
|
|
|
|
# Creates a 'signature' for each job so that duplicates can be detected.
# Duplicates happen from time to time due to the distributed nature of the
# job queue and manager(s) - and it is kinder to the DB to skip such jobs
# here rather than let them reach jq_lock().
# Fields are joined with chr(28) (ASCII File Separator) so the signature
# cannot collide with ordinary field content. When the job carries a
# device_key that is used in preference to the device address.
my $memoize = sub {
  no warnings 'uninitialized';
  my $job = shift;

  my @parts = (qw/action port subaction/,
    ($job->{device_key} ? 'device_key' : 'device'));

  return join chr(28), map { $job->{$_} } @parts;
};
|
|
|
|
# Take up to $slots jobs from the central queue via $getter, skipping any
# whose signature was already seen in this polling round ($seen hashref),
# lock each claimed job centrally, and copy it to the local MCE queue.
# $high_pri selects priority enqueue (used for the high priority fetch).
# Lexical coderef (not a sub) so Role::Tiny does not compose it into
# consuming classes.
my $claim_jobs = sub {
  my ($self, $wid, $getter, $slots, $seen, $high_pri) = @_;

  foreach my $job ( $getter->($slots) ) {
    # skip duplicates seen in this round - cheaper than letting
    # jq_lock() discover the conflict in the DB
    next if $seen->{ $memoize->($job) }++;

    # mark job as running; may fail if another manager won the race
    next unless jq_lock($job);
    info sprintf "mgr (%s): job %s booked out for this processing node",
      $wid, $job->id;

    # copy job to local queue
    if ($high_pri) { $self->{queue}->enqueuep(100, $job) }
    else           { $self->{queue}->enqueue($job) }
  }
};

# Main manager loop: forever, work out how many local worker slots are
# free, claim that many jobs from the central queue (high priority first,
# then normal), then sleep. Duplicate job signatures within one round are
# skipped before any DB locking is attempted.
sub worker_body {
  my $self = shift;
  my $wid = $self->wid;

  if (setting('workers')->{'no_manager'}) {
    prctl sprintf 'nd2: #%s mgr: inactive', $wid;
    return debug "mgr ($wid): no need for manager... quitting";
  }

  while (1) {
    prctl sprintf 'nd2: #%s mgr: gathering', $wid;
    my %seen_job = ();

    my $num_slots = parse_max_workers( setting('workers')->{tasks} )
                      - $self->{queue}->pending();
    # the local queue can hold more than the configured slots (e.g. after
    # requeueing booked jobs); never ask the DB for a negative number
    $num_slots = 0 if $num_slots < 0;
    debug "mgr ($wid): getting potential jobs for $num_slots workers (HP)";

    # get some high priority jobs
    # TODO also check for stale jobs in Netdisco DB
    $claim_jobs->($self, $wid, \&jq_getsomep, $num_slots, \%seen_job, 1);

    # recompute: the HP fetch above may have filled some slots
    $num_slots = parse_max_workers( setting('workers')->{tasks} )
                   - $self->{queue}->pending();
    $num_slots = 0 if $num_slots < 0;
    debug "mgr ($wid): getting potential jobs for $num_slots workers (NP)";

    # get some normal priority jobs
    # TODO also check for stale jobs in Netdisco DB
    $claim_jobs->($self, $wid, \&jq_getsome, $num_slots, \%seen_job, 0);

    debug "mgr ($wid): sleeping now...";
    prctl sprintf 'nd2: #%s mgr: idle', $wid;
    sleep( setting('workers')->{sleep_time} || 1 );
  }
}
|
|
|
|
1;
|