We had situations where the manager would start workers on the same job, either because of race conditions or because at the time of queueing it wasn't known that the jobs were targeting the same device (due to device aliases). This commit removes duplicate jobs, reduces the need for locking on the job queue, and makes use of lldpRemChassisId to try to deduplicate jobs before they are started. In effect we have several attempts to prevent duplicate jobs: (1) at neighbor discovery time we try to skip queueing the same lldpRemChassisId; (2) at job selection we 'error out' jobs with the same profile as the job selected; (3) at job selection we check for a running job with the same profile as the one selected; (4) the job manager process also checks for duplicate job profiles; (5) at job lock we abort if the job was 'errored out'. Altogether this seems to work well. A test on a large university network of 303 devices (four core routers and the rest edge routers, running VRF with many duplicate identities), ~1200 subnets, ~50k hosts, resulted in no DB deadlock or contention and a complete discover+arpnip+macsuck (909 jobs) in ~3 minutes (with ~150 duplicate jobs identified and skipped).
85 lines
2.6 KiB
Perl
85 lines
2.6 KiB
Perl
package App::Netdisco::Worker::Plugin::Discover::CanonicalIP;
|
||
|
||
use Dancer ':syntax';
|
||
use App::Netdisco::Worker::Plugin;
|
||
use aliased 'App::Netdisco::Worker::Status';
|
||
|
||
use App::Netdisco::Transport::SNMP ();
|
||
use App::Netdisco::Util::Permission 'check_acl_only';
|
||
use App::Netdisco::Util::DNS 'ipv4_from_hostname';
|
||
use Dancer::Plugin::DBIC 'schema';
|
||
|
||
# Discover worker (phase "main", SNMP driver) that decides whether the stored
# device should be renumbered to a different IP address, and performs the
# renumber inside a single database transaction.
#
# Candidate addresses, in order of evaluation (a later match overrides an
# earlier one):
#   1. with the 'reverse_sysname' setting enabled, the IPv4 address obtained
#      from ipv4_from_hostname() for the name reported over SNMP;
#   2. the first device alias (ordered by alias) accepted by a
#      'device_identity' map, where the map key is checked against the device
#      and the map value against the alias.
# A candidate is only accepted when an SNMP test connection to it succeeds.
#
# Returns nothing when the device is not in storage or no renumber is needed,
# Status->defer when no SNMP reader can be obtained, and Status->noop after a
# successful renumber. A failed renumber dies inside the transaction.
register_worker({ phase => 'main', driver => 'snmp' }, sub {
  my ($job, $workerconf) = @_;

  my $device = $job->device;
  return unless $device->in_storage;

  my $snmp = App::Netdisco::Transport::SNMP->reader_for($device)
    or return Status->defer("discover failed: could not SNMP connect to $device");

  my $old_ip = $device->ip;
  my $new_ip = $old_ip;
  my $revofname = ipv4_from_hostname($snmp->name);

  if (setting('reverse_sysname') and $revofname) {
    # Probe the candidate address itself: probing $new_ip here would only
    # re-test the address we are already talking to.
    if (App::Netdisco::Transport::SNMP->test_connection( $revofname )) {
      $new_ip = $revofname;
    }
    else {
      debug sprintf ' [%s] device - cannot renumber to %s - SNMP connect failed',
        $old_ip, $revofname;
    }
  }

  # Normalise the setting so that an undefined value means "no maps" and a
  # single non-list value is treated as a one-element list, instead of dying
  # on the array dereference.
  my $identity = setting('device_identity');
  my @idmaps = (ref [] eq ref $identity) ? @$identity
             : defined $identity         ? ($identity)
             :                             ();

  if (scalar @idmaps) {
    my $devips = $device->device_ips->order_by('alias');

    ALIAS: while (my $alias = $devips->next) {
      next if $alias->alias eq $old_ip;

      foreach my $map (@idmaps) {
        # only hash-style maps are understood
        next unless ref {} eq ref $map;

        foreach my $key (sort keys %$map) {
          # lhs matches device, rhs matches device_ip
          if (check_acl_only($device, $key)
              and check_acl_only($alias, $map->{$key})) {

            if (App::Netdisco::Transport::SNMP->test_connection( $alias->alias )) {
              $new_ip = $alias->alias;
              last ALIAS;
            }
            else {
              debug sprintf ' [%s] device - cannot renumber to %s - SNMP connect failed',
                $old_ip, $alias->alias;
            }
          }
        }
      }
    } # ALIAS
  }

  return if $new_ip eq $old_ip;

  schema('netdisco')->txn_do(sub {
    # delete target device with the same vendor and serial number
    schema('netdisco')->resultset('Device')->search({
      ip => $new_ip, vendor => $device->vendor, serial => $device->serial,
    })->delete;

    # if target device exists then this will die
    $device->renumber($new_ip)
      or die "cannot renumber to: $new_ip"; # rollback

    # is not done in renumber but required otherwise confusing at job end!
    # Skipped when the job has no usable id (presumably a job that was not
    # taken from the queue table - TODO confirm), as there is then no row to
    # update and the lookup would otherwise abort the renumber.
    schema('netdisco')->resultset('Admin')
      ->find({job => $job->id})->update({device => $new_ip})
        if $job->id;

    return Status->noop(sprintf ' [%s] device - changed IP to %s (%s)',
      $old_ip, $device->ip, ($device->dns || ''));
  });
});
|
||
|
||
# A Perl module file must finish by evaluating to a true value.
true;
|