source: trunk/nagios-velvice/velvice.cgi @ 326

Last change on this file since 326 was 325, checked in by g7moreau, 6 years ago
  • class hoop on td and not on <a>
  • Property svn:keywords set to Id
File size: 19.6 KB
Line 
1#!/usr/bin/env perl
2#
3# 2014/05/15 Gabriel Moreau <Gabriel.Moreau@univ-grenoble-alpes.fr>
4# 2017/06/22 Gabriel Moreau - big update
5# 2018/06/25 Gabriel Moreau - make velvice generic
6#
7# velvice.cgi
8# Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France
9#
10# Need NagiosStatus http://exchange.nagios.org/directory/Addons/APIs/Perl/NagiosStatus-2Epm/details
11# Possible command http://old.nagios.org/developerinfo/externalcommands/commandlist.php
12#
13# apt-get install perl-modules libnagios-object-perl libhtml-parser-perl liburi-encode-perl libcolor-calc-perl libyaml-syck-perl
14# apt-get install libdatetime-event-recurrence-perl libdatetime-set-perl
15
16use strict;
17use warnings;
18use version; our $VERSION = version->declare('0.9.2');
19
20use CGI;
21use HTML::Entities ();
22use Nagios::StatusLog;
23use URI::Encode qw(uri_encode uri_decode);
24use Color::Calc ();
25use YAML::Syck;
26
# Read the request once, then release the CGI object.
#   $cgi_check       - decoded value of the ?check= parameter ('' when absent)
#   $cgi_script_name - URL path of this script, used for self links
#   $cgi_path        - same path without the trailing "/xxx.cgi"
my $query           = CGI->new();
# Fetch the parameter in scalar context: as a direct argument of uri_decode()
# it would be evaluated in list context, which yields an empty list when the
# parameter is missing.
my $cgi_check_raw   = $query->param('check');
# Default to '' so that a plain page view does not compare undef against the
# check patterns further down.
# NOTE(review): CGI->param presumably already returns a URL-decoded value, so
# this second decoding may be redundant - confirm before removing it.
my $cgi_check       = defined $cgi_check_raw ? uri_decode($cgi_check_raw) : '';
my $cgi_script_name = $query->script_name();
my $cgi_path        = $cgi_script_name =~ s{/[^/]+\.cgi$}{}r;
undef $query;
32
# Alert levels with a dedicated look:
#   id    - numeric rank, compared when choosing the color of a host cell
#   color - base background color before it is faded by alertcolor()
my %STATUS_DB = (
   CRITICAL => {id => 3, color => '#F88888'},
   WARNING  => {id => 2, color => '#FFFF00'},
   PENDING  => {id => 1, color => '#E0E0E0'},
   );

# Optional site configuration; every key below falls back to a default.
my $config_file = '/etc/nagios3/velvice.yml';
my $config = {};
$config = YAML::Syck::LoadFile($config_file) if -e $config_file;
# An empty file or a non-mapping document would break every lookup below.
$config = {} if ref $config ne 'HASH';
$config->{'nagios-server'}                ||= {};
$config->{'nagios-server'}{'status-file'} ||= '/var/cache/nagios3/status.dat';
$config->{'nagios-server'}{'nagios-cmd'}  ||= '/var/lib/nagios3/rw/nagios.cmd';
$config->{'nagios-server'}{'portal-url'}  ||= $cgi_path =~ s{/cgi-bin/}{/}r . '/';
$config->{'nagios-server'}{'status-cgi'}  ||= "$cgi_path/status.cgi";
$config->{'nagios-server'}{'stylesheets'} ||= $config->{'nagios-server'}{'portal-url'} =~ s{/?$}{/stylesheets}r;
$config->{'nagios-server'}{'image'}       ||= $config->{'nagios-server'}{'portal-url'} =~ s{/?$}{/image}r;
$config->{'host-mapping'}                 ||= {};
$config->{'color-downtime'}               ||= {};
# day-min and factor may legitimately be 0 (fade from the first day / never
# fade), so only an undefined value gets the default.
$config->{'color-downtime'}{'day-min'}    //=  3;
# day-max is used as a divisor in alertcolor(): 0 must fall back to the default.
$config->{'color-downtime'}{'day-max'}    ||= 50;
$config->{'color-downtime'}{'factor'}     //=  0.7;
$config->{'remote-action'}                ||= {};
$config->{'refresh'}                      ||=  0;
55
# Translate a host name through the 'host-mapping' configuration table.
#   $name   - host name to look up
#   returns - the mapped name when the table has an entry, otherwise $name itself
sub hostmapping {
   my ($name) = @_;

   my $mapping = $config->{'host-mapping'};
   return $mapping->{$name} if exists $mapping->{$name};
   return $name;
   }
61
# Age of a state change, expressed in days.
#   $time_change - epoch timestamp (seconds) of the last state change
#   returns      - elapsed time in days, formatted with one decimal
# The divisor used to be 60 * 3600 (2.5 days) although every caller displays
# the value as "days" and compares it with the day-min / day-max settings.
sub downtime {
   my ($time_change) = @_;

   my $seconds_per_day = 24 * 3600;
   return sprintf '%.1f', (time() - $time_change) / $seconds_per_day;
   }
68
# Background color for an alert, derived from its status and its age.
#   $status   - status name; a name unknown to %STATUS_DB uses '#0000FF'
#   $downtime - age of the alert in days
#   returns   - whatever Color::Calc::color_light_html gives for the base color
#               and the computed factor
sub alertcolor {
   my ($status, $downtime) = @_;

   my $fading = $config->{'color-downtime'};
   my $base   = exists $STATUS_DB{$status} ? $STATUS_DB{$status}->{'color'} : '#0000FF';

   # the first 'day-min' days keep the base color, the age is capped at 'day-max'
   my $age = $downtime - $fading->{'day-min'};
   $age = $fading->{'day-max'} if $age > $fading->{'day-max'};
   $age = 0                    if $age < 0;

   my $ratio = ($age * $fading->{'factor'}) / $fading->{'day-max'};
   return Color::Calc::color_light_html($base, $ratio);
   }
82
# Make a label unbreakable in HTML.
#   $text   - string to protect
#   returns - a copy where every whitespace character is replaced by &nbsp;
sub nosbreak {
   my $text = shift;

   my $unbreakable = $text =~ s/\s/\&nbsp;/gr;
   return $unbreakable;
   }
88
# Load the Nagios status file named by the configuration.
my @statuslog_args = (
   Filename => $config->{'nagios-server'}{'status-file'},
   Version  => 3.0,
   );
my $log = Nagios::StatusLog->new(@statuslog_args);
93
# refresh configuration
# Each 'refreshments' entry describes a weekly time slot (days, start hour,
# end hour) with its own refresh period; the first slot containing the
# current local time overrides $config->{'refresh'}.
if (exists $config->{'refreshments'}) {
   require DateTime::Event::Recurrence;
   require DateTime::SpanSet;

   # build every slot first
   my @slots;
   for my $entry (@{$config->{'refreshments'}}) {
      my $opening = DateTime::Event::Recurrence->weekly(days => $entry->{'days'}, hours => $entry->{'start'});
      my $closing = DateTime::Event::Recurrence->weekly(days => $entry->{'days'}, hours => $entry->{'end'});
      push @slots, {
         refresh => $entry->{'refresh'},
         spanset => DateTime::SpanSet->from_sets(start_set => $opening, end_set => $closing),
         };
      }

   # then keep the refresh period of the first slot that matches
   my $current = DateTime->now(time_zone => 'local');
   SLOT:
   for my $slot (@slots) {
      if ($slot->{'spanset'}->contains($current)) {
         $config->{'refresh'} = $slot->{'refresh'};
         last SLOT;
         }
      }
   }
117
# Scan every host of the status log and fill:
#   %hostdown        - host name => host status object, for hosts that are DOWN
#                      and have no service in state OK or PENDING
#   @serviceproblems - status object of every service that is not OK
#   %hostcount       - host name => {count, color, status_id, downtime} summary
#                      used for the host cells of the service table
#   @futurecheck     - declared here, filled later with delayed check commands
my %hostdown;
my @serviceproblems;
my %hostcount;
my @futurecheck;
HOST:
for my $host (sort $log->list_hosts()) {
   my $host_stat = $log->host($host);

   if ($host_stat->status eq 'DOWN') {
      # a DOWN host is only reported as down when none of its services is OK or PENDING
      my $responding = grep {
         my $state = $log->service($host, $_)->status;
         $state eq 'OK' or $state eq 'PENDING'
         } $log->list_services_on_host($host);

      if (not $responding) {
         $hostdown{$host} = $host_stat;
         next HOST;
         }
      }

   SRV:
   for my $srv ($log->list_services_on_host($host)) {
      my $problem = $log->service($host, $srv);
      my $status  = $problem->status;

      next SRV if $status eq 'OK';

      push @serviceproblems, $problem;

      my $downtime  = downtime($problem->last_state_change);
      my $color     = alertcolor($status, $downtime);
      my $status_id = exists $STATUS_DB{$status} ? $STATUS_DB{$status}->{'id'} : 0;

      # the first problem of a host creates its summary
      my $summary = $hostcount{$host} ||= {count => 0, color => $color, status_id => $status_id, downtime => $downtime};
      $summary->{'count'}++;

      # the summary is replaced only by a problem of at least the same rank AND more recent
      next SRV if $status_id < $summary->{'status_id'} or $downtime >= $summary->{'downtime'};
      $summary->{'downtime'}  = $downtime;
      $summary->{'status_id'} = $status_id;
      $summary->{'color'}     = $color;
      }
   }
159
# $now is also the base timestamp of the forced checks scheduled further down;
# $date is the local time shown in the page header (YYYY-MM-DD HH:MM, unbreakable).
my $now = time;
my ($min, $hour, $mday, $mon, $year) = (localtime $now)[1 .. 5];
my $date = nosbreak(sprintf '%04i-%02i-%02i %02i:%02i', $year + 1900, $mon + 1, $mday, $hour, $min);
165
# The whole response (HTTP header included) is accumulated in $htmlpage and
# printed in one go at the end of the script.
my $htmlpage = <<"END_HEAD_OPEN";
Content-Type: text/html

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
END_HEAD_OPEN

# automatic reload only for periods of one minute or more
if ($config->{'refresh'} > 59) {
   $htmlpage .= qq{ <meta http-equiv="Refresh" content="$config->{'refresh'}">};
   }

$htmlpage .= <<"END_HEAD_CLOSE";
 <title>Nagios  Velvice</title>
 <link rel="stylesheet"    type="text/css"  href="$config->{'nagios-server'}{'stylesheets'}/velvice.css">
 <link rel="shortcut icon" type="image/ico" href="$config->{'nagios-server'}{'image'}/favicon.ico">
</head>
<body>
<div class="header">
 <h1>
  <ul>
    <li>Nagios Velvice Alert Panel : <a href="$config->{'nagios-server'}{'portal-url'}">Core Server</a></li>
    <li><small><a id="refresh" href="$cgi_script_name">$date</a></small></li>
  </ul>
 </h1>
</div>
END_HEAD_CLOSE
190
# Number of problems per service description and per status, shown on the
# recheck buttons.
my %service_name   = ();
my %service_status = ();
for my $problem (@serviceproblems) {
   my $label = $problem->service_description;
   my $level = $problem->status;
   $service_name{$label}   += 1;
   $service_status{$level} += 1;
   }
197
# Page body, in three parts:
#   1. recheck buttons and the table of service problems; the services selected
#      by the ?check= parameter get a forced check written to the Nagios
#      command file, plus a second randomly delayed one queued in @futurecheck
#   2. the table of hosts that are down
#   3. the remote actions (shell commands to copy) built from 'remote-action'

# Accumulators filled while the service table is rendered, then consumed by
# the remote action section.
my %remote_sshdown = ();   # depend regex => {host name => number of matching broken services}
my %remote_db      = ();   # action name  => [host names concerned by the action]
my $remote_flag;           # true as soon as one action has at least one host

if (scalar @serviceproblems == 0) {
   $htmlpage .= "<p>No alert to recheck.</p>\n";
   }
else {
   # without a ?check= parameter the page is display only
   my $check = defined $cgi_check ? $cgi_check : '';

   $htmlpage .= "<p>Alert to recheck - Level:\n";
   $htmlpage .= join ",\n",
      " <span class='button'><a href='$cgi_script_name?check=all'>ALL</a><small>" . scalar(@serviceproblems) . '</small></span>',
      map(" <span class='button'><a href='$cgi_script_name?check=" . lc(uri_encode($_)) . "'>$_</a><small>$service_status{$_}</small></span>",
         sort keys %service_status);
   $htmlpage .= ".\n";
   $htmlpage .= " <br />\n";
   $htmlpage .= " Service:\n";
   $htmlpage .= join ",\n",
      map(" <span class='button'><a href='$cgi_script_name?check=" . lc(uri_encode($_)) . "'>" . nosbreak($_) . "</a><small>$service_name{$_}</small></span>",
         sort keys %service_name);
   $htmlpage .= ".\n";
   $htmlpage .= "</p>\n";

   my $nagios_cmd_file = $config->{'nagios-server'}{'nagios-cmd'};
   open my $nagios_cmd, '>>', $nagios_cmd_file or die "Can't open file $nagios_cmd_file: $!";

   my $current_host  = '';
   $htmlpage .= "<table border=\"1\">\n";
   SERVICE_PROBLEMS:
   for my $srv (@serviceproblems) {
      my $hostname = $srv->host_name;
      my $service  = $srv->service_description;
      my $status   = $srv->status;
      my $downtime = downtime($srv->last_state_change);
      # drop the leading "STATUS:" / "STATUS -" prefix of the plugin output
      my $output   = HTML::Entities::encode($srv->plugin_output) =~ s/^[A-Z_\s]+?[:-]//r;

      my $color = alertcolor($status, $downtime);
      my $stylecolor = "style='background:$color;'";
      $htmlpage .= " <tr>\n";
      if ($hostname ne $current_host) {
         # first row of a host: the two host cells span all its problem rows
         $current_host  = $hostname;
         my $rowspan    = $hostcount{$hostname}->{'count'};
         my $rowcolor   = "style='background:" . $hostcount{$hostname}->{'color'} . ";'";
         $htmlpage .= "  <td $rowcolor rowspan='$rowspan'>"
            . "<a href=\"$cgi_script_name?check=" . uri_encode($hostname) . '">&#8623;</a></td>' . "\n";
         $htmlpage .= "  <td $rowcolor class='hoop' rowspan='$rowspan'>"
            . "<a href=\"$config->{'nagios-server'}{'status-cgi'}?host=" . uri_encode($hostname) . "\">$hostname</a></td>\n";
         }

      my $bold;
      ACTION_STYLE:
      for my $act_name (keys %{$config->{'remote-action'}}) {
         my $act_regex = $config->{'remote-action'}{$act_name}{'regex'};
         my $act_style = $config->{'remote-action'}{$act_name}{'style'};
         # an empty pattern would silently reuse the last successful regex
         next ACTION_STYLE if not defined $act_regex or $act_regex eq '';
         $bold++ if $service =~ m/$act_regex/ and defined $act_style and $act_style eq 'bold';
         }
      $htmlpage .= $bold ? "  <td $stylecolor class='hoop bold'>" : "  <td $stylecolor class='hoop'>";
      $htmlpage .= "$service</td>\n";

      $htmlpage .= "  <td $stylecolor class='hoop'>$status</td>\n";
      $htmlpage .= "  <td $stylecolor class='comment'>$output</td>\n";
      $htmlpage .= "  <td $stylecolor class='days'>$downtime days</td>\n";

      # The service name is matched literally (\Q...\E): names holding regex
      # metacharacters, such as "HTTP [ssl]", could not match themselves before.
      # NOTE(review): the keywords below are substring matches, so a check on a
      # service whose name contains "all" (e.g. "Firewall") rechecks everything -
      # confirm whether this is wanted before anchoring them.
      if (($check =~ m/all/i)
            or ($check =~ m/^\Q$service\E$/i)
            or ($check =~ m/critical/i and $status eq 'CRITICAL')
            or ($check =~ m/warning/i  and $status eq 'WARNING')
            or ($check =~ m/pending/i  and $status eq 'PENDING')
            or ($check eq $hostname    and $status =~ m/^(CRITICAL|WARNING|PENDING)$/)
            ) {
         # one more second per command, so that forced checks are spread in time
         $now++;
         my $interval = $srv->next_check() - $srv->last_check() || 300; # 5 * 60 = 300
         $interval =  240 if $interval <  240;
         $interval = 3000 if $interval > 3000;
         my $future = $now + 20 + int(rand($interval - 20));

         $htmlpage .= "  <td class='checking'>" . ($future - $now) . "</td>\n";
         #$htmlpage .= " -- <b>CHECK</b> [$now/" . ($future - $now) . "]";
         printf $nagios_cmd "[%lu] SCHEDULE_FORCED_SVC_CHECK;%s;%s;%lu\n", $now, $hostname, $service, $now;
         # delay future command
         push @futurecheck, sprintf "[%lu] SCHEDULE_FORCED_SVC_CHECK;%s;%s;%lu", $future, $hostname, $service, $future;
         }

      ACTION_PUSH_AND_DEPEND:
      for my $act_name (keys %{$config->{'remote-action'}}) {
         my $act_regex  = $config->{'remote-action'}{$act_name}{'regex'};
         my $act_status = $config->{'remote-action'}{$act_name}{'status'} || 'ALL';
         my $act_depend = $config->{'remote-action'}{$act_name}{'depend'} || 'SSH';

         if (defined $act_regex and $act_regex ne ''
               and $service =~ m/$act_regex/
               and ($act_status eq 'ALL' or $status =~ m/$act_status/)) {
            $remote_db{$act_name} ||= [];
            push @{$remote_db{$act_name}}, $hostname;
            $remote_flag++;
            }

         # check depend service otherwise
         $remote_sshdown{$act_depend} ||= {};
         $remote_sshdown{$act_depend}->{$hostname}++ if $service =~ m/$act_depend/;
         }

      $htmlpage .= " </tr>\n";
      }

   $htmlpage .= "</table>\n";
   close $nagios_cmd;
   }

# host down
# Rendered outside of the service branch: down hosts contribute no service
# problem, so they were hidden whenever nothing else was broken.
if (%hostdown) {
   $htmlpage .= "<br />\n";
   $htmlpage .= "<table border='1'>\n";
   HOST_DOWN:
   for my $host (sort keys %hostdown) {
      my $host_stat = $hostdown{$host};
      my $hostname = $host_stat->host_name;
      my $downtime = downtime($host_stat->last_state_change);
      my $color = alertcolor('CRITICAL', $downtime);
      $htmlpage .= " <tr style='background:$color'>\n";
      $htmlpage .= "  <td class='hoop'><a href=\"$config->{'nagios-server'}{'status-cgi'}?host=" . uri_encode($hostname) . "\">$hostname</a></td>\n";
      my @host_service;
      for my $srv ($log->list_services_on_host($host)) {
         push @host_service, $log->service($host, $srv)->service_description;
         }
      $htmlpage .= "  <td><small>" . join(', ', @host_service) . "</small></td>\n";
      $htmlpage .= "  <td style='text-align:right;'>$downtime days</td>\n";
      $htmlpage .= " </tr>\n";
      }
   $htmlpage .= "</table>\n";
   }

# remote action
if ($remote_flag) {
   require Nagios::Object::Config;
   my $parser = Nagios::Object::Config->new();
   $parser->parse("/var/cache/nagios3/objects.cache");

   $htmlpage .= "<div class='action'>\n";
   REMOTE_ACTION:
   for my $act_name (keys %remote_db) {
      my $act_depend = $config->{'remote-action'}{$act_name}{'depend'} || 'SSH';

      # hosts whose depend service (SSH by default) is itself broken are left out
      my @action = grep !exists $remote_sshdown{$act_depend}->{$_}, @{$remote_db{$act_name}};
      next REMOTE_ACTION if not @action;

      my $remote_action = $config->{'remote-action'}{$act_name}{'command'};
      $remote_action = $config->{'remote-action'}{$act_name}{'command-one'}
         if @action == 1 and exists $config->{'remote-action'}{$act_name}{'command-one'};
      # nothing to display for an action without command
      next REMOTE_ACTION if not defined $remote_action;

      my $srv_title = $config->{'remote-action'}{$act_name}{'title'} || "Action: $act_name";
      $htmlpage .= "<h2>$srv_title</h2>\n";
      $htmlpage .= "<pre>\n";

      my @hosts;
      for my $host (@action) {
         my $object  = $parser->find_object("$host", "Nagios::Host");
         # fall back on the Nagios host name when the object cache has no usable address
         my $address = defined $object ? $object->address : undef;
         $address = $host if not defined $address or $address eq '';
         # NOTE(review): keeping the part before the first dot also truncates a numeric IP address
         push @hosts, hostmapping($address =~ s/\..*$//r);
         }
      my $hosts_list = join ' ', @hosts;

      # every %m is replaced; only <, > and & are escaped so that a shell
      # redirection in the command is displayed as written
      $htmlpage .= ' ' . HTML::Entities::encode($remote_action =~ s{\%m}{$hosts_list}gr, '<>&');
      $htmlpage .= "</pre>\n";
      }
   $htmlpage .= "</div>\n";
   }
358
# Footer with version and licence, then the complete page is sent at once.
my $footer = <<"END_FOOTER";
<hr clear="all">
<div class="footer">
 <b><a href="http://servforge.legi.grenoble-inp.fr/projects/soft-trokata/wiki/SoftWare/NagiosVelvice">Velvice</a>
   - version: $VERSION</b>
   (<a href="http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.html">online manual</a>)
   - Written by Gabriel Moreau
 <ul>
  <li>Licence GNU GPL version 2 or later and Perl equivalent</li>
  <li>Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France</li>
 </ul>
</div>
</body>
</html>
END_FOOTER
$htmlpage .= $footer;

print $htmlpage;
376
# delayed future check
# The second round of forced checks queued in @futurecheck is written to the
# Nagios command file after the page has been printed, two seconds later.
if (@futurecheck) {
   sleep 2;
   my $nagios_cmd_file = $config->{'nagios-server'}{'nagios-cmd'};
   # report the real path instead of the literal word "filename"
   open my $nagios_cmd, '>>', $nagios_cmd_file or die "Can't open file $nagios_cmd_file: $!";
   print {$nagios_cmd} "$_\n" for @futurecheck;
   # buffered write errors only show up when the handle is closed
   close $nagios_cmd or warn "Can't close file $nagios_cmd_file: $!";
   }
385
386__END__
387
388
389=head1 NAME
390
391velvice.cgi - nagios velvice alert panel
392
393=head1 USAGE
394
395 velvice.cgi
396 velvice.cgi?check=XXX
397
398
399=head1 DESCRIPTION
400
401=begin html
402
403<img width="700" alt="Nagios Velvice Alert Panel" title="Nagios Velvice Alert Panel" style="float:right" src="velvice.png" />
404
405=end html
406
407Nagios VELVICE is an acronym for "Nagios leVEL serVICE status".
408
409The Nagios web page is sometimes very graphically charged
410and does not necessarily contain the information you need at a glance.
411For example, it is quite complicated to restart controls on multiple hosts in one click.
412
413For example, a server that is down should take only one line and not one per service...
414Similarly, a service that has been down for 5 minutes or since yesterday
415has more weight than a service that has fallen for 15 days.
416
417With Velvice Panel, a broken down server takes only one line.
418Services that have been falling for a long time gradually lose their color and become pastel colors.
419
420With Velvice Panel, it is possible through a single click
421to redo a check of all services that are in the CRITICAL state.
422Similarly, it is possible to restart a check on all SSH services in breakdowns ...
423In order not to clog the Nagios server, checks are shifted by 2 seconds in time.
424
425There is also a link to the web page of the main Nagios server.
426For each computer, you have a direct link to its dedicated web page on this server.
427
428
429=head1 CONFIGURATION FILE SPECIFICATION
430
431The configuration file must be F</etc/nagios3/velvice.yml>.
432This is not a required file.
433The file is in YAML format because this is a human-readable text file style.
434Other formats could have been Plain XML, RDF, JSON... but they are much less readable.
435
436You can find in the software nagios-velvice an example of configuration:
437L<velvice.sample.yml|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.sample.yml>.
438This one is in fact the master reference specification!
439
440The main keys C<nagios-server> and C<color-downtime> have good default values.
441No secondary key is required...
The Velvice script tries hard to replace ~ with the right value automatically.
443
444 nagios-server:
445   status-file: /var/cache/nagios3/status.dat
446   nagios-cmd:  /var/lib/nagios3/rw/nagios.cmd
447   portal-url:  ~/nagios3/
448   status-cgi:  ~/cgi-bin/nagios3/status.cgi
449   stylesheets: ~/nagios3/stylesheets
450
451The background color of the faulty service line display remains stable with a bright color for at least 3 days.
452Then, it decreases and becomes pastel after 53 days with an intensity of 70% (100% is white and 0% is black).
453
454 color-downtime:
455   day-min:  3
456   day-max: 50
457   factor:   0.7
458
459With key C<host-mapping>,
460it's good to map C<localhost> to the real name of the computer (hostname).
461
462 host-mapping:
463   localhost:  srv-nagios
464   toto:       titi
465
466The only important key is C<remote-action>.
467You can affiliate as many subkeys as you want.
468Let's take an example:
469
470 remote-action:
471   oom-killer:
472     regex: ^OOM Killer
473     title:  OOM Killer
474     command:     tssh -c 'sudo rm /var/lib/nagios3/nagios_oom_killer.log' %m
475     command-one: ssh %m 'sudo rm /var/lib/nagios3/nagios_oom_killer.log'
476     depend: ^SSH
477     status: ALL
478     style: bold
479
480C<oom-killer> is just a key for your remote action.
481The regex is used to find which service has a problem...
The title is used in the result web page (not mandatory - otherwise, it will be C<Action: oom-killer>).
483The C<command> is just written on this web page.
It is your responsibility to copy and paste it into a terminal.
485For security reasons, the nagios server does not have the right to launch the command on the remote host.
The wildcard C<%m> is replaced by the list of hosts (separated by spaces).
Sometimes, the command could be different if there is only one computer (just SSH and no parallel SSH).
488If your command is based on SSH,
489you can have an SSH action only if the remote SSH is running.
490So you can make the remote action depend on the SSH service through a regular expression of your choice.
491
492The last two keys.
493The C<status> key is for CRITICAL or WARNING (or ALL).
494The key C<style> is there to mark in bold the service in error on the web page.
495
496=head1 SEE ALSO
497
498yamllint(1), ysh(1), YAML, Nagios::StatusLog, Color::Calc
499
500In Debian GNU/Linux distribution, packages for C<yamllint> and C<ysh> are:
501
502=over
503
504=item * C<yamllint> - Linter for YAML files (Python)
505
506=item * C<libyaml-shell-perl> - YAML test shell (Perl)
507
508=back
509
510
Project resources:
512
513=over
514
515=item * L<Web Site|http://servforge.legi.grenoble-inp.fr/projects/soft-trokata/wiki/SoftWare/NagiosVelvice>
516
517=item * L<Online Manual|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.html>
518
519=item * L<SVN Repository|http://servforge.legi.grenoble-inp.fr/svn/soft-trokata/trunk/nagios-velvice>
520
521=item * L<Debian Package|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/download/>
522
523=back
524
525
526=head1 VERSION
527
528$Id: velvice.cgi 325 2018-07-21 10:13:05Z g7moreau $
529
530
531=head1 AUTHOR
532
533Written by Gabriel Moreau <Gabriel.Moreau(A)univ-grenoble-alpes.fr>, LEGI UMR 5519, CNRS, Grenoble - France
534
535
536=head1 LICENSE AND COPYRIGHT
537
538Licence GNU GPL version 2 or later and Perl equivalent
539
540Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France
Note: See TracBrowser for help on using the repository browser.