source: trunk/nagios-velvice/velvice.cgi @ 322

Last change on this file since 322 was 322, checked in by g7moreau, 6 years ago
  • Add refreshments config for the web page
  • Property svn:keywords set to Id
File size: 19.5 KB
Line 
1#!/usr/bin/env perl
2#
3# 2014/05/15 Gabriel Moreau <Gabriel.Moreau@univ-grenoble-alpes.fr>
4# 2017/06/22 Gabriel Moreau - big update
5# 2018/06/25 Gabriel Moreau - make velvice generic
6#
7# velvice.cgi
8# Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France
9#
10# Need NagiosStatus http://exchange.nagios.org/directory/Addons/APIs/Perl/NagiosStatus-2Epm/details
11# Possible command http://old.nagios.org/developerinfo/externalcommands/commandlist.php
12#
13# apt-get install perl-modules libnagios-object-perl libhtml-parser-perl liburi-encode-perl libcolor-calc-perl libyaml-syck-perl
14# apt-get install libdatetime-event-recurrence-perl libdatetime-set-perl
15
16use strict;
17use warnings;
18use version; our $VERSION = version->declare('0.9.0');
19
20use CGI;
21use HTML::Entities ();
22use Nagios::StatusLog;
23use URI::Encode qw(uri_encode uri_decode);
24use Color::Calc ();
25use YAML::Syck;
26
# Read the only CGI parameter this script uses ("check") and work out the
# URL of the script and of the directory that contains it.
my $query           = CGI->new();
# param() is forced into scalar context: inside an argument list it would be
# called in list context, which lets a repeated "check=" parameter inject
# extra arguments into uri_decode().
my $cgi_check       = scalar $query->param('check');
# Without a "check" parameter (plain display of the panel) use an empty
# string, so that the string and regex comparisons made on $cgi_check further
# down never operate on undef.
$cgi_check          = defined $cgi_check ? uri_decode($cgi_check) : '';
my $cgi_script_name = $query->script_name();
# Same URL with the trailing "/<name>.cgi" removed.
my $cgi_path        = $cgi_script_name =~ s{/[^/]+\.cgi$}{}r;
undef $query;
32
# Alert levels known to the panel, built from (name, id, color) triples.
# "id" is the number compared when choosing the color of a host row and
# "color" is the base background color handed to alertcolor().
my %STATUS_DB = map { ($_->[0] => {id => $_->[1], color => $_->[2]}) } (
   ['CRITICAL', 3, '#F88888'],
   ['WARNING',  2, '#FFFF00'],
   ['PENDING',  1, '#E0E0E0'],
   );
38
# Load the optional YAML configuration file, then give a default value to
# every key the rest of the script reads.
my $config_file = '/etc/nagios3/velvice.yml';
my $config = {};
$config = YAML::Syck::LoadFile($config_file) if -e $config_file;
# An empty file or a document that is not a mapping would otherwise make the
# hash lookups below die under "use strict".
$config = {} if ref($config) ne 'HASH';
$config->{'nagios-server'}                ||= {};
$config->{'nagios-server'}{'status-file'} ||= '/var/cache/nagios3/status.dat';
$config->{'nagios-server'}{'nagios-cmd'}  ||= '/var/lib/nagios3/rw/nagios.cmd';
# Default portal URL: the script directory with "/cgi-bin/" collapsed to "/".
$config->{'nagios-server'}{'portal-url'}  ||= $cgi_path =~ s{/cgi-bin/}{/}r . '/';
$config->{'nagios-server'}{'status-cgi'}  ||= "$cgi_path/status.cgi";
$config->{'nagios-server'}{'stylesheets'} ||= $config->{'nagios-server'}{'portal-url'} =~ s{/?$}{/stylesheets}r;
$config->{'nagios-server'}{'image'}       ||= $config->{'nagios-server'}{'portal-url'} =~ s{/?$}{/image}r;
$config->{'host-mapping'}                 ||= {};
$config->{'color-downtime'}               ||= {};
# 0 is a meaningful value for these two keys (fade from the first day, or no
# fading at all), so only an undefined value is replaced by the default.
$config->{'color-downtime'}{'day-min'}    //=  3;
$config->{'color-downtime'}{'factor'}     //=  0.7;
# day-max is used as a divisor in alertcolor(): 0 must fall back to the default.
$config->{'color-downtime'}{'day-max'}    ||= 50;
$config->{'remote-action'}                ||= {};
$config->{'refresh'}                      ||=  0;
55
# Translate a host name through the "host-mapping" table of the configuration.
# Returns the mapped name when the table has an entry for $host, and $host
# itself otherwise.
sub hostmapping {
   my ($host) = @_;

   return $host if not exists $config->{'host-mapping'}{$host};
   return $config->{'host-mapping'}{$host};
   }
61
# Age of a state, in days.
#
# $time_change is an epoch time in seconds (callers pass the last state
# change of a host or of a service).
# Returns the time elapsed between $time_change and now, expressed in days
# and formatted with one decimal (for example '1.5').
sub downtime {
   my ($time_change) = @_;

   # The result is displayed as "days" and compared with the day-min and
   # day-max settings, so the divisor must be the length of a day
   # (24 * 3600 seconds) and not 60 * 3600.
   my $seconds_per_day = 24 * 3600;
   return sprintf '%.1f', (time() - $time_change) / $seconds_per_day;
   }
68
# Background color of an alert line.
#
# $status   - status name; names found in %STATUS_DB use their own base
#             color, any other name uses '#0000FF'.
# $downtime - age of the alert, as returned by downtime().
#
# The base color is lightened with Color::Calc in proportion to the age:
# nothing during the first "day-min" days, then linearly up to the
# configured "factor", which is reached "day-max" days later.
sub alertcolor {
   my ($status, $downtime) = @_;

   my $base = exists $STATUS_DB{$status} ? $STATUS_DB{$status}->{'color'} : '#0000FF';

   # Age counted from the end of the "same color" period, then clamped
   # to the range [0, day-max].
   my $age = $downtime - $config->{'color-downtime'}{'day-min'};
   if ($age > $config->{'color-downtime'}{'day-max'}) {
      $age = $config->{'color-downtime'}{'day-max'};
      }
   if ($age < 0) {
      $age = 0;
      }

   my $lightening = ($age * $config->{'color-downtime'}{'factor'}) / $config->{'color-downtime'}{'day-max'};
   return Color::Calc::color_light_html($base, $lightening);
   }
82
# Return a copy of $str in which every whitespace character is replaced by
# the HTML entity "&nbsp;". The argument itself is left untouched.
sub nosbreak {
   my ($str) = @_;

   (my $unbreakable = $str) =~ s/\s/\&nbsp;/g;
   return $unbreakable;
   }
88
# Open the Nagios status file named in the configuration; every host and
# service state shown by the panel is read through $log.
my $status_file = $config->{'nagios-server'}{'status-file'};
my $log = Nagios::StatusLog->new(Filename => $status_file, Version => 3.0);
93
# refresh configuration
# The optional "refreshments" key is a list of weekly time windows. Each
# entry has the keys "days", "start" and "end" (hours), and "refresh". The
# "refresh" value of the first window containing the current time replaces
# the default refresh period.
if (ref($config->{'refreshments'}) eq 'ARRAY') {
   require DateTime;
   require DateTime::Event::Recurrence;
   require DateTime::SpanSet;

   # The recurrences below carry no time zone, so they are compared in the
   # zone of $now. DateTime->now() alone is UTC, which shifts every window by
   # the UTC offset of the server.
   # NOTE(review): this assumes the hours of the configuration are written in
   # the local time of the server - confirm.
   # Fall back to UTC when the local time zone cannot be determined.
   my $now = eval { DateTime->now(time_zone => 'local') } || DateTime->now();

   SET:
   for my $set (@{$config->{'refreshments'}}) {
      my $start   = DateTime::Event::Recurrence->weekly(days => $set->{'days'}, hours => $set->{'start'});
      my $end     = DateTime::Event::Recurrence->weekly(days => $set->{'days'}, hours => $set->{'end'});
      my $spanset = DateTime::SpanSet->from_sets(start_set => $start, end_set => $end);

      next SET if not $spanset->contains($now);

      # first matching window wins
      $config->{'refresh'} = $set->{'refresh'};
      last SET;
      }
   }
117
# Scan every host of the status file and sort the problems into:
#   %hostdown        - hosts reported DOWN with no service OK or PENDING,
#   @serviceproblems - every service that is not OK on the other hosts,
#   %hostcount       - per host: number of such services and the color,
#                      status id and downtime used for the host cells.
# @futurecheck is only declared here; it is filled while the page is built.
my (%hostdown, @serviceproblems, %hostcount, @futurecheck);
HOST:
foreach my $host (sort $log->list_hosts()) {
   my $host_stat = $log->host($host);

   if ($host_stat->status eq 'DOWN') {
      # A DOWN host is only listed as down when none of its services is OK
      # or PENDING; otherwise its services are listed one by one below.
      my $has_live_service = 0;
      LIVE:
      foreach my $srv ($log->list_services_on_host($host)) {
         my $srv_status = $log->service($host, $srv)->status;
         if ($srv_status eq 'OK' or $srv_status eq 'PENDING') {
            $has_live_service = 1;
            last LIVE;
            }
         }

      if (not $has_live_service) {
         $hostdown{$host} = $host_stat;
         next HOST;
         }
      }

   SRV:
   foreach my $srv ($log->list_services_on_host($host)) {
      my $service = $log->service($host, $srv);
      my $status  = $service->status;

      next SRV if $status eq 'OK';

      push @serviceproblems, $service;

      my $downtime  = downtime($service->last_state_change);
      my $color     = alertcolor($status, $downtime);
      my $status_id = exists $STATUS_DB{$status} ? $STATUS_DB{$status}->{'id'} : 0;

      # The first problem of a host creates its summary entry.
      my $summary = $hostcount{$host}
         ||= {count => 0, color => $color, status_id => $status_id, downtime => $downtime};
      $summary->{'count'}++;

      # A status at least as high with a shorter downtime takes over the host cells.
      if ($status_id >= $summary->{'status_id'} and $downtime < $summary->{'downtime'}) {
         @{$summary}{qw(downtime status_id color)} = ($downtime, $status_id, $color);
         }
      }
   }
159
# Time stamp shown in the page header ("YYYY-MM-DD hh:mm", local time, with
# non-breaking spaces). $now stays a plain epoch value: it is also the base
# time of the checks scheduled while the page is built.
my $now = time;
my ($min, $hour, $mday, $mon, $year) = (localtime $now)[1 .. 5];
my $date = nosbreak(sprintf '%04i-%02i-%02i %02i:%02i', $year + 1900, $mon + 1, $mday, $hour, $min);
165
# Opening part of the page: CGI header, HTML head and title banner.
#
# The Refresh meta tag is only written when a refresh period is configured.
# The default period is 0, and content="0" asks the browser to reload the
# page immediately instead of meaning "no refresh".
my $refresh_meta = '';
if ($config->{'refresh'}) {
   $refresh_meta = ' <meta http-equiv="Refresh"      content="' . $config->{'refresh'} . '">' . "\n";
   }

# $refresh_meta is either empty or a complete line ending with a newline, so
# the <title> line keeps its layout in both cases.
my $htmlpage = <<"ENDH";
Content-Type: text/html

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
$refresh_meta <title>Nagios  Velvice</title>
 <link rel="stylesheet"    type="text/css"  href="$config->{'nagios-server'}{'stylesheets'}/velvice.css">
 <link rel="shortcut icon" type="image/ico" href="$config->{'nagios-server'}{'image'}/favicon.ico">
</head>
<body>
<div class="header">
 <h1>
  <ul>
    <li>Nagios Velvice Alert Panel : <a href="$config->{'nagios-server'}{'portal-url'}">Core Server</a></li>
    <li><small><a id="refresh" href="$cgi_script_name">$date</a></small></li>
  </ul>
 </h1>
</div>
ENDH
188
# Number of problems per service description and per status; these counters
# feed the "recheck" buttons at the top of the page.
my (%service_name, %service_status);
foreach my $problem (@serviceproblems) {
   $service_name{$problem->service_description} += 1;
   $service_status{$problem->status}            += 1;
   }
195
# Body of the page, in this order:
#   1. recheck buttons and the table of services in trouble (forced checks
#      are written to the Nagios command file for the lines selected by the
#      "check" parameter),
#   2. the table of hosts that are down,
#   3. the copy/paste commands of the configured remote actions.
#
# The host table is built outside the "no service problem" test: a host that
# is down contributes nothing to @serviceproblems, so nesting it there hid
# the down hosts whenever they were the only problem.

# Filled while the service table is built, read by the remote action section.
my %remote_sshdown = ();
my %remote_db      = ();
my $remote_flag;

if (scalar @serviceproblems == 0) {
   $htmlpage .= "<p>No alert to recheck.</p>\n";
   }
else {

   $htmlpage .= "<p>Alert to recheck - Level:\n";
   $htmlpage .= join ",\n",
      " <span class='button'><a href='$cgi_script_name?check=all'>ALL</a><small>" . scalar(@serviceproblems) . '</small></span>',
      map(" <span class='button'><a href='$cgi_script_name?check=" . lc(uri_encode($_)) . "'>$_</a><small>$service_status{$_}</small></span>",
         sort keys %service_status);
   $htmlpage .= ".\n";
   $htmlpage .= " <br />\n";
   $htmlpage .= " Service:\n";
   $htmlpage .= join ",\n",
      map(" <span class='button'><a href='$cgi_script_name?check=" . lc(uri_encode($_)) . "'>" . nosbreak($_) . "</a><small>$service_name{$_}</small></span>",
         sort keys %service_name);
   $htmlpage .= ".\n";
   $htmlpage .= "</p>\n";

   # The error message names the real file instead of the word "filename".
   my $nagios_cmd_file = $config->{'nagios-server'}{'nagios-cmd'};
   open my $nagios_cmd, '>>', $nagios_cmd_file
      or die "Can't open file $nagios_cmd_file: $!";

   my $current_host  = '';
   $htmlpage .= "<table border=\"1\">\n";
   SERVICE_PROBLEMS:
   for my $srv (@serviceproblems) {
      my $hostname = $srv->host_name;
      my $service  = $srv->service_description;
      my $status   = $srv->status;
      my $downtime = downtime($srv->last_state_change);
      # plugin output, HTML-escaped, without its leading "STATUS:" like prefix
      my $output   = HTML::Entities::encode($srv->plugin_output) =~ s/^[A-Z_\s]+?[:-]//r;

      my $color = alertcolor($status, $downtime);
      my $stylecolor = "style='background:$color;'";
      $htmlpage .= " <tr>\n";
      # The two host cells are written once per host and span all its lines.
      if ($hostname ne $current_host) {
         $current_host  = $hostname;
         my $rowspan    = $hostcount{$hostname}->{'count'};
         my $rowcolor   = "style='background:" . $hostcount{$hostname}->{'color'} . ";'";
         $htmlpage .= "  <td $rowcolor rowspan='$rowspan'>"
            . "<a href=\"$cgi_script_name?check=" . uri_encode($hostname) . '">&#8623;</a></td>' . "\n";
         $htmlpage .= "  <td $rowcolor class='hoop' rowspan='$rowspan'>"
            . "<a href=\"$config->{'nagios-server'}{'status-cgi'}?host=" . uri_encode($hostname) . "\">$hostname</a></td>\n";
         }

      # A service matched by a remote action with style "bold" is emphasized.
      my $bold;
      ACTION_STYLE:
      for my $act_name (keys %{$config->{'remote-action'}}) {
         my $act_regex = $config->{'remote-action'}{$act_name}{'regex'};
         $bold++ if $service =~ m/$act_regex/ and $config->{'remote-action'}{$act_name}{'style'} eq 'bold';
         }
      $htmlpage .= $bold ? "  <td $stylecolor class='hoop bold'>" : "  <td $stylecolor class='hoop'>";
      $htmlpage .= "$service</td>\n";

      $htmlpage .= "  <td $stylecolor class='hoop'>$status</td>\n";
      $htmlpage .= "  <td $stylecolor class='comment'>$output</td>\n";
      $htmlpage .= "  <td $stylecolor class='days'>$downtime days</td>\n";

      # The service description is quoted with \Q...\E: it is plain text, and
      # names containing regex metacharacters (parentheses, "+", "\" ...)
      # would otherwise never match or abort the script.
      if (($cgi_check =~ m/all/i)
            or ($cgi_check =~ m/^\Q$service\E$/i)
            or ($cgi_check =~ m/critical/i and $status eq 'CRITICAL')
            or ($cgi_check =~ m/warning/i  and $status eq 'WARNING')
            or ($cgi_check =~ m/pending/i  and $status eq 'PENDING')
            or ($cgi_check eq $hostname    and $status =~ m/^(CRITICAL|WARNING|PENDING)$/)
            ) {
         # one more second for each scheduled check
         $now++;
         my $interval = $srv->next_check() - $srv->last_check() || 300; # 5 * 60 = 300
         $interval =  240 if $interval <  240;
         $interval = 3000 if $interval > 3000;
         my $future = $now + 20 + int(rand($interval - 20));

         $htmlpage .= "  <td class='checking'>" . ($future - $now) . "</td>\n";
         #$htmlpage .= " -- <b>CHECK</b> [$now/" . ($future - $now) . "]";
         printf {$nagios_cmd} "[%lu] SCHEDULE_FORCED_SVC_CHECK;%s;%s;%lu\n", $now, $hostname, $service, $now;
         # delay future command
         push @futurecheck, sprintf "[%lu] SCHEDULE_FORCED_SVC_CHECK;%s;%s;%lu", $future, $hostname, $service, $future;
         }

      ACTION_PUSH_AND_DEPEND:
      for my $act_name (keys %{$config->{'remote-action'}}) {
         my $act_regex  = $config->{'remote-action'}{$act_name}{'regex'};
         my $act_status = $config->{'remote-action'}{$act_name}{'status'} || 'ALL';
         my $act_depend = $config->{'remote-action'}{$act_name}{'depend'} || 'SSH';

         if ($service =~ m/$act_regex/ and ($act_status eq 'ALL' or $status =~ m/$act_status/)) {
            $remote_db{$act_name} ||= [];
            push @{$remote_db{$act_name}}, $hostname;
            $remote_flag++;
            }

         # check depend service otherwise
         $remote_sshdown{$act_depend} ||= {};
         $remote_sshdown{$act_depend}->{$hostname}++ if $service =~ m/$act_depend/;
         }

      $htmlpage .= " </tr>\n";
      }

   $htmlpage .= "</table>\n";
   close $nagios_cmd;
   }

# host down
if (%hostdown) {
   $htmlpage .= "<br />\n";
   $htmlpage .= "<table border='1'>\n";
   HOST_DOWN:
   for my $host (sort keys %hostdown) {
      my $host_stat = $hostdown{$host};
      my $hostname = $host_stat->host_name;
      my $downtime = downtime($host_stat->last_state_change);
      my $color = alertcolor('CRITICAL', $downtime);
      $htmlpage .= " <tr style='background:$color'>\n";
      $htmlpage .= "  <td><a class='hoop' href=\"$config->{'nagios-server'}{'status-cgi'}?host=" . uri_encode($hostname) . "\">$hostname</a></td>\n";
      my @host_service;
      for my $srv ($log->list_services_on_host($host)) {
         push @host_service, $log->service($host, $srv)->service_description;
         }
      $htmlpage .= "  <td><small>" . join(', ', @host_service) . "</small></td>\n";
      $htmlpage .= "  <td style='text-align:right;'>$downtime days</td>\n";
      $htmlpage .= " </tr>\n";
      }
   $htmlpage .= "</table>\n";
   }

# remote action
if ($remote_flag) {
   require Nagios::Object::Config;
   my $parser = Nagios::Object::Config->new();
   $parser->parse("/var/cache/nagios3/objects.cache");

   $htmlpage .= "<div class='action'>\n";
   REMOTE_ACTION:
   for my $act_name (keys %remote_db) {
      my $act_depend = $config->{'remote-action'}{$act_name}{'depend'} || 'SSH';

      # Leave out the hosts whose "depend" service is itself in trouble.
      my @action = grep !exists $remote_sshdown{$act_depend}->{$_}, @{$remote_db{$act_name}};
      if (@action) {
         my $srv_title = $config->{'remote-action'}{$act_name}{'title'} || "Action: $act_name";
         $htmlpage .= "<h2>$srv_title</h2>\n";
         $htmlpage .= "<pre>\n";
         my $remote_action = $config->{'remote-action'}{$act_name}{'command'};
         $remote_action = $config->{'remote-action'}{$act_name}{'command-one'}
            if @action == 1 and exists $config->{'remote-action'}{$act_name}{'command-one'};
         my @hosts;
         for my $host (@action) {
            my $object  = $parser->find_object("$host", "Nagios::Host");
            # A host missing from the object cache must not abort the page:
            # use its Nagios name in place of the address.
            my $address = $object ? $object->address : $host;
            # short name: everything from the first dot is dropped
            push @hosts, hostmapping($address =~ s/\..*$//r);
            }
         my $hosts_list = join ' ', @hosts;
         $htmlpage .= ' ' . $remote_action =~ s{\%m}{$hosts_list}r;
         $htmlpage .= "</pre>\n";
         }
      }
   $htmlpage .= "</div>\n";
   }
356
# Closing part of the page (project links, version, licence), then the whole
# page is sent in one go to the currently selected output handle.
$htmlpage .= <<"END_FOOTER";
<hr clear="all">
<div class="footer">
 <b><a href="http://servforge.legi.grenoble-inp.fr/projects/soft-trokata/wiki/SoftWare/NagiosVelvice">Velvice</a>
   - version: $VERSION</b>
   (<a href="http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.html">online manual</a>)
   - Written by Gabriel Moreau
 <ul>
  <li>Licence GNU GPL version 2 or later and Perl equivalent</li>
  <li>Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France</li>
 </ul>
</div>
</body>
</html>
END_FOOTER

print $htmlpage;
374
# delayed future check
# After the page has been printed, append to the Nagios command file the
# second round of forced checks prepared in @futurecheck.
if (@futurecheck) {
   sleep 2;
   my $nagios_cmd_file = $config->{'nagios-server'}{'nagios-cmd'};
   # The error messages name the real file instead of the word "filename".
   open my $nagios_cmd, '>>', $nagios_cmd_file
      or die "Can't open file $nagios_cmd_file: $!";
   print {$nagios_cmd} "$_\n" for @futurecheck;
   # Buffered write errors only show up when the handle is closed.
   close $nagios_cmd
      or die "Can't close file $nagios_cmd_file: $!";
   }
383
384__END__
385
386
387=head1 NAME
388
389velvice.cgi - nagios velvice alert panel
390
391=head1 USAGE
392
393 velvice.cgi
394 velvice.cgi?check=XXX
395
396
397=head1 DESCRIPTION
398
399=begin html
400
401<img width="700" alt="Nagios Velvice Alert Panel" title="Nagios Velvice Alert Panel" style="float:right" src="velvice.png" />
402
403=end html
404
405Nagios VELVICE is an acronym for "Nagios leVEL serVICE status".
406
407The Nagios web page is sometimes very graphically charged
408and does not necessarily contain the information you need at a glance.
409For example, it is quite complicated to restart controls on multiple hosts in one click.
410
411For example, a server that is down should take only one line and not one per service...
412Similarly, a service that has been down for 5 minutes or since yesterday
413has more weight than a service that has fallen for 15 days.
414
415With Velvice Panel, a broken down server takes only one line.
416Services that have been falling for a long time gradually lose their color and become pastel colors.
417
418With Velvice Panel, it is possible through a single click
419to redo a check of all services that are in the CRITICAL state.
Similarly, it is possible to restart a check on all the SSH services that are in trouble...
421In order not to clog the Nagios server, checks are shifted by 2 seconds in time.
422
423There is also a link to the web page of the main Nagios server.
424For each computer, you have a direct link to its dedicated web page on this server.
425
426
427=head1 CONFIGURATION FILE SPECIFICATION
428
429The configuration file must be F</etc/nagios3/velvice.yml>.
430This is not a required file.
431The file is in YAML format because this is a human-readable text file style.
432Other formats could have been Plain XML, RDF, JSON... but they are much less readable.
433
434You can find in the software nagios-velvice an example of configuration:
435L<velvice.sample.yml|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.sample.yml>.
436This one is in fact the master reference specification!
437
438The main keys C<nagios-server> and C<color-downtime> have good default values.
439No secondary key is required...
The Velvice script tries hard to replace ~ with the right value automatically.
441
442 nagios-server:
443   status-file: /var/cache/nagios3/status.dat
444   nagios-cmd:  /var/lib/nagios3/rw/nagios.cmd
445   portal-url:  ~/nagios3/
446   status-cgi:  ~/cgi-bin/nagios3/status.cgi
447   stylesheets: ~/nagios3/stylesheets
448
449The background color of the faulty service line display remains stable with a bright color for at least 3 days.
450Then, it decreases and becomes pastel after 53 days with an intensity of 70% (100% is white and 0% is black).
451
452 color-downtime:
453   day-min:  3
454   day-max: 50
455   factor:   0.7
456
457With key C<host-mapping>,
458it's good to map C<localhost> to the real name of the computer (hostname).
459
460 host-mapping:
461   localhost:  srv-nagios
462   toto:       titi
463
464The only important key is C<remote-action>.
465You can affiliate as many subkeys as you want.
466Let's take an example:
467
468 remote-action:
469   oom-killer:
470     regex: ^OOM Killer
471     title:  OOM Killer
472     command:     tssh -c 'sudo rm /var/lib/nagios3/nagios_oom_killer.log' %m
473     command-one: ssh %m 'sudo rm /var/lib/nagios3/nagios_oom_killer.log'
474     depend: ^SSH
475     status: ALL
476     style: bold
477
478C<oom-killer> is just a key for your remote action.
479The regex is used to find which service has a problem...
The title is used in the resulting web page (not mandatory - otherwise, it will be C<Action: oom-killer>).
481The C<command> is just written on this web page.
It is your responsibility to copy and paste it into a terminal.
483For security reasons, the nagios server does not have the right to launch the command on the remote host.
The wildcard C<%m> is replaced by the list of hosts (separated by spaces).
Sometimes, the command could be different if there is only one computer (just SSH and no parallel SSH).
486If your command is based on SSH,
487you can have an SSH action only if the remote SSH is running.
488So you can make the remote action depend on the SSH service through a regular expression of your choice.
489
490The last two keys.
491The C<status> key is for CRITICAL or WARNING (or ALL).
492The key C<style> is there to mark in bold the service in error on the web page.
493
494=head1 SEE ALSO
495
496yamllint(1), ysh(1), YAML, Nagios::StatusLog, Color::Calc
497
498In Debian GNU/Linux distribution, packages for C<yamllint> and C<ysh> are:
499
500=over
501
502=item * C<yamllint> - Linter for YAML files (Python)
503
504=item * C<libyaml-shell-perl> - YAML test shell (Perl)
505
506=back
507
508
Project resources:
510
511=over
512
513=item * L<Web Site|http://servforge.legi.grenoble-inp.fr/projects/soft-trokata/wiki/SoftWare/NagiosVelvice>
514
515=item * L<Online Manual|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/velvice.html>
516
517=item * L<SVN Repository|http://servforge.legi.grenoble-inp.fr/svn/soft-trokata/trunk/nagios-velvice>
518
519=item * L<Debian Package|http://servforge.legi.grenoble-inp.fr/pub/soft-trokata/nagios-velvice/download/>
520
521=back
522
523
524=head1 VERSION
525
526$Id: velvice.cgi 322 2018-07-20 19:26:28Z g7moreau $
527
528
529=head1 AUTHOR
530
531Written by Gabriel Moreau <Gabriel.Moreau(A)univ-grenoble-alpes.fr>, LEGI UMR 5519, CNRS, Grenoble - France
532
533
534=head1 LICENSE AND COPYRIGHT
535
536Licence GNU GPL version 2 or later and Perl equivalent
537
538Copyright (C) 2014-2018, LEGI UMR 5519 / CNRS UGA G-INP, Grenoble, France
Note: See TracBrowser for help on using the repository browser.