Overview
Set up Nagios Core for infrastructure and service monitoring with host definitions, check commands, notification contacts, and custom plugins. Covers configuration, common check setups, and plugin development.
Instructions
Task A: Install and Configure Nagios
bash
# Install Nagios Core on Ubuntu
sudo apt-get update
sudo apt-get install -y nagios4 nagios-plugins nagios-nrpe-plugin
sudo systemctl enable nagios4
sudo systemctl start nagios4
# Set admin password for web UI
sudo htpasswd -c /etc/nagios4/htpasswd.users nagiosadmin
cfg
# /etc/nagios4/nagios.cfg — Key configuration directives
cfg_dir=/etc/nagios4/conf.d
cfg_dir=/etc/nagios4/servers
log_file=/var/log/nagios4/nagios.log
command_file=/var/nagios4/rw/nagios.cmd
check_result_path=/var/nagios4/spool/checkresults
status_update_interval=10
check_external_commands=1
enable_notifications=1
execute_service_checks=1
Task B: Define Hosts and Services
cfg
# /etc/nagios4/servers/web-servers.cfg — Web server host definitions
define host {
use linux-server
host_name web-01
alias Web Server 01
address 192.168.1.10
max_check_attempts 5
check_period 24x7
notification_interval 30
notification_period 24x7
contacts platform-team
hostgroups web-servers
}
define host {
use linux-server
host_name web-02
alias Web Server 02
address 192.168.1.11
max_check_attempts 5
check_period 24x7
notification_interval 30
notification_period 24x7
contacts platform-team
hostgroups web-servers
}
define hostgroup {
hostgroup_name web-servers
alias Web Servers
members web-01,web-02
}
cfg
# /etc/nagios4/servers/web-services.cfg — Service check definitions
define service {
use generic-service
hostgroup_name web-servers
service_description HTTP
check_command check_http!-p 80 -e "200,301"
max_check_attempts 3
check_interval 2
retry_interval 1
notification_interval 15
contacts platform-team
}
define service {
use generic-service
hostgroup_name web-servers
service_description HTTPS Certificate
check_command check_http!-S -p 443 -C 30
check_interval 360
notification_interval 60
contacts platform-team
}
define service {
use generic-service
hostgroup_name web-servers
service_description Disk Usage
check_command check_nrpe!check_disk
max_check_attempts 3
check_interval 10
contacts platform-team
}
define service {
use generic-service
hostgroup_name web-servers
service_description CPU Load
check_command check_nrpe!check_load
check_interval 5
contacts platform-team
}
define service {
use generic-service
hostgroup_name web-servers
service_description Memory Usage
check_command check_nrpe!check_mem
check_interval 5
contacts platform-team
}
Task C: Configure Commands and NRPE
cfg
# /etc/nagios4/conf.d/commands.cfg — Custom check commands
define command {
command_name check_nrpe
command_line /usr/lib/nagios/plugins/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -t 30
}
define command {
command_name check_http_content
command_line /usr/lib/nagios/plugins/check_http -H $HOSTADDRESS$ -p $ARG1$ -u $ARG2$ -s "$ARG3$"
}
define command {
command_name check_postgres
command_line /usr/lib/nagios/plugins/check_pgsql -H $HOSTADDRESS$ -d $ARG1$ -l $ARG2$
}
cfg
# /etc/nagios/nrpe.cfg — NRPE configuration on remote hosts
server_address=0.0.0.0
allowed_hosts=192.168.1.5
dont_blame_nrpe=0
command[check_disk]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
command[check_load]=/usr/lib/nagios/plugins/check_load -w 5,4,3 -c 10,8,6
command[check_mem]=/usr/lib/nagios/plugins/check_mem.pl -w 80 -c 90 -f
command[check_procs]=/usr/lib/nagios/plugins/check_procs -w 250 -c 400
command[check_swap]=/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
Task D: Notification Contacts
cfg
# /etc/nagios4/conf.d/contacts.cfg — Contact and notification definitions
define contact {
contact_name marta
alias Marta (Platform Lead)
email marta@example.com
service_notification_period 24x7
host_notification_period 24x7
service_notification_options w,u,c,r
host_notification_options d,u,r
service_notification_commands notify-service-by-email
host_notification_commands notify-host-by-email
}
define contactgroup {
contactgroup_name platform-team
alias Platform Engineering Team
members marta,tom,nina
}
define command {
command_name notify-service-by-slack
command_line /usr/local/bin/nagios-slack-notify.sh "$NOTIFICATIONTYPE$" "$SERVICEDESC$" "$HOSTALIAS$" "$SERVICESTATE$" "$SERVICEOUTPUT$"
}
Task E: Custom Plugin
bash
#!/bin/bash
# /usr/lib/nagios/plugins/check_api_health — Custom API health check plugin
# Usage: check_api_health -u <url> -w <warn_ms> -c <crit_ms>
URL=""
WARN=1000
CRIT=3000
while getopts "u:w:c:" opt; do
case $opt in
u) URL="$OPTARG" ;;
w) WARN="$OPTARG" ;;
c) CRIT="$OPTARG" ;;
esac
done
START=$(date +%s%N)
RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$URL")
END=$(date +%s%N)
DURATION=$(( (END - START) / 1000000 ))
if [ "$RESPONSE" != "200" ]; then
echo "CRITICAL - HTTP $RESPONSE from $URL | response_time=${DURATION}ms"
exit 2
elif [ "$DURATION" -gt "$CRIT" ]; then
echo "CRITICAL - Response time ${DURATION}ms > ${CRIT}ms | response_time=${DURATION}ms"
exit 2
elif [ "$DURATION" -gt "$WARN" ]; then
echo "WARNING - Response time ${DURATION}ms > ${WARN}ms | response_time=${DURATION}ms"
exit 1
else
echo "OK - Response time ${DURATION}ms | response_time=${DURATION}ms"
exit 0
fi
bash
# Verify configuration before reloading
sudo nagios4 -v /etc/nagios4/nagios.cfg
sudo systemctl reload nagios4
Best Practices
- Always run
nagios -vto verify config before reloading to prevent outages - Use NRPE for remote checks that need local access (disk, CPU, memory)
- Set
max_check_attempts> 1 to avoid alerting on transient failures - Use hostgroups and servicegroups to apply checks to multiple hosts at once
- Output performance data (
| metric=value) from plugins for graphing integration - Use
check_intervalandretry_intervalto balance monitoring granularity with load