pg/sample/replication_monitor.rb

223 lines
5.3 KiB
Ruby

# -*- ruby -*-
# vim: set noet nosta sw=4 ts=4 :
#
# Get the current WAL segment and offset from a master postgresql
# server, and compare slave servers to see how far behind they
# are in MB. This script should be easily modified for use with
# Nagios/Mon/Monit/Zabbix/whatever, or wrapping it in a display loop,
# and is suitable for both WAL shipping or streaming forms of replication.
#
# Mahlon E. Smith <mahlon@martini.nu>
#
# First argument is the master server, all other arguments are treated
# as slave machines.
#
# db_replication.monitor db-master.example.com ...
#
require 'ostruct'
require 'optparse'
require 'pathname'
require 'etc'
require 'pg'
require 'pp'
### A class to encapsulate the PG handles.
###
class PGMonitor
VERSION = %q$Id$
# When to consider a slave as 'behind', measured in WAL segments.
# The default WAL segment size is 16, so we'll alert after
# missing two WAL files worth of data.
#
LAG_ALERT = 32
### Create a new PGMonitor object.
###
def initialize( opts, hosts )
@opts = opts
@master = hosts.shift
@slaves = hosts
@current_wal = {}
@failures = []
end
attr_reader :opts, :current_wal, :master, :slaves, :failures
### Perform the connections and check the lag.
###
def check
# clear prior failures, get current xlog info
@failures = []
return unless self.get_current_wal
# check all slaves
self.slaves.each do |slave|
begin
slave_db = PG.connect(
:dbname => self.opts.database,
:host => slave,
:port => self.opts.port,
:user => self.opts.user,
:password => self.opts.pass,
:sslmode => 'prefer'
)
xlog = slave_db.exec( 'SELECT pg_last_xlog_receive_location()' ).getvalue( 0, 0 )
slave_db.close
lag_in_megs = ( self.find_lag( xlog ).to_f / 1024 / 1024 ).abs
if lag_in_megs >= LAG_ALERT
failures << { :host => slave,
:error => "%0.2fMB behind the master." % [ lag_in_megs ] }
end
rescue => err
failures << { :host => slave, :error => err.message }
end
end
end
#########
protected
#########
### Ask the master for the current xlog information, to compare
### to slaves. Returns true on success. On failure, populates
### the failures array and returns false.
###
def get_current_wal
master_db = PG.connect(
:dbname => self.opts.database,
:host => self.master,
:port => self.opts.port,
:user => self.opts.user,
:password => self.opts.pass,
:sslmode => 'prefer'
)
self.current_wal[ :segbytes ] = master_db.exec( 'SHOW wal_segment_size' ).
getvalue( 0, 0 ).sub( /\D+/, '' ).to_i << 20
current = master_db.exec( 'SELECT pg_current_xlog_location()' ).getvalue( 0, 0 )
self.current_wal[ :segment ], self.current_wal[ :offset ] = current.split( /\// )
master_db.close
return true
# If we can't get any of the info from the master, then there is no
# point in a comparison with slaves.
#
rescue => err
self.failures << { :host => self.master,
:error => 'Unable to retrieve required info from the master (%s)' % [ err.message ] }
return false
end
### Given an +xlog+ position from a slave server, return
### the number of bytes the slave needs to replay before it
### is caught up to the master.
###
def find_lag( xlog )
s_segment, s_offset = xlog.split( /\// )
m_segment = self.current_wal[ :segment ]
m_offset = self.current_wal[ :offset ]
m_segbytes = self.current_wal[ :segbytes ]
return (( m_segment.hex - s_segment.hex ) * m_segbytes) + ( m_offset.hex - s_offset.hex )
end
end
### Parse command line arguments. Return a struct of global options.
###
def parse_args( args )
options = OpenStruct.new
options.database = 'postgres'
options.port = 5432
options.user = Etc.getpwuid( Process.uid ).name
options.sslmode = 'prefer'
opts = OptionParser.new do |opts|
opts.banner = "Usage: #{$0} [options] <master> <slave> [slave2, slave3...]"
opts.separator ''
opts.separator 'Connection options:'
opts.on( '-d', '--database DBNAME',
"specify the database to connect to (default: \"#{options.database}\")" ) do |db|
options.database = db
end
opts.on( '-h', '--host HOSTNAME', 'database server host' ) do |host|
options.host = host
end
opts.on( '-p', '--port PORT', Integer,
"database server port (default: \"#{options.port}\")" ) do |port|
options.port = port
end
opts.on( '-U', '--user NAME',
"database user name (default: \"#{options.user}\")" ) do |user|
options.user = user
end
opts.on( '-W', 'force password prompt' ) do |pw|
print 'Password: '
begin
system 'stty -echo'
options.pass = $stdin.gets.chomp
ensure
system 'stty echo'
puts
end
end
opts.separator ''
opts.separator 'Other options:'
opts.on_tail( '--help', 'show this help, then exit' ) do
$stderr.puts opts
exit
end
opts.on_tail( '--version', 'output version information, then exit' ) do
puts PGMonitor::VERSION
exit
end
end
opts.parse!( args )
return options
end
if __FILE__ == $0
opts = parse_args( ARGV )
raise ArgumentError, "At least two PostgreSQL servers are required." if ARGV.length < 2
mon = PGMonitor.new( opts, ARGV )
mon.check
if mon.failures.empty?
puts "All is well!"
exit 0
else
puts "Database replication delayed or broken."
mon.failures.each do |bad|
puts "%s: %s" % [ bad[ :host ], bad[ :error ] ]
end
exit 1
end
end