223 lines
5.3 KiB
Ruby
223 lines
5.3 KiB
Ruby
# -*- ruby -*-
|
|
# vim: set noet nosta sw=4 ts=4 :
|
|
#
|
|
# Get the current WAL segment and offset from a master postgresql
|
|
# server, and compare slave servers to see how far behind they
|
|
# are in MB. This script should be easily modified for use with
|
|
# Nagios/Mon/Monit/Zabbix/whatever, or wrapping it in a display loop,
|
|
# and is suitable for both WAL shipping or streaming forms of replication.
|
|
#
|
|
# Mahlon E. Smith <mahlon@martini.nu>
|
|
#
|
|
# First argument is the master server, all other arguments are treated
|
|
# as slave machines.
|
|
#
|
|
# db_replication.monitor db-master.example.com ...
|
|
#
|
|
|
|
require 'ostruct'
|
|
require 'optparse'
|
|
require 'pathname'
|
|
require 'etc'
|
|
require 'pg'
|
|
require 'pp'
|
|
|
|
|
|
### A class to encapsulate the PG handles.
|
|
###
|
|
class PGMonitor
|
|
|
|
VERSION = %q$Id$
|
|
|
|
# When to consider a slave as 'behind', measured in WAL segments.
|
|
# The default WAL segment size is 16, so we'll alert after
|
|
# missing two WAL files worth of data.
|
|
#
|
|
LAG_ALERT = 32
|
|
|
|
### Create a new PGMonitor object.
|
|
###
|
|
def initialize( opts, hosts )
|
|
@opts = opts
|
|
@master = hosts.shift
|
|
@slaves = hosts
|
|
@current_wal = {}
|
|
@failures = []
|
|
end
|
|
|
|
attr_reader :opts, :current_wal, :master, :slaves, :failures
|
|
|
|
|
|
### Perform the connections and check the lag.
|
|
###
|
|
def check
|
|
# clear prior failures, get current xlog info
|
|
@failures = []
|
|
return unless self.get_current_wal
|
|
|
|
# check all slaves
|
|
self.slaves.each do |slave|
|
|
begin
|
|
slave_db = PG.connect(
|
|
:dbname => self.opts.database,
|
|
:host => slave,
|
|
:port => self.opts.port,
|
|
:user => self.opts.user,
|
|
:password => self.opts.pass,
|
|
:sslmode => 'prefer'
|
|
)
|
|
|
|
xlog = slave_db.exec( 'SELECT pg_last_xlog_receive_location()' ).getvalue( 0, 0 )
|
|
slave_db.close
|
|
|
|
lag_in_megs = ( self.find_lag( xlog ).to_f / 1024 / 1024 ).abs
|
|
if lag_in_megs >= LAG_ALERT
|
|
failures << { :host => slave,
|
|
:error => "%0.2fMB behind the master." % [ lag_in_megs ] }
|
|
end
|
|
rescue => err
|
|
failures << { :host => slave, :error => err.message }
|
|
end
|
|
end
|
|
end
|
|
|
|
|
|
#########
|
|
protected
|
|
#########
|
|
|
|
### Ask the master for the current xlog information, to compare
|
|
### to slaves. Returns true on success. On failure, populates
|
|
### the failures array and returns false.
|
|
###
|
|
def get_current_wal
|
|
master_db = PG.connect(
|
|
:dbname => self.opts.database,
|
|
:host => self.master,
|
|
:port => self.opts.port,
|
|
:user => self.opts.user,
|
|
:password => self.opts.pass,
|
|
:sslmode => 'prefer'
|
|
)
|
|
|
|
self.current_wal[ :segbytes ] = master_db.exec( 'SHOW wal_segment_size' ).
|
|
getvalue( 0, 0 ).sub( /\D+/, '' ).to_i << 20
|
|
|
|
current = master_db.exec( 'SELECT pg_current_xlog_location()' ).getvalue( 0, 0 )
|
|
self.current_wal[ :segment ], self.current_wal[ :offset ] = current.split( /\// )
|
|
|
|
master_db.close
|
|
return true
|
|
|
|
# If we can't get any of the info from the master, then there is no
|
|
# point in a comparison with slaves.
|
|
#
|
|
rescue => err
|
|
self.failures << { :host => self.master,
|
|
:error => 'Unable to retrieve required info from the master (%s)' % [ err.message ] }
|
|
|
|
return false
|
|
end
|
|
|
|
|
|
### Given an +xlog+ position from a slave server, return
|
|
### the number of bytes the slave needs to replay before it
|
|
### is caught up to the master.
|
|
###
|
|
def find_lag( xlog )
|
|
s_segment, s_offset = xlog.split( /\// )
|
|
m_segment = self.current_wal[ :segment ]
|
|
m_offset = self.current_wal[ :offset ]
|
|
m_segbytes = self.current_wal[ :segbytes ]
|
|
|
|
return (( m_segment.hex - s_segment.hex ) * m_segbytes) + ( m_offset.hex - s_offset.hex )
|
|
end
|
|
|
|
end
|
|
|
|
|
|
### Parse command line arguments. Return a struct of global options.
|
|
###
|
|
def parse_args( args )
|
|
options = OpenStruct.new
|
|
options.database = 'postgres'
|
|
options.port = 5432
|
|
options.user = Etc.getpwuid( Process.uid ).name
|
|
options.sslmode = 'prefer'
|
|
|
|
opts = OptionParser.new do |opts|
|
|
opts.banner = "Usage: #{$0} [options] <master> <slave> [slave2, slave3...]"
|
|
|
|
opts.separator ''
|
|
opts.separator 'Connection options:'
|
|
|
|
opts.on( '-d', '--database DBNAME',
|
|
"specify the database to connect to (default: \"#{options.database}\")" ) do |db|
|
|
options.database = db
|
|
end
|
|
|
|
opts.on( '-h', '--host HOSTNAME', 'database server host' ) do |host|
|
|
options.host = host
|
|
end
|
|
|
|
opts.on( '-p', '--port PORT', Integer,
|
|
"database server port (default: \"#{options.port}\")" ) do |port|
|
|
options.port = port
|
|
end
|
|
|
|
opts.on( '-U', '--user NAME',
|
|
"database user name (default: \"#{options.user}\")" ) do |user|
|
|
options.user = user
|
|
end
|
|
|
|
opts.on( '-W', 'force password prompt' ) do |pw|
|
|
print 'Password: '
|
|
begin
|
|
system 'stty -echo'
|
|
options.pass = $stdin.gets.chomp
|
|
ensure
|
|
system 'stty echo'
|
|
puts
|
|
end
|
|
end
|
|
|
|
opts.separator ''
|
|
opts.separator 'Other options:'
|
|
|
|
opts.on_tail( '--help', 'show this help, then exit' ) do
|
|
$stderr.puts opts
|
|
exit
|
|
end
|
|
|
|
opts.on_tail( '--version', 'output version information, then exit' ) do
|
|
puts PGMonitor::VERSION
|
|
exit
|
|
end
|
|
end
|
|
|
|
opts.parse!( args )
|
|
return options
|
|
end
|
|
|
|
|
|
|
|
if __FILE__ == $0
|
|
opts = parse_args( ARGV )
|
|
raise ArgumentError, "At least two PostgreSQL servers are required." if ARGV.length < 2
|
|
mon = PGMonitor.new( opts, ARGV )
|
|
|
|
mon.check
|
|
if mon.failures.empty?
|
|
puts "All is well!"
|
|
exit 0
|
|
else
|
|
puts "Database replication delayed or broken."
|
|
mon.failures.each do |bad|
|
|
puts "%s: %s" % [ bad[ :host ], bad[ :error ] ]
|
|
end
|
|
exit 1
|
|
end
|
|
end
|
|
|
|
|