PostgreSQL Monitoring Scripts
3 Jul 2016
Get replication lag:
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Gets the replication lag and prints that time to STDOUT. Time is in seconds. # 2. $? will be 0 # # Ideal number: less than 1 second? # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ -c "select extract(epoch from now() - pg_last_xact_replay_timestamp())"
Get number of waiting clients
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Prints, to STDOUT, the number of clients waiting for locks to be released. # 2. $? will be 0 # # Ideal number: Unsure. Less than 5? Less than 10? # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ -c "select count(*) from pg_stat_activity where waiting is true"
Get connection count ratio
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Gets how close our current number of connections is to max_connections, as a percentage. # 2. $? will be 0 # # Ideal number: < 80%? # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ -c "select count(*)*100/(select current_setting('max_connections')::int) as conn_ratio from pg_stat_activity"
Get checkpoint stats
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Prints the ratio (as a percentatge) of checkpoints requested due to checkpoint_segments being hit, # over checkpoints hit because checkpoint_timeout has elapsed since the last checkpoint. # 2. $? will be 0 # # Ideal ratio: < 100% (that is, more checkpoints are hit because of timeout instead of segments filling up) # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ --field-separator ' ' \ -c "select (checkpoints_req::float / checkpoints_timed::float) * 100 as checkpoint_req_timed_ratio from pg_stat_bgwriter"
Get cache hit ratio
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Gets the cache hit ratio for for every database in the cluster and outputs it as a percentage. # 2. $? will be 0 # # Ideal number: > 90% # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ -c "select sum(blks_hit)*100/sum(blks_hit+blks_read) as hit_ratio from pg_stat_database"
Get longest idle transaction
#!/bin/bash ### PURPOSE: # # When things go as expected: # --------------------------- # 1. Finds the connection that has been 'idle in transaction' the longest, # and prints that time to STDOUT. Time is in seconds. # 2. $? will be 0 # # Ideal number: less than 300 seconds (5 minutes) # # When things wrong: # ------------------ # 1. Prints to STDOUT or STDERR whatever psql does when it can't connect, or something # else bad happens, and # 2. $? will be whatever psql decides # Die on first error set -e # Die on first error in pipelines set -o pipefail # Die on first unset variable set -u psql \ -X \ -U $PGUSER \ -h $PGHOST \ -p $PGPORT \ -d $PGDATABASE \ --set AUTOCOMMIT=on \ --set ON_ERROR_STOP=on \ -P format=unaligned \ -P t \ -q \ -c "select extract(epoch from (max(now() - xact_start))) as max_idle_in_seconds from pg_stat_activity where state = 'idle in transaction' or state = 'idle in transaction (aborted)'"