1 参考文档

其他文档：Configure PostgreSQL Replication With Repmgr_qq_38461429的博客-CSDN博客

2 环境

主机名	IP	OS	角色	软件包
pg1	192.168.44.131	Centos 7	primary	postgresql 11 and repmgr
pg2	192.168.44.131	Centos 7	standby	postgresql 11 and repmgr
pg3	192.168.44.135	Centos 7	standby	postgresql 11 and repmgr

备注
以下安装方式均采用在线安装方式。

3 修改hosts

1
2
3

192.168.44.131 pg1
192.168.44.132 pg2
192.168.44.135 pg3

4 配置SSH免密登陆

# 生产密钥
ssh-keygen -t rsa
# 写入文件
cat id_rsa.pub > authorized_keys
# 赋权 600
chmod +600 authorized_keys
# 拷贝 pg2\pg3 密钥到一个authorized_keys文件
ssh pg2 cat /root/.ssh/id_rsa.pub >> authorized_keys
ssh pg3 cat /root/.ssh/id_rsa.pub >> authorized_keys
# 拷贝authorized_keys 到pg2\pg3
scp authorized_keys pg2:/root/.ssh/
scp authorized_keys pg3:/root/.ssh/
# ssh pg1\pg2\pg3 测试是否生效

注意
后面postgresql安装后，也需要配置postgres用户的SSH免密登陆。

4 所有节点安装PG软件和postgis插件

# Install the repository RPM:
sudo yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm

# Install PostgreSQL:
sudo yum install -y postgresql11-server

# Install Postgis:
sudo yum install -y postgis25_11

# Install rsync
sudo yum install -y rsync

‎如果您希望‎‎repmgr‎‎复制位于 PostgreSQL 数据目录之外的配置文件和/或测试功能，您还需要在两个服务器之间建立无密码的 SSH 连接，并且应安装‎‎rsync。

5 所有节点安装repmgr

官方教程

# 安装源
curl https://dl.2ndquadrant.com/default/release/get/11/rpm | sudo bash
# 检查源
sudo yum repolist
# 安装repmgr11
sudo yum install repmgr11

# 添加repmgr命令到环境变量
vim /etc/profile

PATH=/usr/pgsql-11/bin:$PATH

6 primary节点配置

初始化数据库

# Optionally initialize the database and enable automatic start:
# 初始化数据库
sudo /usr/pgsql-11/bin/postgresql-11-setup initdb
sudo systemctl enable postgresql-11
sudo systemctl start postgresql-11

修改配置postgresql.conf

# 在文件末尾添加如下配置
# repmgr 配置
# walsender 进程的最大数量
max_wal_senders = 10
# 复制槽的最大数量
max_replication_slots = 10
wal_level = 'hot_standby'
hot_standby = on
# Log配置
archive_mode = on
archive_command = '/bin/true'
wal_log_hints = on

一般只需要在primary节点修改配置即可，我为了方便主从切换，其他从节点可能升级为主节点，就把从节点的配置同样进行了修改。

修改配置pg_hba.conf

local   replication   repmgr                              trust
host    replication   repmgr      127.0.0.1/32            trust
host    replication   repmgr      192.168.44.0/24          trust

local   repmgr        repmgr                              trust
host    repmgr        repmgr      127.0.0.1/32            trust
host    repmgr        repmgr      192.168.44.0/24          trust

创建repmgr用户和repmgr数据库

# 切换到postgres用户执行，root用户执行报错
bash-4.2$ createuser --superuser repmgr
bash-4.2$ createdb --owner=repmgr repmgr
bash-4.2$ psql -c ”ALTER USER repmgr SET search_path TO repmgr, "$user", public;“

或者

# 登陆数据库后执行相关创建
# psql
postgres# create user repmgr superuser login;

postgres# alter user  repmgr with password 'repmgr';

postgres# create database repmgr;

postgres# alter database repmgr owner to repmgr;

postgres# ALTER USER repmgr SET search_path TO repmgr, "$user", public;

重启数据库

1	systemctl restart postgresql-11

7 standby测试连接primary节点数据库

[root@pg2 ~]#  psql 'host=pg1 user=repmgr dbname=repmgr connect_timeout=2'
psql (11.12)
输入 "help" 来获取帮助信息.

repmgr=#

注意
standby节点不要初始化数据库，不用启动数据库，保证/var/lib/pgsql/11/data/目录为空

8 注册Primary节点

修改配置repmgr.conf

[root@pg1 ~]#  vim /etc/rempgr/11/rempgr.conf
# 添加如下配置
node_id=1
node_name='pg1'
conninfo='host=pg1 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/var/lib/pgsql/11/data'

log_level=on
log_facility=STDERR
log_file='/var/log/repmgr/repmgr.log'
pg_bindir='/usr/pgsql-11/bin'

注册Primary节点

[root@pg1 ~]# su postgres
bash-4.2$
bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf primary register
INFO: connecting to primary database...
NOTICE: attempting to install extension "repmgr"
NOTICE: "repmgr" extension successfully installed
NOTICE: primary node record (ID: 1) registered

验证集群状态

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | * running |          | default  | 100      | 1        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2

查看repmgr数据表中的信息

bash-4.2$ psql -U repmgr -d repmgr -h pg1
psql (11.12)
Type "help" for help.

repmgr=# select * from repmgr.nodes;
 node_id | upstream_node_id | active | node_name |  type   | location | priority |                       conninfo                       | repluser | slot_name |        config_file
---------+------------------+--------+-----------+---------+----------+----------+------------------------------------------------------+----------+-----------+----------------------------
       1 |                  | t      | pg1       | primary | default  |      100 | host=pg1 user=repmgr dbname=repmgr connect_timeout=2 | repmgr   |           | /etc/repmgr/11/repmgr.conf
(1 row)

9 standby节点同步主节点数据

配置repmbr.conf

node_id=2
node_name='pg2'
conninfo='host=pg2 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/var/lib/pgsql/11/data'

‎使用--dry-run选项检查可克隆待机状态‎

bash-4.2$ repmgr -h pg1 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone --dry-run
NOTICE: destination directory "/var/lib/pgsql/11/data" provided
INFO: connecting to source node
DETAIL: connection string is: host=pg1 user=repmgr dbname=repmgr
DETAIL: current installation size is 31 MB
INFO: "repmgr" extension is installed in database "repmgr"
INFO: replication slot usage not requested;  no replication slot will be set up for this standby
INFO: parameter "max_wal_senders" set to 10
NOTICE: checking for available walsenders on the source node (2 required)
INFO: sufficient walsenders available on the source node
DETAIL: 2 required, 10 available
NOTICE: checking replication connections can be made to the source server (2 required)
INFO: required number of replication connections could be made to the source server
DETAIL: 2 replication connections required
WARNING: data checksums are not enabled and "wal_log_hints" is "off"
DETAIL: pg_rewind requires "wal_log_hints" to be enabled
NOTICE: standby will attach to upstream node 1
HINT: consider using the -c/--fast-checkpoint option
INFO: all prerequisites for "standby clone" are met

‎如果没有报告任何问题，则可以同步到备用节点‎

bash-4.2$ repmgr -h pg1 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone
NOTICE: destination directory "/var/lib/pgsql/11/data" provided
INFO: connecting to source node
DETAIL: connection string is: host=pg1 user=repmgr dbname=repmgr
DETAIL: current installation size is 31 MB
INFO: replication slot usage not requested;  no replication slot will be set up for this standby
NOTICE: checking for available walsenders on the source node (2 required)
NOTICE: checking replication connections can be made to the source server (2 required)
WARNING: data checksums are not enabled and "wal_log_hints" is "off"
DETAIL: pg_rewind requires "wal_log_hints" to be enabled
INFO: checking and correcting permissions on existing directory "/var/lib/pgsql/11/data"
NOTICE: starting backup (using pg_basebackup)...
HINT: this may take some time; consider using the -c/--fast-checkpoint option
INFO: executing:
  pg_basebackup -l "repmgr base backup"  -D /var/lib/pgsql/11/data -h pg1 -p 5432 -U repmgr -X stream
NOTICE: standby clone (using pg_basebackup) complete
NOTICE: you can now start your PostgreSQL server
HINT: for example: pg_ctl -D /var/lib/pgsql/11/data start
HINT: after starting the server, you need to register this standby with "repmgr standby register"

连接到主节点数据库查询

这表明先前克隆备用( pg2显示的字段 application_name)连接到主要的IP地址 192.168.44.132

[root@pg1 ~]# su - postgres -c "psql -U repmgr -d repmgr -h pg1"
psql (11.12)
输入 "help" 来获取帮助信息.

repmgr=# SELECT * FROM pg_stat_replication;
 pid  | usesysid | usename | application_name |  client_addr   | client_hostname | client_port |         backend_start         | backend_xmin |   state   |  sent_lsn  | write_lsn  | flush_lsn  | replay_lsn | write_lag | flush_lag | r
eplay_lag | sync_priority | sync_state
------+----------+---------+------------------+----------------+-----------------+-------------+-------------------------------+--------------+-----------+------------+------------+------------+------------+-----------+-----------+--
----------+---------------+------------
 1122 |    16384 | repmgr  | pg2              | 192.168.44.132 |                 |       43672 | 2021-08-11 14:20:09.038097+08 |              | streaming | 0/180001B0 | 0/180001B0 | 0/180001B0 | 0/180001B0 |           |           |
          |             0 | async
(1 行记录)

连接从节点数据库查询

repmgr=# SELECT * FROM pg_stat_wal_receiver;
  pid  |  status   | receive_start_lsn | receive_start_tli | received_lsn | received_tli |      last_msg_send_time       |     last_msg_receipt_time     | latest_end_lsn |       latest_end_time        | slot_name | sender_host | send
er_port |                                                                                                                  conninfo

-------+-----------+-------------------+-------------------+--------------+--------------+-------------------------------+-------------------------------+----------------+------------------------------+-----------+-------------+-----
--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------
 65103 | streaming | 0/18000000        |                 1 | 0/180001B0   |            1 | 2021-08-11 16:58:06.901487+08 | 2021-08-11 16:58:06.900314+08 | 0/180001B0     | 2021-08-11 14:25:17.92164+08 |           | pg1         |
   5432 | user=repmgr passfile=/var/lib/pgsql/.pgpass connect_timeout=2 dbname=replication host=pg1 port=5432 application_name=pg2 fallback_application_name=walreceiver sslmode=prefer sslcompression=0 krbsrvname=postgres target_sessi
on_attrs=any
(1 行记录)

10 注册standby

注册standby节点到集群

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby register
INFO: connecting to local node "pg2" (ID: 2)
INFO: connecting to primary database
WARNING: --upstream-node-id not supplied, assuming upstream node is primary (node ID 1)
INFO: standby registration complete
NOTICE: standby node "pg2" (ID: 2) successfully registered

检查集群状态

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | * running |          | default  | 100      | 1        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | standby |   running | pg1      | default  | 100      | 1        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2

备注：standby节点默认为只读模式，切换为主节点后才能够写入数据。

主从集群配置完成，暂时只能手动切换

修改主节点的数据库数据，即可实时同步到从节点了。

11 主从切换(只适用于主从双节点)

备注
主从节点切换，只适用于主从双节点集群；3个以上节点集群执行切换命令后原主节点会自动宕掉，需要手动对其进行故障恢复。参考第 12章节故障转移。

11.1 场景

各节点正常运行情况下，手动切换主从节点。

手动switchover，切换主从节点，提升从节点为主，主节点自动变为从节点

11.2 操作

查看集群状态

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | standby |   running | pg2      | default  | 100      | 4        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | primary | * running |          | default  | 100      | 4        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2

在standby节点执行命令，升级为primary节点

# repmgr standby switchover
bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby switchover
# 下为执行过程
NOTICE: executing switchover on node "pg1" (ID: 1)
NOTICE: local node "pg1" (ID: 1) will be promoted to primary; current primary "pg2" (ID: 2) will be demoted to standby
NOTICE: stopping current primary node "pg2" (ID: 2)
NOTICE: issuing CHECKPOINT on node "pg2" (ID: 2)
DETAIL: executing server command "/usr/pgsql-11/bin/pg_ctl  -D '/var/lib/pgsql/11/data' -W -m fast stop"
INFO: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
INFO: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
NOTICE: current primary has been cleanly shut down at location 0/23000028
NOTICE: promoting standby to primary
DETAIL: promoting server "pg1" (ID: 1) using "/usr/pgsql-11/bin/pg_ctl  -w -D '/var/lib/pgsql/11/data' promote"
等待服务器进程加载 .... 完成
服务器加载完毕
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
NOTICE: STANDBY PROMOTE successful
DETAIL: server "pg1" (ID: 1) was successfully promoted to primary
INFO: local node 2 can attach to rejoin target node 1
DETAIL: local node's recovery point: 0/23000028; rejoin target node's fork point: 0/23000098
NOTICE: setting node 2's upstream to node 1
WARNING: unable to ping "host=pg2 user=repmgr dbname=repmgr connect_timeout=2"
DETAIL: PQping() returned "PQPING_NO_RESPONSE"
NOTICE: starting server using "/usr/pgsql-11/bin/pg_ctl  -w -D '/var/lib/pgsql/11/data' start"
NOTICE: NODE REJOIN successful
DETAIL: node 2 is now attached to node 1
NOTICE: node  "pg1" (ID: 1) promoted to primary, node "pg2" (ID: 2) demoted to standby
NOTICE: switchover was successful
DETAIL: node "pg1" is now primary and node "pg2" is attached as standby
NOTICE: STANDBY SWITCHOVER has completed successfully

查看集群状态

可以看到standby节点已升级为primary节点，原primary节点自动降级为standby节点。

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | * running |          | default  | 100      | 5        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | standby |   running | pg1      | default  | 100      | 4        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
bash-4.2$

其他standby节点指向新的primary节点

1	bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby follow

12 故障转移，手动切换

12.1 场景

主节点宕机，升级从节点为primary，恢复原primary节点，并降级为standby.

12.2 操作

查看集群状态

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | * running |          | default  | 100      | 5        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | standby |   running | pg1      | default  | 100      | 5        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
bash-4.2$

关闭主节点，模拟宕机

1	systemctl stop postgresql-11.service

查看集群状态

pg1 已经掉线

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status        | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+---------------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | ? unreachable | ?        | default  | 100      |          | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | standby |   running     | ? pg1    | default  | 100      | 5        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2

WARNING: following issues were detected
  - unable to connect to node "pg1" (ID: 1)
  - node "pg1" (ID: 1) is registered as an active primary but is unreachable
  - unable to connect to node "pg2" (ID: 2)'s upstream node "pg1" (ID: 1)
  - unable to determine if node "pg2" (ID: 2) is attached to its upstream node "pg1" (ID: 1)

HINT: execute with --verbose option to see connection error messages
bash-4.2$

手动提升 pg2 节点为 primary节点

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby promote
NOTICE: promoting standby to primary
DETAIL: promoting server "pg2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl  -w -D '/var/lib/pgsql/11/data' promote"
could not change directory to "/root": 权限不够
waiting for server to promote.... done
server promoted
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
NOTICE: STANDBY PROMOTE successful
DETAIL: server "pg2" (ID: 2) was successfully promoted to primary
bash-4.2$

查看集群状态

pg2 已经为primary节点

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | - failed  | ?        | default  | 100      |          | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | primary | * running |          | default  | 100      | 6        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2

WARNING: following issues were detected
  - unable to connect to node "pg1" (ID: 1)

HINT: execute with --verbose option to see connection error messages
bash-4.2$

集群其他standby节点重新指向新primary

1	bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby follow

将宕机primary节点，降级为standby节点

# 清空宕机节点数据目录
bash-4.2$ rm -rf /var/lib/pgsql/11/data/*

# 重新从新primary节点克隆备份服务器
bash-4.2$ repmgr -h pg2 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone
NOTICE: destination directory "/var/lib/pgsql/11/data" provided
INFO: connecting to source node
DETAIL: connection string is: host=pg2 user=repmgr dbname=repmgr
DETAIL: current installation size is 75 MB
INFO: replication slot usage not requested;  no replication slot will be set up for this standby
NOTICE: checking for available walsenders on the source node (2 required)
NOTICE: checking replication connections can be made to the source server (2 required)
INFO: checking and correcting permissions on existing directory "/var/lib/pgsql/11/data"
NOTICE: starting backup (using pg_basebackup)...
HINT: this may take some time; consider using the -c/--fast-checkpoint option
INFO: executing:
  /usr/pgsql-11/bin/pg_basebackup -l "repmgr base backup"  -D /var/lib/pgsql/11/data -h pg2 -p 5432 -U repmgr -X stream
NOTICE: standby clone (using pg_basebackup) complete
NOTICE: you can now start your PostgreSQL server
HINT: for example: pg_ctl -D /var/lib/pgsql/11/data start
HINT: after starting the server, you need to re-register this standby with "repmgr standby register --force" to update the existing node record
bash-4.2$ 

# 启动数据库
systemctl start postgresql-11.service

# 注册standby 节点
bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf standby register --force
INFO: connecting to local node "pg1" (ID: 1)
INFO: connecting to primary database
INFO: standby registration complete
NOTICE: standby node "pg1" (ID: 1) successfully registered
bash-4.2$

查看集群状态

原宕机节点已经降级为standby节点。

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | standby |   running | pg2      | default  | 100      | 6        | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | primary | * running |          | default  | 100      | 6        | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
bash-4.2$

12.3 补充

官方文档

主节点注销

1	repmgr primary unregister -f /etc/repmgr/11/repmgr.conf --node-id=3

从节点注销

1	repmgr standby unregister -f /etc/repmgr/11/repmgr.conf --node-id=3

13 故障自动转移

13.1 编辑sudoers文件

vi /etc/sudoers

# postgres用户，所有sudo命令都不用密码
postgres   ALL=(ALL) NOPASSWD: ALL

或者

# postgres用户，特定sudo命令不需要密码
Defaults:postgres !requiretty
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-11.service, /usr/bin/systemctl start postgresql-11.service, /usr/bin/systemctl restart postgresql-11.service, /usr/bin/systemctl reload postgresql-11.service, /usr/bin/systemctl start repmgr11.service, /usr/bin/systemctl stop repmgr11.service

备注:

修改sudoers文件是必须的，否则standby节点在自动切换接入新primary节点时候，会报错重启postgres命令失败。

原因：postgres用户采用sudo 命令执行postgresql数据重启操作的时候需要密码验证，没有权限。

13.2 配置postgresql.conf（所有节点）

在文件内添加如下内容，

1	shared_preload_libraries = 'repmgr'

13.3 配置repmgr.conf文件（所有节点）

# 必须项
failover='automatic'
promote_command='/usr/pgsql-11/bin/repmgr standby promote -f /etc/repmgr/11/repmgr.conf --log-to-file'
follow_command='/usr/pgsql-11/bin/repmgr standby follow -f /etc/repmgr/11/repmgr.conf --log-to-file --upstream-node-id=%n'
# 可选项
# 注意：standby的priority值需要更改，因为默认是100，而primary使用的是默认值。这里设置standby的priority为60，其他standby的priority为40。另外，priority的值越大，成为primary的优先级就越高。
priority=70
connection_check_type=ping
reconnect_attempts=6
reconnect_interval=10
monitoring_history=yes
monitor_interval_secs=2
standby_disconnect_on_failover=true
primary_visibility_consensus=true
log_status_interval=60
service_start_command = 'sudo /usr/bin/systemctl start postgresql-11.service'
service_stop_command = 'sudo /usr/bin/systemctl stop postgresql-11.service'
service_restart_command = 'sudo /usr/bin/systemctl restart postgresql-11.service'
service_reload_command = 'sudo /usr/bin/systemctl reload postgresql-11.service'
repmgrd_service_start_command = 'sudo /usr/bin/systemctl start repmgr11.service'
repmgrd_service_stop_command = 'sudo /usr/bin/systemctl stop repmgr11.service'

13.3 启动repmgr服务（所有节点）

1 2	systemctl start repmgrd11.service systemctl enable repmgrd11.service

启动完成后，可以在primary或者standby节点查询集群的events，如下：

[root@pg3 .ssh]#  su - postgres -c "repmgr cluster event --event=repmgrd_start"
 Node ID | Name | Event         | OK | Timestamp           | Details
---------+------+---------------+----+---------------------+------------------------------------------------------
 3       | pg3  | repmgrd_start | t  | 2021-08-17 12:10:43 | monitoring connection to upstream node "pg1" (ID: 1)
 2       | pg2  | repmgrd_start | t  | 2021-08-17 12:08:54 | monitoring connection to upstream node "pg1" (ID: 1)
 1       | pg1  | repmgrd_start | t  | 2021-08-17 12:08:32 | monitoring cluster primary "pg1" (ID: 1)

13.4 故障模拟测试

13.4.1停止pg1节点（primary）的postgresql服务

1	systemctl stop postgresql-11.service

查看集群状态，gp1节点状态变为unreachable

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status        | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+---------------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | ? unreachable | ?        | default  | 100      |          | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | standby |   running     | ? pg1    | default  | 70       | 10       | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
 3  | pg3  | standby |   running     | ? pg1    | default  | 60       | 10       | host=pg3 user=repmgr dbname=repmgr connect_timeout=2

WARNING: following issues were detected
  - unable to connect to node "pg1" (ID: 1)
  - node "pg1" (ID: 1) is registered as an active primary but is unreachable
  - unable to connect to node "pg2" (ID: 2)'s upstream node "pg1" (ID: 1)
  - unable to determine if node "pg2" (ID: 2) is attached to its upstream node "pg1" (ID: 1)
  - unable to connect to node "pg3" (ID: 3)'s upstream node "pg1" (ID: 1)
  - unable to determine if node "pg3" (ID: 3) is attached to its upstream node "pg1" (ID: 1)

HINT: execute with --verbose option to see connection error messages

1分钟后，再次查看集群状态，pg2升级为primary,gp3自动连接到pg2,作为主节点。

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | primary | - failed  | ?        | default  | 100      |          | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | primary | * running |          | default  | 70       | 11       | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
 3  | pg3  | standby |   running | pg2      | default  | 60       | 10       | host=pg3 user=repmgr dbname=repmgr connect_timeout=2

WARNING: following issues were detected
  - unable to connect to node "pg1" (ID: 1)

HINT: execute with --verbose option to see connection error messages

13.4.2 旧primary节点恢复

当旧primary故障恢复后，并不会自动转换为standby，而是以primary角色独自运行，这时就需要将其重新加入到集群中。如下：

关闭postgresql服务

1	bash-4.2$ repmgr node service --action=stop --checkpoint

重新加入集群

bash-4.2$ repmgr -f /etc/repmgr/11/repmgr.conf -d "host=pg2 user=repmgr dbname=repmgr" node rejoin --force-rewind
NOTICE: pg_rewind execution required for this node to attach to rejoin target node 2
DETAIL: rejoin target server's timeline 11 forked off current database system timeline 10 before current recovery point 0/39000028
NOTICE: executing pg_rewind
DETAIL: pg_rewind command is "/usr/pgsql-11/bin/pg_rewind -D '/var/lib/pgsql/11/data' --source-server='host=pg2 user=repmgr dbname=repmgr connect_timeout=2'"
NOTICE: 0 files copied to /var/lib/pgsql/11/data
NOTICE: setting node 1's upstream to node 2
WARNING: unable to ping "host=pg1 user=repmgr dbname=repmgr connect_timeout=2"
DETAIL: PQping() returned "PQPING_NO_RESPONSE"
NOTICE: starting server using "sudo /usr/bin/systemctl start postgresql-11.service"
NOTICE: NODE REJOIN successful
DETAIL: node 1 is now attached to node 2

查看集群状态

bash-4.2$ repmgr cluster show
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+------------------------------------------------------
 1  | pg1  | standby |   running | pg2      | default  | 100      | 10       | host=pg1 user=repmgr dbname=repmgr connect_timeout=2
 2  | pg2  | primary | * running |          | default  | 70       | 11       | host=pg2 user=repmgr dbname=repmgr connect_timeout=2
 3  | pg3  | standby |   running | pg2      | default  | 60       | 11       | host=pg3 user=repmgr dbname=repmgr connect_timeout=2

到此，postgres高可用自动故障转移方案成功，配合VIP服务，可实现自动故障转移及服务高可用。

备注
如果不能重新加入，可以将旧primary强制(-F)转换为standby。或参考 13章节，手动清除，克隆，加入standby。

repmgr -h pg2 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone -F

systemctl start postgresql-12.service

repmgr -f /etc/repmgr/11/repmgr.conf standby register -F

问题汇总

错误1

从节点同步主节点时，报错连接失败

bash-4.2$ repmgr -h pg1 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone –dry-run
NOTICE: destination directory “/var/lib/pgsql/11/data” provided
INFO: connecting to source node
DETAIL: connection string is: host=pg1 user=repmgr dbname=repmgr
DETAIL: current installation size is 31 MB
INFO: “repmgr” extension is installed in database “repmgr”
INFO: replication slot usage not requested; no replication slot will be set up for this standby
INFO: parameter “max_wal_senders” set to 10
NOTICE: checking for available walsenders on the source node (2 required)
INFO: sufficient walsenders available on the source node
DETAIL: 2 required, 10 available
NOTICE: checking replication connections can be made to the source server (2 required)
ERROR: connection to database failed
DETAIL:
致命错误: 没有来自主机 “192.168.44.132”, 用户”repmgr”, SSL 关闭的复制连接的pg_hba.conf记录

ERROR: connection to database failed
DETAIL:
致命错误: 没有来自主机 “192.168.44.132”, 用户”repmgr”, SSL 关闭的复制连接的pg_hba.conf记录

ERROR: unable to establish necessary replication connections
HINT: check replication permissions on the source server

解决办法：

少加了前三条授权，添加上后问题解决。

local   replication   repmgr                              trust
host    replication   repmgr      127.0.0.1/32            trust
host    replication   repmgr      192.168.44.0/24          trust

local   repmgr        repmgr                              trust
host    repmgr        repmgr      127.0.0.1/32            trust
host    repmgr        repmgr      192.168.44.0/24         trust

错误2

同样同步时错误

bash-4.2$ repmgr -h pg1 -U repmgr -d repmgr -f /etc/repmgr/11/repmgr.conf standby clone
NOTICE: destination directory “/var/lib/pgsql/11/data” provided
INFO: connecting to source node
DETAIL: connection string is: host=pg1 user=repmgr dbname=repmgr
DETAIL: current installation size is 31 MB
INFO: replication slot usage not requested; no replication slot will be set up for this standby
NOTICE: checking for available walsenders on the source node (2 required)
NOTICE: checking replication connections can be made to the source server (2 required)
WARNING: data checksums are not enabled and “wal_log_hints” is “off”
DETAIL: pg_rewind requires “wal_log_hints” to be enabled
INFO: checking and correcting permissions on existing directory “/var/lib/pgsql/11/data”
NOTICE: starting backup (using pg_basebackup)…
HINT: this may take some time; consider using the -c/–fast-checkpoint option
INFO: executing:
pg_basebackup -l “repmgr base backup” -D /var/lib/pgsql/11/data -h pg1 -p 5432 -U repmgr -X stream
pg_basebackup: 无法得到来自服务器的预写日志终止位置: 错误: 无法打开文件 “./postgresql.conf.bak”: 权限不够
pg_basebackup: 删除数据目录 “/var/lib/pgsql/11/data” 的内容
ERROR: unable to take a base backup of the source server
HINT: data directory (“/var/lib/pgsql/11/data”) may need to be cleaned up manually

原因：

1 数据库数据目录下存在非postgres用户权限的文件，postgresql.conf.bak权限是root权限，授权为postgres用户权限即可

2 从节点的数据目录要保证为空。

错误3

# repmgr standby switchover 命令
bash-4.2$ repmgr standby switchover -f /etc/repmgr/11/repmgr.conf
NOTICE: checking switchover on node "pg2" (ID: 2) in --dry-run mode
WARNING: unable to connect to remote host "pg1" via SSH
ERROR: unable to connect via SSH to host "pg1", user ""

解决办法：

创建postgres 用户免密SSH登陆。

错误4

bash-4.2$ repmgr standby switchover
NOTICE: executing switchover on node "pg1" (ID: 1)
ERROR: unable to execute "repmgr" on "pg2"
HINT: check "pg_bindir" is set to the correct path in "repmgr.conf"; current value: (not set)

解决办法：

修改repmgr.conf配置文件，增加 pg_bindir参数，设置postgresql的bin路径。

1	pg_bindir='/usr/pgsql-11/bin'

错误5

standby升级为primary节点后，其他低权重的standby连接新primary使出现postgresql重启错误。

[2021-08-17 14:24:08] [INFO] local node 3 can attach to follow target node 2
[2021-08-17 14:24:08] [DETAIL] local node's recovery point: 0/3802CD38; follow target node's fork point: 0/3802CD38
[2021-08-17 14:24:08] [NOTICE] setting node 3's upstream to node 2
[2021-08-17 14:24:08] [NOTICE] restarting server using "sudo /usr/bin/systemctl restart postgresql-11.service"
[2021-08-17 14:24:10] [ERROR] unable to restart server
[2021-08-17 14:24:10] [NOTICE] STANDBY FOLLOW failed
[2021-08-17 14:24:10] [ERROR] connection to database failed

解决办法：配置postgres用户sudo命令免输入密码

vi /etc/sudoers

# postgres用户，所有sudo命令都不用密码
postgres   ALL=(ALL) NOPASSWD: ALL

或者

# postgres用户，特定sudo命令不需要密码
Defaults:postgres !requiretty
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-11.service, /usr/bin/systemctl start postgresql-11.service, /usr/bin/systemctl restart postgresql-11.service, /usr/bin/systemctl reload postgresql-11.service, /usr/bin/systemctl start repmgr11.service, /usr/bin/systemctl stop repmgr11.service