准备内存修改ssh 公钥互信准备ssh连接调过keys处理提前修改主机名创建主从创建管理用户备份恢复数据库到从节点修改从节点参数开始同步slave节点设置Slave上的relay_log配置mha主配置文件与检测mha perl配置vip确认mha 工具包中脚本默认路径及检测mha check 问题处理配置supervisor配置2binlog_backup_app1.shsupervisor服务配置supervisorctl使用帮助,supervisord日志查看下面是手动安装supervisord的操作mha_healthy_check脚本只读的说明只读(锁库锁表解锁)innodb_read_onlytransaction_read_only(tx_read_only )锁定解锁主从问题处理errno 1062 errno 1032断电导致数据同步问题处理方案确认数据一致12361236问题过程一致性检测MHA相关脚本/etc/masterha/app1.conf日志状态查看问题处理
主节点不涉及重启,在线修改+修改my.cnf,不需要重启。
从节点配置过程中需要重启,直接修改my.cnf,重启。
xxxxxxxxxx
101SET GLOBAL innodb_buffer_pool_size=402653184;
2SET GLOBAL innodb_buffer_pool_size=225769803776; #设置24G
3SELECT @@innodb_buffer_pool_size; #查询
4SELECT @@innodb_buffer_pool_chunk_size;
5
6修改配置文件的调整方法,修改my.cnf配置示例,缺省单位kb:
7innodb_buffer_pool_size = 2147483648 #设置2G
8innodb_buffer_pool_size = 225769803776 #设置24G
9innodb_buffer_pool_size = 2G #设置2G
10innodb_buffer_pool_size = 500M #设置500M
xxxxxxxxxx
1461#无交互生成ssh key
2ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa_mha -N "" -C "mysql mha use only"
3
4
5id_rsa
6-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
7
8mkdir /root/.ssh
9cat << EOFEOF > /root/.ssh/id_rsa_mha
10-----BEGIN RSA PRIVATE KEY-----
11MIIJKwIBAAKCAgEAuCSK4kBryryAlzYQBTxdjojC9kPMTFh2rFRJ56qMNzBsDYh2
12y/nypSmairoWDdAUgy6yh/MwXh2AFwrhTCSpCDBIPQaSPP6Dt03vdYu8cuQbD5wT
13BWVWDMmc1qJQulSeAhJu/d2c8iy7+9vnu+LAtZLi2sOahxwIunOcdWI4VGkGpE+1
14+3OPT4rZD1rO8DSMyachyc1bX6eX+Fj/5VQYgWqmZb1Ht2EAmEpUTutvTPmydjsO
15MKRbZIzD4sFRFHKk9PFbDnNe+rce6EJ4e38F2nsRjJXsbRKF1Z/lXthYtz7Hftfx
16IHqer1wg7RdNnQqzuHTl1mlKjVgf3XWqbu6rt53EGyUnI2AxJdE7t9qS13VLeJbK
172SZ3mchcF5N6CI5lp9owazqfZrG3rvJ5oRNYOBtimR/Lm3BtvNbdundgXGUtFlco
18D9JiJNDKNpBONJ8D3oaeHXj74Rblh8ta8U8SJS8DBbqEegdY2BWC4QZGvjwhi3Ra
19iQ2IX+22wwE4zq3PeKzdMaR+TMB19tWZwb2AdmFL5cwxt2Zoc3EYAB3S4srNejkT
20K76TG3+18me2WcUnTM9y8cwaZ1HN0PPg3N4/cRyNGtWQ8p0Sje3kZ0QLxw9BPTx3
213mnW37FOP14uTgBFf1rHY/GyTcCk/Q5Ox8g48Jmou9+GxEmDLsxhZNTQkM0CAwEA
22AQKCAgEApn9cGHD5RemlN5QJbkKDsVIVPkhVHB7Mxsq7vD2n5Gri+GWQTKSmchIp
23CxBwynJ/aZG6mVQZQq/X1n3e8lMfhyjo2Eztwzq56+8K+jSr4xI/KaEMTc+vu4Y8
24SpGe/qtt5x+1vdM4inxKSNMwN+EyL7stZXKBcXK2rafFKxnxJtmDyOPN/xsg1GIO
25G2UXQXeyaM62rVoCKX5G2aXXUwrz0WiWJd28T6HkwM5MZsNX8ri/VUnWbcLeZA8n
26OwtifWOfpV3p4CoPEhCrYIRhngBfUYn/dydEC+c4ncyiv1xM8vtPtHAaNSzJ+HML
27znRImBtKHxmLi4/OefjZbwtqXD1vguAbVjySypQzVDWWIZbyv4ZtjtWn0Wv86poL
282/P8v9mm3BztcmNdW8R9P9wKqEZdd5ZS5Ev7SWYL/3QqmxkjbM2nB6s5cvafRMNX
29obbP97EoHu0VRzYMRKZaHY97MP7H99BcH+sZ/4hLRVzLDCaMJvdyIAgHoouLg8xj
30s2xe+iI9VSSAPNk9C+sVQCv1bt2T9+JPf+hJvCtw4eJ9KMD1t8BVccH6AEtPzNxA
31Q8qTdFy7eQ47MvJFpYed6jmUqWYate76PxyL8RvrE7xyKuoAzy9bMpE74sbBYW9J
32Sn+RPk4QxBSR5PSbl5/PW83S+CungHzgsgDaIUNKKtsX05DMKCECggEBAOhI+6oX
33dVizKqd/+mtMUEzRkD9aNKlyImZhxspOxhxFl3Sh8/0oQ0aUcorYxdz0GOvt+e5k
3435vLolF3VErB6SwLiFwkNift72mUxfX4QghTLTto7xRyYzeiG7DFmnYBkjRA16ci
35i9vhZLx+ZpExOqaAsKFB7KeJcgAjdEZ/CaTJ9Sw4e9PiRw/ir0auaMkruOLUfRur
36IjTZzF1fCJYMrMmb9ZJivknh4ZWtN9/3a/EQB6iqSqcVn3odwtegx/Q6MTWk2sXX
37tri2RWc1i2kLbdFU2Vfy2gULZ62PnNpbpRGq/u/FLzmT0Zq0itFi39d62tJp0Q+5
38T3vJ6FwC6czrBwkCggEBAMrxTuhpw2JFqn4yB5cHN91zrUD1mX6P67ZDIKw89cCk
39JCHTrYLFQ334GzLhl7SZ/F/X+WyqyIav5U358c60d0VU+eSQCNMO4/PUEl197bXr
405mnkmsRX4nFgbaFvHd+qhmOfRHzdMeMZsiMALhZVN4i3CE5+k8mdPqNENMTrZcU1
410NfDH2bndLjD/zY+ziA5f+Q+WOZ4BZhJRPbpQbUDU0PRc0TwFp1mmnB9c16DWq20
42+ojuucrvtS2jyVpQdaOqiN3Y2TU0gLD2cg/uRvtbNIYq+PHy880w3vLEgcF363Gn
43yRch+n1n2nJvvLvuVpphKF/+OA3M+M8KHiXgufSmyKUCggEBAMpWyOJhjNa3O3Ct
44CnZtA4XnBsAo07bGtTB5+7A+7lOySzxN1zuu1uukZCKOsczG1O6vXgdL13ze5YYR
45bFuyjmrPoPiikoo1uVc/QTICD4Mw8TXPMnOdMNBFZdmWR/aDZ9gco73zWeEvdZC8
46ipvvRifD8hiGVJ+cGrIEMEU4ZC4nvmnGyhbIBR7RV906EA0tCOCOOGzhpepjzqGH
47JvOOFvWH5WKeoSO6n8bCHOn7nQtf60wBCwaAz2qQgilS5NrZ1gPklq3AOmqsW8iL
482+SlEARI5Qq3gTLjvGxzrS6IpkG9slUqbvi4ytz4Z/P9LP+HBRZFFteblKFAQCfm
49YcMe+AECggEBALoUFlD+INtrm09ImjL9uFiCJ1rHlu04E120ITIsVv1Cw6qMh6y/
50KQpIws+dj9dV1dUfl6Du2xy8DVm5EuGAEVht+Q98jcrP/W1lh01YGOzUlxmDNfFP
51z947F2qNEuwxVDXmJAsFWiesOOzkzl1F17ywbppibynO108mS+g/Grp/hLVBQPq+
52wQRlGg3n+mILtttV23e+lwqyK9zltYy+tssZMkMEjQfXu0XnkTAY6r0ZkxqZIZwU
53vKBTbF02JxdhV0tq+IZTVKht39oBle4FC5z1q/tU0/zRkasxSPRW+3fj63sdeasg
543WhOUdRTTe8RpM6Xr1fgGEbmNQxwCLFtd80CggEBAL9eYg1KTt1apIvu47BEInHf
55+F4BskqH9kyoI79S2CPtRW3eKDZkkM4wImqZdesnAgJnvqwlSoF/T6Breb3FjGew
56nhve3FZga3pXN2KVs9s93eqXHQgcdBqTFIpDEGnK9Oxsr+o0FVdIiXmJXq3z+cBf
57P1KGSSEdg+d5tfvsQIXbSHmCO0aYg3fx12psNCzrgQcWIpkfYhgeoXgxJ+gD4YqO
58sL3k5mQyX3aMAAXt0KoEPLeSTLW3ZG2cPCRRujUqsgcqlCIYfD5yfbPosa9IDI9A
59vuoSEJ51/aFwxtfPfLweEImc+76DSPDFu9HhGJ7wavZYs4pNhbwOMRXrDykDvXo=
60-----END RSA PRIVATE KEY-----
61EOFEOF
62
63chmod 600 /root/.ssh/id_rsa_mha
64
65-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
66
67mkdir /root/.ssh
68if [ -e "/root/.ssh/authorized_keys" ] ; then echo "File has existed ,do nothing " ; else touch /root/.ssh/authorized_keys ; echo "File isn't exist , touch it" ; fi
69if [ -e "/root/.ssh/authorized_keys" ] ; then echo "#" >> /root/.ssh/authorized_keys ; sed -i -e "/PYje4zUtbYRyp4DxCROal/s/^/#/g" -e '$a''ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC4JIriQGvKvICXNhAFPF2OiML2Q8xMWHasVEnnqow3MGwNiHbL+fKlKZqKuhYN0BSDLrKH8zBeHYAXCuFMJKkIMEg9BpI8/oO3Te91i7xy5BsPnBMFZVYMyZzWolC6VJ4CEm793ZzyLLv72+e74sC1kuLaw5qHHAi6c5x1YjhUaQakT7X7c49PitkPWs7wNIzJpyHJzVtfp5f4WP/lVBiBaqZlvUe3YQCYSlRO629M+bJ2Ow4wpFtkjMPiwVEUcqT08VsOc176tx7oQnh7fwXaexGMlextEoXVn+Ve2Fi3Psd+1/Egep6vXCDtF02dCrO4dOXWaUqNWB/ddapu7qu3ncQbJScjYDEl0Tu32pLXdUt4lsrZJneZyFwXk3oIjmWn2jBrOp9msbeu8nmhE1g4G2KZH8ubcG281t26d2BcZS0WVygP0mIk0Mo2kE40nwPehp4dePvhFuWHy1rxTxIlLwMFuoR6B1jYFYLhBka+PCGLdFqJDYhf7bbDATjOrc94rN0xpH5MwHX21ZnBvYB2YUvlzDG3ZmhzcRgAHdLiys16ORMrvpMbf7XyZ7ZZxSdMz3LxzBpnUc3Q8+Dc3j9xHI0a1ZDynRKN7eRnRAvHD0E9PHfeadbfsU4/Xi5OAEV/Wsdj8bJNwKT9Dk7HyDjwmai734bESYMuzGFk1NCQzQ== mysql mha use only' /root/.ssh/authorized_keys; fi
70chmod 600 /root/.ssh/authorized_keys
71history -r
72
73-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
74
75
76
77ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC4JIriQGvKvICXNhAFPF2OiML2Q8xMWHasVEnnqow3MGwNiHbL+fKlKZqKuhYN0BSDLrKH8zBeHYAXCuFMJKkIMEg9BpI8/oO3Te91i7xy5BsPnBMFZVYMyZzWolC6VJ4CEm793ZzyLLv72+e74sC1kuLaw5qHHAi6c5x1YjhUaQakT7X7c49PitkPWs7wNIzJpyHJzVtfp5f4WP/lVBiBaqZlvUe3YQCYSlRO629M+bJ2Ow4wpFtkjMPiwVEUcqT08VsOc176tx7oQnh7fwXaexGMlextEoXVn+Ve2Fi3Psd+1/Egep6vXCDtF02dCrO4dOXWaUqNWB/ddapu7qu3ncQbJScjYDEl0Tu32pLXdUt4lsrZJneZyFwXk3oIjmWn2jBrOp9msbeu8nmhE1g4G2KZH8ubcG281t26d2BcZS0WVygP0mIk0Mo2kE40nwPehp4dePvhFuWHy1rxTxIlLwMFuoR6B1jYFYLhBka+PCGLdFqJDYhf7bbDATjOrc94rN0xpH5MwHX21ZnBvYB2YUvlzDG3ZmhzcRgAHdLiys16ORMrvpMbf7XyZ7ZZxSdMz3LxzBpnUc3Q8+Dc3j9xHI0a1ZDynRKN7eRnRAvHD0E9PHfeadbfsU4/Xi5OAEV/Wsdj8bJNwKT9Dk7HyDjwmai734bESYMuzGFk1NCQzQ== mysql mha use only
78
79
80ssh-copy-id -i .ssh/id_rsa_mha root@10.10.101.53
81ssh-copy-id -i .ssh/id_rsa_mha root@10.10.101.54
82ssh-copy-id -i .ssh/id_rsa_mha root@10.10.101.41
83
84ssh-copy-id -i .ssh/id_rsa_mha root@20.99.46.36
85ssh-copy-id -i .ssh/id_rsa_mha root@10.10.101.54
86ssh-copy-id -i .ssh/id_rsa_mha root@10.10.101.41
87
88# Host表示一个段,Hostname可以是域名、ip
89# StrictHostKeyChecking no 不需要验证对方指纹
90# IdentityFile /root/.ssh/id_rsa_mha # 完整路径,是必须的,
91# 如果masterha_check_ssh检测不通过,可以用ssh hostname/ip -v来显示详细。
92
93cat >> /root/.ssh/config << EOF
94Host *
95 Hostname %h
96 User root
97 Port 22
98 StrictHostKeyChecking no
99 IdentityFile /root/.ssh/id_rsa_mha
100EOF
101history -r
102chmod 600 /root/.ssh/config
103
104scp /root/.ssh/config m:/root/.ssh/ ; \
105scp /root/.ssh/config 10.10.101.54:/root/.ssh/ ; \
106scp /root/.ssh/id_rsa_mha 10.10.101.53:/root/.ssh/ ; \
107scp /root/.ssh/id_rsa_mha 10.10.101.54:/root/.ssh/ ; \
108scp /etc/hosts 10.10.101.53:/etc/ ; \
109scp /etc/hosts 10.10.101.54:/etc/
110
111
112scp /root/.ssh/config 10.10.101.53:/root/.ssh/ ; \
113scp /root/.ssh/config 10.10.101.54:/root/.ssh/ ; \
114scp /root/.ssh/id_rsa_mha 10.10.101.53:/root/.ssh/ ; \
115scp /root/.ssh/id_rsa_mha 10.10.101.54:/root/.ssh/ ; \
116scp /etc/hosts 10.10.101.53:/etc/ ; \
117scp /etc/hosts 10.10.101.54:/etc/
118
119
120scp /root/.ssh/config 10.10.101.41:/root/.ssh/ ; \
121scp /root/.ssh/id_rsa_mha 10.10.101.41:/root/.ssh/ ; \
122scp /etc/hosts 10.10.101.41:/etc/
123
124
125
126
127scp .ssh/id_rsa_mha .ssh/config 10.10.101.41:/root/.ssh/
128scp .ssh/id_rsa_mha .ssh/config 10.10.101.53:/root/.ssh/
129scp .ssh/id_rsa_mha .ssh/config 10.10.101.54:/root/.ssh/
130
131
132
133
134scp .ssh/id_rsa_mha .ssh/config 10.10.101.41:/root/.ssh/
135scp .ssh/id_rsa_mha .ssh/config 10.10.101.53:/root/.ssh/
136scp .ssh/id_rsa_mha .ssh/config 10.10.101.54:/root/.ssh/
137
138
139
140
141scp /root/.ssh/config 10.10.101.53:/root/.ssh/
142scp /root/.ssh/config 10.10.101.54:/root/.ssh/
143
144
145scp /etc/hosts 10.10.101.53:/etc/
146scp /etc/hosts 10.10.101.54:/etc/
xxxxxxxxxx
11/root/.ssh/id_rsa (0x56049f23c1a0), explicit #
因为修改主机名会导致服务不能正常停止,可参考管理文档shutdown.md
xxxxxxxxxx
341# 如果/etc/masterha/app1.conf中的节点参数hostname使用了ip,则不需要修改/etc/hosts;如果其中使用了主机名称,则需要添加/etc/hosts
2
3cat >> /etc/hosts << EOF
4172.16.8.6 mhaslave
5172.16.8.5 mhamaster
6172.23.9.47 mhamanager
7EOF
8cat /etc/hosts
9
10## 先在管理节点上添加,即可同步
11cat >> /root/.ssh/config << EOF
12Host *
13Hostname %h
14User root
15Port 22
16StrictHostKeyChecking no
17IdentityFile /root/.ssh/id_rsa_mha
18EOF
19history -r
20
21
22
23# 远程修改主机名
24ssh mhaslave "hostnamectl set-hostname mhaslave"
25ssh mhamaster "hostnamectl set-hostname mhamaster"
26ssh mhamanager "hostnamectl set-hostname mhamanager"
27
28# 本地修改
29
30hostnamectl set-hostname mhaslave
31hostnamectl set-hostname mhamaster
32hostnamectl set-hostname mhamanager
33
34
参考Percona\innobackupex.md
过程:
xxxxxxxxxx
61use mysql;
2create user 'rpl'@'172.16.16.%' identified by 'Rpl@mysql46';
3grant replication slave on *.* to 'rpl'@'172.16.16.%';
4flush privileges;
5
6
xxxxxxxxxx
111另外创建一个管理用户,类似于root,但登陆权限设置在局域网网段,区分repl用户
2create user 'rpl'@'10.10.101.%' identified by 'MySQL_10.10.Repl';
3grant replication slave on *.* to 'rpl'@'10.10.101.%';
4flush privileges;
5reset master;
6
7###### 临时用
8create user 'rpl'@'20.99.46.%' identified by 'Rpl@my123';
9grant replication slave on *.* to 'rpl'@'20.99.46.%';
10flush privileges;
11reset master;
xxxxxxxxxx
31reset master;
2reset slave ;
3reset slave all;
x
1create user 'rpl'@'172.16.8.%' identified by 'yourmysqlpassword';
2grant replication slave on *.* to 'rpl'@'172.16.8.%';
3flush privileges;
4reset master;
5
6create user 'rpl'@'100.64.10.%' identified by 'Rpl_pass';
7grant replication slave on *.* to 'rpl'@'100.64.10.%';
8flush privileges;
9reset master;
10
11
xxxxxxxxxx
11
xtrabackup
xxxxxxxxxx
21scp -r /data/incr_mysqlbackup/mysql/WEEK_44 mhaslave:/data/
2
xxxxxxxxxx
121# 从percona备份中找到binlog_position 中的 filename 'bin.000009', position '353722478'
2binlog_pos = filename 'bin.000009', position '353722478', GTID of the last change '15a672e4-00c9-11eb-b410-fa163e97fbaa:1-1073652 11 ,
3
4
5
6
7# xtrabackup_info 中有log信息点,详见percana备份记录文件-percona/innobackupxe.md
8change master to master_host='172.16.16.44', master_user='rpl', master_password='Rpl@mysql46',master_port=36436,master_log_file='bin.000009',master_log_pos=353722478;
9start slave;
10show slave status \G;
11
12
从MHA Failover
的过程中可以了解到,MHA Manager
在恢复(补齐)其他Slave数据时会用到relay-log
,因此这些relay-log
需要被保留。
而默认情况下,SQL线程在回放完毕后,MySQL会主动删除relay-log
,需要禁用
该功能,确保relay-log
不被自动删除。
在所有Slave节点
中配置如下参数,然后重启mysql
服务即可。
xxxxxxxxxx
121# 所有的Slave节点[mysqld]
2# 关闭relay-log主动删除的功能
3relay_log_purge = 0
4
5
6show global variables like '%relay_log_purge%';
7+-----------------+-------+
8| Variable_name | Value |
9+-----------------+-------+
10| relay_log_purge | ON |
11+-----------------+-------+
12set global relay_log_purge=off;
需要在节点服务器执行
xxxxxxxxxx
81ln -s /usr/local/mysql/bin/mysqlbinlog /usr/bin/mysqlbinlog
2ln -s /usr/local/mysql/bin/mysql /usr/bin/mysql
3ln -s /usr/local/bin/ifconfig /sbin/ifconfig
4
5cp master_ip_failover /usr/local/bin/master_ip_failover
6chmod a+x /usr/local/bin/master_ip_failover
7
8#ifconfig是为了不去修改perl脚本。而/sbin又是/usr/sbin的链接,实际到了/usr/sbin/ifconfig
xxxxxxxxxx
31# 配置vip
2ifconfig eth0:88 172.16.16.200/24
3ip a
xxxxxxxxxx
21/usr/local/bin/masterha_check_ssh --conf=/etc/masterha/app1.conf
2/usr/local/bin/masterha_check_repl --conf=/etc/masterha/app1.conf
xxxxxxxxxx
131[root@mhamanager mha]# rpm -ql mha4mysql-manager
2/usr/bin/masterha_check_repl
3/usr/bin/masterha_check_ssh
4/usr/bin/masterha_check_status
5/usr/bin/masterha_conf_host
6/usr/bin/masterha_manager
7/usr/bin/masterha_master_monitor
8/usr/bin/masterha_master_switch
9/usr/bin/masterha_secondary_check
10/usr/bin/masterha_stop
11...
12帮助文件
13...
x
1[server default]
2# 这两个参数需要根据不同的集群进行修改
3manager_workdir=/var/log/masterha/app1
4manager_log=/var/log/masterha/app1/manager.log
5# 按照master服务器存放binlog的实际路径进行修改,主要为了让MHA拉取binlog
6master_binlog_dir=/data/mysql_data/
7# 设置自动failover的脚本
8master_ip_failover_script= /usr/local/bin/master_ip_failover
9#master_ip_failover_script= /usr/local/bin/master_ip_failover --interface=eth0 --key=88 --vip=10.10.101.55/24 --gateway=10.10.101.254 如果使用自定义参数,需要在脚本中提前定义。
10# 设置手动切换时候的脚本 (供(masterha_master_switch使用)
11master_ip_online_change_script=/usr/local/bin/master_ip_failover
12log_level=debug
13# 监控的用户
14user=root
15# 监控用户的密码
16password=yourmysqlpassword
17# 监控主库的时间间隔,默认是3秒,尝试三次没有回应的时候自动进行railover
18#ping_interval=3
19ping_interval=999999
20# 检测方式是insert,MHA-0.56开始支持insert
21# 会在Master中生成一个 infra 数据库
22ping_type=SELECT
23# 设置远端mysql在发生切换时binlog的保存位置
24remote_workdir=/tmp
25# 复制用的密码
26repl_password=yourrepomysqlpassword
27# 复制的用户
28repl_user=rpl
29# 告警脚本,可自行修改,这里没有使用
30#report_script=/usr/local/send_report
31# 通过从机进行二次探测的脚本, IP地址按照实际的情况进行修改
32secondary_check_script=/usr/local/bin/masterha_secondary_check -s slave1 --user=root --master_host=master --master_port=37389 --port=22
33# 设置故障发生后关闭故障主机的脚本(主要作用是关闭主机防止发生脑裂,这里没有使用,类似Fence功能)
34#shutdown_script="/usr/local/bin/power_manager --command=stopssh2 --host=test-1 --ssh_user=root"
35# 定义ssh的用户
36ssh_user=root
37
38[server1]
39# 这个hostname也可以配置成IP地址,同 ip 参数一样
40# 如果这里写名字,需要DNS配合,或者使用 /etc/hosts
41hostname=mhamaster
42ip=10.10.101.53
43port=37389
44ssh_port=22
45# candidate_master参数的意思为:设置为候选Master,如果发生主从切换,该主机会被提升为Master,即使这个服务器上的数据不是最新的(会用relay-log补全)
46candidate_master=1
47[server2]
48hostname=mhaslave
49ip=10.10.101.54
50port=37389
51ssh_port=22
52candidate_master=1
53# check_repl_delay参数的意思为:默认情况下如果一个slave落后master 100M的relay logs的话,MHA将不会选择该slave作为一个新的master;
54# 因为对于这个slave的恢复需要花费很长时间;
55# 通过设置check_repl_delay=0,MHA触发切换在选择一个新的master的时候将会忽略复制延时;
56# 这个参数对于设置了candidate_master=1的主机非常有用,因为这个候选主在切换的过程中一定是新的master
57check_repl_delay=0
58
59[binlog1]
60no_master=1
61hostname=mhamaster
62master_binlog_dir=/data/mysql_data/
63# 将binlog指定到服务时,数据目录会复制一份到此服务器路径。或者指定任意服务器。
xxxxxxxxxx
91
2Thu Oct 15 21:53:33 2020 - [info] /usr/local/bin/master_ip_failover --interface=eth0 --key=88 --vip=20.99.46.60/24 --gateway=20.99.46.1 --command=status --ssh_user=root --orig_master_host=20.99.46.37 --orig_master_ip=20.99.46.37 --orig_master_port=3306
3Unknown option: interface
4Unknown option: key
5Unknown option: vip
6Unknown option: gateway
7
8此处的报错:需要将参数去掉。参考需要在 /usr/local/bin/master_ip_failover中自定义添加,以让perl脚本识别。
9
/etc/supervisor.conf
mha启动调整为通过supervisord监控实现。
注意:将引用/usr/bin/masterha_manager,注意修改路径,否则启动不了。
xxxxxxxxxx
241[unix_http_server]
2file=/var/run/supervisor.sock ; (the path to the socket file)
3[supervisord]
4logfile=/var/log/supervisord.log ; (main log file;default $CWD/supervisord.log)
5logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB)
6logfile_backups=10 ; (num of main logfile rotation backups;default 10)
7loglevel=info ; (log level;default info; others: debug,warn,trace)
8pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
9nodaemon=false ; (start in foreground if true;default false)
10minfds=1024 ; (min. avail startup file descriptors;default 1024)
11minprocs=200 ; (min. avail process descriptors;default 200)
12[rpcinterface:supervisor]
13supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
14[supervisorctl]
15serverurl=unix:///var/run/supervisor.sock ; use a unix:// URL for a unix socket
16[program:mha_manager]
17command=nohup /usr/local/bin/masterha_manager --conf=/etc/masterha/app1.conf --remove_dead_master_conf --ignore_last_failover process_name=%(program_name)s
18autostart=true
19autorestart=false
20redirect_stderr=true
21stdout_logfile=/var/log/masterha/app1/manager.log
22stdout_logfile_maxbytes=10MB
23stdout_logfile_backups=5
24user=root
xxxxxxxxxx
381[unix_http_server]
2file=/var/run/supervisor.sock ; (the path to the socket file)
3[supervisord]
4logfile=/var/log/supervisord.log ; (main log file;default $CWD/supervisord.log)
5logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB)
6logfile_backups=10 ; (num of main logfile rotation backups;default 10)
7loglevel=info ; (log level;default info; others: debug,warn,trace)
8pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
9nodaemon=false ; (start in foreground if true;default false)
10minfds=1024 ; (min. avail startup file descriptors;default 1024)
11minprocs=200 ; (min. avail process descriptors;default 200)
12[rpcinterface:supervisor]
13supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
14[supervisorctl]
15serverurl=unix:///var/run/supervisor.sock ; use a unix:// URL for a unix socket
16
17
18[program:binlog_backup_app1]
19command=/root/binlog_backup_app1.sh
20autostart=true
21autorestart=true
22redirect_stderr=true
23stdout_logfile=/data/binlog/app1/dump.log
24stdout_logfile_maxbytes=10MB
25stdout_logfile_backups=5
26user=root
27
28[program:mha_manager_app1]
29command=nohup /usr/local/bin/masterha_manager --conf=/etc/masterha/app1.conf --remove_dead_master_conf --ignore_last_failover process_name=%(program_name)s
30autostart=true
31autorestart=false
32redirect_stderr=true
33stdout_logfile=/var/log/masterha/app1/manager.log
34stdout_logfile_maxbytes=10MB
35stdout_logfile_backups=5
36user=root
37(END)
38
x
1
2
3BACKUP_BIN=/usr/bin/mysqlbinlog
4LOCAL_BACKUP_DIR=/data/binlog/app1
5BACKUP_LOG=/data/binlog/app1/dump.log
6REMOTE_HOST=master
7REMOTE_PORT=3306
8SERVER_ID=324
9REMOTE_USER=root
10REMOTE_PASS=yourmysqlpassword
11#time to wait before reconnecting after failure
12SLEEP_SECONDS=10
13count=1
14
15## 检查备份目录
16if [[ ! -d ${LOCAL_BACKUP_DIR} ]];then
17 mkdir -p ${LOCAL_BACKUP_DIR}
18fi
19
20cd ${LOCAL_BACKUP_DIR}
21
22## 运行while循环,连接断开后等待指定时间,重新连接
23while [ $count -lt 50 ]
24
25do
26
27 FIRST_BINLOG=$(mysql --host=${REMOTE_HOST} --user=${REMOTE_USER} --password=${REMOTE_PASS} -e 'show binary logs'|grep -v "Log_name"|awk '{print $1}'|head -n 1)
28
29 if [[ ! -d ${LOCAL_BACKUP_DIR} ]];then
30 mkdir -p ${LOCAL_BACKUP_DIR}
31 fi
32
33 cd ${LOCAL_BACKUP_DIR}
34
35 if [ `ls -A "${LOCAL_BACKUP_DIR}" |grep -v "dump.log" |wc -l` -eq 0 ];then
36 LAST_FILE=${FIRST_BINLOG} ##如果备份目录中没有备份文件则 LAST_FILE=FIRST_FILE
37 else
38 LAST_FILE=`ls -l ${LOCAL_BACKUP_DIR} |grep -v "dump.log" |tail -n 1 |awk '{print $9}'` ##last_file取序列最大的binlog文件
39 fi
40
41 date >> ${BACKUP_LOG}
42 echo "${BACKUP_BIN} -R --raw --host=${REMOTE_HOST} --user=${REMOTE_USER} --password=${REMOTE_PASS} ${LAST_FILE} --stop-never --stop-never-slave-server-id=${SERVER_ID}" >> ${BACKUP_LOG}
43
44
45 ${BACKUP_BIN} -R --raw --host=${REMOTE_HOST} --user=${REMOTE_USER} --password=${REMOTE_PASS} ${LAST_FILE} --stop-never --stop-never-slave-server-id=${SERVER_ID}
46 echo "`date +"%Y/%m/%d %H:%M:%S"` mysqlbinlog停止,返回代码:$?" >> ${BACKUP_LOG}
47 echo "${SLEEP_SECONDS}秒后再次连接并继续备份" >> ${BACKUP_LOG}
48 sleep ${SLEEP_SECONDS}
49 count=`expr $count + 1`
50
51done
52
53echo "exit............." >> ${BACKUP_LOG}
54
xxxxxxxxxx
81# systemctl enable --now supervisord
2# systemctl status supervisord
3
4● supervisord.service - Process Monitoring and Control Daemon
5 Loaded: loaded (/usr/lib/systemd/system/supervisord.service; disabled; vendor preset: disabled)
6 Active: inactive (dead)
7[root@mhamanamger ~]# vi /usr/lib/systemd/system/supervisord.service
8### systemctl enable supervisord
mha_manager的工作是实现主从的切换,但是完成mysql的主从切换之后,进程mha_manager自动退出,将不再监控。如果只是2个节点(1主一从)倒是也没必要留着这个进程,但如果用在3个及以上节点时,再次的切换主从将无法实现,故引入了supervisord服务来使进程mha_manager重生。注:supervisord是一个监控nohub的很好的方案,不仅可以用在mha_manager的后台执行上。
supervisord
是服务进程
supervisordctl
是进程管理命令(执行后,exit退出),就像systemd一样,可以将加入到/etc/supervisord.conf 主配置文件中的进程进行实时的start、stop、restart、status操作。管理方式 是操作+服务名
xxxxxxxxxx
261[root@mha ~]# /usr/bin/python /usr/bin/supervisord -c /etc/supervisord.conf
2#服务进程,不过,如果是rpm安装的话,应该有相应的systemd,而不用这个方式。
3
4Unlinking stale socket /var/run/supervisor.sock
5[root@mha ~]# supervisorctl -c /etc/supervisord.conf
6mha_manager RUNNING pid 192544, uptime 0:00:28
7supervisor> status
8mha_manager RUNNING pid 192544, uptime 0:00:33
9supervisor> exit
10[root@mha ~]# supervisorctl -c /etc/supervisord.conf # 进入管理状态,
11mha_manager EXITED Jun 26 05:26 PM
12supervisor> start mha_manager
13mha_manager: started
14supervisor> exit
15[root@mha ~]# supervisorctl -c /etc/supervisord.conf # 进入管理状态
16mha_manager RUNNING pid 194459, uptime 0:08:30
17supervisor> restart mha_manager
18mha_manager: stopped
19mha_manager: started
20supervisor> restart mha_manager
21mha_manager: ERROR (not running)
22mha_manager: started
23supervisor>
24supervisor> status
25mha_manager RUNNING pid 196263, uptime 0:00:14
26supervisor> exit
supervisord只是一个脚本后台自动化监控系统,本质上是调用mha4mysql工具中的/usr/bin/masterha_manager,所以真正的日志是/usr/bin/masterha_manager输出到日志中的。
即为配置文件中的
xxxxxxxxxx
11stdout_logfile=/var/log/masterha/app1/manager.log
由于mha_health_check需要检测此日志文件,所有一定要查看日志输出
xxxxxxxxxx
41Sun Nov 1 11:52:06 2020 - [debug] Set short wait_timeout on master: 1999998 seconds
2Sun Nov 1 11:52:06 2020 - [debug] Trying to get advisory lock..
3Sun Nov 1 11:52:06 2020 - [info] Ping(SELECT) succeeded, waiting until MySQL doesn't respond..
4
xxxxxxxxxx
101[Unit]
2Description=Process Monitoring and Control Daemon
3After=rc-local.service nss-user-lookup.target
4
5[Service]
6Type=forking
7ExecStart=/usr/bin/supervisord -c /etc/supervisord.conf
8
9[Install]
10WantedBy=multi-user.target
x1[root@mhamanamger ~]# supervisord -c /etc/supervisord.conf 启动后台服务
2[root@mhamanamger ~]# supervisorctl -c /etc/supervisord.conf
3mha_manager FATAL Exited too quickly (process log may have details)
4supervisor> start
5Error: start requires a process name
6start <name> Start a process
7start <gname>:* Start all processes in a group
8start <name> <name> Start multiple processes or groups
9start all Start all processes
10supervisor> start all
11mha_manager: ERROR (spawn error) # 推测由于命令中的路径不存在,
12supervisor> exit
13[root@mhamanamger ~]# whereis masterha_manager
14masterha_manager: /usr/bin/masterha_manager /usr/share/man/man1/masterha_manager.1.gz
15[root@mhamanamger ~]# ln -s /usr/bin/masterha_manager /usr/local/bin/masterha_manager 创建一个快捷方式
16[root@mhamanamger ~]# supervisorctl -c /etc/supervisord.conf
17mha_manager FATAL Exited too quickly (process log may have details)
18supervisor> start all
19mha_manager: started
20supervisor>
21
问题处理
[root@mhamanamger ~]# supervisorctl -c /etc/supervisord.conf 由于是默认路径,可以直接用命令不加参数 unix:///var/run/supervisor.sock no such file # 是因为服务未启动,服务启动与监控的进程是否有效是无关的。 supervisor> status unix:///var/run/supervisor.sock no such file
检查项目涉及supervisor/mha状态。
修改脚本 中的master节点名称,即可。
x10 23 * * * /usr/bin/sh /root/mha_health_check.sh > /dev/null 2>&1
2
3(crontab -l ; echo "# Only Use for Check MHA status")| crontab -
4(crontab -l ; echo "0 23 * * * /usr/bin/sh /root/mha_health_check.sh > /dev/null 2>&1")| crontab -
5crontab -l
/root/mha_health_check.sh
SSH_USER="root" SSH_PORT="22" MYSQL_USER="root" MYSQL_PASSWORD="yourmysqlpassword" # root密码 MYSQL_PORT="3506" MASTER_HOST="master" #hostname mha_conf="app1"
x1#!/bin/bash
2
3###############################################
4# 1. 检查MHA Manager 托管服务
5# 2. 检查当前MHA 集群状态
6# 3. 检查主从数据库目标库上数据同步状况(默认不开启,后续完善)
7###############################################
8##
9## Filename : MHA_health_check.sh
10## Date : 2018-07
11## Author : xuty
12## Desc : mha health check
13## Version : MySQL 5.7.20
14
15
16source /etc/profile
17
18##
19## ========== global var ============
20##
21
22LOG="/root/mha_health.log"
23DATE=`date +%Y%m%d_%H%M%S`
24SSH_USER="root"
25SSH_PORT="22"
26MYSQL_USER="root"
27MYSQL_PASSWORD="yourmysqlpassword"
28MYSQL_PORT="3506"
29MASTER_HOST="master"
30mha_conf="app1"
31
32##
33## ========== function =============
34##
35
36function supervisord_check() {
37supervisord_status=`ps aux |grep supervisord |grep -v grep |wc -l`
38if [[ $supervisord_status -eq 1 ]];then
39echo "*** supervisord_check : [Success]" >> $LOG
40else
41echo "*** supervisord_check : [Failed]" >> $LOG
42exit 1
43fi
44write_line
45}
46
47function masterha_check() {
48managerlog=`tail -n 1 /var/log/masterha/${mha_conf}/manager.log|grep succeeded|wc -l`
49if [[ $managerlog -eq 1 ]];then
50echo "*** masterha_${mha_conf}_check : [Success]" >> $LOG
51else
52echo "*** masterha_${mha_conf}_check : [Failed]" >> $LOG
53exit 1
54fi
55
56write_line
57
58}
59
60function replication_status_check() {
61
62ssh -p$SSH_PORT $MASTER_HOST "mysqlrplshow --master=root:${MYSQL_PASSWORD}@${MASTER_HOST}:${MYSQL_PORT} --discover-slaves-login=${MYSQL_USER}:${MYSQL_PASSWORD} --verbose" |tail -n 6 >> $LOG
63
64write_line
65}
66
67
68function replication_data_check() {
69
70ssh -P$SSH_PORT $MASTER_HOST "/usr/local/mysql/bin/mysql -u$MYSQL_USER -p$MYSQL_PASSWORD -P$MYSQL_PORT -e 'drop table IF EXISTS percona.checksums'"
71
72ssh -P$SSH_PORT $MASTER_HOST "pt-table-checksum --nocheck-replication-filters --no-check-binlog-format --databases=$MYSQL_CHECK_DB h=$MASTER_USER,u=$MYSQL_USER,p=$MYSQL_PASSWORD -S $MYSQL_SOCKET"
73
74if [[ $? -ne 0 && $? -ne 16 ]];then
75echo "*** pt-table-checksum run Failed,Please Check!" >> $LOG
76exit 1
77fi
78data_diff=`ssh -P$SSH_PORT $SLAVE_HOST "/usr/local/mysql/bin/mysql -u$MYSQL_USER -p$MYSQL_PASSWORD -P$MYSQL_PORT -e 'select db,tbl,ts from percona.checksums where master_cnt <> this_cnt OR master_crc <> this_crc OR
79ISNULL(master_crc) <> ISNULL(this_crc)'"`
80
81if [[ $? -ne 0 ]];then
82echo "*** data_diff get Failed,Please Check! " >> $LOG
83exit 1
84fi
85
86if [[ `echo "data_diff"|grep ERROR |wc -l` -gt 1 ]];then
87echo "*** replcation_data_check : [Failed] " >> $LOG
88echo " $data_diff " >> $LOG
89exit 1;
90fi
91
92if [[ `echo "$data_diff" |wc -l` -gt 1 ]];then
93echo "*** replcation_data_check : [Failed] " >> $LOG
94echo "*** Failed Failed Failed! ! ! PLease Check replication_data" >> $LOG
95echo " $data_diff " >> $LOG
96exit 1
97else
98echo "*** replcation_data_check : [Success]" >> $LOG
99echo "*** Check Complete OK , DATE is $DATE" >> $LOG
100fi
101}
102
103
104function start_log(){
105
106echo -e "\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> $DATE Begin Check <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" >> $LOG
107write_line
108}
109
110function write_line(){
111echo "-------------------------------------------" >> $LOG
112}
113
114
115################# main #################
116
117#开始检查日志记录
118start_log
119
120#supervisord检查
121supervisord_check
122
123#masterha检查
124masterha_check
125
126#主从状态检查
127replication_status_check
128
129#主从数据一致性检查
130#replication_data_check
https://www.linuxidc.com/Linux/2017-09/146661.htm
在主从中,我们设置从库只读: 1、如果只是打开read_only=on 在从库上启用确保只接受来自主库的更新,不接受来自客户端的更新。但是不能保证连到从库具有super权限误写数据。 2、mysql 5.7.8开始支持super_read_only参数,如果super_read_only=on,在从库直接受来自主库的更新,连接到从库的含有super用户权限也不能更新,确保从库不被写如异常数据。
如果设置了super_read_only =on ,那么默认的read_only 也设置为on,如果再设置super_read_only =off,此时read_only 还是on,如果主从角色发生变化注意read_only也设置为off。
日常维护: 主库可读写 从库只读
从库只读的两种选择:以下参数可放入my.cnf
1、read_only = on; 2、read_only = on; super_read_only = on;
区别是如果从库也设置了super_read_only=on,可很好确保了从库不被误写数据,即便是变更时候,不小心在从库执行了super权限用户语句也会失败,不会造成主从不一致。
那么我们的从库是否一定要设置为super_read_only = on 吗?
从库重启后维护: 如果传统主从架构,遇到宿主机宕机虚拟机漂移后启动数据库,还是从库本身是物理机异常宕机,还是从库计划内升级系统补丁等操作重启后建议显示把从库设置为super_read_only=on只读。 如果是分布式,建议启动管理agent由管理的agent负责把从节点加入主库并且把从库设置为read_only=on。
其他问题: 如果使用nbu备份,备份策略是从库,由于备份用户需要super权限,需要在备份时候先把super_read_only=off备份结束后设置super_read_only=on。
xxxxxxxxxx
311mysql> set global read_only=0;
2Query OK, 0 rows affected (0.00 sec)
3
4mysql> show global variables like "%read_only%";
5+-----------------------+-------+
6| Variable_name | Value |
7+-----------------------+-------+
8| innodb_read_only | OFF |
9| read_only | OFF |
10| super_read_only | OFF |
11| transaction_read_only | OFF |
12| tx_read_only | OFF |
13+-----------------------+-------+
145 rows in set (0.00 sec)
15
16mysql> set global super_read_only=on; # 此时read_only super_read_only均开启。
17Query OK, 0 rows affected (0.00 sec)
18
19mysql> show global variables like "%read_only%";
20+-----------------------+-------+
21| Variable_name | Value |
22+-----------------------+-------+
23| innodb_read_only | OFF |
24| read_only | ON | # 此时read_only super_read_only均开启。
25| super_read_only | ON |
26| transaction_read_only | OFF |
27| tx_read_only | OFF |
28+-----------------------+-------+
295 rows in set (0.00 sec)
30
31
在mysql 5.7中,只对innodb引擎的库表启作用,在8.0版本中也对MysqlIAM起作用。属于更底层的限制。
https://dev.mysql.com/doc/refman/5.7/en/innodb-read-only-instance.html
启动server 在read-only模式。对于分布在数据库应用或者数据设置为只读介质。
也可以用于数据仓库共享相同的数据目录在多个实例之间。
作用层面:事务
参数 tx_read_only 或者 transaction_read_only 用于设置事务的访问模式,可设置为 OFF/ON,默认值为 OFF,表示事务可读,可写,设置为 ON 表示事务只读,不可写。
transaction_read_only 参数在 5.7.20 版本引入,tx_read_only 参数在 8.0.3 版本被移除,这两个参数意义完全一样,只是名称不同,transaction_read_only 名称更加规范,在高版本 MySQL 中,建议使用 transaction_read_only。
该参数可以在全局范围内设置,也可以在 session 级设置,在全局范围内设置该参数后,对于已有的连接并不会生效,因为已有连接的 session 级参数仍然保持原样,因此需要杀掉已有连接,让应用重新建立连接,以便使该参数对所有连接生效。
xxxxxxxxxx
21SET GLOBAL transaction_read_only = ON;
2SET SESSION transaction_read_only = ON;
如果设置 transaction_read_only 为 ON,此时向表中写入数据,会产生报错,如下: ERROR 1792 (25006): Cannot execute statement in a READ ONLY transaction.
xxxxxxxxxx
31read_only = on
2super_read_only = on
3tx_read_only = on
xxxxxxxxxx
251flush tables with read lock;
2set global read_only = on;
3set global super_read_only = on;
4set global tx_read_only = on;
5show global variables like "%read_only%";
6
7
8# 全锁效果
9mysql> show global variables like "%read_only%";
10+-----------------------+-------+
11| Variable_name | Value |
12+-----------------------+-------+
13| innodb_read_only | OFF |
14| read_only | ON |
15| super_read_only | ON |
16| transaction_read_only | ON |
17| tx_read_only | ON |
18+-----------------------+-------+
195 rows in set (0.00 sec)
20# 执行写入时会有jdbc报错。
21The MySQL server is running with the --super-read-only option so it cannot execute this statement
22# 表明--super-read-only在起作用
23
24
25
xxxxxxxxxx
171unlock tables;
2set global read_only = off;
3set global super_read_only = off;;
4set global tx_read_only = off;
5
6
7mysql> show global variables like "%read_only%";
8+-----------------------+-------+
9| Variable_name | Value |
10+-----------------------+-------+
11| innodb_read_only | OFF |
12| read_only | ON |
13| super_read_only | OFF |
14| transaction_read_only | OFF |
15| tx_read_only | OFF |
16+-----------------------+-------+
175 rows in set (0.00 sec)
xxxxxxxxxx
71Stop slave;
2Set @@SESSION.GTID_NEXT='564637ab-deab-11ea-8ae1-fa163e1b04fd:79';
3Begin;Commit;Set @@SESSION.GTID_NEXT = AUTOMATIC;
4Start slave;
5show slave status \G;
6
7# 处理时建议锁库,锁表后,备份重做主从。
报错
xxxxxxxxxx
11Last_Error: Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 1 failed executing transaction 'aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319' at master log bin.000004, end_log_pos 490372649. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
如下
xxxxxxxxxx
691
2mysql> start slave;
3Query OK, 0 rows affected (0.01 sec)
4
5mysql> show slave status \G;
6*************************** 1. row ***************************
7 Slave_IO_State: Waiting for master to send event
8 Master_Host: 172.16.10.45
9 Master_User: rpl
10 Master_Port: 3306
11 Connect_Retry: 60
12 Master_Log_File: bin.000006
13 Read_Master_Log_Pos: 347165
14 Relay_Log_File: relay.000023
15 Relay_Log_Pos: 3408
16 Relay_Master_Log_File: bin.000004
17 Slave_IO_Running: Yes
18 Slave_SQL_Running: No
19 Replicate_Do_DB:
20 Replicate_Ignore_DB:
21 Replicate_Do_Table:
22 Replicate_Ignore_Table:
23 Replicate_Wild_Do_Table:
24 Replicate_Wild_Ignore_Table:
25 Last_Errno: 1032
26 Last_Error: Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 1 failed executing transaction 'aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319' at master log bin.000004, end_log_pos 490372649. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
27 Skip_Counter: 0
28 Exec_Master_Log_Pos: 490366914
29 Relay_Log_Space: 842019902
30 Until_Condition: None
31 Until_Log_File:
32 Until_Log_Pos: 0
33 Master_SSL_Allowed: No
34 Master_SSL_CA_File:
35 Master_SSL_CA_Path:
36 Master_SSL_Cert:
37 Master_SSL_Cipher:
38 Master_SSL_Key:
39 Seconds_Behind_Master: NULL
40Master_SSL_Verify_Server_Cert: No
41 Last_IO_Errno: 0
42 Last_IO_Error:
43 Last_SQL_Errno: 1032
44 Last_SQL_Error: Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 1 failed executing transaction 'aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319' at master log bin.000004, end_log_pos 490372649. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
45 Replicate_Ignore_Server_Ids:
46 Master_Server_Id: 45
47 Master_UUID: aa3f944d-bc78-11ea-b893-0cda411dd9fe
48 Master_Info_File: mysql.slave_master_info
49 SQL_Delay: 0
50 SQL_Remaining_Delay: NULL
51 Slave_SQL_Running_State:
52 Master_Retry_Count: 86400
53 Master_Bind:
54 Last_IO_Error_Timestamp:
55 Last_SQL_Error_Timestamp: 200804 12:44:25
56 Master_SSL_Crl:
57 Master_SSL_Crlpath:
58 Retrieved_Gtid_Set: aa3f944d-bc78-11ea-b893-0cda411dd9fe:4055597:4055815:4056313-4933344
59 Executed_Gtid_Set: aa3f944d-bc78-11ea-b893-0cda411dd9fe:1-4056318,
60c1efedd3-bc78-11ea-bfd7-0cda411d8317:1-3
61 Auto_Position: 1
62 Replicate_Rewrite_DB:
63 Channel_Name:
64 Master_TLS_Version:
651 row in set (0.00 sec)
66
67ERROR:
68No query specified
69
#按提示定位到表:edc_data_3.qrtz_scheduler_state,从库表中数据与主库不一致:
xxxxxxxxxx
421
2mysql> select * from performance_schema.replication_applier_status_by_worker\G;
3*************************** 1. row ***************************
4CHANNEL_NAME:
5WORKER_ID: 1
6THREAD_ID: NULL
7SERVICE_STATE: OFF
8LAST_SEEN_TRANSACTION: aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319
9LAST_ERROR_NUMBER: 1032
10LAST_ERROR_MESSAGE: Worker 1 failed executing transaction 'aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319' at master log bin.000004, end_log_pos 490372649; Could not execute Update_rows event on table edc_data_3.0.qrtz_scheduler_state; Can't find record in 'qrtz_scheduler_state', Error_code: 1032; handler error HA_ERR_KEY_NOT_FOUND; the event's master log bin.000004, end_log_pos 490372649
11LAST_ERROR_TIMESTAMP: 2020-08-04 12:44:25
12*************************** 2. row ***************************
13CHANNEL_NAME:
14WORKER_ID: 2
15THREAD_ID: NULL
16SERVICE_STATE: OFF
17LAST_SEEN_TRANSACTION: aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056315
18LAST_ERROR_NUMBER: 0
19LAST_ERROR_MESSAGE:
20LAST_ERROR_TIMESTAMP: 0000-00-00 00:00:00
21*************************** 3. row ***************************
22CHANNEL_NAME:
23WORKER_ID: 3
24THREAD_ID: NULL
25SERVICE_STATE: OFF
26LAST_SEEN_TRANSACTION: aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056316
27LAST_ERROR_NUMBER: 0
28LAST_ERROR_MESSAGE:
29LAST_ERROR_TIMESTAMP: 0000-00-00 00:00:00
30*************************** 4. row ***************************
31CHANNEL_NAME:
32WORKER_ID: 4
33THREAD_ID: NULL
34SERVICE_STATE: OFF
35LAST_SEEN_TRANSACTION:
36LAST_ERROR_NUMBER: 0
37LAST_ERROR_MESSAGE:
38LAST_ERROR_TIMESTAMP: 0000-00-00 00:00:00
394 rows in set (0.00 sec)
40
41ERROR:
42No query specified
清空从库表,或找到不同的地方:直接修改从库或将从库表清空,将主库备份的表还原或插入到从库表中。
但此断电影响的不同步多由于数据未即时写入binlog,出现时也一般会有多个位置未未同步,如果执行几个之后,还是不断有此种新的问题出现,可以直接将主库备份,还原到从库,重建主从关系。
备份表的命令:
注意:不加-d时导出sql,加-d时导出表结构。然后将其中的表insert语句复制出来即可。
x
1命令行下具体用法如下: mysqldump -u用戶名 -p密码 -d 數據库名 表名 脚本名;
2
31、导出數據库為dbname的表结构(其中用戶名為root,密码為dbpasswd,生成的脚本名為db.sql)
4mysqldump -uroot -pdbpasswd -d dbname >db.sql;
5
62、导出數據库為dbname某张表(test)结构
7mysqldump -uroot -pdbpasswd -d dbname test>db.sql;
8
93、导出數據库為dbname所有表结构及表數據(不加-d)
10mysqldump -uroot -pdbpasswd dbname >db.sql;
11
124、导出數據库為dbname某张表(test)结构及表數據(不加-d)
13mysqldump -uroot -pdbpasswd dbname test>db.sql;
x1mysql> truncate qrtz_scheduler_state;
2Query OK, 0 rows affected (0.01 sec)
3
4mysql> INSERT INTO `qrtz_scheduler_state` VALUES ('edc','2a23bb5f-896f-49d7-8f5a-f3297ba890a11595818572314',1596343879614,7500);
5Query OK, 1 row affected (0.00 sec)
6
7mysql> Set @@SESSION.GTID_NEXT='aa3f944d-bc78-11ea-b893-0cda411dd9fe:4056319';
8Query OK, 0 rows affected (0.00 sec)
9
10mysql> Begin;commit;Set @@SESSION.GTID_NEXT = AUTOMATIC; start slave ;show slave status \G; #多行命令
11Query OK, 0 rows affected (0.00 sec)
12
13Query OK, 0 rows affected (0.00 sec)
14
15Query OK, 0 rows affected (0.00 sec)
16
17Query OK, 0 rows affected (0.00 sec)
18
19*************************** 1. row ***************************
20Slave_IO_State: Waiting for master to send event
21Master_Host: 172.16.10.45
22Master_User: rpl
23Master_Port: 3306
24Connect_Retry: 60
25Master_Log_File: bin.000006
26Read_Master_Log_Pos: 366587
27Relay_Log_File: relay.000023
28Relay_Log_Pos: 3408
29Relay_Master_Log_File: bin.000004
30Slave_IO_Running: Yes
31Slave_SQL_Running: Yes
32Replicate_Do_DB:
33...
34...
35Master_SSL_Crlpath:
36Retrieved_Gtid_Set: aa3f944d-bc78-11ea-b893-0cda411dd9fe:4055597:4055815:4056313-4933371
37Executed_Gtid_Set: aa3f944d-bc78-11ea-b893-0cda411dd9fe:1-4056319,
38c1efedd3-bc78-11ea-bfd7-0cda411d8317:1-5
39Auto_Position: 1
40Replicate_Rewrite_DB:
41Channel_Name:
42Master_TLS_Version:
431 row in set (0.00 sec)
44
45ERROR:
46No query specified
47
48mysql> show slave status \G; #前面的多行命令可能结果不是实时的,需要再次执行本命令。
比如是恢复的数据库,还是提示不一致时,可以直接跳过
解决mysql开启GTID主从同步出现1236错误问题
https://blog.51cto.com/hnr520/1883282
xxxxxxxxxx
211SET GTID_NEXT='15b785b6-ba17-11ea-8f0a-00163e50d19a:1';
2BEGIN; COMMIT;
3SET GTID_NEXT="AUTOMATIC";
4START SLAVE;
5START SLAVE;
6show slave status \G;
7START SLAVE;
8show slave status \G;
9
10STOP SLAVE;
11SET GTID_NEXT='15b785b6-ba17-11ea-8f0a-00163e50d19a:20';
12BEGIN; COMMIT;
13SET GTID_NEXT="AUTOMATIC";
14START SLAVE;
15show slave status\G;
16
17STOP SLAVE;
18SET GLOBAL sql_slave_skip_counter =1;
19START SLAVE;
20
21
xxxxxxxxxx
2581
2
3=================从库
4
5mysql> show slave status \G;
6*************************** 1. row ***************************
7 Slave_IO_State:
8 Master_Host: 100.64.88.102
9 Master_User: rpl
10 Master_Port: 3306
11 Connect_Retry: 60
12 Master_Log_File: bin.000013
13 Read_Master_Log_Pos: 210
14 Relay_Log_File: relay.000015
15 Relay_Log_Pos: 4
16 Relay_Master_Log_File: bin.000013
17 Slave_IO_Running: No
18 Slave_SQL_Running: Yes
19 Replicate_Do_DB:
20 Replicate_Ignore_DB:
21 Replicate_Do_Table:
22 Replicate_Ignore_Table:
23 Replicate_Wild_Do_Table:
24 Replicate_Wild_Ignore_Table:
25 Last_Errno: 0
26 Last_Error:
27 Skip_Counter: 0
28 Exec_Master_Log_Pos: 210
29 Relay_Log_Space: 930
30 Until_Condition: None
31 Until_Log_File:
32 Until_Log_Pos: 0
33 Master_SSL_Allowed: No
34 Master_SSL_CA_File:
35 Master_SSL_CA_Path:
36 Master_SSL_Cert:
37 Master_SSL_Cipher:
38 Master_SSL_Key:
39 Seconds_Behind_Master: NULL
40Master_SSL_Verify_Server_Cert: No
41 Last_IO_Errno: 1236
42 Last_IO_Error: Got fatal error 1236 from master when reading data from binary log: 'The slave is connecting using CHANGE MASTER TO MASTER_AUTO_POSITION = 1, but the master has purged binary logs containing GTIDs that the slave requires.'
43 Last_SQL_Errno: 0
44 Last_SQL_Error:
45 Replicate_Ignore_Server_Ids:
46 Master_Server_Id: 101
47 Master_UUID: 704664e0-2a40-11ea-bccc-0050562a8640
48 Master_Info_File: mysql.slave_master_info
49 SQL_Delay: 0
50 SQL_Remaining_Delay: NULL
51 Slave_SQL_Running_State: Slave has read all relay log; waiting for more updates
52 Master_Retry_Count: 86400
53 Master_Bind:
54 Last_IO_Error_Timestamp: 200629 09:08:36
55 Last_SQL_Error_Timestamp:
56 Master_SSL_Crl:
57 Master_SSL_Crlpath:
58 Retrieved_Gtid_Set:
59 Executed_Gtid_Set: 704664e0-2a40-11ea-bccc-0050562a8640:1
60 Auto_Position: 1
61 Replicate_Rewrite_DB:
62 Channel_Name:
63 Master_TLS_Version:
641 row in set (0.00 sec)
65
66ERROR:
67No query specified
68
69mysql> show global variables like '%gtid%'\G
70*************************** 1. row ***************************
71Variable_name: binlog_gtid_simple_recovery
72 Value: ON
73*************************** 2. row ***************************
74Variable_name: enforce_gtid_consistency
75 Value: ON
76*************************** 3. row ***************************
77Variable_name: gtid_executed
78 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1
79*************************** 4. row ***************************
80Variable_name: gtid_executed_compression_period
81 Value: 1000
82*************************** 5. row ***************************
83Variable_name: gtid_mode
84 Value: ON
85*************************** 6. row ***************************
86Variable_name: gtid_owned
87 Value:
88*************************** 7. row ***************************
89Variable_name: gtid_purged
90 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1
91*************************** 8. row ***************************
92Variable_name: session_track_gtids
93 Value: OFF
948 rows in set (0.00 sec)
95
96mysql> stop slave;
97Query OK, 0 rows affected (0.00 sec)
98
99mysql> reset slave
100 -> ;
101Query OK, 0 rows affected (0.00 sec)
102
103mysql> reset master;
104Query OK, 0 rows affected (0.01 sec)
105
106mysql> set @@global.gtid_purged='704664e0-2a40-11ea-bccc-0050562a8640:1-925660';
107Query OK, 0 rows affected (0.00 sec)
108
109mysql> show global variables like '%gtid%'\G
110*************************** 1. row ***************************
111Variable_name: binlog_gtid_simple_recovery
112 Value: ON
113*************************** 2. row ***************************
114Variable_name: enforce_gtid_consistency
115 Value: ON
116*************************** 3. row ***************************
117Variable_name: gtid_executed
118 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1-925660
119*************************** 4. row ***************************
120Variable_name: gtid_executed_compression_period
121 Value: 1000
122*************************** 5. row ***************************
123Variable_name: gtid_mode
124 Value: ON
125*************************** 6. row ***************************
126Variable_name: gtid_owned
127 Value:
128*************************** 7. row ***************************
129Variable_name: gtid_purged
130 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1-925660
131*************************** 8. row ***************************
132Variable_name: session_track_gtids
133 Value: OFF
1348 rows in set (0.00 sec)
135
136mysql> change master to master_host='100.64.88.102', master_user='rpl', master_password='Mysql@57rpl', master_port=3306, master_auto_position=1;
137Query OK, 0 rows affected, 2 warnings (0.01 sec)
138
139mysql> start slave;
140Query OK, 0 rows affected (0.00 sec)
141
142mysql> show slave status \G;
143*************************** 1. row ***************************
144 Slave_IO_State: Waiting for master to send event
145 Master_Host: 100.64.88.102
146 Master_User: rpl
147 Master_Port: 3306
148 Connect_Retry: 60
149 Master_Log_File: bin.000031
150 Read_Master_Log_Pos: 194
151 Relay_Log_File: relay.000002
152 Relay_Log_Pos: 355
153 Relay_Master_Log_File: bin.000030
154 Slave_IO_Running: Yes
155 Slave_SQL_Running: Yes
156 Replicate_Do_DB:
157 Replicate_Ignore_DB:
158 Replicate_Do_Table:
159 Replicate_Ignore_Table:
160 Replicate_Wild_Do_Table:
161 Replicate_Wild_Ignore_Table:
162 Last_Errno: 0
163 Last_Error:
164 Skip_Counter: 0
165 Exec_Master_Log_Pos: 154
166 Relay_Log_Space: 1027
167 Until_Condition: None
168 Until_Log_File:
169 Until_Log_Pos: 0
170 Master_SSL_Allowed: No
171 Master_SSL_CA_File:
172 Master_SSL_CA_Path:
173 Master_SSL_Cert:
174 Master_SSL_Cipher:
175 Master_SSL_Key:
176 Seconds_Behind_Master: 10838317
177Master_SSL_Verify_Server_Cert: No
178 Last_IO_Errno: 0
179 Last_IO_Error:
180 Last_SQL_Errno: 0
181 Last_SQL_Error:
182 Replicate_Ignore_Server_Ids:
183 Master_Server_Id: 101
184 Master_UUID: 704664e0-2a40-11ea-bccc-0050562a8640
185 Master_Info_File: mysql.slave_master_info
186 SQL_Delay: 0
187 SQL_Remaining_Delay: NULL
188 Slave_SQL_Running_State: Waiting for slave workers to process their queues
189 Master_Retry_Count: 86400
190 Master_Bind:
191 Last_IO_Error_Timestamp:
192 Last_SQL_Error_Timestamp:
193 Master_SSL_Crl:
194 Master_SSL_Crlpath:
195 Retrieved_Gtid_Set: 704664e0-2a40-11ea-bccc-0050562a8640:925661
196 Executed_Gtid_Set: 704664e0-2a40-11ea-bccc-0050562a8640:1-925660
197 Auto_Position: 1
198 Replicate_Rewrite_DB:
199 Channel_Name:
200 Master_TLS_Version:
2011 row in set (0.00 sec)
202
203ERROR:
204No query specified
205
206mysql>
207
208
209
210
211=================
212
213
214mysql> stop slave;
215Query OK, 0 rows affected (0.00 sec)
216
217mysql> set @@global.gtid_purged='704664e0-2a40-11ea-bccc-0050562a8640:1-925660';
218ERROR 1840 (HY000): @@GLOBAL.GTID_PURGED can only be set when @@GLOBAL.GTID_EXECUTED is empty.
219mysql> show global variables like '%gtid%'\G
220*************************** 1. row ***************************
221Variable_name: binlog_gtid_simple_recovery
222 Value: ON
223*************************** 2. row ***************************
224Variable_name: enforce_gtid_consistency
225 Value: ON
226*************************** 3. row ***************************
227Variable_name: gtid_executed
228 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1-342817
229*************************** 4. row ***************************
230Variable_name: gtid_executed_compression_period
231 Value: 1000
232*************************** 5. row ***************************
233Variable_name: gtid_mode
234 Value: ON
235*************************** 6. row ***************************
236Variable_name: gtid_owned
237 Value:
238*************************** 7. row ***************************
239Variable_name: gtid_purged
240 Value: 704664e0-2a40-11ea-bccc-0050562a8640:1:12
241*************************** 8. row ***************************
242Variable_name: session_track_gtids
243 Value: OFF
2448 rows in set (0.00 sec)
245
246mysql> reset slave;
247Query OK, 0 rows affected (0.00 sec)
248
249mysql> reset master
250 -> ;
251Query OK, 0 rows affected (0.01 sec)
252
253mysql> set @@global.gtid_purged='704664e0-2a40-11ea-bccc-0050562a8640:1-925660';
254Query OK, 0 rows affected (0.00 sec)
255
256mysql> start slave;
257Query OK, 0 rows affected (0.01 sec)
258
x
1pt-table-checksum --user=root --password=yourmysqlpassword --host=10.10.101.54 --databases=anna --replicate=cnail.checksums --create-replicate-table --no-check-binlog-format
2pt-table-checksum --user=root --password=yourmysqlpassword --host=10.10.101.54 --replicate=cnail.checksums --create-replicate-table --no-check-binlog-format
3pt-table-checksum --user=root --password=Mysql_57 --host=10.10.101.54 --replicate=cnail.checksums --create-replicate-table --no-check-binlog-format
4
5pt-table-sync --charset=utf8 --ignore-databases=mysql,sys,percona dsn=u=root,p=yourmysqlpassword,h=10.10.101.53,P=37389 dsn=u=root,p=yourmysqlpassword,h=10.10.101.54,P=37389 --execute --print
6
7pt-table-sync --charset=utf8mb4 --ignore-databases=mysql,sys,percona dsn=u=root,p=yourmysqlpassword,h=10.10.101.53,P=37389 dsn=u=root,p=yourmysqlpassword,h=10.10.101.54,P=37389 --execute --print
8
master_ip_failover
xxxxxxxxxx
21my $ssh_start_vip = "/sbin/ifconfig eth0:$key $vip";
2my $ssh_stop_vip = "/sbin/ifconfig eth0:$key down";
masterha/app1.conf
x
1[server default]
2# 这两个参数需要根据不同的集群进行修改
3manager_workdir=/var/log/masterha/app1
4manager_log=/var/log/masterha/app1/manager.log
5# 按照master服务器存放binlog的实际路径进行修改,主要为了让MHA拉取binlog
6master_binlog_dir=/data/mysql_data/
7# 设置自动failover的脚本
8master_ip_failover_script= /usr/local/bin/master_ip_failover --interface=eth0 --key=88 --vip=10.10.101.55/24 --gateway=10.10.101.254
9# 设置手动切换时候的脚本 (供(masterha_master_switch使用)
10master_ip_online_change_script=/usr/local/bin/master_ip_failover --interface=eth0 --key=88 --vip=10.10.101.55/24 --gateway=10.10.101.254
11log_level=debug
12# 监控的用户
13user=root
14# 监控用户的密码
15password=yourmysqlpassword
16# 监控主库的时间间隔,默认是3秒,尝试三次没有回应的时候自动进行railover
17ping_interval=3
18#ping_interval=999999
19# 检测方式是insert,MHA-0.56开始支持insert
20# 会在Master中生成一个 infra 数据库
21ping_type=SELECT
22# 设置远端mysql在发生切换时binlog的保存位置
23remote_workdir=/tmp
24# 复制用的密码
25repl_password=yourrepomysqlpassword
26# 复制的用户
27repl_user=rpl
28# 告警脚本,可自行修改,这里没有使用
29#report_script=/usr/local/send_report
30# 通过从机进行二次探测的脚本, IP地址按照实际的情况进行修改
31secondary_check_script=/usr/local/bin/masterha_secondary_check -s slave1 --user=root --master_host=master --master_port=37389 --port=22
32# 设置故障发生后关闭故障主机的脚本(主要作用是关闭主机防止发生脑裂,这里没有使用,类似Fence功能)
33#shutdown_script="/usr/local/bin/power_manager --command=stopssh2 --host=test-1 --ssh_user=root"
34# 定义ssh的用户
35ssh_user=root
36
37[server1]
38# 这个hostname也可以配置成IP地址,同 ip 参数一样
39# 如果这里写名字,需要DNS配合,或者使用 /etc/hosts
40hostname=mhamaster
41ip=10.10.101.53
42port=37389
43ssh_port=22
44# candidate_master参数的意思为:设置为候选Master,如果发生主从切换,该主机会被提升为Master,即使这个服务器上的数据不是最新的(会用relay-log补全)
45candidate_master=1
46[server2]
47hostname=mhaslave
48ip=10.10.101.54
49port=37389
50ssh_port=22
51candidate_master=1
52# check_repl_delay参数的意思为:默认情况下如果一个slave落后master 100M的relay logs的话,MHA将不会选择该slave作为一个新的master;
53# 因为对于这个slave的恢复需要花费很长时间;
54# 通过设置check_repl_delay=0,MHA触发切换在选择一个新的master的时候将会忽略复制延时;
55# 这个参数对于设置了candidate_master=1的主机非常有用,因为这个候选主在切换的过程中一定是新的master
56check_repl_delay=0
57
58[binlog1]
59no_master=1
60hostname=mhamaster
61master_binlog_dir=/data/mysql_data/
62
xxxxxxxxxx
131mysql> show master logs;
2+------------+------------+
3| Log_name | File_size |
4+------------+------------+
5| bin.000003 | 1073744384 |
6| bin.000004 | 1073744375 |
7| bin.000005 | 258289828 |
8| bin.000006 | 57864 |
9+------------+------------+
104 rows in set (0.00 sec)
11
12mysql> show slave status \G;
13Empty set (0.00 sec)
主从同步问题处理