Heartbeat/corosync+pacemaker+ldirectord 实现Nginx负载均衡
系统:CentOS 5.4
IP分配:
HA1 eth0:192.168.0.66 eth1:192.168.10.1 HA2 eth0:192.168.0.69 eth1:192.168.10.2 VIP 192.168.0.120
1. 安装pacemaker和heartbeat
[root@HA1 ~]# wget -O /etc/yum.repos.d/pacemaker.repo http://clusterlabs.org/rpm/epel-5/clusterlabs.repo
[root@HA1 ~]# wget ftp://ftp.pbone.net/mirror/centos.karan.org/el5/extras/testing/i386/RPMS/libesmtp-1.0.4-6.el5.kb.i386.rpm
[root@HA1 ~]# rpm -ivh libesmtp-1.0.4-6.el5.kb.i386.rpm
[root@HA1 ~]# yum install -y pacemaker heartbeat
2. 安装ldirectord
[root@HA1 ~]# yum install -y ldirectord
3. 配置
3.1 配置Heartbeat
[root@HA1 ~]# cp /usr/share/doc/heartbeat-3.0.1/{ha.cf,authkeys} /etc/ha.d/
[root@HA1 ~]# cat /etc/ha.d/authkeys
auth 1
1 crc
[root@HA1 ~]# cat /etc/ha.d/ha.cf |grep -v "#"
logfile /var/log/ha-log
logfacility local0
keepalive 2
deadtime 30
warntime 10
initdead 120
udpport 695
ucast eth1 192.168.10.2 # 注意此处HA2改为:ucast eth1 192.168.10.1
auto_failback on
watchdog /dev/watchdog
node HA1
node HA2
ping 192.168.0.1
respawn hacluster /usr/lib/heartbeat/ipfail
apiauth ipfail gid=haclient uid=hacluster
crm on
3.2 用corosync替换heartbeat(可选)
corosync是基于OpenAIS构建的集群引擎,可替代heartbeat进行心跳检测。
The Corosync Cluster Engine is an open source project Licensed under the BSD License derived from the OpenAIS project. OpenAIS uses a UDP multicast based communication protocol to periodically check for node availability.
[root@HA1 ~]# yum install -y corosync
[root@HA1 ~]# corosync-keygen
Corosync Cluster Engine Authentication key generator.
Gathering 1024 bits for key from /dev/random.
Press keys on your keyboard to generate entropy.
Writing corosync key to /etc/corosync/authkey.
[root@HA1 ~]# scp /etc/corosync/authkey HA2:/etc/corosync/
[root@HA1 ~]# cp /etc/corosync/corosync.conf.example /etc/corosync/corosync.conf
[root@HA1 ~]# vi !$
# Please read the corosync.conf.5 manual page
compatibility: whitetank
# Totem section: protocol version, authentication switch and the network
# interface used for cluster traffic.
totem {
version: 2
# NOTE(review): with secauth off, the authkey created by corosync-keygen is
# presumably not used to authenticate cluster traffic - confirm.
secauth: off
threads: 0
interface {
ringnumber: 0
# Network of the eth1 addresses in the IP plan (192.168.10.1 / 192.168.10.2).
bindnetaddr: 192.168.10.0
mcastaddr: 226.94.1.1
mcastport: 5405
}
}
# Logging goes to stderr, to syslog and to /var/log/corosync.log, with
# timestamps on and debug output off.
logging {
fileline: off
to_stderr: yes
to_logfile: yes
to_syslog: yes
logfile: /var/log/corosync.log
debug: off
timestamp: on
logger_subsys {
subsys: AMF
debug: off
}
}
# AMF is switched off.
amf {
mode: disabled
}
service {
# Load the Pacemaker Cluster Resource Manager
ver: 0
name: pacemaker
use_mgmtd: yes
}
[root@HA1 ~]# scp /etc/corosync/corosync.conf HA2:/etc/corosync/corosync.conf
[root@HA1 ~]# service corosync start
Starting Corosync Cluster Engine (corosync): [ OK ]
[root@HA1 ~]# chkconfig --level 2345 corosync on
[root@HA1 ~]# chkconfig --level 2345 heartbeat off
在HA2上执行:
[root@HA2 ~]# chown root:root /etc/corosync/authkey
[root@HA2 ~]# chmod 400 /etc/corosync/authkey
[root@HA2 ~]# service corosync start
Starting Corosync Cluster Engine (corosync): [ OK ]
[root@HA2 ~]# chkconfig --level 2345 corosync on
[root@HA2 ~]# chkconfig --level 2345 heartbeat off
3.3 安装配置ldirectord
[root@HA1 ~]# cat /etc/ha.d/ldirectord.cf
checktimeout=5
checkinterval=7
autoreload=yes
logfile="/var/log/ldirectord.log"
quiescent=yes
emailalert=shidl@baihe.com
# A server with a page at the main root of the site that displays "Nginx"
virtual=192.168.0.120:80
        real=192.168.0.66:80 gate
        real=192.168.0.69:80 gate
        service=http
        request="/lb.html" # 在根目录下编写lb.html,内容为live
        receive="live"
        scheduler=wlc
        protocol=tcp
        checktype=negotiate
3.4 配置hosts
[root@HA1 ~]# cat /etc/hosts
# Do not remove the following line, or various programs
# that require network functionality will fail.
127.0.0.1 vpc localhost.localdomain localhost
::1 localhost6.localdomain6 localhost6
192.168.10.1 HA1
192.168.10.2 HA2
3.5 配置lo:0设备
[root@HA1 ~]# cat >>/etc/sysconfig/network-scripts/ifcfg-lo:0<<EOF
DEVICE=lo:0
IPADDR=192.168.0.120
NETMASK="255.255.255.255"
ONBOOT=yes
NAME=loopback
EOF
3.6 启用转发,禁用arp
[root@HA1 ~]# vi /etc/sysctl.conf
修改net.ipv4.ip_forward = 0为net.ipv4.ip_forward = 1
添加下面行:
net.ipv4.conf.all.arp_ignore = 1
net.ipv4.conf.eth0.arp_ignore = 1
net.ipv4.conf.all.arp_announce = 2
net.ipv4.conf.eth0.arp_announce = 2
[root@HA1 ~]# sysctl -p
# 将配置拷贝到HA2
[root@HA1 ~]# scp /etc/ha.d/{ha.cf,authkeys,ldirectord.cf} HA2:/etc/ha.d/
[root@HA1 ~]# scp /etc/{hosts,sysctl.conf} HA2:/etc/
[root@HA1 ~]# scp /etc/sysconfig/network-scripts/ifcfg-lo:0 HA2:/etc/sysconfig/network-scripts/
在HA2上修改/etc/ha.d/ha.cf
将ucast eth1 192.168.10.2 改为:ucast eth1 192.168.10.1
并使sysctl.conf配置生效:
[root@HA2 ~]# sysctl -p
3.7 在HA1和HA2上配置并安装好nginx
编写nginx lsb资源代理脚本(注意nginx安装路径):
[root@HA1 ~]# cat /etc/init.d/nginxd
#!/bin/bash
#
# nginxd - init script that starts, stops, reloads and reports on nginx.
#
# bash is named explicitly because the script uses bash-only syntax such as
# $"..." strings.

# Source function library.
. /etc/rc.d/init.d/functions

# Source networking configuration.
. /etc/sysconfig/network

# Check that networking is up. The expansion is quoted so the test stays
# well-formed when NETWORKING is unset or empty.
[ "${NETWORKING}" = "no" ] && exit 0

# Exit status reported by the script.
RETVAL=0
prog="nginx"

# Paths of the nginx installation; adjust them to the local install prefix.
nginxDir=/usr/local/nginx
nginxd=$nginxDir/sbin/nginx
nginxConf=$nginxDir/conf/nginx.conf
# NOTE(review): presumably this has to match the "pid" directive in
# nginx.conf - confirm.
nginxPid=$nginxDir/nginx.pid
# Abort a start request when nginx is already running.
#
# Globals:  nginxPid (read) - path of the nginx pid file
#           prog     (read) - name printed in the message
# Outputs:  "<prog> already running..." on stdout when a live process is found
# Returns:  0 when nginx is not running (a stale pid file is removed first).
#           Does not return when nginx is running: the script exits with 0,
#           because LSB treats "start" on a running service as success.
nginx_check()
{
    local pid

    [ -e "$nginxPid" ] || return 0

    pid=$(cat "$nginxPid" 2>/dev/null)
    # Only a positive decimal number is a usable pid; anything else is junk.
    case "$pid" in
        '' | 0 | *[!0-9]*) pid= ;;
    esac

    # Probe the recorded pid itself. Grepping "ps aux" for "nginx" also
    # matches this init script (nginxd) and any unrelated command line.
    if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
        echo "$prog already running..."
        exit 0
    fi

    # The pid file does not point at a live process: remove it.
    rm -f -- "$nginxPid"
    return 0
}
# Start nginx unless it is already running.
#
# Globals:  prog, nginxd, nginxConf (read); RETVAL (written)
# Outputs:  "Starting <prog>:" followed by whatever daemon prints
# Returns:  the exit status of daemon; 0 without starting anything when
#           nginx_check returns non-zero
start()
{
    # nginx_check ends the script itself when nginx is already running.
    nginx_check || return 0

    echo -n $"Starting $prog:"
    # Quoted so an install path containing whitespace stays one argument.
    daemon "$nginxd" -c "$nginxConf"
    RETVAL=$?
    echo
    # Record the successful start in the subsys lock file; stop removes it.
    if [ "$RETVAL" -eq 0 ]; then
        touch /var/lock/subsys/nginx
    fi
    return "$RETVAL"
}
# Stop nginx and, on success, remove its lock file and pid file.
#
# Globals:  prog, nginxd, nginxPid (read); RETVAL (written)
# Outputs:  "Stopping <prog>:" followed by whatever killproc prints
# Returns:  the exit status of killproc
stop()
{
    echo -n $"Stopping $prog:"
    # Quoted so an install path containing whitespace stays one argument.
    killproc "$nginxd"
    RETVAL=$?
    echo
    if [ "$RETVAL" -eq 0 ]; then
        rm -f -- /var/lock/subsys/nginx "$nginxPid"
    fi
    return "$RETVAL"
}
# Ask nginx to reload by sending it HUP through killproc.
#
# Globals:  prog, nginxd (read); RETVAL (written)
# Outputs:  "Reloading <prog>:" followed by whatever killproc prints
# Returns:  the exit status of killproc
reload()
{
    echo -n $"Reloading $prog:"
    # Quoted so an install path containing whitespace stays one argument.
    killproc "$nginxd" -HUP
    RETVAL=$?
    echo
    return "$RETVAL"
}
# Run the action named by the first argument, then exit with RETVAL.
case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        start
        ;;
    reload)
        reload
        ;;
    status)
        status "$prog"
        RETVAL=$?
        ;;
    *)
        echo $"Usage: $0 {start|stop|restart|reload|status}"
        # LSB init-script convention: 2 means invalid or excess argument(s).
        RETVAL=2
        ;;
esac

exit "$RETVAL"
[root@HA1 ~]# chmod +x /etc/init.d/nginxd
[root@HA1 ~]# scp /etc/init.d/nginxd HA2:/etc/init.d/nginxd
[root@HA1 ~]# service network restart
[root@HA1 ~]# service heartbeat start
[root@HA2 ~]# service network restart
[root@HA2 ~]# service heartbeat start
4. 配置集群资源:
Heartbeat和其他应用提供的ocf代理脚本或许会有错误,我们可以用下面方法排错:
要检查 OCF 脚本,请首先设置所需的环境变量。例如,当测试IPaddr OCF 脚本时,您必须通过设置一个变量名称前缀为 OCF_RESKEY_的环境变量来设置变量 ip 的值。对于此示例,可运行以下命令:
export OCF_RESKEY_ip=<IP地址>
/usr/lib/ocf/resource.d/heartbeat/IPaddr validate-all
/usr/lib/ocf/resource.d/heartbeat/IPaddr start
/usr/lib/ocf/resource.d/heartbeat/IPaddr stop
如果此操作不成功,很可能是您缺少某个必需变量或者只是输错了参数。
排错ldirectord ocf代理脚本:
export OCF_ROOT=/usr/lib/ocf
根据自己的环境设置修改下面两行:
[root@HA1 ~]# vi /usr/lib/ocf/resource.d/heartbeat/ldirectord
LDIRCONF=${OCF_RESKEY_configfile:-/etc/ha.d/ldirectord.cf}
LDIRECTORD=${OCF_RESKEY_ldirectord:-/usr/sbin/ldirectord}
[root@HA1 ~]# /usr/lib/ocf/resource.d/heartbeat/ldirectord monitor
[root@HA1 ~]# echo $?
7 # ldirectord未运行返回7,运行正常返回0
[root@HA1 ~]# crm
crm(live)# configure
crm(live)configure# node HA1
crm(live)configure# node HA2
crm(live)configure# primitive ldirectord ocf:heartbeat:ldirectord \
> params configfile="/etc/ha.d/ldirectord.cf" \
> op monitor interval="30s" timeout="20s" \
> meta migration-threshold="10" target-role="Started"
crm(live)configure# primitive vip ocf:heartbeat:IPaddr2 \
> params lvs_support="true" ip="192.168.0.120" cidr_netmask="24" broadcast="192.168.0.255" \
> op monitor interval="1m" timeout="20s" \
> meta migration-threshold="10"
crm(live)configure# primitive nginxd lsb:nginxd \
> op monitor interval="30s" timeout="20s" \
> meta migration-threshold="10" target-role="Started"
crm(live)configure# group load-balancing vip ldirectord
crm(live)configure# clone cl-nginxd nginxd
crm(live)configure# location prefer-ha1 load-balancing \
> rule $id="prefer-ha1-rule" 100: #uname eq HA1
crm(live)configure# property stonith-enabled="false" \
> no-quorum-policy="ignore" \
> start-failure-is-fatal="false" \
> expected-quorum-votes="2"
crm(live)configure# verify
crm(live)configure# commit
crm(live)configure# end
crm(live)# status
============
Last updated: Thu Nov 12 01:00:13 2009
Stack: Heartbeat
Current DC: HA2 – partition with quorum
Version: 1.0.6-f709c638237cdff7556cb6ab615f32826c0f8c06
2 Nodes configured, 2 expected votes
2 Resources configured.
============
Online: [ HA2 HA1 ]
Clone Set: cl-nginxd
Started: [ HA2 HA1 ]
Resource Group: load-balancing
vip (ocf::heartbeat:IPaddr2): Started HA1
ldirectord (ocf::heartbeat:ldirectord): Started HA1
crm(live)# quit
bye
5. 验证
[root@HA1 ~]# ipvsadm -l
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
-> RemoteAddress:Port Forward Weight ActiveConn InActConn
TCP 192.168.0.120:http wlc
-> 192.168.0.69:http Route 1 0 0
-> 192.168.0.66:http Local 1 0 0
用浏览器访问网站看是否正常。
禁用HA1的eth1网卡,在HA2上看故障转移情况。
[root@HA2 ~]# crm
crm(live)# status
============
Last updated: Thu Nov 12 18:40:54 2009
Stack: Heartbeat
Current DC: HA2 – partition WITHOUT quorum
Version: 1.0.6-f709c638237cdff7556cb6ab615f32826c0f8c06
2 Nodes configured, 2 expected votes
2 Resources configured.
============
Online: [ HA2 ]
OFFLINE: [ HA1 ]
Clone Set: cl-nginxd
Started: [ HA2 ]
Stopped: [ nginxd:0 ]
Resource Group: load-balancing
vip (ocf::heartbeat:IPaddr2): Started HA2
ldirectord (ocf::heartbeat:ldirectord): Started HA2
启用HA1的eth1网卡,在HA1上看故障转移情况。
[root@HA1 ~]# crm status
============
Last updated: Thu Nov 12 18:42:55 2009
Stack: Heartbeat
Current DC: HA1 – partition with quorum
Version: 1.0.6-f709c638237cdff7556cb6ab615f32826c0f8c06
2 Nodes configured, 2 expected votes
2 Resources configured.
============
Online: [ HA2 HA1 ]
Clone Set: cl-nginxd
Started: [ HA1 HA2 ]
Resource Group: load-balancing
vip (ocf::heartbeat:IPaddr2): Started HA1
ldirectord (ocf::heartbeat:ldirectord): Started HA1
6. 参考: