zookeeper: Add health check
This commit is contained in:
parent
58b34f86c4
commit
9c1b309164
|
@ -66,7 +66,7 @@ prom_hbase_num_clusterrequests = Gauge('hbase_clusterrequests', 'HBase Clusterre
|
||||||
prom_hbase_regions_in_transition_stale = Gauge('regions_in_transition_stale', 'Number of stale regions in transition')
|
prom_hbase_regions_in_transition_stale = Gauge('regions_in_transition_stale', 'Number of stale regions in transition')
|
||||||
prom_zookeeper_num = Gauge('zookeeper_num', 'Known ZooKeeper Servers')
|
prom_zookeeper_num = Gauge('zookeeper_num', 'Known ZooKeeper Servers')
|
||||||
prom_zookeeper_num_live = Gauge('zookeeper_num_live', 'Live ZooKeeper Servers')
|
prom_zookeeper_num_live = Gauge('zookeeper_num_live', 'Live ZooKeeper Servers')
|
||||||
prom_zookeeper_num_dead = Gauge('zookeeper_num_dead', 'Dead ZooKeeper Servers')
|
# Gauge written by zk.active_servers() when a probed server reports "Mode: leader".
prom_zookeeper_has_leader = Gauge('zookeeper_has_leader', 'ZooKeeper cluster has a leader')
|
||||||
|
|
||||||
# HDFS/HBase
|
# HDFS/HBase
|
||||||
hdfs_config_file = "/etc/hadoop/conf/hdfs-site.xml"
|
hdfs_config_file = "/etc/hadoop/conf/hdfs-site.xml"
|
||||||
|
@ -138,6 +138,42 @@ class zk():
|
||||||
logging.debug("ZooKeeper: Connection re-established")
|
logging.debug("ZooKeeper: Connection re-established")
|
||||||
# Handle being connected/reconnected to Zookeeper
|
# Handle being connected/reconnected to Zookeeper
|
||||||
|
|
||||||
|
@staticmethod
def active_servers(address_list, port=2181):
    """Probe ZooKeeper servers with the ``stat`` command and export the result.

    Each address is contacted through ``nc``; a server counts as active when
    its reply contains a ``Mode: <mode>`` line.  The number of active servers
    is written to ``prom_zookeeper_num_live`` and ``prom_zookeeper_has_leader``
    is set to 1 when any server reports mode ``leader``, otherwise to 0.

    Declared as a static method because the function takes no ``self`` and is
    invoked as ``zk.active_servers(...)``.

    Args:
        address_list: Iterable of ZooKeeper host names or IP addresses.
        port: ZooKeeper client port to probe (defaults to 2181).
    """
    # The original initialised ``zk_has_leader`` but read ``has_leader``,
    # which raised UnboundLocalError whenever no leader was found.
    has_leader = 0
    zk_leader_address = ""
    num_active_servers = 0
    re_mode = re.compile(r'^Mode:\s*(.+?)\s*$')

    for address in address_list:
        try:
            # Pass an argument list and feed "stat" on stdin rather than
            # building a shell pipeline, so the address is never interpreted
            # by /bin/sh.
            p = Popen(['nc', address, str(port)],
                      stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=False)
            output, error = p.communicate(b'stat\n')
        except OSError as e:
            # Best effort: a missing or failing nc binary must not abort the
            # whole check, the server is simply not counted as active.
            logging.info("zk: server %s: failed to run nc: %s", address, e)
            continue

        for line in output.splitlines():
            match = re_mode.match(line.decode('utf-8', errors='replace'))
            if not match:
                continue

            mode = match.group(1)
            logging.info("zk: server %s: %s", address, mode)
            num_active_servers += 1

            if mode == "leader":
                has_leader = 1
                zk_leader_address = address

            # One Mode line per server is enough; never count a server twice.
            break

        for line in error.splitlines():
            # Decode so the log shows text instead of a bytes repr (b'...').
            logging.info(line.decode('utf-8', errors='replace'))

    # Always publish the leader state, including the "no leader" case.
    prom_zookeeper_has_leader.set(has_leader)
    prom_zookeeper_num_live.set(num_active_servers)
    logging.info("zk: %d active ZooKeeper servers", num_active_servers)

    if has_leader:
        logging.info("zk: Zookeeper has leader: True")
        logging.info("zk: leader: %s", zk_leader_address)
    else:
        logging.info("zk: Zookeeper has leader: False")
|
||||||
|
|
||||||
|
|
||||||
class jmx_query():
|
class jmx_query():
|
||||||
|
|
||||||
|
@ -609,6 +645,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
m = zk.get_znode_data(znode_hbase + "/master")
|
m = zk.get_znode_data(znode_hbase + "/master")
|
||||||
|
zk.active_servers(zk_server)
|
||||||
|
|
||||||
if not m:
|
if not m:
|
||||||
logging.info("ZooKeeper: Failed to get HBase master")
|
logging.info("ZooKeeper: Failed to get HBase master")
|
||||||
|
@ -619,7 +656,6 @@ if __name__ == '__main__':
|
||||||
jmx.main(hdfs_namenodes)
|
jmx.main(hdfs_namenodes)
|
||||||
hbase_exporter().main(hbase_master)
|
hbase_exporter().main(hbase_master)
|
||||||
#prom_zookeeper_num_live.set(nzookeeper_live)
|
#prom_zookeeper_num_live.set(nzookeeper_live)
|
||||||
#prom_zookeeper_num_dead.set(nzk_server - nzookeeper_live)
|
|
||||||
|
|
||||||
nruns += 1
|
nruns += 1
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue