#!/usr/bin/env python3
#
# HBase Prometheus Exporter
#
# Björn Busse <bj.rn@baerlin.eu>
#
#
# TODO:
#
# * Remove timestamp from log msg or make them optional,
# we already have it in the journal -
# at least when not running in a container
#
# * Add hdfs/hbase binaries to container
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import configargparse
from bs4 import BeautifulSoup
from collections import defaultdict
from flatten_json import flatten
import datetime as dt
import io
import json
from kazoo.client import KazooClient, KazooState
import logging
import os
from prometheus_client import start_http_server, Summary
from prometheus_client.core import GaugeMetricFamily, REGISTRY
from prometheus_client import Gauge
import pybase
import random
import re
import requests
from struct import unpack
import subprocess
from subprocess import Popen,PIPE
import sys
import time
import traceback
import xml.etree.ElementTree as et
sys.path.append('/usr/local/lib/hbase-protobuf-python')
sys.path.append('/usr/local/lib/hbase-protobuf-python/server')
sys.path.append('/usr/local/lib/hbase-protobuf-python/server/zookeeper')
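# These directories are expected to contain the Python modules generated
# from the HBase protobuf definitions (used by the ZooKeeper_pb2 import below)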
from ZooKeeper_pb2 import Master as hbMaster
tmp_path = '/tmp/'
logpath = tmp_path
# ZooKeeper
zk_reconnect_interval_s = 30
# Prom vars
REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request')
prom_hdfs_total = Gauge('hdfs_bytes_total', 'HDFS total bytes')
prom_hdfs_used = Gauge('hdfs_bytes_used', 'HDFS used bytes')
prom_hdfs_remaining = Gauge('hdfs_bytes_remaining', 'HDFS remaining bytes')
prom_hdfs_num_datanodes_live = Gauge('hdfs_datanodes_live', 'HDFS Live DataNodes')
prom_hdfs_num_datanodes_dead = Gauge('hdfs_datanodes_dead', 'HDFS Dead DataNodes')
prom_hbase_up = Gauge('hbase_up', 'HBase is up and running, a master is elected')
prom_hbase_healthy = Gauge('hbase_healthy', 'HBase is up and running, a master is elected, no inconsistencies are detected, hbase is queryable')
prom_hbase_num_regionservers_live = Gauge('hbase_regionservers_live', 'HBase Live Regionservers')
prom_hbase_num_regionservers_dead = Gauge('hbase_regionservers_dead', 'HBase Dead Regionservers')
prom_hbase_num_clusterrequests = Gauge('hbase_clusterrequests', 'HBase Clusterrequests')
prom_hbase_num_regions_in_transition_stale = Gauge('number_of_regions_in_transition_stale', 'Number of stale regions in transition')
prom_hbase_num_inconsistencies = Gauge('number_of_inconsistencies', 'Number of inconsistencies in HBase')
prom_hbase_readable = Gauge('hbase_is_readable', 'HBase is readable')
prom_hbase_writeable = Gauge('hbase_is_writeable', 'HBase is writeable')
prom_zookeeper_num = Gauge('zookeeper_num', 'Known ZooKeeper Servers')
prom_zookeeper_num_live = Gauge('zookeeper_num_live', 'Live ZooKeeper Servers')
prom_zookeeper_has_leader = Gauge('zookeeper_has_leader', 'ZooKeeper cluster has a leader')
prom_zookeeper_num_connections = Gauge('zookeeper_num_connections', 'ZooKeeper connection count for all ZooKeeper servers combined')
# HDFS/HBase
hdfs_config_file = "/etc/hadoop/conf/hdfs-site.xml"
cmd_hbase_active_master = ['/usr/hdp/current/hbase-client/bin/hbase-jruby', '/usr/hdp/current/hbase-client/bin/get-active-master.rb']
cmd_hbase = 'hbase'
cmd_hbase_test = 'tests/hbase/bin/hbase'
cmd_hdfs_namenodes = ['hdfs', 'getconf', '-namenodes']
# Use command line arguments to set the following vars
# Do not change them here (See TODO)
namenodes = ""
jmx_use_tls = False
hbase_master_ui_default_port = 16010
hdfs_namenode_default_port = 50070
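# Note: 50070 is the NameNode HTTP port of Hadoop 2.x; Hadoop 3.x uses 9870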
cluster_is_kerberized = False
hbase_hbck_remove_lockfile = True
class zk():
zk_client = ""
@classmethod
def main(self, address_list, use_tls, timeout=5):
addresses = ','.join(address_list)
zk_client = KazooClient(addresses, use_ssl=use_tls, read_only=True)
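        # Kazoo takes the quorum as a single comma-separated string,
        # e.g. "zk1:2181,zk2:2181"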
try:
zk_client.start(timeout)
except Exception as e:
logging.debug("ZooKeeper Error: " + str(e))
return False
self.zk_client = zk_client
self.zk_client.add_listener(self.listener)
return True
@classmethod
def znode_data(self, znode):
data = ""
try:
self.zk_client.exists(znode)
except Exception as e:
logging.info("ZooKeeper: znode does not exist: " + znode)
return False
try:
data = self.zk_client.get(znode)
except:
logging.info("ZooKeeper: Could not get znode data from " + znode)
return False
return data
def listener(state):
if state == KazooState.LOST:
logging.debug("ZooKeeper: Connection lost")
# Register somewhere that the session was lost
elif state == KazooState.SUSPENDED:
logging.debug("ZooKeeper: Connection suspended")
# Handle being disconnected from Zookeeper
else:
logging.debug("ZooKeeper: Connection re-established")
# Handle being connected/reconnected to Zookeeper
def active_servers(address_list):
zk_has_leader = 0
zk_leader_address = ""
num_active_servers = 0
num_zk_connections = 0
re_mode = re.compile(r'^Mode:\s*(.+?)\s*$')
re_connections = re.compile(r'^Connections:\s*(.+?)\s*$')
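        # ZooKeeper's four-letter 'stat' command replies with lines such as
        #   Mode: leader
        #   Connections: 12
        # which the two regexes above pick apart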
for address in address_list:
logging.info("Probing zookeeper at address: " + address)
cmd = 'echo stat | nc ' + address + ' 2181'
p = Popen(['/bin/sh', '-c', cmd], stdout=PIPE, stderr=PIPE, close_fds=False)
output, error = p.communicate()
output = output.splitlines()
error = error.splitlines()
for line in output:
match = re_mode.match(line.decode('utf-8'))
if match:
mode = match.group(1)
logging.info("zk: server %s: %s", address, mode)
num_active_servers += 1
                if mode in ("leader", "standalone"):
                    zk_has_leader = 1
zk_leader_address = address
match = re_connections.match(line.decode('utf-8'))
if match:
nconns = match.group(1)
logging.info("zk: server has %s connections", nconns)
num_zk_connections += int(nconns)
for line in error:
logging.info(line.decode('utf-8'))
prom_zookeeper_num_connections.set(num_zk_connections)
prom_zookeeper_num_live.set(num_active_servers)
logging.info("zk: %d active ZooKeeper servers", num_active_servers)
        # Set the gauge after probing all servers so it also drops back
        # to 0 when no leader was found
        prom_zookeeper_has_leader.set(zk_has_leader)

        if zk_has_leader:
logging.info("zk: Zookeeper has leader: True")
logging.info("zk: leader: %s", zk_leader_address)
else:
logging.info("zk: Zookeeper has leader: False")
class jmx_query():
def __init__(self, relay_complete_jmx):
self.relay_complete_jmx = relay_complete_jmx
self.prom_jmx_keys = []
self.prom_jmx = {}
def main(self, hdfs_namenode_hosts):
hdfs_active_namenode = self.active_namenode(hdfs_namenode_hosts)
hbase_active_master = hbase_exporter.zk_active_master()
hdfs_jmx = True
hbase_jmx = True
if not hdfs_active_namenode:
logging.info("Failed to determine active HDFS namenode")
hdfs_jmx = False
if not hbase_active_master:
logging.info("Failed to determine active HBase master")
            hbase_jmx = False

        if hdfs_jmx:
url = self.get_url('hdfs', hdfs_active_namenode)
logging.info("hdfs: Fetching jmx data")
self.jmx_data(url)
if hbase_jmx:
url = self.get_url('hbase', hbase_active_master)
logging.info("hbase: Fetching jmx data")
self.jmx_data(url)
def get_url(self, service, hostname):
        if jmx_use_tls:
url_scheme = "https://"
else:
url_scheme = "http://"
if service == 'hdfs':
url = url_scheme + hostname + ":" + str(hdfs_namenode_default_port) + "/jmx"
elif service == 'hbase':
url = url_scheme + hostname + ":" + str(hbase_master_ui_default_port) + "/jmx"
return url
def jmx_data(self, url):
jmx = self.query(url)
        if jmx is False:
logging.info("Could not read jmx data from: " + url)
return False
for k, v in jmx.items():
            if v is not None:
self.lookup_keys(k, v)
return True
def active_namenode(self, hdfs_namenode_hosts):
if not which(cmd_hdfs_namenodes[0]):
logging.info("Could not find hdfs executable in PATH")
return False
try:
r = subprocess.run(cmd_hdfs_namenodes, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as e:
logging.debug("Type error: " + str(e))
return False
hosts = r.stdout.decode('utf-8').split(" ")
tree = et.parse(hdfs_config_file)
root = tree.getroot()
has_ha_element = False
active_namenode = None
for property in root:
if "dfs.ha.namenodes" in property.find("name").text:
has_ha_element = True
nameservice_id = property.find("name").text[len("dfs.ha.namenodes")+1:]
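                # e.g. a property name "dfs.ha.namenodes.mycluster" yields the
                # (hypothetical) nameservice_id "mycluster"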
namenodes = property.find("value").text.split(",")
for node in namenodes:
# Get namenode address and check if it is the active node
for n in root:
prefix = "dfs.namenode.rpc-address." + nameservice_id + "."
element_text = n.find("name").text
if prefix in element_text:
node_address = n.find("value").text.split(":")[0]
# Needs to either run with root privileges or as hdfs user
cmd = ['hdfs haadmin -getServiceState ' + node]
r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if len(r.stderr.decode("utf-8")) > 0:
logging.debug(r.stderr.decode("utf-8"))
if "active" in r.stdout.decode("utf-8").lower():
logging.info("hdfs: Active namenode: " + node_address + " (" + node + ")")
return node_address
if has_ha_element:
logging.info("Hadoop High-Availability: True")
else:
logging.info("Hadoop High-Availability: False")
return False
def query(self, url):
try:
r = requests.get(url)
except Exception as e:
logging.info("Could not connect to: " + url)
return False
jmx = json.loads(r.text)
jmx = flatten(jmx)
        return jmx
def lookup_keys(self, key, value):
denylist = ["Name", "name", "Type", "Object",
"ObjectName", "Valid", "tag.Context", "tag.Hostname"]
if key.endswith("capacityUsed"):
prom_hdfs_used.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("capacityTotal"):
prom_hdfs_total.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("capacityRemaining"):
prom_hdfs_remaining.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("NumLiveDataNodes"):
prom_hdfs_num_datanodes_live.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("NumDeadDataNodes"):
prom_hdfs_num_datanodes_dead.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("numRegionServers"):
prom_hbase_num_regionservers_live.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("numDeadRegionServers"):
prom_hbase_num_regionservers_dead.set(value)
logging.debug("Found jmx key: " + key)
elif key.endswith("clusterRequests"):
prom_hbase_num_clusterrequests.set(value)
logging.debug("Found jmx key: " + key)
else:
if not self.relay_complete_jmx:
return
            # Flattened keys look like "beans_<n>_<attribute>"; the third
            # component is the JMX attribute name we filter against
            jmx_key = key.split("_", 2)
            if len(jmx_key) > 2 and jmx_key[2] not in denylist:
                jmx_key = "jmx_" + key
                jmx_key = jmx_key.replace(".", "_")
                jmx_key = jmx_key.replace("-", "_")
                logging.debug("Found jmx key: " + jmx_key)
                # Only numeric values can be exported as gauge values
                if not isinstance(value, (str, list)):
                    prom_jmx_key = "prom_" + jmx_key
                    # Check if key is already registered
                    if prom_jmx_key not in self.prom_jmx_keys:
                        self.prom_jmx_keys.append(prom_jmx_key)
                        self.prom_jmx[prom_jmx_key] = Gauge(prom_jmx_key, prom_jmx_key)
                    # Set the Prometheus value
                    self.prom_jmx[prom_jmx_key].set(value)
class hbase_exporter():
def __init__(self):
self.hbase_read_success = 0
self.hbase_write_success = 0

    def main(self, zk_server, hbase_master_hosts, run_hbck):
hbase_active_master = self.zk_active_master()
if not hbase_active_master:
logging.info("hbase: Failed to determine active HBase master")
prom_hbase_up.set(0)
prom_hbase_healthy.set(0)
return False
prom_hbase_up.set(1)
self.stale_regions_in_transition(hbase_active_master)
msg = 'hbase: {0} stale regions in transition '\
.format(self.num_regions_in_transition_stale)
logging.info(msg)
prom_hbase_num_regions_in_transition_stale.set(self.num_regions_in_transition_stale)
if run_hbck:
self.hbck_inconsistencies()
logging.info("hbase-hbck: Number of inconsistencies: %d", self.num_inconsistencies)
prom_hbase_num_inconsistencies.set(self.num_inconsistencies)
self.hbase_read_write_test(zk_server)
if self.hbase_read_success:
logging.info("hbase: Read test succeeded")
prom_hbase_readable.set(1)
else:
logging.info("hbase: Read test failed!")
prom_hbase_readable.set(0)
if self.hbase_write_success:
logging.info("hbase: Write test succeeded")
prom_hbase_writeable.set(1)
else:
logging.info("hbase: Write test failed!")
prom_hbase_writeable.set(0)
hbase_health = self.check_health(run_hbck)
prom_hbase_healthy.set(hbase_health)
def check_health(self, run_hbck):
# Only check for inconsistencies if we actually ran hbck
if run_hbck and self.num_inconsistencies > 0:
return False
if self.num_regions_in_transition_stale > 0:
return False
        # A failed write test means HBase is not healthy
        if self.hbase_write_success == 0:
            return False
return True

    # The preferred method to get the active HBase master:
    # look directly into ZooKeeper
@staticmethod
def zk_active_master():
msg = zk.znode_data(znode_hbase + "/master")
if not msg:
logging.info("ZooKeeper: Failed to get HBase master")
return False
else:
msg = msg[0]
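            # The znode payload starts with a one-byte magic marker followed by
            # a 4-byte big-endian metadata length; the protobuf message begins
            # after the metadata and a 4-byte "PBUF" marker, hence the
            # meta_length + 9 offset below (1 + 4 + 4 = 9)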
first_byte, meta_length = unpack(">cI", msg[:5])
msg = msg[meta_length + 9:]
master = hbMaster()
master.ParseFromString(msg)
return master.master.host_name

    # An alternative way to get the HBase Master
    # without directly looking into ZooKeeper
@staticmethod
def jruby_active_master():
if not which(cmd_hbase_active_master[0]):
logging.info("Could not find hdfs executable in PATH")
return False
try:
r = subprocess.run(cmd_hbase_active_master, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception:
return False
if 'Master not running' in r.stdout.decode('utf-8'):
return False
active_master = r.stdout.decode('utf-8').rstrip()
return active_master
def stale_regions_in_transition(self, hbase_master):
host = hbase_master.rstrip("\n\r")
port = hbase_master_ui_default_port
url = 'http://%(host)s:%(port)s/master-status' % locals()
logging.debug('GET %s', url)
try:
req = requests.get(url)
except requests.exceptions.RequestException as e:
logging.debug(e)
logging.debug('Failed to request ' + url)
return False
logging.debug("Response: %s %s", req.status_code, req.reason)
if req.status_code != 200:
            logging.debug('Got an HTTP status code != 200')
num_regions_in_transition_stale = self.hbaseui_parse_output(req.content)
if num_regions_in_transition_stale is None:
logging.debug('hbase-ui: Parse error - failed to find number of stale regions in transition')
if not isinstance(num_regions_in_transition_stale, int):
logging.debug('hbase-ui: Parse error - got non-integer for stale regions in transition')
self.num_regions_in_transition_stale = num_regions_in_transition_stale
def hbaseui_parse_output(self, content):
soup = BeautifulSoup(content, 'html.parser')
num_regions_in_transition_stale = 0
try:
headings = soup.findAll('h2')
for heading in headings:
# The section only exists if there are stale regions in transition
if heading.get_text() == "Regions in Transition":
logging.info('hbase-ui: Found Regions in Transition section header')
logging.info('hbase-ui: Looking for table')
table = heading.find_next('table')
num_regions_in_transition_stale = self.hbaseui_parse_table(table)
if not isinstance(num_regions_in_transition_stale, int):
logging.info('hbase-ui: Got non-integer \'{0}\' for stale regions in transition when parsing HBase Master UI'\
.format(num_regions_in_transition_stale))
except (AttributeError, TypeError):
logging.info('hbase-ui: Failed to parse HBase Master UI status page')
return -1
return num_regions_in_transition_stale
def hbck_inconsistencies(self):
re_status = re.compile(r'^Status:\s*(.+?)\s*$')
re_duplicate = re.compile(r'(.*)ERROR\s\[main\]\sutil\.HBaseFsck\:\sAnother\sinstance\sof\shbck\sis\srunning(.*)$')
re_inconsistencies = re.compile(r'^\s*(\d+)\s+inconsistencies\s+detected\.?\s*$')
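        # hbck's summary output ends with lines like
        #   0 inconsistencies detected.
        #   Status: OK
        # which the regexes above match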
self.num_inconsistencies = None
hbck_status = None
logging.info("hbase: Running hbck consistency check with cmd: " + cmd_hbase)
p = Popen([cmd_hbase, 'hbck'], stdout=PIPE, stderr=PIPE, close_fds=False)
output, error = p.communicate()
output = output.splitlines()
error = error.splitlines()
for line in output:
match = re_inconsistencies.match(line.decode('utf-8'))
if match:
self.num_inconsistencies = match.group(1)
logging.info('hbase-hbck: Number of inconsistencies: %s', self.num_inconsistencies)
continue
match = re_status.match(line.decode('utf-8'))
if match:
hbck_status = match.group(1)
logging.info('hbase-hbck: hbck status = %s', hbck_status)
break
for line in error:
match = re_duplicate.match(line.decode('utf-8'))
if match:
hbck_status = match.group(0)
logging.info('hbase-hbck: hbck status = %s', hbck_status)
                hdfs_lock_uri = re.findall(r'hdfs://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', hbck_status)
                for uri in hdfs_lock_uri:
                    logging.info('hbase-hbck: Locked by lockfile: {0}'.format(uri))
if hbase_hbck_remove_lockfile:
hdfs_remove_file(uri)
else:
logging.info('hbase-hbck: Please remove lockfile manually if no hbck is running')
break
if hbck_status is None:
logging.info('hbase-hbck: Failed to find hbck status result')
if self.num_inconsistencies is None:
logging.info('hbase-hbck: Failed to find number of inconsistencies')
self.num_inconsistencies = -1
        if self.num_inconsistencies is not None:
self.num_inconsistencies = int(self.num_inconsistencies)
if not isinstance(self.num_inconsistencies, int):
logging.info('hbase-hbck: Non-integer detected for the number of inconsistencies')
self.num_inconsistencies = -1
return False
if p.returncode != 0:
logging.info("hbase-hbck: Failed to run hbck (%d)" % (p.returncode))
self.num_inconsistencies = -1
return False
@staticmethod
def hbaseui_parse_table(table):
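        # The Master UI renders a row "Regions in Transition for more than
        # <threshold> milliseconds"; the neighbouring cell holds the stale count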
for row in table.findChildren('tr'):
for col in row.findChildren('td'):
if 'Regions in Transition for more than ' in col.get_text():
next_sibling = col.findNext('td')
                    num_regions_in_transition_stale = next_sibling.get_text().strip()
                    if num_regions_in_transition_stale.isdigit():
                        return int(num_regions_in_transition_stale)
                    return num_regions_in_transition_stale
return None
def result_to_dict(self, rsp):
ds = defaultdict(dict)
for cell in rsp.flatten_cells():
ds[cell.family][cell.qualifier] = cell.value
return ds
def hbase_read_write_test(self, zk_server):
key = "0x42devoops".encode('utf-8')
pybase_client = pybase.NewClient(zk_server)
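        # pybase discovers the cluster via the ZooKeeper quorum passed to NewClient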
cf = "t".encode('utf-8')
values = {
cf: {
"ops".encode('utf-8'): "devoops".encode('utf-8'),
}
}
# Read access
try:
rsp = pybase_client.get(hbase_table, key)
self.hbase_read_success = 1
rspd = self.result_to_dict(rsp)
            logging.debug('hbase: Read:')
            for k, v in rspd.items():
                logging.debug('key: %s', k)
                logging.debug('value: %s', v)
        except Exception:
self.hbase_read_success = 0
# Write access
        try:
            rsp = pybase_client.put(hbase_table, key, values)
            self.hbase_write_success = 1
        except Exception:
            self.hbase_write_success = 0
# Delete what we wrote
logging.info("Deleting at " + key.decode('utf-8'))
try:
pybase_client.delete(hbase_table, key, values)
except Exception as e:
logging.error('Failed to delete: %s', str(e))
self.hbase_write_success = 0
            return
def hdfs_remove_file(hdfs_uri):
p = Popen(['hadoop', 'fs', '-rm', hdfs_uri], stdout=PIPE, stderr=PIPE, close_fds=False)
output, error = p.communicate()
output = output.splitlines()
error = error.splitlines()
    for line in output:
        logging.info("hdfs-rm: %s", line.decode('utf-8'))
    for line in error:
        logging.info("hdfs-rm: %s", line.decode('utf-8'))
return error
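# Minimal PATH lookup; shutil.which() from the standard library would do the same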
def which(program):
def is_executable(fn):
return os.path.isfile(fn) and os.access(fn, os.X_OK)
filepath, fname = os.path.split(program)
if filepath:
if is_executable(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
exec_file = os.path.join(path, program)
if is_executable(exec_file):
return exec_file
return None
if __name__ == '__main__':
hbase_master_default_address = 'localhost:' + str(hbase_master_ui_default_port)
hdfs_namenode_default_address = 'localhost:' + str(hdfs_namenode_default_port)
    parser = configargparse.ArgParser(description="")
parser.add_argument('--hbase-master', dest='hbase_master', env_var='HBASE_MASTER', action='append', help="HBase master address, can be specified multiple times", type=str, default=hbase_master_default_address)
    parser.add_argument('--hbase-pseudo-distributed', dest='hbase_pseudo_distributed', env_var='HBASE_PSEUDO_DISTRIBUTED', help="Indicates whether HBase runs in pseudo-distributed mode", type=bool, default=False)
parser.add_argument('--hbase-table', dest='hbase_table', env_var='HBASE_TABLE', help="The HBase table for the write test", type=str, required=True)
parser.add_argument('--hdfs-namenode', dest='hdfs_namenode', env_var='HDFS_NAMENODE', action='append', help="HDFS namenode address, can be specified multiple times", type=str, default=hdfs_namenode_default_address)
parser.add_argument('--zookeeper-server-address', dest='zk_server', env_var='ZK_SERVER', action='append', help="ZooKeeper server address, can be specified multiple times", type=str, required=True)
parser.add_argument('--zookeeper-use-tls', dest='zk_use_tls', env_var='ZK_USE_TLS', help="Use TLS when connecting to ZooKeeper", type=bool, default=False)
parser.add_argument('--exporter-port', dest='prom_http_port', env_var='PROM_HTTP_PORT', help="Listen port for Prometheus export", type=int, default=9010)
parser.add_argument('--export-refresh-rate', dest='prom_export_interval_s', env_var='PROM_EXPORT_INTERVAL_S', help="Time between metrics are gathered in seconds", type=int, default=60)
parser.add_argument('--hbck-refresh-rate', dest='hbase_hbck_interval_s', env_var='HBASE_HBCK_INTERVAL_S', help="Minimum time between two consecutive hbck runs in seconds", type=int, default=600)
parser.add_argument('--relay-jmx', dest='relay_jmx', env_var='RELAY_JMX', help="Relay complete JMX data", type=bool, default=False)
parser.add_argument('--logfile', dest='logfile', env_var='LOGFILE', help="Path to optional logfile", type=str)
parser.add_argument('--loglevel', dest='loglevel', env_var='LOGLEVEL', help="Loglevel, default: INFO", type=str, default='INFO')
args = parser.parse_args()
prom_http_port = args.prom_http_port
logfile = args.logfile
loglevel = args.loglevel
zk_server = args.zk_server
zk_use_tls = args.zk_use_tls
hbase_master = args.hbase_master
hdfs_namenodes = args.hdfs_namenode
relay_complete_jmx = args.relay_jmx
prom_export_interval_s = args.prom_export_interval_s
hbase_hbck_interval_s = args.hbase_hbck_interval_s
hbase_pseudo_distributed = args.hbase_pseudo_distributed
hbase_table = args.hbase_table
    del args
nzk_server = len(zk_server)
prom_zookeeper_num.set(nzk_server)
# Optional File Logging
if logfile:
tlog = logfile.rsplit('/', 1)
logpath = tlog[0]
logfile = tlog[1]
if not os.access(logpath, os.W_OK):
# Our logger is not set up yet, so we use print here
print("Logging: Can not write to directory. Skippking filelogging handler")
else:
fn = logpath + '/' + logfile
file_handler = logging.FileHandler(filename=fn)
# Our logger is not set up yet, so we use print here
print("Logging: Logging to " + fn)
stdout_handler = logging.StreamHandler(sys.stdout)
if 'file_handler' in locals():
handlers = [file_handler, stdout_handler]
else:
handlers = [stdout_handler]
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
handlers=handlers
)
logger = logging.getLogger(__name__)
level = logging.getLevelName(loglevel)
logger.setLevel(level)
    # Fall back to the hbase binary from the test suite if hbase is not on PATH
if not which('hbase'):
logging.error("hbase: Could not find HBase executable in PATH")
if not os.access(cmd_hbase_test, os.X_OK):
logging.error("hbase: Could not find HBase executable from test suite (" + cmd_hbase_test + ")")
else:
logging.info("hbase: Using HBase executable from test suite")
cmd_hbase = cmd_hbase_test
# Start the Prometheus server
try:
start_http_server(prom_http_port)
except Exception as e:
logging.debug("Failed to start Prometheus webserver: " + str(e))
logging.info("There might be another instance of " + sys.argv[0] + \
" already running, can not bind to " + str(prom_http_port) + ", exiting..")
sys.exit()
nruns = 0
# Start a ZooKeeper client
r = False
nzk = 0
# Try to connect to one of the known servers
    while not r:
        r = zk.main(zk_server, zk_use_tls)
        if not r:
            time.sleep(zk_reconnect_interval_s)
if cluster_is_kerberized:
znode_hbase = "/hbase"
elif hbase_pseudo_distributed:
znode_hbase = "/hbase"
else:
znode_hbase = "/hbase-unsecure"
clusterid = zk.znode_data(znode_hbase + "/hbaseid")
if not clusterid:
logging.info("ZooKeeper: Could not read clusterid")
else:
logging.info("ZooKeeper: Clusterid: " + str(clusterid[0]))
jmx = jmx_query(relay_complete_jmx)
while True:
nruns += 1
run_hbck = False
# Set the initial hbck timer
if nruns == 1:
hbase_hbck_timer_s = dt.datetime.now()
run_hbck = True
hbase_active_master = hbase_exporter.zk_active_master()
logging.debug("hbase: Active master: {0}".format(hbase_active_master))
zk.active_servers(zk_server)
jmx.main(hdfs_namenodes)
hbase_hbck_time_s = int((dt.datetime.now() - hbase_hbck_timer_s).total_seconds())
logging.debug("hbase-hbck: Timer: {0} seconds".format(hbase_hbck_time_s))
        # Do an hbck on the first run and then whenever the time elapsed
        # between two consecutive runs exceeds the configured interval
if hbase_hbck_interval_s < hbase_hbck_time_s or run_hbck:
run_hbck = True
# Set a new hbck timer
hbase_hbck_timer_s = dt.datetime.now()
else:
hbck_t_next_s = hbase_hbck_interval_s - hbase_hbck_time_s
if hbck_t_next_s < prom_export_interval_s:
                # The minimum wait time is our export refresh rate,
                # i.e. how long we sleep between two runs
hbck_t_next_s = prom_export_interval_s
logging.info("hbase-hbck: Skipping. hbck is only run every {0} seconds. Next run in {1} seconds"
.format(hbase_hbck_interval_s, hbck_t_next_s))
hbase_exporter().main(zk_server, hbase_master, run_hbck)
if nruns == 1:
logging.info("Started HBase exporter")
logging.info("Sleeping for {0} seconds ".format(prom_export_interval_s))
time.sleep(prom_export_interval_s)