#!/usr/bin/env python3
#
# HBase Prometheus Exporter
#
# Björn Busse <bj.rn@baerlin.eu>
#
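# Scrapes JMX metrics from the active HDFS NameNode and the active HBase
# Master, checks the HBase Master UI for stale regions in transition and
# (optionally) 'hbase hbck' for inconsistencies, and exposes the results
# as Prometheus gauges over HTTP.
#
# Example invocation (script name and host lists are placeholders):
#   ./hbase-exporter.py --hbase-master-hosts=master01,master02 \
#                       --hdfs-namenode-hosts=namenode01,namenode02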

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
from bs4 import BeautifulSoup
from flatten_json import flatten
import json
import logging
import os
from prometheus_client import start_http_server, Summary
from prometheus_client import Gauge
import re
import requests
import subprocess
import sys
import time
import xml.etree.ElementTree as et

logfile = ''
tmp_path = '/tmp/'
log_path = tmp_path

# Prometheus
prom_http_port = 9010
prom_scrape_interval_s = 10

# Prom vars
REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request')
prom_hbase_num_regions_in_transition_stale = Gauge('number_of_regions_in_transition_stale', 'Number of stale regions in transition')
prom_hbase_num_inconsistencies = Gauge('number_of_inconsistencies', 'Number of inconsistencies in HBase')
prom_hdfs_total = Gauge('hdfs_bytes_total', 'HDFS total bytes')
prom_hdfs_used = Gauge('hdfs_bytes_used', 'HDFS used bytes')
prom_hdfs_remaining = Gauge('hdfs_bytes_remaining', 'HDFS remaining bytes')
prom_hdfs_num_datanodes_live = Gauge('hdfs_datanodes_live', 'HDFS Live DataNodes')
prom_hdfs_num_datanodes_dead = Gauge('hdfs_datanodes_dead', 'HDFS Dead DataNodes')
prom_hbase_num_regionservers_live = Gauge('hbase_regionservers_live', 'HBase Live Regionservers')
prom_hbase_num_regionservers_dead = Gauge('hbase_regionservers_dead', 'HBase Dead Regionservers')
prom_hbase_num_clusterrequests = Gauge('hbase_clusterrequests', 'HBase Clusterrequests')

# HDFS/HBase
hdfs_config_file = "/etc/hadoop/conf/hdfs-site.xml"
cmd_hbase_active_master = ['/usr/hdp/current/hbase-client/bin/hbase-jruby', '/usr/hdp/current/hbase-client/bin/get-active-master.rb']
cmd_hdfs_namenodes = ['hdfs', 'getconf', '-namenodes']
namenodes = ""
namenode_use_tls = False
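
# Metrics are served by prometheus_client's builtin HTTP server, e.g.:
#   curl http://localhost:9010/metrics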

class jmx_query():

    def main(self, hdfs_namenode_hosts):
        hdfs_active_namenode = self.get_active_namenode()
        hbase_active_master = hbase_exporter.get_active_master()

        if not hdfs_active_namenode:
            logging.info("Failed to determine active HDFS namenode")
            return False

        if not hbase_active_master:
            logging.info("Failed to determine active HBase master")
            return False

        url = self.get_url('hdfs', hdfs_active_namenode)
        self.get_jmx_data(url)

        url = self.get_url('hbase', hbase_active_master)
        self.get_jmx_data(url)
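
    # Both the NameNode and the HBase Master expose their metrics as JSON
    # under /jmx on their web UI port, e.g. (hostname is a placeholder):
    #   http://namenode01.example.com:50070/jmx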
    def get_url(self, service, hostname):
        if namenode_use_tls:
            url_scheme = "https://"
        else:
            url_scheme = "http://"

        if service == 'hdfs':
            url = url_scheme + hostname + ":" + str(hdfs_namenode_port) + "/jmx"
        elif service == 'hbase':
            url = url_scheme + hostname + ":" + str(hbase_master_ui_port) + "/jmx"

        return url

    def get_jmx_data(self, url):
        jmx = self.query(url)

        if jmx is False:
            logging.info("Could not read jmx data from: " + url)
            return False

        for k, v in jmx.items():
            if v is not None:
                self.lookup_keys(k, v)

        return True
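
    # Determining the active NameNode needs the HA configuration from
    # hdfs-site.xml. Assuming standard Hadoop HA settings, the relevant
    # properties look like this ('mycluster', 'nn1' and the hostname are
    # placeholders):
    #
    #   <property>
    #     <name>dfs.ha.namenodes.mycluster</name>
    #     <value>nn1,nn2</value>
    #   </property>
    #   <property>
    #     <name>dfs.namenode.rpc-address.mycluster.nn1</name>
    #     <value>namenode01.example.com:8020</value>
    #   </property>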
    def get_active_namenode(self):
        if not which(cmd_hdfs_namenodes[0]):
            logging.info("Could not find hdfs executable in PATH")
            return False

        try:
            r = subprocess.run(cmd_hdfs_namenodes, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            logging.debug("type error: " + str(e))
            logging.info("Failed to determine HDFS namenodes")
            return False

        hosts = r.stdout.decode('utf-8').strip().split(" ")

        tree = et.parse(hdfs_config_file)
        root = tree.getroot()

        for property in root:
            if "dfs.ha.namenodes" not in property.find("name").text:
                continue

            logging.info("Hadoop High-Availability")
            nameservice_id = property.find("name").text[len("dfs.ha.namenodes") + 1:]
            namenodes = property.find("value").text.split(",")

            for node in namenodes:
                # Get the namenode address and check if it is the active node
                for n in root:
                    name = "dfs.namenode.rpc-address." + nameservice_id + "." + node

                    if name not in n.find("name").text:
                        continue

                    node_address = n.find("value").text.split(":")[0]

                    cmd = ['hdfs haadmin -getServiceState ' + node]
                    r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                    if len(r.stderr.decode("utf-8")) > 0:
                        logging.debug(r.stderr.decode("utf-8"))

                    if "active" in r.stdout.decode("utf-8").lower():
                        logging.info("Active namenode: " + node_address + " (" + node + ")")
                        return node_address

        return False
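
    # The /jmx response is a nested JSON document; flatten_json collapses it
    # so that a value such as {"beans": [{"capacityUsed": 0}]} becomes a
    # single key like 'beans_0_capacityUsed', which lookup_keys() then
    # matches by suffix.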
    def query(self, url):
        try:
            r = requests.get(url)
        except Exception as e:
            logging.debug(str(e))
            logging.info("Could not connect to: " + url)
            return False

        jmx = json.loads(r.text)
        jmx = flatten(jmx)
        return jmx

    def lookup_keys(self, key, value):
        # Map jmx key suffixes to the Prometheus gauges they feed
        suffix_to_gauge = {
            "capacityUsed": prom_hdfs_used,
            "capacityTotal": prom_hdfs_total,
            "capacityRemaining": prom_hdfs_remaining,
            "NumLiveDataNodes": prom_hdfs_num_datanodes_live,
            "NumDeadDataNodes": prom_hdfs_num_datanodes_dead,
            "numRegionServers": prom_hbase_num_regionservers_live,
            "numDeadRegionServers": prom_hbase_num_regionservers_dead,
            "clusterRequests": prom_hbase_num_clusterrequests,
        }

        for suffix, gauge in suffix_to_gauge.items():
            if key.endswith(suffix):
                gauge.set(value)
                logging.debug("Found jmx key: " + key)
                break
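
# Collects HBase health signals that the JMX scrape does not cover:
# stale regions in transition (parsed from the Master UI) and, when
# enabled, inconsistencies reported by 'hbase hbck'.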
class hbase_exporter():

    def main(self, hbase_master_hosts):
        hbase_active_master = self.get_active_master()

        if not hbase_active_master:
            logging.info("Failed to determine active HBase master")
            return False

        self.get_stale_regions_in_transition(hbase_active_master)
        #self.hbck_get_inconsistencies()

    @staticmethod
    def get_active_master():
        if not which(cmd_hbase_active_master[0]):
            logging.info("Could not find hbase-jruby executable in PATH")
            return False

        try:
            r = subprocess.run(cmd_hbase_active_master, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            logging.debug("type error: " + str(e))
            logging.info("Failed to determine active HBase master")
            return False

        if 'Master not running' in r.stdout.decode('utf-8'):
            return False

        # Strip the trailing newline so the hostname can be used in URLs
        return r.stdout.decode('utf-8').rstrip()
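
    # The stale regions in transition count is scraped from the Master UI
    # status page at http://<master>:<ui-port>/master-status and parsed
    # out of the HTML.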
    def get_stale_regions_in_transition(self, hbase_master):
        host = hbase_master.rstrip("\n\r")
        port = hbase_master_ui_port
        url = 'http://%(host)s:%(port)s/master-status' % locals()

        logging.debug('GET %s', url)

        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as e:
            logging.debug(e)
            logging.debug('Failed to request ' + url)
            return False

        logging.debug("Response: %s %s", req.status_code, req.reason)

        if req.status_code != 200:
            logging.debug('Got a http return code != 200')
            return False

        num_regions_in_transition_stale = self.hbaseui_parse_output(req.content)

        if num_regions_in_transition_stale is None:
            logging.debug('Parse error - failed to find number of stale regions in transition')
            return False

        if not isinstance(num_regions_in_transition_stale, int):
            logging.debug('Parse error - got non-integer for regions stale in transition')
            return False

        msg = '{0} regions stale in transition'.format(num_regions_in_transition_stale)
        prom_hbase_num_regions_in_transition_stale.set(num_regions_in_transition_stale)
        logging.info(msg)
        return num_regions_in_transition_stale

    def hbaseui_parse_output(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        num_regions_in_transition_stale = 0

        try:
            headings = soup.findAll('h2')

            for heading in headings:
                if heading.get_text() == "Regions in Transition":
                    logging.debug('Found Regions in Transition section header')
                    logging.debug('Looking for table')
                    table = heading.find_next('table')
                    num_regions_in_transition_stale = self.hbaseui_parse_table(table)

                    if not isinstance(num_regions_in_transition_stale, int):
                        logging.debug('Got non-integer \'{0}\' for stale regions in transition when parsing HBase Master UI'
                                      .format(num_regions_in_transition_stale))
                        return None

            return num_regions_in_transition_stale
        except (AttributeError, TypeError):
            logging.info('Failed to parse HBase Master UI status page')
            return None
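
    # 'hbase hbck' prints a consistency report; the two lines of interest
    # look like this (example output, abbreviated):
    #
    #   0 inconsistencies detected.
    #   Status: OK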
    def hbck_get_inconsistencies(self):
        re_status = re.compile(r'^Status:\s*(.+?)\s*$')
        re_inconsistencies = re.compile(r'^\s*(\d+)\s+inconsistencies\s+detected\.?\s*$')
        num_inconsistencies = None
        hbck_status = None

        p = subprocess.Popen(['hbase', 'hbck'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=False)
        output, error = p.communicate()
        output = output.decode('utf-8').splitlines()

        if p.returncode != 0:
            logging.info("Failed to run hbck (%d)" % (p.returncode))
            return False

        for line in output:
            match = re_inconsistencies.match(line)

            if match:
                num_inconsistencies = match.group(1)
                logging.info('Number of inconsistencies: %s', num_inconsistencies)
                continue

            match = re_status.match(line)

            if match:
                hbck_status = match.group(1)
                logging.info('hbck status = %s', hbck_status)
                break

        if hbck_status is None:
            logging.info('Failed to find hbck status result')

        if num_inconsistencies is None:
            logging.info('Failed to find number of inconsistencies')
            return False

        prom_hbase_num_inconsistencies.set(int(num_inconsistencies))

    @staticmethod
    def hbaseui_parse_table(table):
        for row in table.findChildren('tr'):
            for col in row.findChildren('td'):
                if 'Regions in Transition for more than ' in col.get_text():
                    next_sibling = col.findNext('td')
                    num_regions_in_transition_stale = next_sibling.get_text().strip()

                    try:
                        return int(num_regions_in_transition_stale)
                    except ValueError:
                        # Let the caller handle the non-integer value
                        return num_regions_in_transition_stale

        return None
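
# Equivalent in spirit to shutil.which() from the standard library:
# return the full path of an executable found on PATH, or None.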
def which(program):

    def is_executable(fn):
        return os.path.isfile(fn) and os.access(fn, os.X_OK)

    filepath, fname = os.path.split(program)

    if filepath:
        if is_executable(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exec_file = os.path.join(path, program)
            if is_executable(exec_file):
                return exec_file

    return None

if __name__ == '__main__':

    file_handler = logging.FileHandler(filename=log_path + 'hbase-exporter.log')
    stdout_handler = logging.StreamHandler(sys.stdout)
    handlers = [file_handler, stdout_handler]

    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
        handlers=handlers
    )

    logger = logging.getLogger('LOGGER_NAME')

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--hbase-master-hosts', dest='hbase_masters', help="Comma separated list of HBase master hosts", type=str)
    parser.add_argument('--hdfs-namenode-hosts', dest='hdfs_namenodes', help="Comma separated list of HDFS namenode hosts", type=str)
    parser.add_argument('--logfile', dest='logfile', help="Path to logfile, if logging to a file is desired", type=str)

    args = parser.parse_args()

    # Optional File Logging
    #if logfile:
    #    handler = logging.FileHandler(logfile)
    #    handler.setLevel(logging.INFO)
    #    log.addHandler(handler)
    #    logging.basicConfig(filename=logfile, level=logging.INFO)

    # Start the Prometheus server
    start_http_server(prom_http_port)
    nscrapes = 0

    # Hosts come in as comma separated lists; fall back to localhost
    # and the default UI ports if no hosts were given
    if args.hbase_masters is None:
        hbase_master_hosts = ['localhost']
    else:
        hbase_master_hosts = args.hbase_masters.split(',')

    hbase_master_ui_port = 16010

    if args.hdfs_namenodes is None:
        hdfs_namenode_hosts = ['localhost']
    else:
        hdfs_namenode_hosts = args.hdfs_namenodes.split(',')

    hdfs_namenode_port = 50070

    while True:
        jmx_query().main(hdfs_namenode_hosts)
        hbase_exporter().main(hbase_master_hosts)

        nscrapes += 1

        if nscrapes == 1:
            logging.info("Started HBase exporter")

        time.sleep(prom_scrape_interval_s)
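
# A minimal Prometheus scrape config for this exporter could look like
# this (job name and target host are placeholders):
#
#   scrape_configs:
#     - job_name: 'hbase'
#       static_configs:
#         - targets: ['exporter-host.example.com:9010']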