Improve log messages, optionally remove hbck lockfile

This commit is contained in:
Björn Busse 2020-06-17 23:00:19 +02:00
parent 2e6e541d14
commit 241b41cbcb
1 changed files with 37 additions and 18 deletions

View File

@ -83,6 +83,8 @@ namenode_use_tls = False
hbase_master_ui_default_port = 16010
hdfs_namenode_default_port = 50070
cluster_is_kerberized = False
hbase_hbck_remove_lockfile = True
class zk():
zk_client = ""
@ -386,26 +388,21 @@ class hbase_exporter():
logging.info("hbase: Write test failed! Is Thrift up and running?")
prom_hbase_writeable.set(0)
self.check_health(run_hbck)
hbase_health = self.check_health(run_hbck)
prom_hbase_healthy.set(hbase_health)
def check_health(self, run_hbck):
# Decide whether HBase counts as healthy and return the verdict as a bool.
# Reads self.num_inconsistencies, self.num_regions_in_transition_stale and
# self.hbase_write_success; any failing check returns False, otherwise True.
# NOTE(review): this listing appears to interleave removed and added diff
# lines - confirm which of the gauge updates below are still current.
# Only check for inconsistencies if we actually ran hbck
if run_hbck and self.num_inconsistencies > 0:
prom_hbase_healthy.set(0)
return False
# Any stale region in transition marks the cluster unhealthy
if self.num_regions_in_transition_stale > 0:
prom_hbase_healthy.set(0)
return False
# A non-zero write test result marks the cluster unhealthy
if self.hbase_write_success != 0:
prom_hbase_healthy.set(0)
return False
# All checks passed
prom_hbase_up.set(1)
prom_hbase_healthy.set(1)
return True
@ -446,7 +443,6 @@ class hbase_exporter():
if 'Master not running' in r.stdout.decode('utf-8'):
return False
prom_hbase_up.set(1)
active_master = r.stdout.decode('utf-8').rstrip()
return active_master
@ -474,33 +470,35 @@ class hbase_exporter():
num_regions_in_transition_stale = self.hbaseui_parse_output(req.content)
if num_regions_in_transition_stale is None:
logging.debug('Parse error - failed to find number of stale regions in transition')
logging.debug('hbase-ui: Parse error - failed to find number of stale regions in transition')
if not isinstance(num_regions_in_transition_stale, int):
logging.debug('Parse error - got non-integer for stale regions in transition')
logging.debug('hbase-ui: Parse error - got non-integer for stale regions in transition')
self.num_regions_in_transition_stale = num_regions_in_transition_stale
def hbaseui_parse_output(self, content):
# Extract the number of stale regions in transition from the HBase Master UI page.
# content is parsed as HTML with BeautifulSoup's 'html.parser' backend.
# Returns the result of hbaseui_parse_table() for the table that follows the
# "Regions in Transition" heading, the initial default when no such heading
# is found, or -1 when parsing raises AttributeError or TypeError.
# NOTE(review): this listing appears to interleave removed and added diff
# lines (two initialisations, duplicated log calls) - confirm the current set.
soup = BeautifulSoup(content, 'html.parser')
num_regions_in_transition_stale = -1
num_regions_in_transition_stale = 0
try:
headings = soup.findAll('h2')
for heading in headings:
# The section only exists if there are stale regions in transition
if heading.get_text() == "Regions in Transition":
logging.debug('Found Regions in Transition section header')
logging.debug('Looking for table')
logging.info('hbase-ui: Found Regions in Transition section header')
logging.info('hbase-ui: Looking for table')
# The count is read from the first table after the heading
table = heading.find_next('table')
num_regions_in_transition_stale = self.hbaseui_parse_table(table)
if not isinstance(num_regions_in_transition_stale, int):
logging.debug('Got non-integer \'{0}\' for stale regions in transition when parsing HBase Master UI'\
logging.info('hbase-ui: Got non-integer \'{0}\' for stale regions in transition when parsing HBase Master UI'\
.format(num_regions_in_transition_stale))
return num_regions_in_transition_stale
except (AttributeError, TypeError):
logging.info('Failed to parse HBase Master UI status page')
logging.info('hbase-ui: Failed to parse HBase Master UI status page')
# -1 signals a parse failure to the caller
return -1
return num_regions_in_transition_stale
def hbck_inconsistencies(self):
@ -538,6 +536,13 @@ class hbase_exporter():
if match:
hbck_status = match.group(0)
logging.info('hbase-hbck: hbck status = %s', hbck_status)
hdfs_lock_uri = re.findall('hdfs://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', hbck_status)
for uri in hdfs_lock_uri:
logging.info('hbase-hbck: Locked by lockfile: {0}'.format(hdfs_lock_uri[0]))
if hbase_hbck_remove_lockfile:
hdfs_remove_file(uri)
else:
logging.info('hbase-hbck: Please remove lockfile manually if no hbck is running')
break
if hbck_status is None:
@ -593,6 +598,20 @@ class hbase_exporter():
return True
def hdfs_remove_file(hdfs_uri):
    """Remove a file from HDFS by running ``hadoop fs -rm`` and log its output.

    Args:
        hdfs_uri: URI of the file to delete; handed to the command unchanged.

    Returns:
        The lines the command wrote to stderr, as a list of ``bytes``
        (empty if it wrote nothing).
    """
    p = Popen(['hadoop', 'fs', '-rm', hdfs_uri], stdout=PIPE, stderr=PIPE, close_fds=False)
    output, error = p.communicate()
    output = output.splitlines()
    error = error.splitlines()

    # The pipes deliver bytes; decode before logging so the log shows the
    # text itself instead of a b'...' repr.
    for line in output + error:
        logging.info("hdfs-rm: %s", line.decode('utf-8', errors='replace'))

    # The captured output alone does not distinguish success from failure,
    # so report a non-zero exit status explicitly.
    if p.returncode != 0:
        logging.warning("hdfs-rm: Removing %s failed with exit code %s", hdfs_uri, p.returncode)

    return error
def which(program):
def is_executable(fn):
@ -727,7 +746,7 @@ if __name__ == '__main__':
jmx.main(hdfs_namenodes)
hbase_hbck_time_s = int((dt.datetime.now() - hbase_hbck_timer_s).total_seconds())
logging.info("hbase-hbck: Timer: {0} seconds".format(hbase_hbck_time_s))
logging.debug("hbase-hbck: Timer: {0} seconds".format(hbase_hbck_time_s))
# Do an hbck on the first run and then whenever the interval
# between two consecutive runs in seconds is higher than the configured interval