check_ceph_osdmaps
The snippet can be accessed without any authentication.
Authored by
Kevin Hrpcek
#!/usr/bin/env python3
"""
Check that OSD map counts are within tolerance
"""
import argparse
import errno
import json
import logging
from pathlib import Path
import shlex
from subprocess import CalledProcessError, run as subprocess_run, PIPE, STDOUT
import sys
# Module docstring, reused as the description shown in the command-line help
DESCRIPTION = __doc__
def main():
    """Check the stored OSD map count on this host and exit Nagios-style.

    The admin sockets named ``ceph-osd*`` under ``/var/run/ceph`` are
    queried in sorted order with ``ceph daemon <socket> status`` until one
    returns a JSON object containing both ``newest_map`` and ``oldest_map``.
    The stored map count is estimated as ``newest_map - oldest_map`` and
    compared with the ``--warning`` and ``--critical`` thresholds.

    Output: the first stdout line is ``<STATUS>: <summary>``; buffered
    warnings are then emitted through ``logging``; when a map count was
    obtained, a final ``| osdmaps=<count>`` line is printed.

    Exit codes: 0 (OK), 1 (WARNING), 2 (CRITICAL).  Any buffered warning
    raises the result to at least WARNING.

    Raises:
        SystemExit: always, carrying the plugin exit code.
    """
    args = get_args()
    configure_logging(args.verbose)

    # Buffer warnings so we can output plugin status as line 1
    warnings = []

    # OSD map info is expected to be identical across all OSDs in the cluster
    # So we just need to find one OSD and treat its map status as representative
    osd_status = None
    map_count = None

    sorted_sockets = _list_osd_sockets()
    for socket_path in sorted_sockets:
        cmd = f"ceph daemon {socket_path} status"
        try:
            out = run_command(cmd)
        except CalledProcessError as e:
            if e.returncode == errno.EINVAL:
                warnings.append(f"EINVAL from '{cmd}'")
                warnings.append(f"stdout: {str(e.stdout).strip()}")
                warnings.append(f"stderr: {str(e.stderr).strip()}")
            # NOTE(review): other non-zero exit codes are skipped without a
            # warning, exactly as before -- confirm that this is intended.
            continue

        try:
            candidate = json.loads(out)
        except json.decoder.JSONDecodeError:
            warnings.append(f"JSON decode error on status from {socket_path}: {out}")
            continue

        # Validate before accepting: a rejected status must never leak into
        # the evaluation below (explicit check instead of ``assert``, which
        # is removed when Python runs with -O).
        if not _has_map_keys(candidate):
            warnings.append(f"Missing map keys in status from {socket_path}: {out}")
            continue

        osd_status = candidate
        break

    if osd_status is not None:
        map_count = osd_status["newest_map"] - osd_status["oldest_map"]
        summary = f"{map_count} stored OSD maps estimated per OSD"
        if map_count >= args.critical:
            exitcode = 2
        elif map_count >= args.warning:
            exitcode = 1
        else:
            exitcode = 0
    elif sorted_sockets:
        summary = "No OSD status found despite OSD sockets being present"
        exitcode = 2
    else:
        # Whether or not OSDs are supposed to be running is a different plugin's problem
        summary = "No running OSDs found"
        exitcode = 0

    if warnings:
        # Ensure that warnings imply state WARNING or higher
        exitcode = max(exitcode, 1)

    plugin_status = {0: "OK", 1: "WARNING", 2: "CRITICAL"}[exitcode]
    print(f"{plugin_status}: {summary}")
    for warning in warnings:
        logging.warning(warning)
    if map_count is not None:
        print(f"| osdmaps={map_count}")
    sys.exit(exitcode)


def _list_osd_sockets(run_dir="/var/run/ceph"):
    """Return the sorted ``ceph-osd*`` entries of *run_dir* ([] if it is missing)."""
    try:
        # Stabilize OSD order across runs
        # If OSD #1 on this host goes down, oh well, it should be stable enough
        return sorted(
            p for p in Path(run_dir).iterdir() if p.name.startswith("ceph-osd")
        )
    except FileNotFoundError:
        # No run directory at all: report it the same way as "no sockets"
        return []


def _has_map_keys(status):
    """Return True when *status* is a dict holding both OSD map epoch keys."""
    return (
        isinstance(status, dict)
        and "newest_map" in status
        and "oldest_map" in status
    )
def get_args(argv=None):
    """Parse the command-line options of the OSD map check.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which keeps the
            original no-argument call working unchanged.

    Returns:
        argparse.Namespace with ``verbose`` (count of ``-v`` flags,
        default 0), ``warning`` (int, required) and ``critical`` (int,
        required).
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help="Verbose output, specify multiple times for increased verbosity.",
    )
    parser.add_argument(
        "-w",
        "--warning",
        type=int,
        required=True,
        help="Warning threshold: stored OSD map count at or above which WARNING is reported",
    )
    parser.add_argument(
        "-c",
        "--critical",
        type=int,
        required=True,
        help="Critical threshold: stored OSD map count at or above which CRITICAL is reported",
    )
    return parser.parse_args(argv)
def configure_logging(verbosity):
    """Configure root logging from the number of ``-v`` flags.

    Args:
        verbosity: Count of ``-v`` flags.  0 selects WARNING, 1 selects
            INFO, 2 or more selects DEBUG.  Negative values are treated
            as 0 so that logging is always configured.
    """
    # Each -v lowers the threshold by one standard level (10 units),
    # starting at WARNING and never going below DEBUG.
    level = max(logging.WARNING - 10 * max(verbosity, 0), logging.DEBUG)
    # An empty format string falls back to logging's default "%(message)s",
    # so records are printed as the bare message.
    logging.basicConfig(level=level, format="")
def run_command(command, check=True, timeout=None, shell=False):
    """Execute *command* and return its stripped, combined output.

    Args:
        command: Command line as a single string.
        check: Passed to ``subprocess.run``; when true a non-zero exit
            status raises ``CalledProcessError``.
        timeout: Passed to ``subprocess.run`` (seconds, or ``None``).
        shell: When false the string is tokenised with ``shlex.split``
            before execution; when true it is handed to the shell as-is.

    Returns:
        The process output decoded as UTF-8 with surrounding whitespace
        removed.  stderr is redirected into stdout, so both are included.
    """
    logging.debug(f"Running command '{command}'")

    # Without a shell, subprocess expects an argument list
    command = command if shell else shlex.split(command)

    run_options = {
        "check": check,
        "timeout": timeout,
        "shell": shell,
        "stdout": PIPE,
        "stderr": STDOUT,
        "encoding": "utf8",
    }
    completed = subprocess_run(command, **run_options)

    output = completed.stdout.strip()
    logging.debug(f"Command '{command}' exited and output:")
    logging.debug(output)
    return output
# Run the check only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Please register or sign in to comment