Skip to content
Snippets Groups Projects

check_ceph_osdmaps

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by Kevin Hrpcek
    snippetfile1.txt 4.16 KiB
    #!/usr/bin/env python3
    """
    Check that OSD map counts are within tolerance
    """
    import argparse
    import errno
    import json
    import logging
    from pathlib import Path
    import shlex
    from subprocess import CalledProcessError, run as subprocess_run, PIPE, STDOUT
    import sys
    
    # Reuse the module docstring as the argparse description shown by --help.
    DESCRIPTION = __doc__
    
    
    def main():
        """Check the stored OSD map count of one local OSD and exit Nagios-style.

        Queries the admin sockets found under /var/run/ceph (in sorted order)
        via ``ceph daemon <socket> status`` until one returns a JSON object
        containing both ``newest_map`` and ``oldest_map``.  The difference
        between the two is compared against the ``--warning`` and
        ``--critical`` thresholds.

        Output: ``<STATUS>: <summary>`` is printed first, buffered warnings are
        then emitted through ``logging.warning``, and ``| osdmaps=<count>`` is
        printed when a count was obtained.  The process exits through
        ``sys.exit`` with 0 (OK), 1 (WARNING) or 2 (CRITICAL); any buffered
        warning raises the exit code to at least 1.
        """
        args = get_args()
        configure_logging(args.verbose)
        exitcode = None
        # Buffer warnings so we can output plugin status as line 1
        warnings = []

        # OSD map info is expected to be identical across all OSDs in the cluster
        # So we just need to find one OSD and treat its map status as representative

        osd_status = map_count = None
        osd_sockets = (
            p for p in Path("/var/run/ceph").iterdir() if p.name.startswith("ceph-osd")
        )
        # Stabilize OSD order across runs
        # If OSD #1 on this host goes down, oh well, it should be stable enough
        sorted_sockets = sorted(osd_sockets)
        for socket_path in sorted_sockets:
            cmd = f"ceph daemon {socket_path} status"
            try:
                out = run_command(cmd)
            except CalledProcessError as e:
                if e.returncode == errno.EINVAL:
                    warnings.append(f"EINVAL from '{cmd}'")
                    warnings.append(f"stdout: {str(e.stdout).strip()}")
                    warnings.append(f"stderr: {str(e.stderr).strip()}")
                # Any other non-zero exit code is skipped without a warning;
                # the socket is simply treated as unusable and the next one
                # is tried.
                continue

            try:
                candidate = json.loads(out)
            except json.decoder.JSONDecodeError:
                warnings.append(f"JSON decode error on status from {socket_path}: {out}")
                continue

            # Validate explicitly rather than with `assert` (which is removed
            # under `python -O`), and only accept the status once it is known
            # to be usable.  Otherwise an invalid reply from the last socket
            # would leak into the map count computation below and crash there.
            has_map_keys = (
                isinstance(candidate, dict)
                and "newest_map" in candidate
                and "oldest_map" in candidate
            )
            if not has_map_keys:
                warnings.append(f"Missing map keys in status from {socket_path}: {out}")
                continue

            osd_status = candidate
            break

        if osd_status is None:
            if not sorted_sockets:
                # Whether or not OSDs are supposed to be running is a different plugin's problem
                summary = "No running OSDs found"
                exitcode = 0
            else:
                summary = "No OSD status found despite OSD sockets being present"
                exitcode = 2
        else:
            map_count = osd_status["newest_map"] - osd_status["oldest_map"]
            summary = f"{map_count} stored OSD maps estimated per OSD"
            if map_count >= args.critical:
                exitcode = 2
            elif map_count >= args.warning:
                exitcode = 1
            else:
                exitcode = 0

        if warnings:
            # Ensure that warnings imply state WARNING or higher
            exitcode = max(exitcode, 1)
        plugin_status = {0: "OK", 1: "WARNING", 2: "CRITICAL"}[exitcode]
        print(f"{plugin_status}: {summary}")

        for warning in warnings:
            logging.warning(warning)
        if map_count is not None:
            print(f"| osdmaps={map_count}")
        sys.exit(exitcode)
    
    def get_args():
        """Parse the command line and return the resulting namespace.

        Accepts a repeatable verbosity flag plus the two mandatory integer
        thresholds, ``--warning`` and ``--critical``.
        """
        cli = argparse.ArgumentParser(description=DESCRIPTION)
        cli.add_argument(
            "--verbose",
            "-v",
            action="count",
            default=0,
            help="Verbose output, specify multiple times for increased verbosity.",
        )
        # Both thresholds have the same shape, so declare them from a table.
        for short_flag, severity in (("-w", "warning"), ("-c", "critical")):
            cli.add_argument(
                short_flag,
                f"--{severity}",
                type=int,
                required=True,
                help=f"{severity.capitalize()} threshold. See specific checks for value types",
            )
        return cli.parse_args()
    
    
    def configure_logging(verbosity):
        """Set the root logging level from a verbosity count.

        Verbosity 0 maps to WARNING, and each further step lowers the threshold
        by one level, bottoming out at DEBUG.  Negative values leave logging
        unconfigured.
        """
        if not verbosity >= 0:
            return
        threshold = logging.WARNING - 10 * verbosity
        if threshold < logging.DEBUG:
            threshold = logging.DEBUG
        logging.basicConfig(level=threshold, format="")
    
    
    def run_command(command, check=True, timeout=None, shell=False):
        """Run *command* through subprocess.run and return its stripped output.

        Unless *shell* is true, the command string is tokenised with
        shlex.split before execution.  stderr is folded into stdout and the
        combined text is decoded as UTF-8.  *check* and *timeout* are passed
        straight through to subprocess.run.
        """
        logging.debug(f"Running command '{command}'")
        command = command if shell else shlex.split(command)
        completed = subprocess_run(
            command,
            stdout=PIPE,
            stderr=STDOUT,
            encoding="utf8",
            shell=shell,
            check=check,
            timeout=timeout,
        )
        output = completed.stdout.strip()
        logging.debug(f"Command '{command}' exited and output:")
        logging.debug(output)
        return output
    
    
    # Run the check only when executed as a script, not when imported.
    if __name__ == "__main__":
        main()
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or sign in to comment