compare.py

#!/usr/bin/env python
# encoding: utf-8
"""

Top-level routines to compare two files.


Created by rayg Apr 2009.
Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
"""

#from pprint import pprint, pformat

import os, sys, logging, datetime, glob, re
from numpy import *
import numpy
from urllib.parse import quote

import locale
locale.setlocale(locale.LC_ALL,'') # Initialize our locale

import matplotlib
# this is a hack to keep glance from needing pyqt unless you run the gui
# The backend is chosen here, at import time, based on whether the literal
# argument "gui" appears anywhere in the command line arguments.
if "gui" in sys.argv[1:] :
    try :
        # the gui path selects the Qt5Agg backend and pulls in the gui controller module
        matplotlib.use('Qt5Agg')
        import glance.gui_controller as gui_control
    except ImportError :
        # tell the user what is (presumably) missing, then re-raise so the original error is not hidden
        print ("*** Unable to import PyQt5. Please install PyQt5 and add it to your PYTHONPATH in order to use the Glance GUI. ***")
        raise
else :
    # every other invocation selects the Agg backend
    matplotlib.use('Agg')

import glance.io     as io
import glance.data   as dataobj
import glance.report as reportModule
import glance.stats  as statistics
import glance.plot   as plot
import glance.plotcreatefns as plotcreate
import glance.collocation   as collocation
import glance.config_organizer as config_organizer

from glance.util        import clean_path, rsync_or_copy_files, get_glance_version_string, get_run_identification_info, setup_dir_if_needed
from glance.load        import get_UV_info_from_magnitude_direction_info, load_variable_data, open_and_process_files, handle_lon_lat_info, handle_lon_lat_info_for_one_file, ValueErrorStringToFloat
from glance.lonlat_util import VariableComparisonError
from glance.constants   import *
from glance.gui_constants import A_CONST, B_CONST

LOG = logging.getLogger(__name__)

def _get_all_commands_help_string (commands_dict, ) :
    """
    given the dictonary of commands, compose the string with brief information about all of them
    """

    to_return = "Available commands in Glance:\n"

    for command_name in commands_dict :
        short_desc = commands_dict[command_name].__doc__.split('\n')[0]
        to_return += "\t%-16s %s\n" % (command_name, short_desc)

    return to_return

def _get_possible_files_from_dir (dir_path) :
    """collect the paths under a directory whose extension is one glance can open

    The directory is searched recursively. A path qualifies when the text after
    its final "." is among the extensions reported by
    io.get_acceptable_file_extensions().

    returns a set of the qualifying paths, exactly as the glob search produced them
    """

    openable_extensions = io.get_acceptable_file_extensions()
    search_pattern = os.path.join(dir_path, "**")

    # rsplit with a limit of 1 leaves the text after the last "." as the final piece
    # (or the whole path when there is no "." at all)
    return {
        candidate
        for candidate in glob.iglob(search_pattern, recursive=True, )
        if candidate.rsplit(".", 1)[-1] in openable_extensions
    }

def _match_files_from_dirs (a_path, b_path, strip_expressions=None, ) :
    """given two paths to directories, try to match up the files we can analyze in them

    Files are matched on their base names (the directory part is ignored) after
    every regular expression in strip_expressions has been removed from the name.

    a_path            - the directory to search for "A" files
    b_path            - the directory to search for "B" files
    strip_expressions - optional list of regular expressions; any text they match is
                        deleted from a base name before names are compared

    returns a set of (a_filepath, b_filepath) tuples; if several files share the same
    cleaned name every matching combination is included
    """

    if strip_expressions is None :
        strip_expressions = [ ]

    def strip_expressions_from_base (file_path, expressions,) :
        """return the base name of file_path with all matches of the expressions removed"""
        clean_name = os.path.basename(file_path)
        for expr in expressions :
            clean_name = re.sub(expr, '', clean_name)
        return clean_name

    # find all the files in the a path we might be able to open
    found_a_files = _get_possible_files_from_dir(a_path)

    LOG.debug("Found " + str(len(found_a_files)) + " possible file(s) in the A directory: ")
    for filepath in found_a_files :
        LOG.debug(filepath)

    # note: an earlier approach paired files by their path relative to a_path / b_path;
    # matching is now done on cleaned base names instead (see below)

    # find all the files in the b path we might be able to open
    found_b_files = _get_possible_files_from_dir(b_path)

    LOG.debug("Found " + str(len(found_b_files)) + " possible file(s) in the B directory: ")
    for filepath in found_b_files :
        LOG.debug(filepath)

    # clean each b name only once and index the b files by their cleaned name,
    # so each a file can find its partners with a single lookup
    b_files_by_clean_name = { }
    for b_filepath in found_b_files :
        clean_b = strip_expressions_from_base(b_filepath, strip_expressions,)
        b_files_by_clean_name.setdefault(clean_b, [ ]).append(b_filepath)

    # try to pair up our files if possible
    file_pairs = set()
    for a_filepath in found_a_files :
        clean_a = strip_expressions_from_base(a_filepath, strip_expressions,)
        for b_filepath in b_files_by_clean_name.get(clean_a, ( )) :
            file_pairs.add((a_filepath, b_filepath,))

    return file_pairs

# TODO, I'd like to move this into a different file at some point
def _get_name_info_for_variable (original_display_name, variable_run_info) :
    """
    work out the names used to refer to a variable, based on its run info

    original_display_name - the display name given by the user to describe the variable
    variable_run_info     - the run information dictionary for the variable

    returns a tuple of three names:

    technical_name -            the name the variable is listed under in the file
    b_variable_technical_name - the name the variable is listed under in the b file (may be the same as technical_name)
    explanation_name -          the more verbose name that will be shown to the user to identify the variable
    """

    name_in_a = variable_run_info[VARIABLE_TECH_NAME_KEY]

    if VARIABLE_B_TECH_NAME_KEY in variable_run_info :
        # B uses an alternate variable name, so the explanation lists both names
        name_in_b = variable_run_info[VARIABLE_B_TECH_NAME_KEY]
        verbose_name = name_in_a + " / " + name_in_b
    else :
        name_in_b = name_in_a
        verbose_name = name_in_a

    if original_display_name == verbose_name :
        # the display name adds nothing new, so the explanation stands as is
        pass
    else :
        # lead with the display name and keep the technical detail in parentheses
        verbose_name = original_display_name + ' (' + verbose_name + ')'

    return name_in_a, name_in_b, verbose_name

def colocateToFile_library_call(a_path, b_path, var_list=None,
                                options_set=None,
                                # todo, this doesn't yet do anything
                                do_document=False,
                                # todo, the output channel does nothing at the moment
                                output_channel=sys.stdout) :
    """
    this method handles the actual work of the colocateData command line tool
    and can be used as a library routine.
    
    TODO, properly document the options
    """

    # set some values for defaults
    var_list = [ ] if var_list is None else var_list
    options_set = { } if options_set is None else options_set

    # load the user settings from either the command line or a user defined config file
    pathsTemp, runInfo, defaultValues, requestedNames, usedConfigFile = config_organizer.load_config_or_options(a_path, b_path,
                                                                                                                options_set,
                                                                                                                requestedVars = var_list)

    # deal with the input and output files
    setup_dir_if_needed(pathsTemp[OUT_FILE_KEY], "output")
    
    # make copies of the input files for colocation TODO, fix paths
    [pathsTemp[A_FILE_KEY], pathsTemp[B_FILE_KEY]] = rsync_or_copy_files ([pathsTemp[A_FILE_KEY], pathsTemp[B_FILE_KEY]],
                                                                          target_directory=pathsTemp[OUT_FILE_KEY],
                                                                          additionalFileNameSuffix='-collocated')

    # open the files
    LOG.info("Processing File A:")
    aFile = dataobj.FileInfo(pathsTemp[A_FILE_KEY], allowWrite=True)
    if aFile is None: