compare.py

#!/usr/bin/env python
# encoding: utf-8
"""

Top-level routines to compare two files.


Created by rayg Apr 2009.
Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
"""

import os, sys, logging, re, subprocess, datetime
import imp as imp
from pprint import pprint, pformat
from numpy import *
import pkg_resources
from pycdf import CDFError
from subprocess import check_call as sh
from urllib import quote

import glance.io     as io
import glance.delta  as delta
import glance.data   as dataobj
import glance.plot   as plot
import glance.report as report
import glance.stats  as statistics
import glance.plotcreatefns as plotcreate
import glance.collocation   as collocation

LOG = logging.getLogger(__name__)

# these are the built in defaults for the settings
glance_setting_defaults = {'shouldIncludeReport':       True,
                           'shouldIncludeImages':       False,
                           'doFork':                    False,
                           'useThreadsToControlMemory': False,
                           'useSharedRangeForOriginal': False,
                           'noLonLatVars':              False,
                           'detail_DPI':                150,
                           'thumb_DPI':                 50}

# these are the built in longitude/latitude defaults
glance_lon_lat_defaults = {'longitude': 'pixel_longitude',
                           'latitude':  'pixel_latitude',
                           'lon_lat_epsilon': 0.0,
                           'data_filter_function_lon_in_a': None,
                           'data_filter_function_lat_in_a': None,
                           'data_filter_function_lon_in_b': None,
                           'data_filter_function_lat_in_b': None
                           }

# these are the built in default settings for the variable analysis
glance_analysis_defaults = {'epsilon': 0.0,
                            'epsilon_percent': None,
                            'missing_value': None,
                            'epsilon_failure_tolerance': 0.0,
                            'nonfinite_data_tolerance':  0.0,
                            'total_data_failure_tolerance': None,
                            'minimum_acceptable_squared_correlation_coefficient': None
                            }

UNITS_CONSTANT = "units"

def _clean_path(string_path) :
    """
    Return a clean form of the path without any '.', '..', or '~'
    """
    clean_path = None
    if string_path is not None :
        clean_path = os.path.abspath(os.path.expanduser(string_path))
    
    return clean_path

def _parse_varnames(names, terms, epsilon=0.0, missing=None):
    """filter variable names and substitute default epsilon and missing settings if none provided
    returns (variable name, epsilon, missing) triples
    
    >>> _parse_varnames( ['foo','bar', 'baz', 'zoom', 'cat'], ['f..:0.5:-999', 'ba.*:0.001', 'c.t::-9999'], 1e-7 )
    set([('foo', 0.5, -999.0), ('cat', 9.9999999999999995e-08, -9999.0), ('bar', 0.001, None), ('baz', 0.001, None)])
    
    names   - all the variable names in the file (ie. names that should be considered valid)
    terms   - variable selection terms given from the command line
    epsilon - a default epsilon to be used for all variables that do not have a specific epsilon given
    missing - a default fill value to be used for all variables that do not have a specific fill value given
    """
    terms = [x.split(':') for x in terms]
    terms = [(re.compile(x[0]).match,x[1:]) for x in terms]
    def _cvt_em(eps=None, mis=None):
        eps = float(eps) if eps else epsilon
        mis = float(mis) if mis else missing
        return eps, mis
    sel = [ ((x,)+_cvt_em(*em)) for x in names for (t,em) in terms if t(x) ]
    return set(sel)

def _check_file_names(fileAObject, fileBObject) :
    """
    get information about the names in the two files and how they compare to each other
    """
    # get information about the variables stored in the files
    aNames = set(fileAObject())
    bNames = set(fileBObject())
    
    # get the variable names they have in common
    commonNames = aNames.intersection(bNames)
    # which names are unique to only one of the two files?
    uniqueToANames = aNames - commonNames
    uniqueToBNames = bNames - commonNames
    
    return _check_shared_names(set(fileAObject()), set(fileBObject()))

def _check_shared_names (nameSetA, nameSetB) :
    """
    compare the names in the two sets
    """
    
    # what names do they have in common?
    commonNames = nameSetA.intersection(nameSetB)
    # what names are unique to each set?
    uniqueToANames = nameSetA - commonNames
    uniqueToBNames = nameSetB - commonNames
    
    return {'sharedVars': commonNames,  'uniqueToAVars': uniqueToANames, 'uniqueToBVars': uniqueToBNames}

def _resolve_names(fileAObject, fileBObject, defaultValues,
                   requestedNames, usingConfigFileFormat=False) :
    """
    figure out which names the two files share and which are unique to each file, as well as which names
    were requested and are in both sets
    
    usingConfigFileFormat signals whether the requestedNames parameter will be in the form of the inputed
    names from the command line or a more complex dictionary holding information about the names read in
    from a configuration file
    
    Note: if we ever need a variable with different names in file A and B to be comparable, this logic
    will need to be changed.
    """
    # look at the names present in the two files and compare them
    nameComparison = _check_file_names(fileAObject, fileBObject)
    
    # figure out which set should be selected based on the user requested names
    fileCommonNames = nameComparison['sharedVars']
    finalNames = {}
    if (usingConfigFileFormat) :
        
        # if the user didn't ask for any, try everything
        if (len(requestedNames) is 0) :
            finalFromCommandLine = _parse_varnames(fileCommonNames, ['.*'],
                                                   defaultValues['epsilon'], defaultValues['missing_value'])
            for name, epsilon, missing in finalFromCommandLine :
                # we'll use the variable's name as the display name for the time being
                finalNames[name] = {}
                # make sure we pick up any other controlling defaults
                finalNames[name].update(defaultValues) 
                # but override the values that would have been determined by _parse_varnames
                finalNames[name]['variable_name'] = name
                finalNames[name]['epsilon'] = epsilon
                
                # load the missing value if it was not provided
                missing, missing_b = _get_missing_values_if_needed((fileAObject, fileBObject), name,
                                                                   missing_value_A=missing, missing_value_B=missing)
                finalNames[name]['missing_value'] = missing 
                finalNames[name]['missing_value_alt_in_b'] = missing_b
                
                # get any information about the units listed in the files
                finalNames[name]['units_a'] = fileAObject.get_attribute(name, UNITS_CONSTANT)
                finalNames[name]['units_b'] = fileBObject.get_attribute(name, UNITS_CONSTANT)
                
        # otherwise just do the ones the user asked for
        else : 
            # check each of the names the user asked for to see if it is either in the list of common names
            # or, if the user asked for an alternate name mapping in file B, if the two mapped names are in
            # files A and B respectively
            for dispName in requestedNames :
                
                # hang on to info on the current variable
                currNameInfo = requestedNames[dispName] 
                
                # get the variable name 
                if 'variable_name' in currNameInfo :
                    name = currNameInfo['variable_name']
                    name_b = name
                    
                    if ('alternate_name_in_B' in currNameInfo) :
                        name_b = currNameInfo['alternate_name_in_B']
                    
                    if ( (name in fileCommonNames) and (not currNameInfo.has_key('alternate_name_in_B')) ) or \
                            ( (currNameInfo.has_key('alternate_name_in_B') and
                              ((name   in nameComparison['uniqueToAVars']) or (name   in fileCommonNames))  and
                              ((name_b in nameComparison['uniqueToBVars']) or (name_b in fileCommonNames))) ) :
                        finalNames[dispName] = defaultValues.copy() 
                        finalNames[dispName]['display_name'] = dispName
                        finalNames[dispName].update(currNameInfo)
                        
                        # load the missing value if it was not provided
                        missing = finalNames[dispName]['missing_value']
                        if ('missing_value_alt_in_b' in finalNames[dispName]) :
                            missing_b = finalNames[dispName]['missing_value_alt_in_b']
                        else :
                            missing_b = missing
                        finalNames[dispName]['missing_value'], finalNames[dispName]['missing_value_alt_in_b'] = \
                                    _get_missing_values_if_needed((fileAObject, fileBObject), name, name_b,
                                                                  missing, missing_b)
                        
                        # get any information about the units listed in the files
                        finalNames[dispName]['units_a'] = fileAObject.get_attribute(name,   UNITS_CONSTANT)
                        finalNames[dispName]['units_b'] = fileBObject.get_attribute(name_b, UNITS_CONSTANT)
                        
                else :
                    LOG.warn('No technical variable name was given for the entry described as "' + dispName + '". ' +
                             'Skipping this variable.')
    else:
        # format command line input similarly to the stuff from the config file
        print (requestedNames)
        finalFromCommandLine = _parse_varnames(fileCommonNames, requestedNames,
                                               defaultValues['epsilon'], defaultValues['missing_value'])
        for name, epsilon, missing in finalFromCommandLine :
            ## we'll use the variable's name as the display name for the time being
            finalNames[name] = {}
            # make sure we pick up any other controlling defaults
            finalNames[name].update(defaultValues) 
            # but override the values that would have been determined by _parse_varnames
            finalNames[name]['variable_name'] = name
            finalNames[name]['epsilon'] = epsilon
            
            # load the missing value if it was not provided
            missing, missing_b = _get_missing_values_if_needed((fileAObject, fileBObject), name,
                                                               missing_value_A=missing, missing_value_B=missing)
            finalNames[name]['missing_value'] = missing 
            finalNames[name]['missing_value_alt_in_b'] = missing_b
            
            # get any information about the units listed in the files
            finalNames[name]['units_a'] = fileAObject.get_attribute(name, UNITS_CONSTANT)
            finalNames[name]['units_b'] = fileBObject.get_attribute(name, UNITS_CONSTANT)
    
    LOG.debug("Final selected set of variables to analyze:")
    LOG.debug(str(finalNames))
    
    return finalNames, nameComparison

def _get_missing_values_if_needed((fileA, fileB),
                                  var_name, alt_var_name=None, 
                                  missing_value_A=None, missing_value_B=None) :
    """
    get the missing values for two files based on the variable name(s)
    if the alternate variable name is passed it will be used for the
    second file in place of the primary variable name
    """
    # if we don't have an alternate variable name, use the existing one
    if alt_var_name is None :
        alt_var_name = var_name
    
    if missing_value_A is None :
        missing_value_A = fileA.missing_value(var_name)
    
    if missing_value_B is None :
        missing_value_B = fileB.missing_value(alt_var_name)
    
    return missing_value_A, missing_value_B

def _load_config_or_options(aPath, bPath, optionsSet, requestedVars = [ ]) :
    """
    load information on how the user wants to run the command from a dictionary of options 
    and info on the files and variables to compare
    note: the options may include a configuration file, which will override many of the
    settings in the options
    """
    
    # basic defaults for stuff we will need to return
    runInfo = {}
    runInfo.update(glance_setting_defaults) # get the default settings
    if ('noLonLatVars' not in optionsSet) or (not optionsSet['noLonLatVars']):
        runInfo.update(glance_lon_lat_defaults) # get the default lon/lat info
    
    # by default, we don't have any particular variables to analyze
    desiredVariables = { }
    # use the built in default values, to start with
    defaultsToUse = glance_analysis_defaults.copy()
    
    requestedNames = None
    
    # set up the paths, they can only come from the command line
    paths = {}
    paths['a']   = aPath 
    paths['b']   = bPath
    paths['out'] = optionsSet['outputpath']
    
    # the colocation selection can only come from the command line options
    # note: since this is really only coming from the user's selection of the call,
    # this is ok for the moment, may want to reconsider later (FUTURE)
    runInfo['doColocate'] = ('doColocate' in optionsSet) and (optionsSet['doColocate'])
    
    # check to see if the user wants to use a config file and if the path exists
    requestedConfigFile = optionsSet['configFile']
    usedConfigFile = False
    
    if (requestedConfigFile is not None) and (requestedConfigFile != "") :
        if not os.path.exists(requestedConfigFile) :
            LOG.warn("Could not open config file: \"" + requestedConfigFile + "\"")
            LOG.warn("Unable to continue analysis without selected configuration file.")
            sys.exit(1)
            
        else :
            
            LOG.info ("Using Config File Settings")
            
            # this will handle relative paths
            requestedConfigFile = os.path.abspath(os.path.expanduser(requestedConfigFile))
            
            # split out the file base name and the file path
            (filePath, fileName) = os.path.split(requestedConfigFile)
            splitFileName = fileName.split('.')
            fileBaseName = fileName[:-3] # remove the '.py' from the end
            
            # hang onto info about the config file for later
            runInfo['config_file_name'] = fileName
            runInfo['config_file_path'] = requestedConfigFile
            
            # load the file
            LOG.debug ('loading config file: ' + str(requestedConfigFile))
            glanceRunConfig = imp.load_module(fileBaseName, file(requestedConfigFile, 'U'),
                                              filePath, ('.py' , 'U', 1))
            
            # this is an exception, since it is not advertised to the user we don't expect it to be in the file
            # (at least not at the moment, it could be added later and if they did happen to put it in the
            # config file, it would override this line)
            runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly'] if 'imagesOnly'   in optionsSet else False
            runInfo['noLonLatVars']        = optionsSet['noLonLatVars']   if 'noLonLatVars' in optionsSet else False
            
            # get everything from the config file
            runInfo.update(glanceRunConfig.settings)
            if ('noLonLatVars' not in runInfo) or (not runInfo['noLonLatVars']) :
                runInfo.update(glanceRunConfig.lat_lon_info) # get info on the lat/lon variables
            
            # get any requested names
            requestedNames = glanceRunConfig.setOfVariables.copy()
            # user selected defaults, if they omit any we'll still be using the program defaults
            defaultsToUse.update(glanceRunConfig.defaultValues)
            
            usedConfigFile = True
    
    # if we didn't get the info from the config file for some reason
    # (the user didn't want to, we couldn't, etc...) get it from the command line options
    if not usedConfigFile:
        
        LOG.info ('Using Command Line Settings')
        
        # so get everything from the options directly
        runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly']
        runInfo['shouldIncludeImages'] = not optionsSet['htmlOnly']
        runInfo['doFork'] = optionsSet['doFork']
        
        # only record these if we are using lon/lat
        runInfo['noLonLatVars']       = optionsSet['noLonLatVars']
        if not runInfo['noLonLatVars'] :
            runInfo['latitude']        = optionsSet['latitudeVar']  or runInfo['latitude']
            runInfo['longitude']       = optionsSet['longitudeVar'] or runInfo['longitude']
            runInfo['lon_lat_epsilon'] = optionsSet['lonlatepsilon']
        
        # get any requested names from the command line
        requestedNames = requestedVars or ['.*'] 
        
        # user selected defaults
        defaultsToUse['epsilon']         = optionsSet['epsilon']
        defaultsToUse['missing_value']   = optionsSet['missing']
        
        # note: there is no way to set the tolerances from the command line
    
    return paths, runInfo, defaultsToUse, requestedNames, usedConfigFile

class VariableLoadError(Exception):
    """
    The exception raised when a variable could not be loaded.
    
        msg  -- explanation of which variable could be loaded (and, if possible, why)
    """
    def __init__(self, msg):
        self.msg = msg
    def __str__(self):
        return self.msg

def _get_variable_from_file(fileObject, variableName, dataType, filter=None) :
    """
    load a variable, using the given data type and applying a filter if one is given
    
    This may throw a VariableLoadError if the variable cannot be loaded.
    """
    
    dataToReturn = None
    exceptionToRaise = None
    
    # get the data from the file
    if fileObject.file_object is None :
        exceptionToRaise = VariableLoadError("File was not properly opened so variable '" + variableName + "' could not be loaded.")
    else :
        try :
            dataToReturn = array(fileObject.file_object[variableName], dtype=dataType)
        except CDFError :
            exceptionToRaise = VariableLoadError('Unable to retrieve ' + variableName + ' data. The variable name ' + 
                      ' may not exist in this file or an error may have occured while attempting to' +
                      ' access the data. Details of file access error observed: ' + str(CDFError))
    
    if (exceptionToRaise is not None) :
        raise exceptionToRaise
    
    if (filter is not None) and (dataToReturn is not None) :
        dataToReturn = filter(dataToReturn)
    
    return dataToReturn

def _get_and_analyze_lon_lat (fileObject,
                              latitudeVariableName, longitudeVariableName,
                              latitudeDataFilterFn=None, longitudeDataFilterFn=None) :
    """
    get the longitude and latitude data from the given file, assuming they are in the given variable names
    and analyze them to identify spacially invalid data (ie. data that would fall off the earth)
    
    This may result in a VariableLoadError if the variable cannot be loaded.
    """
    # get the data from the file
    
    # get the longitude
    LOG.info ('longitude name: ' + longitudeVariableName)
    # TODO, should this dtype be a float?
    longitudeData = _get_variable_from_file(fileObject, longitudeVariableName,
                                            float, filter=longitudeDataFilterFn)
    # get the latitude
    LOG.info ('latitude name: '  + latitudeVariableName)
    # TODO, should this dtype be a float?
    latitudeData  = _get_variable_from_file(fileObject, latitudeVariableName,
                                            float, filter=latitudeDataFilterFn)
    
    # we are going to have issues with our comparision if they aren't the same shape
    LOG.debug('latitude  shape: ' + str(latitudeData.shape))
    LOG.debug('longitude shape: ' + str(longitudeData.shape))
    assert (latitudeData.shape == longitudeData.shape)
    
    # build a mask of our spacially invalid data
    invalidLatitude  = (latitudeData < -90)     | (latitudeData > 90)   | ~isfinite(latitudeData)
    invalidLongitude = (longitudeData < -180)   | (longitudeData > 360) | ~isfinite(longitudeData)
    spaciallyInvalidMask = invalidLatitude | invalidLongitude
    
    # get the missing value as well
    longitudeMissingVal = fileObject.file_object.missing_value(longitudeVariableName)
    latitudeMissingVal  = fileObject.file_object.missing_value( latitudeVariableName)
    
    # analyze our spacially invalid data
    percentageOfSpaciallyInvalidPts, numberOfSpaciallyInvalidPts = _get_percentage_from_mask(spaciallyInvalidMask)
    
    spatialStatInfo = {
                       'totNumInvPts': numberOfSpaciallyInvalidPts,
                       'perInvPts':    percentageOfSpaciallyInvalidPts
                       }
    
    return dataobj.DataObject(longitudeData, fillValue=longitudeMissingVal, ignoreMask=invalidLongitude), \
           dataobj.DataObject(latitudeData,  fillValue=latitudeMissingVal,  ignoreMask=invalidLatitude), spatialStatInfo

def _get_percentage_from_mask(dataMask) :
    """
    given a mask that marks the elements we want the percentage of as True (and is the size of our original data),
    figure out what percentage of the whole they are
    """
    numMarkedDataPts = sum(dataMask)
    totalDataPts = dataMask.size
    
    # avoid dividing by 0
    if totalDataPts is 0 :
        return 0.0, 0
    
    percentage = 100.0 * float(numMarkedDataPts) / float(totalDataPts)
    
    return percentage, numMarkedDataPts

# TODO, this comparison needs to encorporate epsilon percent as well
def _check_lon_lat_equality(longitudeADataObject, latitudeADataObject,
                            longitudeBDataObject, latitudeBDataObject,
                            llepsilon, doMakeImages, outputPath,
                            fullDPI=None, thumbDPI=None) :
    """
    check to make sure the longitude and latitude are equal everywhere that's not in the ignore masks
    if they are not and doMakeImages was passed as True, generate appropriate figures to show where
    return the number of points where they are not equal (0 would mean they're the same)
    
    If the latitude or longitude cannot be compared, this may raise a VariableComparisonError.
    """
    # first of all, if the latitude and longitude are not the same shape, then things can't ever be "equal"
    if (longitudeADataObject.data.shape != longitudeBDataObject.data.shape) :
        raise VariableComparisonError ("Unable to compare longitue variables due to different sizes (" + str(longitudeADataObject.data.shape) +
                                       ") and (" + str(longitudeBDataObject.data.shape) +").")
    if (latitudeADataObject.data.shape  !=  latitudeBDataObject.data.shape) :
        raise VariableComparisonError ("Unable to compare latitude variables due to different sizes (" + str(latitudeADataObject.data.shape) +
                                       ") and (" + str(latitudeBDataObject.data.shape) +").")
    
    # get information about how the latitude and longitude differ
    longitudeDiffInfo = dataobj.DiffInfoObject(longitudeADataObject, longitudeBDataObject, epsilonValue=llepsilon)
    latitudeDiffInfo  = dataobj.DiffInfoObject(latitudeADataObject,  latitudeBDataObject,  epsilonValue=llepsilon)
    
    # how much difference is there between the two sets?
    lon_lat_not_equal_mask = longitudeDiffInfo.diff_data_object.masks.mismatch_mask | latitudeDiffInfo.diff_data_object.masks.mismatch_mask
    lon_lat_not_equal_points_count = sum(lon_lat_not_equal_mask)
    lon_lat_not_equal_points_percent = (float(lon_lat_not_equal_points_count) / float(lon_lat_not_equal_mask.size)) * 100.0
    
    # if we have unequal points, create user legible info about the problem
    if (lon_lat_not_equal_points_count > 0) :
        LOG.warn("Possible mismatch in values stored in file a and file b longitude and latitude values."
                 + " Depending on the degree of mismatch, some data value comparisons may be "
                 + "distorted or spacially nonsensical.")
        # if we are making images, make two showing the invalid lons/lats
        if (doMakeImages) :
            
            if ((len(longitudeADataObject.data[~longitudeADataObject.masks.ignore_mask]) > 0) and
                (len( latitudeADataObject.data[~ latitudeADataObject.masks.ignore_mask]) > 0)) :
                plot.plot_and_save_spacial_mismatch(longitudeADataObject, latitudeADataObject,
                                                   lon_lat_not_equal_mask,
                                                   "A", "Lon./Lat. Points Mismatched between A and B\n" +
                                                   "(Shown in A)",
                                                   "LonLatMismatch",
                                                   outputPath, True,
                                                   fullDPI=fullDPI, thumbDPI=thumbDPI, units="degrees")
            
            if ((len(longitudeBDataObject.data[~longitudeBDataObject.masks.ignore_mask]) > 0) and
                (len( latitudeBDataObject.data[~ latitudeBDataObject.masks.ignore_mask]) > 0)) :
                plot.plot_and_save_spacial_mismatch(longitudeBDataObject, latitudeBDataObject,
                                                   lon_lat_not_equal_mask,
                                                   "B", "Lon./Lat. Points Mismatched between A and B\n" +
                                                   "(Shown in B)",
                                                   "LonLatMismatch",
                                                   outputPath, True,
                                                   fullDPI=fullDPI, thumbDPI=thumbDPI, units="degrees")
    
    # setup our return data
    returnInfo = {}
    returnInfo['lon_lat_not_equal_points_count']   = lon_lat_not_equal_points_count
    returnInfo['lon_lat_not_equal_points_percent'] = lon_lat_not_equal_points_percent
    
    return returnInfo

def _compare_spatial_invalidity(longitude_a_object, longitude_b_object,
                                latitude_a_object,  latitude_b_object,
                                spatial_info, do_include_images, output_path,
                                fullDPI=None, thumbDPI=None) :
    """ 
    Given information about where the two files are spatially invalid, figure
    out what invalidity they share and save information or plots for later use
    also build a shared longitude/latitude based on A but also including valid
    points in B
    """
    # make our common invalid masks
    invalid_in_a_mask = longitude_a_object.masks.ignore_mask | latitude_a_object.masks.ignore_mask
    invalid_in_b_mask = longitude_b_object.masks.ignore_mask | latitude_b_object.masks.ignore_mask
    invalid_in_common_mask = invalid_in_a_mask | invalid_in_b_mask
    
    # make a "common" longitude/latitude based on A
    longitude_common = longitude_a_object.data.copy()
    latitude_common  =  latitude_a_object.data.copy()
    
    # compare our spacialy invalid info
    spatial_info['perInvPtsInBoth'] = spatial_info['file A']['perInvPts']
            # a default that will hold if the two files have the same spatially invalid pts
    if not all(invalid_in_a_mask.ravel() == invalid_in_b_mask.ravel()) : 
        LOG.info("Mismatch in number of spatially invalid points. " +
                 "Files may not have corresponding data where expected.")
        
        # figure out which points are only valid in one of the two files
        valid_only_in_mask_a = (~invalid_in_a_mask) & invalid_in_b_mask
        spatial_info['file A']['numInvPts'] = sum(valid_only_in_mask_a.ravel())
        valid_only_in_mask_b = (~invalid_in_b_mask) & invalid_in_a_mask
        spatial_info['file B']['numInvPts'] = sum(valid_only_in_mask_b.ravel())
        
        # so how many do they have together?
        spatial_info['perInvPtsInBoth'] = _get_percentage_from_mask(invalid_in_common_mask)[0]
        # make a "clean" version of the lon/lat
        longitude_common[valid_only_in_mask_a] = longitude_a_object.data[valid_only_in_mask_a]
        longitude_common[valid_only_in_mask_b] = longitude_b_object.data[valid_only_in_mask_b]
        latitude_common [valid_only_in_mask_a] =  latitude_a_object.data[valid_only_in_mask_a]
        latitude_common [valid_only_in_mask_b] =  latitude_b_object.data[valid_only_in_mask_b]
        
        # plot the points that are only valid one file and not the other
        if ((spatial_info['file A']['numInvPts'] > 0) and (do_include_images) and
            (len(longitude_a_object.data[~invalid_in_a_mask]) > 0) and
            (len( latitude_a_object.data[~invalid_in_a_mask]) > 0)) :
            plot.plot_and_save_spacial_mismatch(longitude_a_object, latitude_a_object,
                                               valid_only_in_mask_a,
                                               "A", "Points only valid in\nFile A\'s longitude & latitude",
                                               "SpatialMismatch",
                                               output_path, True,
                                               fullDPI=fullDPI, thumbDPI=thumbDPI, units="degrees")
        if ((spatial_info['file B']['numInvPts'] > 0) and (do_include_images) and
            (len(longitude_b_object.data[~invalid_in_b_mask]) > 0) and
            (len( latitude_b_object.data[~invalid_in_b_mask]) > 0)
            ) :
            plot.plot_and_save_spacial_mismatch(longitude_b_object, latitude_b_object,
                                               valid_only_in_mask_b,
                                               "B", "Points only valid in\nFile B\'s longitude & latitude",
                                               "SpatialMismatch",
                                               output_path, True,
                                               fullDPI=fullDPI, thumbDPI=thumbDPI, units="degrees")
    
    return invalid_in_common_mask, spatial_info, longitude_common, latitude_common

class VariableComparisonError(Exception):
    """
    The exception raised when a variable could not be compared.
    
        msg  -- explanation of which variable could be compared (and, if possible, why)
    """
    def __init__(self, msg):
        self.msg = msg
    def __str__(self):
        return self.msg

def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output_path,
                          should_make_images=False, should_check_equality=True,
                          fullDPI=None, thumbDPI=None) :
    """
    Manage loading and comparing longitude and latitude information for two files
    
    This may result in a VariableLoadError if the longitude or latitude cannot be loaded.
    This may result in a VariableComparisonError if the longitude or latitude cannot be compared due to size.
    
    """
    # a place to save some general stats about our lon/lat data
    spatialInfo = { }
    
    # if there is no lon/lat specified, stop now
    if ( ('longitude' not in lon_lat_settings) or ('latitude' not in lon_lat_settings)
        or (('noLonLatVars' in lon_lat_settings) and lon_lat_settings['noLonLatVars']) ) :
        return { }, spatialInfo
    
    # if we should not be comparing against the logitude and latitude, stop now
    print ('lon_lat_settings: ' + str(lon_lat_settings))
    
    # figure out the names to be used for the longitude and latitude variables
    a_longitude_name = lon_lat_settings['longitude']
    a_latitude_name =  lon_lat_settings['latitude']
    b_longitude_name = a_longitude_name
    b_latitude_name =  a_latitude_name
    
    # if we have alternate b names, use those for b instead
    if ('longitude_alt_name_in_b' in lon_lat_settings) :
        b_longitude_name = lon_lat_settings['longitude_alt_name_in_b']
    if ( 'latitude_alt_name_in_b' in lon_lat_settings):
        b_latitude_name  = lon_lat_settings['latitude_alt_name_in_b']
        
    # if we need to load our lon/lat from different files, open those files
    
    # for the a file, do we have an alternate?
    file_for_a_lon_lat = a_file_object
    if ('a_lon_lat_from_alt_file' in lon_lat_settings) :
        LOG.info("Loading alternate file (" + lon_lat_settings['a_lon_lat_from_alt_file'] + ") for file a longitude/latitude.")
        file_for_a_lon_lat = dataobj.FileInfo(lon_lat_settings['a_lon_lat_from_alt_file'])
    
    # for the b file, do we have an alternate?
    file_for_b_lon_lat = b_file_object
    if ('b_lon_lat_from_alt_file' in lon_lat_settings) :
        LOG.info("Loading alternate file (" + lon_lat_settings['b_lon_lat_from_alt_file'] + ") for file b longitude/latitude.")
        file_for_b_lon_lat = dataobj.FileInfo(lon_lat_settings['b_lon_lat_from_alt_file'])
    
    # load our longitude and latitude and do some analysis on them
    longitude_a_object, latitude_a_object, spatialInfo['file A'] = \
        _get_and_analyze_lon_lat (file_for_a_lon_lat, a_latitude_name, a_longitude_name, 
                                  lon_lat_settings['data_filter_function_lat_in_a'], lon_lat_settings['data_filter_function_lon_in_a'])
    longitude_b_object, latitude_b_object, spatialInfo['file B'] = \
        _get_and_analyze_lon_lat (file_for_b_lon_lat, b_latitude_name, b_longitude_name,
                                  lon_lat_settings['data_filter_function_lat_in_b'], lon_lat_settings['data_filter_function_lon_in_b'])
    
    # if we need to, test the level of equality of the "valid" values in our lon/lat
    if should_check_equality :
        
        moreSpatialInfo = _check_lon_lat_equality(longitude_a_object, latitude_a_object,
                                                  longitude_b_object, latitude_b_object,
                                                  lon_lat_settings['lon_lat_epsilon'],
                                                  should_make_images, output_path,
                                                  fullDPI=fullDPI, thumbDPI=thumbDPI)
        # update our existing spatial information
        spatialInfo.update(moreSpatialInfo)
        
        # compare our spatially invalid info to see if the two files have invalid longitudes and latitudes in the same places
        spaciallyInvalidMask, spatialInfo, longitude_common, latitude_common = \
                                _compare_spatial_invalidity(longitude_a_object, longitude_b_object,
                                                            latitude_a_object,  latitude_b_object,
                                                            spatialInfo, should_make_images, output_path,
                                                            fullDPI=fullDPI, thumbDPI=thumbDPI)
    else:
        spaciallyInvalidMask = None
        longitude_common     = None
        latitude_common      = None
    
    # FUTURE, return the lon/lat objects instead?
    return {
            'a':      {
                       "lon":       longitude_a_object.data,
                       "lat":       latitude_a_object.data,
                       "inv_mask":  longitude_a_object.masks.ignore_mask,
                       "lon_fill":  longitude_a_object.fill_value,
                       "lat_fill":  latitude_a_object.fill_value
                       },
            'b':      {
                       "lon":       longitude_b_object.data,
                       "lat":       latitude_b_object.data,
                       "inv_mask":  longitude_b_object.masks.ignore_mask,
                       "lon_fill":  longitude_b_object.fill_value,
                       "lat_fill":  latitude_b_object.fill_value
                       },
            'common': {
                       "lon":       longitude_common,
                       "lat":       latitude_common,
                       "inv_mask":  spaciallyInvalidMask
                       }
            }, \
           spatialInfo

def _open_and_process_files (args, numFilesExpected):
    """
    open files listed in the args and get information about the variables in them
    """
    # get all the file names
    fileNames = args[:numFilesExpected]
    # open all the files & get their variable names
    files = {}
    commonNames = None
    for fileName in fileNames:
        LOG.info("opening %s" % fileName)
        files[fileName] = {}
        tempFileObject = (io.open(fileName))
        files[fileName]['fileObject'] = tempFileObject
        tempNames = set(tempFileObject())
        LOG.debug ('variable names for ' + fileName + ': ' + str(tempNames)) 
        files[fileName]['varNames'] = tempNames
        if commonNames is None :
            commonNames = tempNames
        else :
            commonNames = commonNames.intersection(tempNames)
    files['commonVarNames'] = commonNames
    
    return files

def _check_pass_or_fail(varRunInfo, variableStats, defaultValues) :
    """
    Check whether the variable passed analysis, failed analysis, or
    did not need to be quantitatively tested
    
    also returns information about the fractions of failure
    """
    
    passValues = [ ]
    
    # test the epsilon value tolerance
    
    # get the tolerance for failures compared to epsilon
    epsilonTolerance = None
    if ('epsilon_failure_tolerance' in varRunInfo) :
        epsilonTolerance = varRunInfo['epsilon_failure_tolerance']
    else :
        epsilonTolerance = defaultValues['epsilon_failure_tolerance']
    
    # did we fail based on the epsilon?
    failed_fraction = variableStats['Numerical Comparison Statistics']['diff_outside_epsilon_fraction']
    passed_epsilon  = None
    if epsilonTolerance is not None :
        passed_epsilon = failed_fraction <= epsilonTolerance
    passValues.append(passed_epsilon)
    
    # test the nonfinite tolerance
    
    # get the tolerance for failures in amount of nonfinite data (in spatially valid areas)
    nonfiniteTolerance = None
    if ('nonfinite_data_tolerance'  in varRunInfo) :
        nonfiniteTolerance = varRunInfo['nonfinite_data_tolerance']
    else :
        nonfiniteTolerance = defaultValues['nonfinite_data_tolerance']
    
    # did we fail based on nonfinite data
    non_finite_diff_fraction = variableStats['Finite Data Statistics']['finite_in_only_one_fraction']
    passed_nonfinite         = None
    if nonfiniteTolerance is not None :
        passed_nonfinite = non_finite_diff_fraction <= nonfiniteTolerance
    passValues.append(passed_nonfinite)
    
    # test if the total failed percentage is acceptable
    
    # get the total percentage of failed data that is acceptable
    totalFailTolerance = None
    if ('total_data_failure_tolerance' in varRunInfo) :
        totalFailTolerance = varRunInfo['total_data_failure_tolerance']
    
    # did we fail based on all data failures?
    passed_all_percentage = None
    if totalFailTolerance is not None :
        passed_all_percentage = (non_finite_diff_fraction + failed_fraction) <= totalFailTolerance
    passValues.append(passed_all_percentage)
    
    # test the r-squared correlation coefficent
    
    # get the minimum acceptable r-squared correlation coefficient
    min_r_squared = None
    if ('minimum_acceptable_squared_correlation_coefficient' in varRunInfo) :
        min_r_squared = varRunInfo['minimum_acceptable_squared_correlation_coefficient']
    else :
        min_r_squared = defaultValues['minimum_acceptable_squared_correlation_coefficient']
    
    # did we fail based on the r-squared correlation coefficient?
    r_squared_value  = None
    passed_r_squared = None
    if min_r_squared is not None :
        r_squared_value  = variableStats['Numerical Comparison Statistics']['r-squared correlation']
        passed_r_squared = r_squared_value >= min_r_squared
    passValues.append(passed_r_squared)
    
    # figure out the overall pass/fail result
    didPass = None
    for passValue in passValues :
        # if passValue isn't none, we need to update didPass
        if passValue is not None :
            if didPass is not None :
                didPass = passValue and didPass
            else :
                didPass = passValue
    
    return didPass, failed_fraction, non_finite_diff_fraction, r_squared_value

def _get_run_identification_info( ) :
    """
    get info about what user/machine/version of glance is being used
    """
    info_to_return = { }
    
    # get info on who's doing the run and where
    info_to_return['machine'] = os.uname()[1]      # the name of the machine running the report
    info_to_return['user'] = os.getenv("LOGNAME")  #os.getlogin() # the name of the user running the report
    info_to_return['version'] = _get_glance_version_string()
    
    return info_to_return

def _get_glance_version_string() :
    version_num = pkg_resources.require('glance')[0].version
    
    return "glance, version " + str(version_num) 

def _get_name_info_for_variable(original_display_name, variable_run_info) :
    """
    based on the variable run info, figure out the various names for
    the variable and return them
    
    the various names are:
    
    technical_name -            the name the variable is listed under in the file
    b_variable_technical_name - the name the variable is listed under in the b file (may be the same as technical_name)
    explanation_name -          the more verbose name that will be shown to the user to identify the variable
    original_display_name -     the display name given by the user to describe the variable
    """
    
    # figure out the various name related info
    technical_name = variable_run_info['variable_name']
    explanation_name = technical_name # for now, will add to this later
    
    # if B has an alternate variable name, figure that out
    b_variable_technical_name = technical_name
    if 'alternate_name_in_B' in variable_run_info :
        b_variable_technical_name = variable_run_info['alternate_name_in_B']
        # put both names in our explanation
        explanation_name = explanation_name + " / " + b_variable_technical_name
    
    # show both the display and current explanation names if they differ
    if not (original_display_name == explanation_name) :
        explanation_name = original_display_name + ' (' + explanation_name + ')'
    
    return technical_name, b_variable_technical_name, explanation_name

def _load_variable_data(fileObject, variableNameInFile,
                        dataFilter=None,
                        variableToFilterOn=None,
                        variableBasedFilter=None,
                        fileDescriptionForDisplay="file") :
    """
    load data for a variable from a file
    optionally filter the variable data based on a data filter or another variable
    
    dataFilter must be in the form of (lambda data: some manipulation returning the new data)
    variableBasedFilter must be in the form of (lambda data, filterData: some manipulation returning the new data))
    """
    
    # get the data for the variable
    LOG.debug("loading basic data for variable " + variableNameInFile + " from " + fileDescriptionForDisplay)
    variableData = fileObject[variableNameInFile]
    
    # apply the basic filter if there is one
    if dataFilter is not None :
        LOG.debug ("applying filter function to data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile)
        variableData = dataFilter(variableData)
    
    # if we've got another variable to filter on, do that
    if (variableToFilterOn is not None) and (variableBasedFilter is not None) :
        LOG.debug ("filtering data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile
                   + " based on additional data from variable " + variableToFilterOn)
        dataToFilterOn = fileObject[variableToFilterOn]
        variableData = variableBasedFilter(variableData, dataToFilterOn)
    
    return variableData

def _uri_needs_rsync(uri_to_check) :
    """
    check if the uri requires an rsync in order to access the data
    this will return some false positives if you phrase local uri's with the machine name
    for ex. you are on the machine "lotus" and you use the path "rsync:://lotus/data/"
    """
    return not os.path.exists(uri_to_check)

def _get_UV_info_from_magnitude_direction_info(fileObject, magnitudeName, directionName, invalidMask=None) :
    """
    If there are magnitude and direction names, load that information and calculate the u and v that correspond to it
    """
    
    # if we don't have magnitude and direction, we can't calculate the U and V values
    if (magnitudeName is None) or (directionName is None) :
        return None, None
    
    # load the magnitude and direction data sets
    magnitude = _load_variable_data(fileObject, magnitudeName)
    direction = _load_variable_data(fileObject, directionName)
    
    # convert the magnitude and direction data into u and v vectors
    uData, vData = delta.convert_mag_dir_to_U_V_vector(magnitude, direction, invalidMask=invalidMask)
    
    return uData, vData

def rsync_or_copy_files (list_of_files, target_directory='.', additionalFileNameSuffix='') :
    """
    If the files in the list are remote, rsync them, otherwise, just copy
    them to the target directory
    """
    newPaths = [ ]
    
    for file_uri in list_of_files :
        fileName = os.path.split(file_uri)[1]
        baseFile, ext = os.path.splitext(fileName)
        newPath = os.path.join(target_directory, baseFile + additionalFileNameSuffix + ext)
        newPaths.append(newPath)
        
        if _uri_needs_rsync(file_uri) :
            cmd = ['rsync', '-Cuav', file_uri, newPath]
        else :
            cmd = ['cp', os.path.abspath(file_uri), newPath]
        LOG.debug('running ' + ' '.join(cmd)) 
        sh(cmd)
    
    return newPaths

def colocateToFile_library_call(a_path, b_path, var_list=[ ],
                                options_set={ },
                                # todo, this doesn't yet do anything
                                do_document=False,
                                # todo, the output channel does nothing at the moment
                                output_channel=sys.stdout) :
    """
    this method handles the actual work of the colocateData command line tool
    and can be used as a library routine.
    
    TODO, properly document the options
    """
    
    # load the user settings from either the command line or a user defined config file
    pathsTemp, runInfo, defaultValues, requestedNames, usedConfigFile = _load_config_or_options(a_path, b_path,
                                                                                                options_set,
                                                                                                requestedVars = var_list)
    
    # deal with the input and output files
    if not (os.path.isdir(pathsTemp['out'])) :
        LOG.info("Specified output directory (" + pathsTemp['out'] + ") does not exist.")
        LOG.info("Creating output directory.")
        os.makedirs(pathsTemp['out'])
    
    # make copies of the input files for colocation TODO, fix paths
    [pathsTemp['a'], pathsTemp['b']] = rsync_or_copy_files ([pathsTemp['a'], pathsTemp['b']],
                                                            target_directory=pathsTemp['out'],
                                                            additionalFileNameSuffix='-collocated')
    
    # open the files
    LOG.info("Processing File A:")
    aFile = dataobj.FileInfo(pathsTemp['a'], allowWrite=True)
    if aFile is None:
        LOG.warn("Unable to continue with comparison because file a (" + pathsTemp['a'] + ") could not be opened.")
        sys.exit(1)
    LOG.info("Processing File B:")
    bFile = dataobj.FileInfo(pathsTemp['b'], allowWrite=True)
    if bFile is None:
        LOG.warn("Unable to continue with comparison because file b (" + pathsTemp['b'] + ") could not be opened.")
        sys.exit(1)
    
    # get information about the names the user requested
    finalNames, nameStats = _resolve_names(aFile.file_object, bFile.file_object,
                                           defaultValues,
                                           requestedNames, usedConfigFile)
    
    # return for lon_lat_data variables will be in the form 
    #{"lon": longitude_data,      "lat": latitude_data,      "inv_mask": spaciallyInvalidMaskData}
    # or { } if there is no lon/lat info
    lon_lat_data = { }
    try :
        lon_lat_data, _ = _handle_lon_lat_info (runInfo, aFile, bFile, pathsTemp['out'], should_check_equality=False,
                                                fullDPI=runInfo['detail_DPI'], thumbDPI=runInfo['thumb_DPI'])