Newer
Older
#!/usr/bin/env python
# encoding: utf-8
"""
Top-level routines to compare two files.
Created by rayg Apr 2009.
Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
"""
(no author)
committed
import os, sys, logging, re, subprocess, datetime
(no author)
committed
import imp as imp
from numpy import *
(no author)
committed
import pkg_resources
from pycdf import CDFError
(no author)
committed
from subprocess import check_call as sh
import glance.io as io
import glance.delta as delta
(no author)
committed
import glance.plotcreatefns as plotcreate
(no author)
committed
import glance.report as report
from urllib import quote
# the built in defaults for the overall run settings
# NOTE(review): revision-control noise in the original text may hide additional
# keys in these dictionaries — confirm against the upstream repository
glance_setting_defaults = {
    'shouldIncludeReport':          True,
    'shouldIncludeImages':          False,
    'doFork':                       False,
    'useThreadsToControlMemory':    False,
    'useSharedRangeForOriginal':    False,
    'noLonLatVars':                 False,
}

# the built in longitude/latitude defaults
glance_lon_lat_defaults = {
    'longitude':                    'pixel_longitude',
    'latitude':                     'pixel_latitude',
    'lon_lat_epsilon':              0.0,
    'data_filter_function_lon_in_a': None,
    'data_filter_function_lat_in_a': None,
    'data_filter_function_lon_in_b': None,
    'data_filter_function_lat_in_b': None,
}

# the built in default settings for the variable analysis
glance_analysis_defaults = {
    'epsilon':                      0.0,
    'missing_value':                None,
    'epsilon_failure_tolerance':    0.0,
    'nonfinite_data_tolerance':     0.0,
}
def _cvt_names(namelist, epsilon, missing):
""""if variable names are of the format name:epsilon, yield name,epsilon, missing
otherwise yield name,default-epsilon,default-missing
"""
for name in namelist:
if ':' not in name:
yield name, epsilon
else:
n,e,m = name.split(':')
if not e: e = epsilon
else: e = float(e)
if not m: m = missing
else: m = float(m)
yield n, e, m
def _parse_varnames(names, terms, epsilon=0.0, missing=None):
"""filter variable names and substitute default epsilon and missing settings if none provided
returns name,epsilon,missing triples
>>> _parse_varnames( ['foo','bar', 'baz', 'zoom', 'cat'], ['f..:0.5:-999', 'ba.*:0.001', 'c.t::-9999'], 1e-7 )
set([('foo', 0.5, -999.0), ('cat', 9.9999999999999995e-08, -9999.0), ('bar', 0.001, None), ('baz', 0.001, None)])
"""
terms = [x.split(':') for x in terms]
terms = [(re.compile(x[0]).match,x[1:]) for x in terms]
def _cvt_em(eps=None, mis=None):
eps = float(eps) if eps else epsilon
mis = float(mis) if mis else missing
return eps, mis
sel = [ ((x,)+_cvt_em(*em)) for x in names for (t,em) in terms if t(x) ]
return set(sel)
(no author)
committed
def _setup_file(fileNameAndPath, prefexText='', allowWrite=False) :
    '''
    open the provided file name/path and extract information on the md5sum and last modification time
    optional prefext text may be passed in for informational output formatting

    returns (fileObject, fileInfo) where fileObject is None if the path does not
    exist; fileInfo always carries 'path' and, on success, 'md5sum' and
    'lastModifiedTime'
    '''
    import hashlib

    # some info to return
    fileInfo = {'path': fileNameAndPath}

    # bug fix: normalize and expand the user path BEFORE the existence check,
    # so paths like "~/data.nc" are not rejected as missing
    fileNameAndPath = os.path.abspath(os.path.expanduser(fileNameAndPath))
    LOG.debug("User provided path after normalization and user expansion: " + fileNameAndPath)

    # check to see if the path exists to be opened
    if not os.path.exists(fileNameAndPath) :
        LOG.warn("Requested file " + fileNameAndPath + " could not be opened because it does not exist.")
        return None, fileInfo

    # open the file
    LOG.info(prefexText + " opening " + fileNameAndPath)
    fileObject = io.open(fileNameAndPath, allowWrite=allowWrite)

    # get the file md5sum; hashlib is used instead of shelling out to the
    # md5sum binary, which was not portable and interpolated the (quoted but
    # unescaped) path into a shell command string
    md5Hash = hashlib.md5()
    with open(fileNameAndPath, 'rb') as rawFile :
        for chunk in iter(lambda: rawFile.read(1024 * 1024), b'') :
            md5Hash.update(chunk)
    fileInfo['md5sum'] = md5Hash.hexdigest()
    LOG.info(prefexText + " file md5sum: " + str(fileInfo['md5sum']))

    # get the last modified stamp
    statsForFile = os.stat(fileNameAndPath)
    fileInfo['lastModifiedTime'] = datetime.datetime.fromtimestamp(statsForFile.st_mtime).ctime() # should time zone be forced?
    LOG.info (prefexText + " file was last modified: " + fileInfo['lastModifiedTime'])

    return fileObject, fileInfo
def _check_file_names(fileAObject, fileBObject) :
"""
(no author)
committed
get information about the names in the two files and how they compare to each other
"""
# get information about the variables stored in the files
aNames = set(fileAObject())
bNames = set(fileBObject())
# get the variable names they have in common
commonNames = aNames.intersection(bNames)
# which names are unique to only one of the two files?
uniqueToANames = aNames - commonNames
uniqueToBNames = bNames - commonNames
(no author)
committed
return {'sharedVars': commonNames, 'uniqueToAVars': uniqueToANames, 'uniqueToBVars': uniqueToBNames}
def _resolve_names(fileAObject, fileBObject, defaultValues,
                   requestedNames, usingConfigFileFormat=False) :
    """
    figure out which names the two files share and which are unique to each file, as well as which names
    were requested and are in both sets

    usingConfigFileFormat signals whether the requestedNames parameter will be in the form of the inputed
    names from the command line or a more complex dictionary holding information about the names read in
    from a configuration file

    returns (finalNames, nameComparison) where finalNames maps display name ->
    per-variable settings and nameComparison is the dict from _check_file_names

    Note: if we ever need a variable with different names in file A and B to be comparable, this logic
    will need to be changed.
    """
    # look at the names present in the two files and compare them
    nameComparison = _check_file_names(fileAObject, fileBObject)
    fileCommonNames = nameComparison['sharedVars']
    finalNames = {}

    def _expand_patterns(patterns) :
        # expand command-line style "name:epsilon:missing" patterns against the
        # common names, filling each selected variable's settings from the defaults
        expanded = {}
        selected = _parse_varnames(fileCommonNames, patterns,
                                   defaultValues['epsilon'], defaultValues['missing_value'])
        for name, epsilon, missing in selected :
            # we'll use the variable's name as the display name for the time being
            expanded[name] = {}
            # make sure we pick up any other controlling defaults
            expanded[name].update(defaultValues)
            # but override the values that would have been determined by _parse_varnames
            expanded[name]['variable_name'] = name
            expanded[name]['epsilon'] = epsilon
            # load the missing value if it was not provided
            missing, missing_b = _get_missing_values_if_needed((fileAObject, fileBObject), name,
                                                               missing_value_A=missing, missing_value_B=missing)
            expanded[name]['missing_value'] = missing
            expanded[name]['missing_value_alt_in_b'] = missing_b
        return expanded

    if usingConfigFileFormat :

        # if the user didn't ask for any, try everything
        # (fix: was "len(...) is 0", an identity test that only works by accident)
        if len(requestedNames) == 0 :
            finalNames = _expand_patterns(['.*'])

        # otherwise just do the ones the user asked for
        else :
            # check each of the names the user asked for to see if it is either in the list of common names
            # or, if the user asked for an alternate name mapping in file B, if the two mapped names are in
            # files A and B respectively
            for dispName in requestedNames :

                # hang on to info on the current variable
                currNameInfo = requestedNames[dispName]

                # get the variable name
                if 'variable_name' in currNameInfo :
                    name = currNameInfo['variable_name']
                    name_b = name
                    # (fix: replaced deprecated dict.has_key with the "in" operator)
                    hasAltName = 'alternate_name_in_B' in currNameInfo
                    if hasAltName :
                        name_b = currNameInfo['alternate_name_in_B']

                    if ((name in fileCommonNames) and not hasAltName) or \
                       (hasAltName and
                        ((name in nameComparison['uniqueToAVars']) or (name in fileCommonNames)) and
                        ((name_b in nameComparison['uniqueToBVars']) or (name_b in fileCommonNames))) :

                        finalNames[dispName] = defaultValues.copy()
                        finalNames[dispName]['display_name'] = dispName
                        finalNames[dispName].update(currNameInfo)

                        # load the missing value if it was not provided
                        missing = finalNames[dispName]['missing_value']
                        if 'missing_value_alt_in_b' in finalNames[dispName] :
                            missing_b = finalNames[dispName]['missing_value_alt_in_b']
                        else :
                            missing_b = missing
                        finalNames[dispName]['missing_value'], finalNames[dispName]['missing_value_alt_in_b'] = \
                                _get_missing_values_if_needed((fileAObject, fileBObject), name, name_b,
                                                              missing, missing_b)
                else :
                    LOG.warn('No technical variable name was given for the entry described as "' + dispName + '". ' +
                             'Skipping this variable.')
    else:
        # format command line input similarly to the stuff from the config file
        # (fix: removed a stray debug print of requestedNames to stdout)
        finalNames = _expand_patterns(requestedNames)

    LOG.debug("Final selected set of variables to analyze:")
    LOG.debug(str(finalNames))

    return finalNames, nameComparison
def _get_missing_values_if_needed((fileA, fileB),
var_name, alt_var_name=None,
missing_value_A=None, missing_value_B=None) :
"""
get the missing values for two files based on the variable name(s)
if the alternate variable name is passed it will be used for the
second file in place of the primary variable name
"""
# if we don't have an alternate variable name, use the existing one
if alt_var_name is None :
alt_var_name = var_name
if missing_value_A is None :
missing_value_A = fileA.missing_value(var_name)
if missing_value_B is None :
missing_value_B = fileB.missing_value(alt_var_name)
return missing_value_A, missing_value_B
def _load_config_or_options(aPath, bPath, optionsSet, requestedVars = None) :
    """
    load information on how the user wants to run the command from a dictionary of options
    and info on the files and variables to compare
    note: the options may include a configuration file, which will override many of the
    settings in the options

    returns (paths, runInfo, defaultsToUse, requestedNames, usedConfigFile)
    """
    # fix: a None default replaces the shared mutable [] default argument;
    # the value is only read below, so behavior is unchanged
    if requestedVars is None :
        requestedVars = [ ]

    # basic defaults for stuff we will need to return
    runInfo = {}
    runInfo.update(glance_setting_defaults) # get the default settings
    if ('noLonLatVars' not in optionsSet) or (not optionsSet['noLonLatVars']):
        runInfo.update(glance_lon_lat_defaults) # get the default lon/lat info

    # use the built in default values, to start with
    defaultsToUse = glance_analysis_defaults.copy()
    requestedNames = None

    # set up the paths, they can only come from the command line
    paths = {}
    paths['a']   = aPath
    paths['b']   = bPath
    paths['out'] = optionsSet['outputpath']

    # the colocation selection can only come from the command line options
    # TODO since this is really only coming from the user's selection of the call,
    # this is ok for the moment, may want to reconsider later
    runInfo['doColocate'] = ('doColocate' in optionsSet) and (optionsSet['doColocate'])

    # check to see if the user wants to use a config file and if the path exists
    requestedConfigFile = optionsSet['configFile']
    usedConfigFile = False
    if (requestedConfigFile is not None) and os.path.exists(requestedConfigFile):

        LOG.info ("Using Config File Settings")

        # this will handle relative paths
        requestedConfigFile = os.path.abspath(os.path.expanduser(requestedConfigFile))

        # split out the file base name and the file path
        (filePath, fileName) = os.path.split(requestedConfigFile)
        fileBaseName = fileName[:-3] # remove the '.py' from the end

        # hang onto info about the config file for later
        runInfo['config_file_name'] = fileName
        runInfo['config_file_path'] = requestedConfigFile

        # load the file
        LOG.debug ('loading config file: ' + str(requestedConfigFile))
        # fix: open() replaces the Python-2-only file() builtin; same semantics
        glanceRunConfig = imp.load_module(fileBaseName, open(requestedConfigFile, 'U'),
                                          filePath, ('.py' , 'U', 1))

        # this is an exception, since it is not advertised to the user we don't expect it to be in the file
        # (at least not at the moment, it could be added later and if they did happen to put it in the
        # config file, it would override this line)
        # NOTE(review): when 'imagesOnly' is absent this forces shouldIncludeReport
        # to False, overriding the True default set above — confirm that is intended
        runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly'] if 'imagesOnly' in optionsSet else False
        runInfo['noLonLatVars'] = optionsSet['noLonLatVars'] if 'noLonLatVars' in optionsSet else False

        # get everything from the config file
        runInfo.update(glanceRunConfig.settings)
        if ('noLonLatVars' not in runInfo) or (not runInfo['noLonLatVars']) :
            runInfo.update(glanceRunConfig.lat_lon_info) # get info on the lat/lon variables

        # get any requested names
        requestedNames = glanceRunConfig.setOfVariables.copy()
        # user selected defaults, if they omit any we'll still be using the program defaults
        defaultsToUse.update(glanceRunConfig.defaultValues)

        usedConfigFile = True

    # if we didn't get the info from the config file for some reason
    # (the user didn't want to, we couldn't, etc...) get it from the command line options
    if not usedConfigFile:
        LOG.info ('Using Command Line Settings')

        # so get everything from the options directly
        runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly']
        runInfo['shouldIncludeImages'] = not optionsSet['htmlOnly']
        runInfo['doFork'] = optionsSet['doFork']

        # only record these if we are using lon/lat
        runInfo['noLonLatVars'] = optionsSet['noLonLatVars']
        if not runInfo['noLonLatVars'] :
            runInfo['latitude']        = optionsSet['latitudeVar']  or runInfo['latitude']
            runInfo['longitude']       = optionsSet['longitudeVar'] or runInfo['longitude']
            runInfo['lon_lat_epsilon'] = optionsSet['lonlatepsilon']

        # get any requested names from the command line
        requestedNames = requestedVars or ['.*']

        # user selected defaults
        defaultsToUse['epsilon']       = optionsSet['epsilon']
        defaultsToUse['missing_value'] = optionsSet['missing']
        # note: there is no way to set the tolerances from the command line

    return paths, runInfo, defaultsToUse, requestedNames, usedConfigFile
def _get_and_analyze_lon_lat (fileObject,
                              latitudeVariableName, longitudeVariableName,
                              latitudeDataFilterFn=None, longitudeDataFilterFn=None) :
    """
    get the longitude and latitude data from the given file, assuming they are in the given variable names
    and analyze them to identify spacially invalid data (ie. data that would fall off the earth)
    """
    def _load_as_float (variableName, description) :
        # pull one variable out of the file as a float array,
        # aborting the whole run if it cannot be read
        LOG.info (description + ' name: ' + variableName)
        try :
            return array(fileObject[variableName], dtype=float)
        except CDFError :
            LOG.warn ('Unable to retrieve ' + description + ' data. The variable name (' + variableName +
                      ') may not exist in this file or an error may have occured while attempting to' +
                      ' access the data.')
            LOG.warn ('Unable to continue analysis without ' + description + ' data. Aborting analysis.')
            sys.exit(1)

    # get the data from the file TODO, handle these exits out in the calling method?
    longitudeData = _load_as_float (longitudeVariableName, 'longitude')
    latitudeData  = _load_as_float (latitudeVariableName,  'latitude')

    # if we have filters, use them
    if latitudeDataFilterFn is not None :
        latitudeData = latitudeDataFilterFn(latitudeData)
        LOG.debug ('latitude size after application of filter: ' + str(latitudeData.shape))
    if longitudeDataFilterFn is not None :
        longitudeData = longitudeDataFilterFn(longitudeData)
        LOG.debug ('longitude size after application of filter: ' + str(longitudeData.shape))

    # build a mask of our spacially invalid data TODO, load actual valid range attributes?
    invalidLatitude      = (latitudeData  <  -90) | (latitudeData  >  90) | ~isfinite(latitudeData)
    invalidLongitude     = (longitudeData < -180) | (longitudeData > 360) | ~isfinite(longitudeData)
    spaciallyInvalidMask = invalidLatitude | invalidLongitude

    # analyze our spacially invalid data
    percentageOfSpaciallyInvalidPts, numberOfSpaciallyInvalidPts = _get_percentage_from_mask(spaciallyInvalidMask)

    return longitudeData, latitudeData, spaciallyInvalidMask, {
        'totNumInvPts': numberOfSpaciallyInvalidPts,
        'perInvPts':    percentageOfSpaciallyInvalidPts
    }
def _get_percentage_from_mask(dataMask) :
"""
given a mask that marks the elements we want the percentage of as True (and is the size of our original data),
figure out what percentage of the whole they are
"""
numMarkedDataPts = sum(dataMask)
totalDataPts = dataMask.size
# avoid dividing by 0
if totalDataPts is 0 :
return 0.0, 0
percentage = 100.0 * float(numMarkedDataPts) / float(totalDataPts)
return percentage, numMarkedDataPts
def _check_lon_lat_equality(longitudeA, latitudeA,
                            longitudeB, latitudeB,
                            ignoreMaskA, ignoreMaskB,
                            llepsilon, doMakeImages, outputPath) :
    """
    check to make sure the longitude and latitude are equal everywhere that's not in the ignore masks
    if they are not and doMakeImages was passed as True, generate appropriate figures to show where
    return the number of points where they are not equal (0 would mean they're the same)
    """
    # if the shapes differ the data can never be considered equal;
    # None signals that unrecoverable condition to the caller
    if (longitudeA.shape != longitudeB.shape) | (latitudeA.shape != latitudeB.shape) :
        return None

    # diff longitude and latitude separately; only the "not equal within
    # epsilon" mask out of delta.diff's eight results is needed here
    _, _, _, _, lon_not_equal_mask, _, _, _ = delta.diff(longitudeA, longitudeB, llepsilon,
                                                         (None, None), (ignoreMaskA, ignoreMaskB))
    _, _, _, _, lat_not_equal_mask, _, _, _ = delta.diff(latitudeA, latitudeB, llepsilon,
                                                         (None, None), (ignoreMaskA, ignoreMaskB))

    # a point is mismatched if either its longitude or its latitude differs
    lon_lat_not_equal_mask           = lon_not_equal_mask | lat_not_equal_mask
    lon_lat_not_equal_points_count   = sum(lon_lat_not_equal_mask)
    lon_lat_not_equal_points_percent = (float(lon_lat_not_equal_points_count) / float(lon_lat_not_equal_mask.size)) * 100.0

    # if we have unequal points, create user legible info about the problem
    if lon_lat_not_equal_points_count > 0 :
        LOG.warn("Possible mismatch in values stored in file a and file b longitude and latitude values."
                 + " Depending on the degree of mismatch, some data value comparisons may be "
                 + "distorted or spacially nonsensical.")
        # if we are making images, make two showing the invalid lons/lats
        if doMakeImages :
            if (len(longitudeA[~ignoreMaskA]) > 0) and (len(latitudeA[~ignoreMaskA]) > 0) :
                plot.plot_and_save_spacial_trouble(longitudeA, latitudeA,
                                                   lon_lat_not_equal_mask,
                                                   ignoreMaskA,
                                                   "A", "Lon./Lat. Points Mismatched between A and B\n" +
                                                   "(Shown in A)",
                                                   "LonLatMismatch",
                                                   outputPath, True)
            if (len(longitudeB[~ignoreMaskB]) > 0) and (len(latitudeB[~ignoreMaskB]) > 0) :
                plot.plot_and_save_spacial_trouble(longitudeB, latitudeB,
                                                   lon_lat_not_equal_mask,
                                                   ignoreMaskB,
                                                   "B", "Lon./Lat. Points Mismatched between A and B\n" +
                                                   "(Shown in B)",
                                                   "LonLatMismatch",
                                                   outputPath, True)

    # setup our return data
    return {'lon_lat_not_equal_points_count':   lon_lat_not_equal_points_count,
            'lon_lat_not_equal_points_percent': lon_lat_not_equal_points_percent}
(no author)
committed
(no author)
committed
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
def _compare_spatial_invalidity(invalid_in_a_mask, invalid_in_b_mask, spatial_info,
longitude_a, longitude_b, latitude_a, latitude_b,
do_include_images, output_path) :
"""
Given information about where the two files are spatially invalid, figure
out what invalidity they share and save information or plots for later use
also build a shared longitude/latitude based on A but also including valid
points in B
"""
# for convenience,
# make a combined mask
invalid_in_common_mask = invalid_in_a_mask | invalid_in_b_mask
# make a "common" latitude based on A
longitude_common = longitude_a
latitude_common = latitude_a
# compare our spacialy invalid info
spatial_info['perInvPtsInBoth'] = spatial_info['file A']['perInvPts']
# a default that will hold if the two files have the same spatially invalid pts
if not all(invalid_in_a_mask.ravel() == invalid_in_b_mask.ravel()) :
LOG.info("Mismatch in number of spatially invalid points. " +
"Files may not have corresponding data where expected.")
# figure out which points are only valid in one of the two files
valid_only_in_mask_a = (~invalid_in_a_mask) & invalid_in_b_mask
spatial_info['file A']['numInvPts'] = sum(valid_only_in_mask_a.ravel())
valid_only_in_mask_b = (~invalid_in_b_mask) & invalid_in_a_mask
spatial_info['file B']['numInvPts'] = sum(valid_only_in_mask_b.ravel())
# so how many do they have together?
spatial_info['perInvPtsInBoth'] = _get_percentage_from_mask(invalid_in_common_mask)[0]
# make a "clean" version of the lon/lat
longitude_common[valid_only_in_mask_a] = longitude_a[valid_only_in_mask_a]
longitude_common[valid_only_in_mask_b] = longitude_b[valid_only_in_mask_b]
latitude_common [valid_only_in_mask_a] = latitude_a [valid_only_in_mask_a]
latitude_common [valid_only_in_mask_b] = latitude_b [valid_only_in_mask_b]
# plot the points that are only valid one file and not the other
if ((spatial_info['file A']['numInvPts'] > 0) and (do_include_images) and
(len(longitude_a[~invalid_in_a_mask]) > 0) and (len(latitude_a[~invalid_in_a_mask]) > 0)) :
(no author)
committed
plot.plot_and_save_spacial_trouble(longitude_a, latitude_a,
valid_only_in_mask_a,
invalid_in_a_mask,
"A", "Points only valid in\nFile A\'s longitude & latitude",
(no author)
committed
"SpatialMismatch",
output_path, True)
if ((spatial_info['file B']['numInvPts'] > 0) and (do_include_images) and
(len(longitude_b[~invalid_in_b_mask]) > 0) and (len(latitude_b[~invalid_in_b_mask]) > 0)
) :
(no author)
committed
plot.plot_and_save_spacial_trouble(longitude_b, latitude_b,
valid_only_in_mask_b,
invalid_in_b_mask,
"B", "Points only valid in\nFile B\'s longitude & latitude",
(no author)
committed
"SpatialMismatch",
output_path, True)
return invalid_in_common_mask, spatial_info, longitude_common, latitude_common
(no author)
committed
def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output_path,
                          should_make_images=False, should_check_equality=True) :
    """
    Manage loading and comparing longitude and latitude information for two files

    returns (lon/lat data dict keyed 'a'/'b'/'common', spatialInfo, error_msg)

    Note: if the error message is returned as anything but None, something uncrecoverable
    occured while trying to get the lon/lat info. TODO, replace this with a proper thrown exception
    """
    # a place to save some general stats about our lon/lat data
    spatialInfo = { }
    # a place to put possible error messages TODO remove this in favor of an exception
    error_msg = None

    # if there is no lon/lat specified, stop now
    if ('longitude' not in lon_lat_settings) or ('latitude' not in lon_lat_settings) :
        return { }, spatialInfo, error_msg

    # fix: removed a stray debug print of lon_lat_settings to stdout here

    # figure out the names to be used for the longitude and latitude variables
    a_longitude_name = lon_lat_settings['longitude']
    a_latitude_name  = lon_lat_settings['latitude']
    b_longitude_name = a_longitude_name
    b_latitude_name  = a_latitude_name
    # if we have alternate b names, use those for b instead
    if 'longitude_alt_name_in_b' in lon_lat_settings :
        b_longitude_name = lon_lat_settings['longitude_alt_name_in_b']
    if 'latitude_alt_name_in_b' in lon_lat_settings :
        b_latitude_name  = lon_lat_settings['latitude_alt_name_in_b']

    # if we need to load our lon/lat from different files, open those files
    # for the a file, do we have an alternate?
    file_for_a_lon_lat = a_file_object
    if 'a_lon_lat_from_alt_file' in lon_lat_settings :
        LOG.info("Loading alternate file (" + lon_lat_settings['a_lon_lat_from_alt_file'] + ") for file a longitude/latitude.")
        file_for_a_lon_lat, _ = _setup_file(lon_lat_settings['a_lon_lat_from_alt_file'], "\t")
    # for the b file, do we have an alternate?
    file_for_b_lon_lat = b_file_object
    if 'b_lon_lat_from_alt_file' in lon_lat_settings :
        LOG.info("Loading alternate file (" + lon_lat_settings['b_lon_lat_from_alt_file'] + ") for file b longitude/latitude.")
        file_for_b_lon_lat, _ = _setup_file(lon_lat_settings['b_lon_lat_from_alt_file'], "\t")

    # load our longitude and latitude and do some analysis on them
    longitude_a, latitude_a, spaciallyInvalidMaskA, spatialInfo['file A'] = \
        _get_and_analyze_lon_lat (file_for_a_lon_lat, a_latitude_name, a_longitude_name,
                                  lon_lat_settings['data_filter_function_lat_in_a'],
                                  lon_lat_settings['data_filter_function_lon_in_a'])
    longitude_b, latitude_b, spaciallyInvalidMaskB, spatialInfo['file B'] = \
        _get_and_analyze_lon_lat (file_for_b_lon_lat, b_latitude_name, b_longitude_name,
                                  lon_lat_settings['data_filter_function_lat_in_b'],
                                  lon_lat_settings['data_filter_function_lon_in_b'])

    # if we need to, test the level of equality of the "valid" values in our lon/lat
    if should_check_equality :
        moreSpatialInfo = _check_lon_lat_equality(longitude_a, latitude_a, longitude_b, latitude_b,
                                                  spaciallyInvalidMaskA, spaciallyInvalidMaskB,
                                                  lon_lat_settings['lon_lat_epsilon'],
                                                  should_make_images, output_path)
        # if we got the worst type of error result from the comparison this data is too dissimilar to continue
        if moreSpatialInfo is None :
            error_msg = ("Unable to reconcile sizes of longitude and latitude for variables "
                         + str(lon_lat_settings['longitude']) + str(longitude_a.shape) + "/"
                         + str(lon_lat_settings['latitude'])  + str(latitude_a.shape) + " in file A and variables "
                         + str(b_longitude_name) + str(longitude_b.shape) + "/"
                         + str(b_latitude_name)  + str(latitude_b.shape) + " in file B. Aborting attempt to compare files.")
            return { }, { }, error_msg # things have gone wrong
        # update our existing spatial information
        spatialInfo.update(moreSpatialInfo)

        # compare our spatially invalid info to see if the two files have
        # invalid longitudes and latitudes in the same places
        spaciallyInvalidMask, spatialInfo, longitude_common, latitude_common = \
            _compare_spatial_invalidity(spaciallyInvalidMaskA, spaciallyInvalidMaskB, spatialInfo,
                                        longitude_a, longitude_b, latitude_a, latitude_b,
                                        should_make_images, output_path)
    else:
        # equality checking was not requested, so there is no meaningful "common" data
        spaciallyInvalidMask = None
        longitude_common    = None
        latitude_common     = None

    return {'a':      {"lon": longitude_a,      "lat": latitude_a,      "inv_mask": spaciallyInvalidMaskA},
            'b':      {"lon": longitude_b,      "lat": latitude_b,      "inv_mask": spaciallyInvalidMaskB},
            'common': {"lon": longitude_common, "lat": latitude_common, "inv_mask": spaciallyInvalidMask} }, \
           spatialInfo, error_msg
def _open_and_process_files (args, numFilesExpected):
    """
    open files listed in the args and get information about the variables in them
    """
    # get all the file names
    fileNames = args[:numFilesExpected]

    # open each file and gather its variable names,
    # tracking the intersection of the name sets as we go
    files = {}
    commonNames = None
    for fileName in fileNames:
        LOG.info("opening %s" % fileName)
        currentFileObject = io.open(fileName)
        currentNames = set(currentFileObject())
        LOG.debug ('variable names for ' + fileName + ': ' + str(currentNames))
        files[fileName] = {'fileObject': currentFileObject,
                           'varNames':   currentNames}
        commonNames = currentNames if commonNames is None else commonNames.intersection(currentNames)

    # note: this summary key lives alongside the per-file entries in the same dictionary
    files['commonVarNames'] = commonNames

    return files
def _check_pass_or_fail(varRunInfo, variableStats, defaultValues) :
"""
Check whether the variable passed analysis, failed analysis, or
did not need to be quantitatively tested
also returns information about the fractions of failure
"""
didPass = None
# get our tolerance values
# get the tolerance for failures in comparison compared to epsilon
epsilonTolerance = None
if ('epsilon_failure_tolerance' in varRunInfo) :
epsilonTolerance = varRunInfo['epsilon_failure_tolerance']
else :
epsilonTolerance = defaultValues['epsilon_failure_tolerance']
# get the tolerance for failures in amount of nonfinite data
# found in spatially valid areas
nonfiniteTolerance = None
if ('nonfinite_data_tolerance' in varRunInfo) :
nonfiniteTolerance = varRunInfo['nonfinite_data_tolerance']
else :
nonfiniteTolerance = defaultValues['nonfinite_data_tolerance']
# test to see if we passed or failed
# check for our epsilon tolerance
failed_fraction = 0.0
if not (epsilonTolerance is None) :
failed_fraction = variableStats['Numerical Comparison Statistics']['diff_outside_epsilon_fraction']
didPass = failed_fraction <= epsilonTolerance
(no author)
committed
# check to see if it failed on nonfinite data
non_finite_diff_fraction = 0.0
if not (nonfiniteTolerance is None) :
(no author)
committed
non_finite_diff_fraction = variableStats['Finite Data Statistics']['finite_in_only_one_fraction']
(no author)
committed
passedNonFinite = non_finite_diff_fraction <= nonfiniteTolerance
(no author)
committed
# combine the two test results
if (didPass is None) :
didPass = passedNonFinite
else :
didPass = didPass and passedNonFinite
return didPass, failed_fraction, non_finite_diff_fraction
def _get_run_identification_info( ) :
    """
    get info about what user/machine/version of glance is being used
    """
    # record who ran the report, on which machine, and with which glance version
    return {
        'machine': os.uname()[1],               # the name of the machine running the report
        'user':    os.getenv("LOGNAME"),        # the name of the user running the report
        'version': _get_glance_version_string(),
    }
(no author)
committed
def _get_glance_version_string() :
    """Return a human readable description of the installed glance version."""
    # pkg_resources pulls the version from the installed package metadata
    return "glance, version " + str(pkg_resources.require('glance')[0].version)
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
def _get_name_info_for_variable(original_display_name, variable_run_info) :
"""
based on the variable run info, figure out the various names for
the variable and return them
the various names are:
technical_name - the name the variable is listed under in the file
b_variable_technical_name - the name the variable is listed under in the b file (may be the same as technical_name)
explanation_name - the more verbose name that will be shown to the user to identify the variable
original_display_name - the display name given by the user to describe the variable
"""
# figure out the various name related info
technical_name = variable_run_info['variable_name']
explanation_name = technical_name # for now, will add to this later
# if B has an alternate variable name, figure that out
b_variable_technical_name = technical_name
if 'alternate_name_in_B' in variable_run_info :
b_variable_technical_name = variable_run_info['alternate_name_in_B']
# put both names in our explanation
explanation_name = explanation_name + " / " + b_variable_technical_name
# show both the display and current explanation names if they differ
if not (original_display_name == explanation_name) :
explanation_name = original_display_name + ' (' + explanation_name + ')'
return technical_name, b_variable_technical_name, explanation_name
(no author)
committed
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
def _load_variable_data(fileObject, variableNameInFile,
                        dataFilter=None,
                        variableToFilterOn=None,
                        variableBasedFilter=None,
                        fileDescriptionForDisplay="file") :
    """
    load data for a variable from a file

    optionally filter the variable data based on a data filter or another variable

    dataFilter must be in the form of (lambda data: some manipulation returning the new data)
    variableBasedFilter must be in the form of (lambda data, filterData: some manipulation returning the new data))
    """
    # pull the raw values for the variable out of the file
    LOG.debug("loading basic data for variable " + variableNameInFile + " from " + fileDescriptionForDisplay)
    raw_data = fileObject[variableNameInFile]

    # run the simple data filter over the raw values, if one was given
    if dataFilter is not None :
        LOG.debug ("applying filter function to data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile)
        raw_data = dataFilter(raw_data)

    # when both a filter variable and a filter function were given, filter
    # this variable's data using the other variable's data as a reference
    if (variableToFilterOn is not None) and (variableBasedFilter is not None) :
        LOG.debug ("filtering data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile
                   + " based on additional data from variable " + variableToFilterOn)
        raw_data = variableBasedFilter(raw_data, fileObject[variableToFilterOn])

    return raw_data
def _uri_needs_rsync(uri_to_check) :
"""
check if the uri requires an rsync in order to access the data
this will return some false positives if you phrase local uri's with the machine name
for ex. you are on the machine "lotus" and you use the path "rsync:://lotus/data/"
"""
return not os.path.exists(uri_to_check)
def rsync_or_copy_files (list_of_files, target_directory='.') :
    """
    If the files in the list are remote, rsync them, otherwise, just copy
    them to the target directory
    """
    for file_uri in list_of_files :
        # the destination keeps the original file name inside the target directory
        destination = os.path.join(target_directory, os.path.split(file_uri)[1])
        if _uri_needs_rsync(file_uri) :
            cmd = ['rsync', '-Cuav', file_uri, destination]
        else :
            cmd = ['cp', os.path.abspath(file_uri), destination]
        LOG.debug('running ' + ' '.join(cmd))
        sh(cmd)
def colocateToFile_library_call(a_path, b_path, var_list=None,
                                options_set=None,
                                # todo, this doesn't yet do anything
                                do_document=False,
                                # todo, the output channel does nothing at the moment
                                output_channel=sys.stdout) :
    """
    this method handles the actual work of the colocateData command line tool
    and can be used as a library routine.

    a_path / b_path - paths (or rsync-able uris) for the two input files
    var_list        - names of the variables to colocate (default: none)
    options_set     - dictionary of run settings (see _load_config_or_options)

    Copies both input files into the output directory, matches up their
    longitude/latitude points within the configured epsilon, colocates the
    requested variables, and writes the colocated data back into the copied
    files as new '<name>-colocated' variables.

    TODO, properly document the options
    """
    # fix: avoid mutable default arguments; create fresh containers per call
    var_list    = [ ] if var_list    is None else var_list
    options_set = { } if options_set is None else options_set

    # load the user settings from either the command line or a user defined config file
    pathsTemp, runInfo, defaultValues, requestedNames, usedConfigFile = _load_config_or_options(a_path, b_path,
                                                                                               options_set,
                                                                                               requestedVars = var_list)

    # deal with the input and output files
    if not (os.path.isdir(pathsTemp['out'])) :
        LOG.info("Specified output directory (" + pathsTemp['out'] + ") does not exist.")
        LOG.info("Creating output directory.")
        os.makedirs(pathsTemp['out'])

    # make copies of the input files for colocation, so the originals are untouched
    rsync_or_copy_files ([pathsTemp['a'], pathsTemp['b']], target_directory=pathsTemp['out'])
    pathsTemp['a'] = os.path.join(pathsTemp['out'], os.path.split(pathsTemp['a'])[1])
    pathsTemp['b'] = os.path.join(pathsTemp['out'], os.path.split(pathsTemp['b'])[1])

    # open the (copied) files; we need write access to add colocated variables
    LOG.info("Processing File A:")
    aFile, _ = _setup_file(pathsTemp['a'], "\t", allowWrite = True)
    if aFile is None:
        LOG.warn("Unable to continue with comparison because file a (" + pathsTemp['a'] + ") could not be opened.")
        sys.exit(1)
    LOG.info("Processing File B:")
    bFile, _ = _setup_file(pathsTemp['b'], "\t", allowWrite = True)
    if bFile is None:
        LOG.warn("Unable to continue with comparison because file b (" + pathsTemp['b'] + ") could not be opened.")
        sys.exit(1)

    # get information about the names the user requested
    finalNames, nameStats = _resolve_names(aFile, bFile,
                                           defaultValues,
                                           requestedNames, usedConfigFile)

    # return for lon_lat_data variables will be in the form
    # {"lon": longitude_data, "lat": latitude_data, "inv_mask": spaciallyInvalidMaskData}
    # or { } if there is no lon/lat info
    lon_lat_data, _, fatalErrorMsg = _handle_lon_lat_info (runInfo, aFile, bFile, pathsTemp['out'], should_check_equality=False)
    if fatalErrorMsg is not None :
        LOG.warn(fatalErrorMsg)
        sys.exit(1)

    # handle the longitude and latitude colocation
    LOG.info("Colocating raw longitude and latitude information")
    aColocationInfomation, bColocationInformation, totalNumberOfMatchedPoints = \
                    delta.create_colocation_mapping_within_epsilon((lon_lat_data['a']['lon'], lon_lat_data['a']['lat']),
                                                                   (lon_lat_data['b']['lon'], lon_lat_data['b']['lat']),
                                                                   runInfo['lon_lat_epsilon'],
                                                                   invalidAMask=lon_lat_data['a']['inv_mask'],
                                                                   invalidBMask=lon_lat_data['b']['inv_mask'])
    (colocatedLongitude, colocatedLatitude, (numMultipleMatchesInA, numMultipleMatchesInB)), \
    (unmatchedALongitude, unmatchedALatitude), \
    (unmatchedBLongitude, unmatchedBLatitude) = \
                delta.create_colocated_lonlat_with_lon_lat_colocation(aColocationInfomation, bColocationInformation,
                                                                      totalNumberOfMatchedPoints,
                                                                      lon_lat_data['a']['lon'], lon_lat_data['a']['lat'],
                                                                      lon_lat_data['b']['lon'], lon_lat_data['b']['lat'])

    # TODO, based on unmatched, issue warnings and record info in the file?
    LOG.debug("colocated shape of the longitude: " + str(colocatedLongitude.shape))
    LOG.debug("colocated shape of the latitude:  " + str(colocatedLatitude.shape))
    LOG.debug(str(numMultipleMatchesInA) + " lon/lat pairs contain A points used for multiple matches.")
    LOG.debug(str(numMultipleMatchesInB) + " lon/lat pairs contain B points used for multiple matches.")
    LOG.debug(str(len(unmatchedALatitude)) + " A lon/lat points could not be matched.")
    LOG.debug(str(len(unmatchedBLatitude)) + " B lon/lat points could not be matched.")

    # go through each of the possible variables in our files
    # and do our colocation for whichever ones we can
    for displayName in finalNames:

        # pull out the information for this variable analysis run
        varRunInfo = finalNames[displayName].copy()

        # get the various names
        technical_name, b_variable_technical_name, \
                explanationName = _get_name_info_for_variable(displayName, varRunInfo)

        # fix: original message ended with a stray unmatched ')'
        print('analyzing: ' + explanationName)

        # load the variable data, applying any configured per-file filters
        aData = _load_variable_data(aFile, technical_name,
                                    dataFilter = varRunInfo['data_filter_function_a'] if 'data_filter_function_a' in varRunInfo else None,
                                    variableToFilterOn = varRunInfo['variable_to_filter_on_a'] if 'variable_to_filter_on_a' in varRunInfo else None,
                                    variableBasedFilter = varRunInfo['variable_based_filter_a'] if 'variable_based_filter_a' in varRunInfo else None,
                                    fileDescriptionForDisplay = "file A")
        bData = _load_variable_data(bFile, b_variable_technical_name,
                                    dataFilter = varRunInfo['data_filter_function_b'] if 'data_filter_function_b' in varRunInfo else None,
                                    variableToFilterOn = varRunInfo['variable_to_filter_on_b'] if 'variable_to_filter_on_b' in varRunInfo else None,
                                    variableBasedFilter = varRunInfo['variable_based_filter_b'] if 'variable_based_filter_b' in varRunInfo else None,
                                    fileDescriptionForDisplay = "file B")

        # colocate the data for this variable if we have longitude/latitude data
        if (len(lon_lat_data.keys()) > 0) and runInfo['doColocate'] :

            # match up our points in A and B
            (aData, bData, (numberOfMultipleMatchesInA, numberOfMultipleMatchesInB)), \
            (aUnmatchedData,             unmatchedALongitude, unmatchedALatitude), \
            (bUnmatchedData,             unmatchedBLongitude, unmatchedBLatitude) = \
                    delta.create_colocated_data_with_lon_lat_colocation(aColocationInfomation, bColocationInformation,
                                                                        colocatedLongitude, colocatedLatitude,
                                                                        aData, bData,
                                                                        missingData=varRunInfo['missing_value'],
                                                                        altMissingDataInB=varRunInfo['missing_value_alt_in_b'],
                                                                        # TODO, should missing data be considered?
                                                                        invalidAMask=lon_lat_data['a']['inv_mask'],
                                                                        invalidBMask=lon_lat_data['b']['inv_mask'])

            LOG.debug(str(numberOfMultipleMatchesInA) + " data pairs contain A data points used for multiple matches.")
            LOG.debug(str(numberOfMultipleMatchesInB) + " data pairs contain B data points used for multiple matches.")
            LOG.debug(str(len(aUnmatchedData)) + " A data points could not be matched.")
            LOG.debug(str(len(bUnmatchedData)) + " B data points could not be matched.")

            # save the colocated data information in the output files
            # NOTE(review): the 'missing' key looks inconsistent with the
            # 'missing_value' key used elsewhere — confirm which one the
            # configuration actually populates
            aFile.create_new_variable(technical_name + '-colocated', # TODO, how should this suffix be handled?
                                      missingvalue = varRunInfo['missing'] if 'missing' in varRunInfo else None,
                                      data = aData,
                                      variabletocopyattributesfrom = technical_name)
            bFile.create_new_variable(b_variable_technical_name + '-colocated', # TODO, how should this suffix be handled?
                                      missingvalue = varRunInfo['missing_value_alt_in_b'] if 'missing_value_alt_in_b' in varRunInfo else None,
                                      data = bData,
                                      variabletocopyattributesfrom = b_variable_technical_name)

            # TODO, save the unmatched data and info on multiple matches

        else :
            LOG.debug(explanationName + " was not selected for colocation and will be ignored.")

    # the end of the loop to examine all the variables

    # we're done with the files, so close them up
    aFile.close()
    bFile.close()

    return
def reportGen_library_call (a_path, b_path, var_list=[ ],
options_set={ },
# todo, this doesn't yet do anything
do_document=False,
# todo, the output channel does nothing at the moment
output_channel=sys.stdout) :
"""
this method handles the actual work of the reportGen command line tool
and can also be used as a library routine, pass in the slightly parsed
command line input, or call it as a library function... be sure to fill
out the options
TODO at the moment the options are very brittle and need to be fully filled
or this method will fail badly (note: the addition of some glance defaults
has minimized the problem, but you still need to be careful when dealing with
optional boolean values. this needs more work.)
"""
# load the user settings from either the command line or a user defined config file
pathsTemp, runInfo, defaultValues, requestedNames, usedConfigFile = _load_config_or_options(a_path, b_path,
options_set,
requestedVars = var_list)
# note some of this information for debugging purposes
LOG.debug('paths: ' + str(pathsTemp))
LOG.debug('defaults: ' + str(defaultValues))
LOG.debug('run information: ' + str(runInfo))
# if we wouldn't generate anything, just stop now
if (not runInfo['shouldIncludeImages']) and (not runInfo['shouldIncludeReport']) :
LOG.warn("User selection of no image generation and no report generation will result in no " +
"content being generated. Aborting generation function.")
return
# hang onto info to identify who/what/when/where/etc. the report is being run by/for
runInfo.update(_get_run_identification_info( ))
# deal with the input and output files
if not (os.path.isdir(pathsTemp['out'])) :
LOG.info("Specified output directory (" + pathsTemp['out'] + ") does not exist.")
LOG.info("Creating output directory.")
os.makedirs(pathsTemp['out'])
# open the files
files = {}
LOG.info("Processing File A:")
aFile, files['file A'] = _setup_file(pathsTemp['a'], "\t")
if aFile is None:
LOG.warn("Unable to continue with comparison because file a (" + pathsTemp['a'] + ") could not be opened.")
sys.exit(1)
LOG.info("Processing File B:")
bFile, files['file B'] = _setup_file(pathsTemp['b'], "\t")
if bFile is None:
LOG.warn("Unable to continue with comparison because file b (" + pathsTemp['b'] + ") could not be opened.")
sys.exit(1)