diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index 2b0464bd2fabbafd59238584789dd71d0927a488..daad73e6fb79f6ff4300db8d00d4baf740505a4b 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -55,38 +55,27 @@ glance_analysis_defaults = {'epsilon': 0.0, 'minimum_acceptable_squared_correlation_coefficient': None } -# TODO, remove? -def _cvt_names(namelist, epsilon, missing): - """"if variable names are of the format name:epsilon, yield name,epsilon, missing - otherwise yield name,default-epsilon,default-missing - """ - for name in namelist: - if ':' not in name: - yield name, epsilon - else: - n,e,m = name.split(':') - if not e: e = epsilon - else: e = float(e) - if not m: m = missing - else: m = float(m) - yield n, e, m - def _clean_path(string_path) : """ Return a clean form of the path without any '.', '..', or '~' """ + clean_path = None if string_path is not None : clean_path = os.path.abspath(os.path.expanduser(string_path)) - else : - clean_path = string_path + return clean_path -# TODO comment more clearly def _parse_varnames(names, terms, epsilon=0.0, missing=None): """filter variable names and substitute default epsilon and missing settings if none provided - returns name,epsilon,missing triples + returns (variable name, epsilon, missing) triples + >>> _parse_varnames( ['foo','bar', 'baz', 'zoom', 'cat'], ['f..:0.5:-999', 'ba.*:0.001', 'c.t::-9999'], 1e-7 ) set([('foo', 0.5, -999.0), ('cat', 9.9999999999999995e-08, -9999.0), ('bar', 0.001, None), ('baz', 0.001, None)]) + + names - all the variable names in the file (ie. names that should be considered valid) + terms - variable selection terms given from the command line + epsilon - a default epsilon to be used for all variables that do not have a specific epsilon given + missing - a default fill value to be used for all variables that do not have a specific fill value given """ terms = [x.split(':') for x in terms] terms = [(re.compile(x[0]).match,x[1:]) for x in terms] @@ -373,6 +362,29 @@ def _load_config_or_options(aPath, bPath, optionsSet, requestedVars = [ ]) : return paths, runInfo, defaultsToUse, requestedNames, usedConfigFile +def _get_variable_can_end_program(fileObject, variableName, dataType, canEndProgram=True) : + """ + load a variable, exiting the program if there is an error + and canEndProgram is passed as True + + TODO, instead of exiting, throw an exception + """ + + dataToReturn = None + + # get the data from the file + try : + dataToReturn = array(fileObject[variableName], dtype=dataType) + except CDFError : + LOG.warn ('Unable to retrieve ' + variableName + ' data. The variable name ' + + ' may not exist in this file or an error may have occured while attempting to' + + ' access the data.') + if canEndProgram : + LOG.warn ('Unable to continue analysis without ' + variableName + ' data. Aborting analysis.') + sys.exit(1) + + return dataToReturn + def _get_and_analyze_lon_lat (fileObject, latitudeVariableName, longitudeVariableName, latitudeDataFilterFn=None, longitudeDataFilterFn=None) : @@ -381,24 +393,16 @@ def _get_and_analyze_lon_lat (fileObject, and analyze them to identify spacially invalid data (ie. data that would fall off the earth) """ # get the data from the file TODO, handle these exits out in the calling method? + + # get the longitude LOG.info ('longitude name: ' + longitudeVariableName) - try : - longitudeData = array(fileObject[longitudeVariableName], dtype=float) - except CDFError : - LOG.warn ('Unable to retrieve longitude data. The variable name (' + longitudeVariableName + - ') may not exist in this file or an error may have occured while attempting to' + - ' access the data.') - LOG.warn ('Unable to continue analysis without longitude data. Aborting analysis.') - sys.exit(1) + # TODO, should this dtype be a float? + longitudeData = _get_variable_can_end_program(fileObject, longitudeVariableName, float) + + # get the latitude LOG.info ('latitude name: ' + latitudeVariableName) - try : - latitudeData = array(fileObject[latitudeVariableName], dtype=float) - except CDFError : - LOG.warn ('Unable to retrieve latitude data. The variable name (' + latitudeVariableName + - ') may not exist in this file or an error may have occured while attempting to' + - ' access the data.') - LOG.warn ('Unable to continue analysis without latitude data. Aborting analysis.') - sys.exit(1) + # TODO, should this dtype be a float? + latitudeData = _get_variable_can_end_program(fileObject, latitudeVariableName, float) # if we have filters, use them if not (latitudeDataFilterFn is None) : @@ -414,7 +418,7 @@ def _get_and_analyze_lon_lat (fileObject, assert (latitudeData.shape == longitudeData.shape) # build a mask of our spacially invalid data TODO, load actual valid range attributes? - invalidLatitude = (latitudeData < -90) | (latitudeData > 90) | ~isfinite(latitudeData) + invalidLatitude = (latitudeData < -90) | (latitudeData > 90) | ~isfinite(latitudeData) invalidLongitude = (longitudeData < -180) | (longitudeData > 360) | ~isfinite(longitudeData) spaciallyInvalidMask = invalidLatitude | invalidLongitude @@ -641,7 +645,7 @@ def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output return { }, { }, error_msg # things have gone wrong # update our existing spatial information spatialInfo.update(moreSpatialInfo) - + # compare our spatially invalid info to see if the two files have invalid longitudes and latitudes in the same places spaciallyInvalidMask, spatialInfo, longitude_common, latitude_common = \ _compare_spatial_invalidity(spaciallyInvalidMaskA, spaciallyInvalidMaskB, spatialInfo, @@ -1278,10 +1282,11 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], doPlotSettingsDict = varRunInfo, aUData=aUData, aVData=aVData, bUData=bUData, bVData=bVData, - binIndex= varRunInfo['binIndex'] if 'binIndex' in varRunInfo else None, - tupleIndex=varRunInfo['tupleIndex'] if 'tupleIndex' in varRunInfo else None, - binName= varRunInfo['binName'] if 'binName' in varRunInfo else 'bin', - tupleName= varRunInfo['tupleName'] if 'tupleName' in varRunInfo else 'tuple') + binIndex= varRunInfo['binIndex'] if 'binIndex' in varRunInfo else None, + tupleIndex= varRunInfo['tupleIndex'] if 'tupleIndex' in varRunInfo else None, + binName= varRunInfo['binName'] if 'binName' in varRunInfo else 'bin', + tupleName= varRunInfo['tupleName'] if 'tupleName' in varRunInfo else 'tuple', + epsilonPercent=varRunInfo['epsilon_percent'] if 'epsilon_percent' in varRunInfo else None) print("\tfinished creating figures for: " + explanationName) @@ -1373,7 +1378,7 @@ def stats_library_call(afn, bfn, var_list=[ ], doc_each = do_document and len(names)==1 doc_atend = do_document and len(names)!=1 - for name,epsilon,missing in names: + for name, epsilon, missing in names: aData = aFile[name] bData = bFile[name] if missing is None: diff --git a/pyglance/glance/data.py b/pyglance/glance/data.py index 988b60e254f3f23a39a2333e98ec09ad287c8f1f..a6bd5fbeeb40a723ebaebf415ec3a51b8c44e2ac 100644 --- a/pyglance/glance/data.py +++ b/pyglance/glance/data.py @@ -249,7 +249,16 @@ class DiffInfoObject (object) : aDataObject.data[valid_in_both].astype(sharedType) # the valid data which is too different between the two sets according to the given epsilon - outside_epsilon_mask = (abs(raw_diff) > epsilonValue) & valid_in_both # TODO also use epsilon percent + outside_epsilon_mask = np.zeros(shape, dtype=np.bool) + if (epsilonValue is not None) : + outside_epsilon_mask = outside_epsilon_mask | \ + (abs(raw_diff) > epsilonValue) & valid_in_both + if (epsilonPercent is not None) : + # TODO, test this part of the formula + outside_epsilon_mask = outside_epsilon_mask | \ + ((abs(raw_diff) > (aDataObject.data * (float(epsilonPercent) / 100.0))) & valid_in_both) + #outside_epsilon_mask = (abs(raw_diff) > epsilonValue) & valid_in_both # TODO also use epsilon percent + # trouble points = mismatched nans, mismatched missing-values, differences that are too large trouble_pt_mask = ( (aDataObject.masks.non_finite_mask ^ bDataObject.masks.non_finite_mask) | (aDataObject.masks.missing_mask ^ bDataObject.masks.missing_mask) | diff --git a/pyglance/glance/figures.py b/pyglance/glance/figures.py index 563c5c4b149cf47676e7328d0c0d9fa134910f83..83a554d938d9c20dd63ec17eac155d81d1a43acb 100644 --- a/pyglance/glance/figures.py +++ b/pyglance/glance/figures.py @@ -264,12 +264,12 @@ def create_histogram(data, bins, title, xLabel, yLabel, displayStats=False) : # the location is in the form x, y (I think) if displayStats : # info on the basic stats - tempMask = ones(data.shape,dtype=bool) - tempStats = statistics.stats(data, tempMask, None) + tempMask = ones(data.shape, dtype=bool) + tempStats = statistics.NumericalComparisonStatistics.basic_analysis(data, tempMask) medianVal = tempStats['median_diff'] meanVal = tempStats['mean_diff'] stdVal = tempStats['std_diff'] - numPts = data.size + numPts = data.size # info on the display of our statistics xbounds = axes.get_xbound() diff --git a/pyglance/glance/mainreport.txt b/pyglance/glance/mainreport.txt index 9de0b50083701fe31b716f02a24a65c29d4acb6a..9902781f7d5c15f610628b3c1a15140c64178847 100644 --- a/pyglance/glance/mainreport.txt +++ b/pyglance/glance/mainreport.txt @@ -195,6 +195,11 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. <td> Variable: <a href="${tempVarRunInfo['variable_report_path_escaped']}">${varDisplayName}</a> <br> Epsilon used: ${tempVarRunInfo['epsilon']} <br> + ## if there is an epsilon percent, also show that + % if ('epsilon_percent' in tempVarRunInfo) and (tempVarRunInfo['epsilon_percent'] is not None) : + Epsilon percent (of file A) used: ${tempVarRunInfo['epsilon_percent']}% <br> + % endif + ## if there's an r squared correlation value, also show that % if rSquaredCorr is not None : R-squared Correlation Coefficient: ${rSquaredCorr}<br> % endif diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py index 12ed1bb89ae591b11e5ac1c38dd10a4b19be2556..80adf3391ce9422edbd49bde71cc84fa4cbae579 100644 --- a/pyglance/glance/plot.py +++ b/pyglance/glance/plot.py @@ -157,7 +157,8 @@ def plot_and_save_comparison_figures (aData, bData, aUData=None, aVData=None, bUData=None, bVData=None, binIndex=None, tupleIndex=None, - binName='bin', tupleName='tuple') : + binName='bin', tupleName='tuple', + epsilonPercent=None) : """ Plot images for a set of figures based on the data sets and settings passed in. The images will be saved to disk according to the settings. @@ -235,7 +236,7 @@ def plot_and_save_comparison_figures (aData, bData, # compare the two data sets to get our difference data and trouble info aDataObject = dataobj.DataObject(aData, fillValue=missingValue, ignoreMask=spaciallyInvalidMaskA) bDataObject = dataobj.DataObject(bData, fillValue=missingValueAltInB, ignoreMask=spaciallyInvalidMaskB) - diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=epsilon) #TODO, needs epsilon percent + diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=epsilon, epsilonPercent=epsilonPercent) #TODO, for the moment, unpack these values into local variables rawDiffData = diffInfo.diff_data_object.data goodMask = diffInfo.diff_data_object.masks.valid_mask @@ -293,7 +294,8 @@ def plot_and_save_comparison_figures (aData, bData, # only used for line plots binIndex=binIndex, tupleIndex=tupleIndex, - binName=binName, tupleName=tupleName + binName=binName, tupleName=tupleName, + epsilonPercent=epsilonPercent ) plottingFunctions.update(moreFunctions) diff --git a/pyglance/glance/plotcreatefns.py b/pyglance/glance/plotcreatefns.py index 6cfd744e2381fa1740a58ab0a6f9cac4dc2fc2c1..a98bb2c994327a7b23b02e2a1a149766b40244a0 100644 --- a/pyglance/glance/plotcreatefns.py +++ b/pyglance/glance/plotcreatefns.py @@ -178,7 +178,10 @@ class PlottingFunctionFactory : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None ) : _abstract @@ -219,7 +222,11 @@ class BasicComparisonPlotsFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : functionsToReturn = { } @@ -246,6 +253,7 @@ class BasicComparisonPlotsFunctionFactory (PlottingFunctionFactory) : assert(bData.shape == goodInBothMask.shape) assert(goodInBothMask.shape == outsideEpsilonMask.shape) + # TODO, if there's an epsilon percent, how should the epsilon lines be drawn? functionsToReturn['scatter'] = ((lambda : figures.create_scatter_plot(aData[goodInBothMask], bData[goodInBothMask], "Value in File A vs Value in File B", "File A Value", "File B Value", @@ -292,7 +300,11 @@ class MappedContourPlotFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : # the default for plotting geolocated data @@ -457,7 +469,11 @@ class MappedQuiverPlotFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : # the default for plotting geolocated data @@ -629,7 +645,11 @@ class LinePlotsFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : """ This method generates line plotting functions for one dimensional data @@ -727,7 +747,11 @@ class BinTupleAnalysisFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : """ This method generates histogram and sample line plot functions for complex three dimensional data @@ -762,28 +786,7 @@ class BinTupleAnalysisFunctionFactory (PlottingFunctionFactory) : goodInBothMask = reorderMapObject.reorder_for_bin_tuple(goodInBothMask) troubleMask = reorderMapObject.reorder_for_bin_tuple(troubleMask) outsideEpsilonMask = reorderMapObject.reorder_for_bin_tuple(outsideEpsilonMask) - """ - aData, caseInfo1 = delta.reorder_for_bin_tuple(aData, binIndex, tupleIndex) - bData, caseInfo2 = delta.reorder_for_bin_tuple(bData, binIndex, tupleIndex) - goodInAMask, caseInfo3 = delta.reorder_for_bin_tuple(goodInAMask, binIndex, tupleIndex) - goodInBMask, caseInfo4 = delta.reorder_for_bin_tuple(goodInBMask, binIndex, tupleIndex) - absDiffData, caseInfo5 = delta.reorder_for_bin_tuple(absDiffData, binIndex, tupleIndex) - rawDiffData, caseInfo6 = delta.reorder_for_bin_tuple(rawDiffData, binIndex, tupleIndex) - goodInBothMask, caseInfo7 = delta.reorder_for_bin_tuple(goodInBothMask, binIndex, tupleIndex) - troubleMask, caseInfo8 = delta.reorder_for_bin_tuple(troubleMask, binIndex, tupleIndex) - outsideEpsilonMask, caseInfo9 = delta.reorder_for_bin_tuple(outsideEpsilonMask, binIndex, tupleIndex) - - # assert that all our case dimensions were originally the same - # TODO, should I have just compared the shape of all the data before modifying it? - assert(caseInfo1 == caseInfo2) - assert(caseInfo2 == caseInfo3) - assert(caseInfo3 == caseInfo4) - assert(caseInfo4 == caseInfo5) - assert(caseInfo5 == caseInfo6) - assert(caseInfo6 == caseInfo7) - assert(caseInfo7 == caseInfo8) - assert(caseInfo8 == caseInfo9) - """ + # our list of functions that will later create the plots functionsToReturn = { } @@ -901,7 +904,11 @@ class IMShowPlotFunctionFactory (PlottingFunctionFactory) : # only used for line plots binIndex=None, tupleIndex=None, - binName=None, tupleName=None + binName=None, tupleName=None, + + # the optional epsilon for comparison of a percent of A + epsilonPercent=None + ) : """ This method generates imshow plotting functions for two dimensional data @@ -985,7 +992,6 @@ class IMShowPlotFunctionFactory (PlottingFunctionFactory) : return functionsToReturn - if __name__=='__main__': import doctest doctest.testmod() diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py index 3a7ab36e88c62672db2c339228fcd588216998dc..7fe5f2db256cd716373ee8068240593a7a9211bc 100644 --- a/pyglance/glance/stats.py +++ b/pyglance/glance/stats.py @@ -36,6 +36,23 @@ class StatisticalData (object) : """ self.title = None + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + + note: child classes should override this method + """ + return { } + + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + + note: child classes should override this method + """ + return { } class MissingValueStatistics (StatisticalData) : """ @@ -52,33 +69,503 @@ class MissingValueStatistics (StatisticalData) : common_missing_fraction - fraction of points that are missing in both data sets """ + def __init__(self, diffInfoObject) : + """ + build our fill value related statistics based on the comparison + of two data sets + """ + self.title = 'Missing Value Statistics' + + # pull out some masks for later use + a_missing_mask = diffInfoObject.a_data_object.masks.missing_mask + b_missing_mask = diffInfoObject.b_data_object.masks.missing_mask + + assert(a_missing_mask.shape == b_missing_mask.shape) + + # figure out some basic statistics + self.a_missing_count = np.sum(a_missing_mask) + self.b_missing_count = np.sum(b_missing_mask) + self.common_missing_count = np.sum(a_missing_mask & b_missing_mask) + + # make the assumption that a and b are the same size and only use the size of a's mask + total_num_values = a_missing_mask.size + + # figure out some fraction statistics + self.a_missing_fraction = float(self.a_missing_count) / float(total_num_values) + self.b_missing_fraction = float(self.b_missing_count) / float(total_num_values) + self.common_missing_fraction = float(self.common_missing_count) / float(total_num_values) + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + """ + + toReturn = { + 'a_missing_count': self.a_missing_count, + 'a_missing_fraction': self.a_missing_fraction, + 'b_missing_count': self.b_missing_count, + 'b_missing_fraction': self.b_missing_fraction, + 'common_missing_count': self.common_missing_count, + 'common_missing_fraction': self.common_missing_fraction + } + + return toReturn + + # TODO, replace the giant doc dictionary with use of this method + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + """ + + toReturn = { + 'a_missing_count': "number of values flagged missing in A", + 'a_missing_fraction': "fraction of values flagged missing in A", + 'b_missing_count': "number of values flagged missing in B", + 'b_missing_fraction': "fraction of values flagged missing in B", + 'common_missing_count': "number of missing values in common between A and B", + 'common_missing_fraction': "fraction of missing values in common between A and B" + } + + return toReturn -# --------------------- general statistics methods ------------------ +class FiniteDataStatistics (StatisticalData) : + """ + A class representing information about where finite values are found + in a pair of data sets. + + includes the following statistics: + + a_finite_count - the number of finite data values in the a data set + a_finite_fraction - the fraction of finite data values in the a data set + b_finite_count - the number of finite data values in the b data set + b_finite_fraction - the fraction of finite data values in the b data set + common_finite_count - the number of finite values the two data sets have in common + common_finite_fraction - the fraction of finite values the two data sets have in common + finite_in_only_one_count - the number of points that are finite in only one of the two sets + finite_in_only_one_fraction - the fraction of points that are finite in only one of the two sets + """ + + def __init__(self, diffInfoObject) : + """ + build our finite data related statistics based on the comparison + of two data sets + """ + self.title = 'Finite Data Statistics' + + # pull out some data we will use later + a_is_finite_mask = diffInfoObject.a_data_object.masks.valid_mask + b_is_finite_mask = diffInfoObject.b_data_object.masks.valid_mask + common_ignore_mask = diffInfoObject.diff_data_object.masks.ignore_mask + + assert(a_is_finite_mask.shape == b_is_finite_mask.shape) + assert(b_is_finite_mask.shape == common_ignore_mask.shape) + + # figure out some basic statistics + self.a_finite_count = np.sum(a_is_finite_mask) + self.b_finite_count = np.sum(b_is_finite_mask) + self.common_finite_count = np.sum(a_is_finite_mask & b_is_finite_mask) + # use an exclusive or to check which points are finite in only one of the two data sets + self.finite_in_only_one_count = np.sum((a_is_finite_mask ^ b_is_finite_mask) & ~common_ignore_mask) + + # make the assumption that a and b are the same size and only use the size of a's mask + total_num_values = a_is_finite_mask.size + + # calculate some fractional statistics + self.a_finite_fraction = float(self.a_finite_count) / float(total_num_values) + self.b_finite_fraction = float(self.b_finite_count) / float(total_num_values) + self.common_finite_fraction = float(self.common_finite_count) / float(total_num_values) + self.finite_in_only_one_fraction = float(self.finite_in_only_one_count) / float(total_num_values) + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + """ + + toReturn = { + 'a_finite_count': self.a_finite_count, + 'a_finite_fraction': self.a_finite_fraction, + 'b_finite_count': self.b_finite_count, + 'b_finite_fraction': self.b_finite_fraction, + 'common_finite_count': self.common_finite_count, + 'common_finite_fraction': self.common_finite_fraction, + 'finite_in_only_one_count': self.finite_in_only_one_count, + 'finite_in_only_one_fraction': self.finite_in_only_one_fraction, + } + + return toReturn + + # TODO, replace the giant doc dictionary with use of this method + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + """ + + toReturn = { + 'a_finite_count': "number of finite values in A", + 'a_finite_fraction': "fraction of finite values in A (out of all data points in A)", + 'b_finite_count': "number of finite values in B", + 'b_finite_fraction': "fraction of finite values in B (out of all data points in B)", + 'common_finite_count': "number of finite values in common between A and B", + 'common_finite_fraction': "fraction of finite values in common between A and B", + 'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " + + "only the common spatially valid area is considerd for this statistic", + 'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " + + "only the common spatially valid area is considerd for this statistic" + } + + return toReturn -def _get_missing_value_stats(a_missing_mask, b_missing_mask) : +class NotANumberStatistics (StatisticalData) : """ - Get a list of statistics about missing data values in data sets a and b, - given masks describing the positions of the missing data (such masks - can be gotten from the diff function), - the return value will be a dictionary of statistics + A class representing information about where non-finite values are found + in a pair of data sets. + + includes the following statistics: + + a_nan_count - the number of non finite values that are present in the a data set + a_nan_fraction - the fraction of non finite values that are present in the a data set + b_nan_count - the number of non finite values that are present in the b data set + b_nan_fraction - the fraction of non finite values that are present in the b data set + common_nan_count - the number of non finite values that are shared between the data sets + common_nan_fraction - the fraction of non finite values that are shared between the data sets """ - # calculate information about the missing values - num_a_missing = np.sum(a_missing_mask) - num_b_missing = np.sum(b_missing_mask) - num_common_missing = np.sum(a_missing_mask & b_missing_mask) - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_missing_mask.size - - missing_value_stats = {'a_missing_count': num_a_missing, - 'a_missing_fraction': (float(num_a_missing) / float(total_num_values)), - 'b_missing_count': num_b_missing, - 'b_missing_fraction': (float(num_b_missing) / float(total_num_values)), - 'common_missing_count': num_common_missing, - 'common_missing_fraction': (float(num_common_missing) / float(total_num_values)) - } - - return missing_value_stats + + def __init__(self, diffInfoObject) : + """ + build our nonfinite data related statistics based on the comparison + of two data sets + """ + self.title = 'NaN Statistics' + + # pull out some masks we will use + a_nan_mask = diffInfoObject.a_data_object.masks.non_finite_mask + b_nan_mask = diffInfoObject.b_data_object.masks.non_finite_mask + + assert(a_nan_mask.shape == b_nan_mask.shape) + + # get some basic statistics + self.a_nan_count = np.sum(a_nan_mask) + self.b_nan_count = np.sum(b_nan_mask) + self.common_nan_count = np.sum(a_nan_mask & b_nan_mask) + + # make the assumption that a and b are the same size and only use the size of a + total_num_values = a_nan_mask.size + + # calculate some fractional statistics + self.a_nan_fraction = float(self.a_nan_count) / float(total_num_values) + self.b_nan_fraction = float(self.b_nan_count) / float(total_num_values) + self.common_nan_fraction = float(self.common_nan_count) / float(total_num_values) + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + """ + + toReturn = { + 'a_nan_count': self.a_nan_count, + 'a_nan_fraction': self.a_nan_fraction, + 'b_nan_count': self.b_nan_count, + 'b_nan_fraction': self.b_nan_fraction, + 'common_nan_count': self.common_nan_count, + 'common_nan_fraction': self.common_nan_fraction + } + + return toReturn + + # TODO, replace the giant doc dictionary with use of this method + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + """ + + toReturn = { + 'a_nan_count': "number of NaNs in A", + 'a_nan_fraction': "fraction of NaNs in A", + 'b_nan_count': "number of NaNs in B", + 'b_nan_fraction': "fraction of NaNs in B", + 'common_nan_count': "number of NaNs in common between A and B", + 'common_nan_fraction': "fraction of NaNs in common between A and B" + } + + return toReturn + +class GeneralStatistics (StatisticalData) : + """ + A class representing general information about a pair of data sets. + + includes the following statistics: + + a_missing_value - the fill data value in the a set + b_missing_value - the fill data value in the b set + epsilon - the fixed epsilon value + epsilon_percent - the percentage of the a set that will be used for comparison + max_a - the maximum value in the a set + max_b - the maximum value in the b set + min_a - the minimum value in the a set + min_b - the minimum value in the b set + num_data_points - the total number of data points in each of the sets + shape - the shape of each of the data sets + spatially_invalid_pts_ignored_in_a - number of points corresponding to invalid lat/lon in a set + spatially_invalid_pts_ignored_in_b - number of points corresponding to invalid lat/lon in b set + """ + + def __init__(self, diffInfoObject) : + """ + build our general statistics based on the comparison + of two data sets + """ + self.title = 'NaN Statistics' + + # pull out some masks for later use + a_missing_mask = diffInfoObject.a_data_object.masks.missing_mask + b_missing_mask = diffInfoObject.b_data_object.masks.missing_mask + ignore_in_a_mask = diffInfoObject.a_data_object.masks.ignore_mask + ignore_in_b_mask = diffInfoObject.b_data_object.masks.ignore_mask + good_in_a_mask = diffInfoObject.a_data_object.masks.valid_mask + good_in_b_mask = diffInfoObject.b_data_object.masks.valid_mask + + assert(a_missing_mask.shape == b_missing_mask.shape) + assert(b_missing_mask.shape == ignore_in_a_mask.shape) + assert(ignore_in_a_mask.shape == ignore_in_b_mask.shape) + assert(ignore_in_b_mask.shape == good_in_a_mask.shape) + assert(good_in_a_mask.shape == good_in_b_mask.shape) + + # get the number of data points + total_num_values = a_missing_mask.size + + # fill in our statistics + self.a_missing_value = diffInfoObject.a_data_object.fill_value + self.b_missing_value = diffInfoObject.b_data_object.fill_value + self.epsilon = diffInfoObject.epsilon_value + self.epsilon_percent = diffInfoObject.epsilon_percent + self.max_a = delta.max_with_mask(diffInfoObject.a_data_object.data, good_in_a_mask) + self.min_a = delta.min_with_mask(diffInfoObject.a_data_object.data, good_in_a_mask) + self.max_b = delta.max_with_mask(diffInfoObject.b_data_object.data, good_in_b_mask) + self.min_b = delta.min_with_mask(diffInfoObject.b_data_object.data, good_in_b_mask) + self.num_data_points = total_num_values + self.shape = a_missing_mask.shape + # also calculate the invalid points + self.spatially_invalid_pts_ignored_in_a = np.sum(ignore_in_a_mask) + self.spatially_invalid_pts_ignored_in_b = np.sum(ignore_in_b_mask) + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + """ + + toReturn = { + 'a_missing_value': self.a_missing_value, + 'b_missing_value': self.b_missing_value, + 'epsilon': self.epsilon, + 'epsilon_percent': self.epsilon_percent, + 'max_a': self.max_a, + 'max_b': self.max_b, + 'min_a': self.min_a, + 'min_b': self.min_b, + 'num_data_points': self.num_data_points, + 'shape': self.shape, + 'spatially_invalid_pts_ignored_in_a': self.spatially_invalid_pts_ignored_in_a, + 'spatially_invalid_pts_ignored_in_b': self.spatially_invalid_pts_ignored_in_b + } + + return toReturn + + # TODO, replace the giant doc dictionary with use of this method + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + """ + + toReturn = { + 'a_missing_value': 'the value that is considered \"missing\" data when it is found in A', + 'b_missing_value': 'the value that is considered \"missing\" data when it is found in B', + 'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable', + 'epsilon_percent': 'the percentage of difference (of A\'s value) that is acceptable between A and B (optional)', + 'max_a': 'the maximum finite, non-missing value found in A', + 'max_b': 'the maximum finite, non-missing value found in B', + 'min_a': 'the minimum finite, non-missing value found in A', + 'min_b': 'the minimum finite, non-missing value found in B', + 'num_data_points': "number of data values in A", + 'shape': "shape of A", + 'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' + + ' ignored for the purposes of data analysis and presentation', + 'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' + + ' ignored for the purposes of data analysis and presentation', + } + + return toReturn + +class NumericalComparisonStatistics (StatisticalData) : + """ + A class representing more complex comparisons between a pair of data sets. + + includes the following statistics: + + correlation - the Pearson correlation r-coefficient from comparing finite values of the sets + r_squared_correlation - the square of the correlation + diff_outside_epsilon_count - the number of points that fall outside the acceptable epsilon settings + diff_outside_epsilon_fraction - the fraction of points that fall outside the acceptable epsilon settings + perfect_match_count - the number of points that match perfectly between the sets + perfect_match_fraction - the fraction of points that match perfectly between the sets + trouble_points_count - the number of points that have possible issues according to the current analysis + trouble_points_fraction - the fraction of points that have possible issues according to the current analysis + + It may also contain additional statistics. This is indicated by the does_include_simple boolean. + The possible additional statistics include: + + rms_diff - the root mean squared of the absolute difference between the two data sets + std_diff - the standard deviation of the absolute difference between the two data sets + mean_diff - the mean of the absolute difference between the two data sets + median_diff - the median of the absolute difference between the two data sets + max_diff - the maximum of the absolute difference between the two data sets + + These statistics can also be generated separately in dictionary form by calling the + basic_analysis method on this class. + """ + + def __init__(self, diffInfoObject, include_basic_analysis=True) : + """ + build our comparison statistics based on the comparison + of two data sets + + the include_basic_analysis flag indicates whether the statistics generated by the + basic_analysis method should also be generated + """ + self.title = 'Numerical Comparison Statistics' + + # pull out some info we will use later + valid_in_both = diffInfoObject.diff_data_object.masks.valid_mask + outside_epsilon_mask = diffInfoObject.diff_data_object.masks.outside_epsilon_mask + trouble_mask = diffInfoObject.diff_data_object.masks.trouble_mask + aData = diffInfoObject.a_data_object.data + bData = diffInfoObject.b_data_object.data + + assert (valid_in_both.shape == outside_epsilon_mask.shape) + assert (outside_epsilon_mask.shape == trouble_mask.shape) + assert (trouble_mask.shape == aData.shape) + assert (aData.shape == bData.shape) + + # fill in some simple statistics + self.diff_outside_epsilon_count = np.sum(outside_epsilon_mask) + self.perfect_match_count = NumericalComparisonStatistics._get_num_perfect(aData, bData, + goodMask=valid_in_both) + self.correlation = delta.compute_correlation(aData, bData, valid_in_both) + self.r_squared_correlation = self.correlation * self.correlation + self.trouble_points_count = np.sum(trouble_mask) + + # we actually want the total number of _finite_ values rather than all the data + total_num_finite_values = np.sum(valid_in_both) + + # calculate some more complex statistics + self.trouble_points_fraction = float(self.trouble_points_count) / float(aData.size) + # be careful not to divide by zero if we don't have finite data + if total_num_finite_values > 0 : + self.diff_outside_epsilon_fraction = float(self.diff_outside_epsilon_count) / float(total_num_finite_values) + self.perfect_match_fraction = float(self.perfect_match_count) / float(total_num_finite_values) + else: + self.diff_outside_epsilon_fraction = 0.0 + self.perfect_match_fraction = 0.0 + + # if desired, do the basic analysis + self.does_include_simple = include_basic_analysis + if (include_basic_analysis) : + basic_dict = NumericalComparisonStatistics.basic_analysis(diffInfoObject.diff_data_object.data, + valid_in_both) + self.rms_diff = basic_dict['rms_diff'] + self.std_diff = basic_dict['std_diff'] + self.mean_diff = basic_dict['mean_diff'] + self.median_diff = basic_dict['median_diff'] + self.max_diff = basic_dict['max_diff'] + self.temp_analysis = basic_dict + + def dictionary_form(self) : + """ + get a dictionary form of the statistics + """ + + toReturn = { + 'correlation': self.correlation, + 'r-squared correlation': self.r_squared_correlation, + 'diff_outside_epsilon_count': self.diff_outside_epsilon_count, + 'diff_outside_epsilon_fraction': self.diff_outside_epsilon_fraction, + 'perfect_match_count': self.perfect_match_count, + 'perfect_match_fraction': self.perfect_match_fraction, + 'trouble_points_count': self.trouble_points_count, + 'trouble_points_fraction': self.trouble_points_fraction + } + toReturn.update(self.temp_analysis) + + return toReturn + + # TODO, replace the giant doc dictionary with use of this method + def doc_strings(self) : + """ + get documentation strings that match the + dictionary form of the statistics + """ + + toReturn = { + 'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B", + 'diff_outside_epsilon_count': "number of finite differences falling outside acceptable epsilon definitions; " + + "note: this value includes data excluded by both epsilon and epsilon_percent if " + + "both have been defined", + 'diff_outside_epsilon_fraction': "fraction of finite differences falling outside acceptable epsilon " + + "definitions (out of common_finite_count)", + 'max_diff': "Maximum difference of finite values", + 'mean_diff': "mean difference of finite values", + 'median_diff': "median difference of finite values", + 'perfect_match_count': "number of perfectly matched finite data points between A and B", + 'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)", + 'rms_diff': "root mean square (RMS) difference of finite values", + 'r-squared correlation': "the square of the r correlation (see correlation)", + 'std_diff': "standard deviation of difference of finite values", + 'trouble_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' + + ' or are unacceptable when compared according to the current epsilon definitions', + 'trouble_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' + + ' or are unacceptable when compared according to the current epsilon definitions', + } + + return toReturn + + @staticmethod + def basic_analysis(diffData, valid_mask): + """ + do some very minimal analysis of the differences + """ + + # if all the data is invalid, + # we can't do any of these forms of statistical analysis + if np.sum(valid_mask) <= 0 : + return { } + + # calculate our statistics + absDiffData = abs(diffData) + root_mean_square_value = delta.calculate_root_mean_square(diffData, valid_mask) + return { 'rms_diff': root_mean_square_value, + 'std_diff': np.std(absDiffData[valid_mask]), + 'mean_diff': np.mean(absDiffData[valid_mask]), + 'median_diff': np.median(absDiffData[valid_mask]), + 'max_diff': np.max(absDiffData[valid_mask]) + } + + @staticmethod + def _get_num_perfect(aData, bData, goodMask=None): + """ + get the number of data points where + the value in A perfectly matches the value in B + """ + numPerfect = 0 + if not (goodMask is None) : + numPerfect = np.sum(aData[goodMask] == bData[goodMask]) + else : + numPerfect = np.sum(aData == bData) + return numPerfect + +# --------------------- general statistics methods ------------------ def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), @@ -92,187 +579,22 @@ def summarize(a, b, epsilon=0., bDataObject = dataobj.DataObject(b, fillValue=b_missing_value, ignoreMask=ignoreInBMask) diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=epsilon, epsilonPercent=epsilonPercent) - #TODO, for the moment, unpack these values into local variables - diffData = diffInfo.diff_data_object.data - finite_mask = diffInfo.diff_data_object.masks.valid_mask - finite_a_mask = diffInfo.a_data_object.masks.valid_mask - finite_b_mask = diffInfo.b_data_object.masks.valid_mask - trouble = diffInfo.diff_data_object.masks.trouble_mask - outside_epsilon = diffInfo.diff_data_object.masks.outside_epsilon_mask - anfin = diffInfo.a_data_object.masks.non_finite_mask - bnfin = diffInfo.b_data_object.masks.non_finite_mask - amis = diffInfo.a_data_object.masks.missing_mask - bmis = diffInfo.b_data_object.masks.missing_mask - ignoreInAMask = diffInfo.a_data_object.masks.ignore_mask - ignoreInBMask = diffInfo.b_data_object.masks.ignore_mask - - general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, - ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) - additional_statistics = stats(diffData, finite_mask) #*nfo) # grab some additional comparison statistics - comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) - nan_stats = _get_nan_stats(anfin, bnfin) - missing_stats = _get_missing_value_stats(amis, bmis) - finite_stats = _get_finite_data_stats(finite_a_mask, finite_b_mask, (ignoreInAMask | ignoreInBMask)) + + general_stats = GeneralStatistics(diffInfo) + comparison_stats = NumericalComparisonStatistics(diffInfo) + nan_stats = NotANumberStatistics(diffInfo) + missing_stats = MissingValueStatistics(diffInfo) + finite_stats = FiniteDataStatistics(diffInfo) out = {} - out['NaN Statistics'] = nan_stats - out['Missing Value Statistics'] = missing_stats - out['Finite Data Statistics'] = finite_stats - out['Numerical Comparison Statistics'] = comparison_stats - out['General Statistics'] = general_stats + out[nan_stats.title] = nan_stats.dictionary_form() + out[missing_stats.title] = missing_stats.dictionary_form() + out[finite_stats.title] = finite_stats.dictionary_form() + out[comparison_stats.title] = comparison_stats.dictionary_form() + out[general_stats.title] = general_stats.dictionary_form() return out -def stats(diffData, mask, *etc): - - # if there are no values after the mask, - # we can't do any of these forms of - # statistical analysis - if np.sum(mask) <= 0 : - return { } - - absDiffData = abs(diffData) - root_mean_square_value = delta.calculate_root_mean_square(diffData, mask) - return { 'rms_diff': root_mean_square_value, - 'std_diff': np.std(absDiffData[mask]), - 'mean_diff': np.mean(absDiffData[mask]), - 'median_diff': np.median(absDiffData[mask]), - 'max_diff': np.max(absDiffData[mask]) - } - -def _get_num_perfect(a, b, ignoreMask=None): - numPerfect = 0 - if not (ignoreMask is None) : - numPerfect = np.sum(a[~ignoreMask] == b[~ignoreMask]) - else : - numPerfect = np.sum(a == b) - return numPerfect - -def _get_numerical_data_stats(a, b, diff_data, data_is_finite_mask, - outside_epsilon_mask, trouble_mask, - additional_statistics={}) : - """ - Get a list of numerical comparison related statistics about a and b, - given a and b and some other information about them. - the return value will be a dictionary of statistics - """ - # calculate our various statistics - num_finite_values_too_different = np.sum(outside_epsilon_mask) - num_perfect = _get_num_perfect(a, b, ~data_is_finite_mask) - r_corr = delta.compute_correlation(a, b, data_is_finite_mask) - num_trouble = np.sum(trouble_mask) - - # we actually want the total number of _finite_ values rather than all the data - total_num_finite_values = np.sum(data_is_finite_mask) - - # no dividing by 0! - fraction_too_different = 0.0 - fraction_perfect = 0.0 - if total_num_finite_values > 0 : - fraction_too_different = num_finite_values_too_different / float(total_num_finite_values) - fraction_perfect = num_perfect / float(total_num_finite_values) - - comparison = { 'correlation': r_corr, - 'r-squared correlation': r_corr * r_corr, - 'diff_outside_epsilon_count': num_finite_values_too_different, - 'diff_outside_epsilon_fraction': fraction_too_different, - 'perfect_match_count': num_perfect, - 'perfect_match_fraction': fraction_perfect, - 'trouble_points_count': num_trouble, - 'trouble_points_fraction': float(num_trouble) / float(a.size) - } - comparison.update(additional_statistics) - - return comparison - -def _get_general_data_stats(a, b, - a_missing_value, b_missing_value, - epsilon, - spatial_ignore_in_a_mask, spatial_ignore_in_b_mask, - bad_in_a, bad_in_b - ) : - """ - Get a list of general statistics about a and b, given a and b and some other information - about them. - the return value will be a dictionary of statistics - """ - # figure out how much spatial trouble we had - num_ignored_in_a = np.sum(spatial_ignore_in_a_mask) - num_ignored_in_b = np.sum(spatial_ignore_in_b_mask) - - # get the number of data points - total_num_values = a.size - - general_stats = {'a_missing_value': a_missing_value, - 'b_missing_value': b_missing_value, - 'epsilon': epsilon, - 'max_a': delta.max_with_mask(a, ~bad_in_a), - 'max_b': delta.max_with_mask(b, ~bad_in_b), - 'min_a': delta.min_with_mask(a, ~bad_in_a), - 'min_b': delta.min_with_mask(b, ~bad_in_b), - 'num_data_points': total_num_values, - 'shape': a.shape, - 'spatially_invalid_pts_ignored_in_a': num_ignored_in_a, - 'spatially_invalid_pts_ignored_in_b': num_ignored_in_b - } - - return general_stats - -def _get_finite_data_stats(a_is_finite_mask, b_is_finite_mask, common_ignore_mask) : - """ - Get a list of statistics about finite data values in data sets a and b, - given masks describing the positions of the finite data in each file (such masks - can be gotten from the diff function), - the return value will be a dictionary of statistics - """ - # calculate information about the finite data - num_a_finite = np.sum(a_is_finite_mask) - num_b_finite = np.sum(b_is_finite_mask) - num_common_finite = np.sum(a_is_finite_mask & b_is_finite_mask) - num_finite_in_only_one = np.sum((a_is_finite_mask ^ b_is_finite_mask) & (~ common_ignore_mask)) # use an exclusive OR - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_is_finite_mask.size - - finite_value_stats = {'a_finite_count': num_a_finite, - 'a_finite_fraction': (float(num_a_finite) / float(total_num_values)), - 'b_finite_count': num_b_finite, - 'b_finite_fraction': (float(num_b_finite) / float(total_num_values)), - 'common_finite_count': num_common_finite, - 'common_finite_fraction': (float(num_common_finite) / float(total_num_values)), - 'finite_in_only_one_count': num_finite_in_only_one, - 'finite_in_only_one_fraction': (float(num_finite_in_only_one) / float(total_num_values)), - } - - return finite_value_stats - - - -def _get_nan_stats(a_nan_mask, b_nan_mask) : - """ - Get a list of statistics about non-numerical values in data sets a and b, - the return value will be a dictionary of statistics - """ - # find the nan values in the data - num_a_nans = np.sum(a_nan_mask) - num_b_nans = np.sum(b_nan_mask) - num_common_nans = np.sum(a_nan_mask & b_nan_mask) - - # make the assumption that a and b are the same size and only use the size of a - total_num_values = a_nan_mask.size - - nan_stats = {'a_nan_count': num_a_nans, - 'a_nan_fraction': (float(num_a_nans) / float(total_num_values)), - 'b_nan_count': num_b_nans, - 'b_nan_fraction': (float(num_b_nans) / float(total_num_values)), - 'common_nan_count': num_common_nans, - 'common_nan_fraction': (float(num_common_nans) / float(total_num_values)) - } - - return nan_stats - - - # -------------------------- documentation ----------------------------- STATISTICS_DOC = { 'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " + @@ -282,6 +604,7 @@ STATISTICS_DOC = { 'general': "Finite values are non-missing and finite (not Na 'a_missing_value': 'the value that is considered \"missing\" data when it is found in A', 'b_missing_value': 'the value that is considered \"missing\" data when it is found in B', 'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable', + 'epsilon_percent': 'the percentage of difference (of A\'s value) that is acceptable between A and B (optional)', 'max_a': 'the maximum finite, non-missing value found in A', 'max_b': 'the maximum finite, non-missing value found in B', 'min_a': 'the minimum finite, non-missing value found in A', diff --git a/pyglance/glance/variablereport.txt b/pyglance/glance/variablereport.txt index 360209da3143730c587ed93641a822e05638ebb8..f244599970c76d188ec0fb1bbc68dc320d861d6d 100644 --- a/pyglance/glance/variablereport.txt +++ b/pyglance/glance/variablereport.txt @@ -152,6 +152,11 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. ## display the epsilon epsilon value: ${runInfo['epsilon']} <br> + ## if there is an epsilon percent, also show that + % if ('epsilon_percent' in runInfo) and (runInfo['epsilon_percent'] is not None) : + epsilon percent: ${runInfo['epsilon_percent']}% <br> + % endif + ## display the missing value % if ('missing_value_alt_in_b' in runInfo) and (not (runInfo['missing_value_alt_in_b'] is runInfo['missing_value'])) : "missing" data value in A: ${str(runInfo['missing_value'])}<br>