From cf6b6e5d965aa5d66496eec5ffb5f918d29f087e Mon Sep 17 00:00:00 2001 From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01> Date: Mon, 17 May 2010 20:05:00 +0000 Subject: [PATCH] moving statistics functions to a separate module git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@108 8a9318a1-56ba-4d59-b755-99d26321be01 --- pyglance/glance/compare.py | 71 ++----- pyglance/glance/delta.py | 318 +---------------------------- pyglance/glance/figures.py | 13 +- pyglance/glance/filters.py | 2 +- pyglance/glance/io.py | 2 +- pyglance/glance/plot.py | 6 - pyglance/glance/plotcreatefns.py | 10 +- pyglance/glance/report.py | 4 +- pyglance/glance/stats.py | 298 +++++++++++++++++++++++++++ pyglance/glance/variablereport.txt | 1 - 10 files changed, 328 insertions(+), 397 deletions(-) create mode 100644 pyglance/glance/stats.py diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index 24ae328..1cd3af2 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -11,19 +11,18 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. import os, sys, logging, re, subprocess, datetime import imp as imp -from pprint import pprint, pformat from numpy import * import pkg_resources from pycdf import CDFError from subprocess import check_call as sh +from urllib import quote -import glance.io as io -import glance.delta as delta -import glance.plot as plot -import glance.plotcreatefns as plotcreate +import glance.io as io +import glance.delta as delta +import glance.plot as plot import glance.report as report - -from urllib import quote +import glance.stats as statistics +import glance.plotcreatefns as plotcreate LOG = logging.getLogger(__name__) @@ -1146,11 +1145,11 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], if not do_not_test_with_lon_lat : mask_a_to_use = lon_lat_data['a']['inv_mask'] mask_b_to_use = lon_lat_data['b']['inv_mask'] - variable_stats = delta.summarize(aData, bData, - varRunInfo['epsilon'], - (varRunInfo['missing_value'], - varRunInfo['missing_value_alt_in_b']), - mask_a_to_use, mask_b_to_use) + variable_stats = statistics.summarize(aData, bData, + varRunInfo['epsilon'], + (varRunInfo['missing_value'], + varRunInfo['missing_value_alt_in_b']), + mask_a_to_use, mask_b_to_use) # add a little additional info to our variable run info before we squirrel it away varRunInfo['time'] = datetime.datetime.ctime(datetime.datetime.now()) # todo is this needed? @@ -1299,7 +1298,7 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], # make the glossary print ('generating glossary') - report.generate_and_save_doc_page(delta.STATISTICS_DOC, pathsTemp['out']) + report.generate_and_save_doc_page(statistics.STATISTICS_DOC, pathsTemp['out']) return @@ -1348,17 +1347,17 @@ def stats_library_call(afn, bfn, var_list=[ ], print >> output_channel, '-'*32 print >> output_channel, name print >> output_channel, '' - lal = list(delta.summarize(aval,bval,epsilon,(amiss,bmiss)).items()) + lal = list(statistics.summarize(aval,bval,epsilon,(amiss,bmiss)).items()) lal.sort() for dictionary_title, dict_data in lal: print >> output_channel, '%s' % dictionary_title dict_data for each_stat in sorted(list(dict_data)): print >> output_channel, ' %s: %s' % (each_stat, dict_data[each_stat]) - if doc_each: print >> output_channel, (' ' + delta.STATISTICS_DOC[each_stat]) + if doc_each: print >> output_channel, (' ' + statistics.STATISTICS_DOC[each_stat]) print >> output_channel, '' if doc_atend: - print >> output_channel, ('\n\n' + delta.STATISTICS_DOC_STR) + print >> output_channel, ('\n\n' + statistics.STATISTICS_DOC_STR) def main(): import optparse @@ -1467,46 +1466,6 @@ python -m glance plot.compare_spectra(bspc,aspc) plot.show() - def noisecheck(*args): - """gives statistics for dataset comparisons against truth with and without noise - usage: noisecheck truth-file noise-file actual-file variable1{:epsilon{:missing}} {variable2...} - glance noisecheck /Volumes/snaapy/data/justins/abi_graffir/coreg/pure/l2_data/geocatL2.GOES-R.2005155.220000.hdf.gz /Volumes/snaapy/data/justins/abi_graffir/noise/noise1x/l2_data/geocatL2.GOES-R.2005155.220000.hdf - """ # TODO ******* standardize with method? - afn,noizfn,bfn = args[:3] - LOG.info("opening truth file %s" % afn) - a = io.open(afn) - LOG.info("opening actual file %s" % noizfn) - noiz = io.open(noizfn) - LOG.info("opening noise file %s" % bfn) - b = io.open(bfn) - - anames = set(a()) - bnames = set(b()) - cnames = anames.intersection(bnames) # common names - pats = args[3:] or ['.*'] - names = _parse_varnames( cnames, pats, options.epsilon, options.missing ) - for name,epsilon,missing in names: - aData = a[name] - bData = b[name] - nData = noiz[name] - if missing is None: - amiss = a.missing_value(name) - bmiss = b.missing_value(name) - else: - amiss,bmiss = missing,missing - x = aData - y = bData - z = nData - def scat(x,xn,y): - from pylab import plot,show,scatter - scatter(x,y) - show() - nfo = delta.rms_corr_withnoise(x,y,z,epsilon,(amiss,bmiss),plot=scat) - print '-'*32 - print name - for kv in sorted(nfo.items()): - print ' %s: %s' % kv - def stats(*args): """create statistics summary of variables Summarize difference statistics between listed variables. diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py index bdd998f..23178a2 100644 --- a/pyglance/glance/delta.py +++ b/pyglance/glance/delta.py @@ -7,12 +7,12 @@ Created by rayg Apr 2009. Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. """ -import os, sys, logging +import logging import numpy as np from numpy import * -from scipy.stats import pearsonr, spearmanr, pointbiserialr +from scipy.stats import pearsonr -compute_r = pearsonr #spearmanr +compute_r = pearsonr LOG = logging.getLogger(__name__) @@ -25,7 +25,8 @@ datatype_upcasts = { uint32: int64 } -def _missing(x,missing_value=None): +# TODO, where is this being used? +def _missing(x, missing_value=None): if missing_value is not None: return isnan(x) | (x==missing_value) return isnan(x) @@ -139,59 +140,6 @@ def corr(x,y,mask): return nan return compute_r(xf,yf)[0] -# TODO, should this ultimately be removed? -def rms_corr_withnoise(truth, actual, noiz, epsilon=0., (amissing,bmissing)=(None,None), plot=None): - """ compute RMS and R statistics for truth vs actual and truth+noiz vs actual - """ - x=truth - y=actual - z=noiz - d, good, _, bad, _, _, _, _ = diff(x,y,epsilon,(amissing,bmissing)) - #d,good,bad,_,_,_ = diff(x,y,epsilon,(amissing,bmissing)) - # compute RMS error - rmse = sqrt(sum(d[good]**2)) / d.size - gf = good.flatten() - # raise NotImplementedError # FIXME we're getting NaN for R - xf = x.flatten()[gf] - yf = y.flatten()[gf] - assert(sum(~isfinite(yf))==0) - assert(sum(~isfinite(xf))==0) - # compute correlation coefficient - r = compute_r(xf,yf)[0] - # create xpn, x plus noise - xpn = array(x) - xpn[good] += z[good] - xpnf = xpn.flatten()[gf] - # compute RMS error versus noise - dpn, good, _, bad, _, _, _, _ = diff(xpn,y,epsilon,(amissing,bmissing)) - #dpn,good,bad,_,_,_ = diff(xpn,y,epsilon,(amissing,bmissing)) - rmsepn = sqrt(sum(dpn[good]**2)) / d.size - assert(sum(~isfinite(xpnf))==0) - rpn = compute_r(xpnf,yf)[0] - if plot: plot(xf,xpnf,yf) - return { 'rms_error': rmse, - 'correlation': r, - 'rms_error_with_noise': rmsepn, - 'correlation_with_noise': rpn, - } - -def stats(diffData, mask, *etc): - - # if there are no values after the mask, - # we can't do any of these forms of - # statistical analysis - if sum(mask) <= 0 : - return { } - - absDiffData = abs(diffData) - rms = calculate_root_mean_square(diffData, mask) - return { 'rms_diff': rms, - 'std_diff': std(absDiffData[mask]), - 'mean_diff': mean(absDiffData[mask]), - 'median_diff': median(absDiffData[mask]), - 'max_diff': max(absDiffData[mask]) - } - def convert_mag_dir_to_U_V_vector(magnitude_data, direction_data, invalidMask=None): """ This method is intended to convert magnitude and direction data into (U, V) vector data. @@ -306,159 +254,6 @@ def calculate_root_mean_square (data, goodMask=None) : return rootMeanSquare -def _get_num_perfect(a, b, ignoreMask=None): - numPerfect = 0 - if not (ignoreMask is None) : - numPerfect = sum(a[~ignoreMask] == b[~ignoreMask]) - else : - numPerfect = sum(a == b) - return numPerfect - -def _get_nan_stats(a_nan_mask, b_nan_mask) : - """ - Get a list of statistics about non-numerical values in data sets a and b, - the return value will be a dictionary of statistics - """ - # find the nan values in the data - num_a_nans = sum(a_nan_mask) - num_b_nans = sum(b_nan_mask) - num_common_nans = sum(a_nan_mask & b_nan_mask) - - # make the assumption that a and b are the same size and only use the size of a - total_num_values = a_nan_mask.size - - nan_stats = {'a_nan_count': num_a_nans, - 'a_nan_fraction': (float(num_a_nans) / float(total_num_values)), - 'b_nan_count': num_b_nans, - 'b_nan_fraction': (float(num_b_nans) / float(total_num_values)), - 'common_nan_count': num_common_nans, - 'common_nan_fraction': (float(num_common_nans) / float(total_num_values)) - } - - return nan_stats - -def _get_missing_value_stats(a_missing_mask, b_missing_mask) : - """ - Get a list of statistics about missing data values in data sets a and b, - given masks describing the positions of the missing data (such masks - can be gotten from the diff function), - the return value will be a dictionary of statistics - """ - # calculate information about the missing values - num_a_missing = sum(a_missing_mask) - num_b_missing = sum(b_missing_mask) - num_common_missing = sum(a_missing_mask & b_missing_mask) - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_missing_mask.size - - missing_value_stats = {'a_missing_count': num_a_missing, - 'a_missing_fraction': (float(num_a_missing) / float(total_num_values)), - 'b_missing_count': num_b_missing, - 'b_missing_fraction': (float(num_b_missing) / float(total_num_values)), - 'common_missing_count': num_common_missing, - 'common_missing_fraction': (float(num_common_missing) / float(total_num_values)) - } - - return missing_value_stats - -def _get_finite_data_stats(a_is_finite_mask, b_is_finite_mask, common_ignore_mask) : - """ - Get a list of statistics about finite data values in data sets a and b, - given masks describing the positions of the finite data in each file (such masks - can be gotten from the diff function), - the return value will be a dictionary of statistics - """ - # calculate information about the finite data - num_a_finite = sum(a_is_finite_mask) - num_b_finite = sum(b_is_finite_mask) - num_common_finite = sum(a_is_finite_mask & b_is_finite_mask) - num_finite_in_only_one = sum((a_is_finite_mask ^ b_is_finite_mask) & (~ common_ignore_mask)) # use an exclusive OR - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_is_finite_mask.size - - finite_value_stats = {'a_finite_count': num_a_finite, - 'a_finite_fraction': (float(num_a_finite) / float(total_num_values)), - 'b_finite_count': num_b_finite, - 'b_finite_fraction': (float(num_b_finite) / float(total_num_values)), - 'common_finite_count': num_common_finite, - 'common_finite_fraction': (float(num_common_finite) / float(total_num_values)), - 'finite_in_only_one_count': num_finite_in_only_one, - 'finite_in_only_one_fraction': (float(num_finite_in_only_one) / float(total_num_values)), - } - - return finite_value_stats - -def _get_general_data_stats(a, b, - a_missing_value, b_missing_value, - epsilon, - spatial_ignore_in_a_mask, spatial_ignore_in_b_mask, - bad_in_a, bad_in_b - ) : - """ - Get a list of general statistics about a and b, given a and b and some other information - about them. - the return value will be a dictionary of statistics - """ - # figure out how much spatial trouble we had - num_ignored_in_a = sum(spatial_ignore_in_a_mask) - num_ignored_in_b = sum(spatial_ignore_in_b_mask) - - # get the number of data points - total_num_values = a.size - - general_stats = {'a_missing_value': a_missing_value, - 'b_missing_value': b_missing_value, - 'epsilon': epsilon, - 'max_a': max_with_mask(a, bad_in_a), - 'max_b': max_with_mask(b, bad_in_b), - 'min_a': min_with_mask(a, bad_in_a), - 'min_b': min_with_mask(b, bad_in_b), - 'num_data_points': total_num_values, - 'shape': a.shape, - 'spatially_invalid_pts_ignored_in_a': num_ignored_in_a, - 'spatially_invalid_pts_ignored_in_b': num_ignored_in_b - } - - return general_stats - -def _get_numerical_data_stats(a, b, diff_data, data_is_finite_mask, - outside_epsilon_mask, trouble_mask, - additional_statistics={}) : - """ - Get a list of numerical comparison related statistics about a and b, - given a and b and some other information about them. - the return value will be a dictionary of statistics - """ - # calculate our various statistics - num_finite_values_too_different = sum(outside_epsilon_mask) - num_perfect = _get_num_perfect(a, b, ~data_is_finite_mask) - r_corr = corr(a, b, data_is_finite_mask) - num_trouble = sum(trouble_mask) - - # we actually want the total number of _finite_ values rather than all the data - total_num_finite_values = sum(data_is_finite_mask) - - # no dividing by 0! - fraction_too_different = 0.0 - fraction_perfect = 0.0 - if total_num_finite_values > 0 : - fraction_too_different = num_finite_values_too_different / float(total_num_finite_values) - fraction_perfect = num_perfect / float(total_num_finite_values) - - comparison = { 'correlation': r_corr, - 'diff_outside_epsilon_count': num_finite_values_too_different, - 'diff_outside_epsilon_fraction': fraction_too_different, - 'perfect_match_count': num_perfect, - 'perfect_match_fraction': fraction_perfect, - 'trouble_points_count': num_trouble, - 'trouble_points_fraction': float(num_trouble) / float(a.size) - } - comparison.update(additional_statistics) - - return comparison - # get the min, ignoring the stuff in mask def min_with_mask(data, mask=None) : @@ -487,40 +282,6 @@ def max_with_mask(data, mask=None) : toReturn = temp[temp.argmax()] return toReturn -def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), ignoreInAMask=None, ignoreInBMask=None): - """return dictionary of statistics dictionaries - stats not including 'nan' in name exclude nans in either arrays - """ - - diffData, finite_mask, (finite_a_mask, finite_b_mask), \ - trouble, outside_epsilon, (anfin, bnfin), \ - (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = diff(a, b, epsilon, - (a_missing_value, b_missing_value), - (ignoreInAMask, ignoreInBMask)) - ''' - d, valid_mask, trouble, (anfin, bnfin), (amis, bmis), outside_epsilon = nfo = diff(a,b, - epsilon, - (a_missing_value, b_missing_value), - (ignoreInAMask, ignoreInBMask)) - ''' - - general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, - ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) - additional_statistics = stats(*nfo) # grab some additional comparison statistics - comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) - nan_stats = _get_nan_stats(anfin, bnfin) - missing_stats = _get_missing_value_stats(amis, bmis) - finite_stats = _get_finite_data_stats(finite_a_mask, finite_b_mask, (ignoreInAMask | ignoreInBMask)) - - out = {} - out['NaN Statistics'] = nan_stats - out['Missing Value Statistics'] = missing_stats - out['Finite Data Statistics'] = finite_stats - out['Numerical Comparison Statistics'] = comparison_stats - out['General Statistics'] = general_stats - - return out - def create_colocation_mapping_within_epsilon((alongitude, alatitude), (blongitude, blatitude), lonlatEpsilon, @@ -832,75 +593,6 @@ def create_colocated_data_with_lon_lat_colocation(listOfColocatedALonLat, listOf (unmatchedAPoints, unmatchedALongitude, unmatchedALatitude), \ (unmatchedBPoints, unmatchedBLongitude, unmatchedBLatitude) -STATISTICS_DOC = { 'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " + - "both finite and not, unless otherwise specified", - - # general statistics - 'a_missing_value': 'the value that is considered \"missing\" data when it is found in A', - 'b_missing_value': 'the value that is considered \"missing\" data when it is found in B', - 'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable', - 'max_a': 'the maximum finite, non-missing value found in A', - 'max_b': 'the maximum finite, non-missing value found in B', - 'min_a': 'the minimum finite, non-missing value found in A', - 'min_b': 'the minimum finite, non-missing value found in B', - 'num_data_points': "number of data values in A", - 'shape': "shape of A", - 'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' + - ' ignored for the purposes of data analysis and presentation', - 'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' + - ' ignored for the purposes of data analysis and presentation', - - # finite data stats descriptions - 'a_finite_count': "number of finite values in A", - 'a_finite_fraction': "fraction of finite values in A (out of all data points in A)", - 'b_finite_count': "number of finite values in B", - 'b_finite_fraction': "fraction of finite values in B (out of all data points in B)", - 'common_finite_count': "number of finite values in common between A and B", - 'common_finite_fraction': "fraction of finite values in common between A and B", - 'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " + - "only the common spatially valid area is considerd for this statistic", - 'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " + - "only the common spatially valid area is considerd for this statistic", - - # missing data value statistics - 'a_missing_count': "number of values flagged missing in A", - 'a_missing_fraction': "fraction of values flagged missing in A", - 'b_missing_count': "number of values flagged missing in B", - 'b_missing_fraction': "fraction of values flagged missing in B", - 'common_missing_count': "number of missing values in common between A and B", - 'common_missing_fraction': "fraction of missing values in common between A and B", - - # NaN related statistics - 'a_nan_count': "number of NaNs in A", - 'a_nan_fraction': "fraction of NaNs in A", - 'b_nan_count': "number of NaNs in B", - 'b_nan_fraction': "fraction of NaNs in B", - 'common_nan_count': "number of NaNs in common between A and B", - 'common_nan_fraction': "fraction of NaNs in common between A and B", - - # Numerical comparison statistics - 'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B", - 'diff_outside_epsilon_count': "number of finite differences falling outside epsilon", - 'diff_outside_epsilon_fraction': "fraction of finite differences falling outside epsilon (out of common_finite_count)", - 'max_diff': "Maximum difference of finite values", - 'mean_diff': "mean difference of finite values", - 'median_diff': "median difference of finite values", - 'perfect_match_count': "number of perfectly matched finite data points between A and B", - 'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)", - 'rms_diff': "root mean square (RMS) difference of finite values", - 'std_diff': "standard deviation of difference of finite values", - 'trouble_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' + - ' or are unacceptable when compared according to the current epsilon value', - 'trouble_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' + - ' or are unacceptable when compared according to the current epsilon value', - - # note: the statistics described below may no longer be generated? - 'mean_percent_change': "Percent change from A to B for finite values, averaged", - 'max_percent_change': "Percent change from A to B for finite values, maximum value" - - } -STATISTICS_DOC_STR = '\n'.join( '%s:\n %s' % x for x in sorted(list(STATISTICS_DOC.items())) ) + '\n' - if __name__=='__main__': import doctest doctest.testmod() diff --git a/pyglance/glance/figures.py b/pyglance/glance/figures.py index 45edc65..ea3b8c3 100644 --- a/pyglance/glance/figures.py +++ b/pyglance/glance/figures.py @@ -13,21 +13,18 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine from pylab import * -from mpl_toolkits.mplot3d import axes3d import matplotlib.pyplot as plt -from matplotlib import cm import matplotlib.colors as colors from matplotlib.ticker import FormatStrFormatter -from PIL import Image - -import os, sys, logging +import logging import numpy as np from numpy import ma import glance.graphics as maps import glance.delta as delta import glance.report as report +import glance.stats as statistics LOG = logging.getLogger(__name__) @@ -268,10 +265,10 @@ def create_histogram(data, bins, title, xLabel, yLabel, displayStats=False) : if displayStats : # info on the basic stats tempMask = ones(data.shape,dtype=bool) - tempStats = delta.stats(data, tempMask, None) + tempStats = statistics.stats(data, tempMask, None) medianVal = tempStats['median_diff'] - meanVal = tempStats['mean_diff'] - stdVal = tempStats['std_diff'] + meanVal = tempStats['mean_diff'] + stdVal = tempStats['std_diff'] numPts = data.size # info on the display of our statistics diff --git a/pyglance/glance/filters.py b/pyglance/glance/filters.py index 2c550b3..c7ee4ae 100644 --- a/pyglance/glance/filters.py +++ b/pyglance/glance/filters.py @@ -7,7 +7,7 @@ Created by Eva Schiffer August 2009. Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. """ -import numpy as np +import numpy as np def trim_off_of_top (data, num_elements_to_trim) : """ diff --git a/pyglance/glance/io.py b/pyglance/glance/io.py index eb112fd..de5ecec 100644 --- a/pyglance/glance/io.py +++ b/pyglance/glance/io.py @@ -7,7 +7,7 @@ Created by rayg Apr 2009. Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. """ -import os, sys, logging +import os, logging from pyhdf.SD import SD,SDC, SDS, HDF4Error try: diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py index 08eac02..6cfa540 100644 --- a/pyglance/glance/plot.py +++ b/pyglance/glance/plot.py @@ -13,21 +13,15 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine from pylab import * -from mpl_toolkits.mplot3d import axes3d import matplotlib.pyplot as plt -from matplotlib import cm -import matplotlib.colors as colors -from matplotlib.ticker import FormatStrFormatter from PIL import Image import os, sys, logging import numpy as np -from numpy import ma import glance.graphics as maps import glance.delta as delta -import glance.report as report import glance.figures as figures import glance.plotcreatefns as plotfns diff --git a/pyglance/glance/plotcreatefns.py b/pyglance/glance/plotcreatefns.py index a649061..a03ee87 100644 --- a/pyglance/glance/plotcreatefns.py +++ b/pyglance/glance/plotcreatefns.py @@ -13,23 +13,15 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine from pylab import * -from mpl_toolkits.mplot3d import axes3d -import matplotlib.pyplot as plt -from matplotlib import cm import matplotlib.colors as colors -from matplotlib.ticker import FormatStrFormatter -from PIL import Image - -import os, sys, logging +import logging import random as random import numpy as np from numpy import ma import glance.graphics as maps import glance.delta as delta -import glance.report as report -import glance.filters as filters # TODO, should this be moved? import glance.figures as figures LOG = logging.getLogger(__name__) diff --git a/pyglance/glance/report.py b/pyglance/glance/report.py index ae0e618..773eef2 100644 --- a/pyglance/glance/report.py +++ b/pyglance/glance/report.py @@ -7,9 +7,9 @@ Created by rayg Apr 2009. Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. """ -import os, sys, logging +import sys, logging -from pkg_resources import resource_string, resource_stream, resource_filename +from pkg_resources import resource_string, resource_filename #, resource_stream from mako.template import Template import types as types import numpy as np diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py new file mode 100644 index 0000000..c9e754f --- /dev/null +++ b/pyglance/glance/stats.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" +This module handles statistical analysis of data sets. The code present in +this module is based on previous versions of delta.py. + +Created by evas Apr 2010. +Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved. +""" + +#import glance.data as data +import glance.delta as delta +#from glance.data import MaskSetObject + +import numpy as np + +# --------------------- general statistics methods ------------------ + +def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), ignoreInAMask=None, ignoreInBMask=None): + """return dictionary of statistics dictionaries + stats not including 'nan' in name exclude nans in either arrays + """ + + diffData, finite_mask, (finite_a_mask, finite_b_mask), \ + trouble, outside_epsilon, (anfin, bnfin), \ + (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = delta.diff(a, b, epsilon, + (a_missing_value, b_missing_value), + (ignoreInAMask, ignoreInBMask)) + ''' + d, valid_mask, trouble, (anfin, bnfin), (amis, bmis), outside_epsilon = nfo = diff(a,b, + epsilon, + (a_missing_value, b_missing_value), + (ignoreInAMask, ignoreInBMask)) + ''' + + general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, + ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) + additional_statistics = stats(*nfo) # grab some additional comparison statistics + comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) + nan_stats = _get_nan_stats(anfin, bnfin) + missing_stats = _get_missing_value_stats(amis, bmis) + finite_stats = _get_finite_data_stats(finite_a_mask, finite_b_mask, (ignoreInAMask | ignoreInBMask)) + + out = {} + out['NaN Statistics'] = nan_stats + out['Missing Value Statistics'] = missing_stats + out['Finite Data Statistics'] = finite_stats + out['Numerical Comparison Statistics'] = comparison_stats + out['General Statistics'] = general_stats + + return out + +def stats(diffData, mask, *etc): + + # if there are no values after the mask, + # we can't do any of these forms of + # statistical analysis + if np.sum(mask) <= 0 : + return { } + + absDiffData = abs(diffData) + root_mean_square_value = delta.calculate_root_mean_square(diffData, mask) + return { 'rms_diff': root_mean_square_value, + 'std_diff': np.std(absDiffData[mask]), + 'mean_diff': np.mean(absDiffData[mask]), + 'median_diff': np.median(absDiffData[mask]), + 'max_diff': np.max(absDiffData[mask]) + } + +def _get_num_perfect(a, b, ignoreMask=None): + numPerfect = 0 + if not (ignoreMask is None) : + numPerfect = np.sum(a[~ignoreMask] == b[~ignoreMask]) + else : + numPerfect = np.sum(a == b) + return numPerfect + +def _get_numerical_data_stats(a, b, diff_data, data_is_finite_mask, + outside_epsilon_mask, trouble_mask, + additional_statistics={}) : + """ + Get a list of numerical comparison related statistics about a and b, + given a and b and some other information about them. + the return value will be a dictionary of statistics + """ + # calculate our various statistics + num_finite_values_too_different = np.sum(outside_epsilon_mask) + num_perfect = _get_num_perfect(a, b, ~data_is_finite_mask) + r_corr = delta.corr(a, b, data_is_finite_mask) + num_trouble = np.sum(trouble_mask) + + # we actually want the total number of _finite_ values rather than all the data + total_num_finite_values = np.sum(data_is_finite_mask) + + # no dividing by 0! + fraction_too_different = 0.0 + fraction_perfect = 0.0 + if total_num_finite_values > 0 : + fraction_too_different = num_finite_values_too_different / float(total_num_finite_values) + fraction_perfect = num_perfect / float(total_num_finite_values) + + comparison = { 'correlation': r_corr, + 'diff_outside_epsilon_count': num_finite_values_too_different, + 'diff_outside_epsilon_fraction': fraction_too_different, + 'perfect_match_count': num_perfect, + 'perfect_match_fraction': fraction_perfect, + 'trouble_points_count': num_trouble, + 'trouble_points_fraction': float(num_trouble) / float(a.size) + } + comparison.update(additional_statistics) + + return comparison + +def _get_general_data_stats(a, b, + a_missing_value, b_missing_value, + epsilon, + spatial_ignore_in_a_mask, spatial_ignore_in_b_mask, + bad_in_a, bad_in_b + ) : + """ + Get a list of general statistics about a and b, given a and b and some other information + about them. + the return value will be a dictionary of statistics + """ + # figure out how much spatial trouble we had + num_ignored_in_a = np.sum(spatial_ignore_in_a_mask) + num_ignored_in_b = np.sum(spatial_ignore_in_b_mask) + + # get the number of data points + total_num_values = a.size + + general_stats = {'a_missing_value': a_missing_value, + 'b_missing_value': b_missing_value, + 'epsilon': epsilon, + 'max_a': delta.max_with_mask(a, bad_in_a), + 'max_b': delta.max_with_mask(b, bad_in_b), + 'min_a': delta.min_with_mask(a, bad_in_a), + 'min_b': delta.min_with_mask(b, bad_in_b), + 'num_data_points': total_num_values, + 'shape': a.shape, + 'spatially_invalid_pts_ignored_in_a': num_ignored_in_a, + 'spatially_invalid_pts_ignored_in_b': num_ignored_in_b + } + + return general_stats + +def _get_finite_data_stats(a_is_finite_mask, b_is_finite_mask, common_ignore_mask) : + """ + Get a list of statistics about finite data values in data sets a and b, + given masks describing the positions of the finite data in each file (such masks + can be gotten from the diff function), + the return value will be a dictionary of statistics + """ + # calculate information about the finite data + num_a_finite = np.sum(a_is_finite_mask) + num_b_finite = np.sum(b_is_finite_mask) + num_common_finite = np.sum(a_is_finite_mask & b_is_finite_mask) + num_finite_in_only_one = np.sum((a_is_finite_mask ^ b_is_finite_mask) & (~ common_ignore_mask)) # use an exclusive OR + + # make the assumption that a and b are the same size and only use the size of a's mask + total_num_values = a_is_finite_mask.size + + finite_value_stats = {'a_finite_count': num_a_finite, + 'a_finite_fraction': (float(num_a_finite) / float(total_num_values)), + 'b_finite_count': num_b_finite, + 'b_finite_fraction': (float(num_b_finite) / float(total_num_values)), + 'common_finite_count': num_common_finite, + 'common_finite_fraction': (float(num_common_finite) / float(total_num_values)), + 'finite_in_only_one_count': num_finite_in_only_one, + 'finite_in_only_one_fraction': (float(num_finite_in_only_one) / float(total_num_values)), + } + + return finite_value_stats + +def _get_missing_value_stats(a_missing_mask, b_missing_mask) : + """ + Get a list of statistics about missing data values in data sets a and b, + given masks describing the positions of the missing data (such masks + can be gotten from the diff function), + the return value will be a dictionary of statistics + """ + # calculate information about the missing values + num_a_missing = np.sum(a_missing_mask) + num_b_missing = np.sum(b_missing_mask) + num_common_missing = np.sum(a_missing_mask & b_missing_mask) + + # make the assumption that a and b are the same size and only use the size of a's mask + total_num_values = a_missing_mask.size + + missing_value_stats = {'a_missing_count': num_a_missing, + 'a_missing_fraction': (float(num_a_missing) / float(total_num_values)), + 'b_missing_count': num_b_missing, + 'b_missing_fraction': (float(num_b_missing) / float(total_num_values)), + 'common_missing_count': num_common_missing, + 'common_missing_fraction': (float(num_common_missing) / float(total_num_values)) + } + + return missing_value_stats + +def _get_nan_stats(a_nan_mask, b_nan_mask) : + """ + Get a list of statistics about non-numerical values in data sets a and b, + the return value will be a dictionary of statistics + """ + # find the nan values in the data + num_a_nans = np.sum(a_nan_mask) + num_b_nans = np.sum(b_nan_mask) + num_common_nans = np.sum(a_nan_mask & b_nan_mask) + + # make the assumption that a and b are the same size and only use the size of a + total_num_values = a_nan_mask.size + + nan_stats = {'a_nan_count': num_a_nans, + 'a_nan_fraction': (float(num_a_nans) / float(total_num_values)), + 'b_nan_count': num_b_nans, + 'b_nan_fraction': (float(num_b_nans) / float(total_num_values)), + 'common_nan_count': num_common_nans, + 'common_nan_fraction': (float(num_common_nans) / float(total_num_values)) + } + + return nan_stats + + + +# -------------------------- documentation ----------------------------- + +STATISTICS_DOC = { 'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " + + "both finite and not, unless otherwise specified", + + # general statistics + 'a_missing_value': 'the value that is considered \"missing\" data when it is found in A', + 'b_missing_value': 'the value that is considered \"missing\" data when it is found in B', + 'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable', + 'max_a': 'the maximum finite, non-missing value found in A', + 'max_b': 'the maximum finite, non-missing value found in B', + 'min_a': 'the minimum finite, non-missing value found in A', + 'min_b': 'the minimum finite, non-missing value found in B', + 'num_data_points': "number of data values in A", + 'shape': "shape of A", + 'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' + + ' ignored for the purposes of data analysis and presentation', + 'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' + + ' ignored for the purposes of data analysis and presentation', + + # finite data stats descriptions + 'a_finite_count': "number of finite values in A", + 'a_finite_fraction': "fraction of finite values in A (out of all data points in A)", + 'b_finite_count': "number of finite values in B", + 'b_finite_fraction': "fraction of finite values in B (out of all data points in B)", + 'common_finite_count': "number of finite values in common between A and B", + 'common_finite_fraction': "fraction of finite values in common between A and B", + 'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " + + "only the common spatially valid area is considerd for this statistic", + 'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " + + "only the common spatially valid area is considerd for this statistic", + + # missing data value statistics + 'a_missing_count': "number of values flagged missing in A", + 'a_missing_fraction': "fraction of values flagged missing in A", + 'b_missing_count': "number of values flagged missing in B", + 'b_missing_fraction': "fraction of values flagged missing in B", + 'common_missing_count': "number of missing values in common between A and B", + 'common_missing_fraction': "fraction of missing values in common between A and B", + + # NaN related statistics + 'a_nan_count': "number of NaNs in A", + 'a_nan_fraction': "fraction of NaNs in A", + 'b_nan_count': "number of NaNs in B", + 'b_nan_fraction': "fraction of NaNs in B", + 'common_nan_count': "number of NaNs in common between A and B", + 'common_nan_fraction': "fraction of NaNs in common between A and B", + + # Numerical comparison statistics + 'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B", + 'diff_outside_epsilon_count': "number of finite differences falling outside epsilon", + 'diff_outside_epsilon_fraction': "fraction of finite differences falling outside epsilon (out of common_finite_count)", + 'max_diff': "Maximum difference of finite values", + 'mean_diff': "mean difference of finite values", + 'median_diff': "median difference of finite values", + 'perfect_match_count': "number of perfectly matched finite data points between A and B", + 'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)", + 'rms_diff': "root mean square (RMS) difference of finite values", + 'std_diff': "standard deviation of difference of finite values", + 'trouble_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' + + ' or are unacceptable when compared according to the current epsilon value', + 'trouble_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' + + ' or are unacceptable when compared according to the current epsilon value', + + # note: the statistics described below may no longer be generated? + 'mean_percent_change': "Percent change from A to B for finite values, averaged", + 'max_percent_change': "Percent change from A to B for finite values, maximum value" + + } +STATISTICS_DOC_STR = '\n'.join( '%s:\n %s' % x for x in sorted(list(STATISTICS_DOC.items())) ) + '\n' + +if __name__=='__main__': + import doctest + doctest.testmod() diff --git a/pyglance/glance/variablereport.txt b/pyglance/glance/variablereport.txt index b135063..360209d 100644 --- a/pyglance/glance/variablereport.txt +++ b/pyglance/glance/variablereport.txt @@ -6,7 +6,6 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. </%doc> <%! - import types as types import glance.report as report %> -- GitLab