From cf6b6e5d965aa5d66496eec5ffb5f918d29f087e Mon Sep 17 00:00:00 2001
From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01>
Date: Mon, 17 May 2010 20:05:00 +0000
Subject: [PATCH] moving statistics functions to a separate module

git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@108 8a9318a1-56ba-4d59-b755-99d26321be01
---
 pyglance/glance/compare.py         |  71 ++-----
 pyglance/glance/delta.py           | 318 +----------------------------
 pyglance/glance/figures.py         |  13 +-
 pyglance/glance/filters.py         |   2 +-
 pyglance/glance/io.py              |   2 +-
 pyglance/glance/plot.py            |   6 -
 pyglance/glance/plotcreatefns.py   |  10 +-
 pyglance/glance/report.py          |   4 +-
 pyglance/glance/stats.py           | 298 +++++++++++++++++++++++++++
 pyglance/glance/variablereport.txt |   1 -
 10 files changed, 328 insertions(+), 397 deletions(-)
 create mode 100644 pyglance/glance/stats.py

diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py
index 24ae328..1cd3af2 100644
--- a/pyglance/glance/compare.py
+++ b/pyglance/glance/compare.py
@@ -11,19 +11,18 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 
 import os, sys, logging, re, subprocess, datetime
 import imp as imp
-from pprint import pprint, pformat
 from numpy import *
 import pkg_resources
 from pycdf import CDFError
 from subprocess import check_call as sh
+from urllib import quote
 
-import glance.io as io
-import glance.delta as delta
-import glance.plot as plot
-import glance.plotcreatefns as plotcreate
+import glance.io     as io
+import glance.delta  as delta
+import glance.plot   as plot
 import glance.report as report
-
-from urllib import quote
+import glance.stats  as statistics
+import glance.plotcreatefns as plotcreate
 
 LOG = logging.getLogger(__name__)
 
@@ -1146,11 +1145,11 @@ def reportGen_library_call (a_path, b_path, var_list=[ ],
             if not do_not_test_with_lon_lat :
                 mask_a_to_use = lon_lat_data['a']['inv_mask']
                 mask_b_to_use = lon_lat_data['b']['inv_mask']
-            variable_stats = delta.summarize(aData, bData,
-                                             varRunInfo['epsilon'],
-                                             (varRunInfo['missing_value'],
-                                             varRunInfo['missing_value_alt_in_b']),
-                                             mask_a_to_use, mask_b_to_use)
+            variable_stats = statistics.summarize(aData, bData,
+                                                  varRunInfo['epsilon'],
+                                                  (varRunInfo['missing_value'],
+                                                   varRunInfo['missing_value_alt_in_b']),
+                                                  mask_a_to_use, mask_b_to_use)
             
             # add a little additional info to our variable run info before we squirrel it away
             varRunInfo['time'] = datetime.datetime.ctime(datetime.datetime.now())  # todo is this needed?
@@ -1299,7 +1298,7 @@ def reportGen_library_call (a_path, b_path, var_list=[ ],
         
         # make the glossary
         print ('generating glossary')
-        report.generate_and_save_doc_page(delta.STATISTICS_DOC, pathsTemp['out'])
+        report.generate_and_save_doc_page(statistics.STATISTICS_DOC, pathsTemp['out'])
     
     return
 
@@ -1348,17 +1347,17 @@ def stats_library_call(afn, bfn, var_list=[ ],
         print >> output_channel, '-'*32
         print >> output_channel, name
         print >> output_channel, '' 
-        lal = list(delta.summarize(aval,bval,epsilon,(amiss,bmiss)).items()) 
+        lal = list(statistics.summarize(aval,bval,epsilon,(amiss,bmiss)).items()) 
         lal.sort()
         for dictionary_title, dict_data in lal:
             print >> output_channel, '%s' %  dictionary_title
             dict_data
             for each_stat in sorted(list(dict_data)):
                 print >> output_channel, '  %s: %s' % (each_stat, dict_data[each_stat])
-                if doc_each: print >> output_channel, ('    ' + delta.STATISTICS_DOC[each_stat])
+                if doc_each: print >> output_channel, ('    ' + statistics.STATISTICS_DOC[each_stat])
             print >> output_channel, '' 
     if doc_atend:
-        print >> output_channel, ('\n\n' + delta.STATISTICS_DOC_STR)
+        print >> output_channel, ('\n\n' + statistics.STATISTICS_DOC_STR)
 
 def main():
     import optparse
@@ -1467,46 +1466,6 @@ python -m glance
         plot.compare_spectra(bspc,aspc)
         plot.show()
     
-    def noisecheck(*args):
-        """gives statistics for dataset comparisons against truth with and without noise
-        usage: noisecheck truth-file noise-file actual-file variable1{:epsilon{:missing}} {variable2...}
-        glance noisecheck /Volumes/snaapy/data/justins/abi_graffir/coreg/pure/l2_data/geocatL2.GOES-R.2005155.220000.hdf.gz /Volumes/snaapy/data/justins/abi_graffir/noise/noise1x/l2_data/geocatL2.GOES-R.2005155.220000.hdf 
-        """ # TODO ******* standardize with method?
-        afn,noizfn,bfn = args[:3]
-        LOG.info("opening truth file %s" % afn)
-        a = io.open(afn)
-        LOG.info("opening actual file %s" % noizfn)
-        noiz = io.open(noizfn)
-        LOG.info("opening noise file %s" % bfn)
-        b = io.open(bfn)
-        
-        anames = set(a())
-        bnames = set(b()) 
-        cnames = anames.intersection(bnames) # common names
-        pats = args[3:] or ['.*']
-        names = _parse_varnames( cnames, pats, options.epsilon, options.missing )
-        for name,epsilon,missing in names:
-            aData = a[name]
-            bData = b[name]
-            nData = noiz[name]
-            if missing is None:
-                amiss = a.missing_value(name)
-                bmiss = b.missing_value(name)
-            else:
-                amiss,bmiss = missing,missing
-            x = aData
-            y = bData
-            z = nData
-            def scat(x,xn,y):
-                from pylab import plot,show,scatter
-                scatter(x,y)
-                show()
-            nfo = delta.rms_corr_withnoise(x,y,z,epsilon,(amiss,bmiss),plot=scat)
-            print '-'*32
-            print name
-            for kv in sorted(nfo.items()):
-                print '  %s: %s' % kv
-    
     def stats(*args):
         """create statistics summary of variables
         Summarize difference statistics between listed variables.
diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py
index bdd998f..23178a2 100644
--- a/pyglance/glance/delta.py
+++ b/pyglance/glance/delta.py
@@ -7,12 +7,12 @@ Created by rayg Apr 2009.
 Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 """
 
-import os, sys, logging
+import logging
 import numpy as np
 from numpy import *
-from scipy.stats import pearsonr, spearmanr, pointbiserialr
+from scipy.stats import pearsonr
 
-compute_r = pearsonr #spearmanr
+compute_r = pearsonr
 
 LOG = logging.getLogger(__name__)
 
@@ -25,7 +25,8 @@ datatype_upcasts = {
     uint32: int64
     }
 
-def _missing(x,missing_value=None):
+# TODO, where is this being used?
+def _missing(x, missing_value=None):
     if missing_value is not None:
         return isnan(x) | (x==missing_value)
     return isnan(x)
@@ -139,59 +140,6 @@ def corr(x,y,mask):
         return nan
     return compute_r(xf,yf)[0]
 
-# TODO, should this ultimately be removed?
-def rms_corr_withnoise(truth, actual, noiz, epsilon=0., (amissing,bmissing)=(None,None), plot=None):
-    """ compute RMS and R statistics for truth vs actual and truth+noiz vs actual
-    """
-    x=truth
-    y=actual
-    z=noiz
-    d, good, _, bad, _, _, _, _ = diff(x,y,epsilon,(amissing,bmissing))
-    #d,good,bad,_,_,_ = diff(x,y,epsilon,(amissing,bmissing))
-    # compute RMS error
-    rmse = sqrt(sum(d[good]**2)) / d.size
-    gf = good.flatten()
-    # raise NotImplementedError # FIXME we're getting NaN for R
-    xf = x.flatten()[gf]
-    yf = y.flatten()[gf]
-    assert(sum(~isfinite(yf))==0)
-    assert(sum(~isfinite(xf))==0)
-    # compute correlation coefficient
-    r = compute_r(xf,yf)[0]
-    # create xpn, x plus noise
-    xpn = array(x)
-    xpn[good] += z[good]
-    xpnf = xpn.flatten()[gf]
-    # compute RMS error versus noise
-    dpn, good, _, bad, _, _, _, _ = diff(xpn,y,epsilon,(amissing,bmissing))
-    #dpn,good,bad,_,_,_ = diff(xpn,y,epsilon,(amissing,bmissing))
-    rmsepn = sqrt(sum(dpn[good]**2)) / d.size
-    assert(sum(~isfinite(xpnf))==0)
-    rpn = compute_r(xpnf,yf)[0]
-    if plot: plot(xf,xpnf,yf)
-    return { 'rms_error': rmse,
-             'correlation': r,
-             'rms_error_with_noise': rmsepn,
-             'correlation_with_noise': rpn,
-             }
-
-def stats(diffData, mask, *etc):
-    
-    # if there are no values after the mask,
-    # we can't do any of these forms of
-    # statistical analysis
-    if sum(mask) <= 0 :
-        return { }
-    
-    absDiffData = abs(diffData)
-    rms = calculate_root_mean_square(diffData, mask)
-    return {    'rms_diff': rms, 
-                'std_diff': std(absDiffData[mask]), 
-                'mean_diff': mean(absDiffData[mask]), 
-                'median_diff': median(absDiffData[mask]),
-                'max_diff': max(absDiffData[mask])
-                }
-
 def convert_mag_dir_to_U_V_vector(magnitude_data, direction_data, invalidMask=None):
     """
     This method is intended to convert magnitude and direction data into (U, V) vector data.
@@ -306,159 +254,6 @@ def calculate_root_mean_square (data, goodMask=None) :
     
     return rootMeanSquare
 
-def _get_num_perfect(a, b, ignoreMask=None):
-    numPerfect = 0
-    if not (ignoreMask is None) :
-        numPerfect = sum(a[~ignoreMask] == b[~ignoreMask])
-    else :
-        numPerfect = sum(a == b)
-    return numPerfect
-
-def _get_nan_stats(a_nan_mask, b_nan_mask) :
-    """
-    Get a list of statistics about non-numerical values in data sets a and b,
-    the return value will be a dictionary of statistics
-    """
-    # find the nan values in the data
-    num_a_nans = sum(a_nan_mask)
-    num_b_nans = sum(b_nan_mask)
-    num_common_nans = sum(a_nan_mask & b_nan_mask)
-    
-    # make the assumption that a and b are the same size and only use the size of a
-    total_num_values = a_nan_mask.size
-    
-    nan_stats = {'a_nan_count': num_a_nans,
-                 'a_nan_fraction': (float(num_a_nans) / float(total_num_values)),
-                 'b_nan_count': num_b_nans,
-                 'b_nan_fraction': (float(num_b_nans) / float(total_num_values)),
-                 'common_nan_count': num_common_nans,
-                 'common_nan_fraction': (float(num_common_nans) / float(total_num_values))
-                 }
-    
-    return nan_stats
-    
-def _get_missing_value_stats(a_missing_mask, b_missing_mask) :
-    """
-    Get a list of statistics about missing data values in data sets a and b,
-    given masks describing the positions of the missing data (such masks
-    can be gotten from the diff function), 
-    the return value will be a dictionary of statistics
-    """
-    # calculate information about the missing values
-    num_a_missing = sum(a_missing_mask)
-    num_b_missing = sum(b_missing_mask)
-    num_common_missing = sum(a_missing_mask & b_missing_mask)
-    
-    # make the assumption that a and b are the same size and only use the size of a's mask
-    total_num_values = a_missing_mask.size
-    
-    missing_value_stats = {'a_missing_count': num_a_missing,
-                           'a_missing_fraction': (float(num_a_missing) / float(total_num_values)),
-                           'b_missing_count': num_b_missing,
-                           'b_missing_fraction': (float(num_b_missing) / float(total_num_values)),
-                           'common_missing_count': num_common_missing,
-                           'common_missing_fraction': (float(num_common_missing) / float(total_num_values))
-                           }
-    
-    return missing_value_stats
-    
-def _get_finite_data_stats(a_is_finite_mask, b_is_finite_mask, common_ignore_mask) :
-    """
-    Get a list of statistics about finite data values in data sets a and b,
-    given masks describing the positions of the finite data in each file (such masks
-    can be gotten from the diff function),
-    the return value will be a dictionary of statistics
-    """
-    # calculate information about the finite data
-    num_a_finite = sum(a_is_finite_mask)
-    num_b_finite = sum(b_is_finite_mask)
-    num_common_finite = sum(a_is_finite_mask & b_is_finite_mask)
-    num_finite_in_only_one = sum((a_is_finite_mask ^ b_is_finite_mask) & (~ common_ignore_mask)) # use an exclusive OR
-    
-    # make the assumption that a and b are the same size and only use the size of a's mask
-    total_num_values = a_is_finite_mask.size
-    
-    finite_value_stats = {'a_finite_count': num_a_finite,
-                          'a_finite_fraction': (float(num_a_finite) / float(total_num_values)),
-                          'b_finite_count': num_b_finite,
-                          'b_finite_fraction': (float(num_b_finite) / float(total_num_values)),
-                          'common_finite_count': num_common_finite,
-                          'common_finite_fraction': (float(num_common_finite) / float(total_num_values)),
-                          'finite_in_only_one_count': num_finite_in_only_one,
-                          'finite_in_only_one_fraction': (float(num_finite_in_only_one) / float(total_num_values)),
-                          }
-    
-    return finite_value_stats
-
-def _get_general_data_stats(a, b,
-                            a_missing_value, b_missing_value,
-                            epsilon, 
-                            spatial_ignore_in_a_mask, spatial_ignore_in_b_mask,
-                            bad_in_a, bad_in_b
-                            ) :
-    """
-    Get a list of general statistics about a and b, given a and b and some other information
-    about them.
-    the return value will be a dictionary of statistics
-    """
-    # figure out how much spatial trouble we had
-    num_ignored_in_a = sum(spatial_ignore_in_a_mask)
-    num_ignored_in_b = sum(spatial_ignore_in_b_mask)
-    
-    # get the number of data points
-    total_num_values = a.size
-    
-    general_stats = {'a_missing_value': a_missing_value,
-                     'b_missing_value': b_missing_value,
-                     'epsilon': epsilon,
-                     'max_a': max_with_mask(a, bad_in_a),
-                     'max_b': max_with_mask(b, bad_in_b),
-                     'min_a': min_with_mask(a, bad_in_a),
-                     'min_b': min_with_mask(b, bad_in_b),
-                     'num_data_points': total_num_values,
-                     'shape': a.shape,
-                     'spatially_invalid_pts_ignored_in_a': num_ignored_in_a,
-                     'spatially_invalid_pts_ignored_in_b': num_ignored_in_b
-                     }
-    
-    return general_stats
-
-def _get_numerical_data_stats(a, b, diff_data,  data_is_finite_mask,
-                              outside_epsilon_mask, trouble_mask,
-                              additional_statistics={}) : 
-    """
-    Get a list of numerical comparison related statistics about a and b,
-    given a and b and some other information about them.
-    the return value will be a dictionary of statistics
-    """
-    # calculate our various statistics
-    num_finite_values_too_different = sum(outside_epsilon_mask)
-    num_perfect = _get_num_perfect(a, b, ~data_is_finite_mask)
-    r_corr = corr(a, b, data_is_finite_mask)
-    num_trouble = sum(trouble_mask)
-    
-    # we actually want the total number of _finite_ values rather than all the data
-    total_num_finite_values = sum(data_is_finite_mask)
-    
-    # no dividing by 0!
-    fraction_too_different = 0.0
-    fraction_perfect = 0.0
-    if total_num_finite_values > 0 :
-        fraction_too_different = num_finite_values_too_different / float(total_num_finite_values)
-        fraction_perfect = num_perfect / float(total_num_finite_values)
-    
-    comparison = {  'correlation': r_corr,
-                    'diff_outside_epsilon_count': num_finite_values_too_different,
-                    'diff_outside_epsilon_fraction': fraction_too_different,
-                    'perfect_match_count': num_perfect,
-                    'perfect_match_fraction': fraction_perfect,
-                     'trouble_points_count': num_trouble, 
-                     'trouble_points_fraction': float(num_trouble) / float(a.size)
-                    }
-    comparison.update(additional_statistics)
-    
-    return comparison
-
 # get the min, ignoring the stuff in mask
 def min_with_mask(data, mask=None) :
     
@@ -487,40 +282,6 @@ def max_with_mask(data, mask=None) :
         toReturn = temp[temp.argmax()]
     return toReturn
 
-def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), ignoreInAMask=None, ignoreInBMask=None):
-    """return dictionary of statistics dictionaries
-    stats not including 'nan' in name exclude nans in either arrays
-    """
-    
-    diffData, finite_mask, (finite_a_mask, finite_b_mask), \
-    trouble, outside_epsilon, (anfin, bnfin), \
-    (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = diff(a, b, epsilon,
-                                                              (a_missing_value, b_missing_value),
-                                                              (ignoreInAMask, ignoreInBMask))
-    '''
-    d, valid_mask, trouble, (anfin, bnfin), (amis, bmis), outside_epsilon = nfo = diff(a,b,
-                                                                                       epsilon,
-                                                                                       (a_missing_value, b_missing_value),
-                                                                                       (ignoreInAMask, ignoreInBMask))
-                                                                                       '''
-    
-    general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, 
-                                            ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) 
-    additional_statistics = stats(*nfo) # grab some additional comparison statistics 
-    comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) 
-    nan_stats = _get_nan_stats(anfin, bnfin)
-    missing_stats = _get_missing_value_stats(amis, bmis)
-    finite_stats = _get_finite_data_stats(finite_a_mask, finite_b_mask, (ignoreInAMask | ignoreInBMask)) 
-    
-    out = {}
-    out['NaN Statistics'] = nan_stats
-    out['Missing Value Statistics'] = missing_stats
-    out['Finite Data Statistics'] = finite_stats
-    out['Numerical Comparison Statistics'] = comparison_stats
-    out['General Statistics'] = general_stats
-    
-    return out
-
 def create_colocation_mapping_within_epsilon((alongitude, alatitude),
                                              (blongitude, blatitude),
                                              lonlatEpsilon,
@@ -832,75 +593,6 @@ def create_colocated_data_with_lon_lat_colocation(listOfColocatedALonLat, listOf
            (unmatchedAPoints, unmatchedALongitude, unmatchedALatitude), \
            (unmatchedBPoints, unmatchedBLongitude, unmatchedBLatitude)
 
-STATISTICS_DOC = {  'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " +
-                               "both finite and not, unless otherwise specified",
-                    
-                    # general statistics
-                    'a_missing_value': 'the value that is considered \"missing\" data when it is found in A',
-                    'b_missing_value': 'the value that is considered \"missing\" data when it is found in B',
-                    'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable',
-                    'max_a': 'the maximum finite, non-missing value found in A',
-                    'max_b': 'the maximum finite, non-missing value found in B',
-                    'min_a': 'the minimum finite, non-missing value found in A',
-                    'min_b': 'the minimum finite, non-missing value found in B',
-                    'num_data_points': "number of data values in A",
-                    'shape': "shape of A",
-                    'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' +
-                                                            ' ignored for the purposes of data analysis and presentation',
-                    'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' +
-                                                            ' ignored for the purposes of data analysis and presentation',
-                    
-                    # finite data stats descriptions
-                    'a_finite_count': "number of finite values in A",
-                    'a_finite_fraction': "fraction of finite values in A (out of all data points in A)",
-                    'b_finite_count': "number of finite values in B",
-                    'b_finite_fraction': "fraction of finite values in B (out of all data points in B)",
-                    'common_finite_count': "number of finite values in common between A and B",
-                    'common_finite_fraction': "fraction of finite values in common between A and B",
-                    'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " +
-                                                "only the common spatially valid area is considerd for this statistic",
-                    'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " +
-                                                "only the common spatially valid area is considerd for this statistic",
-                    
-                    # missing data value statistics
-                    'a_missing_count': "number of values flagged missing in A",
-                    'a_missing_fraction': "fraction of values flagged missing in A",
-                    'b_missing_count': "number of values flagged missing in B",
-                    'b_missing_fraction': "fraction of values flagged missing in B",
-                    'common_missing_count': "number of missing values in common between A and B",
-                    'common_missing_fraction': "fraction of missing values in common between A and B",
-                    
-                    # NaN related statistics
-                    'a_nan_count': "number of NaNs in A",
-                    'a_nan_fraction': "fraction of NaNs in A",
-                    'b_nan_count': "number of NaNs in B",
-                    'b_nan_fraction': "fraction of NaNs in B",
-                    'common_nan_count': "number of NaNs in common between A and B",
-                    'common_nan_fraction': "fraction of NaNs in common between A and B",
-                    
-                    # Numerical comparison statistics
-                    'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B",
-                    'diff_outside_epsilon_count': "number of finite differences falling outside epsilon",
-                    'diff_outside_epsilon_fraction': "fraction of finite differences falling outside epsilon (out of common_finite_count)",
-                    'max_diff': "Maximum difference of finite values",
-                    'mean_diff': "mean difference of finite values",
-                    'median_diff': "median difference of finite values",
-                    'perfect_match_count': "number of perfectly matched finite data points between A and B",
-                    'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)",
-                    'rms_diff': "root mean square (RMS) difference of finite values",
-                    'std_diff': "standard deviation of difference of finite values",
-                    'trouble_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' +
-                                            ' or are unacceptable when compared according to the current epsilon value',
-                    'trouble_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' +
-                                            ' or are unacceptable when compared according to the current epsilon value',
-                    
-                    # note: the statistics described below may no longer be generated?
-                    'mean_percent_change': "Percent change from A to B for finite values, averaged",
-                    'max_percent_change': "Percent change from A to B for finite values, maximum value"
-                    
-                    }
-STATISTICS_DOC_STR = '\n'.join( '%s:\n    %s' % x for x in sorted(list(STATISTICS_DOC.items())) ) + '\n'
-
 if __name__=='__main__':
     import doctest
     doctest.testmod()
diff --git a/pyglance/glance/figures.py b/pyglance/glance/figures.py
index 45edc65..ea3b8c3 100644
--- a/pyglance/glance/figures.py
+++ b/pyglance/glance/figures.py
@@ -13,21 +13,18 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine
 
 from pylab import *
 
-from mpl_toolkits.mplot3d import axes3d
 import matplotlib.pyplot as plt
-from matplotlib import cm
 import matplotlib.colors as colors
 from matplotlib.ticker import FormatStrFormatter
 
-from PIL import Image
-
-import os, sys, logging
+import logging
 import numpy as np
 from numpy import ma 
 
 import glance.graphics as maps
 import glance.delta    as delta
 import glance.report   as report
+import glance.stats    as statistics
 
 LOG = logging.getLogger(__name__)
 
@@ -268,10 +265,10 @@ def create_histogram(data, bins, title, xLabel, yLabel, displayStats=False) :
     if displayStats :
         # info on the basic stats
         tempMask = ones(data.shape,dtype=bool)
-        tempStats = delta.stats(data, tempMask, None)
+        tempStats = statistics.stats(data, tempMask, None)
         medianVal = tempStats['median_diff']
-        meanVal = tempStats['mean_diff']
-        stdVal = tempStats['std_diff']
+        meanVal   = tempStats['mean_diff']
+        stdVal    = tempStats['std_diff']
         numPts = data.size
         
         # info on the display of our statistics
diff --git a/pyglance/glance/filters.py b/pyglance/glance/filters.py
index 2c550b3..c7ee4ae 100644
--- a/pyglance/glance/filters.py
+++ b/pyglance/glance/filters.py
@@ -7,7 +7,7 @@ Created by Eva Schiffer August 2009.
 Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 """
 
-import numpy      as np
+import numpy as np
 
 def trim_off_of_top (data, num_elements_to_trim) :
     """
diff --git a/pyglance/glance/io.py b/pyglance/glance/io.py
index eb112fd..de5ecec 100644
--- a/pyglance/glance/io.py
+++ b/pyglance/glance/io.py
@@ -7,7 +7,7 @@ Created by rayg Apr 2009.
 Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 """
 
-import os, sys, logging
+import os, logging
 
 from pyhdf.SD import SD,SDC, SDS, HDF4Error
 try:
diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py
index 08eac02..6cfa540 100644
--- a/pyglance/glance/plot.py
+++ b/pyglance/glance/plot.py
@@ -13,21 +13,15 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine
 
 from pylab import *
 
-from mpl_toolkits.mplot3d import axes3d
 import matplotlib.pyplot as plt
-from matplotlib import cm
-import matplotlib.colors as colors
-from matplotlib.ticker import FormatStrFormatter
 
 from PIL import Image
 
 import os, sys, logging
 import numpy as np
-from numpy import ma 
 
 import glance.graphics as maps
 import glance.delta    as delta
-import glance.report   as report
 import glance.figures  as figures
 import glance.plotcreatefns as plotfns
 
diff --git a/pyglance/glance/plotcreatefns.py b/pyglance/glance/plotcreatefns.py
index a649061..a03ee87 100644
--- a/pyglance/glance/plotcreatefns.py
+++ b/pyglance/glance/plotcreatefns.py
@@ -13,23 +13,15 @@ matplotlib.use('Agg') # use the Anti-Grain Geometry rendering engine
 
 from pylab import *
 
-from mpl_toolkits.mplot3d import axes3d
-import matplotlib.pyplot as plt
-from matplotlib import cm
 import matplotlib.colors as colors
-from matplotlib.ticker import FormatStrFormatter
 
-from PIL import Image
-
-import os, sys, logging
+import logging
 import random as random
 import numpy as np
 from numpy import ma 
 
 import glance.graphics as maps
 import glance.delta    as delta
-import glance.report   as report
-import glance.filters  as filters # TODO, should this be moved?
 import glance.figures  as figures
 
 LOG = logging.getLogger(__name__)
diff --git a/pyglance/glance/report.py b/pyglance/glance/report.py
index ae0e618..773eef2 100644
--- a/pyglance/glance/report.py
+++ b/pyglance/glance/report.py
@@ -7,9 +7,9 @@ Created by rayg Apr 2009.
 Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 """
 
-import os, sys, logging
+import sys, logging
 
-from pkg_resources import resource_string, resource_stream, resource_filename
+from pkg_resources import resource_string, resource_filename #, resource_stream
 from mako.template import Template
 import types as types
 import numpy as np
diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py
new file mode 100644
index 0000000..c9e754f
--- /dev/null
+++ b/pyglance/glance/stats.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+This module handles statistical analysis of data sets. The code present in
+this module is based on previous versions of delta.py.
+
+Created by evas Apr 2010.
+Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved.
+"""
+
+#import glance.data  as data
+import glance.delta as delta
+#from   glance.data import MaskSetObject
+
+import numpy as np
+
+# --------------------- general statistics methods ------------------
+
+def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), ignoreInAMask=None, ignoreInBMask=None):
+    """return dictionary of statistics dictionaries
+    stats not including 'nan' in name exclude nans in either arrays
+    """
+    
+    diffData, finite_mask, (finite_a_mask, finite_b_mask), \
+    trouble, outside_epsilon, (anfin, bnfin), \
+    (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = delta.diff(a, b, epsilon,
+                                                                    (a_missing_value, b_missing_value),
+                                                                    (ignoreInAMask, ignoreInBMask))
+    '''
+    d, valid_mask, trouble, (anfin, bnfin), (amis, bmis), outside_epsilon = nfo = diff(a,b,
+                                                                                       epsilon,
+                                                                                       (a_missing_value, b_missing_value),
+                                                                                       (ignoreInAMask, ignoreInBMask))
+                                                                                       '''
+    
+    general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, 
+                                            ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) 
+    additional_statistics = stats(*nfo) # grab some additional comparison statistics 
+    comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) 
+    nan_stats = _get_nan_stats(anfin, bnfin)
+    missing_stats = _get_missing_value_stats(amis, bmis)
+    finite_stats = _get_finite_data_stats(finite_a_mask, finite_b_mask, (ignoreInAMask | ignoreInBMask)) 
+    
+    out = {}
+    out['NaN Statistics'] = nan_stats
+    out['Missing Value Statistics'] = missing_stats
+    out['Finite Data Statistics'] = finite_stats
+    out['Numerical Comparison Statistics'] = comparison_stats
+    out['General Statistics'] = general_stats
+    
+    return out
+
+def stats(diffData, mask, *etc):
+    
+    # if there are no values after the mask,
+    # we can't do any of these forms of
+    # statistical analysis
+    if np.sum(mask) <= 0 :
+        return { }
+    
+    absDiffData = abs(diffData)
+    root_mean_square_value = delta.calculate_root_mean_square(diffData, mask)
+    return {    'rms_diff': root_mean_square_value, 
+                'std_diff':       np.std(absDiffData[mask]), 
+                'mean_diff':     np.mean(absDiffData[mask]), 
+                'median_diff': np.median(absDiffData[mask]),
+                'max_diff':       np.max(absDiffData[mask])
+                }
+
+def _get_num_perfect(a, b, ignoreMask=None):
+    numPerfect = 0
+    if not (ignoreMask is None) :
+        numPerfect = np.sum(a[~ignoreMask] == b[~ignoreMask])
+    else :
+        numPerfect = np.sum(a == b)
+    return numPerfect
+
+def _get_numerical_data_stats(a, b, diff_data,  data_is_finite_mask,
+                              outside_epsilon_mask, trouble_mask,
+                              additional_statistics={}) : 
+    """
+    Get a list of numerical comparison related statistics about a and b,
+    given a and b and some other information about them.
+    the return value will be a dictionary of statistics
+    """
+    # calculate our various statistics
+    num_finite_values_too_different = np.sum(outside_epsilon_mask)
+    num_perfect = _get_num_perfect(a, b, ~data_is_finite_mask)
+    r_corr = delta.corr(a, b, data_is_finite_mask)
+    num_trouble = np.sum(trouble_mask)
+    
+    # we actually want the total number of _finite_ values rather than all the data
+    total_num_finite_values = np.sum(data_is_finite_mask)
+    
+    # no dividing by 0!
+    fraction_too_different = 0.0
+    fraction_perfect = 0.0
+    if total_num_finite_values > 0 :
+        fraction_too_different = num_finite_values_too_different / float(total_num_finite_values)
+        fraction_perfect = num_perfect / float(total_num_finite_values)
+    
+    comparison = {  'correlation': r_corr,
+                    'diff_outside_epsilon_count': num_finite_values_too_different,
+                    'diff_outside_epsilon_fraction': fraction_too_different,
+                    'perfect_match_count': num_perfect,
+                    'perfect_match_fraction': fraction_perfect,
+                     'trouble_points_count': num_trouble, 
+                     'trouble_points_fraction': float(num_trouble) / float(a.size)
+                    }
+    comparison.update(additional_statistics)
+    
+    return comparison
+
+def _get_general_data_stats(a, b,
+                            a_missing_value, b_missing_value,
+                            epsilon, 
+                            spatial_ignore_in_a_mask, spatial_ignore_in_b_mask,
+                            bad_in_a, bad_in_b
+                            ) :
+    """
+    Get a list of general statistics about a and b, given a and b and some other information
+    about them.
+    the return value will be a dictionary of statistics
+    """
+    # figure out how much spatial trouble we had
+    num_ignored_in_a = np.sum(spatial_ignore_in_a_mask)
+    num_ignored_in_b = np.sum(spatial_ignore_in_b_mask)
+    
+    # get the number of data points
+    total_num_values = a.size
+    
+    general_stats = {'a_missing_value': a_missing_value,
+                     'b_missing_value': b_missing_value,
+                     'epsilon': epsilon,
+                     'max_a': delta.max_with_mask(a, bad_in_a),
+                     'max_b': delta.max_with_mask(b, bad_in_b),
+                     'min_a': delta.min_with_mask(a, bad_in_a),
+                     'min_b': delta.min_with_mask(b, bad_in_b),
+                     'num_data_points': total_num_values,
+                     'shape': a.shape,
+                     'spatially_invalid_pts_ignored_in_a': num_ignored_in_a,
+                     'spatially_invalid_pts_ignored_in_b': num_ignored_in_b
+                     }
+    
+    return general_stats
+
+def _get_finite_data_stats(a_is_finite_mask, b_is_finite_mask, common_ignore_mask) :
+    """
+    Get a list of statistics about finite data values in data sets a and b,
+    given masks describing the positions of the finite data in each file (such masks
+    can be gotten from the diff function),
+    the return value will be a dictionary of statistics
+    """
+    # calculate information about the finite data
+    num_a_finite = np.sum(a_is_finite_mask)
+    num_b_finite = np.sum(b_is_finite_mask)
+    num_common_finite = np.sum(a_is_finite_mask & b_is_finite_mask)
+    num_finite_in_only_one = np.sum((a_is_finite_mask ^ b_is_finite_mask) & (~ common_ignore_mask)) # use an exclusive OR
+    
+    # make the assumption that a and b are the same size and only use the size of a's mask
+    total_num_values = a_is_finite_mask.size
+    
+    finite_value_stats = {'a_finite_count': num_a_finite,
+                          'a_finite_fraction': (float(num_a_finite) / float(total_num_values)),
+                          'b_finite_count': num_b_finite,
+                          'b_finite_fraction': (float(num_b_finite) / float(total_num_values)),
+                          'common_finite_count': num_common_finite,
+                          'common_finite_fraction': (float(num_common_finite) / float(total_num_values)),
+                          'finite_in_only_one_count': num_finite_in_only_one,
+                          'finite_in_only_one_fraction': (float(num_finite_in_only_one) / float(total_num_values)),
+                          }
+    
+    return finite_value_stats
+
+def _get_missing_value_stats(a_missing_mask, b_missing_mask) :
+    """
+    Get a list of statistics about missing data values in data sets a and b,
+    given masks describing the positions of the missing data (such masks
+    can be gotten from the diff function), 
+    the return value will be a dictionary of statistics
+    """
+    # calculate information about the missing values
+    num_a_missing = np.sum(a_missing_mask)
+    num_b_missing = np.sum(b_missing_mask)
+    num_common_missing = np.sum(a_missing_mask & b_missing_mask)
+    
+    # make the assumption that a and b are the same size and only use the size of a's mask
+    total_num_values = a_missing_mask.size
+    
+    missing_value_stats = {'a_missing_count': num_a_missing,
+                           'a_missing_fraction': (float(num_a_missing) / float(total_num_values)),
+                           'b_missing_count': num_b_missing,
+                           'b_missing_fraction': (float(num_b_missing) / float(total_num_values)),
+                           'common_missing_count': num_common_missing,
+                           'common_missing_fraction': (float(num_common_missing) / float(total_num_values))
+                           }
+    
+    return missing_value_stats
+
+def _get_nan_stats(a_nan_mask, b_nan_mask) :
+    """
+    Get a list of statistics about non-numerical values in data sets a and b,
+    the return value will be a dictionary of statistics
+    """
+    # find the nan values in the data
+    num_a_nans = np.sum(a_nan_mask)
+    num_b_nans = np.sum(b_nan_mask)
+    num_common_nans = np.sum(a_nan_mask & b_nan_mask)
+    
+    # make the assumption that a and b are the same size and only use the size of a
+    total_num_values = a_nan_mask.size
+    
+    nan_stats = {'a_nan_count': num_a_nans,
+                 'a_nan_fraction': (float(num_a_nans) / float(total_num_values)),
+                 'b_nan_count': num_b_nans,
+                 'b_nan_fraction': (float(num_b_nans) / float(total_num_values)),
+                 'common_nan_count': num_common_nans,
+                 'common_nan_fraction': (float(num_common_nans) / float(total_num_values))
+                 }
+    
+    return nan_stats
+
+
+
+# -------------------------- documentation -----------------------------
+
+STATISTICS_DOC = {  'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " +
+                               "both finite and not, unless otherwise specified",
+                    
+                    # general statistics
+                    'a_missing_value': 'the value that is considered \"missing\" data when it is found in A',
+                    'b_missing_value': 'the value that is considered \"missing\" data when it is found in B',
+                    'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable',
+                    'max_a': 'the maximum finite, non-missing value found in A',
+                    'max_b': 'the maximum finite, non-missing value found in B',
+                    'min_a': 'the minimum finite, non-missing value found in A',
+                    'min_b': 'the minimum finite, non-missing value found in B',
+                    'num_data_points': "number of data values in A",
+                    'shape': "shape of A",
+                    'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' +
+                                                            ' ignored for the purposes of data analysis and presentation',
+                    'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' +
+                                                            ' ignored for the purposes of data analysis and presentation',
+                    
+                    # finite data stats descriptions
+                    'a_finite_count': "number of finite values in A",
+                    'a_finite_fraction': "fraction of finite values in A (out of all data points in A)",
+                    'b_finite_count': "number of finite values in B",
+                    'b_finite_fraction': "fraction of finite values in B (out of all data points in B)",
+                    'common_finite_count': "number of finite values in common between A and B",
+                    'common_finite_fraction': "fraction of finite values in common between A and B",
+                    'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " +
+                                                "only the common spatially valid area is considerd for this statistic",
+                    'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " +
+                                                "only the common spatially valid area is considerd for this statistic",
+                    
+                    # missing data value statistics
+                    'a_missing_count': "number of values flagged missing in A",
+                    'a_missing_fraction': "fraction of values flagged missing in A",
+                    'b_missing_count': "number of values flagged missing in B",
+                    'b_missing_fraction': "fraction of values flagged missing in B",
+                    'common_missing_count': "number of missing values in common between A and B",
+                    'common_missing_fraction': "fraction of missing values in common between A and B",
+                    
+                    # NaN related statistics
+                    'a_nan_count': "number of NaNs in A",
+                    'a_nan_fraction': "fraction of NaNs in A",
+                    'b_nan_count': "number of NaNs in B",
+                    'b_nan_fraction': "fraction of NaNs in B",
+                    'common_nan_count': "number of NaNs in common between A and B",
+                    'common_nan_fraction': "fraction of NaNs in common between A and B",
+                    
+                    # Numerical comparison statistics
+                    'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B",
+                    'diff_outside_epsilon_count': "number of finite differences falling outside epsilon",
+                    'diff_outside_epsilon_fraction': "fraction of finite differences falling outside epsilon (out of common_finite_count)",
+                    'max_diff': "Maximum difference of finite values",
+                    'mean_diff': "mean difference of finite values",
+                    'median_diff': "median difference of finite values",
+                    'perfect_match_count': "number of perfectly matched finite data points between A and B",
+                    'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)",
+                    'rms_diff': "root mean square (RMS) difference of finite values",
+                    'std_diff': "standard deviation of difference of finite values",
+                    'trouble_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' +
+                                            ' or are unacceptable when compared according to the current epsilon value',
+                    'trouble_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' +
+                                            ' or are unacceptable when compared according to the current epsilon value',
+                    
+                    # note: the statistics described below may no longer be generated?
+                    'mean_percent_change': "Percent change from A to B for finite values, averaged",
+                    'max_percent_change': "Percent change from A to B for finite values, maximum value"
+                    
+                    }
+STATISTICS_DOC_STR = '\n'.join( '%s:\n    %s' % x for x in sorted(list(STATISTICS_DOC.items())) ) + '\n'
+
+if __name__=='__main__':
+    import doctest
+    doctest.testmod()
diff --git a/pyglance/glance/variablereport.txt b/pyglance/glance/variablereport.txt
index b135063..360209d 100644
--- a/pyglance/glance/variablereport.txt
+++ b/pyglance/glance/variablereport.txt
@@ -6,7 +6,6 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
 </%doc>
 
 <%!
-    import types as types
     import glance.report as report
 %>
 
-- 
GitLab