#!/usr/bin/env python
# encoding: utf-8
"""
This module handles statistical analysis of data sets. The code present in
this module is based on previous versions of delta.py.
Created by evas Apr 2010.
Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved.
"""
import glance.data as dataobj
import glance.delta as delta
import numpy as np
# I don't like this design, but it's what I could come up
# with for now. FUTURE: Reconsider this design again later.
class StatisticalData (object) :
"""
This class represents a set of statistical data generated from
the examination of data sets. What form of data is accepted for
analysis is relatively abstract.
All Statistics Data objects should have a title and be able to provide
a dictionary of their statistics (see dictionary_form function) and
a dictionary documenting their statistics.
Child classes can include whatever actual statistics they like.
"""
def __init__ (self) :
"""
a minimal constructor that only sets the title
"""
self.title = None
def dictionary_form(self) :
"""
get a dictionary form of the statistics
note: child classes should override this method
"""
return { }
@staticmethod
def doc_strings(inspect=False) :
"""
get documentation strings that match the
dictionary form of the statistics this class
creates
note: child classes should override this method
"""
return { }
def make_prefix_and_suffix (self, descriptionText) :
"""
given text describing a statistic (or none)
return an appropriate prefix and suffix
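for example (illustrative only; "var" is an arbitrary description string):
    make_prefix_and_suffix("var") returns ("var_", "_var")
    make_prefix_and_suffix(None)  returns ("", "")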
"""
prefix = "" if descriptionText is None else str(descriptionText) + '_'
suffix = "" if descriptionText is None else '_' + str(descriptionText)
return prefix, suffix
class MissingValueStatistics (StatisticalData) :
"""
A class representing information about where fill values are found
in data. It can analyze either a pair of data sets encapsulated in a
glance.data.DiffInfoObject or a single data set in a glance.data.DataObject.
if a DiffInfoObject is given it will produce the following statistics:
common_missing_count - count of points that are missing in both data sets
common_missing_fraction - fraction of points that are missing in both data sets
it will also include the following intermediary objects with stats about the
individual data sets in the DiffInfoObject:
a_missing_stats - a MissingValueStatistics object specific to the a data set
b_missing_stats - a MissingValueStatistics object specific to the b data set
when turned into a dictionary these become:
a_missing_count - count of points that are missing in the a data set
a_missing_fraction - fraction of points that are missing in the a data set
b_missing_count - count of points that are missing in the b data set
b_missing_fraction - fraction of points that are missing in the b data set
if it is only given a DataObject it will produce the following :
<data set description>missing_count - count of points that are missing in the data set
<data set description>missing_fraction - fraction of points that are missing in the data set
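for example, inspecting a single data set described as "a" might give a
dictionary along these lines (the numbers are purely illustrative):
    {'a_missing_count': 20, 'a_missing_fraction': 0.02}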
"""
_doc_strings = \
{
'a_missing_count': "number of values flagged missing in A",
'a_missing_fraction': "fraction of values flagged missing in A",
'b_missing_count': "number of values flagged missing in B",
'b_missing_fraction': "fraction of values flagged missing in B",
'common_missing_count': "number of missing values in common between A and B",
'common_missing_fraction': "fraction of missing values in common between A and B"
}
_doc_strings_inspection = \
{
'missing_count': "number of values flagged missing",
'missing_fraction': "fraction of values flagged missing",
}
def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) :
"""
build our fill value related statistics
diffInfoObject is assumed to be a glance.data.DiffInfoObject
dataObject is assumed to be a glance.data.DataObject
Either the diffInfoObject or the dataObject must be passed in. If the
diffInfoObject is passed the dataObject will be ignored and the
a_data_object and b_data_object associated with the diffInfoObject
will be analyzed.
If only the dataObject is analyzed, dataSetDescription will be used in labeling
the resulting dictionary form statistics.
"""
self.title = 'Missing Value Statistics'
self.is_one_data_set = False
# if we don't have comparison information and we do have a single data set
if (diffInfoObject is None) and (dataObject is not None) :
# we have one data set and should save the prefix information
self.is_one_data_set = True
self.desc_text = dataSetDescription
noData = len(dataObject.data.shape) <= 0
# figure out some basic statistics
self.missing_count = np.sum(dataObject.masks.missing_mask)
self.missing_fraction = float(self.missing_count) / float(dataObject.data.size) if not noData else np.nan
# if we have a comparison object analyze the data associated with that comparison
elif diffInfoObject is not None :
noData = len(diffInfoObject.a_data_object.data.shape) <= 0
# analyze each of the original data sets that are being compared
self.a_missing_stats = MissingValueStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a")
self.b_missing_stats = MissingValueStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b")
# common statistics
self.common_missing_count = np.sum(diffInfoObject.a_data_object.masks.missing_mask & diffInfoObject.b_data_object.masks.missing_mask)
self.common_missing_fraction = float(self.common_missing_count) / float(diffInfoObject.a_data_object.data.size) if not noData else np.nan
else :
raise ValueError ("No data set was given when requesting statistical analysis of missing values.")
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# if we only have stats for one data set
if self.is_one_data_set :
temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text)
toReturn = {
temp_prefix + 'missing_count': self.missing_count,
temp_prefix + 'missing_fraction': self.missing_fraction,
}
# otherwise we must have stats for a comparison
else :
toReturn = {
'common_missing_count': self.common_missing_count,
'common_missing_fraction': self.common_missing_fraction,
}
a_dict = self.a_missing_stats.dictionary_form()
toReturn.update(a_dict)
b_dict = self.b_missing_stats.dictionary_form()
toReturn.update(b_dict)
return toReturn
@staticmethod
def doc_strings(inspect=False) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return MissingValueStatistics._doc_strings if not inspect else MissingValueStatistics._doc_strings_inspection
class FiniteDataStatistics (StatisticalData) :
"""
A class representing information about where finite values are found
in data. It can analyze either a pair of data sets encapsulated in a
glance.data.DiffInfoObject or a single data set in a glance.data.DataObject.
when a single data set is analyzed the following stats are produced:
<data prefix>finite_count - the number of finite data values in the data set
<data prefix>finite_fraction - the fraction of finite data values in the data set
if a DiffInfoObject is given for analysis the following statistics are produced:
common_finite_count - the number of finite values the two data sets have in common
common_finite_fraction - the fraction of finite values the two data sets have in common
finite_in_only_one_count - the number of points that are finite in only one of the two sets
finite_in_only_one_fraction - the fraction of points that are finite in only one of the two sets
it will also include the following intermediary objects with stats about the
individual data sets in the DiffInfoObject:
a_finite_stats - a FiniteDataStatistics object with further stats on the a data set
b_finite_stats - a FiniteDataStatistics object with further stats on the b data set
and the dictionary form will include the following statistics:
a_finite_count - the number of finite data values in the a data set
a_finite_fraction - the fraction of finite data values in the a data set
b_finite_count - the number of finite data values in the b data set
b_finite_fraction - the fraction of finite data values in the b data set
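for example, a comparison might produce a dictionary with keys like the
following (the numbers are purely illustrative):
    {'a_finite_count': 980, 'b_finite_count': 975,
     'common_finite_count': 970, 'finite_in_only_one_count': 15, ...}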
"""
_doc_strings = {
'a_finite_count': "number of finite values in A",
'a_finite_fraction': "fraction of finite values in A (out of all data points in A)",
'b_finite_count': "number of finite values in B",
'b_finite_fraction': "fraction of finite values in B (out of all data points in B)",
'common_finite_count': "number of finite values in common between A and B",
'common_finite_fraction': "fraction of finite values in common between A and B",
'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " +
"only the common spatially valid area is considered for this statistic",
'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " +
"only the common spatially valid area is considered for this statistic"
}
_doc_strings_inspection = \
{
'finite_count': "number of finite values",
'finite_fraction': "fraction of finite values (out of all data points in set)",
}
def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) :
"""
build our finite data related statistics
diffInfoObject is assumed to be a glance.data.DiffInfoObject
dataObject is assumed to be a glance.data.DataObject
Either the diffInfoObject or the dataObject must be passed in. If the
diffInfoObject is passed the dataObject will be ignored and the
a_data_object and b_data_object associated with the diffInfoObject
will be analyzed.
If only the dataObject is analyzed, dataSetDescription will be used in labeling
the resulting dictionary form statistics.
"""
self.title = 'Finite Data Statistics'
self.is_one_data_set = False
# if we don't have comparison information and we do have a single data set
if (diffInfoObject is None) and (dataObject is not None) :
# we have one data set and should save the prefix information
self.is_one_data_set = True
self.desc_text = dataSetDescription
# figure out some basic statistics
self.finite_count = np.sum(dataObject.masks.valid_mask) if len(dataObject.data.shape) > 0 else 0
self.finite_fraction = float(self.finite_count) / float(dataObject.data.size) if len(dataObject.data.shape) > 0 else np.nan
# if we have a comparison object analyze the data associated with that comparison
elif diffInfoObject is not None :
noData = len(diffInfoObject.a_data_object.data.shape) <= 0
# analyze each of the original data sets that are being compared
self.a_finite_stats = FiniteDataStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a")
self.b_finite_stats = FiniteDataStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b")
# calculate some common statistics
self.common_finite_count = np.sum(diffInfoObject.a_data_object.masks.valid_mask & diffInfoObject.b_data_object.masks.valid_mask) \
if not noData else 0
# use an exclusive or to check which points are finite in only one of the two data sets
self.finite_in_only_one_count = np.sum((diffInfoObject.a_data_object.masks.valid_mask ^ diffInfoObject.b_data_object.masks.valid_mask) \
& ~diffInfoObject.diff_data_object.masks.ignore_mask) \
if not noData else 0
self.common_finite_fraction = float(self.common_finite_count) / float(diffInfoObject.a_data_object.data.size) \
if not noData else np.nan
self.finite_in_only_one_fraction = float(self.finite_in_only_one_count) / float(diffInfoObject.a_data_object.data.size) \
if not noData else np.nan
else:
raise ValueError ("No data set was given when requesting statistical analysis of finite values.")
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# if we only have stats for one data set
if self.is_one_data_set :
temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text)
toReturn = {
temp_prefix + 'finite_count': self.finite_count,
temp_prefix + 'finite_fraction': self.finite_fraction,
}
# otherwise we must have stats for a comparison
else :
toReturn = {
'common_finite_count': self.common_finite_count,
'common_finite_fraction': self.common_finite_fraction,
'finite_in_only_one_count': self.finite_in_only_one_count,
'finite_in_only_one_fraction': self.finite_in_only_one_fraction,
}
a_dict = self.a_finite_stats.dictionary_form()
toReturn.update(a_dict)
b_dict = self.b_finite_stats.dictionary_form()
toReturn.update(b_dict)
return toReturn
@staticmethod
def doc_strings(inspect=False) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return FiniteDataStatistics._doc_strings if not inspect else FiniteDataStatistics._doc_strings_inspection
class NotANumberStatistics (StatisticalData) :
"""
A class representing information about where non-numerical values are found
in data. It can analyze either a pair of data sets encapsulated in a
glance.data.DiffInfoObject or a single data set in a glance.data.DataObject.
when a single data set is analyzed the following stats are produced:
<data set description>nan_count - the number of non-finite values that are present in the data set
<data set description>nan_fraction - the fraction of non-finite values that are present in the data set
if a DiffInfoObject is given for analysis the following statistics are produced:
common_nan_count - the number of non finite values that are shared between the data sets
common_nan_fraction - the fraction of non finite values that are shared between the data sets
if a DiffInfoObject is given the object will also have:
a_nan_stats - a NotANumberStatistics object with further stats on the a data set
b_nan_stats - a NotANumberStatistics object with further stats on the b data set
and the dictionary form will include the following statistics:
a_nan_count - the number of non finite values that are present in the a data set
a_nan_fraction - the fraction of non finite values that are present in the a data set
b_nan_count - the number of non finite values that are present in the b data set
b_nan_fraction - the fraction of non finite values that are present in the b data set
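the keys in the dictionary form can be paired with explanatory text from
doc_strings; for example (a sketch of the expected pairing):
    NotANumberStatistics.doc_strings(inspect=True)['nan_count']
    gives the description "number of NaNs"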
"""
_doc_strings = {
'a_nan_count': "number of NaNs in A",
'a_nan_fraction': "fraction of NaNs in A",
'b_nan_count': "number of NaNs in B",
'b_nan_fraction': "fraction of NaNs in B",
'common_nan_count': "number of NaNs in common between A and B",
'common_nan_fraction': "fraction of NaNs in common between A and B"
}
_doc_strings_inspection = \
{
'nan_count': "number of NaNs",
'nan_fraction': "fraction of NaNs",
}
def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) :
"""
build our nonfinite data related statistics
diffInfoObject is assumed to be a glance.data.DiffInfoObject
dataObject is assumed to be a glance.data.DataObject
Either the diffInfoObject or the dataObject must be passed in. If the
diffInfoObject is passed the dataObject will be ignored and the
a_data_object and b_data_object associated with the diffInfoObject
will be analyzed.
If only the dataObject is analyzed, dataSetDescription will be used in labeling
the resulting dictionary form statistics.
"""
self.title = 'NaN Statistics'
self.is_one_data_set = False
# if we don't have comparison information and we do have a single data set
if (diffInfoObject is None) and (dataObject is not None) :
# we have one data set and should save the prefix information
self.is_one_data_set = True
self.desc_text = dataSetDescription
noData = len(dataObject.data.shape) <= 0
# get some basic statistics
self.nan_count = np.sum(dataObject.masks.non_finite_mask)
self.nan_fraction = float(self.nan_count) / float(dataObject.data.size) if not noData else np.nan
# if we have a comparison object analyze the data associated with that comparison
elif diffInfoObject is not None :
noData = len(diffInfoObject.a_data_object.data.shape) <= 0
# analyze each of the original data sets that are being compared
self.a_nan_stats = NotANumberStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a")
self.b_nan_stats = NotANumberStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b")
# calculate some common statistics
self.common_nan_count = np.sum(diffInfoObject.a_data_object.masks.non_finite_mask & diffInfoObject.b_data_object.masks.non_finite_mask)
self.common_nan_fraction = float(self.common_nan_count) / float(diffInfoObject.a_data_object.data.size) if not noData else np.nan
else:
raise ValueError ("No data set was given when requesting statistical analysis of NaN values.")
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# if we only have stats for one data set
if self.is_one_data_set :
temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text)
toReturn = {
temp_prefix + 'nan_count': self.nan_count,
temp_prefix + 'nan_fraction': self.nan_fraction,
}
# otherwise we must have stats for a comparison
else :
toReturn = {
'common_nan_count': self.common_nan_count,
'common_nan_fraction': self.common_nan_fraction
}
a_dict = self.a_nan_stats.dictionary_form()
toReturn.update(a_dict)
b_dict = self.b_nan_stats.dictionary_form()
toReturn.update(b_dict)
return toReturn
@staticmethod
def doc_strings(inspect=False) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return NotANumberStatistics._doc_strings if not inspect else NotANumberStatistics._doc_strings_inspection
class GeneralStatistics (StatisticalData) :
"""
A class representing general information about data. It can analyze either a
pair of data sets encapsulated in a glance.data.DiffInfoObject or a single
data set in a glance.data.DataObject.
if a single DataObject is given the following will be produced:
(some of these are labeled with any dataSetDescription given in the
constructor)
missing_value - the fill data value
max - the maximum value
min - the minimum value
num_data_points - the total number of data points
shape - the shape of the data
spatially_invalid_pts_ignored - number of points corresponding to invalid lat/lon in the set
(optional if no lat/lon is mapped)
mean - the mean of the data values
median - the median of the data values
std_val - the standard deviation of the data values
if a DiffInfoObject is given these comparison stats will be produced:
epsilon - the fixed epsilon value
epsilon_percent - the percent difference (relative to the a data values) that is acceptable between the sets (optional)
num_data_points - the number of data points in each of the sets
shape - the shape of each of the data sets
it will also have the following self owned variables:
a_gen_stats - a GeneralStatistics object with further stats on the a data set
b_gen_stats - a GeneralStatistics object with further stats on the b data set
in dictionary form those objects will produce:
a_missing_value - the fill data value in the a set
b_missing_value - the fill data value in the b set
max_a - the maximum value in the a set
max_b - the maximum value in the b set
min_a - the minimum value in the a set
min_b - the minimum value in the b set
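typical single-data-set usage (a sketch; some_data_object stands in for a real
glance.data.DataObject):
    gen_stats = GeneralStatistics(dataObject=some_data_object,
                                  doExtras=True, dataSetDescription="a")
    gen_stats.dictionary_form()  # includes keys like 'a_missing_value', 'max_a',
                                 # 'mean_a', 'num_data_points', and 'shape'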
"""
_doc_strings = {
'a_missing_value': 'the value that is considered \"missing\" or \"fill\" data when it is found in A',
'b_missing_value': 'the value that is considered \"missing\" or \"fill\" data when it is found in B',
'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable',
'epsilon_percent': 'the percentage of difference (of A\'s value) that is acceptable between A and B (optional)',
'max_a': 'the maximum finite, non-missing value found in A',
'max_b': 'the maximum finite, non-missing value found in B',
'min_a': 'the minimum finite, non-missing value found in A',
'min_b': 'the minimum finite, non-missing value found in B',
'num_data_points': "number of data values in A",
'shape': "shape of A",
'spatially_invalid_pts_ignored_a': 'number of points with invalid latitude/longitude information in A that were' +
' ignored for the purposes of data analysis and presentation',
'spatially_invalid_pts_ignored_b': 'number of points with invalid latitude/longitude information in B that were' +
' ignored for the purposes of data analysis and presentation',
# these are new!
'mean_a': "the mean of all finite, non-missing values found in A",
'mean_b': "the mean of all finite, non-missing values found in B",
'median_a': "the median of all finite, non-missing values in A",
'median_b': "the median of all finite, non-missing values in B",
'std_val_a': "the standard deviation of all finite, non-missing values in A",
'std_val_b': "the standard deviation of all finite, non-missing values in B",
}
_doc_strings_inspect = \
{
'missing_value': 'the value that is considered \"missing\" or \"fill\" data in this data set',
'max': 'the maximum finite, non-missing value found in the data',
'min': 'the minimum finite, non-missing value found in the data',
'num_data_points': "number of data points (may be valid or invalid data)",
'shape': "shape of the data",
'spatially_invalid_pts_ignored': 'number of points with invalid latitude/longitude information ' +
'in the data that were' +
' ignored for the purposes of data analysis and presentation',
'mean': "the mean of all finite, non-missing values in the data",
'median': "the median of all finite, non-missing values in the data",
'std_val': "the standard deviation of all finite, non-missing values in the data",
}
def __init__(self, diffInfoObject=None, dataObject=None,
doExtras=False, dataSetDescription=None) :
"""
build our general statistics based on the comparison of two data sets
diffInfoObject is assumed to be a glance.data.DiffInfoObject
dataObject is assumed to be a glance.data.DataObject
Either the diffInfoObject or the dataObject must be passed in. If the
diffInfoObject is passed the dataObject will be ignored and the
a_data_object and b_data_object associated with the diffInfoObject
will be analyzed.
If only dataObject is analyzed dataSetDescription will be
used in labeling the resulting dictionary form statistics.
If you are passing a single dataObject and would like shape and size
statistics reported as well, pass doExtras as True (otherwise these
stats will be omitted).
"""
self.title = 'General Statistics'
self.is_one_data_set = False
# if we don't have comparison information and we do have a single data set
if (diffInfoObject is None) and (dataObject is not None) :
# we have one data set and should save the prefix/suffix information
self.is_one_data_set = True
self.do_extras = doExtras
self.desc_text = dataSetDescription
# grab the valid data for some calculations
tempGoodData = dataObject.data[dataObject.masks.valid_mask]
noData = (tempGoodData.size <= 0) or (len(dataObject.data.shape) <= 0)
# fill in our statistics
self.missing_value = dataObject.select_fill_value()
self.max = np.max(tempGoodData) if not noData else np.nan
self.min = np.min(tempGoodData) if not noData else np.nan
self.mean = np.mean(tempGoodData) if not noData else np.nan
self.median = np.median(tempGoodData) if not noData else np.nan
self.std_val = np.std(tempGoodData) if not noData else np.nan
# also calculate the invalid points
self.spatially_invalid_pts_ignored = np.sum(dataObject.masks.ignore_mask)
# if we should also do extra stats, do so
if (doExtras) :
self.num_data_points = dataObject.masks.missing_mask.size if not noData else 0
self.shape = dataObject.masks.missing_mask.shape if not dataObject.is_scalar else "a single scalar value"
# if we have a comparison object analyze the data associated with that comparison
elif diffInfoObject is not None :
noData = len(diffInfoObject.a_data_object.data.shape) <= 0
# analyze each of the original data sets that are being compared
self.a_gen_stats = GeneralStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a")
self.b_gen_stats = GeneralStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b")
# fill in our statistics
self.epsilon = diffInfoObject.epsilon_value
self.epsilon_percent = diffInfoObject.epsilon_percent
self.num_data_points = diffInfoObject.a_data_object.masks.missing_mask.size if not noData else 0
self.shape = diffInfoObject.a_data_object.masks.missing_mask.shape
# if we have at least one scalar, we need to build the shape info differently
if diffInfoObject.a_data_object.is_scalar or diffInfoObject.b_data_object.is_scalar :
if diffInfoObject.a_data_object.is_scalar and diffInfoObject.b_data_object.is_scalar :
self.shape = "a single scalar value"
elif diffInfoObject.a_data_object.is_scalar :
self.shape = "a single scalar value in A and " + str(diffInfoObject.b_data_object.masks.missing_mask.shape) + " in B"
elif diffInfoObject.b_data_object.is_scalar :
self.shape = str(diffInfoObject.a_data_object.masks.missing_mask.shape) + " in A and a single scalar value in B"
# also calculate the invalid points
self.spatially_invalid_pts_ignored_in_a = np.sum(diffInfoObject.a_data_object.masks.ignore_mask)
self.spatially_invalid_pts_ignored_in_b = np.sum(diffInfoObject.b_data_object.masks.ignore_mask)
else:
raise ValueError ("No data set was given when requesting general statistical analysis.")
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# if we only have stats for one data set
if self.is_one_data_set :
temp_prefix, temp_suffix = self.make_prefix_and_suffix(self.desc_text)
toReturn = {
temp_prefix + 'missing_value': self.missing_value,
'max' + temp_suffix: self.max,
'min' + temp_suffix: self.min,
'mean' + temp_suffix: self.mean,
'median' + temp_suffix: self.median,
'std_val' + temp_suffix: self.std_val,
'spatially_invalid_pts_ignored' + temp_suffix: self.spatially_invalid_pts_ignored,
}
if self.do_extras :
toReturn['num_data_points'] = self.num_data_points
toReturn['shape'] = self.shape
# otherwise we must have stats for a comparison
else :
toReturn = {
'epsilon': self.epsilon,
'epsilon_percent': self.epsilon_percent,
'num_data_points': self.num_data_points,
'shape': self.shape,
}
a_dict = self.a_gen_stats.dictionary_form()
toReturn.update(a_dict)
b_dict = self.b_gen_stats.dictionary_form()
toReturn.update(b_dict)
return toReturn
@staticmethod
def doc_strings(inspect=False) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return GeneralStatistics._doc_strings if not inspect else GeneralStatistics._doc_strings_inspect
class NumericalComparisonStatistics (StatisticalData) :
"""
A class representing more complex comparisons between a pair of data sets.
includes the following statistics:
correlation - the Pearson correlation r-coefficient from comparing finite values of the sets
r_squared_correlation - the square of the correlation
diff_outside_epsilon_count - the number of points that fall outside the acceptable epsilon settings
diff_outside_epsilon_fraction - the fraction of points that fall outside the acceptable epsilon settings
perfect_match_count - the number of points that match perfectly between the sets
perfect_match_fraction - the fraction of points that match perfectly between the sets
mismatch_points_count - the number of points that have possible issues according to the current analysis
mismatch_points_fraction - the fraction of points that have possible issues according to the current analysis
It may also contain additional statistics. This is indicated by the does_include_simple boolean.
The possible additional statistics include:
rms_val - the root mean square of the difference between the two data sets
std_val - the standard deviation of the difference between the two data sets
mean_diff - the mean of the absolute difference between the two data sets
median_diff - the median of the absolute difference between the two data sets
max_diff - the maximum of the absolute difference between the two data sets
mean_delta - the mean of the difference between the two data sets
median_delta - the median of the difference between the two data sets
max_delta - the maximum of the difference between the two data sets
min_delta - the minimum of the difference between the two data sets
These statistics can also be generated separately in dictionary form by calling the
basic_analysis method on this class.
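for example, basic_analysis can be used on its own (a sketch; the diff_data and
valid_mask names here are only illustrative):
    temp_stats = NumericalComparisonStatistics.basic_analysis(diff_data, valid_mask)
    temp_stats['rms_val']    # root mean square of the masked differences
    temp_stats['max_delta']  # largest difference value within the mask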
"""
_doc_strings = {
'correlation': "Pearson correlation r-coefficient (-1.0 to 1.0) for finite values of A and B",
'diff_outside_epsilon_count': "number of finite differences falling outside acceptable epsilon definitions; " +
"note: this value includes data excluded by both epsilon and epsilon_percent if " +
"both have been defined",
'diff_outside_epsilon_fraction': "fraction of finite differences falling outside acceptable epsilon " +
"definitions (out of common_finite_count)",
'max_diff': "maximum absolute valued difference of the finite values",
'mean_diff': "mean of the absolute value difference of the finite values",
'median_diff': "median of the absolute value difference of the finite values",
'mean_delta': "mean of the subtractive difference of the finite values",
'median_delta': "median of the subtractive difference of the finite values",
'max_delta': "maximum of the subtractive difference (B file - A file) of the finite values",
'min_delta': "minimum of the subtractive difference (B file - A file) of the finite values",
'perfect_match_count': "number of perfectly matched finite data points between A and B",
'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)",
'rms_val': "root mean square (RMS) difference of finite values",
'r-squared correlation': "the square of the r correlation (see correlation)",
'std_val': "standard deviation of difference of finite values",
'mismatch_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' +
' or are unacceptable when compared according to the current epsilon definitions',
'mismatch_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' +
' or are unacceptable when compared according to the current epsilon definitions',
}
def __init__(self, diffInfoObject, include_basic_analysis=True) :
"""
build our comparison statistics based on the comparison
of two data sets
the include_basic_analysis flag indicates whether the statistics generated by the
basic_analysis method should also be generated
"""
self.title = 'Numerical Comparison Statistics'
# pull out some info we will use later
valid_in_both = diffInfoObject.diff_data_object.masks.valid_mask
aData = diffInfoObject.a_data_object.data
bData = diffInfoObject.b_data_object.data
total_num_finite_values = np.sum(valid_in_both) # just the finite values, not all data
noData = len(diffInfoObject.a_data_object.data.shape) <= 0
# fill in some simple statistics
self.diff_outside_epsilon_count = np.sum(diffInfoObject.diff_data_object.masks.outside_epsilon_mask)
self.perfect_match_count = NumericalComparisonStatistics._get_num_perfect(aData, bData,
goodMask=valid_in_both)
self.correlation = delta.compute_correlation(aData, bData, valid_in_both) if not noData else np.nan
self.r_squared_correlation = self.correlation * self.correlation if not noData else np.nan
self.mismatch_points_count = np.sum(diffInfoObject.diff_data_object.masks.mismatch_mask)
# calculate some more complex statistics, be careful not to divide by zero
self.mismatch_points_fraction = float(self.mismatch_points_count) / float(aData.size) if not noData else 0.0
self.diff_outside_epsilon_fraction = float(self.diff_outside_epsilon_count) / float(total_num_finite_values) if (total_num_finite_values > 0) else 0.0
self.perfect_match_fraction = float(self.perfect_match_count) / float(total_num_finite_values) if (total_num_finite_values > 0) else np.nan
# if desired, do the basic analysis
self.temp_analysis = NumericalComparisonStatistics.basic_analysis(diffInfoObject.diff_data_object.data, valid_in_both) if include_basic_analysis else { }
# pull out the basic analysis results, falling back to NaN for any statistic that
# was not calculated (either because there was no data or because
# include_basic_analysis was False)
self.rms_val      = self.temp_analysis.get('rms_val',      np.nan) if not noData else np.nan
self.std_val      = self.temp_analysis.get('std_val',      np.nan) if not noData else np.nan
self.mean_diff    = self.temp_analysis.get('mean_diff',    np.nan) if not noData else np.nan
self.median_diff  = self.temp_analysis.get('median_diff',  np.nan) if not noData else np.nan
self.max_diff     = self.temp_analysis.get('max_diff',     np.nan) if not noData else np.nan
self.mean_delta   = self.temp_analysis.get('mean_delta',   np.nan) if not noData else np.nan
self.median_delta = self.temp_analysis.get('median_delta', np.nan) if not noData else np.nan
self.max_delta    = self.temp_analysis.get('max_delta',    np.nan) if not noData else np.nan
self.min_delta    = self.temp_analysis.get('min_delta',    np.nan) if not noData else np.nan
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = {
'correlation': self.correlation,
'r-squared correlation': self.r_squared_correlation,
'diff_outside_epsilon_count': self.diff_outside_epsilon_count,
'diff_outside_epsilon_fraction': self.diff_outside_epsilon_fraction,
'perfect_match_count': self.perfect_match_count,
'perfect_match_fraction': self.perfect_match_fraction,
'mismatch_points_count': self.mismatch_points_count,
'mismatch_points_fraction': self.mismatch_points_fraction
}
toReturn.update(self.temp_analysis)
return toReturn
@staticmethod
def doc_strings( ) :
"""get documentation strings that match the dictionary form of the statistics
"""
return NumericalComparisonStatistics._doc_strings
@staticmethod
def basic_analysis(diffData, valid_mask):
"""do some very minimal analysis of the differences
"""
# if everything's invalid, stop now
noData = np.sum(valid_mask) <= 0
# calculate and return statistics
root_mean_square_value = delta.calculate_root_mean_square(diffData, valid_mask) if not noData else np.nan
tempDiffData = diffData[valid_mask] if not noData else None
absDiffData = np.abs(tempDiffData) if not noData else None
return { 'rms_val': root_mean_square_value,
'std_val': np.std(tempDiffData) if not noData else np.nan,
'mean_diff': np.mean(absDiffData) if not noData else np.nan,
'median_diff': np.median(absDiffData) if not noData else np.nan,
'max_diff': np.max(absDiffData) if not noData else np.nan,
'mean_delta': np.mean(tempDiffData) if not noData else np.nan,
'median_delta': np.median(tempDiffData) if not noData else np.nan,
'max_delta': np.max(tempDiffData) if not noData else np.nan,
'min_delta': np.min(tempDiffData) if not noData else np.nan,
}
@staticmethod
def _get_num_perfect(aData, bData, goodMask=None):
"""
get the number of data points where
the value in A perfectly matches the value in B
"""
numPerfect = 0
if goodMask is None :
numPerfect = np.sum(aData == bData)
else :
numPerfect = np.sum(aData[goodMask] == bData[goodMask])
return numPerfect
class StatisticalAnalysis (StatisticalData) :
"""
This class represents a complete statistical analysis of two data sets.
It includes the following sets of statistics:
general - a GeneralStatistics object
comparison - a NumericalComparisonStatistics object
notANumber - a NotANumberStatistics object
missingValue - a MissingValueStatistics object
finiteData - a FiniteDataStatistics object
It can also provide a dictionary form of the statistics and
documentation for the statistics.
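typical usage (a sketch; a_array and b_array stand in for real numpy arrays and
the fill value and epsilon are only example settings):
    analysis = StatisticalAnalysis.withSimpleData(a_array, b_array,
                                                  a_missing_value=-999.0,
                                                  b_missing_value=-999.0,
                                                  epsilon=0.0001)
    stats    = analysis.dictionary_form()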
"""
def __init__ (self) :
"""
this is a blank constructor to support our new class method creation pattern
"""
self.title = "Statistical Summary"
@classmethod
def withSimpleData (in_class,
a_data, b_data,
a_missing_value=None, b_missing_value=None,
a_ignore_mask=None, b_ignore_mask=None,
epsilon=0., epsilon_percent=None) :
"""
do a full statistical analysis of the data, after building the data objects
"""
new_object = in_class()
aDataObject = dataobj.DataObject(a_data, fillValue=a_missing_value, ignoreMask=a_ignore_mask)
bDataObject = dataobj.DataObject(b_data, fillValue=b_missing_value, ignoreMask=b_ignore_mask)
diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject,
epsilonValue=epsilon, epsilonPercent=epsilon_percent)
new_object._create_stats(diffInfo)
return new_object
@classmethod
def withDataObjects (in_class,
a_data_object, b_data_object,
epsilon=0., epsilon_percent=None) :
"""
do a full statistical analysis of the data, using the given data objects
"""
new_object = in_class()
diffInfo = dataobj.DiffInfoObject(a_data_object, b_data_object,
epsilonValue=epsilon, epsilonPercent=epsilon_percent)
new_object._create_stats(diffInfo)
return new_object
def _create_stats(self, diffInfoObject) :
"""
build and set all of the statistics sets
"""
self.general = GeneralStatistics (diffInfoObject=diffInfoObject)
self.comparison = NumericalComparisonStatistics(diffInfoObject)
self.notANumber = NotANumberStatistics (diffInfoObject=diffInfoObject)
self.missingValue = MissingValueStatistics (diffInfoObject=diffInfoObject)
self.finiteData = FiniteDataStatistics (diffInfoObject=diffInfoObject)
def check_pass_or_fail(self,
epsilon_failure_tolerance =np.nan, epsilon_failure_tolerance_default =None,
non_finite_data_tolerance =np.nan, non_finite_data_tolerance_default =None,
total_data_failure_tolerance=np.nan, total_data_failure_tolerance_default=None,
min_acceptable_r_squared =np.nan, min_acceptable_r_squared_default =None
) :
"""
Check whether the variable passed analysis, failed analysis, or
did not need to be quantitatively tested
also returns information about the fractions of failure
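the return value is a tuple of the form
    (didPass, epsilon_failure_fraction, non_finite_difference_fraction, r_squared_value)
where didPass is True, False, or None (None meaning no tolerances were given, so
nothing was quantitatively tested); for example (the tolerances shown are only
illustrative and "analysis" stands in for a StatisticalAnalysis instance):
    passed, eps_frac, nonfinite_frac, r_sq = analysis.check_pass_or_fail(
                                                 epsilon_failure_tolerance=0.05,
                                                 non_finite_data_tolerance=0.01)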
"""
passValues = [ ]
# test the epsilon value tolerance
# get the tolerance for failures compared to epsilon
epsilonTolerance = epsilon_failure_tolerance if epsilon_failure_tolerance is not np.nan else epsilon_failure_tolerance_default
# did we fail based on the epsilon?
failed_fraction = self.comparison.diff_outside_epsilon_fraction
passed_epsilon = None if (epsilonTolerance is None) else (failed_fraction <= epsilonTolerance)
passValues.append(passed_epsilon)
# test the nonfinite tolerance
# get the tolerance for failures in amount of nonfinite data (in spatially valid areas)
nonfiniteTolerance = non_finite_data_tolerance if non_finite_data_tolerance is not np.nan else non_finite_data_tolerance_default
# did we fail based on nonfinite data
non_finite_diff_fraction = self.finiteData.finite_in_only_one_fraction
passed_nonfinite = None if (nonfiniteTolerance is None) else (non_finite_diff_fraction <= nonfiniteTolerance)
passValues.append(passed_nonfinite)
# test if the total failed percentage is acceptable
# get the total percentage of failed data that is acceptable
totalFailTolerance = total_data_failure_tolerance if total_data_failure_tolerance is not np.nan else total_data_failure_tolerance_default
# did we fail based on all data failures?
passed_all_percentage = None if (totalFailTolerance is None) else ((non_finite_diff_fraction + failed_fraction) <= totalFailTolerance)
passValues.append(passed_all_percentage)
# test the r-squared correlation coefficient
# get the minimum acceptable r-squared correlation coefficient
min_r_squared = min_acceptable_r_squared if (min_acceptable_r_squared is not np.nan) else min_acceptable_r_squared_default
# did we fail based on the r-squared correlation coefficient?
r_squared_value = None if (min_r_squared is None) else self.comparison.r_squared_correlation
passed_r_squared = None if (min_r_squared is None) else (r_squared_value >= min_r_squared)
passValues.append(passed_r_squared)
# figure out the overall pass/fail result
didPass = None
for passValue in passValues :
# if passValue isn't none, we need to update didPass
if passValue is not None :
if didPass is not None :
didPass = passValue and didPass
else :
didPass = passValue
return didPass, failed_fraction, non_finite_diff_fraction, r_squared_value
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# build a dictionary of all our statistics
toReturn[self.general.title] = self.general.dictionary_form()
toReturn[self.comparison.title] = self.comparison.dictionary_form()
toReturn[self.notANumber.title] = self.notANumber.dictionary_form()
toReturn[self.missingValue.title] = self.missingValue.dictionary_form()
toReturn[self.finiteData.title] = self.finiteData.dictionary_form()
return toReturn
def doc_strings(self) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return StatisticalAnalysis.doc_strings( )
# TODO, use this method instead of the dictionary at the bottom of this module
@staticmethod
def doc_strings( ) :
"""get documentation strings that match the dictionary form of the statistics
"""
toReturn = { }
toReturn.update( GeneralStatistics.doc_strings())
toReturn.update(NumericalComparisonStatistics.doc_strings())
toReturn.update( NotANumberStatistics.doc_strings())
toReturn.update( MissingValueStatistics.doc_strings())
toReturn.update( FiniteDataStatistics.doc_strings())
return toReturn
class StatisticalInspectionAnalysis (StatisticalData) :
"""
This class represents a complete statistical analysis of a data set.
It includes the following sets of statistics:
general - a GeneralStatistics object
notANumber - a NotANumberStatistics object
missingValue - a MissingValueStatistics object
finiteData - a FiniteDataStatistics object
It can also provide a dictionary form of the statistics and
documentation for the statistics.
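typical usage (a sketch; data_array stands in for a real numpy array and the
fill value is only an example):
    inspection = StatisticalInspectionAnalysis.withSimpleData(data_array,
                                                              missingValue=-999.0)
    stats      = inspection.dictionary_form()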
"""
def __init__ (self) :
"""
this is a blank constructor to support our new class method creation pattern
"""
self.title = "Statistical Summary"
@classmethod
def withSimpleData (in_class,
dataSet,
missingValue=None,
ignoreMask=None) :
"""
do a full statistical analysis of the data, after building the data object
"""
new_object = in_class()
dataObject = dataobj.DataObject(dataSet, fillValue=missingValue, ignoreMask=ignoreMask)
dataObject.self_analysis()
new_object._create_stats(dataObject)
return new_object
@classmethod
def withDataObjects (in_class,
dataObject) :
"""
do a full statistical analysis of the data, using the given data object
"""
new_object = in_class()
dataObject.self_analysis()
new_object._create_stats(dataObject)
return new_object
def _create_stats(self, dataObject) :
"""
build and set all of the statistics sets
"""
self.general = GeneralStatistics( dataObject=dataObject,
doExtras=True)
self.notANumber = NotANumberStatistics( dataObject=dataObject)
self.missingValue = MissingValueStatistics(dataObject=dataObject)
self.finiteData = FiniteDataStatistics( dataObject=dataObject)
def dictionary_form(self) :
"""
get a dictionary form of the statistics
"""
toReturn = { }
# build a dictionary of all our statistics
toReturn[self.general.title] = self.general.dictionary_form()
toReturn[self.notANumber.title] = self.notANumber.dictionary_form()
toReturn[self.missingValue.title] = self.missingValue.dictionary_form()
toReturn[self.finiteData.title] = self.finiteData.dictionary_form()
return toReturn
def doc_strings(self) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
return StatisticalInspectionAnalysis.doc_strings( )
# TODO, use this method instead of the dictionary at the bottom of this module
@staticmethod
def doc_strings( ) :
"""
get documentation strings that match the
dictionary form of the statistics
"""
toReturn = { }
toReturn.update( GeneralStatistics.doc_strings(inspect=True))
toReturn.update( NotANumberStatistics.doc_strings(inspect=True))
toReturn.update(MissingValueStatistics.doc_strings(inspect=True))
toReturn.update( FiniteDataStatistics.doc_strings(inspect=True))
return toReturn
# -------------------------- documentation -----------------------------
# TODO, can this be moved?
STATISTICS_DOC_STR = '\n'.join( '%s:\n %s' % x for x in sorted(list( StatisticalAnalysis.doc_strings().items())) ) + '\n'
INSP_STATISTICS_DOC_STR = '\n'.join( '%s:\n %s' % x for x in sorted(list(StatisticalInspectionAnalysis.doc_strings().items())) ) + '\n'
if __name__=='__main__':
import doctest
doctest.testmod()