-
(no author) authored
git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@185 8a9318a1-56ba-4d59-b755-99d26321be01
(no author) authoredgit-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@185 8a9318a1-56ba-4d59-b755-99d26321be01
stats.py 45.55 KiB
#!/usr/bin/env python
# encoding: utf-8
"""
This module handles statistical analysis of data sets. The code present in
this module is based on previous versions of delta.py.
Created by evas Apr 2010.
Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved.
"""
import glance.data as dataobj
import glance.delta as delta
import numpy as np
# TODO, I don't like this design, but it's what I could come up
# with for now. Reconsider this again later.
class StatisticalData(object):
    """
    Abstract base for a group of statistics produced by examining data.

    Every statistics object carries a ``title`` and can hand back two
    dictionaries: one holding the statistic values (see dictionary_form)
    and one holding human readable descriptions of those statistics
    (see doc_strings).

    Subclasses decide which actual statistics they hold.
    """

    def __init__(self):
        """
        minimal constructor; the only thing it does is create the
        title attribute (set to None)
        """
        self.title = None

    def dictionary_form(self):
        """
        return the statistics as a dictionary

        the base class holds no statistics, so this is an empty dictionary

        note: child classes should override this method
        """
        return dict()

    @staticmethod
    def doc_strings():
        """
        return the documentation strings that correspond to the
        entries of the dictionary built by dictionary_form

        the base class documents nothing, so this is an empty dictionary

        note: child classes should override this method
        """
        return dict()
class MissingValueStatistics(StatisticalData):
    """
    A class representing information about where fill values are found
    in a pair of data sets.

    includes the following statistics:

        a_missing_count         - count of points that are missing in the a data set
        a_missing_fraction      - fraction of points that are missing in the a data set
        b_missing_count         - count of points that are missing in the b data set
        b_missing_fraction      - fraction of points that are missing in the b data set
        common_missing_count    - count of points that are missing in both data sets
        common_missing_fraction - fraction of points that are missing in both data sets

    Every fraction is taken out of the number of elements in the a data
    set's missing mask; the fractions are 0.0 when that mask is empty.
    """

    _doc_strings = {
        'a_missing_count': "number of values flagged missing in A",
        'a_missing_fraction': "fraction of values flagged missing in A",
        'b_missing_count': "number of values flagged missing in B",
        'b_missing_fraction': "fraction of values flagged missing in B",
        'common_missing_count': "number of missing values in common between A and B",
        'common_missing_fraction': "fraction of missing values in common between A and B"
    }

    def __init__(self, diffInfoObject):
        """
        build our fill value related statistics based on the comparison
        of two data sets

        diffInfoObject must provide a_data_object.masks.missing_mask and
        b_data_object.masks.missing_mask; the two masks must have the same
        shape (checked with an assert)
        """
        self.title = 'Missing Value Statistics'

        # pull out some masks for later use
        # (assumed to be boolean numpy arrays, since they are summed and and-ed -- TODO confirm)
        a_missing_mask = diffInfoObject.a_data_object.masks.missing_mask
        b_missing_mask = diffInfoObject.b_data_object.masks.missing_mask
        assert(a_missing_mask.shape == b_missing_mask.shape)

        # figure out some basic statistics
        self.a_missing_count = np.sum(a_missing_mask)
        self.b_missing_count = np.sum(b_missing_mask)
        self.common_missing_count = np.sum(a_missing_mask & b_missing_mask)

        # make the assumption that a and b are the same size and only use the size of a's mask
        total_num_values = a_missing_mask.size

        # figure out some fraction statistics
        if total_num_values > 0:
            denominator = float(total_num_values)
            self.a_missing_fraction = float(self.a_missing_count) / denominator
            self.b_missing_fraction = float(self.b_missing_count) / denominator
            self.common_missing_fraction = float(self.common_missing_count) / denominator
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            # (the same fallback NumericalComparisonStatistics uses for its fractions)
            self.a_missing_fraction = 0.0
            self.b_missing_fraction = 0.0
            self.common_missing_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'a_missing_count': self.a_missing_count,
            'a_missing_fraction': self.a_missing_fraction,
            'b_missing_count': self.b_missing_count,
            'b_missing_fraction': self.b_missing_fraction,
            'common_missing_count': self.common_missing_count,
            'common_missing_fraction': self.common_missing_fraction
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return MissingValueStatistics._doc_strings
class MissingValueInspectionStatistics(StatisticalData):
    """
    A class representing information about where fill values are found
    in a single data set.

    includes the following statistics:

        missing_count    - count of points that are missing in the data set
        missing_fraction - fraction of points that are missing in the data set
                           (0.0 when the data set has no points at all)
    """

    _doc_strings = {
        'missing_count': "number of values flagged missing",
        'missing_fraction': "fraction of values flagged missing",
    }

    def __init__(self, dataObject):
        """
        build our fill value related statistics based on the data set

        dataObject must provide masks.missing_mask
        """
        self.title = 'Missing Value Statistics'

        # pull out a mask for later use
        # (assumed to be a boolean numpy array, since it is summed -- TODO confirm)
        missing_mask = dataObject.masks.missing_mask

        # figure out some basic statistics
        self.missing_count = np.sum(missing_mask)
        if missing_mask.size > 0:
            self.missing_fraction = float(self.missing_count) / float(missing_mask.size)
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            self.missing_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'missing_count': self.missing_count,
            'missing_fraction': self.missing_fraction,
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return MissingValueInspectionStatistics._doc_strings
class FiniteDataStatistics(StatisticalData):
    """
    A class representing information about where finite values are found
    in a pair of data sets.

    includes the following statistics:

        a_finite_count              - the number of finite data values in the a data set
        a_finite_fraction           - the fraction of finite data values in the a data set
        b_finite_count              - the number of finite data values in the b data set
        b_finite_fraction           - the fraction of finite data values in the b data set
        common_finite_count         - the number of finite values the two data sets have in common
        common_finite_fraction      - the fraction of finite values the two data sets have in common
        finite_in_only_one_count    - the number of points that are finite in only one of the two sets
        finite_in_only_one_fraction - the fraction of points that are finite in only one of the two sets

    Every fraction is taken out of the number of elements in the a data
    set's valid mask; the fractions are 0.0 when that mask is empty.
    """

    _doc_strings = {
        'a_finite_count': "number of finite values in A",
        'a_finite_fraction': "fraction of finite values in A (out of all data points in A)",
        'b_finite_count': "number of finite values in B",
        'b_finite_fraction': "fraction of finite values in B (out of all data points in B)",
        'common_finite_count': "number of finite values in common between A and B",
        'common_finite_fraction': "fraction of finite values in common between A and B",
        'finite_in_only_one_count': "number of values that changed finite-ness between A and B; " +
                                    "only the common spatially valid area is considerd for this statistic",
        'finite_in_only_one_fraction': "fraction of values that changed finite-ness between A and B; " +
                                       "only the common spatially valid area is considerd for this statistic"
    }

    def __init__(self, diffInfoObject):
        """
        build our finite data related statistics based on the comparison
        of two data sets

        diffInfoObject must provide a_data_object.masks.valid_mask,
        b_data_object.masks.valid_mask and diff_data_object.masks.ignore_mask;
        all three masks must have the same shape (checked with asserts)
        """
        self.title = 'Finite Data Statistics'

        # pull out some data we will use later
        # (masks are assumed to be boolean numpy arrays, since ^, & and ~ are applied -- TODO confirm)
        a_is_finite_mask = diffInfoObject.a_data_object.masks.valid_mask
        b_is_finite_mask = diffInfoObject.b_data_object.masks.valid_mask
        common_ignore_mask = diffInfoObject.diff_data_object.masks.ignore_mask
        assert(a_is_finite_mask.shape == b_is_finite_mask.shape)
        assert(b_is_finite_mask.shape == common_ignore_mask.shape)

        # figure out some basic statistics
        self.a_finite_count = np.sum(a_is_finite_mask)
        self.b_finite_count = np.sum(b_is_finite_mask)
        self.common_finite_count = np.sum(a_is_finite_mask & b_is_finite_mask)
        # use an exclusive or to check which points are finite in only one of the two data sets;
        # points flagged in the diff object's ignore mask are left out of this count
        self.finite_in_only_one_count = np.sum((a_is_finite_mask ^ b_is_finite_mask) & ~common_ignore_mask)

        # make the assumption that a and b are the same size and only use the size of a's mask
        total_num_values = a_is_finite_mask.size

        # calculate some fractional statistics
        if total_num_values > 0:
            denominator = float(total_num_values)
            self.a_finite_fraction = float(self.a_finite_count) / denominator
            self.b_finite_fraction = float(self.b_finite_count) / denominator
            self.common_finite_fraction = float(self.common_finite_count) / denominator
            self.finite_in_only_one_fraction = float(self.finite_in_only_one_count) / denominator
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            # (the same fallback NumericalComparisonStatistics uses for its fractions)
            self.a_finite_fraction = 0.0
            self.b_finite_fraction = 0.0
            self.common_finite_fraction = 0.0
            self.finite_in_only_one_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'a_finite_count': self.a_finite_count,
            'a_finite_fraction': self.a_finite_fraction,
            'b_finite_count': self.b_finite_count,
            'b_finite_fraction': self.b_finite_fraction,
            'common_finite_count': self.common_finite_count,
            'common_finite_fraction': self.common_finite_fraction,
            'finite_in_only_one_count': self.finite_in_only_one_count,
            'finite_in_only_one_fraction': self.finite_in_only_one_fraction,
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return FiniteDataStatistics._doc_strings
class FiniteDataInspectionStatistics(StatisticalData):
    """
    A class representing information about where finite values are found
    in a data set.

    includes the following statistics:

        finite_count    - the number of finite data values in the data set
        finite_fraction - the fraction of finite data values in the data set
                          (0.0 when the data set has no points at all)
    """

    _doc_strings = {
        'finite_count': "number of finite values",
        'finite_fraction': "fraction of finite values (out of all data points in set)",
    }

    def __init__(self, dataObject):
        """
        build our finite data related statistics based on the data set

        dataObject must provide masks.valid_mask
        """
        self.title = 'Finite Data Statistics'

        # pull out some data we will use later
        # (assumed to be a boolean numpy array, since it is summed -- TODO confirm)
        is_finite_mask = dataObject.masks.valid_mask

        # figure out some basic statistics
        self.finite_count = np.sum(is_finite_mask)
        if is_finite_mask.size > 0:
            self.finite_fraction = float(self.finite_count) / float(is_finite_mask.size)
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            self.finite_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'finite_count': self.finite_count,
            'finite_fraction': self.finite_fraction,
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return FiniteDataInspectionStatistics._doc_strings
class NotANumberStatistics(StatisticalData):
    """
    A class representing information about where non-finite values are found
    in a pair of data sets.

    includes the following statistics:

        a_nan_count         - the number of non finite values that are present in the a data set
        a_nan_fraction      - the fraction of non finite values that are present in the a data set
        b_nan_count         - the number of non finite values that are present in the b data set
        b_nan_fraction      - the fraction of non finite values that are present in the b data set
        common_nan_count    - the number of non finite values that are shared between the data sets
        common_nan_fraction - the fraction of non finite values that are shared between the data sets

    Every fraction is taken out of the number of elements in the a data
    set's non-finite mask; the fractions are 0.0 when that mask is empty.
    """

    _doc_strings = {
        'a_nan_count': "number of NaNs in A",
        'a_nan_fraction': "fraction of NaNs in A",
        'b_nan_count': "number of NaNs in B",
        'b_nan_fraction': "fraction of NaNs in B",
        'common_nan_count': "number of NaNs in common between A and B",
        'common_nan_fraction': "fraction of NaNs in common between A and B"
    }

    def __init__(self, diffInfoObject):
        """
        build our nonfinite data related statistics based on the comparison
        of two data sets

        diffInfoObject must provide a_data_object.masks.non_finite_mask and
        b_data_object.masks.non_finite_mask; the two masks must have the
        same shape (checked with an assert)
        """
        self.title = 'NaN Statistics'

        # pull out some masks we will use
        # (assumed to be boolean numpy arrays, since they are summed and and-ed -- TODO confirm)
        a_nan_mask = diffInfoObject.a_data_object.masks.non_finite_mask
        b_nan_mask = diffInfoObject.b_data_object.masks.non_finite_mask
        assert(a_nan_mask.shape == b_nan_mask.shape)

        # get some basic statistics
        self.a_nan_count = np.sum(a_nan_mask)
        self.b_nan_count = np.sum(b_nan_mask)
        self.common_nan_count = np.sum(a_nan_mask & b_nan_mask)

        # make the assumption that a and b are the same size and only use the size of a
        total_num_values = a_nan_mask.size

        # calculate some fractional statistics
        if total_num_values > 0:
            denominator = float(total_num_values)
            self.a_nan_fraction = float(self.a_nan_count) / denominator
            self.b_nan_fraction = float(self.b_nan_count) / denominator
            self.common_nan_fraction = float(self.common_nan_count) / denominator
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            # (the same fallback NumericalComparisonStatistics uses for its fractions)
            self.a_nan_fraction = 0.0
            self.b_nan_fraction = 0.0
            self.common_nan_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'a_nan_count': self.a_nan_count,
            'a_nan_fraction': self.a_nan_fraction,
            'b_nan_count': self.b_nan_count,
            'b_nan_fraction': self.b_nan_fraction,
            'common_nan_count': self.common_nan_count,
            'common_nan_fraction': self.common_nan_fraction
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return NotANumberStatistics._doc_strings
class NotANumberInspectionStatistics(StatisticalData):
    """
    A class representing information about where non-finite values are found
    in a data set.

    includes the following statistics:

        nan_count    - the number of non finite values that are present in the data set
        nan_fraction - the fraction of non finite values that are present in the data set
                       (0.0 when the data set has no points at all)
    """

    _doc_strings = {
        'nan_count': "number of NaNs",
        'nan_fraction': "fraction of NaNs",
    }

    def __init__(self, dataObject):
        """
        build our nonfinite data related statistics based on the data set

        dataObject must provide masks.non_finite_mask
        """
        self.title = 'NaN Statistics'

        # pull out a mask we will use
        # (assumed to be a boolean numpy array, since it is summed -- TODO confirm)
        nan_mask = dataObject.masks.non_finite_mask

        # get some basic statistics
        self.nan_count = np.sum(nan_mask)
        if nan_mask.size > 0:
            self.nan_fraction = float(self.nan_count) / float(nan_mask.size)
        else:
            # zero-size data: report 0.0 instead of raising ZeroDivisionError
            self.nan_fraction = 0.0

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'nan_count': self.nan_count,
            'nan_fraction': self.nan_fraction,
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return NotANumberInspectionStatistics._doc_strings
class GeneralStatistics(StatisticalData):
    """
    General, descriptive information about a pair of data sets.

    The statistics held by this class are:

        a_missing_value - the fill data value in the a set
        b_missing_value - the fill data value in the b set
        epsilon         - the fixed epsilon value
        epsilon_percent - the percentage of the a set that will be used for comparison
        max_a           - the maximum value in the a set
        max_b           - the maximum value in the b set
        min_a           - the minimum value in the a set
        min_b           - the minimum value in the b set
        num_data_points - the total number of data points in each of the sets
        shape           - the shape of each of the data sets
        spatially_invalid_pts_ignored_in_a - number of points corresponding to invalid lat/lon in a set
        spatially_invalid_pts_ignored_in_b - number of points corresponding to invalid lat/lon in b set
    """

    _doc_strings = {
        'a_missing_value': 'the value that is considered "missing" data when it is found in A',
        'b_missing_value': 'the value that is considered "missing" data when it is found in B',
        'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable',
        'epsilon_percent': "the percentage of difference (of A's value) that is acceptable between A and B (optional)",
        'max_a': 'the maximum finite, non-missing value found in A',
        'max_b': 'the maximum finite, non-missing value found in B',
        'min_a': 'the minimum finite, non-missing value found in A',
        'min_b': 'the minimum finite, non-missing value found in B',
        'num_data_points': "number of data values in A",
        'shape': "shape of A",
        'spatially_invalid_pts_ignored_in_a': (
            'number of points with invalid latitude/longitude information in A that were'
            ' ignored for the purposes of data analysis and presentation'
        ),
        'spatially_invalid_pts_ignored_in_b': (
            'number of points with invalid latitude/longitude information in B that were'
            ' ignored for the purposes of data analysis and presentation'
        ),
    }

    def __init__(self, diffInfoObject):
        """
        Fill in the general statistics from the comparison of two data sets.

        Reads the missing, ignore and valid masks of both
        diffInfoObject.a_data_object and diffInfoObject.b_data_object
        (all six must share one shape, which is asserted), along with the
        epsilon settings stored on diffInfoObject itself.
        """
        self.title = 'General Statistics'

        a_object = diffInfoObject.a_data_object
        b_object = diffInfoObject.b_data_object

        # gather the masks we need from each side
        a_missing_mask = a_object.masks.missing_mask
        b_missing_mask = b_object.masks.missing_mask
        ignore_in_a_mask = a_object.masks.ignore_mask
        ignore_in_b_mask = b_object.masks.ignore_mask
        good_in_a_mask = a_object.masks.valid_mask
        good_in_b_mask = b_object.masks.valid_mask

        # every mask has to agree with the shape of a's missing mask
        expected_shape = a_missing_mask.shape
        for one_mask in (b_missing_mask,
                         ignore_in_a_mask, ignore_in_b_mask,
                         good_in_a_mask, good_in_b_mask):
            assert one_mask.shape == expected_shape

        # the fill values and the epsilon settings are simply passed through
        self.a_missing_value = a_object.select_fill_value()
        self.b_missing_value = b_object.select_fill_value()
        self.epsilon = diffInfoObject.epsilon_value
        self.epsilon_percent = diffInfoObject.epsilon_percent

        # extremes of each data set, restricted to that set's valid mask
        a_values = a_object.data
        self.max_a = delta.max_with_mask(a_values, good_in_a_mask)
        self.min_a = delta.min_with_mask(a_values, good_in_a_mask)
        b_values = b_object.data
        self.max_b = delta.max_with_mask(b_values, good_in_b_mask)
        self.min_b = delta.min_with_mask(b_values, good_in_b_mask)

        # size information comes from a's missing mask
        self.num_data_points = a_missing_mask.size
        self.shape = expected_shape

        # how many points each side flags in its ignore mask
        self.spatially_invalid_pts_ignored_in_a = np.sum(ignore_in_a_mask)
        self.spatially_invalid_pts_ignored_in_b = np.sum(ignore_in_b_mask)

    def dictionary_form(self):
        """
        Return the statistics as a dictionary keyed by statistic name.
        """
        return {
            'a_missing_value': self.a_missing_value,
            'b_missing_value': self.b_missing_value,
            'epsilon': self.epsilon,
            'epsilon_percent': self.epsilon_percent,
            'max_a': self.max_a,
            'max_b': self.max_b,
            'min_a': self.min_a,
            'min_b': self.min_b,
            'num_data_points': self.num_data_points,
            'shape': self.shape,
            'spatially_invalid_pts_ignored_in_a': self.spatially_invalid_pts_ignored_in_a,
            'spatially_invalid_pts_ignored_in_b': self.spatially_invalid_pts_ignored_in_b,
        }

    @staticmethod
    def doc_strings():
        """
        Return the documentation strings that describe the entries
        of the dictionary form of these statistics.
        """
        return GeneralStatistics._doc_strings
class GeneralInspectionStatistics(StatisticalData):
    """
    A class representing general information about a data set.

    includes the following statistics:

        missing_value   - the fill data value
        max             - the maximum value
        min             - the minimum value
        num_data_points - the total number of data points
        shape           - the shape of the data
        spatially_invalid_pts_ignored - number of points corresponding to invalid lat/lon in the set
                                        (optional if no lon/lat mapped)
        mean            - the mean of the data values
        median          - the median of the data values
        std_val         - the standard deviation of the data values

    max, min, mean, median and std_val are computed over the points selected
    by the data object's valid mask; all five are NaN when that mask selects
    no points.
    """

    _doc_strings = {
        'missing_value': 'the value that is considered \"missing\" data when it is found in the data',
        'max': 'the maximum finite, non-missing value found in the data',
        'min': 'the minimum finite, non-missing value found in the data',
        'num_data_points': "number of data points (may be valid or invalid data)",
        'shape': "shape of the data",
        'spatially_invalid_pts_ignored': 'number of points with invalid latitude/longitude information ' +
                                         'in the data that were' +
                                         ' ignored for the purposes of data analysis and presentation',
        'mean': "the mean of all finite, non-missing values in the data",
        'median': "the median of all finite, non-missing values in the data",
        'std_val': "the standard deviation of all finite, non-missing values in the data",
    }

    def __init__(self, dataObject):
        """
        build our general statistics based on the data set

        dataObject must provide masks.missing_mask, masks.ignore_mask,
        masks.valid_mask, data and select_fill_value()
        """
        self.title = 'General Statistics'

        # pull out some masks for later use
        missing_mask = dataObject.masks.missing_mask
        ignore_mask = dataObject.masks.ignore_mask
        good_mask = dataObject.masks.valid_mask

        # grab the valid data for some calculations
        tempGoodData = dataObject.data[good_mask]

        # get the number of data points
        total_num_values = missing_mask.size

        # fill in our statistics
        self.missing_value = dataObject.select_fill_value()
        if tempGoodData.size > 0:
            self.max = np.max(tempGoodData)
            self.min = np.min(tempGoodData)
            self.mean = np.mean(tempGoodData)
            self.median = np.median(tempGoodData)
            self.std_val = np.std(tempGoodData)
        else:
            # np.max / np.min raise ValueError on a zero-size array, so when
            # no point is valid fall back to NaN (the same placeholder
            # NumericalComparisonStatistics uses when it has no valid data)
            self.max = np.nan
            self.min = np.nan
            self.mean = np.nan
            self.median = np.nan
            self.std_val = np.nan
        self.num_data_points = total_num_values
        self.shape = missing_mask.shape

        # also calculate the invalid points
        self.spatially_invalid_pts_ignored = np.sum(ignore_mask)

    def dictionary_form(self):
        """
        get a dictionary form of the statistics, keyed by statistic name
        """
        toReturn = {
            'missing_value': self.missing_value,
            'max': self.max,
            'min': self.min,
            'mean': self.mean,
            'median': self.median,
            'std_val': self.std_val,
            'num_data_points': self.num_data_points,
            'shape': self.shape,
            'spatially_invalid_pts_ignored': self.spatially_invalid_pts_ignored,
        }
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return GeneralInspectionStatistics._doc_strings
class NumericalComparisonStatistics(StatisticalData):
    """
    A class representing more complex comparisons between a pair of data sets.

    includes the following statistics:

        correlation                   - the Pearson correlation r-coefficient from comparing finite values of the sets
        r_squared_correlation         - the square of the correlation
        diff_outside_epsilon_count    - the number of points that fall outside the acceptable epsilon settings
        diff_outside_epsilon_fraction - the fraction of points that fall outside the acceptable epsilon settings
        perfect_match_count           - the number of points that match perfectly between the sets
        perfect_match_fraction        - the fraction of points that match perfectly between the sets
        mismatch_points_count         - the number of points that have possible issues according to the current analysis
        mismatch_points_fraction      - the fraction of points that have possible issues according to the current analysis

    It may also contain additional statistics. This is indicated by the does_include_simple boolean.
    The possible additional statistics include:

        rms_val      - the root mean squared of the difference between the two data sets
        std_val      - the standard deviation of the difference between the two data sets
        mean_diff    - the mean of the absolute difference between the two data sets
        median_diff  - the median of the absolute difference between the two data sets
        max_diff     - the maximum of the absolute difference between the two data sets
        mean_delta   - the mean of the difference between the two data sets
        median_delta - the median of the difference between the two data sets
        max_delta    - the maximum of the difference between the two data sets
        min_delta    - the minimum of the difference between the two data sets

    These statistics can also be generated separately in dictionary form by calling the
    basic_analysis method on this class.

    note: in the dictionary form (and in the documentation strings) the
    r_squared_correlation attribute is stored under the key 'r-squared correlation'.
    """

    # NOTE(review): the 'correlation' text below advertises a 0.0-1.0 range while a
    # Pearson r can be negative; the text is emitted at runtime so it is left as is -- confirm
    _doc_strings = {
        'correlation': "Pearson correlation r-coefficient (0.0-1.0) for finite values of A and B",
        'diff_outside_epsilon_count': "number of finite differences falling outside acceptable epsilon definitions; " +
                                      "note: this value includes data excluded by both epsilon and epsilon_percent if " +
                                      "both have been defined",
        'diff_outside_epsilon_fraction': "fraction of finite differences falling outside acceptable epsilon " +
                                         "definitions (out of common_finite_count)",
        'max_diff': "maximum absolute valued difference of the finite values",
        'mean_diff': "mean of the absolute value difference of the finite values",
        'median_diff': "median of the absolute value difference of the finite values",
        'mean_delta': "mean of the subtractive difference of the finite values",
        'median_delta': "median of the subtractive difference of the finite values",
        'max_delta': "maximum finite data value from the data set of B file - A file",
        'min_delta': "minimum finite data value from the data set of B file - A file",
        'perfect_match_count': "number of perfectly matched finite data points between A and B",
        'perfect_match_fraction': "fraction of finite values perfectly matching between A and B (out of common_finite_count)",
        'rms_val': "root mean square (RMS) difference of finite values",
        'r-squared correlation': "the square of the r correlation (see correlation)",
        'std_val': "standard deviation of difference of finite values",
        'mismatch_points_count': 'number of points that differ in finite/missing status between the input data sets A and B,' +
                                 ' or are unacceptable when compared according to the current epsilon definitions',
        'mismatch_points_fraction': 'fraction of points that differ in finite/missing status between the input data sets A and B,' +
                                    ' or are unacceptable when compared according to the current epsilon definitions',
    }

    def __init__(self, diffInfoObject, include_basic_analysis=True):
        """
        build our comparison statistics based on the comparison
        of two data sets

        the include_basic_analysis flag indicates whether the statistics generated by the
        basic_analysis method should also be generated; when it is False none of the
        additional attributes (rms_val, std_val, ...) are created and the dictionary
        form holds only the always-present statistics

        diffInfoObject must provide the valid, outside_epsilon and mismatch masks of
        its diff_data_object plus the data arrays of its a and b data objects; all of
        them must share one shape (checked with asserts)
        """
        self.title = 'Numerical Comparison Statistics'

        # pull out some info we will use later
        valid_in_both = diffInfoObject.diff_data_object.masks.valid_mask
        outside_epsilon_mask = diffInfoObject.diff_data_object.masks.outside_epsilon_mask
        mismatch_mask = diffInfoObject.diff_data_object.masks.mismatch_mask
        aData = diffInfoObject.a_data_object.data
        bData = diffInfoObject.b_data_object.data

        assert (valid_in_both.shape == outside_epsilon_mask.shape)
        assert (outside_epsilon_mask.shape == mismatch_mask.shape)
        assert (mismatch_mask.shape == aData.shape)
        assert (aData.shape == bData.shape)

        # fill in some simple statistics
        self.diff_outside_epsilon_count = np.sum(outside_epsilon_mask)
        self.perfect_match_count = NumericalComparisonStatistics._get_num_perfect(aData, bData,
                                                                                  goodMask=valid_in_both)
        self.correlation = delta.compute_correlation(aData, bData, valid_in_both)
        self.r_squared_correlation = self.correlation * self.correlation
        self.mismatch_points_count = np.sum(mismatch_mask)

        # we actually want the total number of _finite_ values rather than all the data
        total_num_finite_values = np.sum(valid_in_both)

        # calculate some more complex statistics
        # note: unlike the two fractions below, this one is out of all the data points
        self.mismatch_points_fraction = float(self.mismatch_points_count) / float(aData.size)
        # be careful not to divide by zero if we don't have finite data
        if total_num_finite_values > 0:
            self.diff_outside_epsilon_fraction = float(self.diff_outside_epsilon_count) / float(total_num_finite_values)
            self.perfect_match_fraction = float(self.perfect_match_count) / float(total_num_finite_values)
        else:
            self.diff_outside_epsilon_fraction = 0.0
            self.perfect_match_fraction = 0.0

        # if desired, do the basic analysis
        self.does_include_simple = include_basic_analysis
        # always give dictionary_form something to merge, even when the basic
        # analysis is skipped (otherwise this attribute would never be created)
        self.temp_analysis = {}
        if include_basic_analysis:
            basic_dict = NumericalComparisonStatistics.basic_analysis(diffInfoObject.diff_data_object.data,
                                                                      valid_in_both)
            if len(basic_dict) > 0:
                self.rms_val = basic_dict['rms_val']
                self.std_val = basic_dict['std_val']
                self.mean_diff = basic_dict['mean_diff']
                self.median_diff = basic_dict['median_diff']
                self.max_diff = basic_dict['max_diff']
                self.mean_delta = basic_dict['mean_delta']
                self.median_delta = basic_dict['median_delta']
                self.max_delta = basic_dict['max_delta']
                self.min_delta = basic_dict['min_delta']
            else:
                # no valid data at all: the attributes exist but hold NaN
                # (the dictionary form simply omits these entries in that case)
                self.rms_val = np.nan
                self.std_val = np.nan
                self.mean_diff = np.nan
                self.median_diff = np.nan
                self.max_diff = np.nan
                self.mean_delta = np.nan
                self.median_delta = np.nan
                self.max_delta = np.nan
                self.min_delta = np.nan
            self.temp_analysis = basic_dict

    def dictionary_form(self):
        """
        get a dictionary form of the statistics

        the result always holds the correlation, epsilon, perfect match and
        mismatch statistics; whatever basic_analysis produced during
        construction (possibly nothing) is merged in on top
        """
        toReturn = {
            'correlation': self.correlation,
            'r-squared correlation': self.r_squared_correlation,
            'diff_outside_epsilon_count': self.diff_outside_epsilon_count,
            'diff_outside_epsilon_fraction': self.diff_outside_epsilon_fraction,
            'perfect_match_count': self.perfect_match_count,
            'perfect_match_fraction': self.perfect_match_fraction,
            'mismatch_points_count': self.mismatch_points_count,
            'mismatch_points_fraction': self.mismatch_points_fraction
        }
        toReturn.update(self.temp_analysis)
        return toReturn

    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        note: this returns the class level dictionary itself, not a copy
        """
        return NumericalComparisonStatistics._doc_strings

    @staticmethod
    def basic_analysis(diffData, valid_mask):
        """
        do some very minimal analysis of the differences

        diffData is the difference data and valid_mask selects the points
        of it to analyze; returns a dictionary with the keys rms_val,
        std_val, mean_diff, median_diff, max_diff, mean_delta, median_delta,
        max_delta and min_delta, or an empty dictionary when valid_mask
        selects no points
        """
        # if all the data is invalid,
        # we can't do any of these forms of statistical analysis
        if np.sum(valid_mask) <= 0:
            return { }

        # calculate our statistics
        root_mean_square_value = delta.calculate_root_mean_square(diffData, valid_mask)
        tempDiffData = diffData[valid_mask]
        absDiffData = np.abs(tempDiffData)
        return { 'rms_val': root_mean_square_value,
                 'std_val': np.std(tempDiffData),
                 'mean_diff': np.mean(absDiffData),
                 'median_diff': np.median(absDiffData),
                 'max_diff': np.max(absDiffData),
                 'mean_delta': np.mean(tempDiffData),
                 'median_delta': np.median(tempDiffData),
                 'max_delta': np.max(tempDiffData),
                 'min_delta': np.min(tempDiffData)
                 }

    @staticmethod
    def _get_num_perfect(aData, bData, goodMask=None):
        """
        get the number of data points where
        the value in A perfectly matches the value in B

        when goodMask is given only the points it selects are compared,
        otherwise every point is compared
        """
        if goodMask is not None:
            return np.sum(aData[goodMask] == bData[goodMask])
        return np.sum(aData == bData)
class StatisticalAnalysis(StatisticalData):
    """
    This class represents a complete statistical analysis of two data sets.

    It includes the following sets of statistics:

        general      - a GeneralStatistics object
        comparison   - a NumericalComparisonStatistics object
        notANumber   - a NotANumberStatistics object
        missingValue - a MissingValueStatistics object
        finiteData   - a FiniteDataStatistics object

    It can also provide a dictionary form of the statistics or the
    documentation of the statistics.

    Instances are meant to be built with the withSimpleData or
    withDataObjects class methods; the bare constructor only sets the
    title and creates none of the statistics sets.
    """

    def __init__(self):
        """
        this is a blank constructor to support our new class method creation pattern
        """
        self.title = "Statistical Summary"

    @classmethod
    def withSimpleData(in_class,
                       a_data, b_data,
                       a_missing_value=None, b_missing_value=None,
                       a_ignore_mask=None, b_ignore_mask=None,
                       epsilon=0., epsilon_percent=None):
        """
        do a full statistical analysis of the data, after building the data objects

        the raw data, fill values and ignore masks are wrapped in
        dataobj.DataObject instances, which are then compared with a
        dataobj.DiffInfoObject using the given epsilon settings

        returns the new, fully populated analysis object
        """
        new_object = in_class()

        aDataObject = dataobj.DataObject(a_data, fillValue=a_missing_value, ignoreMask=a_ignore_mask)
        bDataObject = dataobj.DataObject(b_data, fillValue=b_missing_value, ignoreMask=b_ignore_mask)

        diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject,
                                          epsilonValue=epsilon, epsilonPercent=epsilon_percent)

        new_object._create_stats(diffInfo)

        return new_object

    @classmethod
    def withDataObjects(in_class,
                        a_data_object, b_data_object,
                        epsilon=0., epsilon_percent=None):
        """
        do a full statistical analysis of the data, using the given data objects

        returns the new, fully populated analysis object
        """
        new_object = in_class()

        diffInfo = dataobj.DiffInfoObject(a_data_object, b_data_object,
                                          epsilonValue=epsilon, epsilonPercent=epsilon_percent)

        new_object._create_stats(diffInfo)

        return new_object

    def _create_stats(self, diffInfoObject):
        """
        build and set all of the statistics sets
        """
        self.general = GeneralStatistics(diffInfoObject)
        self.comparison = NumericalComparisonStatistics(diffInfoObject)
        self.notANumber = NotANumberStatistics(diffInfoObject)
        self.missingValue = MissingValueStatistics(diffInfoObject)
        self.finiteData = FiniteDataStatistics(diffInfoObject)

    def dictionary_form(self):
        """
        get a dictionary form of the statistics

        the result maps the title of each statistics set to that
        set's own dictionary form
        """
        toReturn = { }

        # build a dictionary of all our statistics
        toReturn[self.general.title] = self.general.dictionary_form()
        toReturn[self.comparison.title] = self.comparison.dictionary_form()
        toReturn[self.notANumber.title] = self.notANumber.dictionary_form()
        toReturn[self.missingValue.title] = self.missingValue.dictionary_form()
        toReturn[self.finiteData.title] = self.finiteData.dictionary_form()

        return toReturn

    # An instance method with this same name used to precede the static method
    # below. The later definition always replaced it in the class namespace, so
    # it could never run; it has been removed. Being a staticmethod, doc_strings
    # can still be called on the class or on an instance.

    # TODO, use this method instead of the dictionary at the bottom of this module
    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        unlike dictionary_form the result is flat: the documentation
        dictionaries of all the statistics sets are merged into one new dictionary
        """
        toReturn = { }
        toReturn.update(GeneralStatistics.doc_strings())
        toReturn.update(NumericalComparisonStatistics.doc_strings())
        toReturn.update(NotANumberStatistics.doc_strings())
        toReturn.update(MissingValueStatistics.doc_strings())
        toReturn.update(FiniteDataStatistics.doc_strings())

        return toReturn
class StatisticalInspectionAnalysis(StatisticalData):
    """
    This class represents a complete statistical analysis of a data set.

    It includes the following sets of statistics:

        general      - a GeneralInspectionStatistics object
        notANumber   - a NotANumberInspectionStatistics object
        missingValue - a MissingValueInspectionStatistics object
        finiteData   - a FiniteDataInspectionStatistics object

    It can also provide a dictionary form of the statistics or the
    documentation of the statistics.

    Instances are meant to be built with the withSimpleData or
    withDataObjects class methods; the bare constructor only sets the
    title and creates none of the statistics sets.
    """

    def __init__(self):
        """
        this is a blank constructor to support our new class method creation pattern
        """
        self.title = "Statistical Summary"

    @classmethod
    def withSimpleData(in_class,
                       dataSet,
                       missingValue=None,
                       ignoreMask=None):
        """
        do a full statistical analysis of the data, after building the data object

        the raw data, fill value and ignore mask are wrapped in a
        dataobj.DataObject, whose self_analysis method is called before
        the statistics are built

        returns the new, fully populated analysis object
        """
        new_object = in_class()

        dataObject = dataobj.DataObject(dataSet, fillValue=missingValue, ignoreMask=ignoreMask)
        dataObject.self_analysis()

        new_object._create_stats(dataObject)

        return new_object

    @classmethod
    def withDataObjects(in_class,
                        dataObject):
        """
        do a full statistical analysis of the data, using the given data object

        note: self_analysis is called on the given data object before
        the statistics are built

        returns the new, fully populated analysis object
        """
        new_object = in_class()

        dataObject.self_analysis()
        new_object._create_stats(dataObject)

        return new_object

    def _create_stats(self, dataObject):
        """
        build and set all of the statistics sets
        """
        self.general = GeneralInspectionStatistics(dataObject)
        self.notANumber = NotANumberInspectionStatistics(dataObject)
        self.missingValue = MissingValueInspectionStatistics(dataObject)
        self.finiteData = FiniteDataInspectionStatistics(dataObject)

    def dictionary_form(self):
        """
        get a dictionary form of the statistics

        the result maps the title of each statistics set to that
        set's own dictionary form
        """
        toReturn = { }

        # build a dictionary of all our statistics
        toReturn[self.general.title] = self.general.dictionary_form()
        toReturn[self.notANumber.title] = self.notANumber.dictionary_form()
        toReturn[self.missingValue.title] = self.missingValue.dictionary_form()
        toReturn[self.finiteData.title] = self.finiteData.dictionary_form()

        return toReturn

    # An instance method with this same name used to precede the static method
    # below (and pointed at StatisticalAnalysis rather than this class). The
    # later definition always replaced it in the class namespace, so it could
    # never run; it has been removed. Being a staticmethod, doc_strings can
    # still be called on the class or on an instance.

    # TODO, use this method instead of the dictionary at the bottom of this module
    @staticmethod
    def doc_strings():
        """
        get documentation strings that match the
        dictionary form of the statistics

        unlike dictionary_form the result is flat: the documentation
        dictionaries of all the statistics sets are merged into one new dictionary
        """
        toReturn = { }
        toReturn.update(GeneralInspectionStatistics.doc_strings())
        toReturn.update(NotANumberInspectionStatistics.doc_strings())
        toReturn.update(MissingValueInspectionStatistics.doc_strings())
        toReturn.update(FiniteDataInspectionStatistics.doc_strings())

        return toReturn
# -------------------------- documentation -----------------------------

# TODO, can this be moved?
# One entry per comparison statistic, ordered by statistic name; each entry is
# the name followed by a colon, then its description on the following line.
STATISTICS_DOC_STR = '\n'.join(
    '%s:\n %s' % (stat_name, stat_description)
    for stat_name, stat_description in sorted(StatisticalAnalysis.doc_strings().items())
) + '\n'

# when run as a script, execute any doctests found in this module
if __name__ == '__main__':
    import doctest
    doctest.testmod()