diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index 9d56fbb9c7dd8b04bae5333416355e68dfd0de77..a1c05a951adbea70e161d22f01cd51b6ec9fab5d 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -463,6 +463,12 @@ def inspect_library_call (a_path, var_list=[ ], # get the various names technical_name, _, explanationName = _get_name_info_for_variable(displayName, varRunInfo) + # make sure that it's possible to load this variable + if not(aFile.file_object.is_loadable_type(technical_name)) : + LOG.warn(displayName + " is of a type that cannot be loaded using current file handling libraries included with Glance." + + " Skipping " + displayName + ".") + continue + LOG.info('analyzing: ' + explanationName) # load the variable data @@ -741,6 +747,12 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], technical_name, b_variable_technical_name, \ explanationName = _get_name_info_for_variable(displayName, varRunInfo) + # make sure that it's possible to load this variable + if not(aFile.file_object.is_loadable_type(technical_name)) or not(bFile.file_object.is_loadable_type(b_variable_technical_name)) : + LOG.warn(displayName + " is of a type that cannot be loaded using current file handling libraries included with Glance." + + " Skipping " + displayName + ".") + continue + LOG.info('analyzing: ' + explanationName) # load the variable data @@ -1013,6 +1025,13 @@ def stats_library_call(afn, bfn, var_list=[ ], doc_atend = do_document and len(names)!=1 for name, epsilon, missing in names: + + # make sure that it's possible to load this variable + if not(aFile.is_loadable_type(name)) or not(bFile.is_loadable_type(name)) : + LOG.warn(name + " is of a type that cannot be loaded using current file handling libraries included with Glance." 
+ + " Skipping " + name + ".") + continue + aData = aFile[name] bData = bFile[name] if missing is None: @@ -1085,6 +1104,13 @@ def inspect_stats_library_call (afn, var_list=[ ], options_set={ }, do_document= doc_atend = do_document and len(names)!=1 for name, epsilon, missing in names: + + # make sure that it's possible to load this variable + if not(aFile.is_loadable_type(name)) : + LOG.warn(name + " is of a type that cannot be loaded using current file handling libraries included with Glance." + + " Skipping " + name + ".") + continue + aData = aFile[name] amiss = missing diff --git a/pyglance/glance/gui_controller.py b/pyglance/glance/gui_controller.py index d7bda0db90b6334264a3c9bd4a0ce6ce5f0e7e7d..d030bd4f3ec330e77c45fd5ce39703c46b2fa12a 100644 --- a/pyglance/glance/gui_controller.py +++ b/pyglance/glance/gui_controller.py @@ -79,7 +79,7 @@ class GlanceGUIController (object) : try : self.model.loadNewFile(file_prefix, new_file_path) - except gui_model.UnableToReadFile, utrf : + except (gui_model.UnableToReadFile, ValueError) as utrf : self.handleWarning(str(utrf)) def userSelectedVariable (self, file_prefix, newSelection) : @@ -87,7 +87,10 @@ class GlanceGUIController (object) : the user selected a new variable """ - self.model.updateFileDataSelection(file_prefix, newVariableText=newSelection) + try : + self.model.updateFileDataSelection(file_prefix, newVariableText=newSelection) + except ValueError as ve : + self.handleWarning(str(ve)) def userChangedOverload (self, file_prefix, new_override_value) : """ @@ -129,14 +132,20 @@ class GlanceGUIController (object) : the user selected a new longitude variable """ - self.model.updateLonLatSelections(file_prefix, new_longitude_name=newSelection) + try: + self.model.updateLonLatSelections(file_prefix, new_longitude_name=newSelection) + except ValueError as ve : + self.handleWarning(str(ve)) def userSelectedLatitude (self, file_prefix, newSelection) : """ the user selected a new latitude variable """ - 
self.model.updateLonLatSelections(file_prefix, new_latitude_name=newSelection) + try : + self.model.updateLonLatSelections(file_prefix, new_latitude_name=newSelection) + except ValueError as ve : + self.handleWarning(str(ve)) def userSelectedImageType (self, new_image_type) : """ @@ -201,7 +210,7 @@ class GlanceGUIController (object) : try : self.stats.sendStatsInfo() - except IncompatableDataObjects, ido : + except IncompatableDataObjects as ido : self.handleWarning(str(ido)) def userRequestsPlot (self) : @@ -211,7 +220,7 @@ class GlanceGUIController (object) : try : self.figs.spawnPlot() - except (IncompatableDataObjects, ValueError), idove : + except (IncompatableDataObjects, ValueError) as idove : self.handleWarning(str(idove)) #raise diff --git a/pyglance/glance/gui_model.py b/pyglance/glance/gui_model.py index 6877b9f594495d86969eb7fbea98e5f467b997c2..e6c7ef58f0a686d8e293f488fbd4d75e00d4159c 100644 --- a/pyglance/glance/gui_model.py +++ b/pyglance/glance/gui_model.py @@ -186,6 +186,7 @@ class GlanceGUIModel (object) : tempIndex = tempIndex + 1 tempVariable = variableList[tempIndex] + LOG.debug ("selected variable: " + str(tempVariable)) # save all of the data related to this file for later use diff --git a/pyglance/glance/io.py b/pyglance/glance/io.py index 55873ddf59245d1bc654dfdbced6471a61f98e3a..729d261710ad177302eb8d260bcc98cf3aca6f3f 100644 --- a/pyglance/glance/io.py +++ b/pyglance/glance/io.py @@ -177,6 +177,14 @@ class CaseInsensitiveAttributeCache (object) : toReturn = self.globalAttributesLower return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any bad types for these files? 
+ return True class hdf (object): """wrapper for HDF4 dataset for comparison @@ -352,6 +360,14 @@ class hdf (object): toReturn = self._hdf.attributes()[attributeName] return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any bad types for these files? + return True class nc (object): """wrapper for NetCDF3/4/opendap dataset for comparison @@ -394,6 +410,23 @@ class nc (object): # get the variable object and use it to # get our raw data and scaling info variable_object = self.get_variable_object(name) + + # do a check to see if this is a multi-dimensional character array + # (right now pycdf can't handle those correctly) + if (variable_object.inq_type() is NC.CHAR) and (len(variable_object.shape()) > 1) : + raise ValueError(name + " is a multidimensional character array, which is not currently supported.") + + #print str("** inq: " + str(variable_object.inq_type())) + #print str("types reference: ") + #print str("NC.BYTE: " + str(NC.BYTE)) + #print str("NC.CHAR: " + str(NC.CHAR)) + #print str("NC.SHORT: " + str(NC.SHORT)) + #print str("NC.INT: " + str(NC.INT)) + #print str("NC.FLOAT: " + str(NC.FLOAT)) + #print str("NC.DOUBLE: " + str(NC.DOUBLE)) + + #print str("shape: " + str(variable_object.shape())) + raw_data_copy = variable_object[:] # load the scale factor and add offset @@ -595,6 +628,14 @@ class nc (object): toReturn = self._nc.attributes()[attributeName] return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + variable_object = self.get_variable_object(name) + return (variable_object.inq_type() is not NC.CHAR) nc4 = nc cdf = nc @@ -787,6 +828,14 @@ class h5(object): toReturn = self._h5.attrs[attributeName] return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any 
bad types for these files? + return True class aeri(object): @@ -919,6 +968,14 @@ class aeri(object): LOG.warn('Glance does not yet support attribute retrieval in AERI files. None will be used.') return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any bad types for these files? + return True # handle the variety of file suffixes by building aliases to aeri class cxs = rnc = cxv = csv = spc = sum = uvs = aeri @@ -1088,6 +1145,14 @@ class tiff (object): # FUTURE, GeoTIFF files do have attributes, but this isn't hooked up yet return None + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any bad types for these files? + return True # people also name tiff files with one f... tif = tiff @@ -1214,6 +1279,14 @@ class jpss_adl(object): LOG.warn('Glance does not yet support attribute retrieval in JPSS ADL files. None will be used.') return toReturn + + def is_loadable_type (self, name) : + """ + check to see if the indicated variable is a type that can be loaded + """ + + # TODO, are there any bad types for these files? + return True diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py index b30bf47e8ee4f784f67997cb7ced5646dee73245..807556f179bff04df824bc0f8ec057cc432f2b5d 100644 --- a/pyglance/glance/stats.py +++ b/pyglance/glance/stats.py @@ -18,8 +18,8 @@ import numpy as np class StatisticalData (object) : """ This class represents a set of statistical data generated from - the examination of two data sets. This data set is relatively - abstract. + the examination of data sets. What form of data is accepted for + analysis is relatively abstract. 
All Statistics Data objects should have a title and be able to provide a dictionary of their statistics (see dictionary_form function) and @@ -44,7 +44,7 @@ class StatisticalData (object) : return { } @staticmethod - def doc_strings( ) : + def doc_strings(inspect=False) : """ get documentation strings that match the dictionary form of the statistics this class @@ -53,23 +53,50 @@ class StatisticalData (object) : note: child classes should override this method """ return { } + + def make_prefix_and_suffix (self, descriptionText) : + """ + given text describing a statistic (or none) + return an appropriate prefix and suffix + """ + + prefix = "" if descriptionText is None else str(descriptionText) + '_' + suffix = "" if descriptionText is None else '_' + str(descriptionText) + + return prefix, suffix class MissingValueStatistics (StatisticalData) : """ A class representing information about where fill values are found - in a pair of data sets. + in data. It can analyze either a pair of data sets encapsulated in a + glance.data.DiffInfoObject or a single data set in a glance.data.DataObject. 
- includes the following statistics: + if a DiffInfoObject is given it will produce the following statistics: + + common_missing_count - count of points that are missing in both data sets + common_missing_fraction - fraction of points that are missing in both data sets + + it will also include the following intermediary objects with stats about the + individual data sets in the DiffInfoObject: + + a_missing_stats - a MissingValueStatistics object specific to the a data set + b_missing_stats - a MissingValueStatistics object specific to the b data set + + when turned into a dictionary these become: a_missing_count - count of points that are missing in the a data set a_missing_fraction - fraction of points that are missing in the a data set b_missing_count - count of points that are missing in the b data set b_missing_fraction - fraction of points that are missing in the b data set - common_missing_count - count of points that are missing in both data sets - common_missing_fraction - fraction of points that are missing in both data sets + + if it is only given a DataObject it will produce the following : + + <data set descrption>missing_count - count of points that are missing in the data set + <data set descrption>missing_fraction - fraction of points that are missing in the data set """ - _doc_strings = { + _doc_strings = \ + { 'a_missing_count': "number of values flagged missing in A", 'a_missing_fraction': "fraction of values flagged missing in A", 'b_missing_count': "number of values flagged missing in B", @@ -78,122 +105,122 @@ class MissingValueStatistics (StatisticalData) : 'common_missing_fraction': "fraction of missing values in common between A and B" } - def __init__(self, diffInfoObject) : - """ - build our fill value related statistics based on the comparison - of two data sets - """ - self.title = 'Missing Value Statistics' - - # pull out some masks for later use - a_missing_mask = diffInfoObject.a_data_object.masks.missing_mask - b_missing_mask = 
diffInfoObject.b_data_object.masks.missing_mask - - assert(a_missing_mask.shape == b_missing_mask.shape) - - # figure out some basic statistics - self.a_missing_count = np.sum(a_missing_mask) - self.b_missing_count = np.sum(b_missing_mask) - self.common_missing_count = np.sum(a_missing_mask & b_missing_mask) - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_missing_mask.size - - # figure out some fraction statistics - self.a_missing_fraction = float(self.a_missing_count) / float(total_num_values) - self.b_missing_fraction = float(self.b_missing_count) / float(total_num_values) - self.common_missing_fraction = float(self.common_missing_count) / float(total_num_values) - - def dictionary_form(self) : - """ - get a dictionary form of the statistics - """ - - toReturn = { - 'a_missing_count': self.a_missing_count, - 'a_missing_fraction': self.a_missing_fraction, - 'b_missing_count': self.b_missing_count, - 'b_missing_fraction': self.b_missing_fraction, - 'common_missing_count': self.common_missing_count, - 'common_missing_fraction': self.common_missing_fraction - } - - return toReturn - - @staticmethod - def doc_strings( ) : - """ - get documentation strings that match the - dictionary form of the statistics - """ - - return MissingValueStatistics._doc_strings - -class MissingValueInspectionStatistics (StatisticalData) : - """ - A class representing information about where fill values are found - in a data. 
- - includes the following statistics: - - missing_count - count of points that are missing in the a data set - missing_fraction - fraction of points that are missing in the a data set - """ - - _doc_strings = { + _doc_strings_inspection = \ + { 'missing_count': "number of values flagged missing", 'missing_fraction': "fraction of values flagged missing", } - def __init__(self, dataObject) : - """ - build our fill value related statistics based on the data set - """ - self.title = 'Missing Value Statistics' - - # pull out a mask for later use - missing_mask = dataObject.masks.missing_mask - - # figure out some basic statistics - self.missing_count = np.sum(missing_mask) - self.missing_fraction = float(self.missing_count) / float(missing_mask.size) + def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) : + """ + build our fill value related statistics + + diffInfoObject is assumed to be a glance.data.DiffInfoObject + dataObject is assumed to be a glance.data.DataObject + + Either the diffInfoObject or the dataObject must be passed in. If the + diffInfoObject is passed the dataObject will be ignored and the + a_data_object and b_data_object associated with the diffInfoObject + will be analyzed. + + If only dataObject is analysed dataSetDescription will be used in labeling + the resulting dictionary form statistics. 
+ """ + self.title = 'Missing Value Statistics' + self.is_one_data_set = False + + # if we don't have comparison information and we do have a single data set + if (diffInfoObject is None) and (dataObject is not None) : + + # we have one data set and should save the prefix information + self.is_one_data_set = True + self.desc_text = dataSetDescription + + # figure out some basic statistics + self.missing_count = np.sum(dataObject.masks.missing_mask) + self.missing_fraction = float(self.missing_count) / float(dataObject.data.size) + + # if we have a comparison object analyze the data associated with that comparison + elif diffInfoObject is not None : + + # analyze each of the original data sets that are being compared + self.a_missing_stats = MissingValueStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a") + self.b_missing_stats = MissingValueStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b") + + # common statistics + self.common_missing_count = np.sum(diffInfoObject.a_data_object.masks.missing_mask & diffInfoObject.b_data_object.masks.missing_mask) + self.common_missing_fraction = float(self.common_missing_count) / float(diffInfoObject.a_data_object.data.size) + + else : + raise ValueError ("No data set was given when requesting statistical analysis of missing values.") def dictionary_form(self) : """ get a dictionary form of the statistics """ - toReturn = { - 'missing_count': self.missing_count, - 'missing_fraction': self.missing_fraction, - } + toReturn = { } + + # if we only have stats for one data set + if self.is_one_data_set : + temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text) + toReturn = { + temp_prefix + 'missing_count': self.missing_count, + temp_prefix + 'missing_fraction': self.missing_fraction, + } + + # otherwise we must have stats for a comparison + else : + toReturn = { + 'common_missing_count': self.common_missing_count, + 'common_missing_fraction': self.common_missing_fraction, + } + 
a_dict = self.a_missing_stats.dictionary_form() + toReturn.update(a_dict) + b_dict = self.b_missing_stats.dictionary_form() + toReturn.update(b_dict) return toReturn @staticmethod - def doc_strings( ) : + def doc_strings(inspect=False) : """ get documentation strings that match the dictionary form of the statistics """ - return MissingValueInspectionStatistics._doc_strings + return MissingValueStatistics._doc_strings if not inspect else MissingValueStatistics._doc_strings_inspection class FiniteDataStatistics (StatisticalData) : """ A class representing information about where finite values are found - in a pair of data sets. + in data. It can analyze either a pair of data sets encapsulated in a + glance.data.DiffInfoObject or a single data set in a glance.data.DataObject. - includes the following statistics: + when a single data set is analyzed the following stats are produced: + + <data prefix>finite_count - the number of finite data values in the data set + <data prefix>finite_fraction - the fraction of finite data values in the data set + + if a DiffInfoObject is given for analysis the following statistics are produced: - a_finite_count - the number of finite data values in the a data set - a_finite_fraction - the fraction of finite data values in the a data set - b_finite_count - the number of finite data values in the b data set - b_finite_fraction - the fraction of finite data values in the b data set common_finite_count - the number of finite values the two data sets have in common common_finite_fraction - the fraction of finite values the two data sets have in common finite_in_only_one_count - the number of points that are finite in only one of the two sets finite_in_only_one_fraction - the fraction of points that are finite in only one of the two sets + + it will also include the following intermediary objects with stats about the + individual data sets in the DiffInfoObject: + + a_finite_stats - a FiniteDataStatistics object with further stats on the a 
data set + b_finite_stats - a FiniteDataStatistics object with further stats on the b data set + + and the dictionary form will includes the following statistics: + + a_finite_count - the number of finite data values in the a data set + a_finite_fraction - the fraction of finite data values in the a data set + b_finite_count - the number of finite data values in the b data set + b_finite_fraction - the fraction of finite data values in the b data set """ _doc_strings = { @@ -209,127 +236,125 @@ class FiniteDataStatistics (StatisticalData) : "only the common spatially valid area is considerd for this statistic" } - def __init__(self, diffInfoObject) : - """ - build our finite data related statistics based on the comparison - of two data sets - """ - self.title = 'Finite Data Statistics' - - # pull out some data we will use later - a_is_finite_mask = diffInfoObject.a_data_object.masks.valid_mask - b_is_finite_mask = diffInfoObject.b_data_object.masks.valid_mask - common_ignore_mask = diffInfoObject.diff_data_object.masks.ignore_mask - - assert(a_is_finite_mask.shape == b_is_finite_mask.shape) - assert(b_is_finite_mask.shape == common_ignore_mask.shape) - - # figure out some basic statistics - self.a_finite_count = np.sum(a_is_finite_mask) - self.b_finite_count = np.sum(b_is_finite_mask) - self.common_finite_count = np.sum(a_is_finite_mask & b_is_finite_mask) - # use an exclusive or to check which points are finite in only one of the two data sets - self.finite_in_only_one_count = np.sum((a_is_finite_mask ^ b_is_finite_mask) & ~common_ignore_mask) - - # make the assumption that a and b are the same size and only use the size of a's mask - total_num_values = a_is_finite_mask.size - - # calculate some fractional statistics - self.a_finite_fraction = float(self.a_finite_count) / float(total_num_values) - self.b_finite_fraction = float(self.b_finite_count) / float(total_num_values) - self.common_finite_fraction = float(self.common_finite_count) / float(total_num_values) - 
self.finite_in_only_one_fraction = float(self.finite_in_only_one_count) / float(total_num_values) + _doc_strings_inspection = \ + { + 'finite_count': "number of finite values", + 'finite_fraction': "fraction of finite values (out of all data points in set)", + } + + def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) : + """ + build our finite data related statistics + + diffInfoObject is assumed to be a glance.data.DiffInfoObject + dataObject is assumed to be a glance.data.DataObject + + Either the diffInfoObject or the dataObject must be passed in. If the + diffInfoObject is passed the dataObject will be ignored and the + a_data_object and b_data_object associated with the diffInfoObject + will be analyzed. + + If only dataObject is analysed dataSetDescription will be used in labeling + the resulting dictionary form statistics. + """ + self.title = 'Finite Data Statistics' + self.is_one_data_set = False + + # if we don't have comparison information and we do have a single data set + if (diffInfoObject is None) and (dataObject is not None) : + + # we have one data set and should save the prefix information + self.is_one_data_set = True + self.desc_text = dataSetDescription + + # figure out some basic statistics + self.finite_count = np.sum(dataObject.masks.valid_mask) + self.finite_fraction = float(self.finite_count) / float(dataObject.data.size) + + # if we have a comparison object analyze the data associated with that comparison + elif diffInfoObject is not None : + + # analyze each of the original data sets that are being compared + self.a_finite_stats = FiniteDataStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a") + self.b_finite_stats = FiniteDataStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b") + + # calculate some common statistics + self.common_finite_count = np.sum(diffInfoObject.a_data_object.masks.valid_mask & diffInfoObject.b_data_object.masks.valid_mask) + # use an 
exclusive or to check which points are finite in only one of the two data sets + self.finite_in_only_one_count = np.sum((diffInfoObject.a_data_object.masks.valid_mask ^ diffInfoObject.b_data_object.masks.valid_mask) \ + & ~diffInfoObject.diff_data_object.masks.ignore_mask) + self.common_finite_fraction = float(self.common_finite_count) / float(diffInfoObject.a_data_object.data.size) + self.finite_in_only_one_fraction = float(self.finite_in_only_one_count) / float(diffInfoObject.a_data_object.data.size) + + else: + raise ValueError ("No data set was given when requesting statistical analysis of finite values.") def dictionary_form(self) : """ get a dictionary form of the statistics """ - toReturn = { - 'a_finite_count': self.a_finite_count, - 'a_finite_fraction': self.a_finite_fraction, - 'b_finite_count': self.b_finite_count, - 'b_finite_fraction': self.b_finite_fraction, - 'common_finite_count': self.common_finite_count, - 'common_finite_fraction': self.common_finite_fraction, - 'finite_in_only_one_count': self.finite_in_only_one_count, - 'finite_in_only_one_fraction': self.finite_in_only_one_fraction, - } + toReturn = { } + + # if we only have stats for one data set + if self.is_one_data_set : + temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text) + toReturn = { + temp_prefix + 'finite_count': self.finite_count, + temp_prefix + 'finite_fraction': self.finite_fraction, + } + + # otherwise we must have stats for a comparison + else : + toReturn = { + 'common_finite_count': self.common_finite_count, + 'common_finite_fraction': self.common_finite_fraction, + 'finite_in_only_one_count': self.finite_in_only_one_count, + 'finite_in_only_one_fraction': self.finite_in_only_one_fraction, + } + a_dict = self.a_finite_stats.dictionary_form() + toReturn.update(a_dict) + b_dict = self.b_finite_stats.dictionary_form() + toReturn.update(b_dict) return toReturn @staticmethod - def doc_strings( ) : + def doc_strings(inspect=False) : """ get documentation strings that match 
the dictionary form of the statistics """ - return FiniteDataStatistics._doc_strings + return FiniteDataStatistics._doc_strings if not inspect else FiniteDataStatistics._doc_strings_inspection -class FiniteDataInspectionStatistics (StatisticalData) : +class NotANumberStatistics (StatisticalData) : """ - A class representing information about where finite values are found - in a data set. + A class representing information about where non-numerical values are found + in data. It can analyze either a pair of data sets encapsulated in a + glance.data.DiffInfoObject or a single data set in a glance.data.DataObject. - includes the following statistics: + when a single data set is analyzed the following stats are produced: - finite_count - the number of finite data values in the data set - finite_fraction - the fraction of finite data values in the data set - """ + nan_count - the number of non finite values that are present in the data set + nan_fraction - the fraction of non finite values that are present in the data set - _doc_strings = { - 'finite_count': "number of finite values", - 'finite_fraction': "fraction of finite values (out of all data points in set)", - } + if a DiffInfoObject is given for analysis the following statistics are produced: - def __init__(self, dataObject) : - """ - build our finite data related statistics based on the data set - """ - self.title = 'Finite Data Statistics' - - # pull out some data we will use later - is_finite_mask = dataObject.masks.valid_mask - - # figure out some basic statistics - self.finite_count = np.sum(is_finite_mask) - self.finite_fraction = float(self.finite_count) / float(is_finite_mask.size) + common_nan_count - the number of non finite values that are shared between the data sets + common_nan_fraction - the fraction of non finite values that are shared between the data sets - def dictionary_form(self) : - """ - get a dictionary form of the statistics - """ - - toReturn = { - 'finite_count': self.finite_count, - 
'finite_fraction': self.finite_fraction, - } - - return toReturn + if a DiffInfoObject is given the object will also have: - @staticmethod - def doc_strings( ) : - """ - get documentation strings that match the - dictionary form of the statistics - """ - - return FiniteDataInspectionStatistics._doc_strings - -class NotANumberStatistics (StatisticalData) : - """ - A class representing information about where non-finite values are found - in a pair of data sets. + a_finite_stats - a NotANumberStatistics object with further stats on the a data set + b_finite_stats - a NotANumberStatistics object with further stats on the b data set - includes the following statistics: + and the dictionary form will includes the following statistics: a_nan_count - the number of non finite values that are present in the a data set a_nan_fraction - the fraction of non finite values that are present in the a data set b_nan_count - the number of non finite values that are present in the b data set b_nan_fraction - the fraction of non finite values that are present in the b data set - common_nan_count - the number of non finite values that are shared between the data sets - common_nan_fraction - the fraction of non finite values that are shared between the data sets """ _doc_strings = { @@ -341,130 +366,138 @@ class NotANumberStatistics (StatisticalData) : 'common_nan_fraction': "fraction of NaNs in common between A and B" } - def __init__(self, diffInfoObject) : - """ - build our nonfinite data related statistics based on the comparison - of two data sets - """ - self.title = 'NaN Statistics' - - # pull out some masks we will use - a_nan_mask = diffInfoObject.a_data_object.masks.non_finite_mask - b_nan_mask = diffInfoObject.b_data_object.masks.non_finite_mask - - assert(a_nan_mask.shape == b_nan_mask.shape) - - # get some basic statistics - self.a_nan_count = np.sum(a_nan_mask) - self.b_nan_count = np.sum(b_nan_mask) - self.common_nan_count = np.sum(a_nan_mask & b_nan_mask) - - # make the 
assumption that a and b are the same size and only use the size of a - total_num_values = a_nan_mask.size - - # calculate some fractional statistics - self.a_nan_fraction = float(self.a_nan_count) / float(total_num_values) - self.b_nan_fraction = float(self.b_nan_count) / float(total_num_values) - self.common_nan_fraction = float(self.common_nan_count) / float(total_num_values) - - def dictionary_form(self) : - """ - get a dictionary form of the statistics - """ - - toReturn = { - 'a_nan_count': self.a_nan_count, - 'a_nan_fraction': self.a_nan_fraction, - 'b_nan_count': self.b_nan_count, - 'b_nan_fraction': self.b_nan_fraction, - 'common_nan_count': self.common_nan_count, - 'common_nan_fraction': self.common_nan_fraction - } - - return toReturn - - @staticmethod - def doc_strings( ) : - """ - get documentation strings that match the - dictionary form of the statistics - """ - - return NotANumberStatistics._doc_strings - -class NotANumberInspectionStatistics (StatisticalData) : - """ - A class representing information about where non-finite values are found - in a data set. 
- - includes the following statistics: - - nan_count - the number of non finite values that are present in the data set - nan_fraction - the fraction of non finite values that are present in the data set - """ - - _doc_strings = { + _doc_strings_inspection = \ + { 'nan_count': "number of NaNs", 'nan_fraction': "fraction of NaNs", } - def __init__(self, dataObject) : - """ - build our nonfinite data related statistics based on the data set - """ - self.title = 'NaN Statistics' - - # pull out a mask we will use - nan_mask = dataObject.masks.non_finite_mask - - # get some basic statistics - self.nan_count = np.sum(nan_mask) - self.nan_fraction = float(self.nan_count) / float(nan_mask.size) + def __init__(self, diffInfoObject=None, dataObject=None, dataSetDescription=None) : + """ + build our nonfinite data related statistics + + diffInfoObject is assumed to be a glance.data.DiffInfoObject + dataObject is assumed to be a glance.data.DataObject + + Either the diffInfoObject or the dataObject must be passed in. If the + diffInfoObject is passed the dataObject will be ignored and the + a_data_object and b_data_object associated with the diffInfoObject + will be analyzed. + + If only dataObject is analysed dataSetDescription will be used in labeling + the resulting dictionary form statistics. 
+ """ + self.title = 'NaN Statistics' + self.is_one_data_set = False + + # if we don't have comparison information and we do have a single data set + if (diffInfoObject is None) and (dataObject is not None) : + + # we have one data set and should save the prefix information + self.is_one_data_set = True + self.desc_text = dataSetDescription + + # get some basic statistics + self.nan_count = np.sum(dataObject.masks.non_finite_mask) + self.nan_fraction = float(self.nan_count) / float(dataObject.data.size) + + # if we have a comparison object analyze the data associated with that comparison + elif diffInfoObject is not None : + + # analyze each of the original data sets that are being compared + self.a_nan_stats = NotANumberStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a") + self.b_nan_stats = NotANumberStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b") + + # calculate some common statistics + self.common_nan_count = np.sum(diffInfoObject.a_data_object.masks.non_finite_mask & diffInfoObject.b_data_object.masks.non_finite_mask) + self.common_nan_fraction = float(self.common_nan_count) / float(diffInfoObject.a_data_object.data.size) + + else: + raise ValueError ("No data set was given when requesting statistical analysis of NaN values.") def dictionary_form(self) : """ get a dictionary form of the statistics """ - toReturn = { - 'nan_count': self.nan_count, - 'nan_fraction': self.nan_fraction, - } + toReturn = { } + + # if we only have stats for one data set + if self.is_one_data_set : + temp_prefix, _ = self.make_prefix_and_suffix(self.desc_text) + toReturn = { + temp_prefix + 'nan_count': self.nan_count, + temp_prefix + 'nan_fraction': self.nan_fraction, + } + + # otherwise we must have stats for a comparison + else : + toReturn = { + 'common_nan_count': self.common_nan_count, + 'common_nan_fraction': self.common_nan_fraction + } + a_dict = self.a_nan_stats.dictionary_form() + toReturn.update(a_dict) + b_dict = 
self.b_nan_stats.dictionary_form() + toReturn.update(b_dict) return toReturn @staticmethod - def doc_strings( ) : + def doc_strings(inspect=False) : """ get documentation strings that match the dictionary form of the statistics """ - return NotANumberInspectionStatistics._doc_strings + return NotANumberStatistics._doc_strings if not inspect else NotANumberStatistics._doc_strings_inspection class GeneralStatistics (StatisticalData) : """ - A class representing general information about a pair of data sets. + A class representing general information about data. It can analyze either a + pair of data sets encapsulated in a glance.data.DiffInfoObject or a single + data set in a glance.data.DataObject. + + if a single DataObject is given the following will be produced: + (some of these are labeled with any dataSetDescription given in the + constructor) + + missing_value - the fill data value + max - the maximum value + min - the minimum value + num_data_points - the total number of data points + shape - the shape of the data + spatially_invalid_pts_ignored - number of points corresponding to invalid lat/lon in the set + (optional if no lon/lat mapped) + mean - the mean of the data values + median - the median of the data values + std_val - the standard deviation of the data values - includes the following statistics: + if a DiffInfoObject is given these comparison stats will be produced: - a_missing_value - the fill data value in the a set - b_missing_value - the fill data value in the b set epsilon - the fixed epsilon value epsilon_percent - the percentage of the a set that will be used for comparison + num_data_points - the number of data points in each of the sets + shape - the shape of each of the data sets + + it will also have the following self owned variables: + + a_gen_stats - a GeneralStatistics object with further stats on the a data set + b_gen_stats - a GeneralStatistics object with further stats on the b data set + + in dictionary form those objects will 
produce: + + a_missing_value - the fill data value in the a set + b_missing_value - the fill data value in the b set max_a - the maximum value in the a set max_b - the maximum value in the b set min_a - the minimum value in the a set min_b - the minimum value in the b set - num_data_points - the total number of data points in each of the sets - shape - the shape of each of the data sets - spatially_invalid_pts_ignored_in_a - number of points corresponding to invalid lat/lon in a set - spatially_invalid_pts_ignored_in_b - number of points corresponding to invalid lat/lon in b set """ _doc_strings = { - 'a_missing_value': 'the value that is considered \"missing\" data when it is found in A', - 'b_missing_value': 'the value that is considered \"missing\" data when it is found in B', + 'a_missing_value': 'the value that is considered \"missing\" or \"fill\" data when it is found in A', + 'b_missing_value': 'the value that is considered \"missing\" or \"fill\" data when it is found in B', 'epsilon': 'amount of difference between matching data points in A and B that is considered acceptable', 'epsilon_percent': 'the percentage of difference (of A\'s value) that is acceptable between A and B (optional)', 'max_a': 'the maximum finite, non-missing value found in A', @@ -473,102 +506,22 @@ class GeneralStatistics (StatisticalData) : 'min_b': 'the minimum finite, non-missing value found in B', 'num_data_points': "number of data values in A", 'shape': "shape of A", - 'spatially_invalid_pts_ignored_in_a': 'number of points with invalid latitude/longitude information in A that were' + + 'spatially_invalid_pts_ignored_a': 'number of points with invalid latitude/longitude information in A that were' + ' ignored for the purposes of data analysis and presentation', - 'spatially_invalid_pts_ignored_in_b': 'number of points with invalid latitude/longitude information in B that were' + + 'spatially_invalid_pts_ignored_b': 'number of points with invalid latitude/longitude information in 
B that were' + ' ignored for the purposes of data analysis and presentation', + # these are new! + 'mean_a': "the mean of all finite, non-missing values found in A", + 'mean_b': "the mean of all finite, non-missing values found in B", + 'median_a': "the median of all finite, non-missing values in A", + 'median_b': "the median of all finite, non-missing values in B", + 'std_val_a': "the standard deviation of all finite, non-missing values in A", + 'std_val_b': "the standard deviation of all finite, non-missing values in B", } - def __init__(self, diffInfoObject) : - """ - build our general statistics based on the comparison - of two data sets - """ - self.title = 'General Statistics' - - # pull out some masks for later use - a_missing_mask = diffInfoObject.a_data_object.masks.missing_mask - b_missing_mask = diffInfoObject.b_data_object.masks.missing_mask - ignore_in_a_mask = diffInfoObject.a_data_object.masks.ignore_mask - ignore_in_b_mask = diffInfoObject.b_data_object.masks.ignore_mask - good_in_a_mask = diffInfoObject.a_data_object.masks.valid_mask - good_in_b_mask = diffInfoObject.b_data_object.masks.valid_mask - - assert(a_missing_mask.shape == b_missing_mask.shape) - assert(b_missing_mask.shape == ignore_in_a_mask.shape) - assert(ignore_in_a_mask.shape == ignore_in_b_mask.shape) - assert(ignore_in_b_mask.shape == good_in_a_mask.shape) - assert(good_in_a_mask.shape == good_in_b_mask.shape) - - # get the number of data points - total_num_values = a_missing_mask.size - - # fill in our statistics - self.a_missing_value = diffInfoObject.a_data_object.select_fill_value() - self.b_missing_value = diffInfoObject.b_data_object.select_fill_value() - self.epsilon = diffInfoObject.epsilon_value - self.epsilon_percent = diffInfoObject.epsilon_percent - self.max_a = delta.max_with_mask(diffInfoObject.a_data_object.data, good_in_a_mask) - self.min_a = delta.min_with_mask(diffInfoObject.a_data_object.data, good_in_a_mask) - self.max_b = 
delta.max_with_mask(diffInfoObject.b_data_object.data, good_in_b_mask) - self.min_b = delta.min_with_mask(diffInfoObject.b_data_object.data, good_in_b_mask) - self.num_data_points = total_num_values - self.shape = a_missing_mask.shape - # also calculate the invalid points - self.spatially_invalid_pts_ignored_in_a = np.sum(ignore_in_a_mask) - self.spatially_invalid_pts_ignored_in_b = np.sum(ignore_in_b_mask) - - def dictionary_form(self) : - """ - get a dictionary form of the statistics - """ - - toReturn = { - 'a_missing_value': self.a_missing_value, - 'b_missing_value': self.b_missing_value, - 'epsilon': self.epsilon, - 'epsilon_percent': self.epsilon_percent, - 'max_a': self.max_a, - 'max_b': self.max_b, - 'min_a': self.min_a, - 'min_b': self.min_b, - 'num_data_points': self.num_data_points, - 'shape': self.shape, - 'spatially_invalid_pts_ignored_in_a': self.spatially_invalid_pts_ignored_in_a, - 'spatially_invalid_pts_ignored_in_b': self.spatially_invalid_pts_ignored_in_b - } - - return toReturn - - @staticmethod - def doc_strings( ) : - """ - get documentation strings that match the - dictionary form of the statistics - """ - - return GeneralStatistics._doc_strings - -class GeneralInspectionStatistics (StatisticalData) : - """ - A class representing general information about a data set. 
- - includes the following statistics: - - missing_value - the fill data value - max - the maximum value - min - the minimum value - num_data_points - the total number of data points - shape - the shape of the data - spatially_invalid_pts_ignored - number of points corresponding to invalid lat/lon in the set - (optional if no /lon lat mapped) - mean - the mean of the data values - median - the median of the data values - std_val - the standard deviation of the data values - """ - - _doc_strings = { - 'missing_value': 'the value that is considered \"missing\" data when it is found in the data', + _doc_strings_inspect = \ + { + 'missing_value': 'the value that is considered \"missing\" or \"fill\" data in this data set', 'max': 'the maximum finite, non-missing value found in the data', 'min': 'the minimum finite, non-missing value found in the data', 'num_data_points': "number of data points (may be valid or invalid data)", @@ -581,66 +534,121 @@ class GeneralInspectionStatistics (StatisticalData) : 'std_val': "the standard deviation of all finite, non-missing values in the data", } - def __init__(self, dataObject) : - """ - build our general statistics based on the data set - """ - self.title = 'General Statistics' - - # pull out some masks for later use - missing_mask = dataObject.masks.missing_mask - ignore_mask = dataObject.masks.ignore_mask - good_mask = dataObject.masks.valid_mask - # grab the valid data for some calculations - tempGoodData = dataObject.data[good_mask] - - #assert(missing_mask.shape == ignore_mask.shape) - #assert(ignore_mask.shape == good_mask.shape ) - - # get the number of data points - total_num_values = missing_mask.size - - # fill in our statistics - self.missing_value = dataObject.select_fill_value() - self.max = np.max(tempGoodData) - self.min = np.min(tempGoodData) - self.mean = np.mean(tempGoodData) - self.median = np.median(tempGoodData) - self.std_val = np.std(tempGoodData) - self.num_data_points = total_num_values - self.shape = 
missing_mask.shape - # also calculate the invalid points - self.spatially_invalid_pts_ignored = np.sum(ignore_mask) - - + def __init__(self, diffInfoObject=None, dataObject=None, + doExtras=False, dataSetDescription=None) : + """ + build our general statistics based on the comparison of two data sets + + diffInfoObject is assumed to be a glance.data.DiffInfoObject + dataObject is assumed to be a glance.data.DataObject + + Either the diffInfoObject or the dataObject must be passed in. If the + diffInfoObject is passed the dataObject will be ignored and the + a_data_object and b_data_object associated with the diffInfoObject + will be analyzed. + + If only dataObject is analyzed dataSetDescription will be + used in labeling the resulting dictionary form statistics. + + If you are passing a single dataObject and would like shape and size + statistics reported as well, pass doExtras as True (otherwise these + stats will be omitted). + """ + self.title = 'General Statistics' + self.is_one_data_set = False + + # if we don't have comparison information and we do have a single data set + if (diffInfoObject is None) and (dataObject is not None) : + + # we have one data set and should save the prefix/suffix information + self.is_one_data_set = True + self.do_extras = doExtras + self.desc_text = dataSetDescription + + # grab the valid data for some calculations + tempGoodData = dataObject.data[dataObject.masks.valid_mask] + + # fill in our statistics + self.missing_value = dataObject.select_fill_value() + self.max = np.max(tempGoodData) + self.min = np.min(tempGoodData) + self.mean = np.mean(tempGoodData) + self.median = np.median(tempGoodData) + self.std_val = np.std(tempGoodData) + # also calculate the invalid points + self.spatially_invalid_pts_ignored = np.sum(dataObject.masks.ignore_mask) + + # if we should also do extra stats, do so + if (doExtras) : + self.num_data_points = dataObject.masks.missing_mask.size + self.shape = dataObject.masks.missing_mask.shape + + # if 
we have a comparison object analyze the data associated with that comparison + elif diffInfoObject is not None : + + # analyze each of the original data sets that are being compared + self.a_gen_stats = GeneralStatistics(dataObject=diffInfoObject.a_data_object, dataSetDescription="a") + self.b_gen_stats = GeneralStatistics(dataObject=diffInfoObject.b_data_object, dataSetDescription="b") + + # fill in our statistics + self.epsilon = diffInfoObject.epsilon_value + self.epsilon_percent = diffInfoObject.epsilon_percent + self.num_data_points = diffInfoObject.a_data_object.masks.missing_mask.size + self.shape = diffInfoObject.a_data_object.masks.missing_mask.shape + # also calculate the invalid points + self.spatially_invalid_pts_ignored_in_a = np.sum(diffInfoObject.a_data_object.masks.ignore_mask) + self.spatially_invalid_pts_ignored_in_b = np.sum(diffInfoObject.b_data_object.masks.ignore_mask) + + else: + raise ValueError ("No data set was given when requesting general statistical analysis.") def dictionary_form(self) : """ get a dictionary form of the statistics """ - toReturn = { - 'missing_value': self.missing_value, - 'max': self.max, - 'min': self.min, - 'mean': self.mean, - 'median': self.median, - 'std_val': self.std_val, - 'num_data_points': self.num_data_points, - 'shape': self.shape, - 'spatially_invalid_pts_ignored': self.spatially_invalid_pts_ignored, - } + toReturn = { } + + # if we only have stats for one data set + if self.is_one_data_set : + temp_prefix, temp_suffix = self.make_prefix_and_suffix(self.desc_text) + toReturn = { + temp_prefix + 'missing_value': self.missing_value, + 'max' + temp_suffix: self.max, + 'min' + temp_suffix: self.min, + 'mean' + temp_suffix: self.mean, + 'median' + temp_suffix: self.median, + 'std_val' + temp_suffix: self.std_val, + 'spatially_invalid_pts_ignored' + temp_suffix: self.spatially_invalid_pts_ignored, + } + + if self.do_extras : + toReturn['num_data_points'] = self.num_data_points + toReturn['shape'] = self.shape + 
+ # otherwise we must have stats for a comparison + else : + toReturn = { + 'epsilon': self.epsilon, + 'epsilon_percent': self.epsilon_percent, + 'num_data_points': self.num_data_points, + 'shape': self.shape, + } + a_dict = self.a_gen_stats.dictionary_form() + toReturn.update(a_dict) + b_dict = self.b_gen_stats.dictionary_form() + toReturn.update(b_dict) return toReturn @staticmethod - def doc_strings( ) : + def doc_strings(inspect=False) : """ get documentation strings that match the dictionary form of the statistics """ - return GeneralInspectionStatistics._doc_strings + return GeneralStatistics._doc_strings if not inspect else GeneralStatistics._doc_strings_inspect class NumericalComparisonStatistics (StatisticalData) : """ @@ -712,66 +720,35 @@ class NumericalComparisonStatistics (StatisticalData) : self.title = 'Numerical Comparison Statistics' # pull out some info we will use later - valid_in_both = diffInfoObject.diff_data_object.masks.valid_mask - outside_epsilon_mask = diffInfoObject.diff_data_object.masks.outside_epsilon_mask - mismatch_mask = diffInfoObject.diff_data_object.masks.mismatch_mask - aData = diffInfoObject.a_data_object.data - bData = diffInfoObject.b_data_object.data - - assert (valid_in_both.shape == outside_epsilon_mask.shape) - assert (outside_epsilon_mask.shape == mismatch_mask.shape) - assert (mismatch_mask.shape == aData.shape) - assert (aData.shape == bData.shape) + valid_in_both = diffInfoObject.diff_data_object.masks.valid_mask + aData = diffInfoObject.a_data_object.data + bData = diffInfoObject.b_data_object.data + total_num_finite_values = np.sum(valid_in_both) # just the finite values, not all data # fill in some simple statistics - self.diff_outside_epsilon_count = np.sum(outside_epsilon_mask) + self.diff_outside_epsilon_count = np.sum(diffInfoObject.diff_data_object.masks.outside_epsilon_mask) self.perfect_match_count = NumericalComparisonStatistics._get_num_perfect(aData, bData, goodMask=valid_in_both) self.correlation = 
delta.compute_correlation(aData, bData, valid_in_both) self.r_squared_correlation = self.correlation * self.correlation - self.mismatch_points_count = np.sum(mismatch_mask) - - # we actually want the total number of _finite_ values rather than all the data - total_num_finite_values = np.sum(valid_in_both) + self.mismatch_points_count = np.sum(diffInfoObject.diff_data_object.masks.mismatch_mask) - # calculate some more complex statistics - self.mismatch_points_fraction = float(self.mismatch_points_count) / float(aData.size) - # be careful not to divide by zero if we don't have finite data - if total_num_finite_values > 0 : - self.diff_outside_epsilon_fraction = float(self.diff_outside_epsilon_count) / float(total_num_finite_values) - self.perfect_match_fraction = float(self.perfect_match_count) / float(total_num_finite_values) - else: - self.diff_outside_epsilon_fraction = 0.0 - self.perfect_match_fraction = 0.0 + # calculate some more complex statistics, be careful not to divide by zero + self.mismatch_points_fraction = float(self.mismatch_points_count) / float(aData.size) if (aData.size > 0) else 0.0 + self.diff_outside_epsilon_fraction = float(self.diff_outside_epsilon_count) / float(total_num_finite_values) if (total_num_finite_values > 0) else 0.0 + self.perfect_match_fraction = float(self.perfect_match_count) / float(total_num_finite_values) if (total_num_finite_values > 0) else 0.0 # if desired, do the basic analysis - self.does_include_simple = include_basic_analysis - if (include_basic_analysis) : - basic_dict = NumericalComparisonStatistics.basic_analysis(diffInfoObject.diff_data_object.data, - valid_in_both) - if len(basic_dict) > 0 : - self.rms_val = basic_dict['rms_val'] - self.std_val = basic_dict['std_val'] - self.mean_diff = basic_dict['mean_diff'] - self.median_diff = basic_dict['median_diff'] - self.max_diff = basic_dict['max_diff'] - - self.mean_delta = basic_dict['mean_delta'] - self.median_delta = basic_dict['median_delta'] - self.max_delta = 
basic_dict['max_delta'] - self.min_delta = basic_dict['min_delta'] - else : - self.rms_val = np.nan - self.std_val = np.nan - self.mean_diff = np.nan - self.median_diff = np.nan - self.max_diff = np.nan - - self.mean_delta = np.nan - self.median_delta = np.nan - self.max_delta = np.nan - self.min_delta = np.nan - self.temp_analysis = basic_dict + self.temp_analysis = NumericalComparisonStatistics.basic_analysis(diffInfoObject.diff_data_object.data, valid_in_both) if include_basic_analysis else { } + self.rms_val = self.temp_analysis['rms_val'] if (len(self.temp_analysis) > 0) else np.nan + self.std_val = self.temp_analysis['std_val'] if (len(self.temp_analysis) > 0) else np.nan + self.mean_diff = self.temp_analysis['mean_diff'] if (len(self.temp_analysis) > 0) else np.nan + self.median_diff = self.temp_analysis['median_diff'] if (len(self.temp_analysis) > 0) else np.nan + self.max_diff = self.temp_analysis['max_diff'] if (len(self.temp_analysis) > 0) else np.nan + self.mean_delta = self.temp_analysis['mean_delta'] if (len(self.temp_analysis) > 0) else np.nan + self.median_delta = self.temp_analysis['median_delta'] if (len(self.temp_analysis) > 0) else np.nan + self.max_delta = self.temp_analysis['max_delta'] if (len(self.temp_analysis) > 0) else np.nan + self.min_delta = self.temp_analysis['min_delta'] if (len(self.temp_analysis) > 0) else np.nan def dictionary_form(self) : """ @@ -807,15 +784,14 @@ class NumericalComparisonStatistics (StatisticalData) : do some very minimal analysis of the differences """ - # if all the data is invalid, - # we can't do any of these forms of statistical analysis + # if everything's invalid, stop now if np.sum(valid_mask) <= 0 : return { } - # calculate our statistics + # calculate and return statistics root_mean_square_value = delta.calculate_root_mean_square(diffData, valid_mask) - tempDiffData = diffData[valid_mask] - absDiffData = np.abs(tempDiffData) + tempDiffData = diffData[valid_mask] + absDiffData = np.abs(tempDiffData) 
return { 'rms_val': root_mean_square_value, 'std_val': np.std(tempDiffData), @@ -836,10 +812,12 @@ class NumericalComparisonStatistics (StatisticalData) : the value in A perfectly matches the value in B """ numPerfect = 0 - if not (goodMask is None) : - numPerfect = np.sum(aData[goodMask] == bData[goodMask]) - else : + + if goodMask is None : numPerfect = np.sum(aData == bData) + else : + numPerfect = np.sum(aData[goodMask] == bData[goodMask]) + return numPerfect class StatisticalAnalysis (StatisticalData) : @@ -854,8 +832,8 @@ class StatisticalAnalysis (StatisticalData) : missingValue - a MissingValueStatistics object finiteData - a FiniteDataStatistics object - It can also provide a dictionary form of the statistics or the - documentation of the statistics. + It can also provide a dictionary form of the statistics and + documentation for the statistics. """ def __init__ (self) : @@ -908,11 +886,11 @@ class StatisticalAnalysis (StatisticalData) : build and set all of the statistics sets """ - self.general = GeneralStatistics(diffInfoObject) + self.general = GeneralStatistics (diffInfoObject=diffInfoObject) self.comparison = NumericalComparisonStatistics(diffInfoObject) - self.notANumber = NotANumberStatistics(diffInfoObject) - self.missingValue = MissingValueStatistics(diffInfoObject) - self.finiteData = FiniteDataStatistics(diffInfoObject) + self.notANumber = NotANumberStatistics (diffInfoObject=diffInfoObject) + self.missingValue = MissingValueStatistics (diffInfoObject=diffInfoObject) + self.finiteData = FiniteDataStatistics (diffInfoObject=diffInfoObject) def check_pass_or_fail(self, epsilon_failure_tolerance =np.nan, epsilon_failure_tolerance_default =None, @@ -936,10 +914,7 @@ class StatisticalAnalysis (StatisticalData) : # did we fail based on the epsilon? 
failed_fraction = self.comparison.diff_outside_epsilon_fraction - #failed_fraction = variableStats['Numerical Comparison Statistics']['diff_outside_epsilon_fraction'] - passed_epsilon = None - if epsilonTolerance is not None : - passed_epsilon = failed_fraction <= epsilonTolerance + passed_epsilon = None if (epsilonTolerance is None) else (failed_fraction <= epsilonTolerance) passValues.append(passed_epsilon) # test the nonfinite tolerance @@ -949,10 +924,7 @@ class StatisticalAnalysis (StatisticalData) : # did we fail based on nonfinite data non_finite_diff_fraction = self.finiteData.finite_in_only_one_fraction - #non_finite_diff_fraction = variableStats['Finite Data Statistics']['finite_in_only_one_fraction'] - passed_nonfinite = None - if nonfiniteTolerance is not None : - passed_nonfinite = non_finite_diff_fraction <= nonfiniteTolerance + passed_nonfinite = None if (nonfiniteTolerance is None) else (non_finite_diff_fraction <= nonfiniteTolerance) passValues.append(passed_nonfinite) # test if the total failed percentage is acceptable @@ -961,23 +933,17 @@ class StatisticalAnalysis (StatisticalData) : totalFailTolerance = total_data_failure_tolerance if total_data_failure_tolerance is not np.nan else total_data_failure_tolerance_default # did we fail based on all data failures? 
- passed_all_percentage = None - if totalFailTolerance is not None : - passed_all_percentage = (non_finite_diff_fraction + failed_fraction) <= totalFailTolerance + passed_all_percentage = None if (totalFailTolerance is None) else ((non_finite_diff_fraction + failed_fraction) <= totalFailTolerance) passValues.append(passed_all_percentage) # test the r-squared correlation coefficent # get the minimum acceptable r-squared correlation coefficient - min_r_squared = min_acceptable_r_squared if min_acceptable_r_squared is not np.nan else min_acceptable_r_squared_default + min_r_squared = min_acceptable_r_squared if (min_acceptable_r_squared is not np.nan) else min_acceptable_r_squared_default # did we fail based on the r-squared correlation coefficient? - r_squared_value = None - passed_r_squared = None - if min_r_squared is not None : - r_squared_value = self.comparison.r_squared_correlation - #r_squared_value = variableStats['Numerical Comparison Statistics']['r-squared correlation'] - passed_r_squared = r_squared_value >= min_r_squared + r_squared_value = None if (min_r_squared is None) else self.comparison.r_squared_correlation + passed_r_squared = None if (min_r_squared is None) else (r_squared_value >= min_r_squared) passValues.append(passed_r_squared) # figure out the overall pass/fail result @@ -999,11 +965,11 @@ class StatisticalAnalysis (StatisticalData) : toReturn = { } # build a dictionary of all our statistics - toReturn[self.general.title] = self.general.dictionary_form() - toReturn[self.comparison.title] = self.comparison.dictionary_form() - toReturn[self.notANumber.title] = self.notANumber.dictionary_form() + toReturn[self.general.title] = self.general.dictionary_form() + toReturn[self.comparison.title] = self.comparison.dictionary_form() + toReturn[self.notANumber.title] = self.notANumber.dictionary_form() toReturn[self.missingValue.title] = self.missingValue.dictionary_form() - toReturn[self.finiteData.title] = self.finiteData.dictionary_form() + 
toReturn[self.finiteData.title] = self.finiteData.dictionary_form() return toReturn @@ -1023,11 +989,11 @@ class StatisticalAnalysis (StatisticalData) : """ toReturn = { } - toReturn.update(GeneralStatistics.doc_strings()) + toReturn.update( GeneralStatistics.doc_strings()) toReturn.update(NumericalComparisonStatistics.doc_strings()) - toReturn.update(NotANumberStatistics.doc_strings()) - toReturn.update(MissingValueStatistics.doc_strings()) - toReturn.update(FiniteDataStatistics.doc_strings()) + toReturn.update( NotANumberStatistics.doc_strings()) + toReturn.update( MissingValueStatistics.doc_strings()) + toReturn.update( FiniteDataStatistics.doc_strings()) return toReturn @@ -1037,13 +1003,13 @@ class StatisticalInspectionAnalysis (StatisticalData) : It includes the following sets of statistics: - general - a GeneralInspectionStatistics object - notANumber - a NotANumberInspectionStatistics object - missingValue - a MissingValueInspectionStatistics object - finiteData - a FiniteDataInspectionStatistics object + general - a GeneralStatistics object + notANumber - a NotANumberStatistics object + missingValue - a MissingValueStatistics object + finiteData - a FiniteDataStatistics object - It can also provide a dictionary form of the statistics or the - documentation of the statistics. + It can also provide a dictionary form of the statistics and + documentation for the statistics. 
""" def __init__ (self) : @@ -1089,10 +1055,11 @@ class StatisticalInspectionAnalysis (StatisticalData) : build and set all of the statistics sets """ - self.general = GeneralInspectionStatistics(dataObject) - self.notANumber = NotANumberInspectionStatistics(dataObject) - self.missingValue = MissingValueInspectionStatistics(dataObject) - self.finiteData = FiniteDataInspectionStatistics(dataObject) + self.general = GeneralStatistics( dataObject=dataObject, + doExtras=True) + self.notANumber = NotANumberStatistics( dataObject=dataObject) + self.missingValue = MissingValueStatistics(dataObject=dataObject) + self.finiteData = FiniteDataStatistics( dataObject=dataObject) def dictionary_form(self) : """ @@ -1124,10 +1091,10 @@ class StatisticalInspectionAnalysis (StatisticalData) : """ toReturn = { } - toReturn.update(GeneralInspectionStatistics.doc_strings()) - toReturn.update(NotANumberInspectionStatistics.doc_strings()) - toReturn.update(MissingValueInspectionStatistics.doc_strings()) - toReturn.update(FiniteDataInspectionStatistics.doc_strings()) + toReturn.update( GeneralStatistics.doc_strings(inspect=True)) + toReturn.update( NotANumberStatistics.doc_strings(inspect=True)) + toReturn.update(MissingValueStatistics.doc_strings(inspect=True)) + toReturn.update( FiniteDataStatistics.doc_strings(inspect=True)) return toReturn