From 88ea287d303d2e82b9c628b1330de5678dc775e5 Mon Sep 17 00:00:00 2001 From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01> Date: Tue, 18 May 2010 19:15:35 +0000 Subject: [PATCH] adding basic data objects and making difference analysis their responsibility; minimal integration to keep them working with the rest of the code git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@110 8a9318a1-56ba-4d59-b755-99d26321be01 --- pyglance/glance/compare.py | 19 +++ pyglance/glance/data.py | 259 +++++++++++++++++++++++++++++++++++++ pyglance/glance/delta.py | 104 --------------- pyglance/glance/plot.py | 19 +++ pyglance/glance/stats.py | 31 +++-- 5 files changed, 318 insertions(+), 114 deletions(-) create mode 100644 pyglance/glance/data.py diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index e8f8419..a91d8fe 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -22,6 +22,7 @@ import glance.delta as delta import glance.plot as plot import glance.report as report import glance.stats as statistics +import glance.data as dataobj import glance.collocation as collocation import glance.plotcreatefns as plotcreate @@ -452,14 +453,32 @@ def _check_lon_lat_equality(longitudeA, latitudeA, lon_lat_not_equal_points_percent = 0.0 # get information about how the latitude and longitude differ + aDataObject = dataobj.DataObject(longitudeA, ignoreMask=ignoreMaskA) + bDataObject = dataobj.DataObject(longitudeB, ignoreMask=ignoreMaskB) + diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=llepsilon) #TODO, needs epsilon percent + #TODO, for the moment, unpack these values into local variables + longitudeDiff = diffInfo.diff_data_object.data + finiteLongitudeMask = diffInfo.diff_data_object.masks.valid_mask + lon_not_equal_mask = diffInfo.diff_data_object.masks.trouble_mask + """ longitudeDiff, finiteLongitudeMask, _, _, lon_not_equal_mask, _, _, _ = delta.diff(longitudeA, longitudeB, llepsilon, (None, None), (ignoreMaskA, ignoreMaskB)) + """ + aDataObject = dataobj.DataObject(latitudeA, ignoreMask=ignoreMaskA) + bDataObject = dataobj.DataObject(latitudeB, ignoreMask=ignoreMaskB) + diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=llepsilon) #TODO, needs epsilon percent + #TODO, for the moment, unpack these values into local variables + latitudeDiff = diffInfo.diff_data_object.data + finiteLatitudeMask = diffInfo.diff_data_object.masks.valid_mask + lat_not_equal_mask = diffInfo.diff_data_object.masks.trouble_mask + """ latitudeDiff, finiteLatitudeMask, _, _, lat_not_equal_mask, _, _, _ = delta.diff(latitudeA, latitudeB, llepsilon, (None, None), (ignoreMaskA, ignoreMaskB)) + """ lon_lat_not_equal_mask = lon_not_equal_mask | lat_not_equal_mask lon_lat_not_equal_points_count = sum(lon_lat_not_equal_mask) diff --git a/pyglance/glance/data.py b/pyglance/glance/data.py new file mode 100644 index 0000000..8798711 --- /dev/null +++ b/pyglance/glance/data.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" +Data objects for use in glance + +Created by evas Apr 2010. +Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved. +""" + +import logging +import numpy as np + +import glance.delta as delta + +LOG = logging.getLogger(__name__) + +class BasicMaskSetObject (object) : + """ + This class represents a basic set of masks that a data set may have. + The set must contain an "ignore" mask, and may optionally contain others. + Note: This item is intended to be read only. If you want to change a mask, + create a new one with the new masks + + ignore_mask - a mask of data that should be ignored for reasons not related to + the contents of the actual data set (generally longitude or + latitude issues) + valid_mask - a mask of "good" values (ie. finite and non-missing) + non_finite_mask - a mask of non-finite values + missing_mask - a mask of where the data's fill value is present instead of + actual data values + """ + + def __init__(self, ignoreMask, + validMask=None, nonFiniteMask=None, missingMask=None) : + """ + create the mask set with at least the ignore mask + (the others are optional) + """ + self._reset_all_masks() + + self.ignore_mask = ignoreMask + self.valid_mask = validMask + self.non_finite_mask = nonFiniteMask + self.missing_mask = missingMask + + def _reset_all_masks(self) : + """ + set all the masks to None + """ + self.ignore_mask = None + self.valid_mask = None + self.non_finite_mask = None + self.missing_mask = None + +class DiffMaskSetObject (BasicMaskSetObject) : + """ + This class represents a set of masks that related to two or more + compared data sets. The inherited ignore/valid/non-finite/missing + masks are used to capture information about where data should be + ignored/valid/non-finite in the compared data. + + Additionally, masks describing trouble points and points outside + of the epsilon analysis tolerances are included. + + trouble_mask - a mask of data points which may indicate issues in the data + outside_epsilon_mask - a mask of points which did not pass the epsilon + tolerance testing + """ + + def __init__(self, ignoreMask, validInBothMask, troubleMask, epsilonMask) : + """ + create a more complex mask, including additional difference information + """ + self._reset_all_masks() + + self.ignore_mask = ignoreMask + self.valid_mask = validInBothMask + self.trouble_mask = troubleMask + self.outside_epsilon_mask = epsilonMask + +class DataObject (object) : + """ + This class represents a data set. + It may include a multidimentional numpy array of data + as well as the fill value and a set of masks that apply to this data. + + data - the raw array of data (generally this should be a numpy array) + fill_value - the fill value used in the data array + masks - the set of masks that apply to this data + """ + + def __init__(self, dataArray, fillValue=None, ignoreMask=None) : + """ + Create the data object. + + The array of data is expected to be a numpy array. + The fill value and mask sets are optional. + If the fill value is provided it is expected to be of the same + data type as the data array. + """ + + # TODO, add some assertions for our expectations + + self.data = dataArray + self.fill_value = fillValue + self.masks = BasicMaskSetObject(ignoreMask) + + # TODO, analyze in issolation? + +class DiffInfoObject (object) : + """ + This class represents the full difference between two data sets. + + a_data_object - data object describing the A data set + b_data_object - data object describing the B data set + diff_data_object - data object describing the raw differences between A and B + + epsilon_value - the epsilon value used for comparison or None + epsilon_percent - the percentage (of A) used for epsilon comparisons or None + (if both a value and percent are present, two epsilon tests will be done) + """ + + def __init__(self, aDataObject, bDataObject, + epsilonValue=0.0, epsilonPercent=None) : + """ + analyze the difference between these two data sets at the + given epsilon values + """ + + # set the basic values + self.a_data_object = aDataObject + self.b_data_object = bDataObject + self.epsilon_value = epsilonValue + self.epsilon_percent = epsilonPercent + + # diff the two data sets TODO, this doesn't use epsilon percent yet + raw_diff, valid_in_both, (valid_in_a_mask, valid_in_b_mask), trouble_pt_mask, outside_epsilon_mask, \ + (a_not_finite_mask, b_not_finite_mask), (a_missing_mask, b_missing_mask), (ignore_mask_a, ignore_mask_b) = \ + diff(aDataObject.data, bDataObject.data, epsilonValue, + (aDataObject.fill_value, bDataObject.fill_value), + (aDataObject.masks.ignore_mask, bDataObject.masks.ignore_mask)) + + # set the various data in our two basic data objects + aDataObject.masks = BasicMaskSetObject(ignore_mask_a, valid_in_a_mask, a_not_finite_mask, a_missing_mask) + bDataObject.masks = BasicMaskSetObject(ignore_mask_b, valid_in_b_mask, b_not_finite_mask, b_missing_mask) + + # create our diff info object + self.diff_data_object = DataObject(raw_diff) + self.diff_data_object.masks = DiffMaskSetObject(ignore_mask_a | ignore_mask_b, + valid_in_both, trouble_pt_mask, outside_epsilon_mask) + +# Upcasts to be used in difference computation to avoid overflow. Currently only unsigned +# ints are upcast. +# FUTURE: handle uint64s as well (there is no int128, so might have to detect overflow) +datatype_upcasts = { + np.uint8: np.int16, + np.uint16: np.int32, + np.uint32: np.int64 + } + +# TODO, rethink how this works +def _select_fill_data(dTypeValue) : + """ + select a fill data value based on the type of data that is being + inspected/changed + """ + + fill_value_to_return = None + + if np.issubdtype(dTypeValue, np.float) or np.issubdtype(dTypeValue, np.complex) : + fill_value_to_return = np.nan + elif np.issubdtype(dTypeValue, np.int) : + fill_value_to_return = np.iinfo(dTypeValue).min + elif np.issubdtype(dTypeValue, np.bool) : + fill_value_to_return = True + elif ((dTypeValue is np.uint8) or + (dTypeValue is np.uint16) or + (dTypeValue is np.uint32) or + (dTypeValue is np.uint64)) : + fill_value_to_return = np.iinfo(dTypeValue).max + + return fill_value_to_return + +def diff(aData, bData, epsilon=0., + (a_missing_value, b_missing_value)=(None, None), + (ignore_mask_a, ignore_mask_b)=(None, None)): + """ + take two arrays of similar size and composition + if an ignoreMask is passed in values in the mask will not be analysed to + form the various return masks and the corresponding spots in the + "difference" return data array will contain fill values (selected + based on data type). + + return difference array filled with fill data where differences aren't valid, + good mask where values are finite in both a and b + trouble mask where missing values or nans don't match or delta > epsilon + (a-notfinite-mask, b-notfinite-mask) + (a-missing-mask, b-missing-mask) + """ + shape = aData.shape + assert(bData.shape==shape) + assert(np.can_cast(aData.dtype, bData.dtype) or np.can_cast(bData.dtype, aData.dtype)) + + # if the ignore masks do not exist, set them to include none of the data + if (ignore_mask_a is None) : + ignore_mask_a = np.zeros(shape,dtype=bool) + if (ignore_mask_b is None) : + ignore_mask_b = np.zeros(shape,dtype=bool) + + # deal with the basic masks + a_not_finite_mask, b_not_finite_mask = ~np.isfinite(aData) & ~ignore_mask_a, ~np.isfinite(bData) & ~ignore_mask_b + a_missing_mask, b_missing_mask = np.zeros(shape,dtype=bool), np.zeros(shape,dtype=bool) + # if we were given missing values, mark where they are in the data + if a_missing_value is not None: + a_missing_mask[aData == a_missing_value] = True + a_missing_mask[ignore_mask_a] = False # don't analyse the ignored values + if b_missing_value is not None: + b_missing_mask[bData == b_missing_value] = True + b_missing_mask[ignore_mask_b] = False # don't analyse the ignored values + + # build the comparison data that includes the "good" values + valid_in_a_mask = ~(a_not_finite_mask | a_missing_mask | ignore_mask_a) + valid_in_b_mask = ~(b_not_finite_mask | b_missing_mask | ignore_mask_b) + valid_in_both = valid_in_a_mask & valid_in_b_mask + + # figure out our shared data type + sharedType = aData.dtype + if (aData.dtype is not bData.dtype) : + sharedType = np.common_type(aData, bData) + + # upcast if needed to avoid overflow in difference operation + if sharedType in datatype_upcasts: + sharedType = datatype_upcasts[sharedType] + + LOG.debug('Shared data type that will be used for diff comparison: ' + str(sharedType)) + + # construct our diff'ed array + raw_diff = np.zeros(shape, dtype=sharedType) #empty_like(aData) + + fill_data_value = _select_fill_data(sharedType) + + LOG.debug('current fill data value: ' + str(fill_data_value)) + + raw_diff[~valid_in_both] = fill_data_value # throw away invalid data + + # compute difference, using shared type in computation + raw_diff[valid_in_both] = bData[valid_in_both].astype(sharedType) - aData[valid_in_both].astype(sharedType) + + # the valid data which is too different between the two sets according to the given epsilon + outside_epsilon_mask = (abs(raw_diff) > epsilon) & valid_in_both + # trouble points = mismatched nans, mismatched missing-values, differences that are too large + trouble_pt_mask = (a_not_finite_mask ^ b_not_finite_mask) | (a_missing_mask ^ b_missing_mask) | outside_epsilon_mask + + return raw_diff, valid_in_both, (valid_in_a_mask, valid_in_b_mask), trouble_pt_mask, outside_epsilon_mask, \ + (a_not_finite_mask, b_not_finite_mask), (a_missing_mask, b_missing_mask), (ignore_mask_a, ignore_mask_b) + +if __name__=='__main__': + import doctest + doctest.testmod() diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py index 31f69db..cbfdd0b 100644 --- a/pyglance/glance/delta.py +++ b/pyglance/glance/delta.py @@ -16,116 +16,12 @@ compute_r = pearsonr LOG = logging.getLogger(__name__) -# Upcasts to be used in difference computation to avoid overflow. Currently only unsigned -# ints are upcast. -# FUTURE: handle uint64s as well (there is no int128, so might have to detect overflow) -datatype_upcasts = { - uint8: int16, - uint16: int32, - uint32: int64 - } - # TODO, where is this being used? def _missing(x, missing_value=None): if missing_value is not None: return isnan(x) | (x==missing_value) return isnan(x) -def diff(aData, bData, epsilon=0., - (a_missing_value, b_missing_value)=(None, None), - (ignore_mask_a, ignore_mask_b)=(None, None)): - """ - take two arrays of similar size and composition - if an ignoreMask is passed in values in the mask will not be analysed to - form the various return masks and the corresponding spots in the - "difference" return data array will contain fill values (selected - based on data type). - - return difference array filled with fill data where differences aren't valid, - good mask where values are finite in both a and b - trouble mask where missing values or nans don't match or delta > epsilon - (a-notfinite-mask, b-notfinite-mask) - (a-missing-mask, b-missing-mask) - """ - shape = aData.shape - assert(bData.shape==shape) - assert(can_cast(aData.dtype, bData.dtype) or can_cast(bData.dtype, aData.dtype)) - - # if the ignore masks do not exist, set them to include none of the data - if (ignore_mask_a is None) : - ignore_mask_a = zeros(shape,dtype=bool) - if (ignore_mask_b is None) : - ignore_mask_b = zeros(shape,dtype=bool) - - # deal with the basic masks - a_not_finite_mask, b_not_finite_mask = ~isfinite(aData) & ~ignore_mask_a, ~isfinite(bData) & ~ignore_mask_b - a_missing_mask, b_missing_mask = zeros(shape,dtype=bool), zeros(shape,dtype=bool) - # if we were given missing values, mark where they are in the data - if a_missing_value is not None: - a_missing_mask[aData == a_missing_value] = True - a_missing_mask[ignore_mask_a] = False # don't analyse the ignored values - if b_missing_value is not None: - b_missing_mask[bData == b_missing_value] = True - b_missing_mask[ignore_mask_b] = False # don't analyse the ignored values - - # build the comparison data that includes the "good" values - valid_in_a_mask = ~(a_not_finite_mask | a_missing_mask | ignore_mask_a) - valid_in_b_mask = ~(b_not_finite_mask | b_missing_mask | ignore_mask_b) - valid_in_both = valid_in_a_mask & valid_in_b_mask - - # figure out our shared data type - sharedType = aData.dtype - if (aData.dtype is not bData.dtype) : - sharedType = common_type(aData, bData) - - # upcast if needed to avoid overflow in difference operation - if sharedType in datatype_upcasts: - sharedType = datatype_upcasts[sharedType] - - LOG.debug('Shared data type that will be used for diff comparison: ' + str(sharedType)) - - # construct our diff'ed array - raw_diff = zeros(shape, dtype=sharedType) #empty_like(aData) - - fill_data_value = select_fill_data(sharedType) - - LOG.debug('current fill data value: ' + str(fill_data_value)) - - raw_diff[~valid_in_both] = fill_data_value # throw away invalid data - - # compute difference, using shared type in computation - raw_diff[valid_in_both] = bData[valid_in_both].astype(sharedType) - aData[valid_in_both].astype(sharedType) - - # the valid data which is too different between the two sets according to the given epsilon - outside_epsilon_mask = (abs(raw_diff) > epsilon) & valid_in_both - # trouble points = mismatched nans, mismatched missing-values, differences that are too large - trouble_pt_mask = (a_not_finite_mask ^ b_not_finite_mask) | (a_missing_mask ^ b_missing_mask) | outside_epsilon_mask - - return raw_diff, valid_in_both, (valid_in_a_mask, valid_in_b_mask), trouble_pt_mask, outside_epsilon_mask, \ - (a_not_finite_mask, b_not_finite_mask), (a_missing_mask, b_missing_mask), (ignore_mask_a, ignore_mask_b) - -def select_fill_data(dTypeValue) : - """ - select a fill data value based on the type of data that is being - inspected/changed - """ - - fill_value_to_return = None - - if issubdtype(dTypeValue, np.float) or issubdtype(dTypeValue, np.complex) : - fill_value_to_return = nan - elif issubdtype(dTypeValue, np.int) : - fill_value_to_return = np.iinfo(dTypeValue).min - elif issubdtype(dTypeValue, np.bool) : - fill_value_to_return = True - elif ((dTypeValue is np.uint8) or - (dTypeValue is np.uint16) or - (dTypeValue is np.uint32) or - (dTypeValue is np.uint64)) : - fill_value_to_return = np.iinfo(dTypeValue).max - - return fill_value_to_return - def corr(x,y,mask): "compute correlation coefficient" gf = mask.flatten() diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py index 6cfa540..910fac4 100644 --- a/pyglance/glance/plot.py +++ b/pyglance/glance/plot.py @@ -23,6 +23,7 @@ import numpy as np import glance.graphics as maps import glance.delta as delta import glance.figures as figures +import glance.data as dataobj import glance.plotcreatefns as plotfns LOG = logging.getLogger(__name__) @@ -232,10 +233,28 @@ def plot_and_save_comparison_figures (aData, bData, spaciallyInvalidMaskB = lonLatDataDict['b']['inv_mask'] # compare the two data sets to get our difference data and trouble info + aDataObject = dataobj.DataObject(aData, fillValue=missingValue, ignoreMask=spaciallyInvalidMaskA) + bDataObject = dataobj.DataObject(bData, fillValue=missingValueAltInB, ignoreMask=spaciallyInvalidMaskB) + diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=epsilon) #TODO, needs epsilon percent + #TODO, for the moment, unpack these values into local variables + rawDiffData = diffInfo.diff_data_object.data + goodMask = diffInfo.diff_data_object.masks.valid_mask + goodInAMask = diffInfo.a_data_object.masks.valid_mask + goodInBMask = diffInfo.b_data_object.masks.valid_mask + troubleMask = diffInfo.diff_data_object.masks.trouble_mask + outsideEpsilonMask = diffInfo.diff_data_object.masks.outside_epsilon_mask + aNotFiniteMask = diffInfo.a_data_object.masks.non_finite_mask + bNotFiniteMask = diffInfo.b_data_object.masks.non_finite_mask + aMissingMask = diffInfo.a_data_object.masks.missing_mask + bMissingMask = diffInfo.b_data_object.masks.missing_mask + spaciallyInvalidMaskA = diffInfo.a_data_object.masks.ignore_mask + spaciallyInvalidMaskB = diffInfo.b_data_object.masks.ignore_mask + """ rawDiffData, goodMask, (goodInAMask, goodInBMask), troubleMask, outsideEpsilonMask, \ (aNotFiniteMask, bNotFiniteMask), (aMissingMask, bMissingMask), \ (spaciallyInvalidMaskA, spaciallyInvalidMaskB) = delta.diff(aData, bData, epsilon, (missingValue, missingValueAltInB), (spaciallyInvalidMaskA, spaciallyInvalidMaskB)) + """ absDiffData = np.abs(rawDiffData) # we also want to show the distance between our two, not just which one's bigger/smaller # from this point on, we will be forking to create child processes so we can parallelize our image and diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py index c9e754f..7461c2c 100644 --- a/pyglance/glance/stats.py +++ b/pyglance/glance/stats.py @@ -8,9 +8,8 @@ Created by evas Apr 2010. Copyright (c) 2010 University of Wisconsin SSEC. All rights reserved. """ -#import glance.data as data +import glance.data as dataobj import glance.delta as delta -#from glance.data import MaskSetObject import numpy as np @@ -20,22 +19,34 @@ def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), """return dictionary of statistics dictionaries stats not including 'nan' in name exclude nans in either arrays """ - + # diff our two data sets + aDataObject = dataobj.DataObject(a, fillValue=a_missing_value, ignoreMask=ignoreInAMask) + bDataObject = dataobj.DataObject(b, fillValue=b_missing_value, ignoreMask=ignoreInBMask) + diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=epsilon) #TODO, needs epsilon percent + #TODO, for the moment, unpack these values into local variables + diffData = diffInfo.diff_data_object.data + finite_mask = diffInfo.diff_data_object.masks.valid_mask + finite_a_mask = diffInfo.a_data_object.masks.valid_mask + finite_b_mask = diffInfo.b_data_object.masks.valid_mask + trouble = diffInfo.diff_data_object.masks.trouble_mask + outside_epsilon = diffInfo.diff_data_object.masks.outside_epsilon_mask + anfin = diffInfo.a_data_object.masks.non_finite_mask + bnfin = diffInfo.b_data_object.masks.non_finite_mask + amis = diffInfo.a_data_object.masks.missing_mask + bmis = diffInfo.b_data_object.masks.missing_mask + ignoreInAMask = diffInfo.a_data_object.masks.ignore_mask + ignoreInBMask = diffInfo.b_data_object.masks.ignore_mask + """ diffData, finite_mask, (finite_a_mask, finite_b_mask), \ trouble, outside_epsilon, (anfin, bnfin), \ (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = delta.diff(a, b, epsilon, (a_missing_value, b_missing_value), (ignoreInAMask, ignoreInBMask)) - ''' - d, valid_mask, trouble, (anfin, bnfin), (amis, bmis), outside_epsilon = nfo = diff(a,b, - epsilon, - (a_missing_value, b_missing_value), - (ignoreInAMask, ignoreInBMask)) - ''' + """ general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask) - additional_statistics = stats(*nfo) # grab some additional comparison statistics + additional_statistics = stats(diffData, finite_mask) #*nfo) # grab some additional comparison statistics comparison_stats = _get_numerical_data_stats(a, b, diffData, finite_mask, outside_epsilon, trouble, additional_statistics) nan_stats = _get_nan_stats(anfin, bnfin) missing_stats = _get_missing_value_stats(amis, bmis) -- GitLab