diff --git a/pyglance/glance/collocation.py b/pyglance/glance/collocation.py index 35b1401a557c84901418a05f9af1c5cc353cb2ef..4c5ad9475a57bd130a7bfa23ec0dc26d71c639ae 100644 --- a/pyglance/glance/collocation.py +++ b/pyglance/glance/collocation.py @@ -34,20 +34,20 @@ class CollocationMapping : """ """ - number_of_matches = -1 - number_unmatchable_in_a = -1 - number_unmatchable_in_b = -1 + raw_lon_lat_matches - the number of points matched, based on the longitude and latitude alone + raw_unmatchable_in_a - the number of unmacthed pointes in a, based on longitude and latitude alone + raw_unmatchable_in_b - the number of unmacthed pointes in b, based on longitude and latitude alone - a_point_mapping = None - b_point_mapping = None + a_point_mapping - mapping of points in a to points in b; TODO give form + b_point_mapping - mapping of points in b to points in a; TODO give form - matched_longitude = None - matched_latitude = None + matched_longitude - the list of matched longitude values + matched_latitude - the list of matched latitude values - unmatchable_longitude_a = None - unmatchable_latitude_a = None - unmatchable_longitude_b = None - unmatchable_latitude_b = None + unmatchable_longitude_a - the list of longitude values in a that could not be matched + unmatchable_latitude_a - the list of latitude values in a that could not be matched + unmatchable_longitude_b - the list of longitude values in b that could not be matched + unmatchable_latitude_b - the list of latitude values in b that could not be matched """ def __init__(self, @@ -71,7 +71,31 @@ class CollocationMapping : the match should be accepted """ pass + + def _create_basic_mapping_from_lon_lat((alongitude, alatitude), + (blongitude, blatitude), + lonlatEpsilon, + invalidAMask=None, invalidBMask=None) : + """ + match points together based on their longitude and latitude values + to match points must be within lonlatEpsilon degrees in both longitude and latitude + + if the longitude and latitude variables contain invalid data the invalidAMask and + invalidBMask should be passed with the appropriate masking to remove the invalid values + the return will be in the form of two dictionaries of points, one from a and one from b, + indexed on the index number in the A or B data where they can be found. Each entry will + consist of a list of: + [longitudeValue, latitudeValue, indexNumber, [list of matching indexes in the other set]] + + Note: the return will include all pairs of points that match, + this means an individual a or b point may be repeated if it matches + multiple points within the lonlatEpsilon provided + + Warning: This algorithm will fail to find all matching points if the lonlatEpsilon is set to a + value greater than or equal to 1.0 degrees. This is related to the bin size used for searching + thoretically the bin size could be corrected to scale with the lonlatEpsilon in the future. TODO + """ def create_colocation_mapping_within_epsilon((alongitude, alatitude), (blongitude, blatitude), diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index a91d8fef3d67eee26e8e8106d4dcedd0011d3dfa..1783e571dc4f12036edf7e8afb5112c42725711f 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -460,12 +460,7 @@ def _check_lon_lat_equality(longitudeA, latitudeA, longitudeDiff = diffInfo.diff_data_object.data finiteLongitudeMask = diffInfo.diff_data_object.masks.valid_mask lon_not_equal_mask = diffInfo.diff_data_object.masks.trouble_mask - """ - longitudeDiff, finiteLongitudeMask, _, _, lon_not_equal_mask, _, _, _ = delta.diff(longitudeA, longitudeB, - llepsilon, - (None, None), - (ignoreMaskA, ignoreMaskB)) - """ + aDataObject = dataobj.DataObject(latitudeA, ignoreMask=ignoreMaskA) bDataObject = dataobj.DataObject(latitudeB, ignoreMask=ignoreMaskB) diffInfo = dataobj.DiffInfoObject(aDataObject, bDataObject, epsilonValue=llepsilon) #TODO, needs epsilon percent @@ -473,12 +468,6 @@ def _check_lon_lat_equality(longitudeA, latitudeA, latitudeDiff = diffInfo.diff_data_object.data finiteLatitudeMask = diffInfo.diff_data_object.masks.valid_mask lat_not_equal_mask = diffInfo.diff_data_object.masks.trouble_mask - """ - latitudeDiff, finiteLatitudeMask, _, _, lat_not_equal_mask, _, _, _ = delta.diff(latitudeA, latitudeB, - llepsilon, - (None, None), - (ignoreMaskA, ignoreMaskB)) - """ lon_lat_not_equal_mask = lon_not_equal_mask | lat_not_equal_mask lon_lat_not_equal_points_count = sum(lon_lat_not_equal_mask) diff --git a/pyglance/glance/data.py b/pyglance/glance/data.py index 879871194a80771ff76a477ed30a197654cb1468..885319af18f564a9050c6a2a4a9b14802a45648d 100644 --- a/pyglance/glance/data.py +++ b/pyglance/glance/data.py @@ -105,7 +105,35 @@ class DataObject (object) : self.fill_value = fillValue self.masks = BasicMaskSetObject(ignoreMask) - # TODO, analyze in issolation? + def self_analysis(self) : + """ + Gather some basic information about a data set + """ + + # hang onto the shape for convenience + shape = self.data.shape + + # if there isn't an ignore mask, make an empty one + if self.masks.ignore_mask is None : + self.masks.ignore_mask = np.zeros(shape, dtype=np.bool) + + # find the non-finite values + non_finite_mask = ~np.isfinite(self.data) & ~self.masks.ignore_mask + + # find and mark the missing values + missing_mask = np.zeros(shape, dtype=np.bool) + # if the data has a fill value, mark where the missing data is + if self.fill_value is not None : + missing_mask[self.data == self.fill_value] = True + missing_mask[self.masks.ignore_mask] = False + + # define the valid mask as places where the data is not missing, + # nonfinite, or ignored + valid_mask = ~ (missing_mask | non_finite_mask | self.masks.ignore_mask) + + # set our masks + self.masks = BasicMaskSetObject(self.masks.ignore_mask, valid_mask, + non_finite_mask, missing_mask) class DiffInfoObject (object) : """ @@ -133,126 +161,106 @@ class DiffInfoObject (object) : self.epsilon_value = epsilonValue self.epsilon_percent = epsilonPercent - # diff the two data sets TODO, this doesn't use epsilon percent yet - raw_diff, valid_in_both, (valid_in_a_mask, valid_in_b_mask), trouble_pt_mask, outside_epsilon_mask, \ - (a_not_finite_mask, b_not_finite_mask), (a_missing_mask, b_missing_mask), (ignore_mask_a, ignore_mask_b) = \ - diff(aDataObject.data, bDataObject.data, epsilonValue, - (aDataObject.fill_value, bDataObject.fill_value), - (aDataObject.masks.ignore_mask, bDataObject.masks.ignore_mask)) - - # set the various data in our two basic data objects - aDataObject.masks = BasicMaskSetObject(ignore_mask_a, valid_in_a_mask, a_not_finite_mask, a_missing_mask) - bDataObject.masks = BasicMaskSetObject(ignore_mask_b, valid_in_b_mask, b_not_finite_mask, b_missing_mask) - - # create our diff info object - self.diff_data_object = DataObject(raw_diff) - self.diff_data_object.masks = DiffMaskSetObject(ignore_mask_a | ignore_mask_b, - valid_in_both, trouble_pt_mask, outside_epsilon_mask) - -# Upcasts to be used in difference computation to avoid overflow. Currently only unsigned -# ints are upcast. -# FUTURE: handle uint64s as well (there is no int128, so might have to detect overflow) -datatype_upcasts = { - np.uint8: np.int16, - np.uint16: np.int32, - np.uint32: np.int64 - } - -# TODO, rethink how this works -def _select_fill_data(dTypeValue) : - """ - select a fill data value based on the type of data that is being - inspected/changed - """ - - fill_value_to_return = None - - if np.issubdtype(dTypeValue, np.float) or np.issubdtype(dTypeValue, np.complex) : - fill_value_to_return = np.nan - elif np.issubdtype(dTypeValue, np.int) : - fill_value_to_return = np.iinfo(dTypeValue).min - elif np.issubdtype(dTypeValue, np.bool) : - fill_value_to_return = True - elif ((dTypeValue is np.uint8) or - (dTypeValue is np.uint16) or - (dTypeValue is np.uint32) or - (dTypeValue is np.uint64)) : - fill_value_to_return = np.iinfo(dTypeValue).max - - return fill_value_to_return - -def diff(aData, bData, epsilon=0., - (a_missing_value, b_missing_value)=(None, None), - (ignore_mask_a, ignore_mask_b)=(None, None)): - """ - take two arrays of similar size and composition - if an ignoreMask is passed in values in the mask will not be analysed to - form the various return masks and the corresponding spots in the - "difference" return data array will contain fill values (selected - based on data type). - - return difference array filled with fill data where differences aren't valid, - good mask where values are finite in both a and b - trouble mask where missing values or nans don't match or delta > epsilon - (a-notfinite-mask, b-notfinite-mask) - (a-missing-mask, b-missing-mask) - """ - shape = aData.shape - assert(bData.shape==shape) - assert(np.can_cast(aData.dtype, bData.dtype) or np.can_cast(bData.dtype, aData.dtype)) + # analyze our data and get the difference object + self.diff_data_object = DiffInfoObject.analyze(aDataObject, bDataObject, + epsilonValue, epsilonPercent) - # if the ignore masks do not exist, set them to include none of the data - if (ignore_mask_a is None) : - ignore_mask_a = np.zeros(shape,dtype=bool) - if (ignore_mask_b is None) : - ignore_mask_b = np.zeros(shape,dtype=bool) + # Upcasts to be used in difference computation to avoid overflow. Currently only unsigned + # ints are upcast. + # FUTURE: handle uint64s as well (there is no int128, so might have to detect overflow) + DATATYPE_UPCASTS = { + np.uint8: np.int16, + np.uint16: np.int32, + np.uint32: np.int64 + } - # deal with the basic masks - a_not_finite_mask, b_not_finite_mask = ~np.isfinite(aData) & ~ignore_mask_a, ~np.isfinite(bData) & ~ignore_mask_b - a_missing_mask, b_missing_mask = np.zeros(shape,dtype=bool), np.zeros(shape,dtype=bool) - # if we were given missing values, mark where they are in the data - if a_missing_value is not None: - a_missing_mask[aData == a_missing_value] = True - a_missing_mask[ignore_mask_a] = False # don't analyse the ignored values - if b_missing_value is not None: - b_missing_mask[bData == b_missing_value] = True - b_missing_mask[ignore_mask_b] = False # don't analyse the ignored values - - # build the comparison data that includes the "good" values - valid_in_a_mask = ~(a_not_finite_mask | a_missing_mask | ignore_mask_a) - valid_in_b_mask = ~(b_not_finite_mask | b_missing_mask | ignore_mask_b) - valid_in_both = valid_in_a_mask & valid_in_b_mask - - # figure out our shared data type - sharedType = aData.dtype - if (aData.dtype is not bData.dtype) : - sharedType = np.common_type(aData, bData) - - # upcast if needed to avoid overflow in difference operation - if sharedType in datatype_upcasts: - sharedType = datatype_upcasts[sharedType] - - LOG.debug('Shared data type that will be used for diff comparison: ' + str(sharedType)) - - # construct our diff'ed array - raw_diff = np.zeros(shape, dtype=sharedType) #empty_like(aData) - - fill_data_value = _select_fill_data(sharedType) - - LOG.debug('current fill data value: ' + str(fill_data_value)) - - raw_diff[~valid_in_both] = fill_data_value # throw away invalid data - - # compute difference, using shared type in computation - raw_diff[valid_in_both] = bData[valid_in_both].astype(sharedType) - aData[valid_in_both].astype(sharedType) + @staticmethod + def _get_shared_type_and_fill_value(data1, data2, fill1=None, fill2=None) : + """ + Figure out a shared type that can be used when adding or subtracting + the two data sets given (accounting for possible overflow) + Also returns a fill value that can be used. + """ + + # figure out the shared type + type_to_return = data1.dtype + if data1.dtype is not data2.dtype: + type_to_return = np.common_type(data1, data2) + + # upcast the type if we need to + if type_to_return in DiffInfoObject.DATATYPE_UPCASTS : + type_to_return = DiffInfoObject.DATATYPE_UPCASTS[type_to_return] + LOG.debug('To prevent overflow, difference data will be upcast from (' + + str(data1.dtype) + '/' + str(data2.dtype) + ') to: ' + str(type_to_return)) + + # figure out the fill value + fill_value_to_return = None + + # if we're looking at float or complex data, use a nan + if (np.issubdtype(type_to_return, np.float) or + np.issubdtype(type_to_return, np.complex)) : + fill_value_to_return = np.nan - # the valid data which is too different between the two sets according to the given epsilon - outside_epsilon_mask = (abs(raw_diff) > epsilon) & valid_in_both - # trouble points = mismatched nans, mismatched missing-values, differences that are too large - trouble_pt_mask = (a_not_finite_mask ^ b_not_finite_mask) | (a_missing_mask ^ b_missing_mask) | outside_epsilon_mask + # if we're looking at int data, use the minimum value + elif np.issubdtype(type_to_return, np.int) : + fill_value_to_return = np.iinfo(type_to_return).min + + # if we're looking at unsigned data, use the maximum value + elif ((type_to_return is np.uint8) or + (type_to_return is np.uint16) or + (type_to_return is np.uint32) or + (type_to_return is np.uint64)) : + fill_value_to_return = np.iinfo(type_to_return).max + + return type_to_return, fill_value_to_return - return raw_diff, valid_in_both, (valid_in_a_mask, valid_in_b_mask), trouble_pt_mask, outside_epsilon_mask, \ - (a_not_finite_mask, b_not_finite_mask), (a_missing_mask, b_missing_mask), (ignore_mask_a, ignore_mask_b) + @staticmethod + def analyze(aDataObject, bDataObject, + epsilonValue=0.0, epsilonPercent=None): + """ + analyze the differences between the two data sets + updates the two data objects with additional masks + and returns data object containing diff data and masks + """ + shape = aDataObject.data.shape + assert(bDataObject.data.shape == shape) + assert(np.can_cast(aDataObject.data.dtype, bDataObject.data.dtype) or + np.can_cast(bDataObject.data.dtype, aDataObject.data.dtype)) + + # do some basic analysis on the individual data sets + aDataObject.self_analysis() + bDataObject.self_analysis() + + # where is the shared valid data? + valid_in_both = aDataObject.masks.valid_mask & bDataObject.masks.valid_mask + ignore_in_both = aDataObject.masks.ignore_mask | bDataObject.masks.ignore_mask + + # get our shared data type and fill value + sharedType, fill_data_value = DiffInfoObject._get_shared_type_and_fill_value(aDataObject.data, + bDataObject.data, + aDataObject.fill_value, + bDataObject.fill_value) + + # construct our diff'ed data set + raw_diff = np.zeros(shape, dtype=sharedType) + raw_diff[~valid_in_both] = fill_data_value # throw away invalid data + # compute difference, using shared type in computation + raw_diff[valid_in_both] = bDataObject.data[valid_in_both].astype(sharedType) - \ + aDataObject.data[valid_in_both].astype(sharedType) + + # the valid data which is too different between the two sets according to the given epsilon + outside_epsilon_mask = (abs(raw_diff) > epsilonValue) & valid_in_both + # trouble points = mismatched nans, mismatched missing-values, differences that are too large + trouble_pt_mask = ( (aDataObject.masks.non_finite_mask ^ bDataObject.masks.non_finite_mask) | + (aDataObject.masks.missing_mask ^ bDataObject.masks.missing_mask) | + outside_epsilon_mask ) + + # make our diff data object + diff_data_object = DataObject(raw_diff, fillValue=fill_data_value) + diff_data_object.masks = DiffMaskSetObject(ignore_in_both, valid_in_both, + trouble_pt_mask, outside_epsilon_mask) + + return diff_data_object if __name__=='__main__': import doctest diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py index 71f232edba6faabe312aa7bee103331d6c10a99b..12ed1bb89ae591b11e5ac1c38dd10a4b19be2556 100644 --- a/pyglance/glance/plot.py +++ b/pyglance/glance/plot.py @@ -249,12 +249,7 @@ def plot_and_save_comparison_figures (aData, bData, bMissingMask = diffInfo.b_data_object.masks.missing_mask spaciallyInvalidMaskA = diffInfo.a_data_object.masks.ignore_mask spaciallyInvalidMaskB = diffInfo.b_data_object.masks.ignore_mask - """ - rawDiffData, goodMask, (goodInAMask, goodInBMask), troubleMask, outsideEpsilonMask, \ - (aNotFiniteMask, bNotFiniteMask), (aMissingMask, bMissingMask), \ - (spaciallyInvalidMaskA, spaciallyInvalidMaskB) = delta.diff(aData, bData, epsilon, (missingValue, missingValueAltInB), - (spaciallyInvalidMaskA, spaciallyInvalidMaskB)) - """ + absDiffData = np.abs(rawDiffData) # we also want to show the distance between our two, not just which one's bigger/smaller # from this point on, we will be forking to create child processes so we can parallelize our image and diff --git a/pyglance/glance/stats.py b/pyglance/glance/stats.py index 48ef9ca8c9b31f872a7a00563c67d1b1ae768a5a..f5df9f5cee2fd559a50807119cc072a0ec6400d2 100644 --- a/pyglance/glance/stats.py +++ b/pyglance/glance/stats.py @@ -36,13 +36,6 @@ def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None), bmis = diffInfo.b_data_object.masks.missing_mask ignoreInAMask = diffInfo.a_data_object.masks.ignore_mask ignoreInBMask = diffInfo.b_data_object.masks.ignore_mask - """ - diffData, finite_mask, (finite_a_mask, finite_b_mask), \ - trouble, outside_epsilon, (anfin, bnfin), \ - (amis, bmis), (ignoreInAMask, ignoreInBMask) = nfo = delta.diff(a, b, epsilon, - (a_missing_value, b_missing_value), - (ignoreInAMask, ignoreInBMask)) - """ general_stats = _get_general_data_stats(a, b, a_missing_value, b_missing_value, epsilon, ignoreInAMask, ignoreInBMask, ~finite_a_mask, ~finite_b_mask)