From 47f236d6abd39d2b8e85c7f4e7d789f7e2500d6e Mon Sep 17 00:00:00 2001
From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01>
Date: Sat, 19 Dec 2009 00:38:15 +0000
Subject: [PATCH] initial draft of colocation code, not hooked up, still buggy
 and needing testing

git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@96 8a9318a1-56ba-4d59-b755-99d26321be01
---
 pyglance/glance/compare.py |  29 ++++++-
 pyglance/glance/delta.py   | 173 ++++++++++++++++++++++++++++++++++++-
 pyglance/glance/figures.py |   3 -
 3 files changed, 198 insertions(+), 7 deletions(-)

diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py
index 5b1424e..7617ddb 100644
--- a/pyglance/glance/compare.py
+++ b/pyglance/glance/compare.py
@@ -858,6 +858,33 @@ def reportGen_library_call (a_path, b_path, var_list=[ ],
         # handle vector data
         isVectorData = False # TODO actually figure out if we have vector data from user inputted settings
         
+        # TODO This if is for testing data colocation, this feature is not yet functional
+        if False :
+            (aData, bData, newLongitude, newLatitude), \
+            (aUnmatchedData,             unmatchedALongitude, unmatchedALatitude), \
+            (bUnmatchedData,             unmatchedBLongitude, unmatchedBLatitude) = \
+                    delta.colocate_matching_points_within_epsilon((aData, lon_lat_data['a']['lon'], lon_lat_data['a']['lat']),
+                                                                  (bData, lon_lat_data['b']['lon'], lon_lat_data['b']['lat']),
+                                                                  0.03,
+                                                                  invalidAMask=lon_lat_data['a']['inv_mask'],
+                                                                  invalidBMask=lon_lat_data['b']['inv_mask'])
+            lon_lat_data['a'] = {
+                                 'lon': newLongitude,
+                                 'lat': newLatitude,
+                                 'inv_mask': zeros(aData.shape, dtype=bool)
+                                 }
+            lon_lat_data['b'] = {
+                                 'lon': newLongitude,
+                                 'lat': newLatitude,
+                                 'inv_mask': zeros(aData.shape, dtype=bool)
+                                 }
+            lon_lat_data['common'] = {
+                                 'lon': newLongitude,
+                                 'lat': newLatitude,
+                                 'inv_mask': zeros(aData.shape, dtype=bool)
+                                 }
+            good_shape_from_lon_lat = newLatitude.shape
+        
         # check if this data can be displayed but
         # don't compare lon/lat sizes if we won't be plotting
         if ( (aData.shape == bData.shape) 
@@ -918,8 +945,6 @@ def reportGen_library_call (a_path, b_path, var_list=[ ],
                 if (aData.shape == bData.shape) :
                     plotFunctionGenerationObjects.append(plotcreate.BasicComparisonPlotsFunctionFactory())
                 
-                # TODO, need a way to match data if it is not the same shape
-                
                 # if it's vector data with longitude and latitude, quiver plot it on the Earth
                 if isVectorData and (not do_not_test_with_lon_lat) :
                     plotFunctionGenerationObjects.append(plotcreate.MappedQuiverPlotFunctionFactory())
diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py
index aac25e4..e4e254e 100644
--- a/pyglance/glance/delta.py
+++ b/pyglance/glance/delta.py
@@ -468,7 +468,13 @@ def _get_numerical_data_stats(a, b, diff_data,  data_is_finite_mask,
     return comparison
 
 # get the min, ignoring the stuff in mask
-def min_with_mask(data, mask) :
+def min_with_mask(data, mask=None) :
+    
+    if (mask is None) and isinstance(data, np.ma.MaskedArray) :
+        mask = np.ma.getmaskarray(data)
+    if mask is None :
+        mask = np.zeros(data.shape, dtype=bool)
+    
     temp = data[~mask]
     toReturn = None
     if len(temp) > 0 :
@@ -476,7 +482,13 @@ def min_with_mask(data, mask) :
     return toReturn
 
 # get the max, ignoring the stuff in mask
-def max_with_mask(data, mask) :
+def max_with_mask(data, mask=None) :
+    
+    if (mask is None) and isinstance(data, np.ma.MaskedArray) :
+        mask = np.ma.getmaskarray(data)
+    if mask is None :
+        mask = np.zeros(data.shape, dtype=bool)
+    
     temp = data[~mask]
     toReturn = None
     if len(temp) > 0 :
@@ -517,6 +529,163 @@ def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None),
     
     return out
 
+def colocate_matching_points_within_epsilon((aData, alongitude, alatitude),
+                                            (bData, blongitude, blatitude),
+                                            lonlatEpsilon,
+                                            invalidAMask=None, invalidBMask=None):
+    """
+    match data points together based on their longitude and latitude values
+    to match points must be within lonlatEpsilon degrees in both longitude and latitude
+    
+    if the longitude and latitude variables contain invalid data they should represent this by
+    being masked arrays that mask out the invalid data
+    
+    Note: the return will contain all pairs of points that match, this means an individual a or b
+    data point may be repeated if it matches multiple points within the lonlatEpsilon provided
+    
+    Warning: This algorithm will fail to find all matching points if the lonlatEpsilon is set to a
+    value greater than or equal to 1.0 degrees. This is related to the bin size used for searching
+    theoretically the bin size could be corrected to scale with the lonlatEpsilon in the future. TODO
+    """
+    LOG.debug("Preparing to colocate data using longitude and latitude (acceptable epsilon: " + str(lonlatEpsilon) + " degrees)")
+    LOG.debug("size of aData: " + str(aData.shape))
+    LOG.debug("size of bData: " + str(bData.shape))
+    
+    # make sure our invalid masks exist
+    if invalidAMask is None :
+        invalidAMask = np.zeros(aData.shape, dtype=bool)
+    if invalidBMask is None :
+        invalidBMask = np.zeros(bData.shape, dtype=bool)
+    
+    # construct the full invalid mask
+    if isinstance(alatitude,  ma.MaskedArray) :
+        invalidAMask = invalidAMask | np.ma.getmaskarray(alatitude)
+    if isinstance(alongitude, ma.MaskedArray) :
+        invalidAMask = invalidAMask | np.ma.getmaskarray(alongitude)
+    if isinstance(blatitude,  ma.MaskedArray) :
+        invalidBMask = invalidBMask | np.ma.getmaskarray(blatitude)
+    if isinstance(blongitude, ma.MaskedArray) :
+        invalidBMask = invalidBMask | np.ma.getmaskarray(blongitude)
+    
+    # select only the valid points
+    aData      =      aData[~invalidAMask]
+    alongitude = alongitude[~invalidAMask]
+    alatitude  =  alatitude[~invalidAMask]
+    bData      =      bData[~invalidBMask]
+    blongitude = blongitude[~invalidBMask]
+    blatitude  =  blatitude[~invalidBMask]
+    
+    # Note: at this point the invalid masks are no longer relevant
+    # all the remaining points are valid data in flat arrays
+    # there is no reason for the lon/lat to remain masked arrays
+    alongitude = np.array(alongitude)
+    alatitude  = np.array(alatitude)
+    blongitude = np.array(blongitude)
+    blatitude  = np.array(blatitude)
+    
+    # find the ranges of the longitude and latitude
+    minLatitude  = min(min(alatitude),  min(blatitude))
+    maxLatitude  = max(max(alatitude),  max(blatitude))
+    minLongitude = min(min(alongitude), min(blongitude))
+    maxLongitude = max(max(alongitude), max(blongitude))
+    
+    # make the bins for the data in longitude/latitude space
+    aBins = { }
+    bBins = { }
+    
+    # loop to put all the aData in the bins
+    for index in range(aData.size) :
+        filingLat = int( alatitude[index])
+        filingLon = int(alongitude[index])
+        
+        if (filingLat, filingLon) not in aBins :
+            aBins[(filingLat, filingLon)] = [ ]
+        
+        aBins[(filingLat, filingLon)].append( (aData[index], alatitude[index], alongitude[index]) )
+    
+    # loop to put all the bData in the bins
+    for index in range(bData.size) :
+        filingLat = int( blatitude[index])
+        filingLon = int(blongitude[index])
+        
+        if (filingLat, filingLon) not in bBins :
+            bBins[(filingLat, filingLon)] = [ ]
+        
+        bBins[(filingLat, filingLon)].append( (bData[index], blatitude[index], blongitude[index]) )
+    
+    # some variables to hold our final data
+    aMatchedData        = [ ]
+    bMatchedData        = [ ]
+    matchedLongitude    = [ ]
+    matchedLatitude     = [ ]
+    numDuplicateMatches = 0
+    aUnmatchedData      = [ ]
+    unmatchedALongitude = [ ]
+    unmatchedALatitude  = [ ]
+    bUnmatchedData      = [ ]
+    unmatchedBLongitude = [ ]
+    unmatchedBLatitude  = [ ]
+    
+    # look in all the aData bins and select point pairs that match within epsilon
+    for binLatitude, binLongitude in aBins.keys() :
+        
+        # figure out the lat/lon of the 9 bins that are "near" this one
+        toSearch = [ ]
+        for latValue in range(binLatitude - 1, binLatitude + 2) :
+            for lonValue in range(binLongitude - 1, binLongitude + 2) :
+                toSearch.append((latValue, lonValue))
+        
+        # for each A pt in this bin
+        for aDataPt, aDataLatPt, aDataLonPt in aBins[(binLatitude, binLongitude)] :
+            haveFoundMatch = False
+            
+            # look through my nearby B bins and find any
+            # "matching" points that fall within epsilon
+            for latValue, lonValue in toSearch :
+                
+                # if there's anything in the B bin
+                if ((latValue, lonValue) in bBins) and (bBins[(latValue, lonValue)] is not None) :
+                    # for each data point in the B bin, check if it matches our current A point
+                    for bDataPt, bDataLatPt, bDataLonPt in bBins[(latValue, lonValue)] :
+                        if (abs(aDataLatPt - bDataLatPt) < lonlatEpsilon) and (abs(aDataLonPt - bDataLonPt) < lonlatEpsilon) :
+                            # track number of duplicates
+                            if haveFoundMatch :
+                                numDuplicateMatches = numDuplicateMatches + 1
+                            haveFoundMatch = True
+                            # put the point on our matched lists
+                            aMatchedData.append(aDataPt)
+                            bMatchedData.append(bDataPt)
+                            matchedLongitude.append((aDataLonPt + bDataLonPt) / 2.0)
+                            matchedLatitude.append((aDataLatPt + bDataLatPt) / 2.0)
+            
+            # if we couldn't find a match for this a point, put it in the list of unmatched A points
+            if not haveFoundMatch :
+                aUnmatchedData.append(aDataPt)
+                unmatchedALatitude.append(aDataLatPt)
+                unmatchedALongitude.append(aDataLonPt)
+    
+    LOG.debug('Found ' + str(len(aMatchedData)) + ' matched data points.')
+    
+    # TODO rebuild the lists of matched and unmatched points
+    # TODO, need to find unmatched B points and duplicately matched B points?
+    
+    # convert our data back into numpy arrays
+    aMatchedData        = np.array(aMatchedData)
+    bMatchedData        = np.array(bMatchedData)
+    matchedLongitude    = np.array(matchedLongitude)
+    matchedLatitude     = np.array(matchedLatitude)
+    aUnmatchedData      = np.array(aUnmatchedData)
+    unmatchedALongitude = np.array(unmatchedALongitude)
+    unmatchedALatitude  = np.array(unmatchedALatitude)
+    bUnmatchedData      = np.array(bUnmatchedData)
+    unmatchedBLongitude = np.array(unmatchedBLongitude)
+    unmatchedBLatitude  = np.array(unmatchedBLatitude)
+    
+    # TODO, should we return the number of duplicates?
+    return (aMatchedData, bMatchedData,    matchedLongitude,    matchedLatitude), \
+           (aUnmatchedData,             unmatchedALongitude, unmatchedALatitude), \
+           (bUnmatchedData,             unmatchedBLongitude, unmatchedBLatitude)
+
 STATISTICS_DOC = {  'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " +
                                "both finite and not, unless otherwise specified",
                     
diff --git a/pyglance/glance/figures.py b/pyglance/glance/figures.py
index 4228719..0a7af9f 100644
--- a/pyglance/glance/figures.py
+++ b/pyglance/glance/figures.py
@@ -143,9 +143,6 @@ def _plot_tag_data_mapped(bMap, tagData, x, y, addExplinationLabel=True) :
             newX = np.array(x[tagData])
             newY = np.array(y[tagData])
             
-            print ("****newX type: " + str(type(newX)))
-            print ("****newY type: " + str(type(newY)))
-            
             # figure out how many bad points there are
             totalNumPoints = x.size # the number of points
             percentBad = (float(numTroublePoints) / float(totalNumPoints)) * 100.0
-- 
GitLab