From bd2ca32f2b88727f884ccf795a80f53c9afde93c Mon Sep 17 00:00:00 2001 From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01> Date: Mon, 11 Jan 2010 23:30:30 +0000 Subject: [PATCH] added code to support data colocation and creation of modified files including colocated data; added support for a filter based on another variable data set (for example filtering winds on QI); quite a bit of refactoring to clean up code and handle longitude/latitude edge cases git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@97 8a9318a1-56ba-4d59-b755-99d26321be01 --- pyglance/glance/compare.py | 387 +++++++++++++++++++++++++++++-------- pyglance/glance/delta.py | 363 ++++++++++++++++++++++++---------- pyglance/glance/filters.py | 26 +++ pyglance/glance/io.py | 108 +++++++++-- 4 files changed, 681 insertions(+), 203 deletions(-) diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index 7617ddb..6d1e007 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -15,6 +15,7 @@ from pprint import pprint, pformat from numpy import * import pkg_resources from pycdf import CDFError +from subprocess import check_call as sh import glance.io as io import glance.delta as delta @@ -81,11 +82,12 @@ def _parse_varnames(names, terms, epsilon=0.0, missing=None): sel = [ ((x,)+_cvt_em(*em)) for x in names for (t,em) in terms if t(x) ] return set(sel) -def _setup_file(fileNameAndPath, prefexText='') : +def _setup_file(fileNameAndPath, prefexText='', allowWrite=False) : ''' open the provided file name/path and extract information on the md5sum and last modification time optional prefext text may be passed in for informational output formatting ''' + # some info to return fileInfo = {'path': fileNameAndPath} @@ -95,20 +97,20 @@ def _setup_file(fileNameAndPath, prefexText='') : return None, fileInfo # open the file - LOG.info(prefexText + "opening " + fileNameAndPath) + LOG.info(prefexText + " opening " + fileNameAndPath) fileNameAndPath = os.path.abspath(os.path.expanduser(fileNameAndPath)) LOG.debug("User provided path after normalization and user expansion: " + fileNameAndPath) - fileObject = io.open(fileNameAndPath) + fileObject = io.open(fileNameAndPath, allowWrite=allowWrite) # get the file md5sum tempSubProcess = subprocess.Popen("md5sum \'" + fileNameAndPath + "\'", shell=True, stdout=subprocess.PIPE) fileInfo['md5sum'] = tempSubProcess.communicate()[0].split()[0] - LOG.info(prefexText + "file md5sum: " + str(fileInfo['md5sum'])) + LOG.info(prefexText + " file md5sum: " + str(fileInfo['md5sum'])) # get the last modified stamp statsForFile = os.stat(fileNameAndPath) fileInfo['lastModifiedTime'] = datetime.datetime.fromtimestamp(statsForFile.st_mtime).ctime() # should time zone be forced? 
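# An aside on the md5sum shell-out above: hashing the file in-process with
# hashlib avoids shell quoting of unusual file names and does not depend on an
# md5sum binary being installed. A minimal sketch, assuming only the standard
# library; the helper name _file_md5sum is hypothetical, not something this
# patch adds:
import hashlib

def _file_md5sum(fileNameAndPath, blockSize=1024 * 1024):
    """compute the md5 checksum of a file without shelling out"""
    md5 = hashlib.md5()
    fileToHash = open(fileNameAndPath, 'rb')
    try:
        # hash the file one block at a time so large files don't fill memory
        block = fileToHash.read(blockSize)
        while block:
            md5.update(block)
            block = fileToHash.read(blockSize)
    finally:
        fileToHash.close()
    return md5.hexdigest()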
- LOG.info (prefexText + "file was last modified: " + fileInfo['lastModifiedTime']) + LOG.info (prefexText + " file was last modified: " + fileInfo['lastModifiedTime']) return fileObject, fileInfo @@ -275,10 +277,15 @@ def _load_config_or_options(aPath, bPath, optionsSet, requestedVars = [ ]) : # set up the paths, they can only come from the command line paths = {} - paths['a'] = aPath - paths['b'] = bPath + paths['a'] = aPath + paths['b'] = bPath paths['out'] = optionsSet['outputpath'] + # the colocation selection can only come from the command line options + # TODO since this is really only coming from the user's selection of the call, + # this is ok for the moment, may want to reconsider later + runInfo['doColocate'] = ('doColocate' in optionsSet) and (optionsSet['doColocate']) + # check to see if the user wants to use a config file and if the path exists requestedConfigFile = optionsSet['configFile'] usedConfigFile = False @@ -306,8 +313,8 @@ def _load_config_or_options(aPath, bPath, optionsSet, requestedVars = [ ]) : # this is an exception, since it is not advertised to the user we don't expect it to be in the file # (at least not at the moment, it could be added later and if they did happen to put it in the # config file, it would override this line) - runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly'] - runInfo['noLonLatVars'] = optionsSet['noLonLatVars'] + runInfo['shouldIncludeReport'] = not optionsSet['imagesOnly'] if 'imagesOnly' in optionsSet else False + runInfo['noLonLatVars'] = optionsSet['noLonLatVars'] if 'noLonLatVars' in optionsSet else False # get everything from the config file runInfo.update(glanceRunConfig.settings) @@ -534,7 +541,8 @@ def _compare_spatial_invalidity(invalid_in_a_mask, invalid_in_b_mask, spatial_in return invalid_in_common_mask, spatial_info, longitude_common, latitude_common -def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output_path, should_make_images=False) : +def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output_path, + should_make_images=False, should_check_equality=True) : """ Manage loading and comparing longitude and latitude information for two files @@ -587,27 +595,32 @@ def _handle_lon_lat_info (lon_lat_settings, a_file_object, b_file_object, output _get_and_analyze_lon_lat (file_for_b_lon_lat, b_latitude_name, b_longitude_name, lon_lat_settings['data_filter_function_lat_in_b'], lon_lat_settings['data_filter_function_lon_in_b']) - # test the "valid" values in our lon/lat - moreSpatialInfo = _check_lon_lat_equality(longitude_a, latitude_a, longitude_b, latitude_b, - spaciallyInvalidMaskA, spaciallyInvalidMaskB, - lon_lat_settings['lon_lat_epsilon'], - should_make_images, output_path) - # if we got the worst type of error result from the comparison this data is too dissimilar to continue - if moreSpatialInfo is None : - error_msg = ("Unable to reconcile sizes of longitude and latitude for variables " - + str(lon_lat_settings['longitude']) + str(longitude_a.shape) + "/" - + str(lon_lat_settings['latitude']) + str(latitude_a.shape) + " in file A and variables " - + str(b_longitude_name) + str(longitude_b.shape) + "/" - + str(b_latitude_name) + str(latitude_b.shape) + " in file B. 
Aborting attempt to compare files.") - return { }, { }, error_msg # things have gone wrong - # update our existing spatial information - spatialInfo.update(moreSpatialInfo) - - # compare our spatially invalid info to see if the two files have invalid longitudes and latitudes in the same places - spaciallyInvalidMask, spatialInfo, longitude_common, latitude_common = \ - _compare_spatial_invalidity(spaciallyInvalidMaskA, spaciallyInvalidMaskB, spatialInfo, - longitude_a, longitude_b, latitude_a, latitude_b, - should_make_images, output_path) + # if we need to, test the level of equality of the "valid" values in our lon/lat + if should_check_equality : + moreSpatialInfo = _check_lon_lat_equality(longitude_a, latitude_a, longitude_b, latitude_b, + spaciallyInvalidMaskA, spaciallyInvalidMaskB, + lon_lat_settings['lon_lat_epsilon'], + should_make_images, output_path) + # if we got the worst type of error result from the comparison this data is too dissimilar to continue + if moreSpatialInfo is None : + error_msg = ("Unable to reconcile sizes of longitude and latitude for variables " + + str(lon_lat_settings['longitude']) + str(longitude_a.shape) + "/" + + str(lon_lat_settings['latitude']) + str(latitude_a.shape) + " in file A and variables " + + str(b_longitude_name) + str(longitude_b.shape) + "/" + + str(b_latitude_name) + str(latitude_b.shape) + " in file B. Aborting attempt to compare files.") + return { }, { }, error_msg # things have gone wrong + # update our existing spatial information + spatialInfo.update(moreSpatialInfo) + + # compare our spatially invalid info to see if the two files have invalid longitudes and latitudes in the same places + spaciallyInvalidMask, spatialInfo, longitude_common, latitude_common = \ + _compare_spatial_invalidity(spaciallyInvalidMaskA, spaciallyInvalidMaskB, spatialInfo, + longitude_a, longitude_b, latitude_a, latitude_b, + should_make_images, output_path) + else: + spaciallyInvalidMask = None + longitude_common = None + latitude_common = None return {'a': {"lon": longitude_a, "lat": latitude_a, "inv_mask": spaciallyInvalidMaskA}, 'b': {"lon": longitude_b, "lat": latitude_b, "inv_mask": spaciallyInvalidMaskB}, @@ -734,6 +747,207 @@ def _get_name_info_for_variable(original_display_name, variable_run_info) : return technical_name, b_variable_technical_name, explanation_name +def _load_variable_data(fileObject, variableNameInFile, + dataFilter=None, + variableToFilterOn=None, + variableBasedFilter=None, + fileDescriptionForDisplay="file") : + """ + load data for a variable from a file + optionally filter the variable data based on a data filter or another variable + + dataFilter must be in the form of (lambda data: some manipulation returning the new data) + variableBasedFilter must be in the form of (lambda data, filterData: some manipulation returning the new data)) + """ + + # get the data for the variable + LOG.debug("loading basic data for variable " + variableNameInFile + " from " + fileDescriptionForDisplay) + variableData = fileObject[variableNameInFile] + + # apply the basic filter if there is one + if dataFilter is not None : + LOG.debug ("applying filter function to data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile) + variableData = dataFilter(variableData) + + # if we've got another variable to filter on, do that + if (variableToFilterOn is not None) and (variableBasedFilter is not None) : + LOG.debug ("filtering data from " + fileDescriptionForDisplay + " for variable " + variableNameInFile + + " based on additional 
data from variable " + variableToFilterOn) + dataToFilterOn = fileObject[variableToFilterOn] + variableData = variableBasedFilter(variableData, dataToFilterOn) + + return variableData + +def _uri_needs_rsync(uri_to_check) : + """ + check if the uri requires an rsync in order to access the data + this will return some false positives if you phrase local uri's with the machine name + for ex. you are on the machine "lotus" and you use the path "rsync:://lotus/data/" + """ + return not os.path.exists(uri_to_check) + +def rsync_or_copy_files (list_of_files, target_directory='.') : + """ + If the files in the list are remote, rsync them, otherwise, just copy + them to the target directory + """ + for file_uri in list_of_files : + if _uri_needs_rsync(file_uri) : + cmd = ['rsync', '-Cuav', file_uri, os.path.join(target_directory, os.path.split(file_uri)[1])] + else : + cmd = ['cp', os.path.abspath(file_uri), os.path.join(target_directory, os.path.split(file_uri)[1])] + LOG.debug('running ' + ' '.join(cmd)) + sh(cmd) + +def colocateToFile_library_call(a_path, b_path, var_list=[ ], + options_set={ }, + # todo, this doesn't yet do anything + do_document=False, + # todo, the output channel does nothing at the moment + output_channel=sys.stdout) : + """ + this method handles the actual work of the colocateData command line tool + and can be used as a library routine. + + TODO, properly document the options + """ + + # load the user settings from either the command line or a user defined config file + pathsTemp, runInfo, defaultValues, requestedNames, usedConfigFile = _load_config_or_options(a_path, b_path, + options_set, + requestedVars = var_list) + + # deal with the input and output files + if not (os.path.isdir(pathsTemp['out'])) : + LOG.info("Specified output directory (" + pathsTemp['out'] + ") does not exist.") + LOG.info("Creating output directory.") + os.makedirs(pathsTemp['out']) + + # make copies of the input files for colocation + rsync_or_copy_files ([pathsTemp['a'], pathsTemp['b']], target_directory=pathsTemp['out']) + pathsTemp['a'] = os.path.join(pathsTemp['out'], os.path.split(pathsTemp['a'])[1]) + pathsTemp['b'] = os.path.join(pathsTemp['out'], os.path.split(pathsTemp['b'])[1]) + + # open the files + LOG.info("Processing File A:") + aFile, _ = _setup_file(pathsTemp['a'], "\t", allowWrite = True) + if aFile is None: + LOG.warn("Unable to continue with comparison because file a (" + pathsTemp['a'] + ") could not be opened.") + sys.exit(1) + LOG.info("Processing File B:") + bFile, _ = _setup_file(pathsTemp['b'], "\t", allowWrite = True) + if bFile is None: + LOG.warn("Unable to continue with comparison because file b (" + pathsTemp['b'] + ") could not be opened.") + sys.exit(1) + + # get information about the names the user requested + finalNames, nameStats = _resolve_names(aFile, bFile, + defaultValues, + requestedNames, usedConfigFile) + + # return for lon_lat_data variables will be in the form + #{"lon": longitude_data, "lat": latitude_data, "inv_mask": spaciallyInvalidMaskData} + # or { } if there is no lon/lat info + lon_lat_data, _, fatalErrorMsg = _handle_lon_lat_info (runInfo, aFile, bFile, pathsTemp['out'], should_check_equality=False) + if fatalErrorMsg is not None : + LOG.warn(fatalErrorMsg) + sys.exit(1) + + # handle the longitude and latitude colocation + LOG.info("Colocating raw longitude and latitude information") + aColocationInfomation, bColocationInformation, totalNumberOfMatchedPoints = \ + delta.create_colocation_mapping_within_epsilon((lon_lat_data['a']['lon'], 
lon_lat_data['a']['lat']), + (lon_lat_data['b']['lon'], lon_lat_data['b']['lat']), + runInfo['lon_lat_epsilon'], + invalidAMask=lon_lat_data['a']['inv_mask'], + invalidBMask=lon_lat_data['b']['inv_mask']) + (colocatedLongitude, colocatedLatitude, (numMultipleMatchesInA, numMultipleMatchesInB)), \ + (unmatchedALongitude, unmatchedALatitude), \ + (unmatchedBLongitude, unmatchedBLatitude) = \ + delta.create_colocated_lonlat_with_lon_lat_colocation(aColocationInfomation, bColocationInformation, + totalNumberOfMatchedPoints, + lon_lat_data['a']['lon'], lon_lat_data['a']['lat'], + lon_lat_data['b']['lon'], lon_lat_data['b']['lat']) + + # TODO, based on unmatched, issue warnings and record info in the file? + LOG.debug("colocated shape of the longitude: " + str(colocatedLongitude.shape)) + LOG.debug("colocated shape of the latitude: " + str(colocatedLatitude.shape)) + LOG.debug(str(numMultipleMatchesInA) + " lon/lat pairs contain A points used for multiple matches.") + LOG.debug(str(numMultipleMatchesInB) + " lon/lat pairs contain B points used for multiple matches.") + LOG.debug(str(len(unmatchedALatitude)) + " A lon/lat points could not be matched.") + LOG.debug(str(len(unmatchedBLatitude)) + " B lon/lat points could not be matched.") + + # go through each of the possible variables in our files + # and do our colocation for whichever ones we can + for displayName in finalNames: + + # pull out the information for this variable analysis run + varRunInfo = finalNames[displayName].copy() + + # get the various names + technical_name, b_variable_technical_name, \ + explanationName = _get_name_info_for_variable(displayName, varRunInfo) + + print('analyzing: ' + explanationName + ')') + + # load the variable data + aData = _load_variable_data(aFile, technical_name, + dataFilter = varRunInfo['data_filter_function_a'] if 'data_filter_function_a' in varRunInfo else None, + variableToFilterOn = varRunInfo['variable_to_filter_on_a'] if 'variable_to_filter_on_a' in varRunInfo else None, + variableBasedFilter = varRunInfo['variable_based_filter_a'] if 'variable_based_filter_a' in varRunInfo else None, + fileDescriptionForDisplay = "file A") + bData = _load_variable_data(bFile, b_variable_technical_name, + dataFilter = varRunInfo['data_filter_function_b'] if 'data_filter_function_b' in varRunInfo else None, + variableToFilterOn = varRunInfo['variable_to_filter_on_b'] if 'variable_to_filter_on_b' in varRunInfo else None, + variableBasedFilter = varRunInfo['variable_based_filter_b'] if 'variable_based_filter_b' in varRunInfo else None, + fileDescriptionForDisplay = "file B") + + # pre-check if this data should be compared to the longitude and latitude + do_not_test_with_lon_lat = (len(lon_lat_data.keys()) <= 0) + + # colocate the data for this variable + if (not do_not_test_with_lon_lat) and runInfo['doColocate'] : + + # match up our points in A and B + (aData, bData, (numberOfMultipleMatchesInA, numberOfMultipleMatchesInB)), \ + (aUnmatchedData, unmatchedALongitude, unmatchedALatitude), \ + (bUnmatchedData, unmatchedBLongitude, unmatchedBLatitude) = \ + delta.create_colocated_data_with_lon_lat_colocation(aColocationInfomation, bColocationInformation, + colocatedLongitude, colocatedLatitude, + aData, bData, + missingData=varRunInfo['missing_value'], + altMissingDataInB=varRunInfo['missing_value_alt_in_b'], + # TODO, should missing data be considered? 
+ invalidAMask=lon_lat_data['a']['inv_mask'], + invalidBMask=lon_lat_data['b']['inv_mask']) + + LOG.debug(str(numberOfMultipleMatchesInA) + " data pairs contain A data points used for multiple matches.") + LOG.debug(str(numberOfMultipleMatchesInB) + " data pairs contain B data points used for multiple matches.") + LOG.debug(str(len(aUnmatchedData)) + " A data points could not be matched.") + LOG.debug(str(len(bUnmatchedData)) + " B data points could not be matched.") + + # save the colocated data information in the output files + aFile.create_new_variable(technical_name + '-colocated', # TODO, how should this suffix be handled? + missingvalue = varRunInfo['missing'] if 'missing' in varRunInfo else None, + data = aData, + variabletocopyattributesfrom = technical_name) + bFile.create_new_variable(b_variable_technical_name + '-colocated', # TODO, how should this suffix be handled? + missingvalue = varRunInfo['missing_value_alt_in_b'] if 'missing_value_alt_in_b' in varRunInfo else None, + data = bData, + variabletocopyattributesfrom = b_variable_technical_name) + # TODO, save the unmatched data and info on multiple matches + + else : + LOG.debug(explanationName + " was not selected for colocation and will be ignored.") + + # the end of the loop to examine all the variables + + # we're done with the files, so close them up + aFile.close() + bFile.close() + + return + def reportGen_library_call (a_path, b_path, var_list=[ ], options_set={ }, # todo, this doesn't yet do anything @@ -793,10 +1007,7 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], defaultValues, requestedNames, usedConfigFile) - # if there is longitude and latitude info, handle the longitude and latitude - #if 'lon_lat' in runInfo : TODO, how can we handle cases where lon/lat is meaningless? 
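# A hedged illustration of the variable-based filter hooks consumed by
# _load_variable_data above: in a glance run configuration, a variable entry
# might filter wind data on a quality index using the
# filter_based_on_additional_data_set_min_max_bounds filter this patch adds to
# filters.py. The dict name, the variable names (quality_index), and the
# thresholds below are hypothetical, not taken from the patch:
import glance.filters as filters

exampleVariableSettings = {
    'variable_to_filter_on_a': 'quality_index',
    'variable_based_filter_a': (lambda data, filterData:
        filters.filter_based_on_additional_data_set_min_max_bounds(
            data, filterData,
            missingValue=-999.0,    # value used to mark filtered-out points
            minOkFilterValue=0.5))  # keep only points with quality index >= 0.5
}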
- - print("output dir: " + str(pathsTemp['out'])) + LOG.debug("output dir: " + str(pathsTemp['out'])) # return for lon_lat_data variables will be in the form #{"lon": longitude_data, "lat": latitude_data, "inv_mask": spaciallyInvalidMaskData} @@ -834,17 +1045,17 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], print('analyzing: ' + explanationName + ')') - # get the data for the variable - aData = aFile[technical_name] - bData = bFile[b_variable_technical_name] - - # apply data filter functions if needed - if ('data_filter_function_a' in varRunInfo) : - aData = varRunInfo['data_filter_function_a'](aData) - LOG.debug ("filter function was applied to file A data for variable: " + explanationName) - if ('data_filter_function_b' in varRunInfo) : - bData = varRunInfo['data_filter_function_b'](bData) - LOG.debug ("filter function was applied to file B data for variable: " + explanationName) + # load the variable data + aData = _load_variable_data(aFile, technical_name, + dataFilter = varRunInfo['data_filter_function_a'] if 'data_filter_function_a' in varRunInfo else None, + variableToFilterOn = varRunInfo['variable_to_filter_on_a'] if 'variable_to_filter_on_a' in varRunInfo else None, + variableBasedFilter = varRunInfo['variable_based_filter_a'] if 'variable_based_filter_a' in varRunInfo else None, + fileDescriptionForDisplay = "file A") + bData = _load_variable_data(bFile, b_variable_technical_name, + dataFilter = varRunInfo['data_filter_function_b'] if 'data_filter_function_b' in varRunInfo else None, + variableToFilterOn = varRunInfo['variable_to_filter_on_b'] if 'variable_to_filter_on_b' in varRunInfo else None, + variableBasedFilter = varRunInfo['variable_based_filter_b'] if 'variable_based_filter_b' in varRunInfo else None, + fileDescriptionForDisplay = "file B") # pre-check if this data should be plotted and if it should be compared to the longitude and latitude include_images_for_this_variable = ((not('shouldIncludeImages' in runInfo)) or (runInfo['shouldIncludeImages'])) @@ -852,39 +1063,9 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], include_images_for_this_variable = varRunInfo['shouldIncludeImages'] do_not_test_with_lon_lat = (not include_images_for_this_variable) or (len(lon_lat_data.keys()) <= 0) - LOG.debug ("do_not_test_with_lon_lat = " + str(do_not_test_with_lon_lat)) - LOG.debug ("include_images_for_this_variable = " + str(include_images_for_this_variable)) - # handle vector data isVectorData = False # TODO actually figure out if we have vector data from user inputted settings - # TODO This if is for testing data colocation, this feature is not yet functional - if False : - (aData, bData, newLongitude, newLatitude), \ - (aUnmatchedData, unmatchedALongitude, unmatchedALatitude), \ - (bUnmatchedData, unmatchedBLongitude, unmatchedBLatitude) = \ - delta.colocate_matching_points_within_epsilon((aData, lon_lat_data['a']['lon'], lon_lat_data['a']['lat']), - (bData, lon_lat_data['b']['lon'], lon_lat_data['b']['lat']), - 0.03, - invalidAMask=lon_lat_data['a']['inv_mask'], - invalidBMask=lon_lat_data['b']['inv_mask']) - lon_lat_data['a'] = { - 'lon': newLongitude, - 'lat': newLatitude, - 'inv_mask': zeros(aData.shape, dtype=bool) - } - lon_lat_data['b'] = { - 'lon': newLongitude, - 'lat': newLatitude, - 'inv_mask': zeros(aData.shape, dtype=bool) - } - lon_lat_data['common'] = { - 'lon': newLongitude, - 'lat': newLatitude, - 'inv_mask': zeros(aData.shape, dtype=bool) - } - good_shape_from_lon_lat = newLatitude.shape - # check if this data can be displayed but # 
don't compare lon/lat sizes if we won't be plotting if ( (aData.shape == bData.shape) @@ -950,7 +1131,7 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], plotFunctionGenerationObjects.append(plotcreate.MappedQuiverPlotFunctionFactory()) # if the data is one dimensional we can plot it as lines - elif (len(aData.shape) is 1) : + elif (len(aData.shape) is 1) : plotFunctionGenerationObjects.append(plotcreate.LinePlotsFunctionFactory()) # if the data is 2D we have some options based on the type of data @@ -1014,8 +1195,9 @@ def reportGen_library_call (a_path, b_path, var_list=[ ], if do_not_test_with_lon_lat : message = message + '.' else : - message = (message + ' or the data may not match the shape of the selected longitude ' + - str(good_shape_from_lon_lat) + ' and ' + 'latitude ' + str(good_shape_from_lon_lat) + ' variables.') + message = (message + ' or the data may not match the shape of the selected ' + + 'longitude ' + str(good_shape_from_lon_lat) + ' and ' + + 'latitude ' + str(good_shape_from_lon_lat) + ' variables.') LOG.warn(message) # the end of the loop to examine all the variables @@ -1340,6 +1522,53 @@ python -m glance reportGen_library_call(args[0], args[1], args[2:], tempOptions) + def colocateData(*args) : + """colocate data in two files + + This option colocates data in the two given input files and saves it to separate output files. + Data will be colocated based on its corresponding longitude and latitude. Multiple matches may be + made between a data point in file A and those in file B if they are within the longitude/latitude epsilon. + Points from each file that could not be matched and the number of duplicate matches will also be + recorded in the output file. + + The user may also use the notation variable_name::missing_value to specify the missing_value which indicates + missing data. If no missing value is given, glance will attempt to load a missing value from the input file. + If there is no missing value defined for that variable in the file, no missing value will be analyzed. + Missing value data points will not be considered for colocation. + + Data which corresponds to longitude or latitude values which fall outside the earth (outside the normally + accepted valid ranges) will also be considered invalid and will not be considered for colocation. + + The longitude and latitude variables may be specified with --longitude and --latitude + If no longitude or latitude are specified the pixel_latitude and pixel_longitude variables will be used. + The longitude and latitude epsilon may be specified with --llepsilon + If no longitude/latitude epsilon is given the value of 0.0 (degrees) will be used + + The output data files generated by this option will appear in the selected output directory, or the current + directory if no out put directory is selected. The output files will be named originalFileName-colocation.nc + (replacing "originalFileName" with the names of your input files). 
+        
+        Examples:
+         python -m glance.compare colocateData A.hdf B.hdf variable_name_1 variable_name_2 variable_name_3::missing3
+         python -m glance.compare colocateData --outputpath=/path/where/output/will/be/placed/ A.nc B.nc
+         python -m glance.compare colocateData --longitude=lon_variable_name --latitude=lat_variable_name A.hdf B.hdf variable_name
+         python -m glance.compare colocateData --llepsilon=0.0001 A.nc B.hdf
+        """
+        
+        tempOptions = { }
+        tempOptions['outputpath']    = options.outputpath
+        tempOptions['configFile']    = options.configFile
+        tempOptions['noLonLatVars']  = options.noLonLatVars
+        tempOptions['latitudeVar']   = options.latitudeVar
+        tempOptions['longitudeVar']  = options.longitudeVar
+        tempOptions['lonlatepsilon'] = options.lonlatepsilon
+        tempOptions['epsilon']       = options.epsilon
+        tempOptions['missing']       = options.missing
+        
+        tempOptions['doColocate'] = True
+        
+        colocateToFile_library_call(args[0], args[1], args[2:], tempOptions)
+    
    """
    # This was used to modify files for testing and should not be uncommented
    # unless you intend to use it only temporarily for testing purposes
diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py
index e4e254e..fc14e56 100644
--- a/pyglance/glance/delta.py
+++ b/pyglance/glance/delta.py
@@ -529,102 +529,97 @@ def summarize(a, b, epsilon=0., (a_missing_value, b_missing_value)=(None,None),
     return out
 
-def colocate_matching_points_within_epsilon((aData, alongitude, alatitude),
-                                            (bData, blongitude, blatitude),
-                                            lonlatEpsilon,
-                                            invalidAMask=None, invalidBMask=None):
+def create_colocation_mapping_within_epsilon((alongitude, alatitude),
+                                             (blongitude, blatitude),
+                                             lonlatEpsilon,
+                                             invalidAMask=None, invalidBMask=None):
     """
-    match data points together based on their longitude and latitude values
+    match points together based on their longitude and latitude values
     to match, points must be within lonlatEpsilon degrees in both longitude and latitude
     
-    if the longitude and latitude variables contain invalid data they should represent this by
-    being masked arrays that mask out the invalid data
+    if the longitude and latitude variables contain invalid data the invalidAMask and
+    invalidBMask should be passed with the appropriate masking to remove the invalid values
     
-    Note: the return will contain all pairs of points that match, this means an individual a or b
-    data point may be repeated if it matches multiple points within the lonlatEpsilon provided
+    the return will be in the form of two dictionaries of points, one from a and one from b,
+    indexed on the index number in the A or B data where they can be found. Each entry will
+    consist of a list of:
+    [longitudeValue, latitudeValue, indexNumber, [list of matching indexes in the other set]]
+    
+    Note: the return will include all pairs of points that match,
+    this means an individual a or b point may be repeated if it matches
+    multiple points within the lonlatEpsilon provided
+    
+    Warning: This algorithm will fail to find all matching points if the lonlatEpsilon is set to a
+    value greater than or equal to 1.0 degrees. This is related to the bin size used for searching;
+    theoretically the bin size could be corrected to scale with the lonlatEpsilon in the future. TODO
    """
    
-    LOG.debug("Preparing to colocate data using longitude and latitude (acceptable epsilon: " + str(lonlatEpsilon) + " degrees)")
-    LOG.debug("size of aData: " + str(aData.shape))
-    LOG.debug("size of bData: " + str(bData.shape))
+    assert(alongitude.shape == alatitude.shape)
+    assert(blongitude.shape == blatitude.shape)
+    assert(lonlatEpsilon >= 0.0)
+    
+    LOG.debug("Preparing to colocate longitude and latitude points (acceptable epsilon: " + str(lonlatEpsilon) + " degrees)")
+    LOG.debug("size of A: " + str(alongitude.shape))
+    LOG.debug("size of B: " + str(blongitude.shape))
    
-    # make sure our invalid masks exist
+    # make blank invalid masks if none were passed in
    if invalidAMask is None :
-        invalidAMask = np.zeros(aData.shape, dtype=bool)
+        invalidAMask = np.zeros(alongitude.shape, dtype=bool)
    if invalidBMask is None :
-        invalidBMask = np.zeros(bData.shape, dtype=bool)
-    
-    # construct the full invalid mask
-    if type(alatitude) is ma.array :
-        invalidAMask = invalidAMask | ~alatitude.mask
-    if type(alongitude) is ma.array :
-        invalidAMask = invalidAMask | ~alongitude.mask
-    if type(blatitude) is ma.array :
-        invalidBMask = invalidBMask | ~blatitude.mask
-    if type(blongitude) is ma.array :
-        invalidBMask = invalidBMask | ~blongitude.mask
-    
-    # select only the valid points
-    aData = aData[~invalidAMask]
-    alongitude = alongitude[~invalidAMask]
-    alatitude = alatitude[~invalidAMask]
-    bData = bData[~invalidBMask]
-    blongitude = blongitude[~invalidBMask]
-    blatitude = blatitude[~invalidBMask]
-    
-    # Note: at this point the invalid masks are no longer relevant
-    # all the remaining points are valid data in flat arrays
-    # there is no reason for the lon/lat to remain masked arrays
-    alongitude = np.array(alongitude)
-    alatitude = np.array(alatitude)
-    blongitude = np.array(blongitude)
-    blatitude = np.array(blatitude)
+        invalidBMask = np.zeros(blongitude.shape, dtype=bool)
+    
+    # make flat versions of our longitude and latitude
+    # so that our index correlations will be simple
+    flatALatitude  = alatitude.ravel()
+    flatALongitude = alongitude.ravel()
+    flatBLatitude  = blatitude.ravel()
+    flatBLongitude = blongitude.ravel()
    
    # find the ranges of the longitude and latitude
-    minLatitude = min(min(alatitude), min(blatitude))
-    maxLatitude = max(max(alatitude), max(blatitude))
-    minLongitude = min(min(alongitude), min(blongitude))
-    maxLongitude = max(max(alongitude), max(blongitude))
+    minLatitude = min(min(flatALatitude), min(flatBLatitude))
+    maxLatitude = max(max(flatALatitude), max(flatBLatitude))
+    minLongitude = min(min(flatALongitude), min(flatBLongitude))
+    maxLongitude = max(max(flatALongitude), max(flatBLongitude))
    
    # make the bins for the data in longitude/latitude space
    aBins = { }
    bBins = { }
+    allAPts = { }
+    allBPts = { }
    
    # loop to put all the aData in the bins
-    for index in range(aData.size) :
-        filingLat = int( alatitude[index])
-        filingLon = int(alongitude[index])
+    for index in range(flatALatitude.size) :
+        filingLat = int( flatALatitude[index])
+        filingLon = int(flatALongitude[index])
        
        if (filingLat, filingLon) not in aBins :
            aBins[(filingLat, filingLon)] = [ ]
        
-        aBins[(filingLat, filingLon)].append( (aData[index], alatitude[index], alongitude[index]) )
+        # create the simple list holding that point in the form:
+        # the lon/lat values (for ease of comparison), the index number in A, and the list of matches
+        aPoint = [flatALongitude[index], flatALatitude[index], index, [ ]]
+        
+        # put the point in the list and bin
+        allAPts[index] = aPoint
+        aBins[(filingLat, filingLon)].append(aPoint)
    
    # loop to put all the bData in the bins
-    for index in range(bData.size) :
-        filingLat = int( blatitude[index])
-        filingLon = int(blongitude[index])
+    for index in range(flatBLatitude.size) :
+        filingLat = int( flatBLatitude[index])
+        filingLon = int(flatBLongitude[index])
        
        if (filingLat, filingLon) not in bBins :
            bBins[(filingLat, filingLon)] = [ ]
        
-        bBins[(filingLat, filingLon)].append( (bData[index], blatitude[index], blongitude[index]) )
-    
-    # some variables to hold our final data
-    aMatchedData = [ ]
-    bMatchedData = [ ]
-    matchedLongitude = [ ]
-    matchedLatitude = [ ]
-    numDuplicateMatches = 0
-    aUnmatchedData = [ ]
-    unmatchedALongitude = [ ]
-    unmatchedALatitude = [ ]
-    bUnmatchedData = [ ]
-    unmatchedBLongitude = [ ]
-    unmatchedBLatitude = [ ]
+        # create the simple list holding that point in the form:
+        # the lon/lat values (for ease of comparison), the index number in B, and the list of matches
+        bPoint = [flatBLongitude[index], flatBLatitude[index], index, [ ]]
+        
+        # put the point in the list and bin
+        allBPts[index] = bPoint
+        bBins[(filingLat, filingLon)].append(bPoint)
+    
+    # for debugging purposes
+    totalMatches = 0
    
    # look in all the aData bins and select point pairs that match within epsilon
    for binLatitude, binLongitude in aBins.keys() :
@@ -636,55 +631,211 @@ def colocate_matching_points_within_epsilon((aData, alongitude, alatitude),
                toSearch.append((latValue, lonValue))
        
        # for each A pt in this bin
-        for aDataPt, aDataLatPt, aDataLonPt in aBins[(binLatitude, binLongitude)] :
-            haveFoundMatch = False
+        for aLon, aLat, aIndex, aMatches in aBins[(binLatitude, binLongitude)] :
            
            # look through my nearby B bins and find any
            # "matching" points that fall within epsilon
            for latValue, lonValue in toSearch :
                
-                # if there's anything in the B bin
+                # if there's anything in that B bin
                if ((latValue, lonValue) in bBins) and (bBins[(latValue, lonValue)] is not None) :
+                    
                    # for each data point in the B bin, check if it matches our current A point
-                    for bDataPt, bDataLatPt, bDataLonPt in bBins[(latValue, lonValue)] :
-                        if (abs(aDataLatPt - bDataLatPt) < lonlatEpsilon) and (abs(aDataLonPt - bDataLonPt) < lonlatEpsilon) :
-                            # track number of duplicates
-                            if haveFoundMatch :
-                                numDuplicateMatches = numDuplicateMatches + 1
-                            haveFoundMatch = True
+                    for bLon, bLat, bIndex, bMatches in bBins[(latValue, lonValue)] :
+                        
+                        if (abs(bLat - aLat) < lonlatEpsilon) and (abs(aLon - bLon) < lonlatEpsilon) :
+                            totalMatches = totalMatches + 1
+                            
                            # put the point on our matched lists
-                            aMatchedData.append(aDataPt)
-                            bMatchedData.append(bDataPt)
-                            matchedLongitude.append((aDataLonPt + bDataLonPt) / 2.0)
-                            matchedLatitude.append((aDataLatPt + bDataLatPt) / 2.0)
+                            aMatches.append(bIndex)
+                            bMatches.append(aIndex)
+    
+    LOG.debug('Found ' + str(totalMatches) + ' matched pairs.')
+    
+    return allAPts, allBPts, totalMatches
+
+def create_colocated_lonlat_with_lon_lat_colocation(listOfColocatedALonLat, listOfColocatedBLonLat,
+                                                    totalMatches,
+                                                    aLongitude, aLatitude,
+                                                    bLongitude, bLatitude) :
+    """
+    given a pre-colocated list of A and B lon/lat info from create_colocation_mapping_within_epsilon,
+    match up the longitude and latitude and return the colocated sets
+    """
+    
+    # some general statistics
+    multipleMatchesInA = 0
+    multipleMatchesInB = 0
+    totalValidMatchedPairs = 0
+    
+    # our final data sets
+    matchedLongitude = np.zeros(totalMatches, dtype=aLongitude.dtype)
+    matchedLatitide  = np.zeros(totalMatches, dtype=aLatitude.dtype)
+    
+    # we don't know how many unmatched points we may have
+    
unmatchedALongitude = [ ] + unmatchedALatitude = [ ] + unmatchedBLongitude = [ ] + unmatchedBLatitude = [ ] + + # go through the A list and collect all the matches + currentIndex = 0 + # look through all the A points + for aIndex in sorted(listOfColocatedALonLat.keys()) : + + [aLon, aLat, aIndex, aMatches] = listOfColocatedALonLat[aIndex] + tempMatches = 0 + + # for each match you found on a given a point + for matchIndex in sorted(aMatches) : + + [bLon, bLat, bIndex, bMatches] = listOfColocatedBLonLat[matchIndex] + + # copy the lon/lat info + matchedLongitude[currentIndex] = (aLon + bLon) / 2 + matchedLatitide[currentIndex] = (aLat + bLat) / 2 - # if we couldn't find a match for this a point, put it in the list of unmatched A points - if not haveFoundMatch : - aUnmatchedData.append(aDataPt) - unmatchedALatitude.append(aDataLatPt) - unmatchedBLongitude.append(aDataLonPt) - - LOG.debug('Found ' + str(len(aMatchedData)) + ' matched data points.') - - # TODO rebuild the lists of matched and unmatched points - # TODO, need to find unmatched B points and duplicately matched B points? - - # convert our data back into numpy arrays - aMatchedData = np.array(aMatchedData) - bMatchedData = np.array(bMatchedData) - matchedLongitude = np.array(matchedLongitude) - matchedLatitude = np.array(matchedLatitude) - aUnmatchedData = np.array(aUnmatchedData) - unmatchedALongitude = np.array(unmatchedALongitude) - unmatchedALatitude = np.array(unmatchedALatitude) - bUnmatchedData = np.array(bUnmatchedData) - unmatchedBLongitude = np.array(unmatchedBLongitude) - unmatchedBLatitude = np.array(unmatchedBLatitude) - - # TODO, should we return the number of duplicates? - return (aMatchedData, bMatchedData, matchedLongitude, matchedLatitude), \ - (aUnmatchedData, unmatchedALongitude, unmatchedALatitude), \ - (bUnmatchedData, unmatchedBLongitude, unmatchedBLatitude) + currentIndex = currentIndex + 1 + tempMatches = tempMatches + 1 + + # update statistics based on the number of matches + totalValidMatchedPairs = totalValidMatchedPairs + tempMatches + if tempMatches > 1 : + multipleMatchesInA = multipleMatchesInA + tempMatches + elif tempMatches <= 0 : + unmatchedALatitude.append(aLat) + unmatchedALongitude.append(aLon) + + # gather some additional statistics from the B list + # go through each b point + for bIndex in sorted(listOfColocatedBLonLat) : + + [bLon, bLat, bIndex, bMatches] = listOfColocatedBLonLat[bIndex] + tempMatches = len(bMatches) + + # update some statistics based on the number of matches + if tempMatches > 1 : + multipleMatchesInB = multipleMatchesInB + tempMatches + elif tempMatches <= 0 : + unmatchedBLatitude.append(bLat) + unmatchedBLongitude.append(bLon) + + # make the unmatched lists into proper numpy arrays + unmatchedALatitude = np.array(unmatchedALatitude, dtype=aLatitude.dtype) + unmatchedALongitude = np.array(unmatchedALongitude, dtype=aLongitude.dtype) + unmatchedBLatitude = np.array(unmatchedBLatitude, dtype=bLatitude.dtype) + unmatchedBLongitude = np.array(unmatchedBLongitude, dtype=bLongitude.dtype) + + LOG.debug("Total matched pairs of longitude/latitide: " + str(totalValidMatchedPairs)) + + return (matchedLongitude, matchedLatitide, (multipleMatchesInA, multipleMatchesInB)), \ + (unmatchedALongitude, unmatchedALatitude), \ + (unmatchedBLongitude, unmatchedBLatitude) + +def create_colocated_data_with_lon_lat_colocation(listOfColocatedALonLat, listOfColocatedBLonLat, + colocatedLongitude, colocatedLatitude, + aData, bData, + missingData=None, altMissingDataInB=None, + invalidAMask=None, 
invalidBMask=None) : + """ + given a pre colocated list of A and B lon/lat info from create_colocation_mapping_within_epsilon, + match up the valid data in two data sets and return the list of valid data, padded with missing + values so that it will match the original longitude and latitude + """ + + if altMissingDataInB is None : + altMissingDataInB = missingData + + # some general statistics + multipleMatchesInA = 0 + multipleMatchesInB = 0 + totalValidMatchedPairs = 0 + + # our final data sets + matchedAPoints = np.ones(colocatedLatitude.shape, dtype=aData.dtype) * missingData + matchedBPoints = np.ones(colocatedLatitude.shape, dtype=bData.dtype) * altMissingDataInB + + # we don't know how many unmatched points we may have + unmatchedAPoints = [ ] + unmatchedBPoints = [ ] + unmatchedALongitude = [ ] + unmatchedALatitude = [ ] + unmatchedBLongitude = [ ] + unmatchedBLatitude = [ ] + + # go through the A list and sort all the valid matches + currentIndex = 0 + # go through all the a points + for aIndex in sorted(listOfColocatedALonLat.keys()) : + + [aLon, aLat, aIndex, aMatches] = listOfColocatedALonLat[aIndex] + tempMatches = 0 + + # for each point that matched to a given a point + for matchIndex in sorted(aMatches) : + + [bLon, bLat, bIndex, bMatches] = listOfColocatedBLonLat[matchIndex] + + # if either of our data points is invalid, then the data doesn't match + if invalidBMask[matchIndex] or invalidAMask[aIndex] : + # fill in missing data in the matches + matchedAPoints[currentIndex] = missingData + matchedBPoints[currentIndex] = altMissingDataInB + + else: # we have a valid match! + tempMatches = tempMatches + 1 + matchedAPoints[currentIndex] = aData[aIndex] + matchedBPoints[currentIndex] = bData[bIndex] + + currentIndex = currentIndex + 1 + + totalValidMatchedPairs = totalValidMatchedPairs + tempMatches + if tempMatches > 1 : + multipleMatchesInA = multipleMatchesInA + tempMatches + elif tempMatches <= 0 : + unmatchedAPoints.append(aData[aIndex]) + unmatchedALongitude.append(aLon) + unmatchedALatitude.append(aLat) + + # gather some additional statistics from the B list + # go through all the b points + for bIndex in sorted(listOfColocatedBLonLat.keys()) : + + [bLon, bLat, bIndex, bMatches] = listOfColocatedBLonLat[bIndex] + tempMatches = 0 + + # for each point that matched to a given b point + for matchIndex in sorted(bMatches) : + + [aLon, aLat, aIndex, aMatches] = listOfColocatedALonLat[matchIndex] + + # if either of our data points is invalid, then the data doesn't match + if invalidAMask[matchIndex] or invalidBMask[bIndex] : + # we've already built our matched data, so no need to missing it out + pass + else: # we have a valid match! 
+ tempMatches = tempMatches + 1 + + if tempMatches > 1 : + multipleMatchesInB = multipleMatchesInB + tempMatches + elif tempMatches <= 0 : + unmatchedBPoints.append(bData[bIndex]) + unmatchedBLongitude.append(bLon) + unmatchedBLatitude.append(bLat) + + # make the unmatched lists into proper numpy arrays + unmatchedAPoints = np.array(unmatchedAPoints, dtype=aData.dtype) + unmatchedBPoints = np.array(unmatchedBPoints, dtype=bData.dtype) + unmatchedALongitude = np.array(unmatchedALongitude, dtype=colocatedLongitude.dtype) + unmatchedALatitude = np.array(unmatchedALatitude, dtype=colocatedLatitude.dtype) + unmatchedBLongitude = np.array(unmatchedBLongitude, dtype=colocatedLongitude.dtype) + unmatchedBLatitude = np.array(unmatchedBLatitude, dtype=colocatedLatitude.dtype) + + LOG.debug("Total matched data point pairs found: " + str(totalValidMatchedPairs)) + + return (matchedAPoints, matchedBPoints, (multipleMatchesInA, multipleMatchesInB)), \ + (unmatchedAPoints, unmatchedALongitude, unmatchedALatitude), \ + (unmatchedBPoints, unmatchedBLongitude, unmatchedBLatitude) STATISTICS_DOC = { 'general': "Finite values are non-missing and finite (not NaN or +-Inf); fractions are out of all data, " + "both finite and not, unless otherwise specified", diff --git a/pyglance/glance/filters.py b/pyglance/glance/filters.py index 03b9f55..da9294c 100644 --- a/pyglance/glance/filters.py +++ b/pyglance/glance/filters.py @@ -161,6 +161,32 @@ def set_to_value_between_bounds(data, value_to_set_to, bottom_bound_exclusive, t return data +def filter_based_on_additional_data_set_min_max_bounds(data, filterData, missingValue=None, + minOkFilterValue=None, maxOkFilterValue=None) : + """ + filter a data set based on values in another data set + + if some of the filter data is above/below the optional min/max values the corresponding values in the + data will be set to the missingValue + + ex. 
this filter might be used to remove winds data that has a quality index below a certain threshold + """ + + assert(data.shape == filterData.shape) + + goodAreas = np.ones(data.shape, dtype=bool) + + if minOkFilterValue is not None : + goodAreas = goodAreas & (filterData >= minOkFilterValue) + + if maxOkFilterValue is not None : + goodAreas = goodAreas & (filterData <= maxOkFilterValue) + + newData = data.copy() + newData[~goodAreas] = missingValue + + return newData + def collapse_to_index(data, index, collapsing_function=np.mean, missing_value=None, ignore_below_exclusive=None, ignore_above_exclusive=None) : """ diff --git a/pyglance/glance/io.py b/pyglance/glance/io.py index 44deb6c..41d99ea 100644 --- a/pyglance/glance/io.py +++ b/pyglance/glance/io.py @@ -14,20 +14,26 @@ try: import h5py except ImportError: pass -from pycdf import CDF, NC +from pycdf import CDF, NC, strerror import numpy as np LOG = logging.getLogger(__name__) +fillValConst1 = '_FillValue' +fillValConst2 = 'missing_value' + class hdf(SD): """wrapper for HDF4 dataset for comparison __call__ yields sequence of variable names __getitem__ returns individual variables ready for slicing to numpy arrays """ - def __init__(self,filename): - super(self.__class__,self).__init__(filename, SDC.READ) + def __init__(self, filename, allowWrite=False): + mode = SDC.READ + if allowWrite: + mode = mode | SDC.WRITE + super(self.__class__,self).__init__(filename, mode) def __call__(self): "yield names of variables to be compared" @@ -101,12 +107,11 @@ class hdf(SD): return self.select(name) def missing_value(self, name): - missing_value_attr_name = '_FillValue' variable_object = self.select(name) to_return = None - if hasattr(variable_object, missing_value_attr_name) : - to_return = getattr(variable_object, missing_value_attr_name, None) + if hasattr(variable_object, fillValConst1) : + to_return = getattr(variable_object, fillValConst1, None) SDS.endaccess(variable_object) return to_return @@ -118,8 +123,13 @@ class nc(CDF): __getitem__ returns individual variables ready for slicing to numpy arrays """ - def __init__(self,filename): - super(self.__class__,self).__init__(filename, NC.NOWRITE) + def __init__(self, filename, allowWrite=False): + + mode = NC.NOWRITE + if allowWrite : + mode = NC.WRITE + + super(self.__class__,self).__init__(filename, mode) def __call__(self): "yield names of variables to be compared" @@ -168,18 +178,76 @@ class nc(CDF): def missing_value(self, name): - missing_value_attr_name_1 = '_FillValue' - missing_value_attr_name_2 = 'missing_value' variable_object = self.var(name) to_return = None - if hasattr(variable_object, missing_value_attr_name_1) \ + if hasattr(variable_object, fillValConst1) \ or \ - hasattr(variable_object, missing_value_attr_name_2) : - to_return = getattr(variable_object, missing_value_attr_name_1, - getattr(variable_object, missing_value_attr_name_2, None)) + hasattr(variable_object, fillValConst2) : + to_return = getattr(variable_object, fillValConst1, + getattr(variable_object, fillValConst2, None)) return to_return + + # TODO, this method only exists for nc files at the moment, make the others at some point + def create_new_variable(self, variablename, missingvalue=None, data=None, variabletocopyattributesfrom=None): + """ + create a new variable with the given name + optionally set the missing value (fill value) and data to those given + + the created variable will be returned, or None if a variable could not + be created + """ + + self.redef() + + # if the variable already exists, stop 
with a warning + if variablename in self.variables().keys() : + LOG.warn("New variable name requested (" + variablename + ") is already present in file. " + + "Skipping generation of new variable.") + return None + + dataType = None + if np.issubdtype(data.dtype, int) : + dataType = NC.INT + #print("Picked INT") + # TODO, at the moment the fill type is forcing me to use a double, when sometimes I want a float + #elif np.issubdtype(data.dtype, np.float32) : + # dataType = NC.FLOAT + # print("Picked FLOAT") + elif np.issubdtype(data.dtype, float) : + dataType = NC.DOUBLE + #print("Picked DOUBLE") + # what do we do if it's some other type? + + # create and set all the dimensions + dimensions = [ ] + dimensionNum = 0 + for dimSize in data.shape : + dimensions.append(self.def_dim(variablename + '-index' + str(dimensionNum), dimSize)) + dimensionNum = dimensionNum + 1 + + # create the new variable + newVariable = self.def_var(variablename, dataType, tuple(dimensions)) + + # if a missing value was given, use that + if missingvalue is not None : + newVariable._FillValue = missingvalue + + # if we have a variable to copy attributes from, do so + if variabletocopyattributesfrom is not None : + tocopyfrom = self.get_variable_object(variabletocopyattributesfrom) + attributes = tocopyfrom.attributes() + for attribute in attributes.keys() : + newVariable.__setattr__(attribute, attributes[attribute]) + + self.enddef() + + # if data was given, use that + if data is not None : + newVariable.put(data.tolist()) + + return newVariable nc4 = nc cdf = nc @@ -192,8 +260,12 @@ class h5(object): """ _h5 = None - def __init__(self,filename): - self._h5 = h5py.File(filename,'r') + def __init__(self, filename, allowWrite=False): + mode = 'r' + if allowWrite : + mode = 'r+' + + self._h5 = h5py.File(filename, mode) def __call__(self): @@ -266,9 +338,9 @@ class h5(object): return None -def open(pathname): +def open(pathname, allowWrite=False): cls = globals()[os.path.splitext(pathname)[1][1:]] - return cls(pathname) + return cls(pathname, allowWrite=allowWrite) -- GitLab
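The whole-degree binning that create_colocation_mapping_within_epsilon relies on can be illustrated standalone. The sketch below is not glance code; it is a minimal, self-contained rendition of the same technique (file points under their int(latitude), int(longitude) bin, then compare each A point only against B points in its own bin and the eight neighboring bins), and it shares the caveat from the docstring above that epsilons of 1.0 degree or more can miss matches:

import numpy as np

def match_within_epsilon(aLon, aLat, bLon, bLat, epsilon):
    """toy rendition of the delta.py binning: returns (aIndex, bIndex) pairs
    whose longitude and latitude each differ by less than epsilon"""
    # file every B point under its whole-degree (lat, lon) bin
    # (int() truncates toward zero, as in the original code)
    bBins = { }
    for j in range(bLat.size):
        bBins.setdefault((int(bLat[j]), int(bLon[j])), [ ]).append(j)
    matches = [ ]
    for i in range(aLat.size):
        binLat, binLon = int(aLat[i]), int(aLon[i])
        # search this A point's own bin plus the eight surrounding bins
        for dLat in (-1, 0, 1):
            for dLon in (-1, 0, 1):
                for j in bBins.get((binLat + dLat, binLon + dLon), [ ]):
                    if abs(aLat[i] - bLat[j]) < epsilon and abs(aLon[i] - bLon[j]) < epsilon:
                        matches.append((i, j))
    return matches

# two tiny swaths that overlap near (43.0, -89.0)
aLon = np.array([-89.0,   -88.0])
aLat = np.array([ 43.0,    44.0])
bLon = np.array([-89.001,  -80.0])
bLat = np.array([ 43.001,   40.0])
print(match_within_epsilon(aLon, aLat, bLon, bLat, 0.01))   # -> [(0, 0)]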