diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index 7c002914a21ee59330ff05948192c2223e3ddc65..9bc69223fef3a7c55d7c004c1c3c489bfe2c6e77 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -11,6 +11,7 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. import os, sys, logging, re from pprint import pprint, pformat +import numpy as np import glance.io as io import glance.delta as delta @@ -47,7 +48,6 @@ def _parse_varnames(names, terms, epsilon=0.0, missing=None): return eps, mis sel = [ ((x,)+_cvt_em(*em)) for x in names for (t,em) in terms if t(x) ] return set(sel) - def main(): import optparse @@ -58,6 +58,7 @@ examples: python -m glance.compare info A.hdf python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0' +python -m glance.compare plotDiffs A.hdf B.hdf [optional output path] """ parser = optparse.OptionParser(usage) @@ -72,7 +73,15 @@ python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0 parser.add_option('-e', '--epsilon', dest="epsilon", type='float', default=0.0, help="set default epsilon value for comparison threshold") parser.add_option('-m', '--missing', dest="missing", type='float', default=None, - help="set default missing-value") + help="set default missing-value") + #plotting related options + parser.add_option('-p', '--outputpath', dest="outputpath", type='string', default='./', + help="set path to output directory") + parser.add_option('-o', '--longitude', dest="longitudeVar", type='string', + help="set name of longitude variable") + parser.add_option('-a', '--latitude', dest="latitudeVar", type='string', + help="set name of latitude variable") + options, args = parser.parse_args() if options.self_test: @@ -209,7 +218,92 @@ python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0 if doc_each: print(' ' + delta.STATISTICS_DOC[each[0]]) if doc_atend: print('\n\n' + delta.STATISTICS_DOC_STR) + + def plotDiffs(*args) : + """create comparison images of various variables + This option creates graphical comparisons between variables in the two given hdf files. + Images will be created to show the variable data value for each of the two files and to show + the difference between them. + Variables to be compared may be specified after the names of the two input files. If no variables + are specified, all variables that match the shape of the longitude and latitude will be compared. + Specified variables that do not exist, do not match the correct data shape, or are the longitude/latitude + variables will be ignored. + Created images will be stored in the provided path, or if no path is provided, they will be stored + in the current directory. + The longitude and latitude variables may be specified with --longitude and --latitude + If no longitude or latitude are specified the imager_prof_retr_abi_r4_generic1 and + imager_prof_retr_abi_r4_generic2 variables will be used. + Examples: + python -m glance.compare plotDiffs A.hdf B.hdf variable_name_1 variable_name_2 variable_name_3 variable_name_4 + python -m glance.compare --outputpath=/path/where/output/will/be/placed/ plotDiffs A.hdf B.hdf + python -m glance.compare plotDiffs --longitude=lon_variable_name --latitude=lat_variable_name A.hdf B.hdf variable_name + """ + # get the file names the user wants to use and open them up + aFileName, bFileName = args[:2] + LOG.info("opening %s" % aFileName) + aFile = io.open(aFileName) + LOG.info("opening %s" % bFileName) + bFile = io.open(bFileName) + + # get information about the variables stored in the file + aNames = set(aFile()) + bNames = set(bFile()) + # get the variable names they have in common + commonNames = aNames.intersection(bNames) + # pull the ones the user asked for (if they asked for any specifically) + requestedNames = args[2:] or ['.*'] + finalNames = _parse_varnames(commonNames, requestedNames, options.epsilon, options.missing) + LOG.debug(str(finalNames)) + hadUserRequest = (len(args) > 2) + # get the output path + outputPath = options.outputpath + # TODO, should I validate that the output path is a file path? + LOG.debug(str(outputPath)) + + # get information about the longitude/latitude data we will be using + # to build our plots + longitudeVariableName = options.longitudeVar or 'imager_prof_retr_abi_r4_generic1' + latitudeVariableName = options.latitudeVar or'imager_prof_retr_abi_r4_generic2' + LOG.debug(str("longitude variable: " + longitudeVariableName)) + LOG.debug(str("latitude variable: " + latitudeVariableName)) + # get the actual data + longitudeA = np.array(aFile[longitudeVariableName][:], dtype=np.float) + longitudeB = np.array(bFile[longitudeVariableName][:], dtype=np.float) + latitudeA = np.array(aFile[latitudeVariableName][:], dtype=np.float) + latitudeB = np.array(bFile[latitudeVariableName][:], dtype=np.float) + # TODO validate that the two sets are the same (with .shape) + + # go through each of the possible variables in our files + # and make comparison images for whichever ones we can + for name, epsilon, missing in finalNames: + + # get the data for the variable + aData = aFile[name][:] + bData = bFile[name][:] + + # check if this data can be displayed and is + # not the lon/lat variable itself + if ((aData.shape == bData.shape) and + (aData.shape == longitudeA.shape) and + (bData.shape == longitudeB.shape) and + (name != longitudeVariableName) and + (name != latitudeVariableName)) : + # since things match, we will try to + # create the images comparing that variable + plot.plot_and_save_figure_comparison(aData, bData, name, + aFileName, bFileName, + latitudeA, longitudeA, + outputPath, epsilon, + missing) + # TODO should I pass both sets of lat/lon? + # only log a warning if the user themselves picked the faulty variables + elif hadUserRequest : + LOG.warn(name + ' could not be plotted. This may be because the data for this variable is not the ' + + 'right shape or because the variable is currently selected as the longitude or latitude ' + + 'variable for this file.') + return + # def build(*args): # """build summary # build extended info diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py index 109aae3f08f390e7eb8fcef22a7ad50b2f0f7b5e..9b66aa71ebdf751dad5bd4fa117576b062da2050 100644 --- a/pyglance/glance/delta.py +++ b/pyglance/glance/delta.py @@ -21,9 +21,12 @@ def _missing(x,missing_value=None): return isnan(x) | (x==missing_value) return isnan(x) -def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None)): +def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None), ignoreMask=None): """ take two arrays of similar size and composition + if an ignoreMask is passed in values in the mask will not be analysed to + form the various return masks and the corresponding spots in the + "difference" return data array will contain nan values. return difference array filled with nans where differences aren't valid, good mask where values are finite in both a and b trouble mask where missing values or nans don't match or delta > epsilon @@ -33,16 +36,27 @@ def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None)): shape = a.shape assert(b.shape==shape) assert(a.dtype==b.dtype) - anfin, bnfin = ~isfinite(a), ~isfinite(b) + + # if the ignore mask does not exist, set it to include none of the data + if (ignoreMask is None) : + ignoreMask = zeros(shape,dtype=bool) + + # deal with the basic masks + anfin, bnfin = ~isfinite(a) & ~ignoreMask, ~isfinite(b) & ~ignoreMask amis, bmis = zeros(shape,dtype=bool), zeros(shape,dtype=bool) if amissing is not None: amis[a==amissing] = True + amis[ignoreMask] = False # don't analyse the ignored values if bmissing is not None: bmis[b==bmissing] = True + bmis[ignoreMask] = False # don't analyse the ignored values + + # build the comparison data that includes the "good" values d = empty_like(a) - mask = ~(anfin | bnfin | amis | bmis) + mask = ~(anfin | bnfin | amis | bmis | ignoreMask) d[~mask] = nan d[mask] = b[mask] - a[mask] + # trouble areas - mismatched nans, mismatched missing-values, differences > epsilon trouble = (anfin ^ bnfin) | (amis ^ bmis) | (abs(d)>epsilon) return d, mask, trouble, (anfin, bnfin), (amis, bmis) diff --git a/pyglance/glance/plot.py b/pyglance/glance/plot.py index 8ff9e461a6688c49838a469d82722467778f9963..c57c798fc18a33e9db429d0f050b9e6707bff802 100644 --- a/pyglance/glance/plot.py +++ b/pyglance/glance/plot.py @@ -10,9 +10,31 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. import os, sys, logging from pylab import * +import matplotlib.pyplot as plt +from matplotlib import cm +import matplotlib.colors as colors +import keoni.map.graphics as maps + +import glance.delta as delta + LOG = logging.getLogger(__name__) +# the value that will denote "bad" longitudes and latitudes +badLonLat = 1.0E30 +# make a custom medium grayscale color map for putting our bad data on top of +mediumGrayColorMapData = { + 'red' : ((0.0, 1.00, 1.00), + (0.5, 0.60, 0.60), + (1.0, 0.20, 0.20)), + 'green' : ((0.0, 1.00, 1.00), + (0.5, 0.60, 0.60), + (1.0, 0.20, 0.20)), + 'blue' : ((0.0, 1.00, 1.00), + (0.5, 0.60, 0.60), + (1.0, 0.20, 0.20)) +} +mediumGrayColorMap = colors.LinearSegmentedColormap('mediumGrayColorMap', mediumGrayColorMapData, 256) def spectral_diff_plot( mean_diff, std_diff, max_diff, min_diff, acceptable_diff=None, x=None ): """plot spectral difference in current axes, wrapped in std @@ -56,8 +78,212 @@ def compare_spectra(actual, desired=None, acceptable=None, x=None): grid() title("difference min-max (blue), mean (black), mean +/- std (red)") +def plot_and_save_figure_comparison(aData, bData, variableName, + fileAName, fileBName, + latitudeData, longitudeData, outputPath, + epsilon, missing) : + """ + given two files, and information on what to compare, make comparison + figures and save them to the given output graph. + Four figures will be generated, one for each file, a comparison image + that represents the amount of difference in that variable between the + two files, and an image highlighting the trouble spots where the + difference exceeds epsilon or there are missing or nan values in one + or both of the files + """ + print("Creating figures for: " + variableName) + + # build a mask of our spacially invalid data so we can ask our + # comparison tool to ignore it + invalidLatitude = (latitudeData < -90) | (latitudeData > 90) + invalidLongitude = (longitudeData < -180) | (longitudeData > 180) + spaciallyInvalidMask = invalidLatitude | invalidLongitude + + # compare the two data sets to get our difference data and trouble info + rawDiffData, goodMask, troubleMask, (aNotFiniteMask, bNotFiniteMask), \ + (aMissingMask, bMissingMask) = delta.diff(aData, bData, epsilon, (missing,missing), spaciallyInvalidMask) + diffData = abs(rawDiffData) # we want to show the distance between our two, rather than which one's bigger + # mark where our invalid data is for each of the files (a and b) + invalidDataMaskA = aMissingMask | aNotFiniteMask + invalidDataMaskB = bMissingMask | bNotFiniteMask + # this mask potentially represents data we don't want to try to plot in our diff because it may be malformed + everyThingWrongButEpsilon = spaciallyInvalidMask | invalidDataMaskA | invalidDataMaskB + # use an exclusive or to get a mask for just the points deemed "bad" by epsilon comparison + tooDifferentMask = everyThingWrongButEpsilon ^ troubleMask + + # calculate the bounding range for the display + # this is in the form [longitude min, longitude max, latitude min, latitude max] + visibleAxes = [_min_with_mask(longitudeData, spaciallyInvalidMask), + _max_with_mask(longitudeData, spaciallyInvalidMask), + _min_with_mask(latitudeData, spaciallyInvalidMask), + _max_with_mask(latitudeData, spaciallyInvalidMask)] + + # make the three figures + print("\tcreating image of file a") + figureA = _create_mapped_figure(aData, latitudeData, longitudeData, visibleAxes, + (variableName + "\nin File A"), + invalidMask=(spaciallyInvalidMask | invalidDataMaskA)) + print("\tcreating image of file b") + figureB = _create_mapped_figure(bData, latitudeData, longitudeData, visibleAxes, + (variableName + "\nin File B"), + invalidMask=(spaciallyInvalidMask | invalidDataMaskB)) + print("\tcreating image of the amount of difference") + figureDiff = _create_mapped_figure(diffData, latitudeData, longitudeData, visibleAxes, + ("Amount of difference in\n" + variableName), + invalidMask=(everyThingWrongButEpsilon)) + # this figure is more complex because we want to mark the trouble points on it + print("\tcreating image marking trouble data") + figureBadDataInDiff = _create_mapped_figure(bData, latitudeData, longitudeData, visibleAxes, + ("Areas of trouble data in\n" + variableName), + spaciallyInvalidMask | invalidDataMaskB, + mediumGrayColorMap, troubleMask) + # a histogram of the values of fileA - file B so that the distribution of error is visible (hopefully) + print("\tcreating histogram of the amount of difference") + numBinsToUse = 50 + diffHistogramFigure = _create_histogram(rawDiffData[~everyThingWrongButEpsilon].ravel(), numBinsToUse, + ("Difference in\n" + variableName), + ('Value of (Data File B - Data File A) at a Data Point (in ' + str(numBinsToUse) + ' bins)'), + ('Number of Data Points with a Given Difference')) + + # TEMP scatter plot test of file a and b comparison + print("\tcreating scatter plot of file a values vs file b values") + diffScatterPlot = _create_scatter_plot(aData[~everyThingWrongButEpsilon].ravel(), bData[~everyThingWrongButEpsilon].ravel(), + "Value in File A vs Value in File B", "File A Value", "File B Value", + tooDifferentMask[~everyThingWrongButEpsilon].ravel()) + + # save the figures to disk + print("Saving figures for: " + variableName) + print("\tsaving image of file a") + figureA.savefig(outputPath + "/" + variableName + ".A.png", dpi=200) + print("\tsaving image of file b") + figureB.savefig(outputPath + "/" + variableName + ".B.png", dpi=200) + print("\tsaving image of the amount of difference") + figureDiff.savefig(outputPath + "/" + variableName + ".Diff.png", dpi=200) + print("\tsaving image marking trouble data") + figureBadDataInDiff.savefig(outputPath + "/" + variableName + ".Trouble.png", dpi=200) + print("\tsaving histogram of the amount of difference") + diffHistogramFigure.savefig(outputPath + "/" + variableName + ".Hist.png", dpi=200) + print("\tsaving scatter plot of file a values vs file b values") + diffScatterPlot.savefig(outputPath + "/" + variableName + ".Scatter.png", dpi=200) + + return + +# build a scatter plot of the x,y points +def _create_scatter_plot(dataX, dataY, title, xLabel, yLabel, badMask=None) : + # make the figure + figure = plt.figure() + axes = figure.add_subplot(111) + + # if we have "bad" data to plot, pull it out + badX = None + badY = None + if (badMask != None) : + badX = dataX[badMask] + badY = dataY[badMask] + dataX = dataX[~badMask] + dataY = dataY[~badMask] + + # the scatter plot of the data + axes.plot(dataX, dataY, 'b,') + if (badMask != None) : + LOG.debug('\t\tnumber of trouble points: ' + str(badX.shape)) + axes.plot(badX, badY, 'r,') + + # and some informational stuff + axes.set_title(title) + plt.xlabel(xLabel) + plt.ylabel(yLabel) + return figure +# build a histogram figure of the given data with the given title and number of bins +def _create_histogram(data, bins, title, xLabel, yLabel) : + + # make the figure + figure = plt.figure() + axes = figure.add_subplot(111) + + # the histogram of the data + n, bins, patches = plt.hist(data, bins) + + # and some informational stuff + axes.set_title(title) + plt.xlabel(xLabel) + plt.ylabel(yLabel) + + return figure + + +# create a figure including our data mapped onto a map at the lon/lat given +# the colorMap parameter can be used to control the colors the figure is drawn in +# if tagData is passed in it will be plotted as an overlayed set on the existing +# image using the tagDataColorMap colors rather than those of the original image +def _create_mapped_figure(data, latitude, longitude, boundingAxes, title, + invalidMask=None, colorMap=None, tagDataToShowMask=None) : + + # this is very inefficient, TODO find a better way? + latitudeCleaned = empty_like(latitude) + latitudeCleaned[~invalidMask] = latitude[~invalidMask] + latitudeCleaned[invalidMask] = badLonLat + longitudeCleaned = empty_like(longitude) + longitudeCleaned[~invalidMask] = longitude[~invalidMask] + longitudeCleaned[invalidMask] = badLonLat + + # build the plot + figure = plt.figure() + axes = figure.add_subplot(111) + + # figure the range for the color bars + minVal = _min_with_mask(data, invalidMask) + maxVal = _max_with_mask(data, invalidMask) + rangeForBar = arange(minVal, maxVal, (maxVal - minVal) / 50) + + # draw our data placed on a map + if (colorMap != None) : + bMap, x, y = maps.mapshow(longitudeCleaned, latitudeCleaned, data, boundingAxes, levelsToUse=rangeForBar, cmap=colorMap) + else : + bMap, x, y = maps.mapshow(longitudeCleaned, latitudeCleaned, data, boundingAxes, levelsToUse=rangeForBar) + + # and some informational stuff + axes.set_title(title) + # show a generic color bar + colorbar() + + # if there's a"tag" mask, plot it over the existing map + if (tagDataToShowMask != None) : + + # pick out the cooridinates of the points we want to plot + newX = x[tagDataToShowMask] + newY = y[tagDataToShowMask] + + # look at how many trouble points we have + numTroublePoints = newX.shape[0] + LOG.debug('\t\tnumber of trouble points: ' + str(numTroublePoints)) + # figure out how big to make the points when we plot them + totalNumPoints = x[~invalidMask].shape[0] + pointSize = 1 + percentBad = (float(numTroublePoints) / float(totalNumPoints)) * 100.0 + # if we have very few bad data points, make them bigger so they're easier to see + LOG.debug('\t\tpercent of trouble points: ' + str(percentBad)) + if (percentBad > 1.0) : + pointSize = 1 + elif (percentBad > 0.25) : + pointSize = 3 + else : + pointSize = 5 + + # plot our point on top of the existing figure + bMap.plot(newX, newY, '.', color='#00FF00', markersize=pointSize) + + return figure + +# get the min, ignoring the stuff in mask +def _min_with_mask(data, mask) : + return data[~mask].ravel()[data[~mask].argmin()] + +# get the max, ignoring the stuff in mask +def _max_with_mask(data, mask) : + return data[~mask].ravel()[data[~mask].argmax()] if __name__=='__main__': import doctest