Skip to content
Snippets Groups Projects
Commit a7a8da65 authored by (no author)'s avatar (no author)
Browse files

addition of super awesome plotting comparison between two hdf files

git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@10 8a9318a1-56ba-4d59-b755-99d26321be01
parent 1f260634
No related branches found
No related tags found
No related merge requests found
......@@ -11,6 +11,7 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
import os, sys, logging, re
from pprint import pprint, pformat
import numpy as np
import glance.io as io
import glance.delta as delta
......@@ -47,7 +48,6 @@ def _parse_varnames(names, terms, epsilon=0.0, missing=None):
return eps, mis
sel = [ ((x,)+_cvt_em(*em)) for x in names for (t,em) in terms if t(x) ]
return set(sel)
def main():
import optparse
......@@ -58,6 +58,7 @@ examples:
python -m glance.compare info A.hdf
python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0'
python -m glance.compare plotDiffs A.hdf B.hdf [optional output path]
"""
parser = optparse.OptionParser(usage)
......@@ -72,7 +73,15 @@ python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0
parser.add_option('-e', '--epsilon', dest="epsilon", type='float', default=0.0,
help="set default epsilon value for comparison threshold")
parser.add_option('-m', '--missing', dest="missing", type='float', default=None,
help="set default missing-value")
help="set default missing-value")
#plotting related options
parser.add_option('-p', '--outputpath', dest="outputpath", type='string', default='./',
help="set path to output directory")
parser.add_option('-o', '--longitude', dest="longitudeVar", type='string',
help="set name of longitude variable")
parser.add_option('-a', '--latitude', dest="latitudeVar", type='string',
help="set name of latitude variable")
options, args = parser.parse_args()
if options.self_test:
......@@ -209,7 +218,92 @@ python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0
if doc_each: print(' ' + delta.STATISTICS_DOC[each[0]])
if doc_atend:
print('\n\n' + delta.STATISTICS_DOC_STR)
def plotDiffs(*args) :
"""create comparison images of various variables
This option creates graphical comparisons between variables in the two given hdf files.
Images will be created to show the variable data value for each of the two files and to show
the difference between them.
Variables to be compared may be specified after the names of the two input files. If no variables
are specified, all variables that match the shape of the longitude and latitude will be compared.
Specified variables that do not exist, do not match the correct data shape, or are the longitude/latitude
variables will be ignored.
Created images will be stored in the provided path, or if no path is provided, they will be stored
in the current directory.
The longitude and latitude variables may be specified with --longitude and --latitude
If no longitude or latitude are specified the imager_prof_retr_abi_r4_generic1 and
imager_prof_retr_abi_r4_generic2 variables will be used.
Examples:
python -m glance.compare plotDiffs A.hdf B.hdf variable_name_1 variable_name_2 variable_name_3 variable_name_4
python -m glance.compare --outputpath=/path/where/output/will/be/placed/ plotDiffs A.hdf B.hdf
python -m glance.compare plotDiffs --longitude=lon_variable_name --latitude=lat_variable_name A.hdf B.hdf variable_name
"""
# get the file names the user wants to use and open them up
aFileName, bFileName = args[:2]
LOG.info("opening %s" % aFileName)
aFile = io.open(aFileName)
LOG.info("opening %s" % bFileName)
bFile = io.open(bFileName)
# get information about the variables stored in the file
aNames = set(aFile())
bNames = set(bFile())
# get the variable names they have in common
commonNames = aNames.intersection(bNames)
# pull the ones the user asked for (if they asked for any specifically)
requestedNames = args[2:] or ['.*']
finalNames = _parse_varnames(commonNames, requestedNames, options.epsilon, options.missing)
LOG.debug(str(finalNames))
hadUserRequest = (len(args) > 2)
# get the output path
outputPath = options.outputpath
# TODO, should I validate that the output path is a file path?
LOG.debug(str(outputPath))
# get information about the longitude/latitude data we will be using
# to build our plots
longitudeVariableName = options.longitudeVar or 'imager_prof_retr_abi_r4_generic1'
latitudeVariableName = options.latitudeVar or'imager_prof_retr_abi_r4_generic2'
LOG.debug(str("longitude variable: " + longitudeVariableName))
LOG.debug(str("latitude variable: " + latitudeVariableName))
# get the actual data
longitudeA = np.array(aFile[longitudeVariableName][:], dtype=np.float)
longitudeB = np.array(bFile[longitudeVariableName][:], dtype=np.float)
latitudeA = np.array(aFile[latitudeVariableName][:], dtype=np.float)
latitudeB = np.array(bFile[latitudeVariableName][:], dtype=np.float)
# TODO validate that the two sets are the same (with .shape)
# go through each of the possible variables in our files
# and make comparison images for whichever ones we can
for name, epsilon, missing in finalNames:
# get the data for the variable
aData = aFile[name][:]
bData = bFile[name][:]
# check if this data can be displayed and is
# not the lon/lat variable itself
if ((aData.shape == bData.shape) and
(aData.shape == longitudeA.shape) and
(bData.shape == longitudeB.shape) and
(name != longitudeVariableName) and
(name != latitudeVariableName)) :
# since things match, we will try to
# create the images comparing that variable
plot.plot_and_save_figure_comparison(aData, bData, name,
aFileName, bFileName,
latitudeA, longitudeA,
outputPath, epsilon,
missing)
# TODO should I pass both sets of lat/lon?
# only log a warning if the user themselves picked the faulty variables
elif hadUserRequest :
LOG.warn(name + ' could not be plotted. This may be because the data for this variable is not the ' +
'right shape or because the variable is currently selected as the longitude or latitude ' +
'variable for this file.')
return
# def build(*args):
# """build summary
# build extended info
......
......@@ -21,9 +21,12 @@ def _missing(x,missing_value=None):
return isnan(x) | (x==missing_value)
return isnan(x)
def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None)):
def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None), ignoreMask=None):
"""
take two arrays of similar size and composition
if an ignoreMask is passed in values in the mask will not be analysed to
form the various return masks and the corresponding spots in the
"difference" return data array will contain nan values.
return difference array filled with nans where differences aren't valid,
good mask where values are finite in both a and b
trouble mask where missing values or nans don't match or delta > epsilon
......@@ -33,16 +36,27 @@ def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None)):
shape = a.shape
assert(b.shape==shape)
assert(a.dtype==b.dtype)
anfin, bnfin = ~isfinite(a), ~isfinite(b)
# if the ignore mask does not exist, set it to include none of the data
if (ignoreMask is None) :
ignoreMask = zeros(shape,dtype=bool)
# deal with the basic masks
anfin, bnfin = ~isfinite(a) & ~ignoreMask, ~isfinite(b) & ~ignoreMask
amis, bmis = zeros(shape,dtype=bool), zeros(shape,dtype=bool)
if amissing is not None:
amis[a==amissing] = True
amis[ignoreMask] = False # don't analyse the ignored values
if bmissing is not None:
bmis[b==bmissing] = True
bmis[ignoreMask] = False # don't analyse the ignored values
# build the comparison data that includes the "good" values
d = empty_like(a)
mask = ~(anfin | bnfin | amis | bmis)
mask = ~(anfin | bnfin | amis | bmis | ignoreMask)
d[~mask] = nan
d[mask] = b[mask] - a[mask]
# trouble areas - mismatched nans, mismatched missing-values, differences > epsilon
trouble = (anfin ^ bnfin) | (amis ^ bmis) | (abs(d)>epsilon)
return d, mask, trouble, (anfin, bnfin), (amis, bmis)
......
......@@ -10,9 +10,31 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
import os, sys, logging
from pylab import *
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as colors
import keoni.map.graphics as maps
import glance.delta as delta
LOG = logging.getLogger(__name__)
# the value that will denote "bad" longitudes and latitudes
badLonLat = 1.0E30
# make a custom medium grayscale color map for putting our bad data on top of
mediumGrayColorMapData = {
'red' : ((0.0, 1.00, 1.00),
(0.5, 0.60, 0.60),
(1.0, 0.20, 0.20)),
'green' : ((0.0, 1.00, 1.00),
(0.5, 0.60, 0.60),
(1.0, 0.20, 0.20)),
'blue' : ((0.0, 1.00, 1.00),
(0.5, 0.60, 0.60),
(1.0, 0.20, 0.20))
}
mediumGrayColorMap = colors.LinearSegmentedColormap('mediumGrayColorMap', mediumGrayColorMapData, 256)
def spectral_diff_plot( mean_diff, std_diff, max_diff, min_diff, acceptable_diff=None, x=None ):
"""plot spectral difference in current axes, wrapped in std
......@@ -56,8 +78,212 @@ def compare_spectra(actual, desired=None, acceptable=None, x=None):
grid()
title("difference min-max (blue), mean (black), mean +/- std (red)")
def plot_and_save_figure_comparison(aData, bData, variableName,
fileAName, fileBName,
latitudeData, longitudeData, outputPath,
epsilon, missing) :
"""
given two files, and information on what to compare, make comparison
figures and save them to the given output graph.
Four figures will be generated, one for each file, a comparison image
that represents the amount of difference in that variable between the
two files, and an image highlighting the trouble spots where the
difference exceeds epsilon or there are missing or nan values in one
or both of the files
"""
print("Creating figures for: " + variableName)
# build a mask of our spacially invalid data so we can ask our
# comparison tool to ignore it
invalidLatitude = (latitudeData < -90) | (latitudeData > 90)
invalidLongitude = (longitudeData < -180) | (longitudeData > 180)
spaciallyInvalidMask = invalidLatitude | invalidLongitude
# compare the two data sets to get our difference data and trouble info
rawDiffData, goodMask, troubleMask, (aNotFiniteMask, bNotFiniteMask), \
(aMissingMask, bMissingMask) = delta.diff(aData, bData, epsilon, (missing,missing), spaciallyInvalidMask)
diffData = abs(rawDiffData) # we want to show the distance between our two, rather than which one's bigger
# mark where our invalid data is for each of the files (a and b)
invalidDataMaskA = aMissingMask | aNotFiniteMask
invalidDataMaskB = bMissingMask | bNotFiniteMask
# this mask potentially represents data we don't want to try to plot in our diff because it may be malformed
everyThingWrongButEpsilon = spaciallyInvalidMask | invalidDataMaskA | invalidDataMaskB
# use an exclusive or to get a mask for just the points deemed "bad" by epsilon comparison
tooDifferentMask = everyThingWrongButEpsilon ^ troubleMask
# calculate the bounding range for the display
# this is in the form [longitude min, longitude max, latitude min, latitude max]
visibleAxes = [_min_with_mask(longitudeData, spaciallyInvalidMask),
_max_with_mask(longitudeData, spaciallyInvalidMask),
_min_with_mask(latitudeData, spaciallyInvalidMask),
_max_with_mask(latitudeData, spaciallyInvalidMask)]
# make the three figures
print("\tcreating image of file a")
figureA = _create_mapped_figure(aData, latitudeData, longitudeData, visibleAxes,
(variableName + "\nin File A"),
invalidMask=(spaciallyInvalidMask | invalidDataMaskA))
print("\tcreating image of file b")
figureB = _create_mapped_figure(bData, latitudeData, longitudeData, visibleAxes,
(variableName + "\nin File B"),
invalidMask=(spaciallyInvalidMask | invalidDataMaskB))
print("\tcreating image of the amount of difference")
figureDiff = _create_mapped_figure(diffData, latitudeData, longitudeData, visibleAxes,
("Amount of difference in\n" + variableName),
invalidMask=(everyThingWrongButEpsilon))
# this figure is more complex because we want to mark the trouble points on it
print("\tcreating image marking trouble data")
figureBadDataInDiff = _create_mapped_figure(bData, latitudeData, longitudeData, visibleAxes,
("Areas of trouble data in\n" + variableName),
spaciallyInvalidMask | invalidDataMaskB,
mediumGrayColorMap, troubleMask)
# a histogram of the values of fileA - file B so that the distribution of error is visible (hopefully)
print("\tcreating histogram of the amount of difference")
numBinsToUse = 50
diffHistogramFigure = _create_histogram(rawDiffData[~everyThingWrongButEpsilon].ravel(), numBinsToUse,
("Difference in\n" + variableName),
('Value of (Data File B - Data File A) at a Data Point (in ' + str(numBinsToUse) + ' bins)'),
('Number of Data Points with a Given Difference'))
# TEMP scatter plot test of file a and b comparison
print("\tcreating scatter plot of file a values vs file b values")
diffScatterPlot = _create_scatter_plot(aData[~everyThingWrongButEpsilon].ravel(), bData[~everyThingWrongButEpsilon].ravel(),
"Value in File A vs Value in File B", "File A Value", "File B Value",
tooDifferentMask[~everyThingWrongButEpsilon].ravel())
# save the figures to disk
print("Saving figures for: " + variableName)
print("\tsaving image of file a")
figureA.savefig(outputPath + "/" + variableName + ".A.png", dpi=200)
print("\tsaving image of file b")
figureB.savefig(outputPath + "/" + variableName + ".B.png", dpi=200)
print("\tsaving image of the amount of difference")
figureDiff.savefig(outputPath + "/" + variableName + ".Diff.png", dpi=200)
print("\tsaving image marking trouble data")
figureBadDataInDiff.savefig(outputPath + "/" + variableName + ".Trouble.png", dpi=200)
print("\tsaving histogram of the amount of difference")
diffHistogramFigure.savefig(outputPath + "/" + variableName + ".Hist.png", dpi=200)
print("\tsaving scatter plot of file a values vs file b values")
diffScatterPlot.savefig(outputPath + "/" + variableName + ".Scatter.png", dpi=200)
return
# build a scatter plot of the x,y points
def _create_scatter_plot(dataX, dataY, title, xLabel, yLabel, badMask=None) :
# make the figure
figure = plt.figure()
axes = figure.add_subplot(111)
# if we have "bad" data to plot, pull it out
badX = None
badY = None
if (badMask != None) :
badX = dataX[badMask]
badY = dataY[badMask]
dataX = dataX[~badMask]
dataY = dataY[~badMask]
# the scatter plot of the data
axes.plot(dataX, dataY, 'b,')
if (badMask != None) :
LOG.debug('\t\tnumber of trouble points: ' + str(badX.shape))
axes.plot(badX, badY, 'r,')
# and some informational stuff
axes.set_title(title)
plt.xlabel(xLabel)
plt.ylabel(yLabel)
return figure
# build a histogram figure of the given data with the given title and number of bins
def _create_histogram(data, bins, title, xLabel, yLabel) :
# make the figure
figure = plt.figure()
axes = figure.add_subplot(111)
# the histogram of the data
n, bins, patches = plt.hist(data, bins)
# and some informational stuff
axes.set_title(title)
plt.xlabel(xLabel)
plt.ylabel(yLabel)
return figure
# create a figure including our data mapped onto a map at the lon/lat given
# the colorMap parameter can be used to control the colors the figure is drawn in
# if tagData is passed in it will be plotted as an overlayed set on the existing
# image using the tagDataColorMap colors rather than those of the original image
def _create_mapped_figure(data, latitude, longitude, boundingAxes, title,
invalidMask=None, colorMap=None, tagDataToShowMask=None) :
# this is very inefficient, TODO find a better way?
latitudeCleaned = empty_like(latitude)
latitudeCleaned[~invalidMask] = latitude[~invalidMask]
latitudeCleaned[invalidMask] = badLonLat
longitudeCleaned = empty_like(longitude)
longitudeCleaned[~invalidMask] = longitude[~invalidMask]
longitudeCleaned[invalidMask] = badLonLat
# build the plot
figure = plt.figure()
axes = figure.add_subplot(111)
# figure the range for the color bars
minVal = _min_with_mask(data, invalidMask)
maxVal = _max_with_mask(data, invalidMask)
rangeForBar = arange(minVal, maxVal, (maxVal - minVal) / 50)
# draw our data placed on a map
if (colorMap != None) :
bMap, x, y = maps.mapshow(longitudeCleaned, latitudeCleaned, data, boundingAxes, levelsToUse=rangeForBar, cmap=colorMap)
else :
bMap, x, y = maps.mapshow(longitudeCleaned, latitudeCleaned, data, boundingAxes, levelsToUse=rangeForBar)
# and some informational stuff
axes.set_title(title)
# show a generic color bar
colorbar()
# if there's a"tag" mask, plot it over the existing map
if (tagDataToShowMask != None) :
# pick out the cooridinates of the points we want to plot
newX = x[tagDataToShowMask]
newY = y[tagDataToShowMask]
# look at how many trouble points we have
numTroublePoints = newX.shape[0]
LOG.debug('\t\tnumber of trouble points: ' + str(numTroublePoints))
# figure out how big to make the points when we plot them
totalNumPoints = x[~invalidMask].shape[0]
pointSize = 1
percentBad = (float(numTroublePoints) / float(totalNumPoints)) * 100.0
# if we have very few bad data points, make them bigger so they're easier to see
LOG.debug('\t\tpercent of trouble points: ' + str(percentBad))
if (percentBad > 1.0) :
pointSize = 1
elif (percentBad > 0.25) :
pointSize = 3
else :
pointSize = 5
# plot our point on top of the existing figure
bMap.plot(newX, newY, '.', color='#00FF00', markersize=pointSize)
return figure
# get the min, ignoring the stuff in mask
def _min_with_mask(data, mask) :
return data[~mask].ravel()[data[~mask].argmin()]
# get the max, ignoring the stuff in mask
def _max_with_mask(data, mask) :
return data[~mask].ravel()[data[~mask].argmax()]
if __name__=='__main__':
import doctest
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment