diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py index a28de59ffa3093c3d82047ab7c2c22fe09f9406f..7c002914a21ee59330ff05948192c2223e3ddc65 100644 --- a/pyglance/glance/compare.py +++ b/pyglance/glance/compare.py @@ -120,7 +120,46 @@ python -m glance.compare stats A.hdf B.hdf '.*_prof_retr_.*:1e-4' 'nwp_._index:0 bspc = get(b) plot.compare_spectra(bspc,aspc) plot.show() + + def noisecheck(*args): + """gives statistics for dataset comparisons against truth with and without noise + usage: noisecheck truth-file noise-file actual-file variable1{:epsilon{:missing}} {variable2...} + glance noisecheck /Volumes/snaapy/data/justins/abi_graffir/coreg/pure/l2_data/geocatL2.GOES-R.2005155.220000.hdf.gz /Volumes/snaapy/data/justins/abi_graffir/noise/noise1x/l2_data/geocatL2.GOES-R.2005155.220000.hdf + """ + afn,noizfn,bfn = args[:3] + LOG.info("opening truth file %s" % afn) + a = io.open(afn) + LOG.info("opening actual file %s" % noizfn) + noiz = io.open(noizfn) + LOG.info("opening noise file %s" % bfn) + b = io.open(bfn) + anames = set(a()) + bnames = set(b()) + cnames = anames.intersection(bnames) # common names + pats = args[3:] or ['.*'] + names = _parse_varnames( cnames, pats, options.epsilon, options.missing ) + for name,epsilon,missing in names: + avar = a[name] + bvar = b[name] + nvar = noiz[name] + if missing is None: + amiss = a.missing_value(name) + bmiss = b.missing_value(name) + else: + amiss,bmiss = missing,missing + x = avar[:] + y = bvar[:] + z = nvar[:] + def scat(x,xn,y): + from pylab import plot,show,scatter + scatter(x,y) + show() + nfo = delta.rms_corr_withnoise(x,y,z,epsilon,(amiss,bmiss),plot=scat) + print '-'*32 + print name + for kv in sorted(nfo.items()): + print ' %s: %s' % kv def stats(*args): """create statistics summary of variables diff --git a/pyglance/glance/delta.py b/pyglance/glance/delta.py index 112364ff724ca194a286d8e9c68ea00bab5fc287..f536070f2b01cb82f35dc0294782e5f8b5d7f10c 100644 --- a/pyglance/glance/delta.py +++ b/pyglance/glance/delta.py @@ -9,6 +9,9 @@ Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved. import os, sys, logging from numpy import * +from scipy.stats import pearsonr, spearmanr, pointbiserialr + +compute_r = pointbiserialr LOG = logging.getLogger(__name__) @@ -44,6 +47,48 @@ def diff(a, b, epsilon=0., (amissing,bmissing)=(None,None)): trouble = (anfin ^ bnfin) | (amis ^ bmis) | (abs(d)>epsilon) return d, mask, trouble, (anfin, bnfin), (amis, bmis) +def corr(x,y,mask): + "compute correlation coefficient" + gf = mask.flatten() + xf = x.flatten()[gf] + yf = y.flatten()[gf] + assert(sum(~isfinite(yf))==0) + assert(sum(~isfinite(xf))==0) + return compute_r(xf,yf)[0] + +def rms_corr_withnoise(truth, actual, noiz, epsilon=0., (amissing,bmissing)=(None,None), plot=None): + """ compute RMS and R statistics for truth vs actual and truth+noiz vs actual + """ + x=truth + y=actual + z=noiz + d,good,bad,_,_ = diff(x,y,epsilon,(amissing,bmissing)) + # compute RMS error + rmse = sqrt(sum(d[good]**2)) / d.size + gf = good.flatten() + # raise NotImplementedError # FIXME we're getting NaN for R + xf = x.flatten()[gf] + yf = y.flatten()[gf] + assert(sum(~isfinite(yf))==0) + assert(sum(~isfinite(xf))==0) + # compute correlation coefficient + r = compute_r(xf,yf)[0] + # create xpn, x plus noise + xpn = array(x) + xpn[good] += z[good] + xpnf = xpn.flatten()[gf] + # compute RMS error versus noise + dpn,good,bad,_,_ = diff(xpn,y,epsilon,(amissing,bmissing)) + rmsepn = sqrt(sum(dpn[good]**2)) / d.size + assert(sum(~isfinite(xpnf))==0) + rpn = compute_r(xpnf,yf)[0] + if plot: plot(xf,xpnf,yf) + return { 'rms_error': rmse, + 'correlation': r, + 'rms_error_with_noise': rmsepn, + 'correlation_with_noise': rpn, + } + def stats(dif, mask, bad, *etc): rms = sum(abs(dif[mask] ** 2)) / dif.size return { 'rms_diff': rms, @@ -52,7 +97,6 @@ def stats(dif, mask, bad, *etc): 'median_diff': median(dif[mask]) } - def summarize(a, b, epsilon=0., (amiss,bmiss)=(None,None)): """return dictionary of similarity statistics stats not including 'nan' in name exclude nans in either arrays @@ -64,6 +108,7 @@ def summarize(a, b, epsilon=0., (amiss,bmiss)=(None,None)): a_nans = isnan(a) b_nans = isnan(b) n = a.size + r = corr(a,b,mask) out = { 'a_xor_b_finite_count': a_xor_b_finite, 'finite_count': sum(mask), @@ -75,7 +120,8 @@ def summarize(a, b, epsilon=0., (amiss,bmiss)=(None,None)): 'a_nan_count': sum(a_nans), 'b_nan_count': sum(b_nans), 'a_and_b_nan_count': sum(a_nans & b_nans), - 'shape': a.shape + 'shape': a.shape, + 'correlation': r } out.update(stadic) return out diff --git a/pyglance/glance/io.py b/pyglance/glance/io.py index f0e0594d21047d5622582febda3c6f37f97b48ed..e69cdff0ed3d9a400cf514fdd359c4bf81d6d1d8 100644 --- a/pyglance/glance/io.py +++ b/pyglance/glance/io.py @@ -37,7 +37,7 @@ class hdf(SD): return getattr(self.select(name),'_FillValue',None) -FIXME_IDPS = [ '/All_Data/CrIS-SDR_All/ES' + ri + band for ri in ['Real','Imag'] for band in ['LW','MW','SW'] ] +FIXME_IDPS = [ '/All_Data/CrIS-SDR_All/ES' + ri + band for ri in ['Real','Imaginary'] for band in ['LW','MW','SW'] ] class h5(object):