#!/usr/bin/env python
# encoding: utf-8
"""
Routines to do assorted difference and comparison calculations and statistics

Created by rayg Apr 2009.
Copyright (c) 2009 University of Wisconsin SSEC. All rights reserved.
"""

import logging
import math
import numpy

from scipy.stats import pearsonr

LOG = logging.getLogger(__name__)

# radius we will assume for the spherical model of the earth when
# calculating the great circle distance, in km
SPHERICAL_EARTH_RADIUS = 6373.0

# -------------- generic data manipulation and analysis --------------

# TODO have someone double check the math here
def great_circle_distance (latitudeA, longitudeA, latitudeB, longitudeB) :
    """
    Calculate the great circle distance (in km) between points A and B,
    given in the input parameters; the inputs are expected to be in degrees.
    
    note: This method uses the spherical law of cosines, which can lose
    precision for very small distances.
    """
    
    # convert to radians
    latARad = math.radians(latitudeA)
    lonARad = math.radians(longitudeA)
    latBRad = math.radians(latitudeB)
    lonBRad = math.radians(longitudeB)
    
    # clamp the law-of-cosines term to acos's valid domain (guards against
    # floating point rounding for nearly identical points)
    cosOfAngle = min(1.0, max(-1.0, math.sin(latARad) * math.sin(latBRad) +
                                    math.cos(latARad) * math.cos(latBRad) *
                                    math.cos(lonBRad - lonARad)))
    distToReturn = math.acos(cosOfAngle) * SPHERICAL_EARTH_RADIUS
    
    return distToReturn
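
# Illustrative usage sketch: how great_circle_distance might be called with
# latitude/longitude pairs in degrees; the coordinates below are arbitrary
# example assumptions, not validated reference points.
def _example_great_circle_usage() :
    pointA  = (43.07, -89.40)   # assumed example coordinates
    pointB  = (42.36, -71.06)   # assumed example coordinates
    dist_km = great_circle_distance(pointA[0], pointA[1], pointB[0], pointB[1])
    LOG.debug("example great circle distance: %f km", dist_km)
    return dist_km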

def min_with_mask(data, goodMask=None) :
    """
    get the minimum of the values in data,
    inspecting only the values selected by the goodMask
    
    if no mask is passed and a masked array is given, the
    unmasked values of that array are treated as good;
    if no mask is passed and any other kind of data is
    given, all values are assumed to be good
    """
    
    # if the data array is a masked array and we weren't given a mask,
    # treat its unmasked values as the good ones; with no mask information
    # at all, assume every value is good
    if goodMask is None :
        if isinstance(data, numpy.ma.MaskedArray) :
            goodMask = ~numpy.ma.getmaskarray(data)
        else :
            goodMask = numpy.ones(data.shape, dtype=bool)
    
    # select only the good data values
    goodData = data[goodMask]
    
    # if we have any good data, get the minimum
    toReturn = numpy.min(goodData) if goodData.size > 0 else None
    
    return toReturn

def max_with_mask(data, goodMask=None) :
    """
    get the maximum of the values in data,
    inspecting only the values selected by the goodMask
    
    if no mask is passed and a masked array is given, the
    unmasked values of that array are treated as good;
    if no mask is passed and any other kind of data is
    given, all values are assumed to be good
    """
    
    # if the data array is a masked array and we weren't given a mask,
    # treat its unmasked values as the good ones; with no mask information
    # at all, assume every value is good
    if goodMask is None :
        if isinstance(data, numpy.ma.MaskedArray) :
            goodMask = ~numpy.ma.getmaskarray(data)
        else :
            goodMask = numpy.ones(data.shape, dtype=bool)
    
    # select only the good data values
    goodData = data[goodMask]
    
    # if we have any good data, get the maximum
    toReturn = numpy.max(goodData) if goodData.size > 0 else None
    
    return toReturn
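
# Illustrative usage sketch: build a "good" mask of finite values and use it
# with min_with_mask and max_with_mask; the data values are arbitrary example
# assumptions.
def _example_min_max_with_mask() :
    data = numpy.array([3.0, numpy.nan, 7.0, -2.0])
    good = numpy.isfinite(data)
    return min_with_mask(data, good), max_with_mask(data, good)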

def compute_correlation(xData, yData, goodMask, compute_r_function=pearsonr):
    """
    compute the correlation coefficient of two data sets
    given a mask describing good data values in the sets
    """
    
    # make sure our data sets and mask are the same shape
    assert(xData.shape == yData.shape)
    assert(xData.shape == goodMask.shape)
    
    # pull out just the good data
    good_x_data = xData[goodMask]
    good_y_data = yData[goodMask]
    
    # make sure that there is no remaining bad data
    assert(numpy.all(numpy.isfinite(good_x_data)))
    assert(numpy.all(numpy.isfinite(good_y_data)))
    
    # if we have enough data, try to build the correlation
    toReturn = numpy.nan
    if (good_x_data.size >= 2) and (good_y_data.size >= 2) :
        toReturn = compute_r_function(good_x_data, good_y_data)[0]
    
    return toReturn
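
# Illustrative usage sketch: correlate two small arrays, masking out the point
# where one set has a NaN so only finite pairs reach the r calculation; the
# values are arbitrary example assumptions.
def _example_compute_correlation() :
    x    = numpy.array([1.0, 2.0, 3.0, 4.0])
    y    = numpy.array([1.1, 1.9, numpy.nan, 4.2])
    good = numpy.isfinite(x) & numpy.isfinite(y)
    return compute_correlation(x, y, good)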

def calculate_root_mean_square (data, goodMask=None) :
    """
    calculate the root mean square of the data,
    optionally selecting only the points in the given
    goodMask; if no mask is given, all points will
    be used
    """
    
    # select the points to use and count them
    goodData      = data[goodMask] if goodMask is not None else data
    numGoodPoints = goodData.size
    
    # if there are no valid points, the root mean square is undefined
    if numGoodPoints == 0 :
        return numpy.nan
    
    rootMeanSquare = numpy.sqrt( numpy.sum( goodData ** 2 ) / numGoodPoints )
    
    return rootMeanSquare
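
# Illustrative usage sketch: compute the RMS of only the valid points by
# masking out an assumed fill value of -9999.0; both the data and the fill
# value are arbitrary example assumptions.
def _example_root_mean_square() :
    data = numpy.array([1.0, 2.0, -9999.0, 2.0])
    good = data > -9999.0
    return calculate_root_mean_square(data, good)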

# TODO, should the name of this function be changed?
def convert_mag_dir_to_U_V_vector(magnitude_data, direction_data, invalidMask=None, offset_degrees=180):
    """
    This method is intended to convert magnitude and direction data into (U, V) vector data.
    An invalid mask may be given if some of the points in the set should be masked out.
    
    TODO, this method is not fully tested
    """
    
    if invalidMask is None :
        invalidMask = numpy.zeros(magnitude_data.shape, dtype=bool)
    
    new_direction_data = direction_data[:] + offset_degrees
    
    LOG.debug ("direction data: " + str(new_direction_data[~invalidMask]))
    
    uData = numpy.zeros(magnitude_data.shape, dtype=float)
    uData[invalidMask]  = numpy.nan
    uData[~invalidMask] = magnitude_data[~invalidMask] * numpy.sin (deg2rad(new_direction_data[~invalidMask]))
    
    vData = numpy.zeros(magnitude_data.shape, dtype=float)
    vData[invalidMask]  = numpy.nan
    vData[~invalidMask] = magnitude_data[~invalidMask] * numpy.cos (deg2rad(new_direction_data[~invalidMask]))
    
    return uData, vData
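
# Illustrative usage sketch: convert assumed speed / direction arrays into
# U and V components, masking out one invalid point; the default 180 degree
# offset is left as-is and the input values are arbitrary example assumptions.
def _example_mag_dir_to_u_v() :
    speed     = numpy.array([ 5.0, 10.0,   3.0])
    direction = numpy.array([ 0.0, 90.0, 270.0])
    invalid   = numpy.array([False, False, True])
    return convert_mag_dir_to_U_V_vector(speed, direction, invalidMask=invalid)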

# ------------- bin/tuple related functions --------------------

class BinTupleMapping (object) :
    """
    This class represents a bin / tuple data remapping.
    It encapsulates information about the dimensions that are considered the
    bin and tuple dimensions and is able to transform data into the
    [bin][case][tuple] form. It also allows for the reverse calculation of
    indexes so that you can recreate positioning information in the original
    data set based on the new shape of the case dimension. 


    internal instance variables:
    
    bin_dimension_index   - the original index of the bin dimension
    tuple_dimension_index - the original index of the tuple dimension
    
    original_data_shape   - the shape the data was before it was reordered
    new_data_shape        - the data shape after it's been reordered
    
    new_index_order       - a mapping that lists the order of the new dimension indexes
    original_case_shape   - the shape of the case dimension(s) before being flattened
    reverse_case_index    - a reverse index for finding the original positions of
                            flattened case indexes
    
    TODO, in the long run, find a way to get rid of the reverse_case_index
    """
    
    def __init__ (self, dataShape, binIndexNumber=0, tupleIndexNumber=None) :
        """
        Given information on the original data and the desired bin/tuple,
        build the mapping object
        """
        
        # minimally, we need to have a shape
        assert(dataShape is not None)
        
        # get the number of dimensions present in our data
        numberOfDimensions = len(dataShape)
        
        # is our shape ok?
        assert(numberOfDimensions >=2)
        
        self.original_data_shape = dataShape
        
        # set up our tuple if it wasn't selected
        if (tupleIndexNumber is None) :
            tupleIndexNumber = numberOfDimensions - 1
        
        # are the bin and tuple ok?
        assert(binIndexNumber is not None)
        assert(binIndexNumber   >= 0)
        assert(binIndexNumber   < numberOfDimensions)
        assert(tupleIndexNumber >= 0)
        assert(tupleIndexNumber < numberOfDimensions)
        
        self.bin_dimension_index   = binIndexNumber
        self.tuple_dimension_index = tupleIndexNumber
        
        # get the new index ordering for the data
        self.new_index_order     = BinTupleMapping._make_new_index_list(numberOfDimensions,
                                                                        self.bin_dimension_index,
                                                                        self.tuple_dimension_index)
        # work out what the data shape will look like after the reordering
        temp_data_shape          = tuple(dataShape[index] for index in self.new_index_order)
        self.original_case_shape = temp_data_shape[1:-1]
        
        # figure out the new size with the flattened cases
        number_of_cases     = 0
        self.new_data_shape = (temp_data_shape[0], temp_data_shape[-1])
        if len(self.original_case_shape) > 0 :
            number_of_cases = numpy.multiply.accumulate(self.original_case_shape)[-1]
            self.new_data_shape = (temp_data_shape[0], number_of_cases, temp_data_shape[-1])
        
        # build the reverse index for looking up flat case indexes
        self.reverse_case_index     = None
        if len(self.original_case_shape) > 0 :
            self.reverse_case_index = numpy.arange(number_of_cases).reshape(self.original_case_shape)
    
    @staticmethod
    def _make_new_index_list(numberOfIndexes, firstIndexNumber, lastIndexNumber) :
        """
        a utility method to make a list of index numbers for reordering a
        multi-dimensional array
        
        the first and last index numbers represent the dimensions you want to be
        first and last (respectively) when the list is reordered; any other indexes
        will retain their relative ordering
        
        Note: This is a private method of the BinTupleMapping class and assumes
        that the index numbers passed to it will have been preverified to be
        acceptable.
        """
        
        # make the new list
        newIndexList = list(range(numberOfIndexes))
        
        # remove our two "important" indexes, in the correct order
        maxSpecial   = max(firstIndexNumber, lastIndexNumber)
        minSpecial   = min(firstIndexNumber, lastIndexNumber)
        del(newIndexList[maxSpecial])
        del(newIndexList[minSpecial])
        
        # add our two important indexes back into the list in their new places
        newIndexList = [firstIndexNumber] + newIndexList + [lastIndexNumber]
        
        return newIndexList
    
    def reorder_for_bin_tuple (self, data) :
        """
        reorder the data so that the bin index is first, the tuple index is last,
        and any additional dimensions are flattened into a middle "case" index
        
        the reordered data and the shape of flattened case indexes will be returned
        (note: the shape of the data must match the shape with which the BinTupleMapping
        object was originally constructed)
        """
        
        assert(data.shape == self.original_data_shape)
        
        # put the bin and tuple dimensions in the correct places
        newData = data.transpose(self.new_index_order)
        
        # flatten the case dimensions
        newData = newData.reshape(self.new_data_shape)
        
        return newData
    
    def determine_case_indecies (self, flatIndex) :
        """
        determine the original indexes of the case from the flat case index number
        
        Note: this method requires the object to hold a large data structure
        TODO, find a better way of doing this? does numpy guarantee reshaping strategy?
        TODO, can I find information on reshape and do this with pure math?
        """
        
        if self.reverse_case_index is None :
            return None
        
        # find the flat index in our reverse case index
        positionOfIndex = numpy.where(self.reverse_case_index == flatIndex)
        
        return positionOfIndex
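
# Illustrative usage sketch: remap an assumed 3D array so dimension 1 becomes
# the bin dimension and dimension 2 the tuple dimension, flattening the
# remaining dimension into the case dimension, then look up the original
# position of a flat case index; the array contents are arbitrary assumptions.
def _example_bin_tuple_mapping() :
    data      = numpy.arange(24).reshape((2, 3, 4))
    mapping   = BinTupleMapping(data.shape, binIndexNumber=1, tupleIndexNumber=2)
    reordered = mapping.reorder_for_bin_tuple(data)    # reordered shape is (3, 2, 4)
    position  = mapping.determine_case_indecies(1)     # original position of flat case index 1
    return reordered, position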

if __name__=='__main__':
    import doctest
    doctest.testmod()