# python3
# Copyright 2021 University of Wisconsin Regents
#
# This file is part of csppfetch.
#
# csppfetch is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# csppfetch is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with csppfetch. If not, see <https://www.gnu.org/licenses/>.
import datetime as dt
import csppfetch
import csppfetch.daterange
from csppfetch.roundtozero import roundtozero
################################################################################
#
# Common settings
#
PACKAGE_ENV_ID = 'CSPP_GEO_AITF_ANCIL_'
CSPP_GEO_AITF_URL_BASE = "https://geodb.ssec.wisc.edu/ancillary/"
CSPP_GEO_AITF_URL_DIR_PART = "%Y_%m_%d_%j/"
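# For example (pure strftime on the pattern above):
#   dt.datetime(2019, 5, 27).strftime(CSPP_GEO_AITF_URL_DIR_PART)
#   == "2019_05_27_147/"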
################################################################################
#
# Data: AVHRR Sea Surface Temperature
#
# This is a relatively basic use of csppfetch.Downloader.
SST_FILENAME = "avhrr-only-v2.%Y%m%d{priority}.nc"
SST_LOCAL_DIR = 'oisst_daily/'
SST = csppfetch.Downloader(
    name = "AVHRR Sea Surface Temperature",
    package_env_id = PACKAGE_ENV_ID,
    url_base = CSPP_GEO_AITF_URL_BASE,
    url_relative = CSPP_GEO_AITF_URL_DIR_PART+SST_FILENAME,
    local = SST_LOCAL_DIR+SST_FILENAME,
    period = dt.timedelta(days=1),
    epoch_start = dt.datetime(2010,1,1,0,0,0),
    priorities = ['', '_preliminary'],
    oldest_usable = dt.timedelta(days=7),
    expected_newest = dt.timedelta(hours=1)
)
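# For 2019-05-27, the patterns above expand to (illustrative, derived by
# hand rather than by running csppfetch):
#   URL:   https://geodb.ssec.wisc.edu/ancillary/2019_05_27_147/avhrr-only-v2.20190527.nc
#   local: oisst_daily/avhrr-only-v2.20190527.nc
# with the '_preliminary' priority inserted before ".nc" when the final
# file isn't out yet:
#   avhrr-only-v2.20190527_preliminary.nc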
################################################################################
#
# Data: Global Forecast System
#
# GFS is unusual; we need a pair of forecasts that bracket the time we
# want to process, while a full cache is just a pile of every file we
# might need. Only accept forecasts 3 through 12 hours out.
#
# So, to process 2019-05-27 22:00Z, we could use either
#   gfs.t18z.190527.pgrb2f03 + gfs.t18z.190527.pgrb2f06 or
#   gfs.t12z.190527.pgrb2f09 + gfs.t12z.190527.pgrb2f12
# We can NOT use
#   gfs.t06z.190527.pgrb2f15 + gfs.t06z.190527.pgrb2f18
# as we won't use forecasts more than 12 hours out.
class GFSDownloader(csppfetch.Downloader):
    def __init__(self):
        remote_filename = "gfs.t%Hz.%y%m%d.pgrb2f{priority}"
        local_filename = "gfs.t%Hz.pgrbf{priority}"
        localdir = "gfs_grib2_0.5deg/%Y/%m/%d/"
        # Why 5 hours?
        #
        # Examination of geodb in 2019 covering 2019-03-01 through 2019-06-12
        # and 2021 covering 2021-07-02 through 2021-09-01 suggests the data
        # is usually available within about 65 minutes, and is almost always
        # available within 4 hours and 30 minutes. Round up to 5 arbitrarily.
        #
        # As we don't really have a use for the data until 3 hours after
        # nominal generation and don't _need_ it until 6 hours, this seems
        # a reasonable number.
        expected_newest = dt.timedelta(hours=5)
        super().__init__(
            name = "Global Forecast System",
            package_env_id = PACKAGE_ENV_ID,
            url_base = CSPP_GEO_AITF_URL_BASE,
            url_relative = CSPP_GEO_AITF_URL_DIR_PART+remote_filename,
            local = localdir+local_filename,
            period = dt.timedelta(hours=6),
            epoch_start = dt.datetime(2010,1,1,0,0,0),
            expected_newest = expected_newest,
        )
        # Fields we're not using:
        #   oldest_usable - Not meaningful; the limit is how far into the
        #                   future of a forecast we're willing to use,
        #                   substituted for "{priority}" in the filenames
        #                   above.
        # Forecasts are available this often:
        self.forecast_step = dt.timedelta(hours=3)
        # and the first forecast is at this offset (the starting point for
        # forecast steps):
        self.first_forecast = dt.timedelta(hours=0)
        # Given the above, valid forecasts are at 0, 3, 6,... hours.
        # We can use forecasts from this far into the future...
        self.shortest_valid_forecast = dt.timedelta(hours=3)
        # ...through this far into the future.
        self.longest_valid_forecast = dt.timedelta(hours=12)
        # Given that, valid forecasts are 3, 6, 9, and 12 hours.
        valid_multiple = (self.shortest_valid_forecast-self.first_forecast)/self.forecast_step
        if not valid_multiple.is_integer():
            raise RuntimeError(f"shortest_valid_forecast ({self.shortest_valid_forecast}) - first_forecast ({self.first_forecast}) is not an integer multiple of forecast_step ({self.forecast_step})")
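    # With the defaults above, the check passes: dividing two timedeltas
    # yields a float, and (3h - 0h) / 3h == 1.0, which is an integer.
    # (Illustrative arithmetic only.)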
    def url_to_file_for_time(self, time, forecast_hours):
        # Returns a single-entry {url: local_path} dict for the given model
        # run time and forecast offset.
        hours = int(forecast_hours / dt.timedelta(hours=1))
        hours_str = f"{hours:02d}"
        url = self._expand(self.url_relative, time, hours_str)
        local = self._expand(self.local, time, hours_str)
        return {url:local}
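    # url_to_file_for_time() expansion, illustrated (a sketch that presumes
    # _expand() applies strftime to the pattern and substitutes its third
    # argument for "{priority}", matching its use above):
    #   url_to_file_for_time(dt.datetime(2019, 5, 27, 18), dt.timedelta(hours=3))
    #   == {"https://geodb.ssec.wisc.edu/ancillary/2019_05_27_147/gfs.t18z.190527.pgrb2f03":
    #       "gfs_grib2_0.5deg/2019/05/27/gfs.t18z.pgrbf03"}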
    def get_cache_filesets(self, start = None, end = None):
        # A copy of Downloader's implementation, but we don't use
        # self._daterange; we call daterange.daterange directly so we can
        # pass in a step of self.forecast_step instead of self.period. We do
        # this because although our period is only every 6 hours, we change
        # which files someone might want every _3_. That is, to process 5Z
        # data we'd use the 0Z GFS data's 3 and 6 hour forecasts, but for 7Z
        # data we'd use the 0Z GFS data's 6 and 9 hour forecasts.
        #
        # Perhaps the ability to override the period here should be in the
        # parent class? Or maybe this case is too specialized to bother.
        if self.oldest_cache < dt.timedelta(days=0):
            raise ValueError(f"oldest_cache should not be negative; it is {self.oldest_cache}")
        if end is None: end = dt.datetime.now()
        if start is None: start = end - self.oldest_cache
        if start > end:
            raise ValueError(f"start ({start}) should be before end ({end})")
        fileset_list_list = []
        for time in csppfetch.daterange.daterange(end, start, -self.forecast_step, inclusive=True):
            fileset_list = self.get_filesets_for_time(time)
            fileset_list_list.append(fileset_list)
        return fileset_list_list
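    # get_cache_filesets() return shape, illustrated: one inner list per
    # 3-hour step from end back to start (newest first); each inner list
    # holds that step's candidate FileSet pairs, most recent model run
    # first.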
    def get_filesets_for_time(self, scan_time):
        generated_time = self._nearest_preceeding_time(scan_time)
        time_since_gen = scan_time - generated_time
        if time_since_gen < dt.timedelta(0):
            raise RuntimeError(f"self._nearest_preceeding_time({scan_time}) returned {generated_time}, but that's AFTER the start; should be BEFORE or SAME.")
        forecast_start = roundtozero(time_since_gen, self.forecast_step, self.first_forecast)
        # Step back to earlier model runs until the forecast offset is at
        # least shortest_valid_forecast. (There is a closed-form way to do
        # this, but the loop is simple and runs at most a few times.)
        while forecast_start < self.shortest_valid_forecast:
            forecast_start += self.period
            generated_time -= self.period
        filesets = []
        while (forecast_start+self.forecast_step) <= self.longest_valid_forecast:
            end = forecast_start + self.forecast_step
            urls_to_files = {}
            urls_to_files.update(self.url_to_file_for_time(generated_time, forecast_start))
            urls_to_files.update(self.url_to_file_for_time(generated_time, end))
            expected = self.is_expected(generated_time)
            startstr = str(int(forecast_start.total_seconds()/(60*60)))
            endstr   = str(int(          end.total_seconds()/(60*60)))
            description = f"{self.name} for {generated_time} forecast {startstr} and {endstr} hours into the future"
            fs = csppfetch.FileSet(urls_to_files, expected, description)
            filesets.append(fs)
            forecast_start += self.period
            generated_time -= self.period
        return filesets
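# Worked example (mirrors the header comment above; illustrative only):
# get_filesets_for_time(dt.datetime(2019, 5, 27, 22)) starts from the 18Z
# run (4 hours earlier, rounded down to the 3-hour forecast offset) and
# yields two candidate FileSets:
#   t18z f03 + f06  (brackets 22:00Z, i.e. 21Z through 00Z)
#   t12z f09 + f12  (the same wall-clock window, one model run earlier)
# The would-be t06z f15 + f18 pair is skipped because 18 hours exceeds
# longest_valid_forecast (12 hours).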
GFS = GFSDownloader()
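# Minimal smoke-test sketch (illustrative; uses only url_to_file_for_time()
# defined above): print the URL -> local-path mapping for one forecast pair.
if __name__ == "__main__":
    run = dt.datetime(2019, 5, 27, 18)
    for hours in (3, 6):
        print(GFS.url_to_file_for_time(run, dt.timedelta(hours=hours)))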