# "Newer" / "Older" — stray repository-browser navigation text, not source
# code; kept as a comment so the module parses.
# encoding: utf-8
__copyright__ = "Copyright (C) 2015 University of Wisconsin SSEC. All rights reserved."
import io
import os
from datetime import datetime

from edosl0util.stream import jpss_packet_stream
def split_stream(stream, minutes):
    """Split a VIIRS L0 PDS data stream into data blobs based on their scan
    time mod the number of minutes provided.

    :param stream: Iterable of packets (e.g. a ``jpss_packet_stream``), each
        having a ``stamp`` attribute (datetime or None) and a ``bytes()``
        method returning its raw bytes.
    :param minutes: integer number of minutes per bucket; buckets are aligned
        to multiples of ``minutes * 60`` seconds since the epoch.
    :return: Generator of ``(bucket_timestamp, packet_count, data)`` tuples,
        where ``bucket_timestamp`` is the unix time of the bucket start and
        ``data`` is a bytearray of the packets in that bucket.
    """
    buf = bytearray()  # buffer for a single data file until it is written
    cur_bucket = 0  # unix timestamp of the start of the current bucket
    pkt_count = 0  # packets accumulated in the current bucket
    for pkt in stream:
        # do the bucketing based on secondary header timestamps; packets
        # without a secondary header (stamp is None) ride along in the
        # current bucket — NOTE(review): reconstructed behavior, confirm
        # against upstream edosl0util
        if pkt.stamp:
            hdrtime = (pkt.stamp - datetime(1970, 1, 1)).total_seconds()
            pkt_bucket = hdrtime - hdrtime % (minutes * 60)
            if cur_bucket == 0:
                # first timestamped packet establishes the initial bucket
                cur_bucket = pkt_bucket
            if pkt_bucket > cur_bucket:
                # crossed into a new bucket: flush the finished one
                yield cur_bucket, pkt_count, buf
                pkt_count = 0
                buf = bytearray()
            cur_bucket = pkt_bucket
        pkt_count += 1
        buf += pkt.bytes()
    # flush whatever remains (final partial bucket)
    yield cur_bucket, pkt_count, buf
def _replace_pdsname_stamp(filename, stamp):
# P1570769AAAAAAAAAAAAAS15208032721001.PDS
#
# NOTE: It seems that EDOS uses the file_id column for fractional seconds.
# We just zero this out since the bucket should be on even seconds.
pat = "{}{}0{}".format(filename[:22], "%y%j%H%M%S", filename[-6:])
def _filename_for_splitfile(filename, stamp, minutes):
# P1570769AAAAAAAAAAAAAS15208032721001.PDS
#
# NOTE: It seems that EDOS uses the file_id column for fractional seconds.
# We just zero this out since the bucket should be on even seconds.
pat = "{}{}{}{}0{}".format(
filename[:20], minutes, filename[21], "%y%j%H%M%S", filename[-6:]
def split_file(filepath, minutes, destdir):
    """
    Split a level0 PDS file into X minutes files by filename.

    :param filepath: Path to a Level0 PDS file, with a standard L0 PDS filename.
    :param minutes: Number of minutes per bucket. Buckets always start at the
        top of the hour. For example, a bucket size of 6 will create 10 6-min
        buckets starting at minutes 0, 6, 12, etc ...
    :param destdir: Where the output files are to be written. NOTE: it is likely
        there will be filename collisions between time-based files and generated
        files, so make sure `destdir` does not contain a time-base input file.
    :raises RuntimeError: If a file exists with the same name of a bucket file.
    """
    destdir = destdir or "."
    # Bucket the packet stream by scan time (see split_stream); each item is
    # (bucket unix timestamp, packet count, raw packet bytes)
    stream = split_stream(jpss_packet_stream(io.open(filepath, "rb")), minutes)
    for timestamp, pkts, blob in stream:
        # bucket timestamps are even seconds, so no fractional part is lost
        stamp = datetime.utcfromtimestamp(timestamp)
        dirname, filename = os.path.split(filepath)
        newname = _filename_for_splitfile(filename, stamp, minutes)
        # NOTE(review): if destdir is an absolute path, os.path.join discards
        # dirname — presumably intended; confirm with callers
        dstpath = os.path.join(dirname, destdir, newname)
        # refuse to clobber an existing file (see docstring NOTE on collisions)
        if os.path.exists(dstpath):
            raise RuntimeError(
                (
                    "File already exists. "
                    "Bucket file possibly colliding with input file."
                ),
                dstpath,
            )
        with io.open(dstpath, "wb") as fptr: