Skip to content
Snippets Groups Projects
Commit b1f35a55 authored by Nick Bearson's avatar Nick Bearson
Browse files

Merge branch '5-handle-fewer-than-nominal-number-of-inputs-differently' into 'master'

group inputs by minute and process accordingly

See merge request !26
parents 840520fe d9e6fe95
No related branches found
Tags v1.0-20220512.1
1 merge request!26group inputs by minute and process accordingly
......@@ -118,6 +118,11 @@ def get_goes_position(filenames):
# we require that all files are from the same sensor and raise an exception if not
raise ValueError("could not determine GOES position - did you provide a mix of satellites?")
def glm_filename_to_minute(glm_filename):
    """Return the start time encoded in a GLM filename, truncated to the minute.

    Seconds and microseconds are zeroed so that files belonging to the same
    minute compare equal.
    """
    info = parse_glm_filename(os.path.basename(glm_filename))
    # element 3 of the parsed tuple is the file's start datetime
    start = info[3]
    return start.replace(second=0, microsecond=0)
def get_start_end(filenames, start_time=None, end_time=None):
"""Compute start and end time of data based on filenames."""
......@@ -187,7 +192,7 @@ def get_outpath_base(args):
return dsname
def grid_setup(args, work_dir=os.getcwd()):
def grid_setup(glm_files, args, work_dir=os.getcwd()):
# When passed None for the minimum event or group counts, the gridder will skip
# the check, saving a bit of time.
min_events = None
......@@ -216,10 +221,10 @@ def grid_setup(args, work_dir=os.getcwd()):
exit(1)
try:
start_time, end_time = get_start_end(args.filenames)
start_time, end_time = get_start_end(glm_files)
except:
log.error("Could not parse start & end times from one or more of the files provided:")
log.error(", ".join(args.filenames))
log.error(", ".join(glm_files))
exit(1)
base_date = datetime(start_time.year, start_time.month, start_time.day)
......@@ -227,7 +232,7 @@ def grid_setup(args, work_dir=os.getcwd()):
outputpath = os.path.join(work_dir, get_outpath_base(args)) # GLMTools expects a template in addition to the path
goes_position = get_goes_position(args.filenames)
goes_position = get_goes_position(glm_files)
if "meso" in args.goes_sector:
view = "meso"
......@@ -296,7 +301,7 @@ def grid_setup(args, work_dir=os.getcwd()):
if (proj_name == 'pixel_grid') or (proj_name == 'geos'):
grid_kwargs['pixel_coords'] = fixed_grid
grid_kwargs['ellipse_rev'] = -1 # -1 (default) = infer from date in each GLM file
return gridder, args.filenames, start_time, end_time, grid_kwargs
return gridder, glm_files, start_time, end_time, grid_kwargs
def get_cspp_gglm_version():
......@@ -338,6 +343,33 @@ if __name__ == '__main__':
log.info("Starting GLM Gridding")
log.debug("Starting script with: %s", sys.argv)
# handle the realtime flag
if args.realtime:
    # Realtime mode is triggered with exactly one file; its siblings from the
    # same minute are discovered below by globbing next to it.
    if len(args.filenames) != 1:
        log.error("realtime mode only accepts one input file")
        exit(1)
    # NOTE(review): assumes parse_glm_filename returns a tuple where elements
    # 0-2 are filename prefix fields and element 3 is the start datetime,
    # matching its use in glm_filename_to_minute -- confirm.
    glminfo = parse_glm_filename(os.path.basename(args.filenames[0]))
    # Build a glob matching every GLM file whose start time falls in the same
    # minute (start-time precision in the pattern is YYYYjjjHHMM).
    globstring = "{}_{}_{}_s{}*".format(glminfo[0], glminfo[1], glminfo[2], glminfo[3].strftime("%Y%j%H%M"))
    fileglob = glob(os.path.join(os.path.dirname(args.filenames[0]), globstring))
    # A complete minute consists of three GLM files; if fewer (or more) are
    # present we exit cleanly and expect to be re-run when the minute is full.
    if len(fileglob) != 3:
        log.error("There are not (yet) three GLM files from this minute. This may be expected. Exiting.")
        exit(0)
    # this allows a user to use realtime mode to process a large directory of GLM without
    # creating the same output file multiple times
    if sorted(fileglob)[-1] != args.filenames[0]:
        log.error("This is not the last file from this minute. Exiting.")
        exit(0)
    # Replace the single trigger file with the full set for this minute.
    args.filenames = fileglob

# check that all of our inputs exist
for f in args.filenames:
    if not os.path.exists(f):
        log.error("Tried to grid file that does not exist: {}".format(f))
        exit(1)

# set up output dir
os.makedirs(args.output_dir, exist_ok=True)
......@@ -347,63 +379,82 @@ if __name__ == '__main__':
# clean our temporary dir on exit
atexit.register(shutil.rmtree, tempdir_path)
# do the gridding
gridder, glm_filenames, start_time, end_time, grid_kwargs = grid_setup(args, work_dir=tempdir_path)
gridder_return = gridder(glm_filenames, start_time, end_time, **grid_kwargs)
gridded_files = []
for subgrid in gridder_return:
for gridded_file in subgrid[1]:
gridded_files.append(gridded_file)
# we need to add attributes here due to an issue where satpy (or its dependencies) are
# holding the input gridded file open until the process exits
for f in gridded_files:
add_gglm_attrs(f, glm_filenames)
# (optionally) do tiling
if args.create_tiles:
sector = get_goes_position(glm_filenames)
if sector == "east":
sector_id = "GOES_EAST"
elif sector == "west":
sector_id = "GOES_WEST"
else:
raise RuntimeError("could not determine sector_id")
from satpy import Scene
for gridded_file in gridded_files:
log.info("TILING: {}".format(gridded_files))
scn = Scene(reader='glm_l2', filenames=[gridded_file]) # n.b. satpy requires a list of filenames
scn.load([
'DQF',
'flash_extent_density',
'minimum_flash_area',
'total_energy',
])
scn.save_datasets(writer='awips_tiled',
template='glm_l2_radf',
sector_id=sector_id, # sector_id becomes an attribute in the output files and may be another legacy kind of thing. I'm not sure how much is is actually used here.
source_name="", # You could probably make source_name an empty string. I think it is required by the writer for legacy reasons but isn't actually used for the glm output
base_dir=tempdir_path, # base_dir is the output directory. I think blank is the same as current directory.
tile_size=(506, 904), # tile_size is set to the size of the GLMF sample tiles we were given and should match the full disk ABI tiles which is what they wanted
check_categories=False, # check_categories is there because of that issue I mentioned where DQF is all valid all the time so there is no way to detect empty tiles unless we ignore the "category" products
environment_prefix=args.system_environment_prefix_tiles,
compress=True)
# pick up output files from the tempdir
# output looks like: CG_GLM-L2-GLMC-M3_G17_T03_20200925160040.nc
log.debug("files in {}".format(tempdir_path))
log.debug(os.listdir(tempdir_path))
log.debug("moving output to {}".format(args.output_dir))
tiled_path = os.path.join(tempdir_path, '{}_GLM-L2-GLM*-M?_G??_T??_*.nc'.format(args.system_environment_prefix_tiles))
tiled_files = glob(tiled_path)
for f in tiled_files:
add_gglm_attrs(f, glm_filenames)
shutil.move(f, os.path.join(args.output_dir, os.path.basename(f)))
for f in gridded_files:
shutil.move(f, os.path.join(args.output_dir, os.path.basename(f)))
# Collect the distinct minute-truncated start times present in the inputs,
# preserving first-seen order so output is generated in input order.
minutes = []
for f in args.filenames:
    m = glm_filename_to_minute(f)
    if m not in minutes:
        minutes.append(m)

# Grid each minute independently: a minute is only gridded when all three of
# its GLM input files are present.
for m in minutes:
    # grab all input files for this minute
    minute_files = []
    for f in args.filenames:
        if glm_filename_to_minute(f) == m:
            minute_files.append(f)
    minute_files = sorted(minute_files)

    # do we have three input files for this minute?
    if len(minute_files) != 3:
        log.error("Minute {} only has {} input file(s). A gridded file will not be generated.".format(m, len(minute_files)))
        continue

    # do the gridding
    gridder, glm_filenames, start_time, end_time, grid_kwargs = grid_setup(minute_files, args, work_dir=tempdir_path)
    gridder_return = gridder(glm_filenames, start_time, end_time, **grid_kwargs)

    # Flatten the gridder's return value (a sequence of subgrids, each of
    # which carries its output filenames in position 1) into one list.
    gridded_files = []
    for subgrid in gridder_return:
        for gridded_file in subgrid[1]:
            gridded_files.append(gridded_file)

    # we need to add attributes here due to an issue where satpy (or its dependencies) are
    # holding the input gridded file open until the process exits
    for f in gridded_files:
        add_gglm_attrs(f, glm_filenames)

    # (optionally) do tiling
    if args.create_tiles:
        sector = get_goes_position(glm_filenames)
        if sector == "east":
            sector_id = "GOES_EAST"
        elif sector == "west":
            sector_id = "GOES_WEST"
        else:
            raise RuntimeError("could not determine sector_id")

        from satpy import Scene
        for gridded_file in gridded_files:
            # BUGFIX: log the file being tiled, not the whole list on every
            # iteration (was: .format(gridded_files)).
            log.info("TILING: {}".format(gridded_file))
            scn = Scene(reader='glm_l2', filenames=[gridded_file])  # n.b. satpy requires a list of filenames
            scn.load([
                'DQF',
                'flash_extent_density',
                'minimum_flash_area',
                'total_energy',
            ])
            scn.save_datasets(writer='awips_tiled',
                              template='glm_l2_radf',
                              sector_id=sector_id,  # sector_id becomes an attribute in the output files and may be another legacy kind of thing. I'm not sure how much is actually used here.
                              source_name="",  # You could probably make source_name an empty string. I think it is required by the writer for legacy reasons but isn't actually used for the glm output
                              base_dir=tempdir_path,  # base_dir is the output directory. I think blank is the same as current directory.
                              tile_size=(506, 904),  # tile_size is set to the size of the GLMF sample tiles we were given and should match the full disk ABI tiles which is what they wanted
                              check_categories=False,  # check_categories is there because of that issue I mentioned where DQF is all valid all the time so there is no way to detect empty tiles unless we ignore the "category" products
                              environment_prefix=args.system_environment_prefix_tiles,
                              compress=True)

        # pick up output files from the tempdir
        # output looks like: CG_GLM-L2-GLMC-M3_G17_T03_20200925160040.nc
        log.debug("files in {}".format(tempdir_path))
        log.debug(os.listdir(tempdir_path))
        log.debug("moving output to {}".format(args.output_dir))
        tiled_path = os.path.join(tempdir_path, '{}_GLM-L2-GLM*-M?_G??_T??_*.nc'.format(args.system_environment_prefix_tiles))
        tiled_files = glob(tiled_path)
        for f in tiled_files:
            add_gglm_attrs(f, glm_filenames)
            shutil.move(f, os.path.join(args.output_dir, os.path.basename(f)))

    # Move this minute's gridded files out of the tempdir into the final
    # output directory.
    for f in gridded_files:
        shutil.move(f, os.path.join(args.output_dir, os.path.basename(f)))

# tempdir cleans itself up via atexit, above
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment