From f3f5ac6da082536ab359148c51c398a4c91cb6b1 Mon Sep 17 00:00:00 2001
From: davidh-ssec <david.hoese@ssec.wisc.edu>
Date: Tue, 28 Mar 2017 15:33:48 -0500
Subject: [PATCH] Add ability to list what variables to include in netcdf file

Includes fixes and cleanup for specifying multiple files versus a
single input file.
---
 aosstower/level_b1/nc.py | 121 ++++++++++++++++++++-------------------
 aosstower/schema.py      |   2 +-
 2 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/aosstower/level_b1/nc.py b/aosstower/level_b1/nc.py
index 2f3ba60..3218f5d 100644
--- a/aosstower/level_b1/nc.py
+++ b/aosstower/level_b1/nc.py
@@ -269,16 +269,17 @@ def average_over_interval(frame, interval_width):
     return out_frames
 
 
-def get_data(input_files):
-    frames = []
+def _get_data(input_files):
     for filename in input_files:
-        frame = pd.DataFrame(parser.read_frames(filename))
-        frame.set_index('stamp', inplace=True)
-        frame.mask(frame == -99999., inplace=True)
-        frame.fillna(value=np.nan, inplace=True)
-        frames.append(frame)
+        yield from parser.read_frames(filename)
+
 
-    return pd.concat(frames, axis=1, copy=False)
+def get_data(input_files):
+    frame = pd.DataFrame(_get_data(input_files))
+    frame.set_index('stamp', inplace=True)
+    frame.mask(frame == -99999., inplace=True)
+    frame.fillna(value=np.nan, inplace=True)
+    return frame
 
 
 def write_vars(nc_file, frame, database):
@@ -304,7 +305,7 @@ def write_vars(nc_file, frame, database):
     # writes data into file
     for varName in frame:
         if varName not in fileVar:
-            LOG.warning('Extraneous key: %s in frame' % varName)
+            LOG.debug('Unused input variable: %s', varName)
             continue
         fileVar[varName][:] = frame[varName].fillna(-999.).values
 
@@ -349,15 +350,7 @@ def write_global_attributes(nc_file, input_sources):
     nc_file.input_sources = ', '.join(input_sources)
 
 
-# The purpose of this method is to take a begin date, and end date
-# input filenames and output filename and create a netCDF file
-# based upon that
-# @param start time - a start datetime object
-# @param end time - an end datetime object
-# @param input filenames - list of filenames
-# @param output filename - filename of the netcdf file
-
-def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
+def create_giant_netcdf(input_files, output_fn, zlib, chunk_size,
                         start=None, end=None, interval_width=None,
                         database=schema.database):
     frame = get_data(input_files)
@@ -377,7 +370,7 @@ def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
         chunk_sizes = [frame.shape[0]]
 
     first_stamp = dt.strptime(str(frame.index[0]), '%Y-%m-%d %H:%M:%S')
-    nc_file = Dataset(outputName, 'w', format='NETCDF4_CLASSIC')
+    nc_file = Dataset(output_fn, 'w', format='NETCDF4_CLASSIC')
     write_dimensions(nc_file)
     create_variables(nc_file, first_stamp, database, chunk_sizes, zlib)
     write_vars(nc_file, frame, database)
@@ -386,23 +379,6 @@ def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
     return nc_file
 
 
-def create_multiple(filenames, output_filenames, zlib, chunkSize):
-    if output_filenames and len(filenames) != len(output_filenames):
-        raise ValueError(
-            'Number of output filenames must equal number of input filenames when start and end times are not specified')
-
-    success = False
-    for idx, filename in enumerate(filenames):
-        try:
-            create_giant_netcdf([filename], output_filenames[idx], zlib, chunkSize, None, None)
-            success = True
-        except (ValueError, TypeError):
-            LOG.error("Could not generate NetCDF file for {}".format(filename), exc_info=1)
-
-    if not success:
-        raise IOError('All ASCII files were empty or could not be read')
-
-
 def _dt_convert(datetime_str):
     """Parse datetime string, return datetime object"""
     try:
@@ -413,35 +389,41 @@ def main():
     import argparse
-    argparser = argparse.ArgumentParser(description="Convert level_00 aoss tower data to level_a0",
-                                        fromfile_prefix_chars='@')
-
-    argparser.add_argument('-v', '--verbose', action="count", default=int(os.environ.get("VERBOSITY", 2)),
-                           dest='verbosity',
-                           help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
-
-    argparser.add_argument('-s', '--start-time', type=_dt_convert,
-                           help="Start time of massive netcdf file, if only -s is given, a netcdf file for only that day is given" +
-                                ". Formats allowed: \'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    argparser.add_argument('-e', '--end-time', type=_dt_convert,
-                           help='End time of massive netcdf file. Formats allowed:' +
+    parser = argparse.ArgumentParser(description="Convert level_00 aoss tower data to level_b1",
+                                     fromfile_prefix_chars='@')
+
+    parser.add_argument('-v', '--verbose', action="count", default=int(os.environ.get("VERBOSITY", 2)),
+                        dest='verbosity',
+                        help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
+
+    parser.add_argument('-s', '--start-time', type=_dt_convert,
+                        help="Start time of the output NetCDF file; if only -s is given, a file covering just that day is created" +
+                             ". Formats allowed: 'YYYY-MM-DDTHH:MM:SS', 'YYYY-MM-DD'")
+    parser.add_argument('-e', '--end-time', type=_dt_convert,
+                        help='End time of the output NetCDF file. Formats allowed:' +
                                 "\'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    argparser.add_argument('-i', '--interval',
-                           help="""Width of the interval to average input data
+    parser.add_argument('-n', '--interval',
+                        help="""Width of the interval to average input data
over in Pandas offset format. If not specified, 1 minute averages are used.
If specified then '_high', '_mean', and '_low' versions of the data fields
are written to the output NetCDF.
Use '1D' for daily or '5T' for 5 minute averages.
See this page for more details:
http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
-    argparser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
-    argparser.add_argument('-z', '--zlib', action='store_true', help='compress netCDF file with zlib')
+    parser.add_argument('-f', '--fields', nargs='+', default=schema.met_vars,
+                        help="Variable names to include in the NetCDF file (base name, no suffixes)")
+    parser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
+    parser.add_argument('-z', '--zlib', action='store_true', help='compress netCDF file with zlib')
 
-    argparser.add_argument("input_files", nargs="+",
-                           help="aoss_tower level_00 paths. Use @filename to red a list of paths from that file.")
+    parser.add_argument('-i', '--input', dest='input_files', required=True, nargs="+",
+                        help="aoss_tower level_00 paths. Use @filename to read a list of paths from that file.")
 
-    argparser.add_argument('-o', '--output', nargs="+", help="NetCDF filename(s) to create from input")
-    args = argparser.parse_args()
+    parser.add_argument('-o', '--output', dest='output_files', required=True, nargs="+",
+                        help="""NetCDF filename(s) to create from input. If one
+filename is specified then all input files are combined into it. Otherwise
+each input file is mapped to the corresponding output file.
+""")
+    args = parser.parse_args()
 
     levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
     level = levels[min(3, args.verbosity)]
@@ -453,11 +435,30 @@ http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
         raise ValueError('start time must be specified when end time is specified')
 
     database = MEAN_DATABASE if args.interval else schema.database
-    if args.start_time and args.end_time:
-        create_giant_netcdf(args.input_files, args.output[0], args.zlib, args.chunk_size, args.start_time,
-                            args.end_time, args.interval, database)
+    mini_database = {k: database[k] for k in args.fields}
+
+    # Case 1: All inputs combined into 1 output file
+    # Case 2: Each input into a separate output file
+    if args.output_files and len(args.output_files) not in [1, len(args.input_files)]:
+        raise ValueError('Number of output filenames must be 1 or equal to the number of input filenames')
+    elif args.output_files and len(args.output_files) == len(args.input_files):
+        args.input_files = [[i] for i in args.input_files]
     else:
-        create_multiple(args.input_files, args.output, args.zlib, args.chunk_size)
+        args.input_files = [args.input_files]
+
+    success = False
+    for in_files, out_fn in zip(args.input_files, args.output_files):
+        try:
+            LOG.info("Creating %s from: %s", out_fn, in_files)
+            create_giant_netcdf(in_files, out_fn, args.zlib,
+                                args.chunk_size, args.start_time,
+                                args.end_time, args.interval,
+                                mini_database)
+            success = True
+        except (ValueError, TypeError):
+            LOG.error("Could not generate NetCDF file for {}".format(in_files), exc_info=True)
+    if not success:
+        raise IOError('All ASCII files were empty or could not be read')
 
 
 if __name__ == "__main__":
diff --git a/aosstower/schema.py b/aosstower/schema.py
index 2165074..b8f38a6 100644
--- a/aosstower/schema.py
+++ b/aosstower/schema.py
@@ -268,6 +268,6 @@ database = dict(
     )
 )
 
-met_vars = {'air_temp', 'rh', 'solar_flux', 'pressure', 'precip', 'accum_precip',
+met_vars = {'air_temp', 'dewpoint', 'rh', 'solar_flux', 'pressure', 'precip', 'accum_precip',
            'wind_speed', 'wind_dir', 'gust'}
 engr_vars = set(database.keys()) - met_vars
--
GitLab
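Notes on the patch, with illustrative sketches. The helper names and sample
data in these snippets are hypothetical and are not part of the changeset.

The get_data() refactor above no longer builds one DataFrame per input file
and concatenates them along the column axis; it streams every parsed record
through a generator into a single DataFrame, then applies the index, mask,
and fillna steps once. A minimal sketch of the same pattern, with a
_read_frames() stand-in for the real parser.read_frames():

    import pandas as pd

    def _read_frames(filename):
        # Stand-in for parser.read_frames(): yields one record dict per row.
        yield {'stamp': pd.Timestamp('2017-03-28 00:00:00'), 'air_temp': 12.3}
        yield {'stamp': pd.Timestamp('2017-03-28 00:01:00'), 'air_temp': -99999.}

    def _get_data(input_files):
        for filename in input_files:
            yield from _read_frames(filename)

    # One DataFrame for all input files; -99999 is the ASCII fill value.
    frame = pd.DataFrame(_get_data(['tower_day1.ascii']))
    frame.set_index('stamp', inplace=True)
    frame.mask(frame == -99999., inplace=True)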
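The --interval option takes a pandas offset alias ('5T', '1D', and so on),
and the averaged output carries '_low', '_mean', and '_high' versions of each
field. Roughly what that averaging looks like with pandas resampling, using
synthetic one-minute data and an illustrative column name:

    import numpy as np
    import pandas as pd

    index = pd.date_range('2017-03-28', periods=60, freq='T')  # one-minute stamps
    frame = pd.DataFrame({'air_temp': np.random.uniform(10., 15., 60)}, index=index)

    resampled = frame['air_temp'].resample('5T')  # '5T' = 5-minute bins; '1D' = daily
    averaged = pd.DataFrame({
        'air_temp_low': resampled.min(),
        'air_temp_mean': resampled.mean(),
        'air_temp_high': resampled.max(),
    })
    print(averaged.head())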
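The --fields option subsets the variable database before anything is written.
Note that the dict comprehension {k: database[k] for k in args.fields} dies
with a bare KeyError if a field name is misspelled; a sketch of an explicit
check that fails with a friendlier message, where the toy database below
stands in for schema.database:

    database = {
        'air_temp': {'units': 'degC'},
        'rh': {'units': '%'},
        'wind_speed': {'units': 'm/s'},
    }  # stand-in for schema.database

    requested = ['air_temp', 'rh']  # would come from args.fields
    unknown = set(requested) - set(database)
    if unknown:
        # a misspelled --fields name is reported by name instead of a KeyError
        raise ValueError('unknown field names: {}'.format(', '.join(sorted(unknown))))
    mini_database = {k: database[k] for k in requested}
    assert set(mini_database) == {'air_temp', 'rh'}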
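The new main() supports two output layouts: a single output file that
combines every input (e.g. -i day1.ascii day2.ascii -o tower.nc), or exactly
one output per input. The grouping logic, pulled out into a hypothetical
helper for illustration:

    def map_inputs_to_outputs(input_files, output_files):
        # One output file: combine all inputs into it.
        # Otherwise: pair each input with the output at the same position.
        if len(output_files) not in (1, len(input_files)):
            raise ValueError('number of output filenames must be 1 or match the number of input filenames')
        if len(output_files) == 1:
            return [list(input_files)]
        return [[f] for f in input_files]

    # Example: two inputs combined into a single NetCDF file.
    groups = map_inputs_to_outputs(['day1.ascii', 'day2.ascii'], ['tower.nc'])
    assert groups == [['day1.ascii', 'day2.ascii']]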