Unverified Commit f3f5ac6d authored by David Hoese

Add ability to list what variables to include in netcdf file

Includes fixes and cleanup for specifying multiple input files versus
a single input file.
parent 2daf7d42
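The headline feature filters the variable schema down to the requested field names before any NetCDF structure is created. A standalone sketch of that filtering step, using a toy schema rather than the project's schema.database:

    # Toy stand-in for schema.database: variable name -> metadata.
    database = {
        'air_temp': {'units': 'degC'},
        'rh': {'units': '%'},
        'box_temp': {'units': 'degC'},
    }
    fields = ['air_temp', 'rh']  # what the user passes via -f/--fields
    mini_database = {k: database[k] for k in fields}
    # Only 'air_temp' and 'rh' are created in the output file; an unknown
    # field name raises KeyError before any file is written.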
@@ -269,16 +269,17 @@ def average_over_interval(frame, interval_width):
     return out_frames
 
 
-def get_data(input_files):
-    frames = []
-    for filename in input_files:
-        frame = pd.DataFrame(parser.read_frames(filename))
-        frame.set_index('stamp', inplace=True)
-        frame.mask(frame == -99999., inplace=True)
-        frame.fillna(value=np.nan, inplace=True)
-        frames.append(frame)
-    return pd.concat(frames, axis=1, copy=False)
+def _get_data(input_files):
+    for filename in input_files:
+        yield from parser.read_frames(filename)
+
+
+def get_data(input_files):
+    frame = pd.DataFrame(_get_data(input_files))
+    frame.set_index('stamp', inplace=True)
+    frame.mask(frame == -99999., inplace=True)
+    frame.fillna(value=np.nan, inplace=True)
+    return frame
 
 
 def write_vars(nc_file, frame, database):
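The refactor splits file reading (now a generator) from DataFrame assembly. A self-contained sketch of why this works, with a fake parser standing in for parser.read_frames (assumption: read_frames yields one dict per record, which the DataFrame constructor accepts):

    import pandas as pd

    def fake_read_frames(filename):
        # Stand-in for parser.read_frames(): yields one dict per logged record.
        yield {'stamp': filename + '-00', 'air_temp': 21.5, 'rh': -99999.}
        yield {'stamp': filename + '-01', 'air_temp': -99999., 'rh': 80.0}

    def _get_data(input_files):
        for filename in input_files:
            yield from fake_read_frames(filename)

    # One DataFrame from all records, no per-file concat needed:
    frame = pd.DataFrame(_get_data(['a.ascii', 'b.ascii']))
    frame.set_index('stamp', inplace=True)
    frame.mask(frame == -99999., inplace=True)  # replace the sentinel with NaN
    print(frame)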
@@ -304,7 +305,7 @@ def write_vars(nc_file, frame, database):
     # writes data into file
     for varName in frame:
         if varName not in fileVar:
-            LOG.warning('Extraneous key: %s in frame' % varName)
+            LOG.debug('Unused input variable: %s', varName)
             continue
         fileVar[varName][:] = frame[varName].fillna(-999.).values
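Besides lowering the severity, the new call switches from eager %-interpolation to logging's deferred argument style. A minimal illustration (generic logger, not project code):

    import logging
    LOG = logging.getLogger(__name__)
    # The message is interpolated only if a DEBUG record is actually emitted:
    LOG.debug('Unused input variable: %s', 'box_temp')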
@@ -349,15 +350,7 @@ def write_global_attributes(nc_file, input_sources):
     nc_file.input_sources = ', '.join(input_sources)
 
 
-# The purpose of this method is to take a begin date, and end date
-# input filenames and output filename and create a netCDF file
-# based upon that
-# @param start time - a start datetime object
-# @param end time - an end datetime object
-# @param input filenames - list of filenames
-# @param output filename - filename of the netcdf file
-def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
+def create_giant_netcdf(input_files, output_fn, zlib, chunk_size,
                         start=None, end=None, interval_width=None,
                         database=schema.database):
     frame = get_data(input_files)
@@ -377,7 +370,7 @@ def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
         chunk_sizes = [frame.shape[0]]
 
     first_stamp = dt.strptime(str(frame.index[0]), '%Y-%m-%d %H:%M:%S')
-    nc_file = Dataset(outputName, 'w', format='NETCDF4_CLASSIC')
+    nc_file = Dataset(output_fn, 'w', format='NETCDF4_CLASSIC')
     write_dimensions(nc_file)
     create_variables(nc_file, first_stamp, database, chunk_sizes, zlib)
     write_vars(nc_file, frame, database)
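The Dataset call that now receives output_fn is the standard netCDF4 constructor. For context, a minimal sketch of the kind of dimension and variable creation that create_variables presumably wraps (hypothetical 'air_temp' variable and sizes, not the project's code):

    from netCDF4 import Dataset

    nc_file = Dataset('example.nc', 'w', format='NETCDF4_CLASSIC')
    nc_file.createDimension('time', None)  # unlimited record dimension
    var = nc_file.createVariable('air_temp', 'f4', ('time',),
                                 zlib=True,          # what the -z flag toggles
                                 chunksizes=[1024],  # what --chunk-size controls
                                 fill_value=-999.)
    nc_file.close()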
@@ -386,23 +379,6 @@ def create_giant_netcdf(input_files, outputName, zlib, chunk_size,
     return nc_file
 
 
-def create_multiple(filenames, output_filenames, zlib, chunkSize):
-    if output_filenames and len(filenames) != len(output_filenames):
-        raise ValueError(
-            'Number of output filenames must equal number of input filenames when start and end times are not specified')
-
-    success = False
-    for idx, filename in enumerate(filenames):
-        try:
-            create_giant_netcdf([filename], output_filenames[idx], zlib, chunkSize, None, None)
-            success = True
-        except (ValueError, TypeError):
-            LOG.error("Could not generate NetCDF file for {}".format(filename), exc_info=1)
-    if not success:
-        raise IOError('All ASCII files were empty or could not be read')
-
-
 def _dt_convert(datetime_str):
     """Parse datetime string, return datetime object"""
     try:
@@ -413,35 +389,41 @@ def _dt_convert(datetime_str):
 
 def main():
     import argparse
-    argparser = argparse.ArgumentParser(description="Convert level_00 aoss tower data to level_a0",
-                                        fromfile_prefix_chars='@')
-    argparser.add_argument('-v', '--verbose', action="count", default=int(os.environ.get("VERBOSITY", 2)),
-                           dest='verbosity',
-                           help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
-    argparser.add_argument('-s', '--start-time', type=_dt_convert,
-                           help="Start time of massive netcdf file, if only -s is given, a netcdf file for only that day is given" +
-                                ". Formats allowed: \'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    argparser.add_argument('-e', '--end-time', type=_dt_convert,
-                           help='End time of massive netcdf file. Formats allowed:' +
-                                "\'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    argparser.add_argument('-i', '--interval',
-                           help="""Width of the interval to average input data
+    parser = argparse.ArgumentParser(description="Convert level_00 aoss tower data to level_b1",
+                                     fromfile_prefix_chars='@')
+    parser.add_argument('-v', '--verbose', action="count", default=int(os.environ.get("VERBOSITY", 2)),
+                        dest='verbosity',
+                        help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
+    parser.add_argument('-s', '--start-time', type=_dt_convert,
+                        help="Start time of massive netcdf file, if only -s is given, a netcdf file for only that day is given" +
+                             ". Formats allowed: \'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
+    parser.add_argument('-e', '--end-time', type=_dt_convert,
+                        help='End time of massive netcdf file. Formats allowed:' +
+                             "\'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
+    parser.add_argument('-n', '--interval',
+                        help="""Width of the interval to average input data
 over in Pandas offset format. If not specified, 1 minute averages are used. If
 specified then '_high', '_mean', and '_low' versions of the data fields are
 written to the output NetCDF.
 Use '1D' for daily or '5T' for 5 minute averages.
 See this page for more details:
 http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
-    argparser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
-    argparser.add_argument('-z', '--zlib', action='store_true', help='compress netCDF file with zlib')
-    argparser.add_argument("input_files", nargs="+",
-                           help="aoss_tower level_00 paths. Use @filename to red a list of paths from that file.")
-    argparser.add_argument('-o', '--output', nargs="+", help="NetCDF filename(s) to create from input")
-    args = argparser.parse_args()
+    parser.add_argument('-f', '--fields', nargs='+', default=schema.met_vars,
+                        help="Variable names to include in the NetCDF file (base name, no suffixes)")
+    parser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
+    parser.add_argument('-z', '--zlib', action='store_true', help='compress netCDF file with zlib')
+    parser.add_argument('-i', '--input', dest='input_files', required=True, nargs="+",
+                        help="aoss_tower level_00 paths. Use @filename to read a list of paths from that file.")
+    parser.add_argument('-o', '--output', dest='output_files', required=True, nargs="+",
+                        help="""NetCDF filename(s) to create from input. If one
+filename is specified then all input files are combined into it. Otherwise
+each input file is mapped to the corresponding output file.
+""")
+    args = parser.parse_args()
 
     levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
     level = levels[min(3, args.verbosity)]
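The @filename syntax mentioned in the --input help is standard argparse behaviour enabled by fromfile_prefix_chars='@'. A small sketch (hypothetical file paths):

    import argparse

    parser = argparse.ArgumentParser(fromfile_prefix_chars='@')
    parser.add_argument('-i', '--input', dest='input_files', nargs='+')

    # Direct invocation:
    args = parser.parse_args(['-i', 'a.ascii', 'b.ascii'])
    # Equivalent, reading one argument per line from paths.txt:
    #   args = parser.parse_args(['-i', '@paths.txt'])
    print(args.input_files)  # ['a.ascii', 'b.ascii']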
@@ -453,11 +435,30 @@ http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
         raise ValueError('start time must be specified when end time is specified')
 
     database = MEAN_DATABASE if args.interval else schema.database
-    if args.start_time and args.end_time:
-        create_giant_netcdf(args.input_files, args.output[0], args.zlib, args.chunk_size, args.start_time,
-                            args.end_time, args.interval, database)
-    else:
-        create_multiple(args.input_files, args.output, args.zlib, args.chunk_size)
+    mini_database = {k: database[k] for k in args.fields}
+
+    # Case 1: All inputs to 1 output file
+    # Case 2: Each input into a separate output file
+    if args.output_files and len(args.output_files) not in [1, len(args.input_files)]:
+        raise ValueError('Output filenames must be 1 or the same length as input files')
+    elif args.output_files and len(args.output_files) == len(args.input_files):
+        args.input_files = [[i] for i in args.input_files]
+    else:
+        args.input_files = [args.input_files]
+
+    success = False
+    for in_files, out_fn in zip(args.input_files, args.output_files):
+        try:
+            print(in_files, out_fn)
+            create_giant_netcdf(in_files, out_fn, args.zlib,
+                                args.chunk_size, args.start_time,
+                                args.end_time, args.interval,
+                                mini_database)
+            success = True
+        except (ValueError, TypeError):
+            LOG.error("Could not generate NetCDF file for {}".format(in_files), exc_info=True)
+    if not success:
+        raise IOError('All ASCII files were empty or could not be read')
 
 
 if __name__ == "__main__":
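The removed create_multiple helper and the old two-branch dispatch are now folded into a single loop in main(). The mapping rule that loop implements can be written standalone as follows (a rewrite for illustration, not the commit's code):

    def map_io(input_files, output_files):
        if len(output_files) not in (1, len(input_files)):
            raise ValueError('Output filenames must be 1 or the same length as input files')
        if len(output_files) == len(input_files):
            return [[f] for f in input_files]  # pair each input with its own output
        return [input_files]                   # combine all inputs into one output

    assert map_io(['a', 'b'], ['all.nc']) == [['a', 'b']]
    assert map_io(['a', 'b'], ['x.nc', 'y.nc']) == [['a'], ['b']]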
@@ -268,6 +268,6 @@ database = dict(
     )
 )
 
-met_vars = {'air_temp', 'rh', 'solar_flux', 'pressure', 'precip', 'accum_precip',
+met_vars = {'air_temp', 'dewpoint', 'rh', 'solar_flux', 'pressure', 'precip', 'accum_precip',
             'wind_speed', 'wind_dir', 'gust'}
 
 engr_vars = set(database.keys()) - met_vars
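Because met_vars is also the new default for --fields, adding 'dewpoint' changes what appears in output files by default. A toy sketch of how met_vars and engr_vars partition the schema (hypothetical 'box_temp' engineering variable, toy database):

    met_vars = {'air_temp', 'dewpoint', 'rh', 'solar_flux', 'pressure', 'precip',
                'accum_precip', 'wind_speed', 'wind_dir', 'gust'}
    database = {name: {} for name in met_vars | {'box_temp'}}  # toy schema
    engr_vars = set(database.keys()) - met_vars
    print(sorted(engr_vars))  # ['box_temp']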