diff --git a/aosstower/level_b1/nc.py b/aosstower/level_b1/nc.py
index ebf7a3d67b8fe76167f1f74c5e639b0d2fad20de..cfad12846d9c121ebdd5dcd7041d0c9bda838d81 100644
--- a/aosstower/level_b1/nc.py
+++ b/aosstower/level_b1/nc.py
@@ -20,9 +20,10 @@ KNOTS_9 = calc.knots_to_mps(9.)
 KNOTS_5 = calc.knots_to_mps(5.)
 KNOTS_3 = calc.knots_to_mps(3.)
 KNOTS_2 = calc.knots_to_mps(2.)
+DEFAULT_FLOAT_FILL = -9999.
 
 
-def make_mean_dict(source_dict):
+def make_summary_dict(source_dict):
     """Create the '_mean','_low','_high' file structure."""
     dest_dict = {}
     for key in source_dict:
@@ -32,9 +33,6 @@ def make_mean_dict(source_dict):
     return dest_dict
 
 
-MEAN_DATABASE = make_mean_dict(schema.database)
-
-
 def filter_array(arr, valid_min, valid_max, valid_delta):
     """Create QC field array.
 
@@ -93,15 +91,15 @@ def create_variables(nc_file, first_stamp, database, chunk_sizes=None, zlib=Fals
     coordinates = {
         # fields: type, dimension, fill, valid_min, std_name, longname, units, valid_max, cf_role, axis
-        'time': [np.float64, ('time',), -999., None, None, "Hour offset from midnight",
+        'time': [np.float64, ('time',), DEFAULT_FLOAT_FILL, None, None, "Hour offset from midnight",
                  t_u, None, None, None],
-        'lon': [np.float32, tuple(), -999., -180., 'longitude', None, 'degrees_east', 180., None],
-        'lat': [np.float32, tuple(), -999., -90., 'latitude', None, 'degrees_north', 90., None],
-        'alt': [np.float32, tuple(), -999., None, 'height', 'vertical distance', 'm', None, None],
+        'lon': [np.float32, tuple(), DEFAULT_FLOAT_FILL, -180., 'longitude', None, 'degrees_east', 180., None],
+        'lat': [np.float32, tuple(), DEFAULT_FLOAT_FILL, -90., 'latitude', None, 'degrees_north', 90., None],
+        'alt': [np.float32, tuple(), DEFAULT_FLOAT_FILL, None, 'height', 'vertical distance', 'm', None, None],
         # int64 for base_time would be best, but NetCDF4 Classic does not support it
         # NetCDF4 Classic mode was chosen so users can use MFDatasets (multi-file datasets)
-        'base_time': [np.int32, tuple(), -999., None, 'time', btln, btu, None, None],
-        'time_offset': [np.float64, ('time',), -999., None, 'time', to_ln, to_u, None, None],
+        'base_time': [np.int32, tuple(), DEFAULT_FLOAT_FILL, None, 'time', btln, btu, None, None],
+        'time_offset': [np.float64, ('time',), DEFAULT_FLOAT_FILL, None, 'time', to_ln, to_u, None, None],
         'station_name': ['c', ('max_len_station_name',), '\0', None, None,
                          'station name', None, None, 'timeseries_id'],
     }
 
@@ -154,7 +152,7 @@ def create_variables(nc_file, first_stamp, database, chunk_sizes=None, zlib=Fals
         varTup = database[entry]
 
         variable = nc_file.createVariable(entry, np.float32,
-                                          dimensions=('time',), fill_value=float(-999), zlib=zlib,
+                                          dimensions=('time',), fill_value=DEFAULT_FLOAT_FILL, zlib=zlib,
                                           chunksizes=chunk_sizes)
 
         variable.standard_name = varTup[1]
@@ -250,7 +248,7 @@ def minute_averages(frame):
     return new_frame.fillna(np.nan)
 
 
-def average_over_interval(frame, interval_width):
-    """takes a frame and an interval to average it over, and returns a minimum,
-    maximum, and average dataframe for that interval
+def summary_over_interval(frame, interval_width):
+    """Take a frame and an interval, and return the minimum, maximum, and
+    mean dataframes for that interval.
     """
@@ -309,7 +307,7 @@ def write_vars(nc_file, frame, database):
         if varName not in fileVar:
             LOG.debug('Unused input variable: %s', varName)
             continue
-        fileVar[varName][:] = frame[varName].fillna(-999.).values
+        fileVar[varName][:] = frame[varName].fillna(DEFAULT_FLOAT_FILL).values
 
         valid_min = database[varName][5]
         valid_max = database[varName][6]
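A note on `make_summary_dict` before the next hunk: it expands every schema field into three output variables so the `--summary` mode introduced below can write low/mean/high values per interval. The loop body falls outside the hunk above, so this is only a sketch of the assumed expansion, based on the docstring and on how the suffixed names are consumed later; the plain unit string stands in for the real `schema.database` metadata tuples.

```python
def make_summary_dict(source_dict):
    """Create the '_mean', '_low', '_high' file structure."""
    dest_dict = {}
    for key, entry in source_dict.items():
        # each input field fans out to three output variables
        for suffix in ('_low', '_mean', '_high'):
            dest_dict[key + suffix] = entry
    return dest_dict

print(make_summary_dict({'air_temp': 'degC'}))
# {'air_temp_low': 'degC', 'air_temp_mean': 'degC', 'air_temp_high': 'degC'}
```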
@@ -354,14 +352,40 @@ def write_global_attributes(nc_file, input_sources):
 
 
 def create_giant_netcdf(input_files, output_fn, zlib, chunk_size,
                         start=None, end=None, interval_width=None,
+                        summary=False,
                         database=schema.database):
     frame = get_data(input_files)
     if frame.empty:
         raise ValueError("No data found from input files: {}".format(", ".join(input_files)))
 
-    frame = minute_averages(frame)
-    if interval_width:
-        frame = average_over_interval(frame, interval_width)
+    # Add wind direction components so we can average wind direction properly
+    frame['wind_east'], frame['wind_north'], _ = calc.wind_vector_components(frame['wind_speed'], frame['wind_dir'])
+
+    # round up each 5 second group so data at time T is the average of data
+    # from T - 5 seconds (exclusive) to T (inclusive)
+    # new_frame = frame.resample('1T', closed='right', loffset='1T').mean()
+    new_frame = frame.resample('5S', closed='right', loffset='5S').mean()
+
+    # 2 minute rolling average of 5 second data (5 seconds * 24 = 120 seconds = 2 minutes)
+    winds_frame_5s = new_frame[['wind_speed', 'wind_east', 'wind_north']]
+    # winds_frame_5s = winds_frame_5s.resample('5S', closed='right', loffset='5S').mean()
+    winds_frame_2m = winds_frame_5s.rolling(24, win_type='boxcar').mean()
+    winds_frame_2m['gust'] = calculate_wind_gust(winds_frame_5s['wind_speed'], winds_frame_2m['wind_speed'])
+
+    # rolling average is used for mean output
+    new_frame.update(winds_frame_2m)  # adds wind_speed, wind_east/north
+    new_frame['gust'] = winds_frame_2m['gust']
+
+    # summarize or average the values over the output interval
+    if summary:
+        frame = summary_over_interval(new_frame, interval_width)
+    else:
+        frame = new_frame.resample(interval_width, closed='right', loffset=interval_width).mean()
+        # gust_idx = new_frame['gust'].resample(interval_width, closed='right', loffset=interval_width).apply(lambda arr_like: arr_like.argmax())
+        # frame['gust'][:] = new_frame['gust'][gust_idx.values]
+        # frame['wind_dir'] = calc.wind_vector_degrees(frame['wind_east'][gust_idx.values], frame['wind_north'][gust_idx.values])
+        frame['wind_dir'] = calc.wind_vector_degrees(frame['wind_east'], frame['wind_north'])
+        frame['gust'] = new_frame['gust'].resample(interval_width, closed='right', loffset=interval_width).max()
+    frame.fillna(np.nan, inplace=True)
 
     if start and end:
         frame = frame[start.strftime('%Y-%m-%d %H:%M:%S'): end.strftime('%Y-%m-%d %H:%M:%S')]
@@ -406,7 +430,7 @@ def main():
     parser.add_argument('-e', '--end-time', type=_dt_convert,
                         help='End time of massive netcdf file. Formats allowed:' +
                         "\'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    parser.add_argument('-n', '--interval',
+    parser.add_argument('-n', '--interval', default='1T',
                         help="""Width of the interval to average input data over in Pandas offset format. If
-not specified, 1 minute averages are used. If specified then '_high', '_mean', and '_low' versions of the data fields are
+not specified, 1 minute averages are used. With --summary, '_high', '_mean', and '_low' versions of the data fields are
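The reason `create_giant_netcdf` now carries `wind_east`/`wind_north` columns: compass directions cannot be averaged directly, since the arithmetic mean of 350° and 10° is 180°, the opposite of the true mean direction. Averaging the vector components and recombining them avoids the wrap-around. The `calc` module is not part of this diff, so the helpers below are stand-ins sketching the assumed behavior of `calc.wind_vector_components` and `calc.wind_vector_degrees`.

```python
import numpy as np

def wind_vector_components(speed, direction_deg):
    # decompose speed/direction into east- and north-pointing components
    rads = np.deg2rad(direction_deg)
    return speed * np.sin(rads), speed * np.cos(rads), speed

def wind_vector_degrees(east, north):
    # recombine (possibly averaged) components into a 0-360 degree direction
    return np.rad2deg(np.arctan2(east, north)) % 360.

east, north, _ = wind_vector_components(np.array([1., 1.]), np.array([350., 10.]))
print(wind_vector_degrees(east.mean(), north.mean()))  # ~0.0, not 180.0
```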
@@ -414,6 +438,8 @@ written to the output NetCDF. Use '1D' for daily or '5T' for 5 minute averages.
 See this page for more details:
 http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
+    parser.add_argument('--summary', action='store_true',
+                        help="Create a file with '_low', '_mean', and '_high' versions of every variable")
     parser.add_argument('-f', '--fields', nargs='+', default=schema.met_vars,
                         help="Variable names to include in the NetCDF file (base name, no suffixes)")
     parser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
@@ -438,8 +464,9 @@ each input file is mapped to the corresponding output file.
     elif not args.start_time and args.end_time:
         raise ValueError('start time must be specified when end time is specified')
 
-    database = MEAN_DATABASE if args.interval else schema.database
-    mini_database = {k: database[k] for k in args.fields}
+    mini_database = {k: schema.database[k] for k in args.fields}
+    if args.summary:
+        mini_database = make_summary_dict(mini_database)
 
     # Case 1: All inputs to 1 output file
     # Case 2: Each input in to a separate output file
@@ -455,7 +482,7 @@ each input file is mapped to the corresponding output file.
         try:
             create_giant_netcdf(in_files, out_fn, args.zlib,
                                 args.chunk_size, args.start_time,
-                                args.end_time, args.interval,
+                                args.end_time, args.interval, args.summary,
                                 mini_database)
             success = True
         except (ValueError, TypeError):
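`calculate_wind_gust`, called from the new averaging path, is defined elsewhere in the module and not shown in this diff. A minimal sketch of the general technique follows: report the instantaneous 5 second speed as a gust only where it sufficiently exceeds the 2 minute rolling mean, leaving NaN elsewhere so the later `.resample(...).max()` picks out the strongest gust in each output interval. The 5 knot margin mirrors the style of the `KNOTS_*` constants at the top of the file but is an assumption, not the module's actual criterion.

```python
import pandas as pd

KNOTS_5 = 5 * 0.514444  # 5 knots expressed in m/s

def calculate_wind_gust(wind_speed_5s: pd.Series, wind_speed_2m: pd.Series) -> pd.Series:
    # keep the 5 s sample only where it exceeds the 2 min mean by the margin;
    # everywhere else the result is NaN and falls out of the later .max()
    return wind_speed_5s.where(wind_speed_5s - wind_speed_2m >= KNOTS_5)
```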