Unverified Commit 18835498 authored by David Hoese

Update netcdf generation to use 5s data in summary/monthly generation

parent cfc38e61
@@ -20,9 +20,10 @@ KNOTS_9 = calc.knots_to_mps(9.)
 KNOTS_5 = calc.knots_to_mps(5.)
 KNOTS_3 = calc.knots_to_mps(3.)
 KNOTS_2 = calc.knots_to_mps(2.)
+DEFAULT_FLOAT_FILL = -9999.
-def make_mean_dict(source_dict):
+def make_summary_dict(source_dict):
     """Create the '_mean','_low','_high' file structure."""
     dest_dict = {}
     for key in source_dict:
@@ -32,9 +33,6 @@ def make_mean_dict(source_dict):
     return dest_dict
-MEAN_DATABASE = make_mean_dict(schema.database)
 def filter_array(arr, valid_min, valid_max, valid_delta):
     """Create QC field array.
@@ -93,15 +91,15 @@ def create_variables(nc_file, first_stamp, database, chunk_sizes=None, zlib=Fals
     coordinates = {
         # fields: type, dimension, fill, valid_min, std_name, longname, units, valid_max, cf_role, axis
-        'time': [np.float64, ('time',), -999., None, None, "Hour offset from midnight",
+        'time': [np.float64, ('time',), DEFAULT_FLOAT_FILL, None, None, "Hour offset from midnight",
                  t_u, None, None, None],
-        'lon': [np.float32, tuple(), -999., -180., 'longitude', None, 'degrees_east', 180., None],
-        'lat': [np.float32, tuple(), -999., -90., 'latitude', None, 'degrees_north', 90., None],
-        'alt': [np.float32, tuple(), -999., None, 'height', 'vertical distance', 'm', None, None],
+        'lon': [np.float32, tuple(), DEFAULT_FLOAT_FILL, -180., 'longitude', None, 'degrees_east', 180., None],
+        'lat': [np.float32, tuple(), DEFAULT_FLOAT_FILL, -90., 'latitude', None, 'degrees_north', 90., None],
+        'alt': [np.float32, tuple(), DEFAULT_FLOAT_FILL, None, 'height', 'vertical distance', 'm', None, None],
         # int64 for base_time would be best, but NetCDF4 Classic does not support it
         # NetCDF4 Classic mode was chosen so users can use MFDatasets (multi-file datasets)
-        'base_time': [np.int32, tuple(), -999., None, 'time', btln, btu, None, None],
-        'time_offset': [np.float64, ('time',), -999., None, 'time', to_ln, to_u, None, None],
+        'base_time': [np.int32, tuple(), DEFAULT_FLOAT_FILL, None, 'time', btln, btu, None, None],
+        'time_offset': [np.float64, ('time',), DEFAULT_FLOAT_FILL, None, 'time', to_ln, to_u, None, None],
         'station_name': ['c', ('max_len_station_name',), '\0', None, None, 'station name', None, None, 'timeseries_id'],
     }
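The tuple layout named in the comment above (type, dimension, fill, valid_min, std_name, longname, units, valid_max, cf_role, axis) is what drives variable creation further down. A rough sketch of how the 'time' row could be unpacked, with nc_file assumed to be an open netCDF4.Dataset; attribute handling in the real code may differ:

    # Hedged sketch only; unpack the 'time' schema row into a variable.
    dtype, dims, fill, vmin, std_name, long_name, units, vmax, cf_role, axis = coordinates['time']
    time_var = nc_file.createVariable('time', dtype, dimensions=dims, fill_value=fill, zlib=zlib)
    time_var.long_name = long_name
    time_var.units = units  # this is the t_u string referenced in the table above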
@@ -154,7 +152,7 @@ def create_variables(nc_file, first_stamp, database, chunk_sizes=None, zlib=Fals
         varTup = database[entry]
         variable = nc_file.createVariable(entry, np.float32,
-                                          dimensions=('time',), fill_value=float(-999), zlib=zlib,
+                                          dimensions=('time',), fill_value=DEFAULT_FLOAT_FILL, zlib=zlib,
                                           chunksizes=chunk_sizes)
         variable.standard_name = varTup[1]
@@ -250,7 +248,7 @@ def minute_averages(frame):
     return new_frame.fillna(np.nan)
-def average_over_interval(frame, interval_width):
+def summary_over_interval(frame, interval_width):
    """takes a frame and an interval to average it over, and returns a minimum,
    maximum, and average dataframe for that interval
    """
@@ -309,7 +307,7 @@ def write_vars(nc_file, frame, database):
         if varName not in fileVar:
             LOG.debug('Unused input variable: %s', varName)
             continue
-        fileVar[varName][:] = frame[varName].fillna(-999.).values
+        fileVar[varName][:] = frame[varName].fillna(DEFAULT_FLOAT_FILL).values
         valid_min = database[varName][5]
         valid_max = database[varName][6]
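write_vars reads valid_min and valid_max from the schema right after filling the variable. The body of filter_array is outside this diff, so the following range check is only an assumed sketch of the QC masking it presumably performs:

    import numpy as np

    def in_valid_range(arr, valid_min, valid_max):
        # Hedged sketch of a valid-range mask; the real QC flag logic may differ.
        mask = np.isfinite(arr)
        if valid_min is not None:
            mask &= arr >= valid_min
        if valid_max is not None:
            mask &= arr <= valid_max
        return mask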
@@ -354,14 +352,40 @@ def write_global_attributes(nc_file, input_sources):
 def create_giant_netcdf(input_files, output_fn, zlib, chunk_size,
                         start=None, end=None, interval_width=None,
+                        summary=False,
                         database=schema.database):
     frame = get_data(input_files)
     if frame.empty:
         raise ValueError("No data found from input files: {}".format(", ".join(input_files)))
-    frame = minute_averages(frame)
-    if interval_width:
-        frame = average_over_interval(frame, interval_width)
+    # Add wind direction components so we can average wind direction properly
+    frame['wind_east'], frame['wind_north'], _ = calc.wind_vector_components(frame['wind_speed'], frame['wind_dir'])
+    # round up each 1 minute group so data at time T is the average of data
+    # from T - 1 (exclusive) to T (inclusive).
+    # new_frame = frame.resample('1T', closed='right', loffset='1T').mean()
+    new_frame = frame.resample('5S', closed='right', loffset='5S').mean()
+    # 2 minute rolling average of 5 second data (5 seconds * 24 = 120 seconds = 2 minutes)
+    winds_frame_5s = new_frame[['wind_speed', 'wind_east', 'wind_north']]
+    # winds_frame_5s = winds_frame_5s.resample('5S', closed='right', loffset='5S').mean()
+    winds_frame_2m = winds_frame_5s.rolling(24, win_type='boxcar').mean()
+    winds_frame_2m['gust'] = calculate_wind_gust(winds_frame_5s['wind_speed'], winds_frame_2m['wind_speed'])
+    # rolling average is used for mean output
+    new_frame.update(winds_frame_2m)  # adds wind_speed, wind_east/north
+    new_frame['gust'] = winds_frame_2m['gust']
+    # average the values
+    if summary:
+        frame = summary_over_interval(frame, interval_width)
+    else:
+        frame = new_frame.resample(interval_width, closed='right', loffset=interval_width).mean()
+        # gust_idx = new_frame['gust'].resample(interval_width, closed='right', loffset=interval_width).apply(lambda arr_like: arr_like.argmax())
+        # frame['gust'][:] = new_frame['gust'][gust_idx.values]
+        # frame['wind_dir'] = calc.wind_vector_degrees(frame['wind_east'][gust_idx.values], frame['wind_north'][gust_idx.values])
+        frame['wind_dir'] = calc.wind_vector_degrees(frame['wind_east'], frame['wind_north'])
+        frame['gust'] = new_frame['gust'].resample(interval_width, closed='right', loffset=interval_width).max()
+    frame.fillna(np.nan, inplace=True)
     if start and end:
         frame = frame[start.strftime('%Y-%m-%d %H:%M:%S'): end.strftime('%Y-%m-%d %H:%M:%S')]
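To see the new 5-second scheme in isolation: bin to 5 s labelled on the right edge, take a 24-sample (2-minute) boxcar rolling mean, then aggregate to the output interval. Below is a self-contained demo on synthetic data (names and data are invented; loffset assumes a pandas version that still supports it, and win_type='boxcar' requires scipy):

    import numpy as np
    import pandas as pd

    # One hour of synthetic ~1 Hz wind speed observations.
    index = pd.date_range('2017-01-01', periods=3600, freq='S')
    speeds = pd.Series(5 + np.random.rand(3600), index=index, name='wind_speed')

    # Bin to 5 s so the value at time T averages (T - 5s, T], as in the diff above.
    speeds_5s = speeds.resample('5S', closed='right', loffset='5S').mean()

    # 24 samples * 5 s = 120 s: the 2-minute running mean used for the mean output.
    speeds_2m = speeds_5s.rolling(24, win_type='boxcar').mean()

    # Crude gust-like quantity (NOT the repository's calculate_wind_gust):
    # peak 5 s excess over the 2-minute mean, reduced to one value per minute.
    gust_proxy = (speeds_5s - speeds_2m).resample('1T', closed='right', loffset='1T').max()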
@@ -371,6 +395,7 @@ def create_giant_netcdf(input_files, output_fn, zlib, chunk_size,
     else:
         chunk_sizes = [frame.shape[0]]
+    import ipdb; ipdb.set_trace()
     first_stamp = dt.strptime(str(frame.index[0]), '%Y-%m-%d %H:%M:%S')
     # NETCDF4_CLASSIC was chosen so that MFDataset reading would work. See:
     # http://unidata.github.io/netcdf4-python/#netCDF4.MFDataset
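The NETCDF4_CLASSIC comment refers to netCDF4-python's multi-file reader. A brief usage sketch with invented file and variable names:

    from netCDF4 import MFDataset

    # Hypothetical monthly output files aggregated along the unlimited time dimension.
    nc = MFDataset(['tower.2017-01.nc', 'tower.2017-02.nc'])
    air_temp = nc.variables['air_temp'][:]  # variable name assumed
    nc.close()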
@@ -406,7 +431,7 @@ def main():
     parser.add_argument('-e', '--end-time', type=_dt_convert,
                         help='End time of massive netcdf file. Formats allowed:' +
                              "\'YYYY-MM-DDTHH:MM:SS\', \'YYYY-MM-DD\'")
-    parser.add_argument('-n', '--interval',
+    parser.add_argument('-n', '--interval', default='1T',
                         help="""Width of the interval to average input data
 over in Pandas offset format. If not specified, 1 minute averages are used. If
 specified then '_high', '_mean', and '_low' versions of the data fields are
@@ -414,6 +439,8 @@ written to the output NetCDF.
 Use '1D' for daily or '5T' for 5 minute averages.
 See this page for more details:
 http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases""")
+    parser.add_argument('--summary', action='store_true',
+                        help="Create a file with _low, _mean, _high versions of every variable name")
     parser.add_argument('-f', '--fields', nargs='+', default=schema.met_vars,
                         help="Variable names to include in the NetCDF file (base name, no suffixes)")
     parser.add_argument('--chunk-size', type=int, help='chunk size for the netCDF file')
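As a rough usage illustration of the new default interval and --summary flag (the script name, positional argument layout, and file names below are all invented for this example):

    python nc.py --summary -n 1D -s 2017-01-01 -e 2017-01-31 \
        -f air_temp wind_speed --chunk-size 1440 \
        level_00/*.ascii aoss_tower.summary.201701.nc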
@@ -438,8 +465,9 @@ each input file is mapped to the corresponding output file.
     elif not args.start_time and args.end_time:
         raise ValueError('start time must be specified when end time is specified')
-    database = MEAN_DATABASE if args.interval else schema.database
-    mini_database = {k: database[k] for k in args.fields}
+    mini_database = {k: schema.database[k] for k in args.fields}
+    if args.summary:
+        mini_database = make_summary_dict(mini_database)
     # Case 1: All inputs to 1 output file
     # Case 2: Each input in to a separate output file
@@ -455,7 +483,7 @@ each input file is mapped to the corresponding output file.
         try:
             create_giant_netcdf(in_files, out_fn, args.zlib,
                                 args.chunk_size, args.start_time,
-                                args.end_time, args.interval,
+                                args.end_time, args.interval, args.summary,
                                 mini_database)
             success = True
         except (ValueError, TypeError):