from icing.pireps import pirep_icing import numpy as np import pickle import matplotlib.pyplot as plt import os from util.util import get_time_tuple_utc, GenericException, add_time_range_to_filename, is_night, is_day from aeolus.datasource import CLAVRx, GOESL1B from util.geos_nav import GEOSNavigation import h5py import re import datetime from datetime import timezone import glob goes_date_format = '%Y%j%H' goes16_directory = '/arcdata/goes/grb/goes16' # /year/date/abi/L1b/RadC clavrx_dir = '/ships19/cloud/scratch/ICING/' dir_fmt = '%Y_%m_%d_%j' # dir_list = [f.path for f in os.scandir('.') if f.is_dir()] ds_dct = {} goes_ds_dct = {} #pirep_file = '/home/rink/data/pireps/pireps_2019010000_2019063023.csv' pirep_file = '/home/rink/data/pireps/pireps_20180101_20200331.csv' l1b_ds_list = ['temp_10_4um_nom', 'temp_11_0um_nom', 'temp_12_0um_nom', 'temp_13_3um_nom', 'temp_3_75um_nom', 'temp_6_2um_nom', 'temp_6_7um_nom', 'temp_7_3um_nom', 'temp_8_5um_nom', 'temp_9_7um_nom', 'refl_0_47um_nom', 'refl_0_65um_nom', 'refl_0_86um_nom', 'refl_1_38um_nom', 'refl_1_60um_nom'] l1b_ds_types = ['f4' for ds in l1b_ds_list] ds_list = ['cld_height_acha', 'cld_geo_thick', 'cld_press_acha', 'sensor_zenith_angle', 'supercooled_prob_acha', 'supercooled_cloud_fraction', 'cld_temp_acha', 'cld_opd_acha', 'solar_zenith_angle', 'cld_reff_acha', 'cld_reff_dcomp', 'cld_reff_dcomp_1', 'cld_reff_dcomp_2', 'cld_reff_dcomp_3', 'cld_opd_dcomp', 'cld_opd_dcomp_1', 'cld_opd_dcomp_2', 'cld_opd_dcomp_3', 'cld_cwp_dcomp', 'iwc_dcomp', 'lwc_dcomp', 'cld_emiss_acha', 'conv_cloud_fraction', 'cloud_type', 'cloud_phase', 'cloud_mask'] ds_types = ['f4' for i in range(23)] + ['i4' for i in range(3)] a_clvr_file = '/home/rink/data/clavrx/clavrx_OR_ABI-L1b-RadC-M3C01_G16_s20190020002186.level2.nc' icing_files = [f for f in glob.glob('/data/Personal/rink/icing_ml/icing_l1b_2*.h5')] icing_l1b_files = [] no_icing_files = [f for f in glob.glob('/data/Personal/rink/icing_ml/no_icing_l1b_2*.h5')] no_icing_l1b_files = [] train_params_day = ['cld_height_acha', 'cld_geo_thick', 'supercooled_cloud_fraction', 'cld_temp_acha', 'cld_press_acha', 'solar_zenith_angle', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp', 'iwc_dcomp', 'lwc_dcomp', 'cloud_phase', 'cloud_mask'] train_params_night = ['cld_height_acha', 'cld_geo_thick', 'supercooled_cloud_fraction', 'cld_temp_acha', 'cld_press_acha', 'solar_zenith_angle', 'cld_reff_acha', 'cld_opd_acha', 'cloud_phase', 'cloud_mask'] def setup(): ice_dict, no_ice_dict, neg_ice_dict = pirep_icing(pirep_file) return ice_dict, no_ice_dict, neg_ice_dict def get_clavrx_datasource(timestamp): dt_obj, time_tup = get_time_tuple_utc(timestamp) date_dir_str = dt_obj.strftime(dir_fmt) ds = ds_dct.get(date_dir_str) if ds is None: ds = CLAVRx(clavrx_dir + date_dir_str + '/') ds_dct[date_dir_str] = ds return ds def get_goes_datasource(timestamp): dt_obj, time_tup = get_time_tuple_utc(timestamp) yr_dir = str(dt_obj.timetuple().tm_year) date_dir = dt_obj.strftime(dir_fmt) files_path = goes16_directory + '/' + yr_dir + '/' + date_dir + '/abi' + '/L1b' + '/RadC/' ds = goes_ds_dct.get(date_dir) if ds is None: ds = GOESL1B(files_path) goes_ds_dct[date_dir] = ds return ds def get_grid_values(h5f, grid_name, j_c, i_c, half_width, scale_factor_name='scale_factor', add_offset_name='add_offset'): hfds = h5f[grid_name] attrs = hfds.attrs ylen, xlen = hfds.shape j_l = j_c-half_width i_l = i_c-half_width if j_l < 0 or i_l < 0: return None j_r = j_c+half_width+1 i_r = i_c+half_width+1 if j_r >= ylen or i_r >= xlen: return None grd_vals = hfds[j_l:j_r, i_l:i_r] grd_vals = np.where(grd_vals == -999, np.nan, grd_vals) grd_vals = np.where(grd_vals == -127, np.nan, grd_vals) grd_vals = np.where(grd_vals == -32768, np.nan, grd_vals) if attrs is None: return grd_vals if scale_factor_name is not None: scale_factor = attrs.get(scale_factor_name)[0] grd_vals = grd_vals * scale_factor if add_offset_name is not None: add_offset = attrs.get(add_offset_name)[0] grd_vals = grd_vals + add_offset return grd_vals def create_file(filename, data_dct, ds_list, ds_types, lon_c, lat_c, time_s, fl_alt_s, icing_intensity, unq_ids): h5f_expl = h5py.File(a_clvr_file, 'r') h5f = h5py.File(filename, 'w') for idx, ds_name in enumerate(ds_list): data = data_dct[ds_name] h5f.create_dataset(ds_name, data=data, dtype=ds_types[idx]) lon_ds = h5f.create_dataset('longitude', data=lon_c, dtype='f4') lon_ds.dims[0].label = 'time' lon_ds.attrs.create('units', data='degrees_east') lon_ds.attrs.create('long_name', data='PIREP longitude') lat_ds = h5f.create_dataset('latitude', data=lat_c, dtype='f4') lat_ds.dims[0].label = 'time' lat_ds.attrs.create('units', data='degrees_north') lat_ds.attrs.create('long_name', data='PIREP latitude') time_ds = h5f.create_dataset('time', data=time_s) time_ds.dims[0].label = 'time' time_ds.attrs.create('units', data='seconds since 1970-1-1 00:00:00') time_ds.attrs.create('long_name', data='PIREP time') ice_alt_ds = h5f.create_dataset('icing_altitude', data=fl_alt_s, dtype='f4') ice_alt_ds.dims[0].label = 'time' ice_alt_ds.attrs.create('units', data='m') ice_alt_ds.attrs.create('long_name', data='PIREP altitude') if icing_intensity is not None: icing_int_ds = h5f.create_dataset('icing_intensity', data=icing_intensity, dtype='i4') icing_int_ds.attrs.create('long_name', data='From PIREP. 0:No intensity report, 1:Trace, 2:Light, 3:Light Moderate, 4:Moderate, 5:Moderate Severe, 6:Severe') unq_ids_ds = h5f.create_dataset('unique_id', data=unq_ids, dtype='i4') unq_ids_ds.attrs.create('long_name', data='ID mapping to PIREP icing dictionary: see pireps.py') # copy relevant attributes for ds_name in ds_list: h5f_ds = h5f[ds_name] h5f_ds.attrs.create('standard_name', data=h5f_expl[ds_name].attrs.get('standard_name')) h5f_ds.attrs.create('long_name', data=h5f_expl[ds_name].attrs.get('long_name')) h5f_ds.attrs.create('units', data=h5f_expl[ds_name].attrs.get('units')) h5f_ds.dims[0].label = 'time' h5f_ds.dims[1].label = 'y' h5f_ds.dims[2].label = 'x' h5f.close() h5f_expl.close() def run(pirep_dct, outfile=None, outfile_l1b=None, dt_str_start=None, dt_str_end=None, reduce=False): time_keys = list(pirep_dct.keys()) l1b_grd_dct = {name: [] for name in l1b_ds_list} ds_grd_dct = {name: [] for name in ds_list} t_start = None t_end = None if (dt_str_start is not None) and (dt_str_end is not None): dto = datetime.datetime.strptime(dt_str_start, '%Y-%m-%d_%H:%M').replace(tzinfo=timezone.utc) dto.replace(tzinfo=timezone.utc) t_start = dto.timestamp() dto = datetime.datetime.strptime(dt_str_end, '%Y-%m-%d_%H:%M').replace(tzinfo=timezone.utc) dto.replace(tzinfo=timezone.utc) t_end = dto.timestamp() nav = GEOSNavigation(sub_lon=-75.0, CFAC=5.6E-05, COFF=-0.101332, LFAC=-5.6E-05, LOFF=0.128212, num_elems=2500, num_lines=1500) lon_s = np.zeros(1) lat_s = np.zeros(1) last_clvr_file = None last_h5f = None lon_c = [] lat_c = [] time_s = [] fl_alt_s = [] ice_int_s = [] unq_ids = [] for idx, time in enumerate(time_keys): if t_start is not None: if time < t_start: continue if time > t_end: continue try: clvr_ds = get_clavrx_datasource(time) except Exception: continue clvr_file = clvr_ds.get_file(time)[0] if clvr_file is None: continue if clvr_file != last_clvr_file: try: h5f = h5py.File(clvr_file, 'r') except Exception: if h5f is not None: h5f.close() print('Problem with file: ', clvr_file) continue if last_h5f is not None: last_h5f.close() last_h5f = h5f last_clvr_file = clvr_file else: h5f = last_h5f reports = pirep_dct[time] for tup in reports: lat, lon, fl, I, uid, rpt_str = tup lat_s[0] = lat lon_s[0] = lon cc, ll = nav.earth_to_lc_s(lon_s, lat_s) if cc[0] < 0: continue cnt_a = 0 for didx, ds_name in enumerate(ds_list): gvals = get_grid_values(h5f, ds_name, ll[0], cc[0], 20) if gvals is not None: ds_grd_dct[ds_name].append(gvals) cnt_a += 1 cnt_b = 0 for didx, ds_name in enumerate(l1b_ds_list): gvals = get_grid_values(h5f, ds_name, ll[0], cc[0], 20) if gvals is not None: l1b_grd_dct[ds_name].append(gvals) cnt_b += 1 if cnt_a > 0 and cnt_a != len(ds_list): raise GenericException('weirdness') if cnt_b > 0 and cnt_b != len(l1b_ds_list): raise GenericException('weirdness') if cnt_a == len(ds_list) and cnt_b == len(l1b_ds_list): lon_c.append(lon_s[0]) lat_c.append(lat_s[0]) time_s.append(time) fl_alt_s.append(fl) ice_int_s.append(I) unq_ids.append(uid) if reduce is True: break if len(time_s) == 0: return t_start = time_s[0] t_end = time_s[len(time_s)-1] data_dct = {} for ds_name in ds_list: data_dct[ds_name] = np.array(ds_grd_dct[ds_name]) lon_c = np.array(lon_c) lat_c = np.array(lat_c) time_s = np.array(time_s) fl_alt_s = np.array(fl_alt_s) ice_int_s = np.array(ice_int_s) unq_ids = np.array(unq_ids) if outfile is not None: outfile = add_time_range_to_filename(outfile, t_start, t_end) create_file(outfile, data_dct, ds_list, ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) data_dct = {} for ds_name in l1b_ds_list: data_dct[ds_name] = np.array(l1b_grd_dct[ds_name]) if outfile_l1b is not None: outfile_l1b = add_time_range_to_filename(outfile_l1b, t_start, t_end) create_file(outfile_l1b, data_dct, l1b_ds_list, l1b_ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) def analyze(ice_dct, no_ice_dct): last_file = None ice_files = [] ice_times = [] for ts in list(ice_dct.keys()): try: ds = get_goes_datasource(ts) goes_file, t_0, _ = ds.get_file(ts) if goes_file is not None and goes_file != last_file: ice_files.append(goes_file) ice_times.append(t_0) last_file = goes_file except Exception: continue last_file = None no_ice_files = [] no_ice_times = [] for ts in list(no_ice_dct.keys()): try: ds = get_goes_datasource(ts) goes_file, t_0, _ = ds.get_file(ts) if goes_file is not None and goes_file != last_file: no_ice_files.append(goes_file) no_ice_times.append(t_0) last_file = goes_file except Exception: continue ice_times = np.array(ice_times) no_ice_times = np.array(no_ice_times) itrsct_vals, comm1, comm2 = np.intersect1d(no_ice_times, ice_times, return_indices=True) ice_indexes = np.arange(len(ice_times)) ucomm2 = np.setxor1d(comm2, ice_indexes) np.random.seed(42) np.random.shuffle(ucomm2) ucomm2 = ucomm2[0:8000] files_comm = [] for i in comm2: files_comm.append(ice_files[i]) files_extra = [] times_extra = [] for i in ucomm2: files_extra.append(ice_files[i]) times_extra.append(ice_times[i]) files = files_comm + files_extra times = itrsct_vals.tolist() + times_extra times = np.array(times) sidxs = np.argsort(times) for i in sidxs: filename = os.path.split(files[i])[1] so = re.search('_s\\d{11}', filename) dt_str = so.group() print(dt_str[2:]) def process(ice_dct, no_ice_dct, neg_ice_dct): new_ice_dct = {} new_no_ice_dct = {} new_neg_ice_dct = {} ice_keys_5_6 = [] ice_keys_1 = [] ice_keys_4 = [] ice_keys_3 = [] ice_keys_2 = [] print('num keys ice, no_ice, neg_ice: ', len(ice_dct), len(no_ice_dct), len(neg_ice_dct)) no_intensity_cnt = 0 num_ice_reports = 0 for ts in list(ice_dct.keys()): rpts = ice_dct[ts] for tup in rpts: num_ice_reports += 1 if tup[3] == 5 or tup[3] == 6: ice_keys_5_6.append(ts) elif tup[3] == 1: ice_keys_1.append(ts) elif tup[3] == 4: ice_keys_4.append(ts) elif tup[3] == 3: ice_keys_3.append(ts) elif tup[3] == 2: ice_keys_2.append(ts) else: no_intensity_cnt += 1 no_ice_keys = [] for ts in list(no_ice_dct.keys()): rpts = no_ice_dct[ts] for tup in rpts: no_ice_keys.append(ts) neg_ice_keys = [] for ts in list(neg_ice_dct.keys()): rpts = neg_ice_dct[ts] for tup in rpts: neg_ice_keys.append(ts) print('num ice reports, no ice, neg ice: ', num_ice_reports, len(no_ice_keys), len(neg_ice_keys)) print('------------------------------------------------') ice_keys_5_6 = np.array(ice_keys_5_6) print('5_6: ', ice_keys_5_6.shape) ice_keys_4 = np.array(ice_keys_4) print('4: ', ice_keys_4.shape) ice_keys_3 = np.array(ice_keys_3) print('3: ', ice_keys_3.shape) ice_keys_2 = np.array(ice_keys_2) print('2: ', ice_keys_2.shape) np.random.seed(42) np.random.shuffle(ice_keys_2) ice_keys_2 = ice_keys_2[0:70000] ice_keys_1 = np.array(ice_keys_1) print('1: ', ice_keys_1.shape) print('no intensity: ', no_intensity_cnt) ice_keys = np.concatenate([ice_keys_5_6, ice_keys_1, ice_keys_2, ice_keys_3, ice_keys_4]) uniq_sorted_keys = np.unique(ice_keys) print('ice: ', ice_keys.shape, uniq_sorted_keys.shape) uniq_sorted_keys = uniq_sorted_keys.tolist() for key in uniq_sorted_keys: new_ice_dct[key] = ice_dct[key] no_ice_keys = np.array(no_ice_keys) print('no ice total: ', no_ice_keys.shape) np.random.seed(42) np.random.shuffle(no_ice_keys) no_ice_keys = no_ice_keys[0:150000] uniq_sorted_no_ice = np.unique(no_ice_keys) print('no ice: ', no_ice_keys.shape, uniq_sorted_no_ice.shape) uniq_sorted_no_ice = uniq_sorted_no_ice.tolist() for key in uniq_sorted_no_ice: new_no_ice_dct[key] = no_ice_dct[key] neg_ice_keys = np.array(neg_ice_keys) print('neg ice total: ', neg_ice_keys.shape) np.random.seed(42) np.random.shuffle(neg_ice_keys) neg_ice_keys = neg_ice_keys[0:10000] uniq_sorted_neg_ice = np.unique(neg_ice_keys) print('neg ice: ', neg_ice_keys.shape, uniq_sorted_neg_ice.shape) for key in uniq_sorted_neg_ice: new_neg_ice_dct[key] = neg_ice_dct[key] return new_ice_dct, new_no_ice_dct, new_neg_ice_dct def analyze2(filename, filename_l1b): f = h5py.File(filename, 'r') icing_alt = f['icing_altitude'][:] cld_top_hgt = f['cld_height_acha'][:, 10:30, 10:30] cld_phase = f['cloud_phase'][:, 10:30, 10:30] cld_opd_dc = f['cld_opd_dcomp'][:, 10:30, 10:30] cld_opd = f['cld_opd_acha'][:, 10:30, 10:30] solzen = f['solar_zenith_angle'][:, 10:30, 10:30] f_l1b = h5py.File(filename_l1b, 'r') bt_11um = f_l1b['temp_11_0um_nom'][:, 10:30, 10:30] cld_opd = cld_opd.flatten() cld_opd_dc = cld_opd_dc.flatten() solzen = solzen.flatten() keep1 = np.invert(np.isnan(cld_opd)) keep2 = np.invert(np.isnan(solzen)) keep = keep1 & keep2 cld_opd = cld_opd[np.invert(np.isnan(cld_opd))] cld_opd_dc = cld_opd_dc[keep] solzen = solzen[keep] plt.hist(cld_opd, bins=20) plt.show() plt.hist(cld_opd_dc, bins=20) plt.show() # -------------------------------------------- x_a = 10 x_b = 30 y_a = x_a y_b = x_b nx = ny = (x_b - x_a) nx_x_ny = nx * ny def run_daynight(filename, filename_l1b, day_night='ANY'): f = h5py.File(filename, 'r') f_l1b = h5py.File(filename_l1b, 'r') solzen = f['solar_zenith_angle'][:, y_a:y_b, x_a:x_b] num_obs = solzen.shape[0] idxs = [] for i in range(num_obs): if day_night == 'NIGHT': if not is_day(solzen[i,]): idxs.append(i) elif day_night == 'DAY': if is_day(solzen[i,]): idxs.append(i) keep_idxs = np.array(idxs) data_dct = {} for didx, ds_name in enumerate(ds_list): data_dct[ds_name] = f[ds_name][keep_idxs,] lon_c = f['longitude'][keep_idxs] lat_c = f['latitude'][keep_idxs] time_s = f['time'][keep_idxs] fl_alt_s = f['icing_altitude'][keep_idxs] ice_int_s = f['icing_intensity'][keep_idxs] unq_ids = f['unique_id'][keep_idxs] path, fname = os.path.split(filename) fbase, fext = os.path.splitext(fname) outfile = path + '/' + fbase + '_' + day_night + fext create_file(outfile, data_dct, ds_list, ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) data_dct = {} for didx, ds_name in enumerate(l1b_ds_list): data_dct[ds_name] = f_l1b[ds_name][keep_idxs] path, fname = os.path.split(filename_l1b) fbase, fext = os.path.splitext(fname) outfile_l1b = path + '/' + fbase + '_' + 'QC' + '_' + day_night + fext create_file(outfile_l1b, data_dct, l1b_ds_list, l1b_ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) f.close() f_l1b.close() def run_qc(filename, filename_l1b, day_night='ANY', pass_thresh_frac=0.25): f = h5py.File(filename, 'r') icing_alt = f['icing_altitude'][:] cld_top_hgt = f['cld_height_acha'][:, y_a:y_b, x_a:x_b] cld_phase = f['cloud_phase'][:, y_a:y_b, x_a:x_b] if day_night == 'DAY': cld_opd = f['cld_opd_dcomp'][:, y_a:y_b, x_a:x_b] else: cld_opd = f['cld_opd_acha'][:, y_a:y_b, x_a:x_b] cld_mask = f['cloud_mask'][:, y_a:y_b, x_a:x_b] sol_zen = f['solar_zenith_angle'][:, y_a:y_b, x_a:x_b] f_l1b = h5py.File(filename_l1b, 'r') bt_11um = f_l1b['temp_11_0um_nom'][:, y_a:y_b, x_a:x_b] print('num pireps all: ', len(icing_alt)) mask, idxs, num_tested = apply_qc_icing_pireps(icing_alt, cld_top_hgt, cld_phase, cld_opd, cld_mask, bt_11um, sol_zen, day_night=day_night) print('num pireps, day_night: ', len(mask), day_night) keep_idxs = [] for i in range(len(mask)): if (np.sum(mask[i]) / nx_x_ny) > pass_thresh_frac: keep_idxs.append(idxs[i]) print('num valid pireps: ', len(keep_idxs)) keep_idxs = np.array(keep_idxs) data_dct = {} for didx, ds_name in enumerate(ds_list): data_dct[ds_name] = f[ds_name][keep_idxs,] lon_c = f['longitude'][keep_idxs] lat_c = f['latitude'][keep_idxs] time_s = f['time'][keep_idxs] fl_alt_s = f['icing_altitude'][keep_idxs] ice_int_s = f['icing_intensity'][keep_idxs] unq_ids = f['unique_id'][keep_idxs] path, fname = os.path.split(filename) fbase, fext = os.path.splitext(fname) outfile = path + '/' + fbase + '_' + 'QC' + '_' + day_night + fext create_file(outfile, data_dct, ds_list, ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) data_dct = {} for didx, ds_name in enumerate(l1b_ds_list): data_dct[ds_name] = f_l1b[ds_name][keep_idxs] path, fname = os.path.split(filename_l1b) fbase, fext = os.path.splitext(fname) outfile_l1b = path + '/' + fbase + '_' + 'QC' + '_' + day_night + fext create_file(outfile_l1b, data_dct, l1b_ds_list, l1b_ds_types, lon_c, lat_c, time_s, fl_alt_s, ice_int_s, unq_ids) f.close() f_l1b.close() def apply_qc_icing_pireps(icing_alt, cld_top_hgt, cld_phase, cld_opd, cld_mask, bt_11um, solzen, day_night='ANY'): opd_thick_threshold = 2 if day_night == 'DAY': opd_thick_threshold = 20 closeness = 100.0 # meters num_obs = len(icing_alt) cld_mask = cld_mask.reshape((num_obs, -1)) cld_top_hgt = cld_top_hgt.reshape((num_obs, -1)) cld_phase = cld_phase.reshape((num_obs, -1)) cld_opd = cld_opd.reshape((num_obs, -1)) bt_11um = bt_11um.reshape((num_obs, -1)) mask = [] idxs = [] num_tested = [] for i in range(num_obs): if day_night == 'NIGHT': if is_day(solzen[i,]): continue elif day_night == 'DAY': if not is_day(solzen[i,]): continue keep_0 = np.logical_or(cld_mask[i,] == 2, cld_mask[i,] == 3) # cloudy keep_1 = np.invert(np.isnan(cld_top_hgt[i,])) keep_2 = np.invert(np.isnan(bt_11um[i,])) keep_3 = np.invert(np.isnan(cld_opd[i,])) keep = keep_0 & keep_1 & keep_2 & keep_3 num_keep = np.sum(keep) if num_keep == 0: continue keep = np.where(keep, cld_top_hgt[i,] > icing_alt[i], False) keep = np.where(keep, np.invert((cld_phase[i,] == 4) & np.logical_and(cld_top_hgt[i,]+closeness > icing_alt[i], cld_top_hgt[i,]-closeness < icing_alt[i])), False) keep = np.where(keep, (cld_opd[i,] >= opd_thick_threshold) & (cld_phase[i,] == 4) & (cld_top_hgt[i,] > icing_alt[i]), False) keep = np.where(keep, np.invert((cld_phase[i,] == 4) & (cld_opd[i,] < 0.1) & (cld_top_hgt[i,] > icing_alt[i])), False) keep = np.where(keep, np.invert(bt_11um[i,] > 270.0), False) keep = np.where(keep, np.invert(bt_11um[i,] < 228.0), False) mask.append(keep) idxs.append(i) num_tested.append(num_keep) return mask, idxs, num_tested def fov_extract(outfile='/home/rink/fovs_l1b_out.h5', train_params=l1b_ds_list, ds_types=l1b_ds_types): ice_times = [] icing_int_s = [] ice_lons = [] ice_lats = [] no_ice_times = [] no_ice_lons = [] no_ice_lats = [] h5_s_icing = [] h5_s_no_icing = [] icing_data_dct = {ds: [] for ds in train_params} no_icing_data_dct = {ds: [] for ds in train_params} sub_indexes = np.arange(400) num_ice = 0 for fidx in range(len(icing_files)): fname = icing_files[fidx] f = h5py.File(fname, 'r') h5_s_icing.append(f) times = f['time'][:] num_obs = len(times) icing_int = f['icing_intensity'][:] lons = f['longitude'][:] lats = f['latitude'][:] cld_mask = f['cloud_mask'][:, 10:30, 10:30] cld_mask = cld_mask.reshape((num_obs, -1)) cld_top_temp = f['cld_temp_acha'][:, 10:30, 10:30] cld_top_temp = cld_top_temp.reshape((num_obs, -1)) for i in range(num_obs): keep_0 = np.logical_or(cld_mask[i,] == 2, cld_mask[i,] == 3) # cloudy keep_1 = np.invert(np.isnan(cld_top_temp[i,])) keep = keep_0 & keep_1 keep = np.where(keep, cld_top_temp[i,] < 273.0, False) k_idxs = sub_indexes[keep] np.random.shuffle(k_idxs) if len(k_idxs) > 20: k_idxs = k_idxs[0:20] else: k_idxs = k_idxs[0:len(k_idxs)] num_ice += len(k_idxs) for ds_name in train_params: dat = f[ds_name][i, 10:30, 10:30].flatten() icing_data_dct[ds_name].append(dat[k_idxs]) icing_int_s.append(np.full(len(k_idxs), icing_int[i])) ice_times.append(np.full(len(k_idxs), times[i])) ice_lons.append(np.full(len(k_idxs), lons[i])) ice_lats.append(np.full(len(k_idxs), lats[i])) print(fname) for ds_name in train_params: lst = icing_data_dct[ds_name] icing_data_dct[ds_name] = np.concatenate(lst) icing_int_s = np.concatenate(icing_int_s) ice_times = np.concatenate(ice_times) ice_lons = np.concatenate(ice_lons) ice_lats = np.concatenate(ice_lats) num_no_ice = 0 for fidx in range(len(no_icing_files)): fname = no_icing_files[fidx] f = h5py.File(fname, 'r') h5_s_no_icing.append(f) times = f['time'] num_obs = len(times) lons = f['longitude'] lats = f['latitude'] cld_mask = f['cloud_mask'][:, 10:30, 10:30] cld_mask = cld_mask.reshape((num_obs, -1)) cld_top_temp = f['cld_temp_acha'][:, 10:30, 10:30] cld_top_temp = cld_top_temp.reshape((num_obs, -1)) for i in range(num_obs): keep_0 = np.logical_or(cld_mask[i,] == 2, cld_mask[i,] == 3) # cloudy keep_1 = np.invert(np.isnan(cld_top_temp[i,])) keep = keep_0 & keep_1 keep = np.where(keep, cld_top_temp[i,] < 273.0, False) k_idxs = sub_indexes[keep] np.random.shuffle(k_idxs) if len(k_idxs) > 10: k_idxs = k_idxs[0:10] else: k_idxs = k_idxs[0:len(k_idxs)] num_no_ice += len(k_idxs) no_ice_times.append(np.full(len(k_idxs), times[i])) no_ice_lons.append(np.full(len(k_idxs), lons[i])) no_ice_lats.append(np.full(len(k_idxs), lats[i])) for ds_name in train_params: dat = f[ds_name][i, 10:30, 10:30].flatten() no_icing_data_dct[ds_name].append(dat[k_idxs]) print(fname) for ds_name in train_params: lst = no_icing_data_dct[ds_name] no_icing_data_dct[ds_name] = np.concatenate(lst) no_icing_int_s = np.full(num_no_ice, -1) no_ice_times = np.concatenate(no_ice_times) no_ice_lons = np.concatenate(no_ice_lons) no_ice_lats = np.concatenate(no_ice_lats) icing_intensity = np.concatenate([icing_int_s, no_icing_int_s]) icing_times = np.concatenate([ice_times, no_ice_times]) icing_lons = np.concatenate([ice_lons, no_ice_lons]) icing_lats = np.concatenate([ice_lats, no_ice_lats]) data_dct = {} for ds_name in train_params: data_dct[ds_name] = np.concatenate([icing_data_dct[ds_name], no_icing_data_dct[ds_name]]) # apply shuffle indexes # ds_indexes = np.arange(num_ice + num_no_ice) # np.random.shuffle(ds_indexes) # # for ds_name in train_params: # data_dct[ds_name] = data_dct[ds_name][ds_indexes] # icing_intensity = icing_intensity[ds_indexes] # icing_times = icing_times[ds_indexes] # icing_lons = icing_lons[ds_indexes] # icing_lats = icing_lats[ds_indexes] # do sort ds_indexes = np.argsort(icing_times) for ds_name in train_params: data_dct[ds_name] = data_dct[ds_name][ds_indexes] icing_intensity = icing_intensity[ds_indexes] icing_times = icing_times[ds_indexes] icing_lons = icing_lons[ds_indexes] icing_lats = icing_lats[ds_indexes] h5f_expl = h5py.File(a_clvr_file, 'r') h5f_out = h5py.File(outfile, 'w') for idx, ds_name in enumerate(train_params): dt = ds_types[ds_list.index(ds_name)] data = data_dct[ds_name] h5f_out.create_dataset(ds_name, data=data, dtype=dt) icing_int_ds = h5f_out.create_dataset('icing_intensity', data=icing_intensity, dtype='i4') icing_int_ds.attrs.create('long_name', data='From PIREP. -1:No Icing, 1:Trace, 2:Light, 3:Light Moderate, 4:Moderate, 5:Moderate Severe, 6:Severe') time_ds = h5f_out.create_dataset('time', data=icing_times, dtype='f4') time_ds.attrs.create('units', data='seconds since 1970-1-1 00:00:00') time_ds.attrs.create('long_name', data='PIREP time') lon_ds = h5f_out.create_dataset('longitude', data=icing_lons, dtype='f4') lon_ds.attrs.create('units', data='degrees_east') lon_ds.attrs.create('long_name', data='PIREP longitude') lat_ds = h5f_out.create_dataset('latitude', data=icing_lats, dtype='f4') lat_ds.attrs.create('units', data='degrees_north') lat_ds.attrs.create('long_name', data='PIREP latitude') # copy relevant attributes for ds_name in train_params: h5f_ds = h5f_out[ds_name] h5f_ds.attrs.create('standard_name', data=h5f_expl[ds_name].attrs.get('standard_name')) h5f_ds.attrs.create('long_name', data=h5f_expl[ds_name].attrs.get('long_name')) h5f_ds.attrs.create('units', data=h5f_expl[ds_name].attrs.get('units')) attr = h5f_expl[ds_name].attrs.get('actual_range') if attr is not None: h5f_ds.attrs.create('actual_range', data=attr) attr = h5f_expl[ds_name].attrs.get('flag_values') if attr is not None: h5f_ds.attrs.create('flag_values', data=attr) # --- close files for h5f in h5_s_icing: h5f.close() for h5f in h5_s_no_icing: h5f.close() h5f_out.close() h5f_expl.close() def tile_extract(trnfile='/home/rink/tiles_l1b_train.h5', tstfile='/home/rink/tiles_l1b_test.h5', train_params=l1b_ds_list, ds_types=l1b_ds_types, augment=False, split=0.2): icing_int_s = [] ice_time_s = [] no_ice_time_s = [] ice_lon_s = [] no_ice_lon_s = [] ice_lat_s = [] no_ice_lat_s = [] h5_s_icing = [] h5_s_no_icing = [] icing_data_dct = {ds: [] for ds in train_params} no_icing_data_dct = {ds: [] for ds in train_params} for fidx in range(len(icing_files)): fname = icing_files[fidx] f = h5py.File(fname, 'r') h5_s_icing.append(f) times = f['time'][:] num_obs = len(times) lons = f['longitude'] lats = f['latitude'] icing_int = f['icing_intensity'][:] for i in range(num_obs): for ds_name in train_params: dat = f[ds_name][i, 12:28, 12:28] icing_data_dct[ds_name].append(dat) icing_int_s.append(icing_int[i]) ice_time_s.append(times[i]) ice_lon_s.append(lons[i]) ice_lat_s.append(lats[i]) print(fname) for ds_name in train_params: lst = icing_data_dct[ds_name] icing_data_dct[ds_name] = np.stack(lst, axis=0) icing_int_s = np.array(icing_int_s) ice_time_s = np.array(ice_time_s) ice_lon_s = np.array(ice_lon_s) ice_lat_s = np.array(ice_lat_s) num_ice = icing_int_s.shape[0] # No icing ------------------------------------------------------------ num_no_ice = 0 for fidx in range(len(no_icing_files)): fname = no_icing_files[fidx] f = h5py.File(fname, 'r') h5_s_no_icing.append(f) times = f['time'] num_obs = len(times) lons = f['longitude'] lats = f['latitude'] for i in range(num_obs): for ds_name in train_params: dat = f[ds_name][i, 12:28, 12:28] no_icing_data_dct[ds_name].append(dat) num_no_ice += 1 no_ice_time_s.append(times[i]) no_ice_lon_s.append(lons[i]) no_ice_lat_s.append(lats[i]) print(fname) for ds_name in train_params: lst = no_icing_data_dct[ds_name] no_icing_data_dct[ds_name] = np.stack(lst, axis=0) no_icing_int_s = np.full(num_no_ice, -1) no_ice_time_s = np.array(no_ice_time_s) no_ice_lon_s = np.array(no_ice_lon_s) no_ice_lat_s = np.array(no_ice_lat_s) icing_intensity = np.concatenate([icing_int_s, no_icing_int_s]) icing_times = np.concatenate([ice_time_s, no_ice_time_s]) icing_lons = np.concatenate([ice_lon_s, no_ice_lon_s]) icing_lats = np.concatenate([ice_lat_s, no_ice_lat_s]) data_dct = {} for ds_name in train_params: data_dct[ds_name] = np.concatenate([icing_data_dct[ds_name], no_icing_data_dct[ds_name]]) trn_idxs, tst_idxs = split_data(icing_intensity.shape[0], shuffle=False, perc=split) trn_data_dct = {} for ds_name in train_params: trn_data_dct[ds_name] = data_dct[ds_name][trn_idxs,] trn_icing_intesity = icing_intensity[trn_idxs,] trn_icing_times = icing_times[trn_idxs,] trn_icing_lons = icing_lons[trn_idxs,] trn_icing_lats = icing_lats[trn_idxs,] # Data augmentation ------------------------------------------------------------- if augment: trn_data_dct_aug = {ds_name: [] for ds_name in train_params} trn_icing_intesity_aug = [] trn_icing_times_aug = [] trn_icing_lons_aug = [] trn_icing_lats_aug = [] for k in range(trn_icing_intesity.shape[0]): iceint = trn_icing_intesity[k] icetime = trn_icing_times[k] icelon = trn_icing_lons[k] icelat = trn_icing_lats[k] if iceint >= 3: for ds_name in train_params: dat = trn_data_dct[ds_name] trn_data_dct_aug[ds_name].append(np.fliplr(dat[k,])) trn_data_dct_aug[ds_name].append(np.flipud(dat[k,])) trn_data_dct_aug[ds_name].append(np.rot90(dat[k,])) trn_icing_intesity_aug.append(iceint) trn_icing_intesity_aug.append(iceint) trn_icing_intesity_aug.append(iceint) trn_icing_times_aug.append(icetime) trn_icing_times_aug.append(icetime) trn_icing_times_aug.append(icetime) trn_icing_lons_aug.append(icelon) trn_icing_lons_aug.append(icelon) trn_icing_lons_aug.append(icelon) trn_icing_lats_aug.append(icelat) trn_icing_lats_aug.append(icelat) trn_icing_lats_aug.append(icelat) for ds_name in train_params: trn_data_dct_aug[ds_name] = np.stack(trn_data_dct_aug[ds_name]) trn_icing_intesity_aug = np.stack(trn_icing_intesity_aug) trn_icing_times_aug = np.stack(trn_icing_intesity_aug) trn_icing_lons_aug = np.stack(trn_icing_lons_aug) trn_icing_lats_aug = np.stack(trn_icing_lats_aug) for ds_name in train_params: trn_data_dct[ds_name] = np.concatenate([trn_data_dct[ds_name], trn_data_dct_aug[ds_name]]) trn_icing_intensity = np.concatenate([trn_icing_intesity, trn_icing_intesity_aug]) trn_icing_times = np.concatenate([trn_icing_times, trn_icing_times_aug]) trn_icing_lons = np.concatenate([trn_icing_lons, trn_icing_lons_aug]) trn_icing_lats = np.concatenate([trn_icing_lats, trn_icing_lats_aug]) # do sort ds_indexes = np.argsort(trn_icing_times) for ds_name in train_params: trn_data_dct[ds_name] = trn_data_dct[ds_name][ds_indexes] trn_icing_intensity = trn_icing_intensity[ds_indexes] trn_icing_times = trn_icing_times[ds_indexes] trn_icing_lons = trn_icing_lons[ds_indexes] trn_icing_lats = trn_icing_lats[ds_indexes] write_file(trnfile, train_params, trn_data_dct, trn_icing_intensity, trn_icing_times, trn_icing_lons, trn_icing_lats) tst_data_dct = {} for ds_name in train_params: tst_data_dct[ds_name] = data_dct[ds_name][tst_idxs,] tst_icing_intensity = icing_intensity[tst_idxs,] tst_icing_times = icing_times[tst_idxs,] tst_icing_lons = icing_lons[tst_idxs,] tst_icing_lats = icing_lats[tst_idxs,] # Do shuffle # ds_indexes = np.arange(num_ice + num_no_ice) # np.random.shuffle(ds_indexes) # # for ds_name in train_params: # data_dct[ds_name] = data_dct[ds_name][ds_indexes] # icing_intensity = icing_intensity[ds_indexes] # icing_times = icing_times[ds_indexes] # icing_lons = icing_lons[ds_indexes] # icing_lats = icing_lats[ds_indexes] # do sort ds_indexes = np.argsort(tst_icing_times) for ds_name in train_params: tst_data_dct[ds_name] = tst_data_dct[ds_name][ds_indexes] tst_icing_intensity = tst_icing_intensity[ds_indexes] tst_icing_times = tst_icing_times[ds_indexes] tst_icing_lons = tst_icing_lons[ds_indexes] tst_icing_lats = tst_icing_lats[ds_indexes] write_file(tstfile, train_params, tst_data_dct, tst_icing_intensity, tst_icing_times, tst_icing_lons, tst_icing_lats) # --- close files for h5f in h5_s_icing: h5f.close() for h5f in h5_s_no_icing: h5f.close() def write_file(outfile, train_params, data_dct, icing_intensity, icing_times, icing_lons, icing_lats): h5f_expl = h5py.File(a_clvr_file, 'r') h5f_out = h5py.File(outfile, 'w') for idx, ds_name in enumerate(train_params): dt = ds_types[idx] data = data_dct[ds_name] h5f_out.create_dataset(ds_name, data=data, dtype=dt) icing_int_ds = h5f_out.create_dataset('icing_intensity', data=icing_intensity, dtype='i4') icing_int_ds.attrs.create('long_name', data='From PIREP. -1:No Icing, 1:Trace, 2:Light, 3:Light Moderate, 4:Moderate, 5:Moderate Severe, 6:Severe') time_ds = h5f_out.create_dataset('time', data=icing_times, dtype='f4') time_ds.attrs.create('units', data='seconds since 1970-1-1 00:00:00') time_ds.attrs.create('long_name', data='PIREP time') lon_ds = h5f_out.create_dataset('longitude', data=icing_lons, dtype='f4') lon_ds.attrs.create('units', data='degrees_east') lon_ds.attrs.create('long_name', data='PIREP longitude') lat_ds = h5f_out.create_dataset('latitude', data=icing_lats, dtype='f4') lat_ds.attrs.create('units', data='degrees_north') lat_ds.attrs.create('long_name', data='PIREP latitude') # copy relevant attributes for ds_name in train_params: h5f_ds = h5f_out[ds_name] h5f_ds.attrs.create('standard_name', data=h5f_expl[ds_name].attrs.get('standard_name')) h5f_ds.attrs.create('long_name', data=h5f_expl[ds_name].attrs.get('long_name')) h5f_ds.attrs.create('units', data=h5f_expl[ds_name].attrs.get('units')) attr = h5f_expl[ds_name].attrs.get('actual_range') if attr is not None: h5f_ds.attrs.create('actual_range', data=attr) attr = h5f_expl[ds_name].attrs.get('flag_values') if attr is not None: h5f_ds.attrs.create('flag_values', data=attr) # --- close files h5f_out.close() h5f_expl.close() def run_mean_std(check_cloudy=False, no_icing_to_icing_ratio=5): ds_list = ['cld_height_acha', 'cld_geo_thick', 'cld_press_acha', 'supercooled_cloud_fraction', 'cld_temp_acha', 'cld_opd_acha', 'cld_reff_acha', 'cld_reff_dcomp', 'cld_reff_dcomp_1', 'cld_reff_dcomp_2', 'cld_reff_dcomp_3', 'cld_opd_dcomp', 'cld_opd_dcomp_1', 'cld_opd_dcomp_2', 'cld_opd_dcomp_3', 'cld_cwp_dcomp', 'iwc_dcomp', 'lwc_dcomp', 'cld_emiss_acha', 'conv_cloud_fraction'] mean_std_dct = {} ice_flist = [f for f in glob.glob('/data/Personal/rink/icing2/icing_2*.h5')] no_ice_flist = [f for f in glob.glob('/data/Personal/rink/icing2/no_icing_2*.h5')] ice_h5f_lst = [h5py.File(f, 'r') for f in ice_flist] no_ice_h5f_lst = [h5py.File(f, 'r') for f in no_ice_flist] if check_cloudy: cld_msk_i = [] cld_msk_ni = [] for idx, ice_h5f in enumerate(ice_h5f_lst): no_ice_h5f = no_ice_h5f_lst[idx] cld_msk_i.append(ice_h5f['cloud_mask'][:,].flatten()) cld_msk_ni.append(no_ice_h5f['cloud_mask'][:,].flatten()) cld_msk_i = np.concatenate(cld_msk_i) cld_msk_ni = np.concatenate(cld_msk_ni) for dname in ds_list: data = [] data_i = [] data_ni = [] for idx, ice_h5f in enumerate(ice_h5f_lst): no_ice_h5f = no_ice_h5f_lst[idx] data.append(ice_h5f[dname][:,].flatten()) data.append(no_ice_h5f[dname][:,].flatten()) data_i.append(ice_h5f[dname][:,].flatten()) data_ni.append(no_ice_h5f[dname][:,].flatten()) data = np.concatenate(data) mean = np.nanmean(data) data -= mean std = np.nanstd(data) data_i = np.concatenate(data_i) if check_cloudy: keep = np.logical_or(cld_msk_i == 2, cld_msk_i == 3) data_i = data_i[keep] mean_i = np.nanmean(data_i) data_i -= mean_i std_i = np.nanstd(data_i) data_ni = np.concatenate(data_ni) if check_cloudy: keep = np.logical_or(cld_msk_ni == 2, cld_msk_ni == 3) data_ni = data_ni[keep] mean_ni = np.nanmean(data_ni) data_ni -= mean_ni std_ni = np.nanstd(data_ni) mean = (mean_i + no_icing_to_icing_ratio*mean_ni)/(no_icing_to_icing_ratio + 1) std = (std_i + no_icing_to_icing_ratio*std_ni)/(no_icing_to_icing_ratio + 1) print(dname,': (', mean, mean_i, mean_ni, ') (', std, std_i, std_ni, ')') mean_std_dct[dname] = (mean_ni, std_ni) [h5f.close() for h5f in ice_h5f_lst] [h5f.close() for h5f in no_ice_h5f_lst] f = open('/home/rink/data/icing_ml/mean_std.pkl', 'wb') pickle.dump(mean_std_dct, f) f.close() return mean_std_dct def run_mean_std_2(check_cloudy=False, no_icing_to_icing_ratio=5, params=train_params_day): params = ['cld_height_acha', 'cld_geo_thick', 'supercooled_cloud_fraction', 'cld_temp_acha', 'cld_press_acha', 'cld_reff_dcomp', 'cld_opd_dcomp', 'cld_cwp_dcomp', 'iwc_dcomp', 'lwc_dcomp'] mean_std_dct = {} flist = [f for f in glob.glob('/Users/tomrink/data/icing/fov*.h5')] h5f_lst = [h5py.File(f, 'r') for f in flist] if check_cloudy: cld_msk = [] for idx, h5f in enumerate(h5f_lst): cld_msk.append(h5f['cloud_mask'][:,].flatten()) cld_msk = np.concatenate(cld_msk) for dname in params: data = [] for idx, h5f in enumerate(h5f_lst): data.append(h5f[dname][:,].flatten()) data = np.concatenate(data) if check_cloudy: keep = np.logical_or(cld_msk == 2, cld_msk == 3) data = data[keep] mean = np.nanmean(data) data -= mean std = np.nanstd(data) print(dname,': ', mean, std) mean_std_dct[dname] = (mean, std) [h5f.close() for h5f in h5f_lst] f = open('/Users/tomrink/data/icing/fovs_mean_std_day.pkl', 'wb') pickle.dump(mean_std_dct, f) f.close() # return mean_std_dct def split_data(num_obs, perc=0.2, skip=1, shuffle=True, seed=None): idxs = np.arange(num_obs) idxs = list(idxs) num_test = int(num_obs * perc) test_idxs = idxs[::int(num_obs / num_test)] test_set = set(test_idxs) train_set = (set(idxs)).difference(test_set) train_idxs = list(train_set) test_idxs = np.array(test_idxs) train_idxs = np.array(train_idxs) if seed is not None: np.random.seed(seed) if shuffle: np.random.shuffle(test_idxs) np.random.shuffle(train_idxs) return train_idxs[::skip], test_idxs[::skip] def normalize(data, param, mean_std_dict): if mean_std_dict.get(param) is None: return data shape = data.shape data = data.flatten() mean, std = mean_std_dict.get(param) data -= mean data /= std not_valid = np.isnan(data) data[not_valid] = 0 data = np.reshape(data, shape) return data def test(filename, skip=1): h5f = h5py.File(filename, 'r') time = h5f['time'] intsty = h5f['icing_intensity'] trn_idxs, tst_idxs = split_data(time.shape[0], skip=skip) print(np.histogram(intsty[trn_idxs], bins=7)) print(np.histogram(intsty[tst_idxs], bins=7))