From e0fa76212b383f7910aeb4d24c2911fec4cb9e86 Mon Sep 17 00:00:00 2001 From: tomrink <rink@ssec.wisc.edu> Date: Wed, 12 Apr 2023 15:14:13 -0500 Subject: [PATCH] snapshot... --- modules/util/abi_surfrad.py | 262 ++++++++++++++++++++++++++---------- 1 file changed, 192 insertions(+), 70 deletions(-) diff --git a/modules/util/abi_surfrad.py b/modules/util/abi_surfrad.py index 43c34256..1391b3e1 100644 --- a/modules/util/abi_surfrad.py +++ b/modules/util/abi_surfrad.py @@ -1,162 +1,284 @@ import numpy as np import h5py -from util.util import get_grid_values +from util.util import get_grid_values, is_day import glob - -target_param = 'cloud_probability' -# target_param = 'cld_opd_dcomp' +keep_out_opd = ['/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/arm/2019/11/02/clavrx_VNP02IMG.A2019306.1912.001.2019307003236.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/arm/2019/04/13/clavrx_VNP02IMG.A2019103.1918.001.2019104005120.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/05/25/clavrx_VNP02IMG.A2019145.1936.001.2019146005424.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/11/01/clavrx_VNP02IMG.A2019305.1936.001.2019306005913.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/03/01/clavrx_VNP02IMG.A2019060.1930.001.2019061005942.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/table_mountain/2019/12/01/clavrx_VNP02IMG.A2019335.2012.001.2019336013827.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/table_mountain/2019/05/18/clavrx_VNP02IMG.A2019138.2006.001.2019139013059.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/fort_peck/2019/01/28/clavrx_VNP02IMG.A2019028.1930.001.2019029005408.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/fort_peck/2019/08/08/clavrx_VNP02IMG.A2019220.1930.001.2019221010714.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/10/13/clavrx_VNP02IMG.A2019286.1848.001.2019287001722.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/03/20/clavrx_VNP02IMG.A2019079.1830.001.2019079235918.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/12/26/clavrx_VNP02IMG.A2019360.1900.001.2019361001327.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/desert_rock/2019/02/05/clavrx_VNP02IMG.A2019036.2018.001.2019037030301.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/desert_rock/2019/03/30/clavrx_VNP02IMG.A2019089.2024.001.2019090015614.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/bondville_il/2019/11/03/clavrx_VNP02IMG.A2019307.1854.001.2019308001716.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/goodwin_creek/2019/04/15/clavrx_VNP02IMG.A2019105.1842.001.2019106001003.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/penn_state/2019/07/18/clavrx_VNP02IMG.A2019199.1742.001.2019199230925.uwssec.nc', + '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/penn_state/2019/02/02/clavrx_VNP02IMG.A2019033.1754.001.2019034011318.uwssec.nc'] + +keep_out = keep_out_opd + + +# target_param = 'cloud_probability' +target_param = 'cld_opd_dcomp' group_name_i = 'super/' group_name_m = 'orig/' solzen_name = group_name_m + 'solar_zenith' -label_params = [group_name_i+target_param] -data_params = [group_name_m+'temp_ch31', group_name_m+'refl_ch01', group_name_m+target_param] +# params_i = [group_name_i+'temp_11_0um', group_name_i+'refl_0_65um', group_name_i+target_param] +# params_m = [group_name_m+'temp_11_0um', group_name_m+'refl_0_65um', group_name_m+target_param] +params_i = [group_name_i+'temp_ch38', group_name_i+'refl_ch01', group_name_i+target_param] +params_m = [group_name_m+'temp_ch38', group_name_m+'refl_ch01', group_name_m+target_param] + +param_idx_m = params_m.index(group_name_m + target_param) +param_idx_i = params_i.index(group_name_i + target_param) + +def is_missing(p_idx, tile): + keep = np.invert(np.isnan(tile[p_idx, ])) + if np.sum(keep) / keep.size < 0.98: + return True -def keep_tile(param, param_s, tile): - k = param_s.index(param) - grd_k = tile[k, ].copy() + +def keep_tile(p_idx, tile): + grd_k = tile[p_idx, ].copy() if target_param == 'cloud_probability': - grd_k = process_cld_prob_(grd_k) + grd_k = process_cld_prob(grd_k) elif target_param == 'cld_opd_dcomp': - grd_k = process_cld_opd_(grd_k) + grd_k = process_cld_opd(grd_k) if grd_k is not None: - tile[k, ] = grd_k + tile[p_idx, ] = grd_k return tile else: return None -def process_cld_prob_(grd_k): +def process_cld_prob(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) - # if num_keep / grd_k.size < 0.98: - # return None - keep = np.where(keep, np.logical_and(0.10 < grd_k, grd_k < 0.90), False) - if np.sum(keep)/num_keep < 0.25: + keep_clr = np.where(keep, grd_k < 0.30, False) + keep_cld = np.where(keep, grd_k > 0.70, False) + frac_clr = np.sum(keep_clr)/num_keep + frac_cld = np.sum(keep_cld)/num_keep + if not (frac_clr >= 0.20 and frac_cld >= 0.20): return None - grd_k = np.where(np.invert(keep), 0, grd_k) + grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 return grd_k -def process_cld_opd_(grd_k): +def process_cld_opd(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) - # if num_keep / grd_k.size < 0.98: - # return None - grd_k = np.where(np.invert(keep), 0, grd_k) - keep = np.where(keep, np.logical_and(0.1 < grd_k, grd_k < 158.0), False) - if np.sum(keep)/num_keep < 0.50: + keep_cld = np.where(keep, np.logical_and(0.1 < grd_k, grd_k < 158.0), False) + frac_cld = np.sum(keep_cld)/num_keep + if not (0.10 < frac_cld < 0.90): return None + grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 return grd_k -def run_all(directory, out_directory, pattern='clavrx_*.nc', start=10): +def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10): cnt = start - total_num_samples = 0 + total_num_train_samples = 0 + total_num_valid_samples = 0 + num_keep_x_tiles = 4 path = directory + '**' + '/' + pattern - files = glob.glob(path, recursive=True) + all_files = glob.glob(path, recursive=True) + data_files = [f for f in all_files if f not in keep_out] + # data_files = glob.glob(path, recursive=True) - label_tiles = [] - data_tiles = [] + valid_tiles_i = [] + train_tiles_i = [] + valid_tiles_m = [] + train_tiles_m = [] f_cnt = 0 - num_files = len(files) + num_files = len(data_files) print('Start, number of files: ', num_files) - for idx, f in enumerate(files): + total_num_not_missing = 0 + + for idx, data_f in enumerate(data_files): # if idx % 4 == 0: # if we want to skip some files if True: try: - h5f = h5py.File(f, 'r') + h5f = h5py.File(data_f, 'r') except: - print('cant open file: ', f) + print('cant open file: ', data_f) continue try: - run(h5f, data_params, data_tiles, label_params, label_tiles, kernel_size=5) + num_not_missing = run(h5f, params_m, train_tiles_m, valid_tiles_m, + params_i, train_tiles_i, valid_tiles_i, + num_keep_x_tiles=num_keep_x_tiles, tile_width=16, kernel_size=4, factor=4, day_night=day_night) except Exception as e: print(e) h5f.close() continue - - print(f) + print(data_f) f_cnt += 1 h5f.close() - if len(data_tiles) == 0: - continue - - if (f_cnt % 100) == 0: - num_samples = 0 - if len(data_tiles) > 0: - label = np.stack(label_tiles) - data = np.stack(data_tiles) - #np.save(out_directory + 'label_' + str(cnt), label) - #np.save(out_directory + 'data_' + str(cnt), data) - num_samples = data.shape[0] + total_num_not_missing += num_not_missing - label_tiles = [] - data_tiles = [] + if len(train_tiles_m) == 0 and len(valid_tiles_m) == 0: + continue - print(' num_samples, progress % : ', num_samples, int((f_cnt/num_files)*100)) - total_num_samples += num_samples - print('total_num_samples: ', total_num_samples) - print('------------------------------------------------------------') + if (f_cnt % 20) == 0: + num_valid_samples = 0 + if len(valid_tiles_m) > 0: + valid_i = np.stack(valid_tiles_i) + valid_m = np.stack(valid_tiles_m) + np.save(out_directory + 'valid_mres_' + str(cnt), valid_m) + np.save(out_directory + 'valid_ires_' + str(cnt), valid_i) + num_valid_samples = valid_m.shape[0] + + num_train_samples = 0 + if len(train_tiles_m) > 0: + train_i = np.stack(train_tiles_i) + train_m = np.stack(train_tiles_m) + np.save(out_directory + 'train_ires_' + str(cnt), train_i) + np.save(out_directory + 'train_mres_' + str(cnt), train_m) + num_train_samples = train_m.shape[0] + + valid_tiles_i = [] + train_tiles_i = [] + valid_tiles_m = [] + train_tiles_m = [] + + print(' num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100)) + total_num_train_samples += num_train_samples + total_num_valid_samples += num_valid_samples + print('total_num_train_samples, total_num_valid_samples, total_num_not_missing: ', total_num_train_samples, + total_num_valid_samples, total_num_not_missing) + print('--------------------------------------------------') cnt += 1 - print('** total_num_samples: ', total_num_samples) + # Write out leftover, if any. Maybe make this better someday + num_valid_samples = 0 + if len(valid_tiles_m) > 0: + valid_i = np.stack(valid_tiles_i) + valid_m = np.stack(valid_tiles_m) + np.save(out_directory + 'valid_mres_' + str(cnt), valid_m) + np.save(out_directory + 'valid_ires_' + str(cnt), valid_i) + num_valid_samples = valid_m.shape[0] + + num_train_samples = 0 + if len(train_tiles_m) > 0: + train_i = np.stack(train_tiles_i) + train_m = np.stack(train_tiles_m) + np.save(out_directory + 'train_ires_' + str(cnt), train_i) + np.save(out_directory + 'train_mres_' + str(cnt), train_m) + num_train_samples = train_m.shape[0] + + print(' num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, + int((f_cnt / num_files) * 100)) + total_num_train_samples += num_train_samples + total_num_valid_samples += num_valid_samples + print('total_num_train_samples, total_num_valid_samples, total_num_not_missing: ', total_num_train_samples, + total_num_valid_samples, total_num_not_missing) + print('--------------------------------------------------') + + print('** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples) # tile_width: Must be even! # kernel_size: Must be odd! -def run(h5f, param_s, tiles, lbl_param_s, lbl_tiles, kernel_size=3): +def run(h5f, params_m, train_tiles_m, valid_tiles_m, params_i, train_tiles_i, valid_tiles_i, + num_keep_x_tiles=8, tile_width=64, kernel_size=3, factor=2, day_night='ANY'): border = int((kernel_size - 1)/2) + 1 # Need to add for interpolation with no edge effects - param_name = param_s[0] + param_name = params_m[0] num_lines = h5f[param_name].shape[0] num_pixels = h5f[param_name].shape[1] # Must be even + if day_night != 'ANY': + solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels) + grd_s = [] - for param in param_s: + for param in params_m: try: grd = get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels) grd_s.append(grd) except Exception as e: print(e) return - data = np.stack(grd_s) + data_m = np.stack(grd_s) grd_s = [] - for param in lbl_param_s: + for param in params_i: try: - grd = get_grid_values(h5f, param, 0, 0, None, num_lines*2, num_pixels*2) + grd = get_grid_values(h5f, param, 0, 0, None, num_lines*factor, num_pixels*factor) grd_s.append(grd) except Exception as e: print(e) return - label = np.stack(grd_s) + data_i = np.stack(grd_s) + + tile_width += 2 * border + + i_skip = tile_width + j_skip = tile_width + # i_start = int(num_pixels / 2) - int((num_keep_x_tiles * tile_width) / 2) + # j_start = 0 + i_start = border - 1 # zero-based + j_start = border - 1 # zero-based + + num_y_tiles = int(num_lines / tile_width) - 1 + + data_tiles_m = [] + data_tiles_i = [] + num_not_missing = 0 + + for j in range(num_y_tiles): + j_a = j_start + j * j_skip + j_b = j_a + tile_width + + for i in range(num_keep_x_tiles): + i_a = i_start + i * i_skip + i_b = i_a + tile_width + + if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]): + continue + elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]): + continue + + nda_m = data_m[:, j_a:j_b, i_a:i_b] + nda_i = data_i[:, j_a*factor:j_b*factor, i_a*factor:i_b*factor] + if is_missing(param_idx_i, nda_i): + continue + num_not_missing += 1 - nda = data[:, :, :] - nda = keep_tile(group_name_m + target_param, param_s, nda) - if nda is None: # if none, no need to check the next one - return + nda_i = keep_tile(param_idx_i, nda_i) + if nda_i is not None: + data_tiles_m.append(nda_m) + data_tiles_i.append(nda_i) - nda_lbl = label[:, :, :] - nda_lbl = keep_tile(group_name_i + target_param, lbl_param_s, nda_lbl) + num_tiles = len(data_tiles_i) + num_valid = int(num_tiles * 0.10) + num_train = num_tiles - num_valid - if nda_lbl is not None: - tiles.append(nda) - lbl_tiles.append(nda_lbl) + for k in range(num_train): + train_tiles_m.append(data_tiles_m[k]) + train_tiles_i.append(data_tiles_i[k]) + for k in range(num_valid): + valid_tiles_m.append(data_tiles_m[num_train + k]) + valid_tiles_i.append(data_tiles_i[num_train + k]) + return num_not_missing -- GitLab