diff --git a/modules/util/abi_surfrad_opd.py b/modules/util/abi_surfrad_opd.py new file mode 100644 index 0000000000000000000000000000000000000000..3583f9ad8d9d130c03273b9e1cf1b6bf5e4262b5 --- /dev/null +++ b/modules/util/abi_surfrad_opd.py @@ -0,0 +1,373 @@ +import numpy as np +import h5py +from util.util import get_grid_values, is_day +import glob + +# target_param = 'cloud_probability' +# target_param = 'cld_opd_dcomp' +target_param = 'cld_opd_dcomp_1' +# target_param = 'cld_opd_dcomp_2' +# target_param = 'cld_opd_dcomp_3' + +group_name_i = 'super/' +group_name_m = 'orig/' + +solzen_name = group_name_m + 'solar_zenith' +snow_class_name = group_name_m + 'snow_class' + +params_i = [group_name_i+'temp_ch38', group_name_i+'refl_ch01', group_name_i+'cloud_probability', + group_name_i+target_param] + +params_m = [group_name_m+'temp_ch38', group_name_m+'refl_ch01', + group_name_m+'refl_submin_ch01', group_name_m+'refl_submax_ch01', group_name_m+'refl_substddev_ch01', + group_name_m+'cloud_probability', + group_name_m+target_param] + +param_idx_m = params_m.index(group_name_m + target_param) +param_idx_i = params_i.index(group_name_i + target_param) +cp_idx_i = params_i.index(group_name_i + 'cloud_probability') + +DO_WRITE_OUTFILE = True + + +def snow_covered(tile): + return np.any(tile > 1) + + +def is_missing(p_idx, tile): + keep = np.invert(np.isnan(tile[p_idx, ])) + if np.sum(keep) / keep.size < 0.98: + return True + + +def keep_tile(cp_idx, opd_idx, tile): + opd = tile[opd_idx, ].copy() + cp = tile[cp_idx, ].copy() + + cp = process_cld_prob(cp) + if cp is None: + return None + + opd = process_cld_opd(opd) + if opd is not None: + tile[opd_idx, ] = opd + return tile + else: + return None + + +def process_cld_prob(cld_prob): + keep = np.invert(np.isnan(cld_prob)) + num_keep = np.sum(keep) + keep = np.where(keep, np.logical_or(cld_prob < 0.35, cld_prob > 0.65), False) + frac_keep = np.sum(keep)/num_keep + if not (frac_keep > 0.70): + return None + grd_k = np.where(np.invert(keep), 0, cld_prob) # Convert NaN to 0 + return grd_k + + +def process_cloud_frac(grd_k): + pass + + +def get_cloud_frac_5cat(grd_k): + grd_k = np.where(np.isnan(grd_k), 0, grd_k) + grd_k = np.where(grd_k < 0.5, 0, 1) + + s = grd_k[:, 0::4, 0::4] + grd_k[:, 1::4, 0::4] + grd_k[:, 2::4, 0::4] + grd_k[:, 3::4, 0::4] + \ + grd_k[:, 0::4, 1::4] + grd_k[:, 1::4, 1::4] + grd_k[:, 2::4, 1::4] + grd_k[:, 3::4, 1::4] + \ + grd_k[:, 0::4, 2::4] + grd_k[:, 1::4, 2::4] + grd_k[:, 2::4, 2::4] + grd_k[:, 3::4, 2::4] + \ + grd_k[:, 0::4, 3::4] + grd_k[:, 1::4, 3::4] + grd_k[:, 2::4, 3::4] + grd_k[:, 3::4, 3::4] + + cat_0 = np.logical_and(s >= 0, s < 2) + cat_1 = np.logical_and(s >= 2, s < 6) + cat_2 = np.logical_and(s >= 6, s < 11) + cat_3 = np.logical_and(s >= 11, s < 15) + cat_4 = np.logical_and(s >= 15, s <= 16) + + s[cat_0] = 0 + s[cat_1] = 1 + s[cat_2] = 2 + s[cat_3] = 3 + s[cat_4] = 4 + + return s + + +def process_cld_opd(grd_k): + keep = np.invert(np.isnan(grd_k)) + num_keep = np.sum(keep) + keep_cld = np.where(keep, np.logical_and(2.0 < grd_k, grd_k < 158.0), False) + # keep_cld = np.where(keep, 2.0 < grd_k, False) + frac_cld = np.sum(keep_cld)/num_keep + if not (0.20 < frac_cld < 0.90): + # if not (0.20 < frac_cld): + return None + grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 + return grd_k + + +def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10, is_snow_covered=None): + cnt = start + total_num_train_samples = 0 + total_num_valid_samples = 0 + + # path = directory + '**' + '/' + pattern + path = directory + '*_v3/2020/' + '**' + '/' + pattern + + all_files = glob.glob(path, recursive=True) + test_files = glob.glob(directory + '*_v3/2020/*/01/*/*.nc', recursive=True) + valid_files = glob.glob(directory + '*_v3/2020/*/0[2-6]/*/*.nc', recursive=True) + train_files = [f for f in all_files if f not in valid_files + test_files] + + data_tiles_i = [] + data_tiles_m = [] + f_cnt = 0 + + num_files = len(valid_files) + print('Start, number of valid files: ', num_files) + + total_num_not_missing = 0 + num_skip = 3 + + param_train_hist = np.zeros([16], dtype=np.int64) + param_valid_hist = np.zeros([16], dtype=np.int64) + + # cloud_prob to cloud fraction + # ---------------------------- + tile_width = 32 + kernel_size = 5 + factor = 4 + + # tile_width = 64 + # kernel_size = 7 + # factor = 4 + + # hist_range = [0.0, 1.0] + hist_range = [0.0, 160.0] + + for idx, data_f in enumerate(valid_files): + if idx % num_skip == 0: # if we want to skip some files + try: + h5f = h5py.File(data_f, 'r') + except: + print('cant open file: ', data_f) + continue + + try: + num_not_missing, num_snow_covered = \ + run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, + tile_width=tile_width, kernel_size=kernel_size, factor=factor, + day_night=day_night, is_snow_covered=is_snow_covered) + except Exception as e: + print(e) + h5f.close() + continue + print(data_f) + f_cnt += 1 + h5f.close() + + total_num_not_missing += num_not_missing + + if len(data_tiles_m) == 0: + continue + + if (f_cnt % 100) == 0: + num_valid_samples = 0 + if len(data_tiles_m) > 0: + valid_i = np.stack(data_tiles_i) + valid_m = np.stack(data_tiles_m) + if DO_WRITE_OUTFILE: + np.save(out_directory + 'valid_mres_' + f'{cnt:04d}', valid_m) + np.save(out_directory + 'valid_ires_' + f'{cnt:04d}', valid_i) + num_valid_samples = valid_m.shape[0] + + param_valid_hist += np.histogram(valid_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] + + data_tiles_i = [] + data_tiles_m = [] + + print(' num_valid_samples, progress % : ', num_valid_samples, int((f_cnt/(num_files/num_skip))*100)) + total_num_valid_samples += num_valid_samples + print('total_num_valid_samples, total_num_not_missing: ', total_num_valid_samples, total_num_not_missing) + print('--------------------------------------------------') + + cnt += 1 + + # Write out leftover, if any. Maybe make this better someday + num_valid_samples = 0 + if len(data_tiles_m) > 0: + valid_i = np.stack(data_tiles_i) + valid_m = np.stack(data_tiles_m) + if DO_WRITE_OUTFILE: + np.save(out_directory + 'valid_mres_' + f'{cnt:04d}', valid_m) + np.save(out_directory + 'valid_ires_' + f'{cnt:04d}', valid_i) + num_valid_samples = valid_m.shape[0] + param_valid_hist += np.histogram(valid_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] + total_num_valid_samples += num_valid_samples + print('total_num_valid_samples, total_num_not_missing: ', total_num_valid_samples, total_num_not_missing) + print(param_valid_hist) + print('--------------------------------------------------') + print('----------------------------------------------------------------') + + data_tiles_i = [] + data_tiles_m = [] + f_cnt = 0 + cnt = start + total_num_not_missing = 0 + num_files = len(train_files) + print('Start, number of train files: ', num_files) + + for idx, data_f in enumerate(train_files): + if idx % num_skip == 0: # if we want to skip some files + try: + h5f = h5py.File(data_f, 'r') + except: + print('cant open file: ', data_f) + continue + + try: + num_not_missing, num_snow_covered = \ + run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, + tile_width=tile_width, kernel_size=kernel_size, factor=factor, + day_night=day_night, is_snow_covered=is_snow_covered) + except Exception as e: + print(e) + h5f.close() + continue + print(data_f) + f_cnt += 1 + h5f.close() + + total_num_not_missing += num_not_missing + + if len(data_tiles_m) == 0: + continue + + if (f_cnt % 100) == 0: + num_train_samples = 0 + if len(data_tiles_m) > 0: + train_i = np.stack(data_tiles_i) + train_m = np.stack(data_tiles_m) + if DO_WRITE_OUTFILE: + np.save(out_directory + 'train_ires_' + f'{cnt:04d}', train_i) + np.save(out_directory + 'train_mres_' + f'{cnt:04d}', train_m) + num_train_samples = train_m.shape[0] + + param_train_hist += np.histogram(train_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] + + data_tiles_i = [] + data_tiles_m = [] + + print(' num_train_samples, progress % : ', num_train_samples, int((f_cnt/(num_files/num_skip))*100)) + total_num_train_samples += num_train_samples + print('total_num_train_samples, total_num_not_missing: ', total_num_train_samples, total_num_not_missing) + print('--------------------------------------------------') + + cnt += 1 + + # Write out leftover, if any. Maybe make this better someday + num_train_samples = 0 + if len(data_tiles_m) > 0: + train_i = np.stack(data_tiles_i) + train_m = np.stack(data_tiles_m) + if DO_WRITE_OUTFILE: + np.save(out_directory + 'train_ires_' + f'{cnt:04d}', train_i) + np.save(out_directory + 'train_mres_' + f'{cnt:04d}', train_m) + num_train_samples = train_m.shape[0] + param_train_hist += np.histogram(train_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] + total_num_train_samples += num_train_samples + print('total_num_train_samples, total_num_not_missing: ', total_num_train_samples, total_num_not_missing) + print(param_train_hist) + print('--------------------------------------------------') + + print('*** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples) + + +# tile_width: Must be even! +# kernel_size: Must be odd! +def run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, tile_width=64, kernel_size=3, factor=2, + day_night='ANY', is_snow_covered=None): + + border = int((kernel_size - 1)/2) + 1 # Need to add for interpolation with no edge effects + + param_name = params_m[0] + + num_lines = h5f[param_name].shape[0] + num_pixels = h5f[param_name].shape[1] # Must be even + + if day_night != 'ANY': + solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels) + + if is_snow_covered is not None: + snow = get_grid_values(h5f, snow_class_name, 0, 0, None, num_lines, num_pixels) + + grd_s = [] + for param in params_m: + try: + grd = get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels) + grd_s.append(grd) + except Exception as e: + print(e) + return + data_m = np.stack(grd_s) + + grd_s = [] + for param in params_i: + try: + grd = get_grid_values(h5f, param, 0, 0, None, num_lines*factor, num_pixels*factor) + grd_s.append(grd) + except Exception as e: + print(e) + return + data_i = np.stack(grd_s) + + tile_width += 2 * border + + i_skip = tile_width + j_skip = tile_width + i_start = border - 1 # zero-based + j_start = border - 1 # zero-based + + num_y_tiles = int(num_lines / tile_width) + num_x_tiles = int(num_pixels / tile_width) + + num_not_missing = 0 + num_snow_covered = 0 + + for j in range(num_y_tiles): + j_a = j_start + j * j_skip + j_b = j_a + tile_width + + for i in range(num_x_tiles): + i_a = i_start + i * i_skip + i_b = i_a + tile_width + + if is_snow_covered is not None: + if is_snow_covered: + if not snow_covered(snow[j_a:j_b, i_a:i_b]): + continue + num_snow_covered += 1 + else: + if snow_covered(snow[j_a:j_b, i_a:i_b]): + num_snow_covered += 1 + continue + + if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]): + continue + elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]): + continue + + nda_m = data_m[:, j_a:j_b, i_a:i_b] + nda_i = data_i[:, j_a*factor:j_b*factor, i_a*factor:i_b*factor] + if is_missing(param_idx_i, nda_i): + continue + num_not_missing += 1 + + nda_i = keep_tile(cp_idx_i, param_idx_i, nda_i) + if nda_i is not None: + data_tiles_m.append(nda_m) + data_tiles_i.append(nda_i) + + return num_not_missing, num_snow_covered