import numpy as np import h5py from util.util import get_grid_values, is_day import glob target_param = 'cloud_probability' # target_param = 'cld_opd_dcomp' # target_param = 'cld_opd_dcomp_1' # target_param = 'cld_opd_dcomp_2' # target_param = 'cld_opd_dcomp_3' group_name_i = 'super/' group_name_m = 'orig/' solzen_name = group_name_m + 'solar_zenith' snow_class_name = group_name_m + 'snow_class' params_i = [group_name_i+'temp_ch38', group_name_i+'refl_ch01', group_name_i+target_param] params_m = [group_name_m+'temp_ch38', group_name_m+'refl_ch01', group_name_m+'refl_submin_ch01', group_name_m+'refl_submax_ch01', group_name_m+'refl_substddev_ch01', group_name_m+target_param, group_name_m+'cloud_fraction'] param_idx_m = params_m.index(group_name_m + target_param) param_idx_i = params_i.index(group_name_i + target_param) hist_range = [0.0, 1.0] # hist_range = [0.0, 160.0] tile_width = 32 kernel_size = 5 factor = 4 # tile_width = 64 # kernel_size = 7 # factor = 4 num_skip = 2 DO_WRITE_OUTFILE = True def snow_covered(tile): return np.any(tile > 1) def is_missing(p_idx, tile): keep = np.invert(np.isnan(tile[p_idx, ])) if np.sum(keep) / keep.size < 0.98: return True def keep_tile(p_idx, tile): grd_k = tile[p_idx, ].copy() if target_param == 'cloud_probability': # grd_k = process_cld_prob(grd_k) grd_k = process_cloud_frac(grd_k) elif 'cld_opd_dcomp' in target_param: grd_k = process_cld_opd(grd_k) if grd_k is not None: tile[p_idx, ] = grd_k return tile else: return None def process_cld_prob(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) # keep_clr = np.where(keep, grd_k < 0.30, False) keep_cld = np.where(keep, grd_k > 0.70, False) # frac_clr = np.sum(keep_clr)/num_keep frac_cld = np.sum(keep_cld)/num_keep if not (0.30 <= frac_cld <= 0.90): return None grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 return grd_k def process_cloud_frac(grd_k): keep = np.invert(np.isnan(grd_k)) grd_by_cat = get_cloud_frac_5cat(grd_k) keep_mix = np.logical_and(grd_by_cat > 0, grd_by_cat < 4) frac_mix = np.sum(keep_mix)/keep_mix.size if frac_mix < 0.13: # at least 13% mixed to pass return None grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 return grd_k def get_cloud_frac_5cat(grd_k): grd_k = grd_k.copy() grd_k = np.where(np.isnan(grd_k), 0, grd_k) grd_k = np.where(grd_k < 0.5, 0, 1) s = grd_k[0::4, 0::4] + grd_k[1::4, 0::4] + grd_k[2::4, 0::4] + grd_k[3::4, 0::4] + \ grd_k[0::4, 1::4] + grd_k[1::4, 1::4] + grd_k[2::4, 1::4] + grd_k[3::4, 1::4] + \ grd_k[0::4, 2::4] + grd_k[1::4, 2::4] + grd_k[2::4, 2::4] + grd_k[3::4, 2::4] + \ grd_k[0::4, 3::4] + grd_k[1::4, 3::4] + grd_k[2::4, 3::4] + grd_k[3::4, 3::4] cat_0 = np.logical_and(s >= 0, s < 1) cat_1 = np.logical_and(s >= 1, s < 6) cat_2 = np.logical_and(s >= 6, s < 11) cat_3 = np.logical_and(s >= 11, s <= 15) cat_4 = np.logical_and(s > 15, s <= 16) s[cat_0] = 0 s[cat_1] = 1 s[cat_2] = 2 s[cat_3] = 3 s[cat_4] = 4 return s def process_cld_opd(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) keep_cld = np.where(keep, np.logical_and(2.0 < grd_k, grd_k < 158.0), False) # keep_cld = np.where(keep, 2.0 < grd_k, False) frac_cld = np.sum(keep_cld)/num_keep if not (0.20 < frac_cld < 0.90): # if not (0.20 < frac_cld): return None grd_k = np.where(np.invert(keep), 0, grd_k) # Convert NaN to 0 return grd_k def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10, is_snow_covered=None): cnt = start total_num_train_samples = 0 total_num_valid_samples = 0 # path = directory + '**' + '/' + pattern path = directory + '*_v3/202?/' + '**' + '/' + pattern all_files = glob.glob(path, recursive=True) test_files = glob.glob(directory + '*_v3/202?/*/01/*/*.nc', recursive=True) valid_files = glob.glob(directory + '*_v3/202?/*/0[2-6]/*/*.nc', recursive=True) train_files = [f for f in all_files if f not in valid_files + test_files] data_tiles_i = [] data_tiles_m = [] f_cnt = 0 num_files = len(valid_files) print('Start, number of valid files: ', num_files) total_num_not_missing = 0 param_train_hist = np.zeros([16], dtype=np.int64) param_valid_hist = np.zeros([16], dtype=np.int64) for idx, data_f in enumerate(valid_files): if idx % num_skip == 0: # if we want to skip some files try: h5f = h5py.File(data_f, 'r') except: print('cant open file: ', data_f) continue try: num_not_missing, num_snow_covered = \ run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, tile_width=tile_width, kernel_size=kernel_size, factor=factor, day_night=day_night, is_snow_covered=is_snow_covered) except Exception as e: print(e) h5f.close() continue print(data_f) f_cnt += 1 h5f.close() total_num_not_missing += num_not_missing if len(data_tiles_m) == 0: continue if (f_cnt % 100) == 0: num_valid_samples = 0 if len(data_tiles_m) > 0: valid_i = np.stack(data_tiles_i) valid_m = np.stack(data_tiles_m) if valid_m.shape[0] != valid_i.shape[0]: print('problem: number of samples dont match', valid_m.shape, valid_i.shape) continue if DO_WRITE_OUTFILE: np.save(out_directory + 'valid_mres_' + f'{cnt:04d}', valid_m) np.save(out_directory + 'valid_ires_' + f'{cnt:04d}', valid_i) num_valid_samples = valid_m.shape[0] param_valid_hist += np.histogram(valid_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] data_tiles_i = [] data_tiles_m = [] print(' num_valid_samples, progress % : ', num_valid_samples, int((f_cnt/(num_files/num_skip))*100)) total_num_valid_samples += num_valid_samples print('total_num_valid_samples, total_num_not_missing: ', total_num_valid_samples, total_num_not_missing) print('--------------------------------------------------') cnt += 1 # Write out leftover, if any. Maybe make this better someday num_valid_samples = 0 if len(data_tiles_m) > 0: valid_i = np.stack(data_tiles_i) valid_m = np.stack(data_tiles_m) if valid_m.shape[0] != valid_i.shape[0]: print('problem: number of samples dont match', valid_m.shape, valid_i.shape) elif DO_WRITE_OUTFILE: np.save(out_directory + 'valid_mres_' + f'{cnt:04d}', valid_m) np.save(out_directory + 'valid_ires_' + f'{cnt:04d}', valid_i) num_valid_samples = valid_m.shape[0] param_valid_hist += np.histogram(valid_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] total_num_valid_samples += num_valid_samples print('total_num_valid_samples, total_num_not_missing: ', total_num_valid_samples, total_num_not_missing) print(param_valid_hist) print('--------------------------------------------------') print('----------------------------------------------------------------') data_tiles_i = [] data_tiles_m = [] f_cnt = 0 cnt = start total_num_not_missing = 0 num_files = len(train_files) print('Start, number of train files: ', num_files) for idx, data_f in enumerate(train_files): if idx % num_skip == 0: # if we want to skip some files try: h5f = h5py.File(data_f, 'r') except: print('cant open file: ', data_f) continue try: num_not_missing, num_snow_covered = \ run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, tile_width=tile_width, kernel_size=kernel_size, factor=factor, day_night=day_night, is_snow_covered=is_snow_covered) except Exception as e: print(e) h5f.close() continue print(data_f) f_cnt += 1 h5f.close() total_num_not_missing += num_not_missing if len(data_tiles_m) == 0: continue if (f_cnt % 100) == 0: num_train_samples = 0 if len(data_tiles_m) > 0: train_i = np.stack(data_tiles_i) train_m = np.stack(data_tiles_m) if DO_WRITE_OUTFILE: np.save(out_directory + 'train_ires_' + f'{cnt:04d}', train_i) np.save(out_directory + 'train_mres_' + f'{cnt:04d}', train_m) num_train_samples = train_m.shape[0] param_train_hist += np.histogram(train_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] data_tiles_i = [] data_tiles_m = [] print(' num_train_samples, progress % : ', num_train_samples, int((f_cnt/(num_files/num_skip))*100)) total_num_train_samples += num_train_samples print('total_num_train_samples, total_num_not_missing: ', total_num_train_samples, total_num_not_missing) print('--------------------------------------------------') cnt += 1 # Write out leftover, if any. Maybe make this better someday num_train_samples = 0 if len(data_tiles_m) > 0: train_i = np.stack(data_tiles_i) train_m = np.stack(data_tiles_m) if DO_WRITE_OUTFILE: np.save(out_directory + 'train_ires_' + f'{cnt:04d}', train_i) np.save(out_directory + 'train_mres_' + f'{cnt:04d}', train_m) num_train_samples = train_m.shape[0] param_train_hist += np.histogram(train_m[:, param_idx_m, :, :], bins=16, range=hist_range)[0] total_num_train_samples += num_train_samples print('total_num_train_samples, total_num_not_missing: ', total_num_train_samples, total_num_not_missing) print(param_train_hist) print('--------------------------------------------------') print('*** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples) # tile_width: Must be even! # kernel_size: Must be odd! def run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, tile_width=64, kernel_size=3, factor=2, day_night='ANY', is_snow_covered=None): border = int((kernel_size - 1)/2) + 1 # Need to add for interpolation with no edge effects param_name = params_m[0] num_lines = h5f[param_name].shape[0] num_pixels = h5f[param_name].shape[1] # Must be even if day_night != 'ANY': solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels) if is_snow_covered is not None: snow = get_grid_values(h5f, snow_class_name, 0, 0, None, num_lines, num_pixels) grd_s = [] for param in params_m: try: grd = get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels) grd_s.append(grd) except Exception as e: print(e) return data_m = np.stack(grd_s) grd_s = [] for param in params_i: try: grd = get_grid_values(h5f, param, 0, 0, None, num_lines*factor, num_pixels*factor) grd_s.append(grd) except Exception as e: print(e) return data_i = np.stack(grd_s) tile_width += 2 * border i_skip = tile_width j_skip = tile_width i_start = border - 1 # zero-based j_start = border - 1 # zero-based num_y_tiles = int(num_lines / tile_width) num_x_tiles = int(num_pixels / tile_width) num_not_missing = 0 num_snow_covered = 0 for j in range(num_y_tiles): j_a = j_start + j * j_skip j_b = j_a + tile_width for i in range(num_x_tiles): i_a = i_start + i * i_skip i_b = i_a + tile_width if is_snow_covered is not None: if is_snow_covered: if not snow_covered(snow[j_a:j_b, i_a:i_b]): continue num_snow_covered += 1 else: if snow_covered(snow[j_a:j_b, i_a:i_b]): num_snow_covered += 1 continue if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]): continue elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]): continue nda_m = data_m[:, j_a:j_b, i_a:i_b] nda_i = data_i[:, j_a*factor:j_b*factor, i_a*factor:i_b*factor] if is_missing(param_idx_i, nda_i): continue num_not_missing += 1 nda_i = keep_tile(param_idx_i, nda_i) if nda_i is not None: data_tiles_m.append(nda_m) data_tiles_i.append(nda_i) return num_not_missing, num_snow_covered