import glob
import os
from pathlib import Path

import h5py
import numpy as np

from aeolus.datasource import CLAVRx_VIIRS
from icing.moon_phase import *
from util.util import get_grid_values, get_grid_values_all, is_night, is_day, compute_lwc_iwc, get_fill_attrs

# Granule files excluded from processing (compared by exact path string against the glob results).
keep_out_opd = [
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/arm/2019/11/02/clavrx_VNP02IMG.A2019306.1912.001.2019307003236.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/arm/2019/04/13/clavrx_VNP02IMG.A2019103.1918.001.2019104005120.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/05/25/clavrx_VNP02IMG.A2019145.1936.001.2019146005424.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/11/01/clavrx_VNP02IMG.A2019305.1936.001.2019306005913.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/sioux_falls/2019/03/01/clavrx_VNP02IMG.A2019060.1930.001.2019061005942.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/table_mountain/2019/12/01/clavrx_VNP02IMG.A2019335.2012.001.2019336013827.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/table_mountain/2019/05/18/clavrx_VNP02IMG.A2019138.2006.001.2019139013059.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/fort_peck/2019/01/28/clavrx_VNP02IMG.A2019028.1930.001.2019029005408.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/fort_peck/2019/08/08/clavrx_VNP02IMG.A2019220.1930.001.2019221010714.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/10/13/clavrx_VNP02IMG.A2019286.1848.001.2019287001722.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/03/20/clavrx_VNP02IMG.A2019079.1830.001.2019079235918.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/madison/2019/12/26/clavrx_VNP02IMG.A2019360.1900.001.2019361001327.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/desert_rock/2019/02/05/clavrx_VNP02IMG.A2019036.2018.001.2019037030301.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/desert_rock/2019/03/30/clavrx_VNP02IMG.A2019089.2024.001.2019090015614.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/bondville_il/2019/11/03/clavrx_VNP02IMG.A2019307.1854.001.2019308001716.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/goodwin_creek/2019/04/15/clavrx_VNP02IMG.A2019105.1842.001.2019106001003.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/penn_state/2019/07/18/clavrx_VNP02IMG.A2019199.1742.001.2019199230925.uwssec.nc',
    '/ships19/cloud/scratch/cphillips/clavrx/run_viirs_superres/sites_super_l2/penn_state/2019/02/02/clavrx_VNP02IMG.A2019033.1754.001.2019034011318.uwssec.nc',
]

keep_out = keep_out_opd

# Parameter used both as the label and as the tile acceptance criterion.
target_param = 'cloud_probability'
# target_param = 'cld_opd_dcomp'

# HDF5 group prefixes; labels are read from 'super/' and inputs from 'orig/'.
group_name_i = 'super/'
group_name_m = 'orig/'

solzen_name = group_name_m + 'solar_zenith'

label_params = [group_name_i+target_param]
data_params = [group_name_m+'temp_11_0um', group_name_m+'refl_0_65um', group_name_m+target_param]

# Index of the target parameter within the stacked data / label arrays.
param_idx_m = data_params.index(group_name_m + target_param)
param_idx_i = label_params.index(group_name_i + target_param)


def is_missing(p_idx, tile):
    """Return True when fewer than 98% of the values in ``tile[p_idx]`` are non-NaN.

    An empty tile is reported as missing rather than dividing by zero.
    """
    valid = np.invert(np.isnan(tile[p_idx, ]))
    if valid.size == 0:
        return True
    return bool(np.sum(valid) / valid.size < 0.98)


def keep_tile(p_idx, tile):
    """Screen ``tile[p_idx]`` with the filter that matches ``target_param``.

    On acceptance the filtered grid (NaN replaced by 0) is written back into
    ``tile`` in place and ``tile`` is returned; on rejection ``None`` is
    returned and ``tile`` is left untouched.
    """
    grd_k = tile[p_idx, ].copy()

    if target_param == 'cloud_probability':
        grd_k = process_cld_prob(grd_k)
    elif target_param == 'cld_opd_dcomp':
        grd_k = process_cld_opd(grd_k)

    if grd_k is None:
        return None

    tile[p_idx, ] = grd_k
    return tile


def process_cld_prob(grd_k):
    """Accept a cloud-probability grid only when it is a clear/cloudy mix.

    The fraction of non-NaN pixels with probability below 0.20 must lie
    strictly between 0.30 and 0.70.  Returns the grid with NaN replaced by 0,
    or ``None`` when the grid is rejected (including when it has no valid
    pixels at all).
    """
    valid = np.invert(np.isnan(grd_k))
    num_valid = np.sum(valid)
    if num_valid == 0:
        # No valid pixels: the fraction is undefined, so reject explicitly.
        return None

    clear = np.where(valid, grd_k < 0.20, False)
    frac_clear = np.sum(clear) / num_valid
    if not (0.30 < frac_clear < 0.70):
        return None

    return np.where(valid, grd_k, 0)  # Convert NaN to 0


def process_cld_opd(grd_k):
    """Accept an optical-depth grid only when enough pixels are in range.

    At least half of the non-NaN pixels must satisfy ``0.1 < value < 158.0``.
    Returns the grid with NaN replaced by 0, or ``None`` when the grid is
    rejected (including when it has no valid pixels at all).
    """
    valid = np.invert(np.isnan(grd_k))
    num_valid = np.sum(valid)
    if num_valid == 0:
        # Without this guard 0/0 gives NaN, `nan < 0.50` is False and an
        # all-fill tile would be accepted.
        return None

    grd_k = np.where(valid, grd_k, 0)
    in_range = np.where(valid, np.logical_and(0.1 < grd_k, grd_k < 158.0), False)
    if np.sum(in_range) / num_valid < 0.50:
        return None

    return grd_k


def _save_batch(out_directory, cnt, data_train_tiles, label_train_tiles, data_valid_tiles, label_valid_tiles):
    """Stack the accumulated tiles, write them as .npy files and return (num_train, num_valid)."""
    num_valid_samples = 0
    if data_valid_tiles:
        label_valid = np.stack(label_valid_tiles)
        data_valid = np.stack(data_valid_tiles)
        np.save(out_directory + 'data_valid_' + str(cnt), data_valid)
        np.save(out_directory + 'label_valid_' + str(cnt), label_valid)
        num_valid_samples = data_valid.shape[0]

    num_train_samples = 0
    if data_train_tiles:
        label_train = np.stack(label_train_tiles)
        data_train = np.stack(data_train_tiles)
        np.save(out_directory + 'label_train_' + str(cnt), label_train)
        np.save(out_directory + 'data_train_' + str(cnt), data_train)
        num_train_samples = data_train.shape[0]

    return num_train_samples, num_valid_samples


def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10):
    """Extract training/validation tiles from every matching file under ``directory``.

    Files are found with a recursive glob of ``directory + '**/' + pattern``
    (so ``directory`` is expected to end with a path separator), excluding the
    paths listed in ``keep_out``.  Tiles are accumulated via ``run`` and
    written to ``out_directory`` (also concatenated as-is with the file name
    prefix) as ``data_train_<n>``, ``label_train_<n>``, ``data_valid_<n>`` and
    ``label_valid_<n>`` .npy files.  A batch is written after every fifth
    successfully processed file, and whatever remains is written once the loop
    ends so no tiles are dropped.

    Args:
        directory: Root directory prefix to search.
        out_directory: Output path prefix for the .npy files.
        day_night: 'ANY', 'DAY' or 'NIGHT'; forwarded to ``run``.
        pattern: Glob pattern for the input file names.
        start: Number used for the first output batch; incremented per batch.
    """
    cnt = start
    total_num_train_samples = 0
    total_num_valid_samples = 0
    total_num_not_missing = 0
    num_keep_x_tiles = 14

    path = directory + '**' + '/' + pattern

    # Set membership keeps the exclusion test O(1) per file.
    excluded = set(keep_out)
    data_files = [f for f in glob.glob(path, recursive=True) if f not in excluded]

    label_valid_tiles = []
    label_train_tiles = []
    data_valid_tiles = []
    data_train_tiles = []
    f_cnt = 0

    num_files = len(data_files)
    print('Start, number of files: ', num_files)

    def flush_batch():
        """Write the accumulated tiles, reset the accumulators and report progress."""
        nonlocal cnt, total_num_train_samples, total_num_valid_samples

        num_train_samples, num_valid_samples = _save_batch(
            out_directory, cnt, data_train_tiles, label_train_tiles, data_valid_tiles, label_valid_tiles)

        for tiles in (label_valid_tiles, label_train_tiles, data_valid_tiles, data_train_tiles):
            tiles.clear()

        print(' num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100))
        total_num_train_samples += num_train_samples
        total_num_valid_samples += num_valid_samples
        print('total_num_train_samples, total_num_valid_samples, total_num_not_missing: ', total_num_train_samples, total_num_valid_samples, total_num_not_missing)
        print('--------------------------------------------------')

        cnt += 1

    for data_f in data_files:
        try:
            h5f = h5py.File(data_f, 'r')
        except Exception:
            # Best effort: skip unreadable files, but let KeyboardInterrupt/SystemExit through.
            print('cant open file: ', data_f)
            continue

        try:
            with h5f:  # guarantees the file is closed on every path
                num_not_missing = run(h5f, data_params, data_train_tiles, data_valid_tiles,
                                      label_params, label_train_tiles, label_valid_tiles,
                                      num_keep_x_tiles=num_keep_x_tiles, tile_width=64, kernel_size=7,
                                      day_night=day_night)
        except Exception as e:
            print(e)
            continue

        print(data_f)
        f_cnt += 1
        # run() may return None when a parameter could not be read; count that
        # as zero tiles instead of raising TypeError and aborting the batch.
        total_num_not_missing += num_not_missing or 0

        if not data_train_tiles and not data_valid_tiles:
            continue

        if (f_cnt % 5) == 0:
            flush_batch()

    # Write tiles gathered since the last multiple-of-5 flush.
    if data_train_tiles or data_valid_tiles:
        flush_batch()

    print('** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)


# tile_width: Must be even!
# kernel_size: Must be odd!
def _read_grids(h5f, names, num_lines, num_pixels):
    """Read each named grid with get_grid_values and stack them; return None if any read fails."""
    grids = []
    for name in names:
        try:
            grids.append(get_grid_values(h5f, name, 0, 0, None, num_lines, num_pixels))
        except Exception as e:
            print(e)
            return None
    return np.stack(grids)


def run(h5f, param_s, train_tiles, valid_tiles, lbl_param_s, lbl_train_tiles, lbl_valid_tiles,
        num_keep_x_tiles=8, tile_width=64, kernel_size=3, day_night='ANY'):
    """Cut one open granule into tiles and append them to the caller's lists.

    Data grids are read at the size of ``h5f[param_s[0]]``; label grids are
    read and sliced at twice that size in both dimensions.  Tiles are taken
    from a band of ``num_keep_x_tiles`` columns centred across the pixel
    dimension, over ``int(num_lines / tile_width) - 1`` rows.  A tile is kept
    when it passes the optional day/night test, ``is_missing`` and
    ``keep_tile``.  The first 90% of the kept tiles are appended to the train
    lists and the remainder to the valid lists.

    Args:
        h5f: Open file object indexable by parameter name.
        param_s: Names of the data parameters to stack.
        train_tiles, valid_tiles: Lists extended in place with data tiles.
        lbl_param_s: Names of the label parameters to stack.
        lbl_train_tiles, lbl_valid_tiles: Lists extended in place with label tiles.
        num_keep_x_tiles: Number of tile columns to keep per row.
        tile_width: Tile width before the border is added. Must be even.
        kernel_size: Kernel size used to derive the border. Must be odd.
        day_night: 'ANY', 'DAY' or 'NIGHT'.

    Returns:
        The number of tiles that passed the missing-data check, or 0 when a
        parameter could not be read (the error is printed and no tiles are
        appended).
    """
    border = int((kernel_size - 1)/2) + 1  # Need to add for interpolation with no edge effects

    num_lines, num_pixels = h5f[param_s[0]].shape[:2]  # num_pixels must be even

    if day_night != 'ANY':
        solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels)

    data = _read_grids(h5f, param_s, num_lines, num_pixels)
    if data is None:
        # Return an int so callers that accumulate the count do not hit a TypeError.
        return 0

    label = _read_grids(h5f, lbl_param_s, num_lines*2, num_pixels*2)
    if label is None:
        return 0

    tile_width += 2 * border

    i_skip = tile_width
    j_skip = tile_width
    # Centre the kept band of tiles across the pixel dimension.
    i_start = int(num_pixels / 2) - int((num_keep_x_tiles * tile_width) / 2)
    j_start = 0

    num_y_tiles = int(num_lines / tile_width) - 1

    data_tiles = []
    lbl_tiles = []
    num_not_missing = 0

    for j in range(num_y_tiles):
        j_a = j_start + j * j_skip
        j_b = j_a + tile_width

        for i in range(num_keep_x_tiles):
            i_a = i_start + i * i_skip
            i_b = i_a + tile_width

            if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]):
                continue
            elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]):
                continue

            nda = data[:, j_a:j_b, i_a:i_b]
            # The label grid is indexed at twice the data-grid coordinates.
            nda_lbl = label[:, j_a*2:j_b*2, i_a*2:i_b*2]

            if is_missing(param_idx_i, nda_lbl):
                continue
            num_not_missing += 1

            nda_lbl = keep_tile(param_idx_i, nda_lbl)
            if nda_lbl is not None:
                data_tiles.append(nda)
                lbl_tiles.append(nda_lbl)

    # Last 10% (rounded down) of the kept tiles go to validation, the rest to training.
    num_tiles = len(lbl_tiles)
    num_valid = int(num_tiles * 0.10)
    num_train = num_tiles - num_valid

    train_tiles.extend(data_tiles[:num_train])
    lbl_train_tiles.extend(lbl_tiles[:num_train])
    valid_tiles.extend(data_tiles[num_train:])
    lbl_valid_tiles.extend(lbl_tiles[num_train:])

    return num_not_missing