"""Extract training/validation tiles from ``clavrx_*.nc`` files and scan granules.

The module reads gridded parameters from HDF5/NetCDF files through helper
functions imported from ``util.util``, cuts them into square tiles, filters
the tiles on the content of the target parameter and saves the surviving
tiles as ``.npy`` stacks.
"""
import numpy as np
import h5py
from util.util import get_grid_values, get_grid_values_all, is_night, is_day, compute_lwc_iwc
import glob
from aeolus.datasource import CLAVRx_VIIRS
from icing.moon_phase import *

# Parameter whose content decides whether a tile is kept (see keep_tile).
target_param = 'cloud_probability'
# target_param = 'cld_opd_dcomp'

# Prefix prepended to every dataset name read from the file.
# group_name = ''
group_name = 'super/'

# l2_params = [group_name+'temp_11_0um_nom', group_name+'refl_0_65um_nom', group_name+target_param]
l2_params = [group_name+'temp_11_0um', group_name+'refl_0_65um', group_name+target_param]

# solzen_name = group_name + 'solar_zenith_angle'
solzen_name = group_name + 'solar_zenith'

label_params = l2_params
data_params = l2_params

# range = [0.0, 1.0]
cld_prob_norm_hist = [0.34458323, 0.03729378, 0.01817725, 0.01246574, 0.00991681, 0.00826515,
                      0.00785976, 0.00595133, 0.00567965, 0.00579926, 0.00642895, 0.00797761,
                      0.01218471, 0.51741677]
# range = [0.0, 160.0]
cld_opd_norm_hist = [7.31926378e-01, 9.52482193e-02, 4.62747706e-02, 3.15450036e-02,
                     1.98358694e-02, 1.33123841e-02, 1.03378429e-02, 7.95560979e-03,
                     5.77925319e-03, 4.82856215e-03, 3.31576300e-03, 2.86789405e-03,
                     2.50456177e-03, 1.79184632e-03, 1.51077739e-03, 1.29144749e-03,
                     9.20514553e-04, 7.47183923e-04, 6.50404531e-04, 1.73557144e-02]


def keep_tile(param_s, tile, dum):
    """Filter a tile on the module-level ``target_param`` channel.

    Args:
        param_s: list of dataset names; the position of
            ``group_name + target_param`` selects the channel to test.
        tile: array indexed as ``tile[channel, ...]``. It is modified in
            place when the tile is kept.
        dum: passed through unchanged to the per-parameter filter.

    Returns:
        ``tile`` (with the target channel replaced by its processed version)
        when the tile passes the filter, otherwise ``None``.
    """
    k = param_s.index(group_name + target_param)
    grd_k = tile[k, ].copy()

    if target_param == 'cloud_probability':
        grd_k = process_cld_prob_(grd_k, dum)
    elif target_param == 'cld_opd_dcomp':
        grd_k = process_cld_opd_(grd_k, dum)

    if grd_k is None:
        return None
    tile[k, ] = grd_k
    return tile


def process_cld_prob(param_s, tile):
    """Filter a tile on its ``cloud_probability`` channel.

    Same contract as ``keep_tile`` but always uses the cloud-probability
    filter, regardless of ``target_param``.

    Returns:
        The (in-place modified) ``tile`` if it is kept, otherwise ``None``.
    """
    k = param_s.index(group_name + 'cloud_probability')
    # The filter takes a second, unused argument; pass None explicitly
    # (the previous one-argument call raised TypeError).
    grd_k = process_cld_prob_(tile[k, ].copy(), None)

    if grd_k is None:
        return None
    tile[k, ] = grd_k
    return tile


def process_cld_prob_(grd_k, dum=None):
    """Accept or reject a cloud-probability grid.

    Args:
        grd_k: array of cloud-probability values, NaN marking missing data.
        dum: unused; kept for signature compatibility with callers.

    Returns:
        ``None`` when the grid is empty, when fewer than 98% of its values
        are non-NaN, or when fewer than 30% of the non-NaN values lie
        strictly between 0.1 and 0.90. Otherwise a new array in which every
        value outside that open interval (NaN included) is replaced by 0.
    """
    if grd_k.size == 0:
        # Avoid a 0/0 division that would let an empty tile slip through.
        return None

    not_nan = np.invert(np.isnan(grd_k))
    num_keep = np.sum(not_nan)
    if num_keep / grd_k.size < 0.98:
        return None
    # hist_10 += np.histogram(grd_k.flatten(), range=[0.0, 1.0], bins=10)[0]

    in_range = np.where(not_nan, np.logical_and(0.1 < grd_k, grd_k < 0.90), False)
    if np.sum(in_range) / num_keep < 0.30:
        return None

    # NOTE(review): this zeroes values >= 0.90 as well as NaN/low values,
    # unlike process_cld_opd_ which only zeroes NaN - confirm this is intended.
    return np.where(np.invert(in_range), 0, grd_k)


def process_cld_opd(param_s, tile):
    """Filter a tile on its ``cld_opd_dcomp`` channel.

    Same contract as ``keep_tile`` but always uses the optical-depth
    filter, regardless of ``target_param``.

    Returns:
        The (in-place modified) ``tile`` if it is kept, otherwise ``None``.
    """
    k = param_s.index(group_name + 'cld_opd_dcomp')
    # The filter takes a second, unused argument; pass None explicitly
    # (the previous one-argument call raised TypeError).
    grd_k = process_cld_opd_(tile[k, ].copy(), None)

    if grd_k is None:
        return None
    tile[k, ] = grd_k
    return tile


def process_cld_opd_(grd_k, dum=None):
    """Accept or reject a cloud-optical-depth grid.

    Args:
        grd_k: array of optical-depth values, NaN marking missing data.
        dum: unused; kept for signature compatibility with callers.

    Returns:
        ``None`` when the grid is empty, when fewer than 98% of its values
        are non-NaN, or when fewer than 50% of the non-NaN values lie
        strictly between 0.1 and 158.0. Otherwise a new array in which NaN
        values are replaced by 0 (out-of-range values are left untouched).
    """
    if grd_k.size == 0:
        # Avoid a 0/0 division that would let an empty tile slip through.
        return None

    not_nan = np.invert(np.isnan(grd_k))
    num_keep = np.sum(not_nan)
    if num_keep / grd_k.size < 0.98:
        return None

    grd_k = np.where(np.invert(not_nan), 0, grd_k)
    in_range = np.where(not_nan, np.logical_and(0.1 < grd_k, grd_k < 158.0), False)
    if np.sum(in_range) / num_keep < 0.50:
        return None
    return grd_k


def _save_tiles(path, tiles):
    """Stack ``tiles`` and save them with ``np.save``; return the tile count."""
    if not tiles:
        return 0
    stacked = np.stack(tiles)
    np.save(path, stacked)
    return stacked.shape[0]


def run_all(directory, out_directory, day_night='ANY', start=10):
    """Tile every ``clavrx_*.nc`` file under ``directory`` and save the tiles.

    Files are found recursively. Tiles are accumulated across files and
    written out as ``data_train_<n>.npy`` / ``data_valid_<n>.npy`` each time
    the number of successfully processed files reaches a multiple of 5;
    whatever is still pending after the last file is written as a final
    batch so that no tiles are lost.

    Args:
        directory: root directory to search. It is concatenated directly
            with ``'**/'``, so it must end with a path separator.
        out_directory: prefix for the output files (concatenated directly
            with the file name, so it must end with a path separator).
        day_night: forwarded to ``run``; ``'DAY'``, ``'NIGHT'`` or ``'ANY'``.
        start: number used for the first output batch; incremented per batch.
    """
    cnt = start
    total_num_train_samples = 0
    total_num_valid_samples = 0
    num_keep_x_tiles = 12

    # pattern = 'clavrx_VNP02MOD*.highres.nc.level2.nc'
    pattern = 'clavrx_*.nc'
    path = directory + '**' + '/' + pattern

    data_files = glob.glob(path, recursive=True)

    data_valid_tiles = []
    data_train_tiles = []
    f_cnt = 0

    num_files = len(data_files)
    print('Start, number of files: ', num_files)

    # hist_10 = np.zeros((10), dtype=np.int64)

    def flush_tiles():
        """Write the pending tiles as one batch and update the running totals."""
        nonlocal cnt, total_num_train_samples, total_num_valid_samples

        num_valid_samples = _save_tiles(out_directory + 'data_valid_' + str(cnt), data_valid_tiles)
        num_train_samples = _save_tiles(out_directory+'data_train_' + str(cnt), data_train_tiles)

        # np.stack copied the data, so the pending lists can be emptied.
        data_valid_tiles.clear()
        data_train_tiles.clear()

        print(' num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100))
        total_num_train_samples += num_train_samples
        total_num_valid_samples += num_valid_samples
        print('total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)
        print('---------------------------------------------------------')

        cnt += 1

    for data_f in data_files:
        # if idx % 4 == 0:  # if we want to skip some files
        try:
            data_h5f = h5py.File(data_f, 'r')
        except Exception:
            print('cant open file: ', data_f)
            continue

        # The context manager closes the file on every path out of the block.
        with data_h5f:
            try:
                run(data_h5f, data_params, data_train_tiles, data_valid_tiles, None,
                    num_keep_x_tiles=num_keep_x_tiles, tile_width=128, kernel_size=11,
                    day_night=day_night)
            except Exception as e:
                print(e)
                continue

        print(data_f)
        f_cnt += 1

        if not data_train_tiles and not data_valid_tiles:
            continue

        if (f_cnt % 5) == 0:
            flush_tiles()

    # Tiles gathered since the last multiple-of-5 batch would otherwise be dropped.
    if data_train_tiles or data_valid_tiles:
        flush_tiles()

    print('** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)


def _collect_tiles(data, solzen, param_s, dum, out_tiles, j_start, num_y_tiles,
                   i_start, num_x_tiles, tile_width, day_night):
    """Cut a block of tiles out of ``data`` and append the kept ones to ``out_tiles``.

    Tiles are laid out on a regular, non-overlapping grid of
    ``num_y_tiles`` x ``num_x_tiles`` squares of side ``tile_width`` whose
    upper-left corner is ``(j_start, i_start)``. ``data`` is indexed as
    ``data[channel, line, pixel]``.
    """
    num_lines, num_pixels = data.shape[1], data.shape[2]

    for j in range(num_y_tiles):
        j_a = j_start + j * tile_width
        j_b = j_a + tile_width

        for i in range(num_x_tiles):
            i_a = i_start + i * tile_width
            i_b = i_a + tile_width

            # A window that leaves the grid would slice to an empty or
            # wrapped-around array and later break np.stack; skip it.
            if j_a < 0 or i_a < 0 or j_b > num_lines or i_b > num_pixels:
                continue

            if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]):
                continue
            elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]):
                continue

            # Copy so a kept tile does not keep the whole granule array alive.
            nda = keep_tile(param_s, data[:, j_a:j_b, i_a:i_b].copy(), dum)
            if nda is not None:
                out_tiles.append(nda)


def run(data_h5f, param_s, train_tiles, valid_tiles, dum, num_keep_x_tiles=8, tile_width=64, kernel_size=9, day_night='ANY'):
    """Cut one open file into training and validation tiles.

    Each tile is ``tile_width + kernel_size - 1`` samples on a side (the
    nominal width plus a border of ``(kernel_size - 1) / 2`` on every edge).
    ``num_keep_x_tiles`` tiles are taken per row, centred across the pixel
    dimension. Along the line dimension the leading rows go to
    ``train_tiles`` and, after a gap of two tile rows, the trailing rows
    (roughly 10%) go to ``valid_tiles``.

    Args:
        data_h5f: open file object, indexable by dataset name.
        param_s: dataset names to read; the first one defines the grid size.
        train_tiles: list that kept training tiles are appended to.
        valid_tiles: list that kept validation tiles are appended to.
        dum: passed through to ``keep_tile``.
        num_keep_x_tiles: number of tiles per row.
        tile_width: nominal tile width, excluding the border. Must be even!
        kernel_size: kernel size that determines the border. Must be odd!
        day_night: ``'DAY'`` keeps only tiles for which ``is_day`` is true,
            ``'NIGHT'`` only tiles for which it is false; any other value
            disables the filter.

    Returns:
        None. If a parameter cannot be read the exception is printed and the
        function returns without appending any tile.
    """
    border = int((kernel_size - 1)/2)

    param_name = param_s[0]

    num_lines = data_h5f[param_name].shape[0]
    num_pixels = data_h5f[param_name].shape[1]  # Must be even

    solzen = None
    if day_night != 'ANY':
        solzen = get_grid_values(data_h5f, solzen_name, 0, 0, None, num_lines, num_pixels)

    grd_s = []
    for param in param_s:
        try:
            grd_s.append(get_grid_values(data_h5f, param, 0, 0, None, num_lines, num_pixels))
        except Exception as e:
            print(e)
            return
    data = np.stack(grd_s)

    tile_width += 2 * border

    i_start = int(num_pixels / 2) - int((num_keep_x_tiles * tile_width) / 2)

    num_keep_y_tiles = int(num_lines / tile_width) - 3
    num_y_valid = int(num_keep_y_tiles * 0.1) + 1
    num_y_train = num_keep_y_tiles - num_y_valid - 1

    _collect_tiles(data, solzen, param_s, dum, train_tiles, 0, num_y_train,
                   i_start, num_keep_x_tiles, tile_width, day_night)

    # Leave two tile rows between the training and the validation region.
    j_start = num_y_train * tile_width + 2*tile_width
    _collect_tiles(data, solzen, param_s, dum, valid_tiles, j_start, num_y_valid,
                   i_start, num_keep_x_tiles, tile_width, day_night)


def scan(directory):
    """Print every file of a ``CLAVRx_VIIRS`` source that is night-time with ``moon_phase`` true.

    Files whose ``solar_zenith_angle`` cannot be read are skipped silently.

    Args:
        directory: passed to the ``CLAVRx_VIIRS`` constructor.
    """
    data_src = CLAVRx_VIIRS(directory)

    for idx, fpath in enumerate(data_src.flist):
        # The context manager closes the file on every path out of the block.
        with h5py.File(fpath, 'r') as h5f:
            ts = data_src.ftimes[idx][0]
            try:
                solzen = get_grid_values_all(h5f, 'solar_zenith_angle')
            except Exception:
                # Best effort: unreadable files are ignored.
                continue

            # if is_day(solzen) and moon_phase(ts):
            if is_night(solzen) and moon_phase(ts):
                print(fpath)


def scan_for_location(txt_file, lon_range=(111.0, 130.0), lat_range=(14.0, 32.0)):
    """Print the listed files whose reference pixel lies inside a lon/lat box.

    Args:
        txt_file: text file with one HDF5/NetCDF path per line; blank lines
            are ignored.
        lon_range: ``(min, max)`` longitude, both bounds exclusive.
        lat_range: ``(min, max)`` latitude, both bounds exclusive.

    Files that cannot be read, or that are too small to contain the
    reference pixel, are skipped silently.
    """
    with open(txt_file) as file:
        for line in file:
            fpath = line.strip()
            if not fpath:
                continue

            # The context manager closes the file on success as well as on
            # failure (it used to be closed only when an exception occurred).
            with h5py.File(fpath, 'r') as h5f:
                try:
                    lon_s = get_grid_values_all(h5f, 'longitude', stride=4)
                    lat_s = get_grid_values_all(h5f, 'latitude', stride=4)
                    # NOTE(review): fixed reference pixel, presumably near the
                    # centre of the stride-4 grid - confirm.
                    c_lon, c_lat = lon_s[406, 400], lat_s[406, 400]
                    if (lon_range[0] < c_lon < lon_range[1]) and (lat_range[0] < c_lat < lat_range[1]):
                        print(fpath)
                except Exception:
                    # Best effort: unreadable files are ignored.
                    continue


def test_nlcomp(file):
    """Compute ``compute_lwc_iwc`` for the valid ``nlcomp`` pixels of one file.

    A pixel is used when cloud phase, ``cld_reff_nlcomp`` and
    ``cld_opd_nlcomp`` are all non-NaN and ``cld_geo_thick`` is non-NaN and
    greater than 5.0.

    Args:
        file: path of the HDF5/NetCDF file to read.

    Returns:
        The ``(lwc_c, iwc_c)`` pair returned by ``compute_lwc_iwc``.
    """
    # The context manager guarantees the file is closed (it used to leak).
    with h5py.File(file, 'r') as h5f:
        cld_phs = get_grid_values_all(h5f, 'cloud_phase', scale_factor_name=None, range_name=None)
        keep_0 = np.invert(np.isnan(cld_phs))

        reff = get_grid_values_all(h5f, 'cld_reff_nlcomp')
        keep_1 = np.invert(np.isnan(reff))

        opd = get_grid_values_all(h5f, 'cld_opd_nlcomp')
        keep_2 = np.invert(np.isnan(opd))

        cld_dz = get_grid_values_all(h5f, 'cld_geo_thick')
        keep_3 = np.logical_and(np.invert(np.isnan(cld_dz)), cld_dz > 5.0)

        keep = keep_0 & keep_1 & keep_2 & keep_3

        cld_phs = cld_phs[keep]
        reff = reff[keep]
        opd = opd[keep]
        cld_dz = cld_dz[keep]

        lwc_c, iwc_c = compute_lwc_iwc(cld_phs, reff, opd, cld_dz)

    return lwc_c, iwc_c


# def run_mean_std(directory):
#
#     data_dct = {name: [] for name in mod_res_params}
#     mean_dct = {name: 0 for name in mod_res_params}
#     std_dct = {name: 0 for name in mod_res_params}
#
#     for p in os.scandir(directory):
#         if not p.is_dir():
#             continue
#         mod_files = glob.glob(directory+p.name+'/'+'VNP02MOD*.uwssec.nc')
#
#         for idx, mfile in enumerate(mod_files):
#             if idx % 8 == 0:
#                 h5f = h5py.File(mfile, 'r')
#                 for param in mod_res_params:
#                     name = 'observation_data/'+param
#                     gvals = get_grid_values_all(h5f, name, range_name=None, stride=10)
#                     data_dct[param].append(gvals.flatten())
#                 print(mfile)
#                 h5f.close()
#
#     for param in mod_res_params:
#         data = data_dct[param]
#         data = np.concatenate(data)
#
#         mean_dct[param] = np.nanmean(data)
#         std_dct[param] = np.nanstd(data)