import numpy as np import h5py from util.util import get_grid_values, get_grid_values_all, is_night, is_day, compute_lwc_iwc, get_fill_attrs import glob import os from aeolus.datasource import CLAVRx_VIIRS from icing.moon_phase import * from pathlib import Path # target_param = 'cloud_probability' target_param = 'cld_opd_dcomp' group_name_i = 'super/' group_name_m = 'orig/' solzen_name = group_name_m + 'solar_zenith' label_params = [group_name_i+target_param] data_params = [group_name_m+'temp_ch31', group_name_m+'refl_ch01', group_name_m+target_param] def keep_tile(param, param_s, tile): k = param_s.index(param) grd_k = tile[k, ].copy() if target_param == 'cloud_probability': grd_k = process_cld_prob_(grd_k) elif target_param == 'cld_opd_dcomp': grd_k = process_cld_opd_(grd_k) if grd_k is not None: tile[k, ] = grd_k return tile else: return None def process_cld_prob_(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) if num_keep / grd_k.size < 0.98: return None keep = np.where(keep, np.logical_and(0.05 < grd_k, grd_k < 0.95), False) if np.sum(keep)/num_keep < 0.50: return None grd_k = np.where(np.invert(keep), 0, grd_k) return grd_k def process_cld_opd_(grd_k): keep = np.invert(np.isnan(grd_k)) num_keep = np.sum(keep) if num_keep / grd_k.size < 0.98: return None grd_k = np.where(np.invert(keep), 0, grd_k) keep = np.where(keep, np.logical_and(0.1 < grd_k, grd_k < 158.0), False) if np.sum(keep)/num_keep < 0.50: return None return grd_k def run_all(directory, out_directory, pattern='clavrx_*.nc', start=10): cnt = start total_num_train_samples = 0 total_num_valid_samples = 0 path = directory + '**' + '/' + pattern files = glob.glob(path, recursive=True) label_tiles = [] data_tiles = [] f_cnt = 0 num_files = len(files) print('Start, number of files: ', num_files) for idx, f in enumerate(files): # if idx % 4 == 0: # if we want to skip some files if True: try: h5f = h5py.File(f, 'r') except: print('cant open file: ', f) continue try: run(h5f, data_params, data_tiles, label_params, label_tiles, tile_width=64, kernel_size=5) except Exception as e: print(e) h5f.close() continue print(f) f_cnt += 1 h5f.close() if len(data_tiles) == 0: continue if (f_cnt % 100) == 0: num_valid_samples = 0 label = np.stack(label_tiles) data = np.stack(data_tiles) np.save(out_directory + 'label_' + str(cnt), label) np.save(out_directory + 'data_' + str(cnt), data) num_samples = data.shape[0] label_tiles = [] data_tiles = [] # print(' num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100)) # total_num_train_samples += num_samples # total_num_valid_samples += num_valid_samples # print('total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples) cnt += 1 print('** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples) # tile_width: Must be even! # kernel_size: Must be odd! def run(h5f, param_s, tiles, lbl_param_s, lbl_tiles, tile_width=64, kernel_size=3): border = int((kernel_size - 1)/2) + 1 # Need to add for interpolation with no edge effects param_name = param_s[0] num_lines = h5f[param_name].shape[0] num_pixels = h5f[param_name].shape[1] # Must be even grd_s = [] for param in param_s: try: grd = get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels) grd_s.append(grd) except Exception as e: print(e) return data = np.stack(grd_s) grd_s = [] for param in lbl_param_s: try: grd = get_grid_values(h5f, param, 0, 0, None, num_lines*2, num_pixels*2) grd_s.append(grd) except Exception as e: print(e) return label = np.stack(grd_s) nda = data[:, :, :] nda = keep_tile(group_name_m + target_param, param_s, nda) if nda is None: # if none, no need to check the next one return nda_lbl = label[:, :, :] nda_lbl = keep_tile(group_name_i + target_param, lbl_param_s, nda_lbl) if nda_lbl is not None: tiles.append(nda) lbl_tiles.append(nda_lbl)