Skip to content
Snippets Groups Projects
Select Git revision
  • 1037a6f53a1b7be00391900e53b780a82d3d5f51
  • master default protected
  • use_flight_altitude
  • distribute
4 results

__init__.py

Blame
  • abi_surfrad.py 11.68 KiB
    import numpy as np
    import h5py
    from util.util import get_grid_values, is_day
    import glob
    
    # Name of the parameter used to accept/reject tiles. Alternatives that have
    # been used here: 'cloud_probability', 'cld_opd_dcomp', 'cld_opd_dcomp_2',
    # 'cld_opd_dcomp_3'.
    target_param = 'cld_opd_dcomp_1'

    # Group prefixes prepended to every dataset name read from the input file.
    group_name_i = 'super/'
    group_name_m = 'orig/'

    solzen_name = group_name_m + 'solar_zenith'
    snow_class_name = group_name_m + 'snow_class'

    # Datasets read from the 'super/' group; the target parameter goes last.
    params_i = [
        group_name_i + name
        for name in ('temp_ch38', 'refl_ch01', 'temp_stddev3x3_ch31', 'refl_stddev3x3_ch01', target_param)
    ]

    # Datasets read from the 'orig/' group; the target parameter goes last.
    params_m = [
        group_name_m + name
        for name in ('temp_ch38', 'refl_ch01', 'refl_submin_ch01', 'refl_submax_ch01', 'refl_substddev_ch01',
                     'temp_stddev3x3_ch31', 'refl_stddev3x3_ch01', target_param)
    ]

    # Position of the target parameter inside each list above.
    param_idx_i = params_i.index(group_name_i + target_param)
    param_idx_m = params_m.index(group_name_m + target_param)
    
    
    def snow_covered(tile):
        """Return whether any element of ``tile`` is greater than 1.

        NOTE(review): presumably snow_class codes above 1 denote snow/ice -- confirm.
        """
        above_one = tile > 1
        return np.any(above_one)
    
    
    def is_missing(p_idx, tile):
        """Report whether parameter ``p_idx`` of ``tile`` has too few valid values.

        :param p_idx: index along the first axis of ``tile`` selecting the parameter.
        :param tile: array whose first axis indexes parameters.
        :return: True when fewer than 98% of the values in ``tile[p_idx]`` are
            non-NaN, or when that slice is empty; False otherwise. Always a real
            bool (the implicit ``None`` fall-through is gone).
        """
        valid = np.invert(np.isnan(tile[p_idx, ]))
        if valid.size == 0:
            # Nothing to measure: avoid 0/0 and treat the tile as unusable.
            return True
        return bool(np.sum(valid) / valid.size < 0.98)
    
    
    def keep_tile(p_idx, tile):
        """Screen ``tile`` on its target-parameter layer and clean that layer in place.

        A copy of ``tile[p_idx]`` is passed to the screening routine selected by
        the module-level ``target_param``. If the routine rejects the layer
        (returns None) this returns None; otherwise the processed layer is
        written back into ``tile`` and ``tile`` itself is returned.
        """
        layer = tile[p_idx, ].copy()

        if target_param == 'cloud_probability':
            layer = process_cld_prob(layer)
        elif 'cld_opd_dcomp' in target_param:
            layer = process_cld_opd(layer)

        if layer is None:
            return None

        tile[p_idx, ] = layer
        return tile
    
    
    def process_cld_prob(grd_k):
        """Accept a cloud-probability grid only if it is partly cloudy.

        Values above 0.70 count as cloudy. The cloudy fraction is taken over the
        non-NaN values only and must lie in [0.10, 0.90]; otherwise None is
        returned. On acceptance, a copy of the grid with NaN replaced by 0 is
        returned.
        """
        valid = np.invert(np.isnan(grd_k))
        cloudy = np.logical_and(valid, grd_k > 0.70)
        frac_cld = np.sum(cloudy) / np.sum(valid)

        # Written as a positive range test so a NaN fraction (no valid values) is rejected.
        if 0.10 <= frac_cld <= 0.90:
            return np.where(valid, grd_k, 0)  # Convert NaN to 0
        return None
    
    
    def process_cld_opd(grd_k):
        """Accept an optical-depth grid only if it is partly cloudy.

        Values strictly between 2.0 and 158.0 count as cloudy. The cloudy
        fraction is taken over the non-NaN values only and must lie strictly
        between 0.10 and 0.90; otherwise None is returned. On acceptance, a copy
        of the grid with NaN replaced by 0 is returned.
        """
        valid = np.invert(np.isnan(grd_k))
        in_cloud_range = (2.0 < grd_k) & (grd_k < 158.0)
        cloudy = np.logical_and(valid, in_cloud_range)
        frac_cld = np.sum(cloudy) / np.sum(valid)

        # Written as a positive range test so a NaN fraction (no valid values) is rejected.
        if 0.10 < frac_cld < 0.90:
            return np.where(valid, grd_k, 0)  # Convert NaN to 0
        return None
    
    
    def _flush_tiles(out_directory, label, cnt, data_tiles_m, data_tiles_i, hist):
        """Stack the buffered tiles, save them as a file pair and update ``hist`` in place.

        Returns the number of samples written (0 when the buffers are empty).
        """
        if len(data_tiles_m) == 0:
            return 0

        stacked_i = np.stack(data_tiles_i)
        stacked_m = np.stack(data_tiles_m)
        np.save(out_directory + label + '_mres_' + str(cnt), stacked_m)
        np.save(out_directory + label + '_ires_' + str(cnt), stacked_i)

        # np.stack puts samples on axis 0, so the parameter axis is axis 1.
        hist += np.histogram(stacked_m[:, param_idx_m, ], bins=20, range=[0.0, 160.0])[0]
        return stacked_m.shape[0]


    def _process_split(files, label, out_directory, cnt, num_skip, day_night, is_snow_covered):
        """Tile every ``num_skip``-th file of one split and write the tiles out in batches.

        Returns (total samples written, total tiles not missing, histogram of the
        target parameter, next unused batch counter value for in-loop batches).
        """
        data_tiles_i = []
        data_tiles_m = []
        f_cnt = 0
        total_num_samples = 0
        total_num_not_missing = 0
        hist = np.zeros([20], dtype=np.int64)

        num_files = len(files)
        print('Start, number of ' + label + ' files: ', num_files)

        for idx, data_f in enumerate(files):
            if idx % num_skip != 0:  # if we want to skip some files
                continue

            try:
                h5f = h5py.File(data_f, 'r')
            except Exception:
                print('cant open file: ', data_f)
                continue

            try:
                counts = run(h5f, params_m, data_tiles_m, params_i, data_tiles_i,
                             tile_width=64, kernel_size=7, factor=2,
                             day_night=day_night, is_snow_covered=is_snow_covered)
            except Exception as e:
                print(e)
                continue
            finally:
                h5f.close()

            if counts is None:
                # run() could not read one of the parameters and already printed why.
                continue
            num_not_missing = counts[0]

            print(data_f)
            f_cnt += 1
            total_num_not_missing += num_not_missing

            if len(data_tiles_m) == 0:
                continue

            if (f_cnt % 100) == 0:
                num_samples = _flush_tiles(out_directory, label, cnt, data_tiles_m, data_tiles_i, hist)
                data_tiles_i = []
                data_tiles_m = []

                print('  num_' + label + '_samples, progress % : ', num_samples, int((f_cnt/(num_files/num_skip))*100))
                total_num_samples += num_samples
                print('total_num_' + label + '_samples, total_num_not_missing: ', total_num_samples, total_num_not_missing)
                print('--------------------------------------------------')

                cnt += 1

        # Write out leftover, if any. The counter is deliberately not advanced here.
        total_num_samples += _flush_tiles(out_directory, label, cnt, data_tiles_m, data_tiles_i, hist)

        return total_num_samples, total_num_not_missing, hist, cnt


    def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10, is_snow_covered=None):
        """Build validation and training tile files from the input files under ``directory``.

        Files are located with glob under ``directory + '*_v3/2020/'``. Paths
        matching ``*/0[2-6]/*/*.nc`` form the validation split; paths matching
        ``*/01/*/*.nc`` are held out as a test split (excluded here, not
        processed); every other file matching ``pattern`` forms the training
        split. Only every third file of a split is processed.

        Accepted tiles are saved with ``np.save`` as
        ``out_directory + '<split>_mres_<n>'`` and ``'<split>_ires_<n>'`` after
        every 100 processed files, plus one final leftover batch per split.
        Progress, sample totals and a 20-bin histogram (range 0-160) of the
        target parameter are printed.

        :param directory: input root, concatenated directly with the glob pattern.
        :param out_directory: output prefix, concatenated directly with the file names.
        :param day_night: forwarded to ``run``.
        :param pattern: file-name glob for candidate input files.
        :param start: first value of the batch counter used in output file names.
        :param is_snow_covered: forwarded to ``run``.
        """
        num_skip = 3

        path = directory + '*_v3/2020/' + '**' + '/' + pattern

        all_files = glob.glob(path, recursive=True)
        test_files = glob.glob(directory + '*_v3/2020/*/01/*/*.nc', recursive=True)
        valid_files = glob.glob(directory + '*_v3/2020/*/0[2-6]/*/*.nc', recursive=True)

        # Set membership keeps the exclusion linear in the number of files.
        held_out = set(valid_files)
        held_out.update(test_files)
        train_files = [f for f in all_files if f not in held_out]

        total_num_valid_samples, total_num_not_missing, param_valid_hist, cnt = \
            _process_split(valid_files, 'valid', out_directory, start, num_skip, day_night, is_snow_covered)
        print('total_num_valid_samples, total_num_not_missing: ', total_num_valid_samples, total_num_not_missing)
        print(param_valid_hist)
        print('--------------------------------------------------')
        print('----------------------------------------------------------------')

        total_num_train_samples, total_num_not_missing, param_train_hist, cnt = \
            _process_split(train_files, 'train', out_directory, cnt, num_skip, day_night, is_snow_covered)
        print('total_num_train_samples,  total_num_not_missing: ', total_num_train_samples, total_num_not_missing)
        print(param_train_hist)
        print('--------------------------------------------------')

        print('*** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)
    
    
    #  tile_width: Must be even!
    #  kernel_size: Must be odd!
    def _read_grids(h5f, params, num_lines, num_pixels):
        """Read each named parameter as a full grid and stack them; None if any read fails."""
        grids = []
        for param in params:
            try:
                grids.append(get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels))
            except Exception as e:
                print(e)
                return None
        return np.stack(grids)


    def run(h5f, params_m, data_tiles_m, params_i, data_tiles_i, tile_width=64, kernel_size=3, factor=2,
            day_night='ANY', is_snow_covered=None):
        """Cut one open file into paired tiles and append the accepted ones to the output lists.

        The grid size is taken from the first entry of ``params_m``; the
        ``params_i`` grids are read at ``factor`` times that size. Tiles are
        ``tile_width + 2 * border`` wide, laid out without overlap starting at
        offset ``border - 1``; only tiles that fit entirely inside the grid are
        used. A tile is appended when it passes the optional snow and day/night
        filters, is not missing (``is_missing``) and is accepted by ``keep_tile``.

        :param h5f: open input file handle.
        :param params_m: dataset names stacked into the first tile array.
        :param data_tiles_m: list that accepted ``params_m`` tiles are appended to.
        :param params_i: dataset names stacked into the ``factor``-times larger tile array.
        :param data_tiles_i: list that accepted ``params_i`` tiles are appended to.
        :param tile_width: tile width before the border is added; must be even.
        :param kernel_size: must be odd; sets the border width.
        :param factor: size ratio between the ``params_i`` and ``params_m`` grids.
        :param day_night: 'DAY' keeps tiles for which ``is_day`` is true, 'NIGHT'
            keeps the others, any other value disables the filter.
        :param is_snow_covered: None disables the filter; True keeps only tiles
            for which ``snow_covered`` is true; False keeps only the others.
        :return: ``(num_not_missing, num_snow_covered)``, or None when one of the
            parameters could not be read (the error is printed).
        """
        border = int((kernel_size - 1)/2) + 1  # Need to add for interpolation with no edge effects

        param_name = params_m[0]
        num_lines = h5f[param_name].shape[0]
        num_pixels = h5f[param_name].shape[1]  # Must be even

        if day_night != 'ANY':
            solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels)

        if is_snow_covered is not None:
            snow = get_grid_values(h5f, snow_class_name, 0, 0, None, num_lines, num_pixels)

        data_m = _read_grids(h5f, params_m, num_lines, num_pixels)
        if data_m is None:
            return None

        data_i = _read_grids(h5f, params_i, num_lines*factor, num_pixels*factor)
        if data_i is None:
            return None

        tile_width += 2 * border

        i_start = border - 1  # zero-based
        j_start = border - 1  # zero-based

        # Count only tiles that fit after the start offset; otherwise the last
        # row/column would be silently truncated by slicing and break np.stack later.
        num_y_tiles = max(0, (num_lines - j_start) // tile_width)
        num_x_tiles = max(0, (num_pixels - i_start) // tile_width)

        num_not_missing = 0
        num_snow_covered = 0

        for j in range(num_y_tiles):
            j_a = j_start + j * tile_width
            j_b = j_a + tile_width

            for i in range(num_x_tiles):
                i_a = i_start + i * tile_width
                i_b = i_a + tile_width

                if is_snow_covered is not None:
                    has_snow = snow_covered(snow[j_a:j_b, i_a:i_b])
                    if has_snow:
                        num_snow_covered += 1
                    # Skip the tile when its snow state is not the requested one.
                    if bool(has_snow) != bool(is_snow_covered):
                        continue

                if day_night == 'DAY' and not is_day(solzen[j_a:j_b, i_a:i_b]):
                    continue
                elif day_night == 'NIGHT' and is_day(solzen[j_a:j_b, i_a:i_b]):
                    continue

                nda_m = data_m[:, j_a:j_b, i_a:i_b]
                nda_i = data_i[:, j_a*factor:j_b*factor, i_a*factor:i_b*factor]
                if is_missing(param_idx_i, nda_i):
                    continue
                num_not_missing += 1

                nda_i = keep_tile(param_idx_i, nda_i)
                if nda_i is not None:
                    # Copy so the buffered tiles do not keep the full grids alive.
                    data_tiles_m.append(nda_m.copy())
                    data_tiles_i.append(nda_i.copy())

        return num_not_missing, num_snow_covered