# Skip to content
# Snippets Groups Projects
# viirs_surfrad.py 10.65 KiB
import numpy as np
import h5py
from util.util import get_grid_values, get_grid_values_all, is_night, is_day, compute_lwc_iwc, get_fill_attrs
import glob
import os
from aeolus.datasource import CLAVRx_VIIRS
from icing.moon_phase import *
from pathlib import Path


# target_param = 'cloud_probability'
target_param = 'cld_opd_dcomp'

group_name_i = 'super/'
group_name_m = 'orig/'

solzen_name = group_name_m + 'solar_zenith'

label_params = [group_name_i+target_param]
data_params = [group_name_m+'temp_11_0um', group_name_m+'refl_0_65um', group_name_m+target_param]


def keep_tile(param, param_s, tile):
    k = param_s.index(param)
    grd_k = tile[k, ].copy()

    if target_param == 'cloud_probability':
        grd_k = process_cld_prob_(grd_k)
    elif target_param == 'cld_opd_dcomp':
        grd_k = process_cld_opd_(grd_k)

    if grd_k is not None:
        tile[k, ] = grd_k
        return tile
    else:
        return None


def process_cld_prob_(grd_k):
    """Screen a cloud-probability tile; return the cleaned tile or None to reject it.

    grd_k: numpy array of values, NaN marking missing pixels.

    The tile is rejected (None) when
      * it is empty,
      * fewer than 98% of its pixels are non-NaN, or
      * fewer than 50% of the non-NaN pixels lie strictly between 0.05 and 0.95.
    Otherwise a new array is returned in which every pixel that is NaN or outside
    (0.05, 0.95) has been replaced by 0.
    """
    if grd_k.size == 0:
        # Without this guard both ratios below are 0/0 = NaN; NaN fails every "<"
        # comparison, so an empty tile would slip through as accepted.
        return None

    valid = ~np.isnan(grd_k)
    num_valid = np.sum(valid)
    if num_valid / grd_k.size < 0.98:
        return None

    # Comparisons against NaN are False, so "valid &" only makes the intent explicit.
    in_range = valid & (0.05 < grd_k) & (grd_k < 0.95)
    if np.sum(in_range) / num_valid < 0.50:
        return None

    # NOTE(review): this zeroes confident pixels (<= 0.05 or >= 0.95) as well as NaNs,
    # whereas process_cld_opd_ only zero-fills NaNs -- confirm that is intended.
    return np.where(in_range, grd_k, 0)


def process_cld_opd_(grd_k):
    """Screen a cloud-optical-depth tile; return the cleaned tile or None to reject it.

    grd_k: numpy array of values, NaN marking missing pixels.

    The tile is rejected (None) when
      * it is empty,
      * fewer than 98% of its pixels are non-NaN, or
      * fewer than 50% of the non-NaN pixels lie strictly between 0.1 and 158.0.
    Otherwise a new array is returned in which NaN pixels have been replaced by 0;
    all other values, including those outside (0.1, 158.0), are left as they are.
    """
    if grd_k.size == 0:
        # Without this guard both ratios below are 0/0 = NaN; NaN fails every "<"
        # comparison, so an empty tile would slip through as accepted.
        return None

    valid = ~np.isnan(grd_k)
    num_valid = np.sum(valid)
    if num_valid / grd_k.size < 0.98:
        return None

    # Zero-fill the missing pixels first; "valid &" keeps those zeros out of the range count.
    grd_k = np.where(valid, grd_k, 0)
    in_range = valid & (0.1 < grd_k) & (grd_k < 158.0)
    if np.sum(in_range) / num_valid < 0.50:
        return None
    return grd_k


def _save_tile_batch(out_directory, cnt, data_train_tiles, label_train_tiles,
                     data_valid_tiles, label_valid_tiles):
    """Stack the accumulated tiles and write them as .npy files suffixed with ``cnt``.

    Validation files are only written when validation tiles exist; the training
    lists must be non-empty.  Returns (num_train_samples, num_valid_samples).
    """
    num_valid_samples = 0
    if len(data_valid_tiles) > 0:
        label_valid = np.stack(label_valid_tiles)
        data_valid = np.stack(data_valid_tiles)
        np.save(out_directory + 'data_valid_' + str(cnt), data_valid)
        np.save(out_directory + 'label_valid_' + str(cnt), label_valid)
        num_valid_samples = data_valid.shape[0]

    label_train = np.stack(label_train_tiles)
    data_train = np.stack(data_train_tiles)
    np.save(out_directory + 'label_train_' + str(cnt), label_train)
    np.save(out_directory + 'data_train_' + str(cnt), data_train)

    return data_train.shape[0], num_valid_samples


def run_all(directory, out_directory, day_night='ANY', pattern='clavrx_*.nc', start=10):
    """Extract training/validation tiles from every matching file and save them in batches.

    directory:     root that is searched recursively for ``pattern``.  It is joined by plain
                   string concatenation, so it must end with a path separator.
    out_directory: prefix for the output .npy files; also concatenated, so it must end with
                   a path separator to act as a directory.
    day_night:     passed through to run().
    pattern:       glob pattern for the input file names.
    start:         number used as the suffix of the first batch of output files.

    Files that cannot be opened, or for which run() fails, are reported and skipped.
    Accumulated tiles are written after every fifth successfully processed file (provided
    training tiles exist), and whatever is left over is written once all files are done.
    """
    cnt = start
    total_num_train_samples = 0
    total_num_valid_samples = 0
    num_keep_x_tiles = 8

    path = directory + '**' + '/' + pattern

    data_files = glob.glob(path, recursive=True)

    label_valid_tiles = []
    label_train_tiles = []
    data_valid_tiles = []
    data_train_tiles = []
    f_cnt = 0

    num_files = len(data_files)
    print('Start, number of files: ', num_files)

    for data_f in data_files:
        try:
            h5f = h5py.File(data_f, 'r')
        except Exception:
            print('cant open file: ', data_f)
            continue

        try:
            # run() returns None when a parameter cannot be read; the failed unpacking
            # lands in the handler below, so such files are skipped as well.
            total, kept = run(h5f, data_params, data_train_tiles, data_valid_tiles,
                              label_params, label_train_tiles, label_valid_tiles,
                              num_keep_x_tiles=num_keep_x_tiles, tile_width=64, kernel_size=5, day_night=day_night)
        except Exception as e:
            print(e)
            continue
        finally:
            h5f.close()

        print(data_f, int(100 * (kept/total)) if total else 0)
        f_cnt += 1

        if len(data_train_tiles) == 0:
            continue

        if (f_cnt % 5) == 0:
            num_train_samples, num_valid_samples = _save_tile_batch(
                out_directory, cnt, data_train_tiles, label_train_tiles,
                data_valid_tiles, label_valid_tiles)

            label_valid_tiles = []
            label_train_tiles = []
            data_valid_tiles = []
            data_train_tiles = []

            print('  num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100))
            total_num_train_samples += num_train_samples
            total_num_valid_samples += num_valid_samples
            print('total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)

            cnt += 1

    # Write the tiles gathered since the last full batch; previously they were discarded.
    if len(data_train_tiles) > 0:
        num_train_samples, num_valid_samples = _save_tile_batch(
            out_directory, cnt, data_train_tiles, label_train_tiles,
            data_valid_tiles, label_valid_tiles)

        print('  num_train_samples, num_valid_samples, progress % : ', num_train_samples, num_valid_samples, int((f_cnt/num_files)*100))
        total_num_train_samples += num_train_samples
        total_num_valid_samples += num_valid_samples

    print('** total_num_train_samples, total_num_valid_samples: ', total_num_train_samples, total_num_valid_samples)


def _collect_tiles(data, label, solzen, param_s, lbl_param_s, day_night,
                   j_start, num_y_tiles, i_start, num_x_tiles, tile_width,
                   num_lines, num_pixels, out_tiles, out_lbl_tiles):
    """Visit a num_y_tiles x num_x_tiles block of tiles and append those that pass screening.

    Tiles start at line ``j_start`` / pixel ``i_start`` and are ``tile_width`` apart.
    Accepted data tiles go to ``out_tiles`` and their labels to ``out_lbl_tiles``.
    Returns (number of tiles visited, number of tiles appended).
    """
    check_solzen = day_night in ('DAY', 'NIGHT')
    cnt_total = 0
    cnt_kept = 0

    for j in range(num_y_tiles):
        j_a = j_start + j * tile_width
        j_b = j_a + tile_width

        for i in range(num_x_tiles):
            i_a = i_start + i * tile_width
            i_b = i_a + tile_width

            cnt_total += 1

            # A tile that is not fully inside the grid slices to an empty or undersized
            # array, which cannot be stacked with full-size tiles later on.
            if j_a < 0 or i_a < 0 or j_b > num_lines or i_b > num_pixels:
                continue

            if check_solzen:
                tile_is_day = is_day(solzen[j_a:j_b, i_a:i_b])
                if day_night == 'DAY' and not tile_is_day:
                    continue
                if day_night == 'NIGHT' and tile_is_day:
                    continue

            nda = data[:, j_a:j_b, i_a:i_b]
            nda = keep_tile(group_name_m+target_param, param_s, nda)
            if nda is None:  # data tile rejected, no need to look at the label
                continue

            # The label grid has twice the resolution, so every index is doubled.
            nda_lbl = label[:, j_a*2:j_b*2, i_a*2:i_b*2]
            nda_lbl = keep_tile(group_name_i+target_param, lbl_param_s, nda_lbl)

            if nda_lbl is not None:
                out_tiles.append(nda)
                out_lbl_tiles.append(nda_lbl)
                cnt_kept += 1

    return cnt_total, cnt_kept


#  tile_width: Must be even!
#  kernel_size: Must be odd!
def run(h5f, param_s, train_tiles, valid_tiles, lbl_param_s, lbl_train_tiles, lbl_valid_tiles,
        num_keep_x_tiles=8, tile_width=64, kernel_size=3, day_night='DAY'):
    """Cut one open file into tiles and append the accepted ones to the caller's lists.

    h5f:              open file object, indexable by dataset name.
    param_s:          names of the data fields; the first one defines the grid size.
    train_tiles, valid_tiles:         lists that receive accepted data tiles.
    lbl_param_s:      names of the label fields, read at twice the lines and pixels.
    lbl_train_tiles, lbl_valid_tiles: lists that receive the matching label tiles.
    num_keep_x_tiles: tiles per row, centred on the middle pixel of the grid.
    tile_width:       tile size before the border is added (must be even).
    kernel_size:      used to size the border added on each side (must be odd).
    day_night:        'DAY' keeps tiles for which is_day() is true, 'NIGHT' those for which
                      it is false; any other value disables the solar-zenith screening.

    Returns (tiles visited, tiles kept), or None when one of the fields cannot be read
    (the error is printed).
    """
    border = int((kernel_size - 1)/2) + 1  # Need to add for interpolation with no edge effects

    param_name = param_s[0]

    num_lines = h5f[param_name].shape[0]
    num_pixels = h5f[param_name].shape[1]  # Must be even

    # Solar zenith is only consulted by the DAY/NIGHT screening, so skip the read otherwise.
    solzen = None
    if day_night in ('DAY', 'NIGHT'):
        solzen = get_grid_values(h5f, solzen_name, 0, 0, None, num_lines, num_pixels)

    try:
        grd_s = [get_grid_values(h5f, param, 0, 0, None, num_lines, num_pixels) for param in param_s]
    except Exception as e:
        print(e)
        return None
    data = np.stack(grd_s)

    try:
        grd_s = [get_grid_values(h5f, param, 0, 0, None, num_lines*2, num_pixels*2) for param in lbl_param_s]
    except Exception as e:
        print(e)
        return None
    label = np.stack(grd_s)

    tile_width += 2 * border

    i_start = int(num_pixels / 2) - int((num_keep_x_tiles * tile_width) / 2)

    num_keep_y_tiles = int(num_lines / tile_width) - 3

    num_y_valid = int(num_keep_y_tiles * 0.1) + 1
    num_y_train = num_keep_y_tiles - num_y_valid - 1

    # Training rows start at the first line of the grid.
    train_total, train_kept = _collect_tiles(
        data, label, solzen, param_s, lbl_param_s, day_night,
        0, num_y_train, i_start, num_keep_x_tiles, tile_width,
        num_lines, num_pixels, train_tiles, lbl_train_tiles)

    # Validation rows follow after a gap of two tile rows.
    valid_j_start = num_y_train * tile_width + 2*tile_width
    valid_total, valid_kept = _collect_tiles(
        data, label, solzen, param_s, lbl_param_s, day_night,
        valid_j_start, num_y_valid, i_start, num_keep_x_tiles, tile_width,
        num_lines, num_pixels, valid_tiles, lbl_valid_tiles)

    return train_total + valid_total, train_kept + valid_kept


def scan(directory):
    """Print the path of each file under ``directory`` that is night-time and passes moon_phase().

    The file list and per-file times come from CLAVRx_VIIRS(directory).  A file is printed
    when is_night() is true for its 'solar_zenith_angle' grid and moon_phase() is truthy for
    the first element of its time entry.  Files whose solar zenith grid cannot be read are
    skipped without a message, so that the output contains nothing but file paths.
    """
    data_src = CLAVRx_VIIRS(directory)
    files = data_src.flist

    for idx, file in enumerate(files):
        # The with-block closes the file on every path, including unexpected errors.
        with h5py.File(file, 'r') as h5f:
            ts = data_src.ftimes[idx][0]
            try:
                solzen = get_grid_values_all(h5f, 'solar_zenith_angle')
            except Exception:
                continue

            # if is_day(solzen) and moon_phase(ts):
            if is_night(solzen) and moon_phase(ts):
                print(file)


def scan_for_location(txt_file, lon_range=[111.0, 130.0], lat_range=[14.0, 32.0]):
    """Print each listed file whose reference pixel falls inside a longitude/latitude box.

    txt_file:  text file with one data-file path per line.
    lon_range: [min, max] longitude, both bounds exclusive (only read, never modified).
    lat_range: [min, max] latitude, both bounds exclusive (only read, never modified).

    Files whose longitude/latitude cannot be read or indexed are skipped without a message.
    """
    with open(txt_file) as file:
        for fpath in file:
            fpath = fpath.strip()
            # The with-block closes every file; before, files were only closed on failure.
            with h5py.File(fpath, 'r') as h5f:
                try:
                    lon_s = get_grid_values_all(h5f, 'longitude', stride=4)
                    lat_s = get_grid_values_all(h5f, 'latitude', stride=4)
                    # NOTE(review): [406, 400] is presumably the centre of the strided grid -- confirm.
                    c_lon, c_lat = lon_s[406, 400], lat_s[406, 400]
                    if (lon_range[0] < c_lon < lon_range[1]) and (lat_range[0] < c_lat < lat_range[1]):
                        print(fpath)
                except Exception:
                    continue


def test_nlcomp(file):
    """Compute water contents from the 'nlcomp' retrievals of one file.

    file: path of the file to read.

    Reads 'cloud_phase', 'cld_reff_nlcomp', 'cld_opd_nlcomp' and 'cld_geo_thick', keeps the
    pixels where none of them is NaN and 'cld_geo_thick' exceeds 5.0, and passes the
    selected values to compute_lwc_iwc().  Returns its (lwc, iwc) result.
    """
    # The with-block makes sure the file is closed; it used to be left open.
    with h5py.File(file, 'r') as h5f:
        cld_phs = get_grid_values_all(h5f, 'cloud_phase', scale_factor_name=None, range_name=None)
        keep_0 = np.invert(np.isnan(cld_phs))

        reff = get_grid_values_all(h5f, 'cld_reff_nlcomp')
        keep_1 = np.invert(np.isnan(reff))

        opd = get_grid_values_all(h5f, 'cld_opd_nlcomp')
        keep_2 = np.invert(np.isnan(opd))

        cld_dz = get_grid_values_all(h5f, 'cld_geo_thick')
        keep_3 = np.logical_and(np.invert(np.isnan(cld_dz)), cld_dz > 5.0)

        keep = keep_0 & keep_1 & keep_2 & keep_3

        cld_phs = cld_phs[keep]
        reff = reff[keep]
        opd = opd[keep]
        cld_dz = cld_dz[keep]

        lwc_c, iwc_c = compute_lwc_iwc(cld_phs, reff, opd, cld_dz)

    return lwc_c, iwc_c


# def run_mean_std(directory):
#
#     data_dct = {name: [] for name in mod_res_params}
#     mean_dct = {name: 0 for name in mod_res_params}
#     std_dct = {name: 0 for name in mod_res_params}
#
#     for p in os.scandir(directory):
#         if not p.is_dir():
#             continue
#         mod_files = glob.glob(directory+p.name+'/'+'VNP02MOD*.uwssec.nc')
#
#         for idx, mfile in enumerate(mod_files):
#             if idx % 8 == 0:
#                 h5f = h5py.File(mfile, 'r')
#                 for param in mod_res_params:
#                     name = 'observation_data/'+param
#                     gvals = get_grid_values_all(h5f, name, range_name=None, stride=10)
#                     data_dct[param].append(gvals.flatten())
#                 print(mfile)
#                 h5f.close()
#
#     for param in mod_res_params:
#         data = data_dct[param]
#         data = np.concatenate(data)
#
#         mean_dct[param] = np.nanmean(data)
#         std_dct[param] = np.nanstd(data)