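"""hdf5_conversion.py

Flatten per-FOV 2D datasets and per-record 1D datasets from an HDF5 file
into aligned 1D columns, keeping only values where FOV_mask is True, and
write the combined result out as CSV and HDF5.
"""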
import h5py
import numpy as np
import pandas as pd
import os


def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
    """
    Convert HDF5 files to NumPy's NPZ and CSV formats in chunks.
    Only values where the boolean mask is True are included.

    Parameters:
    hdf5_filename (str): Path to the input HDF5 file.
    output_file_prefix (str): Prefix for the output NPZ and CSV files.
    chunk_size (int): Size of chunks to process at once (default is 1000).
    """
    # Get the directory from hdf5_filename
    dirpath = os.path.dirname(hdf5_filename)

    with h5py.File(hdf5_filename, "r") as file:
        fov_mask = np.asarray(file["FOV_mask"])
        # aside from FOV_mask itself, these are per-record 1D datasets that
        # are broadcast against the mask in the second pass below
        exclude_list = ['FOV_mask', 'time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
        dict_2d = {}
        dict_1d = {}

        # First pass: mask each 2D (per-FOV) dataset down to a flat 1D array
        for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            # Accumulate masked rows across every chunk so the final array
            # covers the whole dataset, not just the last chunk
            keep_array = []

            # Process each chunk
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                # Load chunk into memory, then apply the mask row by row
                data_chunk = dataset[start_index:end_index]

                for idx in range(data_chunk.shape[0]):
                    data = data_chunk[idx].flatten()
                    # index the mask by the global record position, not the
                    # chunk-local index
                    mask = fov_mask[start_index + idx].flatten().astype(bool)
                    keep_array.append(data[mask])

            dict_2d[dataset_name] = np.concatenate(keep_array)

        # Second pass: per-record 1D datasets, each value repeated once per
        # True mask element so the columns line up with the masked 2D data
        exclude_list.remove('FOV_mask')
        for dataset_name in exclude_list:
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            # Accumulate broadcast values across every chunk
            keep_array = []

            # Process each chunk
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                # Load chunk into memory
                data_chunk = dataset[start_index:end_index]

                for idx in range(data_chunk.shape[0]):
                    # repeat this record's value once for each True mask
                    # element, using the global record index into the mask
                    num_kept = np.count_nonzero(fov_mask[start_index + idx])
                    keep_array.append(np.full(num_kept, data_chunk[idx]))

            dict_1d[dataset_name] = np.concatenate(keep_array)

        # Combine dict_2d and dict_1d
        combined_dict = {**dict_2d, **dict_1d}

        # Convert combined_dict to DataFrame
        df = pd.DataFrame(combined_dict)

        # Write the DataFrame to a CSV file
        df.to_csv(os.path.join(dirpath, f"{output_file_prefix}_1D.csv"), index=False)

        # Write the combined_dict to a new HDF5 file
        with h5py.File(os.path.join(dirpath, f"{output_file_prefix}_1D.h5"), 'w') as output_file:
            for key, data in combined_dict.items():
                output_file.create_dataset(key, data=data)
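

# A minimal usage sketch, not part of the original module: the input path and
# output prefix below are hypothetical, and the input file is assumed to
# contain an "FOV_mask" dataset alongside the datasets named in exclude_list.
if __name__ == "__main__":
    hdf5_to_npz_csv("/path/to/input.h5", "icing_output", chunk_size=10000)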