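"""hdf5_conversion.py

Flatten per-FOV 2D datasets and per-record 1D datasets from an HDF5 file
into aligned 1D columns, keeping only values where FOV_mask is True, and
write the combined result out as CSV and HDF5.
"""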
import h5py
import numpy as np
import pandas as pd
import os


def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
    """
    Convert HDF5 files to NumPy's NPZ and CSV formats in chunks.
    Only values where the boolean mask is True are included.

    Parameters:
    hdf5_filename (str): Path to the input HDF5 file.
    output_file_prefix (str): Prefix for the output NPZ and CSV files.
    chunk_size (int): Size of chunks to process at once (default is 1000).
    """
    # Get the directory from hdf5_filename
    dirpath = os.path.dirname(hdf5_filename)

    with h5py.File(hdf5_filename, "r") as file:
        fov_mask = np.asarray(file["FOV_mask"])
        # aside from FOV_mask itself, these are per-record 1D datasets that
        # are broadcast against the mask in the second pass below
        exclude_list = ['FOV_mask', 'time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
        dict_2d = {}
        dict_1d = {}

        # First pass: mask each 2D (per-FOV) dataset down to a flat 1D array
        for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            # Accumulate masked rows across every chunk so the final array
            # covers the whole dataset, not just the last chunk
            keep_array = []

            # Process each chunk
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                # Load chunk into memory, then apply the mask row by row
                data_chunk = dataset[start_index:end_index]

                for idx in range(data_chunk.shape[0]):
                    data = data_chunk[idx].flatten()
                    # index the mask by the global record position, not the
                    # chunk-local index
                    mask = fov_mask[start_index + idx].flatten().astype(bool)
                    keep_array.append(data[mask])

            dict_2d[dataset_name] = np.concatenate(keep_array)

        # Second pass: per-record 1D datasets, each value repeated once per
        # True mask element so the columns line up with the masked 2D data
        exclude_list.remove('FOV_mask')
        for dataset_name in exclude_list:
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            # Accumulate broadcast values across every chunk
            keep_array = []

            # Process each chunk
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                # Load chunk into memory
                data_chunk = dataset[start_index:end_index]

                for idx in range(data_chunk.shape[0]):
                    # repeat this record's value once for each True mask
                    # element, using the global record index into the mask
                    num_kept = np.count_nonzero(fov_mask[start_index + idx])
                    keep_array.append(np.full(num_kept, data_chunk[idx]))

            dict_1d[dataset_name] = np.concatenate(keep_array)

        # Combine dict_2d and dict_1d
        combined_dict = {**dict_2d, **dict_1d}

        # Convert combined_dict to DataFrame
        df = pd.DataFrame(combined_dict)

        # Write the DataFrame to a CSV file
        df.to_csv(os.path.join(dirpath, f"{output_file_prefix}_1D.csv"), index=False)

        # Write the combined_dict to a new HDF5 file
        with h5py.File(os.path.join(dirpath, f"{output_file_prefix}_1D.h5"), 'w') as output_file:
            for key, data in combined_dict.items():
                output_file.create_dataset(key, data=data)
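

# A minimal usage sketch, not part of the original module: the input path and
# output prefix below are hypothetical, and the input file is assumed to
# contain an "FOV_mask" dataset alongside the datasets named in exclude_list.
if __name__ == "__main__":
    hdf5_to_npz_csv("/path/to/input.h5", "icing_output", chunk_size=10000)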