import h5py
import numpy as np
import pandas as pd
import random


def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
    """
    Flatten an HDF5 file into 1D CSV and HDF5 outputs in chunks.
    Only values where the boolean FOV mask is True are included.

    Parameters:
        hdf5_filename (str): Path to the input HDF5 file.
        output_file_prefix (str): Prefix for the output CSV and HDF5 files.
        chunk_size (int): Number of rows to process at once (default is 10000).
    """
    # Step 1: Open the HDF5 file
    with h5py.File(hdf5_filename, "r") as file:
        fov_mask = np.asarray(file["FOV_mask"])

        # These are 1D per-row values that we'll broadcast across the mask below
        exclude_list = ['FOV_mask', 'time', 'icing_intensity',
                        'flight_altitude', 'latitude', 'longitude']

        dict_2d = {}
        dict_1d = {}

        # For each 2D dataset, keep only the masked values
        for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            chunks = []
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                # Load the chunk into memory and apply the mask row by row
                data_chunk = dataset[start_index:end_index]
                keep_array = []
                for idx in range(data_chunk.shape[0]):
                    data = data_chunk[idx].flatten()
                    # Index the mask with the global row number, not the
                    # chunk-relative one
                    mask = fov_mask[start_index + idx].flatten().astype(bool)

                    # Optional step: keep at most 10 True mask values per row
                    # (disabled by default)
                    # mask_indices = list(np.nonzero(mask)[0])
                    # if len(mask_indices) > 10:
                    #     selected_indices = random.sample(mask_indices, 10)
                    #     new_mask = np.zeros(mask.size, dtype=bool)
                    #     new_mask[selected_indices] = True
                    #     mask = new_mask

                    keep_array.append(data[mask])
                chunks.append(np.concatenate(keep_array))

            # Concatenate across chunks so earlier chunks are not overwritten
            dict_2d[dataset_name] = np.concatenate(chunks)

        # FOV_mask itself is not written out; the remaining 1D datasets are
        # broadcast per row so their columns line up with the masked 2D data
        exclude_list.remove('FOV_mask')
        for dataset_name in exclude_list:
            dataset = file[dataset_name]

            # Determine how many chunks are needed (rounded up)
            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size

            chunks = []
            for i in range(num_chunks):
                start_index = i * chunk_size
                end_index = min((i + 1) * chunk_size, dataset.shape[0])

                data_chunk = dataset[start_index:end_index]
                keep_array = []
                for idx in range(data_chunk.shape[0]):
                    # Repeat the row's scalar once per True mask value
                    n_kept = int(np.sum(fov_mask[start_index + idx]))
                    keep_array.append(np.full(n_kept, data_chunk[idx]))
                chunks.append(np.concatenate(keep_array))

            dict_1d[dataset_name] = np.concatenate(chunks)

    # Combine dict_2d and dict_1d
    combined_dict = {**dict_2d, **dict_1d}

    # Convert combined_dict to a DataFrame and write it to CSV
    df = pd.DataFrame(combined_dict)
    df.to_csv(f"{output_file_prefix}_1D.csv", index=False)

    # Write the combined_dict to a new HDF5 file
    with h5py.File(f"{output_file_prefix}_1D.h5", 'w') as output_file:
        for key, data in combined_dict.items():
            output_file.create_dataset(key, data=data)
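

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch. The dataset names mirror the ones the
# converter expects ("FOV_mask", "time", etc.); the file names and the
# synthetic "radiance" dataset are hypothetical placeholders, not part of
# any real data product.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_rows, n_pixels = 25, 8

    # Build a small synthetic input file: one 2D dataset plus the 1D
    # per-row values listed in exclude_list
    with h5py.File("demo_input.h5", "w") as f:
        f.create_dataset("FOV_mask", data=rng.random((n_rows, n_pixels)) > 0.5)
        f.create_dataset("radiance", data=rng.random((n_rows, n_pixels)))
        for name in ("time", "icing_intensity", "flight_altitude",
                     "latitude", "longitude"):
            f.create_dataset(name, data=rng.random(n_rows))

    # Flatten it; writes demo_output_1D.csv and demo_output_1D.h5
    hdf5_to_npz_csv("demo_input.h5", "demo_output", chunk_size=10)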