import h5py
import numpy as np
import pandas as pd


def _iter_chunks(dataset, mask, chunk_size):
    """Yield ``(chunk_index, array)`` pairs covering *dataset* along axis 0.

    The mask is applied to each chunk when the dataset's full shape equals
    the mask's shape; scalar datasets are yielded once, unmasked.
    """
    if dataset.shape == ():
        # A scalar dataset cannot be sliced along axis 0; emit its single
        # value as a one-element array so both writers accept it.
        yield 0, np.atleast_1d(dataset[()])
        return

    # Decide from the FULL dataset shape. Comparing the chunk's shape with
    # the mask's shape is only ever true when one chunk spans the whole
    # dataset, which would leave every multi-chunk dataset unmasked.
    apply_mask = dataset.shape == mask.shape
    n_rows = dataset.shape[0]

    for chunk_index, start in enumerate(range(0, n_rows, chunk_size)):
        stop = min(start + chunk_size, n_rows)
        chunk = dataset[start:stop]  # reads only this slice from disk
        if apply_mask:
            # Boolean selection with the matching slice of the mask
            # flattens the chunk to the 1-D sequence of kept values.
            chunk = chunk[mask[start:stop]]
        yield chunk_index, chunk


def hdf5_to_npz_csv(
    hdf5_filename: str, output_file_prefix: str, chunk_size: int = 10000
) -> None:
    """
    Convert the datasets of an HDF5 file to NumPy NPZ and CSV files in chunks.

    Every top-level dataset other than "FOV_mask" is read in slices of at
    most ``chunk_size`` entries along its first axis. Each slice is written
    to ``{output_file_prefix}_chunk_{i}_{dataset_name}.npz`` and
    ``{output_file_prefix}_chunk_{i}_{dataset_name}.csv``, where ``i`` is the
    zero-based chunk index.

    For datasets whose shape equals the shape of "FOV_mask", only the values
    where the mask is True are written. Datasets with any other shape are
    written unmasked. Top-level entries that are not datasets (e.g. groups)
    are skipped, and datasets with no entries produce no files.

    Parameters:
    hdf5_filename (str): Path to the input HDF5 file.
    output_file_prefix (str): Prefix for the output NPZ and CSV files.
    chunk_size (int): Number of entries along the first axis to process at
        once (default is 10000).

    Raises:
    ValueError: If chunk_size is smaller than 1.
    KeyError: If the file has no "FOV_mask" entry.

    Note: the CSV is produced with pandas.DataFrame, which only accepts
    1-D or 2-D input, so unmasked chunks with more dimensions fail at the
    CSV step.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    with h5py.File(hdf5_filename, "r") as file:
        # The mask is loaded fully into memory. It is cast to bool after
        # reading so that a mask stored as 0/1 integers selects values
        # instead of being interpreted as integer indices.
        mask = np.asarray(file["FOV_mask"]).astype(bool)

        for dataset_name in file:
            if dataset_name == "FOV_mask":
                continue

            dataset = file[dataset_name]
            if not isinstance(dataset, h5py.Dataset):
                # Groups and other non-dataset entries have no array data.
                continue

            for i, data_chunk in _iter_chunks(dataset, mask, chunk_size):
                output_stem = f"{output_file_prefix}_chunk_{i}_{dataset_name}"

                # NPZ: the array is stored positionally (key "arr_0").
                np.savez(f"{output_stem}.npz", data_chunk)

                # CSV: written with the default DataFrame index column.
                pd.DataFrame(data_chunk).to_csv(f"{output_stem}.csv")