# Newer
# Older
def _iter_row_chunks(dataset, fov_mask, chunk_size):
    """Yield ``(rows, mask_rows)`` for successive row chunks of *dataset*.

    ``rows`` is the raw slice ``dataset[start:stop]`` loaded into memory and
    ``mask_rows`` is the matching slice of *fov_mask*, reshaped to
    ``(n_rows, -1)`` so each row's mask is a flat vector.
    """
    total_rows = dataset.shape[0]
    for start in range(0, total_rows, chunk_size):
        stop = min(start + chunk_size, total_rows)
        rows = np.asarray(dataset[start:stop])
        # The mask must be sliced with the same *global* row range as the
        # data; indexing it with a chunk-local index would pair every chunk
        # after the first with the wrong mask rows.
        mask_rows = fov_mask[start:stop]
        if mask_rows.shape[0] != rows.shape[0]:
            raise ValueError(
                f"dataset has {total_rows} rows but FOV_mask has only "
                f"{fov_mask.shape[0]}"
            )
        yield rows, mask_rows.reshape(mask_rows.shape[0], -1)


def _concat_or_empty(pieces):
    """Concatenate *pieces*, returning an empty array when there are none."""
    return np.concatenate(pieces) if pieces else np.empty(0)


def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
    """
    Convert an HDF5 file to NumPy's NPZ and CSV formats, reading in chunks.

    Only values where the boolean ``FOV_mask`` is True are included. Every
    dataset other than the mask and the per-row datasets (``time``,
    ``icing_intensity``, ``flight_altitude``, ``latitude``, ``longitude``)
    is reduced to its masked values, row by row. Each per-row dataset holds
    one value per row; that value is repeated once for every True mask entry
    of the row, so all output columns have the same length.

    Parameters:
        hdf5_filename (str): Path to the input HDF5 file.
        output_file_prefix (str): Prefix for the output files;
            ``<prefix>.npz`` and ``<prefix>.csv`` are written.
        chunk_size (int): Number of rows to load at once (default is 10000).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or a dataset has more
            rows than ``FOV_mask``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # These are 1D arrays (one value per row) that are broadcast below.
    per_row_names = ['time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
    skipped_names = {'FOV_mask', *per_row_names}

    # Masked datasets are inserted first, then the per-row ones, so the
    # column order is stable.
    columns = {}

    with h5py.File(hdf5_filename, "r") as h5_file:
        # Convert once to a real boolean mask so that selecting values and
        # counting them below always agree, even if the stored mask is not
        # strictly 0/1.
        fov_mask = np.asarray(h5_file["FOV_mask"]).astype(bool)

        for dataset_name in h5_file.keys():
            if dataset_name in skipped_names:
                continue
            # Boolean indexing on the (n_rows, -1) views returns the kept
            # values in row-major order, i.e. row 0's values, then row 1's.
            pieces = [
                rows.reshape(rows.shape[0], -1)[mask_rows]
                for rows, mask_rows in _iter_row_chunks(
                    h5_file[dataset_name], fov_mask, chunk_size
                )
            ]
            # Accumulate over all chunks instead of keeping only the last.
            columns[dataset_name] = _concat_or_empty(pieces)

        for dataset_name in per_row_names:
            pieces = []
            for rows, mask_rows in _iter_row_chunks(
                h5_file[dataset_name], fov_mask, chunk_size
            ):
                kept_per_row = np.count_nonzero(mask_rows, axis=1)
                # reshape() insists on exactly one value per row; the result
                # is float64, matching the zero-filled buffer used before.
                per_row_values = rows.reshape(rows.shape[0]).astype(np.float64)
                pieces.append(np.repeat(per_row_values, kept_per_row))
            columns[dataset_name] = _concat_or_empty(pieces)

    # Building the DataFrame also validates that all columns share a length.
    df = pd.DataFrame(columns)

    np.savez(f"{output_file_prefix}.npz", **columns)
    df.to_csv(f"{output_file_prefix}.csv", index=False)