Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import h5py
import numpy as np
import pandas as pd
def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=1000):
    """
    Convert the datasets of an HDF5 file to NumPy NPZ and CSV files in chunks.

    Every top-level dataset is read ``chunk_size`` rows at a time (along its
    first axis). Each chunk is written to two files::

        {output_file_prefix}_chunk_{i}_{dataset_name}.npz
        {output_file_prefix}_chunk_{i}_{dataset_name}.csv

    where ``i`` is the zero-based chunk index. The NPZ file stores the chunk
    as a single positional array (NumPy names it ``arr_0``).

    Masking: the file must contain a top-level dataset named ``"mask"``. For
    every dataset whose full shape equals the mask's shape, only the values
    where the mask is True are kept; the result of that selection is a 1-D
    array. Datasets with any other shape are exported unmasked. The ``"mask"``
    dataset is itself iterated and exported like any other dataset.

    Top-level entries that are not datasets (e.g. groups), and datasets
    without a first axis (scalar or empty dataspace), are skipped.

    Parameters:
        hdf5_filename (str): Path to the input HDF5 file.
        output_file_prefix (str): Prefix for the output NPZ and CSV files.
        chunk_size (int): Number of rows to process at once (default is 1000).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        KeyError: If the file has no top-level ``"mask"`` entry.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    with h5py.File(hdf5_filename, "r") as h5_file:
        # The whole mask is held in memory; only the datasets are streamed.
        # Coerce to bool so a mask stored as 0/1 integers selects values
        # instead of being interpreted as integer (fancy) indices.
        mask = np.asarray(h5_file["mask"]).astype(bool)

        for dataset_name in h5_file.keys():
            dataset = h5_file[dataset_name]

            # Groups have no shape, and scalar/empty datasets have no first
            # axis to chunk along, so neither can be processed below.
            if not isinstance(dataset, h5py.Dataset) or not dataset.shape:
                continue

            num_rows = dataset.shape[0]

            # Decide per dataset, not per chunk: a chunk of a large dataset
            # never has the full mask's shape, so comparing chunk shapes would
            # silently skip masking for anything longer than chunk_size.
            apply_mask = dataset.shape == mask.shape

            for chunk_index, start_index in enumerate(
                range(0, num_rows, chunk_size)
            ):
                end_index = min(start_index + chunk_size, num_rows)

                # Only this slice of the dataset is loaded into memory.
                data_chunk = dataset[start_index:end_index]
                if apply_mask:
                    data_chunk = data_chunk[mask[start_index:end_index]]

                output_stem = (
                    f"{output_file_prefix}_chunk_{chunk_index}_{dataset_name}"
                )

                # Save the chunk as NPZ with its shape preserved.
                np.savez(f"{output_stem}.npz", data_chunk)

                # pandas only accepts 1-D/2-D input, so flatten any trailing
                # axes into columns for the CSV export.
                csv_data = data_chunk
                if csv_data.ndim > 2:
                    num_columns = int(np.prod(csv_data.shape[1:]))
                    csv_data = csv_data.reshape(csv_data.shape[0], num_columns)
                pd.DataFrame(csv_data).to_csv(f"{output_stem}.csv")