# Origin: modules/util/hdf5_conversion.py, introduced by commit
# 28dc08108691079a4ccdc30c3964d477b13dc1ad ("snapshot...") by
# tomrink <rink@ssec.wisc.edu>, Thu, 21 Mar 2024 10:07:35 -0500.
import h5py
import numpy as np
import pandas as pd


def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=1000):
    """
    Convert the datasets of an HDF5 file to NPZ and CSV files, chunk by chunk.

    Every top-level dataset with at least one dimension is read in slices of
    ``chunk_size`` along its first axis. Each slice is written to
    ``{output_file_prefix}_chunk_{i}_{dataset_name}.npz`` and to a CSV file
    with the same stem.

    If the file contains a top-level dataset named ``"mask"``, it is loaded
    fully into memory and interpreted as a boolean mask. Datasets whose shape
    equals the mask's shape are filtered so that only values where the mask is
    True are written (boolean indexing, so a multi-dimensional chunk becomes
    1-D). Datasets with any other shape, and the ``"mask"`` dataset itself,
    are written unfiltered.

    Parameters:
        hdf5_filename (str): Path to the input HDF5 file.
        output_file_prefix (str): Prefix for the output NPZ and CSV files.
        chunk_size (int): Number of rows (first-axis entries) to process at
            once (default is 1000).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with h5py.File(hdf5_filename, "r") as file:
        # The mask is optional. Cast to bool so that a 0/1 integer mask selects
        # elements instead of being used as integer (fancy) indices.
        mask = None
        if "mask" in file and isinstance(file["mask"], h5py.Dataset):
            mask = np.asarray(file["mask"]).astype(bool)

        for dataset_name in file:
            dataset = file[dataset_name]

            # Groups have no shape; scalar / empty-dataspace datasets have no
            # first axis to chunk along. Neither can be exported row-wise.
            if not isinstance(dataset, h5py.Dataset) or not dataset.shape:
                continue

            num_rows = dataset.shape[0]

            # Decide once per dataset by comparing FULL shapes; a chunk's shape
            # only matches the mask's when the whole dataset fits in one chunk.
            apply_mask = (
                mask is not None
                and dataset_name != "mask"
                and dataset.shape == mask.shape
            )

            for i, start_index in enumerate(range(0, num_rows, chunk_size)):
                end_index = min(start_index + chunk_size, num_rows)

                # Only this slice of the dataset is read into memory.
                data_chunk = dataset[start_index:end_index]

                if apply_mask:
                    data_chunk = data_chunk[mask[start_index:end_index]]

                output_stem = f"{output_file_prefix}_chunk_{i}_{dataset_name}"

                # NPZ keeps the chunk's exact shape and dtype.
                np.savez(f"{output_stem}.npz", data_chunk)

                # pandas only accepts 1-D/2-D input, so flatten any trailing
                # axes for the CSV representation (one CSV row per data row).
                csv_data = data_chunk
                if csv_data.ndim > 2:
                    csv_data = csv_data.reshape(csv_data.shape[0], -1)
                pd.DataFrame(csv_data).to_csv(f"{output_stem}.csv")