From 28dc08108691079a4ccdc30c3964d477b13dc1ad Mon Sep 17 00:00:00 2001
From: tomrink <rink@ssec.wisc.edu>
Date: Thu, 21 Mar 2024 10:07:35 -0500
Subject: [PATCH] snapshot...

---
 modules/util/hdf5_conversion.py | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 modules/util/hdf5_conversion.py

diff --git a/modules/util/hdf5_conversion.py b/modules/util/hdf5_conversion.py
new file mode 100644
index 00000000..0aa0eb33
--- /dev/null
+++ b/modules/util/hdf5_conversion.py
@@ -0,0 +1,44 @@
+import h5py
+import numpy as np
+import pandas as pd
+
+
+def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=1000):
+    """
+    Convert every top-level dataset of an HDF5 file to NPZ and CSV files, in chunks.
+
+    Chunk i of dataset <name> is written to "<prefix>_chunk_<i>_<name>.npz" and ".csv".
+    If the file has a "mask" dataset, datasets with the same shape as the mask keep
+    only the values where the boolean mask is True (this flattens the chunk to 1-D).
+
+    Parameters:
+    hdf5_filename (str): Path to the input HDF5 file.
+    output_file_prefix (str): Prefix for the output NPZ and CSV files.
+    chunk_size (int): Rows along the first axis per chunk (default is 1000).
+
+    Raises:
+    ValueError: If chunk_size is less than 1.
+    """
+    if chunk_size < 1:
+        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
+
+    with h5py.File(hdf5_filename, "r") as file:
+        # The mask is optional; without one every dataset is exported unfiltered.
+        mask = np.asarray(file["mask"]) if "mask" in file else None
+
+        for dataset_name in file.keys():
+            dataset = file[dataset_name]
+            # Groups and scalar/empty-dataspace datasets have no first axis to chunk.
+            if not isinstance(dataset, h5py.Dataset) or not dataset.shape:
+                continue
+            # Decide on the FULL dataset shape: a chunk's shape only equals the
+            # mask's when the whole dataset happens to fit in a single chunk.
+            use_mask = mask is not None and dataset.shape == mask.shape
+            for i, start_index in enumerate(range(0, dataset.shape[0], chunk_size)):
+                end_index = min(start_index + chunk_size, dataset.shape[0])
+                data_chunk = dataset[start_index:end_index]
+                if use_mask:
+                    data_chunk = data_chunk[mask[start_index:end_index]]
+                stem = f"{output_file_prefix}_chunk_{i}_{dataset_name}"
+                np.savez(f"{stem}.npz", data_chunk)
+                pd.DataFrame(data_chunk).to_csv(f"{stem}.csv")
\ No newline at end of file
-- 
GitLab