snapshot...

5b7691fd · tomrink · 48989aad · 5b7691fd
Commit 5b7691fd authored 1 year ago by tomrink
--- a/modules/util/hdf5_conversion.py
+++ b/modules/util/hdf5_conversion.py
@@ -16,10 +16,14 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
    # Step 1: Open HDF5 file
    with h5py.File(hdf5_filename, "r") as file:
-        mask = np.asarray(file["FOV_mask"])  # If mask needs to be applied, load it into memory
+        fov_mask = np.asarray(file["FOV_mask"])
+        # FOV_mask (handled separately) plus the 1D arrays that we'll broadcast from below
+        exclude_list = ['FOV_mask', 'time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
+        dict_2d = {}
+        dict_1d = {}
        # For each dataset
-        for dataset_name in filter(lambda key: key != "FOV_mask", file.keys()):
+        for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
            dataset = file[dataset_name]
            # Determine how many chunks are needed (rounded up)
@@ -31,14 +35,44 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
                end_index = min((i + 1) * chunk_size, dataset.shape[0])
                # Load chunk into memory, apply mask if necessary
-                data_chunk = dataset[start_index:end_index]
+                data_chunk = dataset[start_index:end_index,]
-                if data_chunk.shape == mask.shape:
+                keep_array = []
-                    data_chunk = data_chunk[mask[start_index:end_index]]
+                for idx in range(data_chunk.shape[0]):
+                    data = data_chunk[idx,].flatten()
+                    mask = fov_mask[idx,].flatten().astype(np.bool)
+                    keep_array.append(data[mask])
+                data_chunk = np.concatenate(keep_array)
+                dict_2d[dataset_name] = data_chunk
-                # Step 2: Save chunk to npz file (adds a suffix to filename)
+        exclude_list.remove('FOV_mask')
-                np.savez(f"{output_file_prefix}_chunk_{i}_{dataset_name}.npz", data_chunk)
+        for dataset_name in exclude_list:
+            dataset = file[dataset_name]
+            # Determine how many chunks are needed (rounded up)
+            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size
+            # Process each chunk
+            for i in range(num_chunks):
+                start_index = i * chunk_size
+                end_index = min((i + 1) * chunk_size, dataset.shape[0])
+                # Load chunk into memory; each value is repeated below to match its FOV mask count
+                data_chunk = dataset[start_index:end_index, ]
+                keep_array = []
+                for idx in range(data_chunk.shape[0]):
+                    data = np.zeros(np.sum(fov_mask[idx]))
+                    data[:] = data_chunk[idx]
+                    keep_array.append(data)
+                data_chunk = np.concatenate(keep_array)
+                dict_1d[dataset_name] = data_chunk
+        # Combine dict_2d and dict_1d
+        combined_dict = {**dict_2d, **dict_1d}
+        # Convert combined_dict to DataFrame
+        df = pd.DataFrame(combined_dict)
-                # Step 3: Convert chunk to DataFrame and save as CSV (adds a suffix to filename)
+        # Write the DataFrame to a file
-                df = pd.DataFrame(data_chunk)
+        df.to_csv(f"{output_file_prefix}_combined.csv", index=False)
-                df.to_csv(f"{output_file_prefix}_chunk_{i}_{dataset_name}.csv")
\ No newline at end of file
\ No newline at end of file