diff --git a/modules/util/hdf5_conversion.py b/modules/util/hdf5_conversion.py
index 60b5617abdbb668858826a930479f7878071a948..573417631b08114375a600143d72bf8b13b48d43 100644
--- a/modules/util/hdf5_conversion.py
+++ b/modules/util/hdf5_conversion.py
@@ -16,10 +16,14 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
 
     # Step 1: Open HDF5 file
     with h5py.File(hdf5_filename, "r") as file:
-        mask = np.asarray(file["FOV_mask"])  # If mask needs to be applied, load it into memory
+        fov_mask = np.asarray(file["FOV_mask"]).astype(bool)  # One mask per frame, loaded once
+        # 1-D per-frame datasets (and the mask itself) are handled separately below
+        exclude_list = ['FOV_mask', 'time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
+        dict_2d = {}
+        dict_1d = {}
 
         # For each dataset
-        for dataset_name in filter(lambda key: key != "FOV_mask", file.keys()):
+        for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
             dataset = file[dataset_name]
 
             # Determine how many chunks are needed (rounded up)
@@ -31,14 +35,45 @@
                 end_index = min((i + 1) * chunk_size, dataset.shape[0])
 
                 # Load chunk into memory, apply mask if necessary
                 data_chunk = dataset[start_index:end_index]
 
-                if data_chunk.shape == mask.shape:
-                    data_chunk = data_chunk[mask[start_index:end_index]]
+                # Keep only the in-FOV pixels of each frame in the chunk
+                keep_array = []
+                for idx in range(data_chunk.shape[0]):
+                    data = data_chunk[idx].flatten()
+                    mask = fov_mask[start_index + idx].flatten()
+                    keep_array.append(data[mask])
+                # Append per chunk so later chunks don't overwrite earlier ones
+                dict_2d.setdefault(dataset_name, []).append(np.concatenate(keep_array))
 
-                # Step 2: Save chunk to npz file (adds a suffix to filename)
-                np.savez(f"{output_file_prefix}_chunk_{i}_{dataset_name}.npz", data_chunk)
+        exclude_list.remove('FOV_mask')
+        for dataset_name in exclude_list:
+            dataset = file[dataset_name]
+
+            # Determine how many chunks are needed (rounded up)
+            num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size
+
+            # Process each chunk
+            for i in range(num_chunks):
+                start_index = i * chunk_size
+                end_index = min((i + 1) * chunk_size, dataset.shape[0])
+
+                # Load chunk into memory
+                data_chunk = dataset[start_index:end_index]
+
+                keep_array = []
+                for idx in range(data_chunk.shape[0]):
+                    # Broadcast the frame's scalar value across its in-FOV pixel count
+                    n_kept = np.count_nonzero(fov_mask[start_index + idx])
+                    keep_array.append(np.full(n_kept, data_chunk[idx]))
+                dict_1d.setdefault(dataset_name, []).append(np.concatenate(keep_array))
+
+    # Combine dict_2d and dict_1d, concatenating the per-chunk pieces of each column
+    combined_dict = {name: np.concatenate(chunks)
+                     for name, chunks in {**dict_2d, **dict_1d}.items()}
+
+    # Convert combined_dict to DataFrame
+    df = pd.DataFrame(combined_dict)
 
-                # Step 3: Convert chunk to DataFrame and save as CSV (adds a suffix to filename)
-                df = pd.DataFrame(data_chunk)
-                df.to_csv(f"{output_file_prefix}_chunk_{i}_{dataset_name}.csv")
\ No newline at end of file
+    # Write the combined DataFrame to a single CSV file
+    df.to_csv(f"{output_file_prefix}_combined.csv", index=False)
\ No newline at end of file
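
For reference, a minimal usage sketch of the converted function (not part of the diff). The input filename, output prefix, and chunk size below are hypothetical placeholders:

```python
from modules.util.hdf5_conversion import hdf5_to_npz_csv

# "flight_data.h5" and the "flight_data" prefix are assumed example values.
# Produces a single "flight_data_combined.csv" with one row per in-FOV pixel:
# image datasets become flattened columns, and the per-frame scalars
# (time, icing_intensity, ...) are broadcast to match their lengths.
hdf5_to_npz_csv("flight_data.h5", "flight_data", chunk_size=5000)
```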