Skip to content
Snippets Groups Projects
Commit 5b7691fd authored by tomrink's avatar tomrink
Browse files

snapshot...

parent 48989aad
No related branches found
No related tags found
No related merge requests found
...@@ -16,10 +16,14 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000): ...@@ -16,10 +16,14 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
# Step 1: Open HDF5 file # Step 1: Open HDF5 file
with h5py.File(hdf5_filename, "r") as file: with h5py.File(hdf5_filename, "r") as file:
mask = np.asarray(file["FOV_mask"]) # If mask needs to be applied, load it into memory fov_mask = np.asarray(file["FOV_mask"])
# these are 1D arrays that we'll broadcast from below
exclude_list = ['FOV_mask', 'time', 'icing_intensity', 'flight_altitude', 'latitude', 'longitude']
dict_2d = {}
dict_1d = {}
# For each dataset # For each dataset
for dataset_name in filter(lambda key: key != "FOV_mask", file.keys()): for dataset_name in filter(lambda key: key not in exclude_list, file.keys()):
dataset = file[dataset_name] dataset = file[dataset_name]
# Determine how many chunks are needed (rounded up) # Determine how many chunks are needed (rounded up)
...@@ -31,14 +35,44 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000): ...@@ -31,14 +35,44 @@ def hdf5_to_npz_csv(hdf5_filename, output_file_prefix, chunk_size=10000):
end_index = min((i + 1) * chunk_size, dataset.shape[0]) end_index = min((i + 1) * chunk_size, dataset.shape[0])
# Load chunk into memory, apply mask if necessary # Load chunk into memory, apply mask if necessary
data_chunk = dataset[start_index:end_index] data_chunk = dataset[start_index:end_index,]
if data_chunk.shape == mask.shape: keep_array = []
data_chunk = data_chunk[mask[start_index:end_index]] for idx in range(data_chunk.shape[0]):
data = data_chunk[idx,].flatten()
mask = fov_mask[idx,].flatten().astype(np.bool)
keep_array.append(data[mask])
data_chunk = np.concatenate(keep_array)
dict_2d[dataset_name] = data_chunk
# Step 2: Save chunk to npz file (adds a suffix to filename) exclude_list.remove('FOV_mask')
np.savez(f"{output_file_prefix}_chunk_{i}_{dataset_name}.npz", data_chunk) for dataset_name in exclude_list:
dataset = file[dataset_name]
# Determine how many chunks are needed (rounded up)
num_chunks = (dataset.shape[0] + chunk_size - 1) // chunk_size
# Process each chunk
for i in range(num_chunks):
start_index = i * chunk_size
end_index = min((i + 1) * chunk_size, dataset.shape[0])
# Load chunk into memory, apply mask if necessary
data_chunk = dataset[start_index:end_index, ]
keep_array = []
for idx in range(data_chunk.shape[0]):
data = np.zeros(np.sum(fov_mask[idx]))
data[:] = data_chunk[idx]
keep_array.append(data)
data_chunk = np.concatenate(keep_array)
dict_1d[dataset_name] = data_chunk
# Combine dict_2d and dict_1d
combined_dict = {**dict_2d, **dict_1d}
# Convert combined_dict to DataFrame
df = pd.DataFrame(combined_dict)
# Step 3: Convert chunk to DataFrame and save as CSV (adds a suffix to filename) # Write the DataFrame to a file
df = pd.DataFrame(data_chunk) df.to_csv(f"{output_file_prefix}_combined.csv", index=False)
df.to_csv(f"{output_file_prefix}_chunk_{i}_{dataset_name}.csv") \ No newline at end of file
\ No newline at end of file
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment