Skip to content
Snippets Groups Projects

Sorting script

Merged Katherine Kolman requested to merge sorting-script into master
+ 118
0
import os
from datetime import datetime, timedelta
import sys
import logging
# Module-level logger named after this module; main() configures the root
# logger with logging.basicConfig before any processing starts.
LOG = logging.getLogger(__name__)
def isLegacyFile(filename):
    """Tell whether *filename* is dated before 2013.

    The year is read from the text between the first and second ``.`` of the
    name, up to the first ``-`` (e.g. ``name.2012-05-01.ascii`` -> ``2012``),
    and compared as a string against ``"2013"``.

    Args:
        filename: File name containing at least one ``.``.

    Returns:
        True when the year text sorts before ``"2013"``, False otherwise.

    Raises:
        IndexError: if *filename* contains no ``.``.
    """
    date_field = filename.split(".")[1]
    year_text = date_field.split("-")[0]
    return year_text < "2013"
def getDateTime(line, isLegacy):
    """Extract the timestamp of one data line as a naive ``datetime``.

    Two line layouts are handled:

    * Legacy (``isLegacy`` true): comma-separated fields where the 4th, 5th
      and 6th fields are the year, the day of year and an ``HHMM`` time that
      may be written without leading zeros (``30`` means 00:30).  A time of
      ``2400`` means midnight at the end of that day.  Six hours are added to
      the parsed value.
      NOTE(review): the +6h looks like a local-standard-time to UTC shift --
      confirm against the data source.
    * Current (``isLegacy`` false): the first space-separated token is one
      leading character followed by ``YYYY?MM?DD`` (the separator characters
      are not checked), and the second token starts with ``HH:MM:SS``.
      NOTE(review): the leading character is presumably an opening quote --
      confirm.

    Args:
        line: One raw data line, with or without its line terminator.
        isLegacy: Selects the legacy layout when true.

    Returns:
        The line's timestamp; legacy timestamps are shifted by +6 hours.

    Raises:
        ValueError: if a legacy line has fewer than six fields or any
            date/time text cannot be parsed.
        IndexError: if a current-format line contains no space.
    """
    if isLegacy:
        _, _, _, year_text, doy_text, time_text = line.split(",")[:6]
        # Strip padding and a possible trailing newline (when the time is the
        # last field on the line) so "2400" is recognised and parsing works.
        year_text = year_text.strip()
        doy_text = doy_text.strip()
        time_text = time_text.strip()

        day_start = datetime.strptime(doy_text + " " + year_text, "%j %Y")
        if time_text == "2400":
            # "2400" is not a valid %H%M value: treat it as 00:00 of the next day.
            offset = timedelta(days=1)
        else:
            clock = datetime.strptime(time_text.rjust(4, "0"), "%H%M")
            offset = timedelta(hours=clock.hour, minutes=clock.minute)
        return day_start + offset + timedelta(hours=6)

    tokens = line.split(" ")
    date_text = tokens[0]
    clock = datetime.strptime(tokens[1][:8], "%H:%M:%S")
    return datetime(
        int(date_text[1:5]),
        int(date_text[6:8]),
        int(date_text[9:11]),
        clock.hour,
        clock.minute,
        clock.second,
    )
# Default directory searched for input files when --input is not given.
# NOTE(review): this is an absolute path inside one user's home directory.
INPUT_DIR = "/Users/kkolman/data1/raw/mendota/buoy/"
# Default base directory for the sorted output files when --output is not given.
OUTPUT_DIR = "./testdata2/data1/raw/mendota/buoy/"
def create_line_infos(base_dir):
    """Collect header and data lines from every matching file under *base_dir*.

    Walks *base_dir* recursively and reads each file whose name starts with
    ``"me"``.  File names are expected to look like
    ``<stem>.<YYYY-MM-DD>.<ext>``.  The last ``_``-separated piece of the stem,
    prefixed with ``_``, is the file type; ``"_buoy"`` is mapped to the empty
    string.

    Args:
        base_dir: Directory to search.

    Returns:
        A ``(headers, line_infos)`` tuple.  ``headers`` maps
        ``(file_type, file_date)`` to the list of header lines read from files
        of that type and date.  ``line_infos`` is a list of
        ``(file_type, line_date, file_date, line)`` tuples, one per data line,
        in the order the lines were read.

    Raises:
        IndexError: if a matching file name contains no ``.``.
        ValueError: if the date in a file name or the timestamp of a data
            line cannot be parsed.
    """
    line_infos = []
    headers = {}
    LOG.info("Searching through files...")
    for dirpath, _dirs, files in os.walk(base_dir):
        LOG.debug("Searching: %s...", dirpath)
        for filename in files:
            if not filename.startswith("me"):
                continue
            filepath = os.path.join(dirpath, filename)
            name_parts = filename.split(".")
            file_date = datetime.strptime(name_parts[1], "%Y-%m-%d").date()
            is_legacy = isLegacyFile(filename)
            file_type = "_" + name_parts[0].split("_")[-1]
            if file_type == "_buoy":
                file_type = ""
            # newline='' keeps the original line terminators so they can be
            # written back unchanged.
            with open(filepath, "r", newline='') as filecontents:
                for line in filecontents:
                    # Lines read from a file keep their terminator, so a blank
                    # line is never zero-length; test the stripped text.
                    if not line.strip():
                        continue
                    # A header line has no digit in its first two characters.
                    # Slicing (not indexing) tolerates one-character lines.
                    if not any(char.isdigit() for char in line[:2]):
                        headers.setdefault((file_type, file_date), []).append(line)
                        continue
                    line_date = getDateTime(line, is_legacy)
                    line_infos.append((file_type, line_date, file_date, line))
    return headers, line_infos
def _write_sorted_lines(headers, line_infos, output_dir):
    """Write data lines, sorted and de-duplicated, into one file per type and day.

    Lines are sorted by ``(file_type, line_date, file_date, line)``.  Each
    distinct ``(file_type, calendar day of line_date)`` gets its own file at
    ``<output_dir>/YYYY/MM/DD/mendota_buoy<file_type>.YYYY-MM-DD.ascii``.  A
    new file starts with the header lines registered for the source file of
    its first data line.  Within a file, a line whose timestamp equals the
    previously written line's timestamp is skipped as a duplicate.
    """
    filename_pattern = os.path.join("%Y", "%m", "%d", "mendota_buoy{}.%Y-%m-%d.ascii")
    current_file = None
    current_key = None
    prev_line_date = None
    try:
        for file_type, line_date, file_date, line in sorted(line_infos):
            key = (file_type, line_date.date())
            if key != current_key:
                fpath = os.path.join(
                    output_dir, line_date.strftime(filename_pattern.format(file_type))
                )
                os.makedirs(os.path.dirname(fpath), exist_ok=True)
                if current_file is not None:
                    current_file.close()
                current_file = open(fpath, 'w', newline='')
                current_file.writelines(headers.get((file_type, file_date), []))
                current_key = key
                # Duplicate detection is per output file: without this reset
                # the first line of a new file type could be dropped because
                # it shares a timestamp with the last line of the previous one.
                prev_line_date = None
            if prev_line_date == line_date:
                LOG.debug("Duplicate line: %s", line_date.isoformat())
                continue
            current_file.write(line)
            prev_line_date = line_date
    finally:
        # Also covers the "no data lines at all" case, where no file was opened.
        if current_file is not None:
            current_file.close()


def main():
    """Command-line entry point: merge, sort and rewrite buoy data files.

    Reads every input directory given with ``--input`` (default
    ``INPUT_DIR``), combines the lines found in all of them, and writes the
    sorted result below ``--output`` (default ``OUTPUT_DIR``).

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--input", nargs="+", default=[INPUT_DIR],
                        help="One or more input directories to search for files")
    parser.add_argument("--output", default=OUTPUT_DIR,
                        help="Base output directory to write files to (next subdirectory is YYYY)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)

    # Gather every input directory before sorting so that data from several
    # directories ends up merged instead of one directory replacing another.
    headers = {}
    line_infos = []
    for input_dir in args.input:
        dir_headers, dir_line_infos = create_line_infos(input_dir)
        for key, header_lines in dir_headers.items():
            # Keep the first header seen for a (type, date) so the same header
            # is not repeated when several directories hold that file.
            headers.setdefault(key, header_lines)
        line_infos.extend(dir_line_infos)

    LOG.info("Writing data lines to destination files...")
    _write_sorted_lines(headers, line_infos, args.output)
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script and
    # pass its return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)
Loading