# Sorting script
 
import os
 
from datetime import datetime, timedelta
 
import sys
 
import logging
 
 
# Module-level logger, named after the importing module (``__name__``).
LOG = logging.getLogger(__name__)
 
 
def isLegacyFile(filename):
    """Return True when the date embedded in *filename* is before 2013.

    The second dot-separated component of the name is taken as a
    ``YYYY-MM-DD`` style date, and its leading year is compared
    numerically against 2013.

    Args:
        filename: Base name of a data file, e.g.
            ``"mendota_buoy.2012-07-04.ascii"``.

    Returns:
        bool: True if the year in the file name is earlier than 2013.

    Raises:
        IndexError: If *filename* contains no ``"."``.
        ValueError: If the year component is not an integer.
    """
    date_component = filename.split(".")[1]
    year_text = date_component.split("-")[0]
    # Compare as integers; a string comparison only orders years correctly
    # when both sides have the same number of digits.
    return int(year_text) < 2013
 
 
def getDateTime(line, isLegacy):
    """Parse the timestamp of one data line into a naive ``datetime``.

    Args:
        line: One raw data line.
        isLegacy: Selects the line layout.
            True: comma-separated line whose 4th, 5th and 6th fields are
            the year, the day of year and the time as ``HHMM`` (leading
            zeros optional; ``"2400"`` means midnight at the end of that
            day). Six hours are added to the parsed time.
            False: the first space-separated token holds the date as
            ``YYYY-MM-DD`` after one leading character (which is skipped),
            and the second token starts with ``HH:MM:SS``.

    Returns:
        datetime: The parsed (and, for legacy lines, shifted) timestamp.

    Raises:
        ValueError: If a field is missing or cannot be parsed.
        IndexError: If a non-legacy line contains no space.
    """
    if isLegacy:
        _, _, _, line_year, line_doy, line_time = line.split(",")[:6]
        # Midnight at the start of the given day of year.
        day_start = datetime.strptime(line_doy + " " + line_year, "%j %Y")
        # NOTE(review): fixed six-hour shift, presumably converting local
        # standard time to UTC -- confirm against the data description.
        shift = timedelta(hours=6)

        if line_time == "2400":
            # "2400" is not a valid %H%M value; it denotes 00:00 of the
            # following day.
            return day_start + timedelta(days=1) + shift

        clock = datetime.strptime(line_time.rjust(4, "0"), "%H%M")
        return day_start.replace(hour=clock.hour, minute=clock.minute) + shift

    tokens = line.split(" ")
    date_token = tokens[0]
    clock = datetime.strptime(tokens[1][:8], "%H:%M:%S")
    # Index 0 of the date token is skipped; the date occupies indices 1-10.
    return datetime(
        int(date_token[1:5]),
        int(date_token[6:8]),
        int(date_token[9:11]),
        clock.hour,
        clock.minute,
        clock.second,
    )
 
 
 
# Default directory searched for input files; used as the default value of
# the ``--input`` command-line option.
INPUT_DIR = "/Users/kkolman/data1/raw/mendota/buoy/"

# Default base directory for the sorted output files; used as the default
# value of the ``--output`` command-line option.
OUTPUT_DIR = "./testdata2/data1/raw/mendota/buoy/"
 
 
 
def create_line_infos(base_dir):
    """Walk *base_dir* and collect header and data lines from its files.

    Only files whose name starts with ``"me"`` are read. The second
    dot-separated component of the file name must be a ``YYYY-MM-DD`` date.
    The file type is the last ``_``-separated piece of the first component,
    prefixed with ``"_"``; the type ``"_buoy"`` is mapped to ``""``.

    A line is treated as a header when neither of its first two characters
    is a digit; every other non-blank line is treated as data and its
    timestamp is parsed with ``getDateTime``. Blank lines are skipped.

    Args:
        base_dir: Directory to search recursively.

    Returns:
        tuple: ``(headers, line_infos)`` where ``headers`` maps
        ``(file_type, file_date)`` to the list of header lines found for
        that key, and ``line_infos`` is a list of
        ``(file_type, line_date, file_date, line)`` tuples, one per data
        line. Lines keep their original line endings.

    Raises:
        IndexError: If a matching file name contains no ``"."``.
        ValueError: If a file-name date or a line timestamp cannot be parsed.
    """
    line_infos = []
    headers = {}
    LOG.info("Searching through files...")
    for dirpath, _dirs, files in os.walk(base_dir):
        LOG.debug("Searching: %s...", dirpath)
        for filename in files:
            if not filename.startswith("me"):
                continue

            name_parts = filename.split(".")
            filepath = os.path.join(dirpath, filename)
            file_date = datetime.strptime(name_parts[1], "%Y-%m-%d").date()
            is_legacy = isLegacyFile(filename)
            file_type = "_" + name_parts[0].split("_")[-1]
            if file_type == "_buoy":
                file_type = ""

            # newline='' disables newline translation so each line is stored
            # with the exact line ending it had in the source file.
            with open(filepath, "r", newline='') as filecontents:
                for line in filecontents:
                    # Lines read from a file are never empty strings, so test
                    # for blank content; indexing a bare "\n" at [1] would
                    # otherwise raise IndexError.
                    if not line.strip():
                        continue

                    # Header lines: neither of the first two characters is a
                    # digit. Slicing keeps one-character lines safe.
                    if not line[1:2].isdigit() and not line[0].isdigit():
                        headers.setdefault((file_type, file_date), []).append(line)
                        continue

                    line_date = getDateTime(line, is_legacy)
                    line_infos.append((file_type, line_date, file_date, line))
    return headers, line_infos
 
 
 
def main(argv=None):
    """Sort data lines from the input directories into per-day output files.

    Every input directory is scanned with ``create_line_infos``; the results
    of all directories are merged, sorted by
    ``(file_type, line_date, file_date, line)`` and written to
    ``<output>/YYYY/MM/DD/mendota_buoy<file_type>.YYYY-MM-DD.ascii``, named
    after each line's own timestamp. Each output file starts with the header
    lines recorded for the source file of the first data line written to it.
    Within one output file, a line whose timestamp equals that of the
    previously written line is logged as a duplicate and dropped.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default) the arguments are taken from ``sys.argv``.

    Returns:
        None
    """
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--input", nargs="+", default=[INPUT_DIR],
                        help="One or more input directories to search for files")
    parser.add_argument("--output", default=OUTPUT_DIR,
                        help="Base output directory to write files to (next subdirectory is YYYY)")
    args = parser.parse_args(argv)
    output_dir = args.output

    logging.basicConfig(level=logging.DEBUG)

    # Merge the results of every input directory so that none is discarded
    # and no output file is truncated by a later directory.
    headers = {}
    line_infos = []
    for input_dir in args.input:
        dir_headers, dir_line_infos = create_line_infos(input_dir)
        for key, header_lines in dir_headers.items():
            headers.setdefault(key, []).extend(header_lines)
        line_infos.extend(dir_line_infos)

    LOG.info("Writing data lines to destination files...")
    filename_pattern = os.path.join("%Y", "%m", "%d", "mendota_buoy{}.%Y-%m-%d.ascii")
    current_file = None
    current_key = None
    prev_line_date = None
    try:
        for file_type, line_date, file_date, line in sorted(line_infos):
            key = (file_type, line_date.date())
            if key != current_key:
                # The day or the file type changed: switch output files.
                if current_file is not None:
                    current_file.close()
                    current_file = None
                fpath = os.path.join(output_dir, line_date.strftime(filename_pattern.format(file_type)))
                os.makedirs(os.path.dirname(fpath), exist_ok=True)
                # newline='' writes the stored line endings back untouched.
                current_file = open(fpath, 'w', newline='')
                current_file.writelines(headers.get((file_type, file_date), []))
                current_key = key
                # Duplicates are only meaningful within one output file.
                prev_line_date = None

            if prev_line_date is not None and prev_line_date == line_date:
                LOG.debug("Duplicate line: %s", line_date.isoformat())
                continue
            current_file.write(line)
            prev_line_date = line_date
    finally:
        # current_file is still None when there were no data lines at all.
        if current_file is not None:
            current_file.close()
 
 
 
if __name__ == "__main__":
    # Run only when executed as a script; the return value of main() is
    # passed to sys.exit() as the exit status.
    sys.exit(main())
# End of script.