-
Max Drexler authoredMax Drexler authored
passes.py 3.36 KiB
from datetime import datetime
import re
import sys
from typing import Dict, Optional, Tuple, Union
import pandas as pd
from predict_ats import Prediction, make_prediction, LINE_COLUMN
# Month-name lookup tables. Index 0 holds an empty placeholder so that the
# list index of a name is its calendar month number (1-12).
L_MONTHS = [
    "",
    *"JANUARY FEBRUARY MARCH APRIL MAY JUNE".split(),
    *"JULY AUGUST SEPTEMBER OCTOBER NOVEMBER DECEMBER".split(),
]
# Three-letter abbreviations: the first three characters of each full name.
S_MONTHS = [full_name[:3] for full_name in L_MONTHS]
"""All dataframes follow this column layout:
|----------------------------------|
| filename | line_number | content |
|----------------------------------|
"""
def convert_month(messy_month: str) -> Tuple[int, float]:
    """Convert a possibly misspelled month name into a month number.

    Three-character inputs are treated as abbreviations and compared against
    ``S_MONTHS``; anything else is compared against ``L_MONTHS``. The special
    abbreviation ``"sept"`` is accepted as September.

    Args:
        messy_month: Month name or abbreviation, in any letter case.

    Returns:
        A ``(month, score)`` tuple. ``month`` is 1-12, or 0 when nothing
        matched. ``score`` is 1.0 for an exact match, the difflib similarity
        ratio for a fuzzy match, and 0.0 when nothing matched.
    """
    import difflib

    messy_month = messy_month.lower()
    if messy_month == "sept":
        return S_MONTHS.index("SEP"), 1.0

    if len(messy_month) == 3:
        m_list = S_MONTHS
        m_char = "%b"
    else:
        m_list = L_MONTHS
        m_char = "%B"

    # Exact (case-insensitive) match against the locale's month names.
    try:
        return datetime.strptime(messy_month, m_char).month, 1.0
    except ValueError:
        pass

    # difflib compares case-sensitively and the month tables are upper-case,
    # so the fuzzy comparison must use an upper-cased candidate.
    candidate = messy_month.upper()
    # Skip the empty placeholder at index 0 so it can never be "matched".
    match = difflib.get_close_matches(candidate, m_list[1:], n=1)
    if not match:
        return 0, 0.0
    # Score against the matched string itself, not the one-element list.
    score = difflib.SequenceMatcher(None, candidate, match[0]).ratio()
    return m_list.index(match[0]), score
def convert_year(year: Union[int, str]) -> int:
    """Return ``year`` as an integer, expanding short years into the 1900s.

    Values below 100 have 1900 added to them; all other values are returned
    unchanged after conversion with ``int``.
    """
    numeric_year = int(year)
    return numeric_year + 1900 if numeric_year < 100 else numeric_year
def numeral_to_int(sat_num: str) -> int:
    """Map a satellite Roman numeral to its integer, ignoring letter case.

    Only "I" (1) and "III" (3) are recognised; every other input, including
    "II", yields 0.
    """
    recognised = {"I": 1, "III": 3}
    return recognised.get(sat_num.upper(), 0)
def doy_to_date(doy: int, year: int) -> Tuple[int, int]:
    """Convert a day-of-year within ``year`` into ``(day, month)``.

    Args:
        doy: Day of the year, starting at 1.
        year: Four-digit calendar year.

    Returns:
        ``(day_of_month, month)`` for the requested day.

    Raises:
        ValueError: If ``doy`` is not a valid day of ``year`` (for example
            0, or 366 in a non-leap year).
    """
    # Zero-padded fields with a separator keep the two numbers unambiguous.
    dt = datetime.strptime(f"{year:04d}-{doy:03d}", "%Y-%j")
    # strptime accepts day 366 for any year and silently rolls over into the
    # next year; reject that instead of returning a date from the wrong year.
    if dt.year != year:
        raise ValueError(f"day of year {doy} is out of range for year {year}")
    return dt.day, dt.month
def dmy_num_regex(match: re.Match) -> Optional[Prediction]:
    """Build a prediction from a day/month/year match with a numeral satellite.

    Match groups are read as: 1 = satellite Roman numeral, 2 = day,
    3 = month name, 4 = year, 5 = time.

    Returns:
        A ``Prediction``, or ``None`` when the month name cannot be resolved.
        The confidence is 0.75 plus one tenth of the month-match score.
    """
    month_number, month_score = convert_month(match.group(3))
    if month_number == 0:
        # convert_month reports 0 when it could not identify the month.
        return None

    base_confidence = 0.75
    fields = {
        "month": month_number,
        "year": convert_year(match.group(4)),
        "day": int(match.group(2)),
        "satellite": numeral_to_int(match.group(1)),
        "time": match.group(5),
        "confidence": base_confidence + month_score / 10,
    }
    return Prediction(**fields)
def dmy_dig_regex(match: re.Match) -> Optional[Prediction]:
    """Build a prediction from a day/month/year match with a digit satellite.

    Match groups are read as: 1 = satellite number (digits), 2 = day,
    3 = month name, 4 = year, 5 = time.

    Returns:
        A ``Prediction``, or ``None`` when the month name cannot be resolved.
        The confidence is 0.75 plus one tenth of the month-match score.
    """
    month_number, month_score = convert_month(match.group(3))
    if month_number == 0:
        # convert_month reports 0 when it could not identify the month.
        return None

    base_confidence = 0.75
    fields = {
        "month": month_number,
        "year": convert_year(match.group(4)),
        "day": int(match.group(2)),
        "satellite": int(match.group(1)),
        "time": match.group(5),
        "confidence": base_confidence + month_score / 10,
    }
    return Prediction(**fields)
def doy_regex(match: re.Match) -> Optional[Dict]:
    """Build a prediction dict from a day-of-year style match.

    Match groups are read as: 1 = satellite Roman numeral, 2 = day of year,
    3 = text appended to "196" to form the year, 4 = time.

    Returns:
        A dict with keys satellite, time, year, month, day and confidence
        (fixed at 0.80), or ``None`` when the day of year cannot be parsed
        or converted to a date.
    """
    year_suffix = match.group(3)
    full_year = int(f"196{year_suffix}")
    try:
        day_of_month, month_number = doy_to_date(
            int(match.group(2)), full_year
        )
    except ValueError:
        return None
    else:
        return {
            "satellite": numeral_to_int(match.group(1)),
            "time": match.group(4),
            "year": full_year,
            "month": month_number,
            "day": day_of_month,
            "confidence": 0.80,
        }
@make_prediction
def gen_regex(
    df: pd.DataFrame, reg: str, to_pred: str
) -> Optional[Prediction]:
    """Search the dataframe's line text with ``reg`` and convert the match.

    The ``LINE_COLUMN`` values of ``df`` are cast to ``str`` and concatenated
    without a separator; ``reg`` is searched for in that single string.

    Args:
        df: Dataframe containing the ``LINE_COLUMN`` column.
        reg: Regular expression pattern to search for.
        to_pred: Name of a function in this module that is called with the
            resulting match object.

    Returns:
        Whatever the ``to_pred`` function returns, or ``None`` when the
        pattern does not match.
    """
    joined_text = "".join(df.loc[:, LINE_COLUMN].astype(str))
    found = re.search(rf'{reg}', joined_text)
    if found is None:
        return None
    # Look the converter up by name on this module, only after a match.
    converter = getattr(sys.modules[__name__], to_pred)
    return converter(found)