from datetime import datetime
import difflib
import re
import sys
from typing import Callable, Dict, Optional, Tuple, Union

import pandas as pd

from predict_ats import Prediction, make_prediction, LINE_COLUMN

# Month-name lookup tables. Index 0 is an empty placeholder so that a name's
# list index is also its calendar month number (1-12).
L_MONTHS = [
    "",
    "JANUARY",
    "FEBRUARY",
    "MARCH",
    "APRIL",
    "MAY",
    "JUNE",
    "JULY",
    "AUGUST",
    "SEPTEMBER",
    "OCTOBER",
    "NOVEMBER",
    "DECEMBER",
]
S_MONTHS = [
    "",
    "JAN",
    "FEB",
    "MAR",
    "APR",
    "MAY",
    "JUN",
    "JUL",
    "AUG",
    "SEP",
    "OCT",
    "NOV",
    "DEC",
]

"""all dataframes will follow the following configuration:
|----------------------------------|
| filename | line_number | content |
|----------------------------------|
"""

# Confidence assigned to a day-month-year match before the month-name
# similarity bonus (at most +0.1) is added.
_DMY_BASE_CONFIDENCE = 0.75


def convert_month(messy_month: str) -> Tuple[int, float]:
    """Map a possibly misspelled month name to its number and a match score.

    Three-letter inputs are compared against the abbreviated names in
    ``S_MONTHS``; every other length is compared against the full names in
    ``L_MONTHS``. The special spelling ``"sept"`` is accepted as September.
    Comparison is case-insensitive.

    Args:
        messy_month: Month text as extracted from the input.

    Returns:
        ``(month, score)`` where ``month`` is 1-12 and ``score`` is 1.0 for
        an exact match or the ``difflib`` similarity ratio of the closest
        name otherwise. ``(0, 0.0)`` is returned when nothing is close enough.
    """
    # The lookup tables are upper-case, and difflib compares characters
    # case-sensitively, so normalise to upper-case before any comparison.
    month_name = messy_month.upper()
    if not month_name:
        # Guard: the empty string would otherwise hit the index-0 placeholder.
        return 0, 0.0
    if month_name == "SEPT":
        return S_MONTHS.index("SEP"), 1.0

    candidates = S_MONTHS if len(month_name) == 3 else L_MONTHS
    if month_name in candidates:
        return candidates.index(month_name), 1.0

    # Skip the placeholder when fuzzy matching; only real names are eligible.
    close = difflib.get_close_matches(month_name, candidates[1:], n=1)
    if not close:
        return 0, 0.0
    best = close[0]
    score = difflib.SequenceMatcher(None, month_name, best).ratio()
    return candidates.index(best), score


def convert_year(year: Union[int, str]) -> int:
    """Return ``year`` as an int, adding 1900 to values below 100."""
    year = int(year)
    if year < 100:
        year += 1900
    return year


def numeral_to_int(sat_num: str) -> int:
    """Convert the Roman numerals ``I`` / ``III`` (any case) to 1 / 3.

    Any other input yields 0.
    """
    sat_num = sat_num.upper()
    if sat_num == "I":
        return 1
    if sat_num == "III":
        return 3
    return 0


def doy_to_date(doy: int, year: int) -> Tuple[int, int]:
    """Convert a day-of-year within ``year`` to ``(day, month)``.

    Raises:
        ValueError: If ``doy`` is not a valid day of ``year`` (including day
            366 of a non-leap year) or the pair cannot be parsed.
    """
    # A separator keeps the day-of-year and year fields unambiguous instead
    # of relying on the parser to split a run of digits correctly.
    dt = datetime.strptime(f"{doy} {year}", "%j %Y")
    # strptime trusts the day-of-year and will roll day 366 of a non-leap
    # year into 1 January of the next year; treat that as invalid input.
    if dt.year != int(year):
        raise ValueError(f"day of year {doy} is out of range for year {year}")
    return dt.day, dt.month


def _dmy_prediction(
    match: re.Match, to_satellite: Callable[[str], int]
) -> Optional[Prediction]:
    """Build a Prediction from a satellite/day/month/year/time regex match.

    Groups are read as: 1 = satellite, 2 = day, 3 = month name, 4 = year,
    5 = time. ``to_satellite`` converts group 1 and is only invoked once the
    month has been recognised, so an unrecognised month returns ``None``
    without attempting the satellite conversion.
    """
    pred_month, month_score = convert_month(match.group(3))
    if pred_month == 0:
        return None
    return Prediction(
        month=pred_month,
        year=convert_year(match.group(4)),
        day=int(match.group(2)),
        satellite=to_satellite(match.group(1)),
        time=match.group(5),
        confidence=_DMY_BASE_CONFIDENCE + month_score / 10,
    )


def dmy_num_regex(match: re.Match) -> Optional[Prediction]:
    """Handle a day-month-year match whose satellite is a Roman numeral.

    Returns ``None`` when the month text cannot be resolved.
    """
    return _dmy_prediction(match, numeral_to_int)


def dmy_dig_regex(match: re.Match) -> Optional[Prediction]:
    """Handle a day-month-year match whose satellite is written in digits.

    Returns ``None`` when the month text cannot be resolved.
    """
    return _dmy_prediction(match, int)


def doy_regex(match: re.Match) -> Optional[Dict]:
    """Handle a day-of-year match (groups: satellite, doy, year digit, time).

    Group 3 is appended to ``"196"`` to form the year, so it is expected to
    be the final digit of a year in the 1960s.

    Returns:
        A dict with ``satellite``, ``time``, ``year``, ``month``, ``day`` and
        a fixed ``confidence`` of 0.80, or ``None`` when the day-of-year is
        not valid for the year.
    """
    # NOTE(review): this returns a plain dict while the dmy_* handlers return
    # Prediction; confirm that callers accept both.
    year = int(f"196{match.group(3)}")
    try:
        day, month = doy_to_date(int(match.group(2)), year)
    except ValueError:
        return None
    return {
        "satellite": numeral_to_int(match.group(1)),
        "time": match.group(4),
        "year": year,
        "month": month,
        "day": day,
        "confidence": 0.80,
    }


@make_prediction
def gen_regex(
    df: pd.DataFrame, reg: str, to_pred: str
) -> Optional[Prediction]:
    """Search the concatenated line text for ``reg`` and convert the match.

    Args:
        df: Frame whose ``LINE_COLUMN`` column holds the text to search.
        reg: Regular expression to search for.
        to_pred: Name of a function in this module that turns the resulting
            ``re.Match`` into a prediction (e.g. ``"dmy_num_regex"``).

    Returns:
        The handler's result, or ``None`` when the pattern does not match.

    Raises:
        AttributeError: If this module has no attribute named ``to_pred``.
    """
    # All rows are joined with no separator, so a pattern may match text
    # that spans adjacent rows.
    line_str = "".join(df.loc[:, LINE_COLUMN].astype(str))
    match = re.search(reg, line_str)
    if not match:
        return None
    # Handlers are looked up by name on this module.
    handler = getattr(sys.modules[__name__], to_pred)
    return handler(match)