# passes.py
from datetime import datetime
import re
import sys
from typing import Dict, Optional, Tuple, Union

import pandas as pd

from predict_ats import Prediction, make_prediction, LINE_COLUMN

# Full month names in upper case. Index 0 holds an empty placeholder so that
# a name's position in the list equals its calendar month number (1-12).
L_MONTHS = [
    "",
    "JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE",
    "JULY", "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBER", "DECEMBER",
]
# Three-letter abbreviations, index-aligned with L_MONTHS (placeholder kept).
S_MONTHS = [full_name[:3] for full_name in L_MONTHS]

"""all dataframes will follow the following configuration:

|----------------------------------|
| filename | line_number | content |
|----------------------------------|
"""


def convert_month(messy_month: str) -> Tuple[int, float]:
    """Convert a possibly misspelled month name into a month number.

    Three-character inputs are treated as abbreviations (``%b`` / S_MONTHS);
    everything else is treated as a full month name (``%B`` / L_MONTHS).
    The special spelling "sept" is accepted as September.

    Args:
        messy_month: Month text in any letter case.

    Returns:
        ``(month, score)`` where ``month`` is 1-12 and ``score`` is 1.0 for
        an exact match or the difflib similarity ratio (0-1) of the closest
        fuzzy match. ``(0, 0.0)`` is returned when nothing is close enough.
    """
    import difflib

    # The candidate lists are upper case and difflib compares characters
    # case-sensitively, so the input is normalised to upper case as well.
    token = messy_month.upper()
    if token == "SEPT":
        return S_MONTHS.index("SEP"), 1.0

    is_abbreviation = len(token) == 3
    try:
        # strptime matches month names case-insensitively (names come from
        # the active locale).
        exact = datetime.strptime(token, "%b" if is_abbreviation else "%B")
    except ValueError:
        pass
    else:
        return exact.month, 1.0

    names = S_MONTHS if is_abbreviation else L_MONTHS
    # Skip the "" placeholder at index 0 so it can never be picked as a match.
    candidates = [name for name in names if name]
    closest = difflib.get_close_matches(token, candidates, n=1)
    if not closest:
        return 0, 0.0
    best = closest[0]
    # Score against the matched string itself, not the one-element list.
    score = difflib.SequenceMatcher(None, token, best).ratio()
    # List position equals the month number thanks to the index-0 placeholder.
    return names.index(best), score


def convert_year(year: Union[int, str]) -> int:
    """Return ``year`` as an int, adding 1900 to any value below 100.

    Args:
        year: Year as an int or as text accepted by ``int()``.

    Returns:
        The year unchanged when it is 100 or more, otherwise ``year + 1900``.
    """
    numeric_year = int(year)
    return numeric_year + 1900 if numeric_year < 100 else numeric_year


def numeral_to_int(sat_num: str) -> int:
    """Map a satellite Roman numeral to its integer value.

    Only "I" and "III" are recognised (case-insensitively); every other
    input yields 0.
    """
    recognised_numerals = {"I": 1, "III": 3}
    return recognised_numerals.get(sat_num.upper(), 0)


def doy_to_date(doy: int, year: int) -> Tuple[int, int]:
    """Convert a day-of-year number into a ``(day, month)`` pair.

    Args:
        doy: Day of the year, 1-based (1 is 1 January).
        year: Calendar year the day belongs to.

    Returns:
        ``(day_of_month, month)`` for the requested day.

    Raises:
        ValueError: If ``doy`` is below 1, exceeds the number of days in
            ``year``, or ``year`` is outside the range datetime supports.
    """
    if not 1 <= doy <= 366:
        raise ValueError(f"day of year {doy} is out of range")
    # Ordinal arithmetic is used instead of strptime("%j%Y"): strptime does
    # not check %j against the length of the year, so day 366 of a non-leap
    # year silently becomes 1 January of the following year.
    first_ordinal = datetime(year, 1, 1).toordinal()
    resolved = datetime.fromordinal(first_ordinal + doy - 1)
    if resolved.year != year:
        raise ValueError(f"day of year {doy} does not exist in {year}")
    return resolved.day, resolved.month


def dmy_num_regex(match: re.Match) -> Optional[Prediction]:
    """Build a Prediction from a match whose satellite is a Roman numeral.

    Groups read from ``match``: 1 = satellite numeral, 2 = day, 3 = month
    name, 4 = year, 5 = time (passed through unchanged).

    Returns:
        A Prediction whose confidence is 0.75 plus one tenth of the month
        match score, or None when the month cannot be resolved.
    """
    month_number, month_score = convert_month(match.group(3))
    if month_number == 0:
        return None

    confidence = 0.75 + month_score / 10
    year = convert_year(match.group(4))
    day = int(match.group(2))
    satellite = numeral_to_int(match.group(1))
    return Prediction(
        month=month_number,
        year=year,
        day=day,
        satellite=satellite,
        time=match.group(5),
        confidence=confidence,
    )


def dmy_dig_regex(match: re.Match) -> Optional[Prediction]:
    """Build a Prediction from a match whose satellite is written in digits.

    Groups read from ``match``: 1 = satellite number, 2 = day, 3 = month
    name, 4 = year, 5 = time (passed through unchanged).

    Returns:
        A Prediction whose confidence is 0.75 plus one tenth of the month
        match score, or None when the month cannot be resolved.
    """
    month_number, month_score = convert_month(match.group(3))
    if month_number == 0:
        return None

    confidence = 0.75 + month_score / 10
    year = convert_year(match.group(4))
    day = int(match.group(2))
    satellite = int(match.group(1))
    return Prediction(
        month=month_number,
        year=year,
        day=day,
        satellite=satellite,
        time=match.group(5),
        confidence=confidence,
    )


def doy_regex(match: re.Match) -> Optional[Dict]:
    """Build a prediction dict from a day-of-year style match.

    Groups read from ``match``: 1 = satellite numeral, 2 = day of year,
    3 = text appended to "196" to form the year, 4 = time (passed through
    unchanged).

    Returns:
        A dict with keys satellite, time, year, month, day and confidence
        (fixed at 0.80), or None when the day of year cannot be converted
        (a ValueError from parsing group 2 or from ``doy_to_date``).
    """
    full_year = int(f"196{match.group(3)}")
    try:
        day_of_month, month_number = doy_to_date(
            int(match.group(2)), full_year
        )
    except ValueError:
        return None

    prediction = {
        "satellite": numeral_to_int(match.group(1)),
        "time": match.group(4),
    }
    prediction["year"] = full_year
    prediction["month"] = month_number
    prediction["day"] = day_of_month
    prediction["confidence"] = 0.80
    return prediction


@make_prediction
def gen_regex(
    df: pd.DataFrame, reg: str, to_pred: str
) -> Optional[Prediction]:
    """Search the dataframe's line text with ``reg`` and convert the match.

    The values of column ``LINE_COLUMN`` are cast to ``str`` and joined with
    no separator; the pattern is searched in that single string.

    Args:
        df: Dataframe containing a ``LINE_COLUMN`` column.
        reg: Regular expression pattern to search for.
        to_pred: Name of a module-level callable in this module that turns
            the match object into a prediction.

    Returns:
        Whatever the ``to_pred`` callable returns for the first match, or
        None when the pattern does not match. The function is wrapped by
        ``make_prediction``, whose effect is not visible here.
    """
    line_values = df.loc[:, LINE_COLUMN].astype(str)
    found = re.search(rf'{reg}', ''.join(line_values))
    if found is None:
        return None
    # Resolve the converter by name from this module at call time.
    converter = getattr(sys.modules[__name__], to_pred)
    return converter(found)