Update bb outlier detection algorithm

6acaf5c2 · Coda Phillips · 6b0eb7ca · 6acaf5c2
Commit 6acaf5c2 authored 8 years ago by Coda Phillips
--- a/electronic_checks.py
+++ b/electronic_checks.py
@@ -6,11 +6,16 @@ def find_bb_outliers(frame, parameters, bb):
    if not np.in1d(['{}bottomTemp'.format(bb),'{}apexTemp'.format(bb),'{}topTemp'.format(bb)], frame.columns).all():
        return frame

-    window_length = parameters.get('window_length', 100)
+    window_length = parameters.get('window_length', 50)

-    bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length)
-    bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length)
-    bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length)
+    if bb == 'HBB':
+        bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length)
+        bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length)
+        bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length)
+    else:
+        bbb_outliers = _find_6sigma_outliers(frame['{}bottomTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)
+        bba_outliers = _find_6sigma_outliers(frame['{}apexTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)
+        bbt_outliers = _find_6sigma_outliers(frame['{}topTemp'.format(bb)], window_length, estimation_func=_compute_robust_rate_zscore)

    variable_qcs = pd.DataFrame({
        'qc_{}bottomTemp'.format(bb) : bbb_outliers * 1,
@@ -37,9 +42,9 @@ def calibrationambienttemp_outlier_check(frame, parameters):
    if 'calibrationAmbientTemp' not in frame.columns:
        return frame
    
-    window_length = parameters.get('window_length', 100)
+    window_length = parameters.get('window_length', 50)

-    temp_outliers = _find_6sigma_outliers(frame['calibrationAmbientTemp'], window_length, use_mean=True)
+    temp_outliers = _find_6sigma_outliers(frame['calibrationAmbientTemp'], window_length)
    frame['calibrationambienttemp_outlier_check'] = temp_outliers * 1
    frame = update_variable_qc(frame, pd.DataFrame({'qc_calibrationAmbientTemp':temp_outliers*1}, index=frame.index))
    annotate_all(frame, temp_outliers, 'calibrationAmbientTemp outlier')
@@ -48,23 +53,25 @@ def calibrationambienttemp_outlier_check(frame, parameters):
 class CheckList(BaseCheckList):
    checks = [ hbb_temp_outlier_check , abb_temp_outlier_check, calibrationambienttemp_outlier_check ]

-def _compute_robust_zscore(frame, window_length, use_mean=False):
-    use_mean = False
-    
-    if use_mean:
-        robust_rolling_std = frame.rolling(window=window_length, center=True, min_periods=1).std()
-        return abs((frame - frame.rolling(window=window_length, center=True, min_periods=1).mean()) / robust_rolling_std)
-    else:
-        # Compute a centered rolling MAD over window_length
-        rolling_mad = abs(frame - frame.rolling(window=window_length, center=True, min_periods=1).median()
-                     ).rolling(window=window_length, center=True, min_periods=1).median()
-        # standard deviation is proportional to median absolute deviation I'm told
-        robust_rolling_std = rolling_mad * 1.48
-        return abs((frame - frame.rolling(window=window_length, center=True, min_periods=1).median()) / robust_rolling_std)
+def _compute_robust_zscore(frame, window_length):
+    median_values = frame.rolling(window=window_length, center=True, min_periods=1).median()
+    # Compute the MAD
+    mad = abs(frame - median_values).median()
+    # standard deviation is proportional to median absolute deviation I'm told
+    robust_std = mad * 1.48
+    # compute the Mahalanobis distance from rolling median
+    return abs((frame - median_values) / robust_std)
+
+def _compute_robust_rate_zscore(frame, window_length=None):
+    time_diffs = pd.Series((frame.index.values[1:] - frame.index.values[:-1]).astype(np.int64),
+                           index=frame.index[1:])
+    changes = frame.diff() / time_diffs
+    mad_diff = abs(changes - changes.median()).median() * 1.48
+    return abs(frame.diff() / time_diffs)
    
-def _find_6sigma_outliers(frame, window_length, use_mean=False):
+def _find_6sigma_outliers(frame, window_length, estimation_func=_compute_robust_zscore):
    # Find outliers with deviation greater than 6 sigma
-    outlier_mask = _compute_robust_zscore(frame, window_length, use_mean) > 6
+    outlier_mask = estimation_func(frame, window_length) > 6
    return outlier_mask