... | ... | @@ -5,8 +5,8 @@ flo_ops="-h ratchet -U ops flo3_ops" |
|
|
```
|
|
|
First, we look at how many jobs succeeded/failed:
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select count(*) from stored_products where computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''';"
|
|
|
satellite='snpp'; psql $flo_user -c "select count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''';"
|
|
|
satellite='snpp'; psql $flo_user -c "select count(*) from stored_products where computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''';"
|
|
|
satellite='snpp'; psql $flo_user -c "select count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''';"
|
|
|
```
|
|
|
```
|
|
|
count
|
... | ... | @@ -21,7 +21,7 @@ count |
|
|
|
|
|
So `20996/109572 = 19.1%` failure rate. We can use the failed jobs table to group them by `exit_code`, which can sometimes be useful:
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select exit_code,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' group by exit_code;"
|
|
|
satellite='snpp'; psql $flo_user -c "select exit_code,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' group by exit_code;"
|
|
|
```
|
|
|
```
|
|
|
exit_code | count
|
... | ... | @@ -36,7 +36,7 @@ satellite='snpp'; psql $flo_user -c "select exit_code,count(*) from failed_jobs |
|
|
```
|
|
|
We already know that the `99` code is due to a collocation bug. Next, let's check out the `-11` exit codes:
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select job, pydt(context->'granule') from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' and exit_code=-11;"
|
|
|
satellite='snpp'; psql $flo_user -c "select job, pydt(context->>'granule') from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' and exit_code=-11;"
|
|
|
```
|
|
|
```
|
|
|
job | pydt
|
... | ... | @@ -74,12 +74,12 @@ Those 19 failures (`exit_code` `-11`) appear to be `viirsmend` failures. That wi |
|
|
|
|
|
That leaves us with the big elephant in the room: the null `exit_code` values. We can group these failures by day. Interestingly, it looks like there are whole days that are failing.
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select date_trunc('days',pydt(context->'granule')) as d,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' and exit_code is null group by d order by d;"
|
|
|
satellite='snpp'; psql $flo_user -c "select date_trunc('days',pydt(context->'granule')) as d,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' and timestamp > '2019-08-22' and exit_code is null group by d order by d;"
|
|
|
satellite='snpp'; psql $flo_user -c "select date_trunc('days',pydt(context->>'granule')) as d,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' and exit_code is null group by d order by d;"
|
|
|
satellite='snpp'; psql $flo_user -c "select date_trunc('days',pydt(context->>'granule')) as d,count(*) from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' and timestamp > '2019-08-22' and exit_code is null group by d order by d;"
|
|
|
```
|
|
|
So I list out a few of the job numbers for the 13th (granule date 2018-03-13) so that we can look at the output.
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select job from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' and exit_code is null and context->'granule' like '%2018, 3, 13,%' limit 5;"
|
|
|
satellite='snpp'; psql $flo_user -c "select job from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' and exit_code is null and context->>'granule' like '%2018, 3, 13,%' limit 5;"
|
|
|
```
|
|
|
```
|
|
|
job
|
... | ... | @@ -103,7 +103,7 @@ ERROR 2019-08-21 03:31:21,189 runner -- Failure on computation flo.sw.fusion_mat |
|
|
```
|
|
|
So, it is saying there is bad VIIRS data. Looking at a more recent null granule:
|
|
|
```
|
|
|
satellite='snpp'; psql $flo_user -c "select job from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->'satellite'='''$satellite''' and context->'version'='''1.0.0dev3''' and exit_code is null and context->'granule' like '%2019, 6, 30,%' limit 5;"
|
|
|
satellite='snpp'; psql $flo_user -c "select job from failed_jobs where head_computation='flo.sw.fusion_matlab:FUSION_MATLAB' and context->>'satellite'='''$satellite''' and context->>'version'='''1.0.0dev3''' and exit_code is null and context->>'granule' like '%2019, 6, 30,%' limit 5;"
|
|
|
```
|
|
|
```
|
|
|
job
|
... | ... | |