"""Print classification reports comparing QC output with listed problem records.

For every ``AE*/problems.csv`` found relative to the current working
directory, the first ``*.qc`` netCDF file in the same directory is read and a
per-directory report is printed, followed by one report over all directories.
"""
import os
from glob import glob

import netCDF4
import numpy as np
import pandas as pd
import sklearn.metrics

# A record is predicted as 'Fail' when its ``qc_percent`` exceeds this value.
QC_FAIL_THRESHOLD = .95

# Display names for the sorted boolean labels: False -> 'Pass', True -> 'Fail'.
TARGET_NAMES = ['Pass', 'Fail']


def main():
    """Print a classification report per ``AE*`` directory and an overall one.

    Ground truth marks a record as failing when its positional index appears
    in the ``record_cxs`` column of ``problems.csv``; the prediction is
    ``qc_percent > QC_FAIL_THRESHOLD``. Directories that contain no ``*.qc``
    file are skipped.

    Raises:
        ValueError: If no directory provided both a ``problems.csv`` and a
            ``*.qc`` file, so there is nothing to build the overall report
            from.
    """
    truths = []
    qc_percents = []

    for problem_file in glob('AE*/problems.csv'):
        directory = os.path.dirname(problem_file)

        # Only the first match (in glob order) is used, as before.
        qc_files = glob(os.path.join(directory, '*.qc'))
        if not qc_files:
            continue

        # Slice the variable into memory inside the ``with`` block so the
        # netCDF file handle is released before moving to the next directory.
        with netCDF4.Dataset(qc_files[0]) as qc:
            qc_percent = qc.variables['qc_percent'][:]

        problem = pd.read_csv(problem_file, comment='#')

        # ``np.isin`` replaces the deprecated ``np.in1d``; for 1-D input the
        # result is the same boolean mask over record indices.
        truth = np.isin(np.arange(len(qc_percent)), problem.record_cxs)

        report = sklearn.metrics.classification_report(
            truth,
            qc_percent > QC_FAIL_THRESHOLD,
            target_names=TARGET_NAMES,
        )
        print('-'*50)
        print(directory)
        print(report)
        print('-'*50)

        truths.append(truth)
        qc_percents.append(qc_percent)

    if not truths:
        # ``np.concatenate`` would raise an opaque ValueError on an empty
        # list; keep the exception type but say what actually went wrong.
        raise ValueError(
            "no 'AE*/problems.csv' with a '*.qc' file alongside it was found"
        )

    overall_report = sklearn.metrics.classification_report(
        np.concatenate(truths),
        np.concatenate(qc_percents) > QC_FAIL_THRESHOLD,
        target_names=TARGET_NAMES,
    )
    print('='*50)
    print(overall_report)
    print('='*50)


if __name__ == '__main__':
    main()