From ee92dac87b09e1d93b7a8e76f2d99db77b3c4e35 Mon Sep 17 00:00:00 2001
From: "(no author)" <(no author)@8a9318a1-56ba-4d59-b755-99d26321be01>
Date: Mon, 27 Jul 2009 20:15:46 +0000
Subject: [PATCH] adding multiple threads to create images for multiple
 variables concurrently

git-svn-id: https://svn.ssec.wisc.edu/repos/glance/trunk@43 8a9318a1-56ba-4d59-b755-99d26321be01
---
 pyglance/glance/compare.py | 204 ++++++++++++++++++++++++-------------
 pyglance/glance/report.py  |   6 +-
 2 files changed, 136 insertions(+), 74 deletions(-)

diff --git a/pyglance/glance/compare.py b/pyglance/glance/compare.py
index 08b279b..3f27bab 100644
--- a/pyglance/glance/compare.py
+++ b/pyglance/glance/compare.py
@@ -443,6 +443,49 @@ def _open_and_process_files (args, numFilesExpected):
     
     return files
 
+def _check_pass_or_fail(varRunInfo, variableStats, defaultValues) :
+    """
+    Check whether the variable passed analysis, failed analysis, or
+    did not need to be quantitatively tested
+    """
+    didPass = None
+    
+    # get our tolerance values
+    
+    # get the tolerance for failures in comparison compared to epsilon
+    epsilonTolerance = None
+    if ('epsilon_failure_tolerance' in varRunInfo) :
+        epsilonTolerance = varRunInfo['epsilon_failure_tolerance']
+    else :
+        epsilonTolerance = defaultValues['epsilon_failure_tolerance']
+    # get the tolerance for failures in amount of nonfinite data
+    # found in spatially valid areas
+    nonfiniteTolerance = None
+    if ('nonfinite_data_tolerance'  in varRunInfo) :
+        nonfiniteTolerance = varRunInfo['nonfinite_data_tolerance']
+    else :
+        nonfiniteTolerance = defaultValues['nonfinite_data_tolerance']
+    
+    # test to see if we passed or failed
+    
+    # check for our epsilon tolerance
+    if not (epsilonTolerance is None) :
+        failed_fraction = variableStats['Numerical Comparison Statistics']['diff_outside_epsilon_fraction']
+        didPass = failed_fraction <= epsilonTolerance
+    # check to see if it failed on nonfinite data
+    if not (nonfiniteTolerance is None) :
+        non_finite_pts = variableStats['Finite Data Statistics']['finite_in_only_one_count']
+        non_finite_pts = non_finite_pts + variableStats['Missing Value Statistics']['common_missing_count']
+        non_finite_pts = non_finite_pts + variableStats['NaN Statistics']['common_nan_count']
+        non_finite_fraction = float(non_finite_pts) / float(variableStats['General Statistics']['num_data_points'])
+        passedNonFinite = non_finite_fraction <= nonfiniteTolerance 
+        if (didPass is None) :
+            didPass = passedNonFinite
+        else :
+            didPass = didPass and passedNonFinite
+    
+    return didPass
+
 def main():
     import optparse
     usage = """
@@ -656,7 +699,7 @@ python -m glance
         reportGen(*args)
         
         return
-    
+
     def reportGen(*args) :
         """generate a report comparing two files
         This option creates a report comparing variables in the two given hdf files.
@@ -761,9 +804,11 @@ python -m glance
                                                             runInfo['shouldIncludeImages'], outputPath)
             
         # set some things up to hold info for our reports
-        # this is going to be in the form
-        # [var_name] = {"passEpsilonPercent": percent ok with epsilon, "epsilon": epsilon)
-        variableComparisons = {}
+        
+        # this will hold our variable report information in the form
+        # [var_name] = {"var_stats": dictionary of statistics info, "run_info": information specific to that variable run,
+        #               "data": {"A": data from file A, "B": data from file B}}
+        variableAnalysisInfo = {}
         
         # go through each of the possible variables in our files
         # and make a report section with images for whichever ones we can
@@ -771,13 +816,16 @@ python -m glance
             
             # pull out the information for this variable analysis run
             varRunInfo = finalNames[name].copy()
+            
+            # make some local copies of our name info for display and labeling
             displayName = name
             if (varRunInfo.has_key('display_name')) :
                 displayName = varRunInfo['display_name']
             explanationName = name
             if (varRunInfo.has_key('alternate_name_in_B')) :
                 explanationName = explanationName + " / " + varRunInfo['alternate_name_in_B']
-            print('analyzing: ' + displayName + ' (' + explanationName + ')')
+            explanationName = displayName + '(' + explanationName + ')'
+            print('analyzing: ' + explanationName + ')')
             
             # if B has an alternate variable name, figure that out
             has_alt_B_variable = False
@@ -786,7 +834,7 @@ python -m glance
                 has_alt_B_variable = True
                 b_variable = varRunInfo['alternate_name_in_B']
             
-            # get the data for the variable
+            # get the data for the variable 
             aData = aFile[varRunInfo['variable_name']]
             bData = bFile[b_variable]
             
@@ -795,12 +843,53 @@ python -m glance
                 (aData.shape == longitudeCommon.shape) and
                 (bData.shape == longitudeCommon.shape)) :
                 
-                # if we should be making images, then make them for this variable
-                if (runInfo['shouldIncludeImages']) :
-                    doShortCircuit = ('short_circuit_diffs' in runInfo) and runInfo['short_circuit_diffs']
+                # build a dictionary of information on the variable
+                variableAnalysisInfo[varRunInfo['variable_name']] = {}
+                variableAnalysisInfo[varRunInfo['variable_name']]['data'] = {'A': aData,
+                                                                             'B': bData}
+                variableAnalysisInfo[varRunInfo['variable_name']]['var_stats'] = delta.summarize(aData, bData,
+                                                                                                 varRunInfo['epsilon'],
+                                                                                                 (varRunInfo['missing_value'],
+                                                                                                  varRunInfo['missing_value_alt_in_b']),
+                                                                                                 spaciallyInvalidMaskA, spaciallyInvalidMaskB)
+                # add a little additional info to our variable run info before we squirrel it away
+                varRunInfo['time'] = datetime.datetime.ctime(datetime.datetime.now()) 
+                passedFraction = (1.0 - variableAnalysisInfo[name]['var_stats']
+                                  ['Numerical Comparison Statistics']['diff_outside_epsilon_fraction'])
+                varRunInfo['did_pass'] = _check_pass_or_fail(varRunInfo,
+                                                             variableAnalysisInfo[name]['var_stats'],
+                                                             defaultValues)
+                variableAnalysisInfo[varRunInfo['variable_name']]['run_info'] = varRunInfo
+                variableAnalysisInfo[varRunInfo['variable_name']]['exp_name'] = explanationName
+                
+            # if we can't compare the variable, we should tell the user 
+            else :
+                LOG.warn(explanationName + ' ' + 
+                         'could not be compared. This may be because the data for this variable does not match in shape ' +
+                         'between the two files or the data may not match the shape of the selected longitude and ' +
+                         'latitude variables.')
+        
+        # from this point on, we will be forking to create child processes so we can parallelize our image and
+        # report generation
+        
+        isParent = True 
+        childPids = []
+        
+        # loop to create the images for all our variables
+        if (runInfo['shouldIncludeImages']) :
+            for name in variableAnalysisInfo :
+                # create a child to handle this variable's images
+                pid = os.fork()
+                isParent = not (pid is 0)
+                if (isParent) :
+                    childPids.append(pid)
+                    LOG.debug ("Started child process (pid: " + str(pid) + ") to create reports for variable " + name)
+                else :
                     # create the images comparing that variable
-                    print("\tcreating figures for: " + displayName)
-                    plot.plot_and_save_figure_comparison(aData, bData, varRunInfo, 
+                    print("\tcreating figures for: " + variableAnalysisInfo[name]['exp_name'])
+                    plot.plot_and_save_figure_comparison(variableAnalysisInfo[name]['data']['A'],
+                                                         variableAnalysisInfo[name]['data']['B'],
+                                                         variableAnalysisInfo[name]['run_info'],
                                                          files['file A']['path'],
                                                          files['file B']['path'],
                                                          latitudeA, longitudeA,
@@ -809,69 +898,35 @@ python -m glance
                                                          spaciallyInvalidMaskA,
                                                          spaciallyInvalidMaskB,
                                                          spaciallyInvalidMask,
-                                                         outputPath, True,
-                                                         doShortCircuit)
-                
-                # generate the report for this variable
-                if (runInfo['shouldIncludeReport']) :
-                    # get the current time
-                    runInfo['time'] = datetime.datetime.ctime(datetime.datetime.now())
-                    #get info on the variable
-                    variableStats = delta.summarize(aData, bData, varRunInfo['epsilon'],
-                                                    (varRunInfo['missing_value'], varRunInfo['missing_value_alt_in_b']),
-                                                    spaciallyInvalidMaskA, spaciallyInvalidMaskB)
-                    # hang on to our good % and our epsilon value to describe our comparison
-                    passedFraction = (1.0 - variableStats['Numerical Comparison Statistics']['diff_outside_epsilon_fraction'])
-                    passedPercent = passedFraction * 100.0
-                    variableComparisons[varRunInfo['variable_name']] = {'pass_epsilon_percent': passedPercent,
-                                                                        'variable_run_info': varRunInfo
-                                                                        }
-                    # check to see if the variable passed, failed, or wasn't quantitatively tested
-                    didPass = None
-                    # check to see if it failed on epsilon
-                    epsilonTolerance = None
-                    if ('epsilon_failure_tolerance' in varRunInfo) :
-                        epsilonTolerance = varRunInfo['epsilon_failure_tolerance']
-                    else :
-                        epsilonTolerance = defaultValues['epsilon_failure_tolerance']
-                    if not (epsilonTolerance is None) :
-                        didPass =         passedFraction >= (1.0 - epsilonTolerance)
-                    # check to see if it failed on nonfinite data
-                    nonfiniteTolerance = None
-                    if ('nonfinite_data_tolerance'  in varRunInfo) :
-                        nonfiniteTolerance = varRunInfo['nonfinite_data_tolerance']
-                    else :
-                        nonfiniteTolerance = defaultValues['nonfinite_data_tolerance']
-                    if not (nonfiniteTolerance is None) :
-                        non_finite_pts = variableStats['Finite Data Statistics']['finite_in_only_one_count']
-                        non_finite_pts = non_finite_pts + variableStats['Missing Value Statistics']['common_missing_count']
-                        non_finite_pts = non_finite_pts + variableStats['NaN Statistics']['common_nan_count']
-                        non_finite_fraction = float(non_finite_pts) / float(variableStats['General Statistics']['num_data_points'])
-                        passedNonFinite = non_finite_fraction <= nonfiniteTolerance 
-                        if (didPass is None) :
-                            didPass = passedNonFinite
-                        else :
-                            didPass = didPass and passedNonFinite
-                    varRunInfo['did_pass'] = didPass
-                    
-                    print ('\tgenerating report for: ' + displayName) 
-                    report.generate_and_save_variable_report(files,
-                                                             varRunInfo, runInfo,
-                                                             variableStats,
-                                                             spatialInfo,
-                                                             outputPath, varRunInfo['variable_name'] + ".html") 
-                                                             
-                                                             
-                    
-            # only log a warning if the user themselves picked the faulty variable
-            else :
-                LOG.warn(explanationName + ' ' + 
-                         'could not be compared. This may be because the data for this variable does not match in shape ' +
-                         'between the two files or the data may not match the shape of the selected longitude and ' +
-                         'latitude variables.')
+                                                         outputPath, True)
+                    print("\tfinished creating figures for: " + variableAnalysisInfo[name]['exp_name'])
+                    sys.exit(0) # this child has successfully finished it's tasks
         
+        # reports are fast, so the parent thread will just do this
         # generate our general report pages once we've looked at all the variables
         if (runInfo['shouldIncludeReport']) :
+            
+            # this is going to be in the form
+            # [var_name] = {"passEpsilonPercent": percent ok with epsilon, "epsilon": epsilon)
+            variableComparisons = {}
+            
+            # generate the variable reports
+            for name in variableAnalysisInfo :
+                
+                # hang on to our good % and other info to describe our comparison
+                passedPercent = (1.0 - variableAnalysisInfo[name]['var_stats']
+                                  ['Numerical Comparison Statistics']['diff_outside_epsilon_fraction']) * 100.0
+                variableComparisons[name] = {'pass_epsilon_percent': passedPercent,
+                                             'variable_run_info': variableAnalysisInfo[name]['run_info']
+                                             }
+                
+                print ('\tgenerating report for: ' + variableAnalysisInfo[name]['exp_name']) 
+                report.generate_and_save_variable_report(files,
+                                                         variableAnalysisInfo[name]['run_info'], runInfo,
+                                                         variableAnalysisInfo[name]['var_stats'],
+                                                         spatialInfo,
+                                                         outputPath, name + ".html")
+            
             print ('generating summary report')
             # get the current time
             runInfo['time'] = datetime.datetime.ctime(datetime.datetime.now())
@@ -886,6 +941,13 @@ python -m glance
             print ('generating glossary')
             report.generate_and_save_doc_page(delta.STATISTICS_DOC, outputPath)
         
+        # if we're the parent, wait for any children to catch up
+        if isParent:
+            if len(childPids) > 0 :
+                print ("waiting for completion of report and\/or figure generation...")
+            for pid in childPids:
+                os.waitpid(pid, 0)
+        print("... report and figure generation complete")
         return
     
     """
diff --git a/pyglance/glance/report.py b/pyglance/glance/report.py
index 7bce53c..c136fef 100644
--- a/pyglance/glance/report.py
+++ b/pyglance/glance/report.py
@@ -177,7 +177,6 @@ def generate_and_save_variable_report(files,
     generalRunInfo is a dictionary in the form
         generalRunInfo = {  'machine': currentMachine,
                             'user': currentUser,
-                            'time': currentTime,
                             'latitude': latitudeName,
                             'longitude': longitudeName,
                             'latitude_alt_name_in_b': latitudeNameInB,       # optional, if not defined, B's using the normal latitude
@@ -191,8 +190,9 @@ def generate_and_save_variable_report(files,
                             'epsilon': epsilon,
                             'missing_value': missingDataValue,
                             'display_name': displayName
-                            'did_pass': boolean value or None # optional, boolean means it did or did not pass, None means it was
-                                                              # not qualitatively tested against a set of tolerances
+                            'did_pass': boolean value or None, # optional, boolean means it did or did not pass, None means it was
+                                                               # not qualitatively tested against a set of tolerances
+                            'time': currentTime
                             }
                             
     files is a dictionary in the form
-- 
GitLab