#!/usr/bin/perl
# randCompareMeans.pl
#
# Using the per-topic average precision scores for 2 or more runs in the
# search.results.table, perform a partial randomization test on each
# pair of runs. As implemented this is a one-tailed test.
#
# For background and details see the literature, e.g.:
#
# - Manly, Bryan F.J. 1997. Randomization, Bootstrap and Monte Carlo
#   Methods in Biology, Chapman & Hall.
#
# - Kempthorne, O. and Doerfler, T.E. The Behaviour of Some Significance
#   Tests Under Experimental Randomization. Biometrika, Vol. 56, No. 2
#   (Aug., 1969), pp. 231-238.
#
# - Romano, Joseph P. On the Behavior of Randomization Tests Without a
#   Group Invariance Assumption. Journal of the American Statistical
#   Association, Vol. 85, No. 411 (Sep., 1990), pp. 686-692.
#
# This software was produced by NIST, an agency of the U.S. government,
# and by statute is not subject to copyright in the United States.
# Recipients of this software assume all responsibilities associated with
# its operation, modification, and maintenance.
#
# Version change log ----------------------------------------------------------
# 1.0
#
# 1.1 Put run first in each output line and sort by first run id.
#
# 1.2 Add estimate of error of p-values.
#-------------------------------------------------------------------------------

# Arguments: the first three are required; the run name substrings are
# optional filters applied to the run id or type prefix
$numIterations = $ARGV[0];
$sigLevel      = $ARGV[1];
$resultsFile   = $ARGV[2];
$runSubstring1 = defined $ARGV[3] ? $ARGV[3] : "";
$runSubstring2 = defined $ARGV[4] ? $ARGV[4] : "";
$runSubstring3 = defined $ARGV[5] ? $ARGV[5] : "";
$runSubstring4 = defined $ARGV[6] ? $ARGV[6] : "";

if ($#ARGV < 2) {
    print "Usage: randCompareMeans.pl number-of-iterations significance-level results-file [run-name-substring1] [run-name-substring2] [run-name-substring3] [run-name-substring4]\n";
    exit;
}

@resultsList = ();  # Output lines, one per significant comparison; sorted before printing
%resultsHash = ();  # For each run, a count of the runs it is significantly better than

open(RESULTS, $resultsFile) || die "couldn't open \"$resultsFile\"";
$lineNum = -1;

# If a line from the results table meets the run name criteria passed on
# invocation, save it in the lines array. Each line starts with the run id
# followed by the per-topic scores.
while ($line = <RESULTS>) {
    if ($line =~ /_/
        && $line =~ /$runSubstring1/
        && $line =~ /$runSubstring2/
        && $line =~ /$runSubstring3/
        && $line =~ /$runSubstring4/) {
        chomp $line;
        $lineNum++;
        $lines[$lineNum] = $line;
    }
}
close(RESULTS) || die "Can't close \"$resultsFile\"";

if ($lineNum == -1) {
    print "No matching runs found\n";
    exit;
}
elsif ($lineNum == 0) {
    (undef, $runname, undef) = split /\s+/, $lines[$lineNum];
    print "No comparisons possible - only one matching run found: $runname\n";
    exit;
}

# Invoke the comparison routine for each pair of lines, i.e., runs
for ($m = 0; $m <= $#lines; $m++) {
    for ($n = $m + 1; $n <= $#lines; $n++) {
        calcRandComparison($lines[$m], $lines[$n]);
    }
}

@resultsList = sort @resultsList;
foreach $item (@resultsList) {
    print "$item";
}

# Estimate the standard error of the p-values after Efron and Tibshirani 1998,
# i.e., the binomial standard error sqrt(p * (1 - p) / N) evaluated at
# p = sigLevel with N = numIterations
$pError = $sigLevel * (((1 - $sigLevel) / $sigLevel) / $numIterations)**0.5;

# Print arguments, etc.
printf "\nTarget iterations: $numIterations\n";
printf "significance level: $sigLevel\n";
printf "estimated error of p values: + or - %6.5f\n", $pError;
printf "scores file: $resultsFile\n";
printf "run substring 1: $runSubstring1\n";
printf "run substring 2: $runSubstring2\n";
printf "run substring 3: $runSubstring3\n";
printf "run substring 4: $runSubstring4\n\n";
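# For illustration of the error estimate printed above (the values are
# hypothetical, not from any particular data set): with sigLevel = 0.05
# and numIterations = 10000,
#     sqrt(0.05 * (1 - 0.05) / 10000) ~= 0.00218
# so p-values near the significance level are good to about +/- 0.002.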
# For each run, print the number of runs it is significantly
# better than according to the current test
printf "Number of runs each run is significantly better than according to current test:\n";
for ($i = $#lines; $i >= 0; $i--) {
    foreach $key (keys %resultsHash) {
        if ($resultsHash{$key} == $i) {
            printf "%3d %s\n", $resultsHash{$key}, $key;
        }
    }
}

#--------------------------------------------------------
# Subroutine to make randomization comparison of two runs
#--------------------------------------------------------
sub calcRandComparison {
    my ($run1, $run2) = @_;

    # Given two runs (whitespace-separated lines of run name followed by
    # per-topic scores), take the observed per-topic average precision
    # scores, calculate the observed mean (mean average precision: MAP)
    # for each run, and then calculate the observed difference in means
    # (mean for run1 - mean for run2).
    #
    # A large number of times (>= 10000):
    #   In effect, interchange the paired scores for a random
    #   subset of topics.
    #   Recalculate the difference in means for the two runs.
    #   Store this difference as part of a distribution under
    #   the null hypothesis (H0).
    #
    # Calculate the fraction of the values in the H0 distribution that
    # is >= (for positive values) or <= (for negative values) the
    # observed difference in means. This fraction estimates p: the
    # probability of getting the observed difference by chance - in
    # other words, the strength of the null hypothesis.

    my $c = 0;
    my @run1 = split /\s+/, $run1;
    my @run2 = split /\s+/, $run2;

    # Drop the leading field produced by the split, then pull off the run names
    shift @run1;
    my $run1name = shift @run1;
    shift @run2;
    my $run2name = shift @run2;
    my $numTopics = $#run1 + 1;

    # Avoid comparison of a run to itself
    if ($run1name eq $run2name) {
        return 0;
    }

    # Calculate average precision differences (run1 - run2) for the observed data
    my @diffAvgp = ();
    for ($i = 0; $i <= $numTopics - 1; $i++) {
        $diffAvgp[$i] = $run1[$i] - $run2[$i];
    }

    # Calculate the difference in means for the observed data
    my $t = 0;
    foreach $val (@diffAvgp) {
        $t += $val;
    }
    my $observedDiffMeans = $t / $numTopics;

    # Calculate a randomized difference in means numIterations times.
    # (NOTE: you could just as well look at the difference in summed average
    # precision to save computation, but that number is less familiar.)
    my %randHash = ();
    my $numRandomizations = 0;
    for ($j = 1; $j <= $numIterations; $j++) {
        $t = 0;
        my $randomizationString = "";
        foreach $val (@diffAvgp) {
            if (rand >= .5) {
                $t += $val;
                $randomizationString .= "1";
            }
            else {
                $t -= $val;
                $randomizationString .= "0";
            }
        }
        # If this randomization is as yet unseen, then use it in the
        # distribution under the null hypothesis (H0)
        if (!exists $randHash{$randomizationString}) {
            my $generatedDiffMeans = $t / $numTopics;
            $randHash{$randomizationString} = 1;
            if ($observedDiffMeans >= 0 && $generatedDiffMeans >= $observedDiffMeans) {
                # Count an instance of the H0 distribution containing a difference
                # in means equal to or more extreme than the observed difference
                $c++;
            }
            elsif ($observedDiffMeans < 0 && $generatedDiffMeans <= $observedDiffMeans) {
                $c++;
            }
            $numRandomizations++;
        }
    }

    # Calculate the frequency (probability) with which the observed difference
    # in means, or one more extreme, occurs in the constructed distribution
    # under the null hypothesis
    my $p = $c / $numRandomizations;

    # Initialize the hash entries counting the runs "inferior" to each run
    if (!exists $resultsHash{$run1name}) {
        $resultsHash{$run1name} = 0;
    }
    if (!exists $resultsHash{$run2name}) {
        $resultsHash{$run2name} = 0;
    }

    if ($p <= $sigLevel) {
        if ($observedDiffMeans > 0) {    # run 1 better than run 2
            my $result = " > ";
            $resultsHash{$run1name}++;
            printResultLine($p, $c, $numRandomizations, $observedDiffMeans,
                            $run1name, $result, $run2name);
        }
        else {                           # run 2 better than run 1
            my $result = " > ";
            $resultsHash{$run2name}++;
            printResultLine($p, $c, $numRandomizations, -$observedDiffMeans,
                            $run2name, $result, $run1name);
        }
    }
    # If the difference is not significant at sigLevel, no result line is printed
    return $p;
}
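# Worked example of one randomization in calcRandComparison above (the
# scores are hypothetical, for illustration only). Suppose three topics
# with per-topic average precision
#     run1: 0.40 0.20 0.60    run2: 0.30 0.30 0.50
# so @diffAvgp = (0.10, -0.10, 0.10) and the observed difference in means
# is 0.10 / 3 ~= 0.033. The randomization with sign pattern "101" keeps
# the pairs for topics 1 and 3 and swaps the pair for topic 2, giving
#     t = 0.10 + 0.10 + 0.10 = 0.30, generated mean = 0.10,
# which is >= 0.033 and therefore increments the counter $c.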
#-------------------------------------
# Subroutine to print a result line
#-------------------------------------
# Format the test result for the current run pair:
#   name of run 1 (the better run),
#   comparison result (>),
#   name of run 2,
#   probability the observed difference is due to chance,
#   count of generated differences equal to or more extreme than observed,
#   total number of differences generated under the null hypothesis,
#   observed difference in the means
sub printResultLine {
    my ($p, $c, $numRandomizations, $observedDiff, $run1Name, $result, $run2Name) = @_;

    my $line = sprintf " %-35s %s %-35s %6.3f %6d %6d %6.3f\n",
        $run1Name, $result, $run2Name, $p, $c, $numRandomizations, $observedDiff;
    push @resultsList, $line;
}
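# For reference, a result line produced by the sprintf above looks like the
# following (run names and numbers are hypothetical, shown only to document
# the column layout; each name is padded to 35 characters):
#
#  exampleRunA_auto        >  exampleRunB_manual       0.012    118   9831  0.043
#
# Columns: better run, ">", worse run, p-value, count of equal-or-more-extreme
# randomized differences, number of distinct randomizations generated, and the
# observed difference in means.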