#! /usr/bin/perl -w
use strict;
use Statistics::Distributions qw(uprob udistr);
use Carp qw(confess);
print <) {
chomp($line);
my @row = split /\s+/, $line;
if (2 != @row) {
print "WARNING: wrong number of columns at line $.\n";
next;
}
if ($row[0] eq 'A' or $row[0] eq 'a') {
push @a, $row[1];
}
elsif ($row[0] eq 'B' or $row[0] eq 'b') {
push @b, $row[1];
}
else {
print "WARNING: one line $. '$row[0]' should have been A or B\n";
}
}
if (not @a) {
die "No data found for version A\n";
}
elsif (not @b) {
die "No data found for version B\n";
}
elsif (@a + @b < 3) {
die "Insufficient data to calculate (need at least 3 data points)\n";
}
my $var = variance(@a, @b);
my @c = sort {$a <=> $b} @a, @b;
pop @c;
my $var_c = variance(@c);
if (0 == $var_c) {
die "Insufficient variation (need at least 3 different values)\n";
}
# Sanity check
my $w = $var/$var_c - 1;
my $a_weight = $w * (@a + @b)/@a;
my $b_weight = $w * (@a + @b)/@b;
if ($a_weight >= 0.1) {
print "WARNING: The normal approximation may not fit A yet\n";
printf " (a_weight = %.2f > 0.1)\n", $a_weight;
}
if ($b_weight >= 0.1) {
print "WARNING: The normal approximation may not fit B yet\n";
printf " (b_weight = %.2f > 0.1)\n", $b_weight;
}
my $m_a = mean(@a);
printf "Mean of A: %.3f\n", $m_a;
my $m_b = mean(@b);
printf "Mean of B: %.3f\n", $m_b;
my $diff = $m_a - $m_b;
printf "A is %.3f better than B\n", $diff;
my $m_var = $var * (1/@a + 1/@b);
my $p = 2 * uprob( abs($diff) / sqrt($m_var) );
printf "p = %.3f\n", $p;
for my $c (qw(80 90 95 99)) {
my $p = 1 - $c/100;
# We're calculating the upper bound of a $c% confidence interval for
# a normal distribution with mean 0 and variance $m_var.
#
# The 2 is because it is 2-tailed. (Half the left out stuff is in
# one tail, half in the other.)
#
my $b = udistr( $p/2 ) * sqrt($m_var);
printf "$c%% Confidence for A - B: (%.3f, %.3f)\n", $diff - $b, $diff + $b;
}
# Calculates the arithmetic average of a list of numbers.
sub mean {
if (not @_) {
confess("No data found, cannot take mean");
}
return sum(@_)/@_;
}
# Calculates the sum of a list of numbers.
sub sum {
my $s = shift;
$s += $_ for @_;
return $s;
}
# Estimates the variance of a list of numbers.
sub variance {
if (@_ < 2) {
confess("Not enough data to calculate a variance");
}
my $m = mean(@_);
return sum( map {($_ - $m)**2} @_ ) / $#_;
}