// Calculate Statistical Significance
//
// Calculates statistical significance for A/B test results, including a
// z-test for proportions, a t-test for continuous metrics, confidence
// intervals, and sample-size adequacy checks.
import fs from "fs";
import path from "path";

// CLI arguments:
//   inputPath       - JSON file containing an array of row objects
//   outputPath      - where the analysis JSON is written
//   variantColumn   - column holding the A/B variant label
//   metricColumn    - column holding the metric value
//   metricType      - "binary" (default) for 0/1 metrics; anything else is
//                     treated as continuous (see the z-test/t-test dispatch)
//   confidenceLevel - confidence level as a string, default "0.95"
const [inputPath, outputPath, variantColumn, metricColumn, metricType = "binary", confidenceLevel = "0.95"] =
  process.argv.slice(2);
// The first four arguments are required.
if (!inputPath || !outputPath || !variantColumn || !metricColumn) {
  console.error("Usage: inputPath outputPath variantColumn metricColumn [metricType] [confidenceLevel]");
  process.exit(1);
}
// Statistical helper functions
// Arithmetic mean of a numeric array; returns 0 for an empty array.
function mean(arr) {
  if (arr.length === 0) return 0;
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}
// Sample (Bessel-corrected) variance; returns 0 when there are fewer than
// two observations.
function variance(arr) {
  const n = arr.length;
  if (n < 2) return 0;
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  const avg = total / n;
  let sumSquares = 0;
  for (const value of arr) {
    sumSquares += (value - avg) ** 2;
  }
  return sumSquares / (n - 1);
}
// Sample standard deviation: the square root of the Bessel-corrected variance.
function standardDeviation(arr) {
  const sampleVariance = variance(arr);
  return Math.sqrt(sampleVariance);
}
// Standard normal CDF via the Abramowitz & Stegun erf approximation
// (formula 7.1.26, absolute error below ~1.5e-7).
function normalCDF(x) {
  const sign = x < 0 ? -1 : 1;
  const z = Math.abs(x) / Math.sqrt(2);
  const t = 1.0 / (1.0 + 0.3275911 * z);
  // Horner evaluation of the degree-5 polynomial in t.
  const poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t - 0.284496736) * t + 0.254829592) * t;
  const erfApprox = 1.0 - poly * Math.exp(-z * z);
  return 0.5 * (1.0 + sign * erfApprox);
}
// Inverse standard normal CDF (quantile function), rational approximation in
// three regions (lower tail, central, upper tail). Accurate to ~1e-9.
function normalInvCDF(p) {
  if (p <= 0) return -Infinity;
  if (p >= 1) return Infinity;
  const A = [
    -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2, 1.383577518672690e2, -3.066479806614716e1,
    2.506628277459239e0,
  ];
  const B = [
    -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2, 6.680131188771972e1, -1.328068155288572e1,
  ];
  const C = [
    -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838e0, -2.549732539343734e0, 4.374664141464968e0,
    2.938163982698783e0,
  ];
  const D = [7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996e0, 3.754408661907416e0];
  // Horner's scheme; same operation order as the explicit nested form.
  const poly = (coeffs, x) => coeffs.reduce((acc, c) => acc * x + c, 0);
  const pLow = 0.02425;
  if (p < pLow) {
    // Lower tail.
    const q = Math.sqrt(-2 * Math.log(p));
    return poly(C, q) / (poly(D, q) * q + 1);
  }
  if (p > 1 - pLow) {
    // Upper tail: mirror image of the lower tail.
    const q = Math.sqrt(-2 * Math.log(1 - p));
    return -poly(C, q) / (poly(D, q) * q + 1);
  }
  // Central region.
  const q = p - 0.5;
  const r = q * q;
  return (poly(A, r) * q) / (poly(B, r) * r + 1);
}
// Two-sample z-test for proportions (binary metrics such as conversion).
// Tests control vs variant rates using a pooled standard error under H0, and
// builds a confidence interval for the rate difference (variant - control)
// using the unpooled standard error.
function zTestProportions(control, variant, confidence) {
  const nControl = control.length;
  const nVariant = variant.length;
  const rateControl = mean(control);
  const rateVariant = mean(variant);
  // Pooled success rate under the null hypothesis of equal rates.
  const pooled = (rateControl * nControl + rateVariant * nVariant) / (nControl + nVariant);
  const pooledSE = Math.sqrt(pooled * (1 - pooled) * (1 / nControl + 1 / nVariant));
  if (pooledSE === 0) {
    // Degenerate data (all zeros or all ones): no measurable effect.
    return {
      zScore: 0,
      pValue: 1,
      significant: false,
      controlRate: rateControl,
      variantRate: rateVariant,
      lift: 0,
      liftPercent: 0,
      ciLower: 0,
      ciUpper: 0,
      standardError: 0,
    };
  }
  const zScore = (rateVariant - rateControl) / pooledSE;
  // Two-sided p-value from the standard normal distribution.
  const pValue = 2 * (1 - normalCDF(Math.abs(zScore)));
  // Unpooled standard error for the CI of the rate difference.
  const zCritical = normalInvCDF(1 - (1 - confidence) / 2);
  const seDiff = Math.sqrt((rateControl * (1 - rateControl)) / nControl + (rateVariant * (1 - rateVariant)) / nVariant);
  const diff = rateVariant - rateControl;
  // Relative lift vs control (0 when the control rate is 0).
  const lift = rateControl > 0 ? (rateVariant - rateControl) / rateControl : 0;
  return {
    zScore: round(zScore, 4),
    pValue: round(pValue, 4),
    significant: pValue < 1 - confidence,
    controlRate: round(rateControl, 4),
    variantRate: round(rateVariant, 4),
    absoluteDiff: round(diff, 4),
    lift: round(lift, 4),
    liftPercent: round(lift * 100, 2),
    ciLower: round(diff - zCritical * seDiff, 4),
    ciUpper: round(diff + zCritical * seDiff, 4),
    standardError: round(seDiff, 4),
  };
}
// Welch's t-test for continuous metrics (revenue, time, etc.).
// Uses unpooled variances, Welch-Satterthwaite degrees of freedom, and a
// normal approximation for the p-value (valid for large samples).
// Returns the t statistic, p-value, significance flag at the given
// confidence level, per-group means/std-devs, lift, and a CI for the
// mean difference (variant - control).
function tTestContinuous(control, variant, confidence) {
  const n1 = control.length;
  const n2 = variant.length;
  const m1 = mean(control);
  const m2 = mean(variant);
  const v1 = variance(control);
  const v2 = variance(variant);
  // Welch's standard error (does not assume equal variances).
  const se = Math.sqrt(v1 / n1 + v2 / n2);
  // Degenerate inputs: zero variance everywhere (se === 0) or an empty group
  // (se is NaN, which the old `se === 0` check missed). Report a null result
  // instead of propagating NaN into every field.
  if (!(se > 0)) {
    return {
      tScore: 0,
      pValue: 1,
      significant: false,
      controlMean: m1,
      variantMean: m2,
      lift: 0,
      liftPercent: 0,
      ciLower: 0,
      ciUpper: 0,
      standardError: 0,
      degreesOfFreedom: 0,
    };
  }
  const tScore = (m2 - m1) / se;
  // Welch-Satterthwaite degrees of freedom. Each term is guarded against
  // n <= 1: the unguarded form divided 0 by 0 when a group had a single
  // observation (its sample variance is 0 but n - 1 is 0), producing
  // degreesOfFreedom: NaN. The limit of a zero-variance term is 0.
  const dfTerm = (v, n) => (n > 1 ? Math.pow(v / n, 2) / (n - 1) : 0);
  const df = Math.pow(v1 / n1 + v2 / n2, 2) / (dfTerm(v1, n1) + dfTerm(v2, n2));
  // Approximate p-value using the normal distribution (valid for large samples).
  const pValue = 2 * (1 - normalCDF(Math.abs(tScore)));
  // Confidence interval for the mean difference.
  const tCritical = normalInvCDF(1 - (1 - confidence) / 2);
  const diff = m2 - m1;
  const ciLower = diff - tCritical * se;
  const ciUpper = diff + tCritical * se;
  // Relative lift vs the control mean (0 when the control mean is 0).
  const lift = m1 !== 0 ? (m2 - m1) / Math.abs(m1) : 0;
  return {
    tScore: round(tScore, 4),
    pValue: round(pValue, 4),
    significant: pValue < 1 - confidence,
    controlMean: round(m1, 4),
    variantMean: round(m2, 4),
    controlStdDev: round(standardDeviation(control), 4),
    variantStdDev: round(standardDeviation(variant), 4),
    absoluteDiff: round(diff, 4),
    lift: round(lift, 4),
    liftPercent: round(lift * 100, 2),
    ciLower: round(ciLower, 4),
    ciUpper: round(ciUpper, 4),
    standardError: round(se, 4),
    degreesOfFreedom: round(df, 1),
  };
}
// Sanity checks on group sizes: flags tiny groups, likely-underpowered
// groups, and badly imbalanced assignment. `baselineRate` is accepted for
// API compatibility but is not currently used.
function checkSampleSize(n1, n2, metricType, baselineRate = 0.1) {
  // Rough minimum per variant for ~80% power to detect a 10% relative lift.
  const recommended = metricType === "binary" ? 3000 : 100;
  const smallest = Math.min(n1, n2);
  const warnings = [];
  if (smallest < 30) {
    warnings.push(`Very small sample sizes (control: ${n1}, variant: ${n2}). Results are unreliable.`);
  } else if (smallest < recommended) {
    warnings.push(
      `Sample sizes (control: ${n1}, variant: ${n2}) may be insufficient to detect small effects. ` +
        `Recommend ${recommended}+ per variant for reliable results.`
    );
  }
  // More than 20% relative difference between group sizes suggests a broken
  // assignment mechanism.
  const imbalance = Math.abs(n1 - n2) / Math.max(n1, n2);
  if (imbalance > 0.2) {
    warnings.push(`Imbalanced sample sizes (${n1} vs ${n2}). Check for assignment issues.`);
  }
  return {
    controlSize: n1,
    variantSize: n2,
    adequate: warnings.length === 0,
    warnings,
  };
}
// Round a number to the given number of decimal places.
function round(num, decimals) {
  const factor = 10 ** decimals;
  return Math.round(num * factor) / factor;
}
try {
  console.log(`Reading experiment data: ${inputPath}...`);
  const rawData = fs.readFileSync(inputPath, "utf-8");
  const data = JSON.parse(rawData);
  if (!Array.isArray(data) || data.length === 0) {
    throw new Error("Input data must be a non-empty array");
  }
  // Check required columns exist (validated against the first row).
  const sampleRow = data[0];
  if (!(variantColumn in sampleRow)) {
    throw new Error(`Variant column '${variantColumn}' not found in data. Available: ${Object.keys(sampleRow).join(", ")}`);
  }
  if (!(metricColumn in sampleRow)) {
    throw new Error(`Metric column '${metricColumn}' not found in data. Available: ${Object.keys(sampleRow).join(", ")}`);
  }
  // Validate the confidence level up front: parseFloat previously went
  // unchecked, so a bad value made every threshold NaN and silently reported
  // all results as not significant.
  const confidence = parseFloat(confidenceLevel);
  if (!Number.isFinite(confidence) || confidence <= 0 || confidence >= 1) {
    throw new Error(`Confidence level must be a number strictly between 0 and 1, got '${confidenceLevel}'`);
  }
  // Identify variants (rows with null/undefined labels are ignored).
  const variants = [...new Set(data.map((row) => row[variantColumn]).filter((v) => v != null))];
  console.log(`Found variants: ${variants.join(", ")}`);
  if (variants.length < 2) {
    throw new Error(`Need at least 2 variants, found: ${variants.length}`);
  }
  // Identify control (usually "control", "A", "0", or first alphabetically).
  const controlLabels = ["control", "ctrl", "a", "0", "baseline"];
  let controlVariant = variants.find((v) => controlLabels.includes(String(v).toLowerCase()));
  if (!controlVariant) {
    // Sort a copy: Array#sort mutates in place and `variants` is reused below.
    controlVariant = [...variants].sort()[0];
  }
  const treatmentVariants = variants.filter((v) => v !== controlVariant);
  // Metric values for one variant; non-numeric values are coerced to 0.
  const valuesFor = (label) =>
    data.filter((row) => row[variantColumn] === label).map((row) => Number(row[metricColumn]) || 0);
  const controlData = valuesFor(controlVariant);
  const results = {
    summary: {
      totalRows: data.length,
      variantColumn,
      metricColumn,
      metricType,
      confidenceLevel: confidence,
      controlVariant,
      treatmentVariants,
    },
    sampleSize: null, // overall adequacy summary, filled in after the loop
    comparisons: [],
  };
  // Compare each treatment to control.
  const sizeChecks = [];
  for (const treatment of treatmentVariants) {
    const treatmentData = valuesFor(treatment);
    // Keep one sample-size check per comparison: previously each iteration
    // overwrote results.sampleSize, so with multiple treatments only the
    // last treatment's warnings survived.
    const sizeCheck = checkSampleSize(controlData.length, treatmentData.length, metricType);
    sizeChecks.push(sizeCheck);
    let testResult;
    if (metricType === "binary") {
      testResult = zTestProportions(controlData, treatmentData, confidence);
      testResult.testType = "z-test (proportions)";
    } else {
      testResult = tTestContinuous(controlData, treatmentData, confidence);
      testResult.testType = "t-test (continuous)";
    }
    testResult.variant = treatment;
    testResult.controlVariant = controlVariant;
    testResult.sampleSize = sizeCheck;
    // Plain-language interpretation and ship/no-ship recommendation.
    if (testResult.significant) {
      if (testResult.liftPercent > 0) {
        testResult.interpretation = `Variant "${treatment}" shows a statistically significant INCREASE of ${testResult.liftPercent}% vs control.`;
        testResult.recommendation = "SHIP: Results are statistically significant and positive.";
      } else {
        testResult.interpretation = `Variant "${treatment}" shows a statistically significant DECREASE of ${Math.abs(testResult.liftPercent)}% vs control.`;
        testResult.recommendation = "DO NOT SHIP: Results are statistically significant but negative.";
      }
    } else {
      testResult.interpretation = `No statistically significant difference detected between "${treatment}" and control (p=${testResult.pValue}).`;
      testResult.recommendation = "INCONCLUSIVE: Consider extending the test or evaluating practical significance.";
    }
    results.comparisons.push(testResult);
  }
  // Aggregate adequacy across all comparisons (conservative: report the
  // smallest treatment group and the union of all warnings).
  results.sampleSize = {
    controlSize: controlData.length,
    variantSize: Math.min(...sizeChecks.map((c) => c.variantSize)),
    adequate: sizeChecks.every((c) => c.adequate),
    warnings: sizeChecks.flatMap((c) => c.warnings),
  };
  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }
  fs.writeFileSync(outputPath, JSON.stringify(results, null, 2));
  console.log(`\n✓ Statistical Analysis Complete`);
  console.log(` Control: "${controlVariant}" (n=${controlData.length})`);
  for (const comp of results.comparisons) {
    console.log(` vs "${comp.variant}": ${comp.significant ? "SIGNIFICANT" : "not significant"} (p=${comp.pValue})`);
    console.log(` Lift: ${comp.liftPercent > 0 ? "+" : ""}${comp.liftPercent}%`);
  }
  if (results.sampleSize.warnings.length > 0) {
    console.log(`\n⚠️ Warnings:`);
    for (const w of results.sampleSize.warnings) {
      console.log(` - ${w}`);
    }
  }
  console.log(`\n Written to: ${outputPath}`);
  // Final machine-readable status line for callers that parse stdout.
  console.log(
    JSON.stringify({
      success: true,
      outputPath,
      significant: results.comparisons.some((c) => c.significant),
      comparisons: results.comparisons.length,
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}