code icon Code

Calculate Statistical Significance

Calculates statistical significance for A/B test results, including a z-test for binary metrics, a t-test for continuous metrics, confidence intervals, and a sample-size adequacy check.

Source Code

import fs from "fs";
import path from "path";

// Positional CLI arguments. The first four are mandatory; metricType and
// confidenceLevel are optional and fall back to "binary" and "0.95".
const cliArgs = process.argv.slice(2);
const [
  inputPath,
  outputPath,
  variantColumn,
  metricColumn,
  metricType = "binary",
  confidenceLevel = "0.95",
] = cliArgs;

// Print the usage line and abort when any mandatory argument is missing or empty.
const requiredArgs = [inputPath, outputPath, variantColumn, metricColumn];
if (requiredArgs.some((arg) => !arg)) {
  console.error("Usage: inputPath outputPath variantColumn metricColumn [metricType] [confidenceLevel]");
  process.exit(1);
}

// Statistical helper functions
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The average, or 0 when the list is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / count;
}

/**
 * Unbiased sample variance (divides by n - 1).
 *
 * @param {number[]} arr - Observations.
 * @returns {number} The sample variance, or 0 when fewer than two values are given.
 */
function variance(arr) {
  const count = arr.length;
  if (count < 2) {
    return 0;
  }

  const center = mean(arr);
  let squaredDeviationSum = 0;
  for (const value of arr) {
    squaredDeviationSum += Math.pow(value - center, 2);
  }
  return squaredDeviationSum / (count - 1);
}

/**
 * Sample standard deviation: the square root of the sample variance.
 *
 * @param {number[]} arr - Observations.
 * @returns {number} Square root of `variance(arr)`.
 */
function standardDeviation(arr) {
  const sampleVariance = variance(arr);
  return Math.sqrt(sampleVariance);
}

/**
 * Standard normal cumulative distribution function.
 *
 * Evaluates 0.5 * (1 + erf(x / sqrt(2))) with a polynomial approximation of
 * erf credited to Abramowitz and Stegun, so the result is approximate rather
 * than exact.
 *
 * @param {number} x - Point at which to evaluate the CDF.
 * @returns {number} Approximate probability that a standard normal variable is <= x.
 */
function normalCDF(x) {
  // Polynomial coefficients, highest order first so they can be folded with Horner's rule.
  const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const p = 0.3275911;

  // The approximation is defined for non-negative arguments; negative inputs use symmetry.
  const direction = x < 0 ? -1 : 1;
  const scaled = Math.abs(x) / Math.sqrt(2);

  const t = 1.0 / (1.0 + p * scaled);
  const polynomial = coefficients.reduce((acc, coefficient) => acc * t + coefficient, 0);
  const erfApprox = 1.0 - polynomial * t * Math.exp(-scaled * scaled);

  return 0.5 * (1.0 + direction * erfApprox);
}

/**
 * Inverse of the standard normal CDF (quantile function), computed with a
 * piecewise rational approximation: one rational function for the central
 * region and another, in sqrt(-2 ln p), for each tail.
 *
 * @param {number} p - Probability.
 * @returns {number} z such that the standard normal CDF at z is approximately p;
 *   -Infinity for p <= 0 and Infinity for p >= 1.
 */
function normalInvCDF(p) {
  if (p <= 0) return -Infinity;
  if (p >= 1) return Infinity;

  // Numerator / denominator coefficients for the central region (highest order first).
  const centralNum = [
    -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2, 1.383577518672690e2, -3.066479806614716e1,
    2.506628277459239e0,
  ];
  const centralDen = [
    -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2, 6.680131188771972e1, -1.328068155288572e1, 1,
  ];
  // Numerator / denominator coefficients for the tails (highest order first).
  const tailNum = [
    -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838e0, -2.549732539343734e0, 4.374664141464968e0,
    2.938163982698783e0,
  ];
  const tailDen = [7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996e0, 3.754408661907416e0, 1];

  // Break-points between the tail regions and the central region.
  const lowerBreak = 0.02425;
  const upperBreak = 1 - lowerBreak;

  // Horner evaluation of a polynomial whose coefficients are listed highest order first.
  const evalPoly = (coefficients, at) => coefficients.reduce((acc, coefficient) => acc * at + coefficient, 0);

  // Quantile for a lower-tail probability; the upper tail is its mirror image.
  const lowerTailQuantile = (tailProbability) => {
    const q = Math.sqrt(-2 * Math.log(tailProbability));
    return evalPoly(tailNum, q) / evalPoly(tailDen, q);
  };

  if (p < lowerBreak) {
    return lowerTailQuantile(p);
  }

  if (p <= upperBreak) {
    const offset = p - 0.5;
    const offsetSquared = offset * offset;
    return (evalPoly(centralNum, offsetSquared) * offset) / evalPoly(centralDen, offsetSquared);
  }

  return -lowerTailQuantile(1 - p);
}

/**
 * Two-sided z-test for the difference between two proportions (binary
 * metrics such as conversion, encoded as 0/1 observations).
 *
 * The test statistic uses the pooled standard error (null hypothesis of equal
 * rates); the confidence interval for the difference uses the unpooled
 * standard error, which is also what `standardError` reports.
 *
 * When no test can be carried out - the pooled standard error is 0 (both
 * groups have the same all-0 or all-1 rate) or is not finite (an empty group,
 * or values that are not 0/1) - a placeholder result is returned: zScore 0,
 * pValue 1, significant false, and 0 for the interval and standard error.
 * The observed rates, difference and lift are still reported, so both return
 * paths expose the same set of keys.
 *
 * @param {number[]} control - 0/1 observations for the control group.
 * @param {number[]} variant - 0/1 observations for the treatment group.
 * @param {number} confidence - Confidence level, e.g. 0.95.
 * @returns {object} zScore, pValue, significant, controlRate, variantRate,
 *   absoluteDiff, lift, liftPercent, ciLower, ciUpper, standardError.
 */
function zTestProportions(control, variant, confidence) {
  const n1 = control.length;
  const n2 = variant.length;
  const p1 = mean(control);
  const p2 = mean(variant);
  const diff = p2 - p1;
  // Relative lift is undefined for a zero baseline; it is reported as 0 in that case.
  const lift = p1 > 0 ? diff / p1 : 0;

  // Pooled proportion under the null hypothesis
  const pooled = (p1 * n1 + p2 * n2) / (n1 + n2);
  const se = Math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2));

  // Dividing by a zero, infinite or NaN standard error would only produce
  // NaN/Infinity downstream (serialised as null), so bail out explicitly.
  if (se === 0 || !Number.isFinite(se)) {
    return {
      zScore: 0,
      pValue: 1,
      significant: false,
      controlRate: round(p1, 4),
      variantRate: round(p2, 4),
      absoluteDiff: round(diff, 4),
      lift: round(lift, 4),
      liftPercent: round(lift * 100, 2),
      ciLower: 0,
      ciUpper: 0,
      standardError: 0,
    };
  }

  const zScore = diff / se;
  const pValue = 2 * (1 - normalCDF(Math.abs(zScore)));

  // Confidence interval for the difference (unpooled standard error)
  const zCritical = normalInvCDF(1 - (1 - confidence) / 2);
  const seDiff = Math.sqrt((p1 * (1 - p1)) / n1 + (p2 * (1 - p2)) / n2);
  const ciLower = diff - zCritical * seDiff;
  const ciUpper = diff + zCritical * seDiff;

  return {
    zScore: round(zScore, 4),
    pValue: round(pValue, 4),
    // Decided on the unrounded p-value so rounding cannot flip the verdict.
    significant: pValue < 1 - confidence,
    controlRate: round(p1, 4),
    variantRate: round(p2, 4),
    absoluteDiff: round(diff, 4),
    lift: round(lift, 4),
    liftPercent: round(lift * 100, 2),
    ciLower: round(ciLower, 4),
    ciUpper: round(ciUpper, 4),
    standardError: round(seDiff, 4),
  };
}

/**
 * Two-sided Welch test (unequal variances) for the difference between the
 * means of two groups of continuous observations (revenue, time, etc.).
 *
 * NOTE: the p-value and the critical value for the confidence interval are
 * taken from the standard normal distribution rather than Student's t, so
 * they are only accurate for large samples. `degreesOfFreedom`
 * (Welch-Satterthwaite) is reported for reference but is not used in them.
 *
 * When no test can be carried out - the standard error is 0 (both groups are
 * constant) or is not finite (an empty group) - a placeholder result is
 * returned: tScore 0, pValue 1, significant false, and 0 for the interval,
 * standard error and degrees of freedom. The descriptive statistics (means,
 * standard deviations, difference, lift) are still reported as observed, so
 * both return paths expose the same set of keys.
 *
 * @param {number[]} control - Observations for the control group.
 * @param {number[]} variant - Observations for the treatment group.
 * @param {number} confidence - Confidence level, e.g. 0.95.
 * @returns {object} tScore, pValue, significant, controlMean, variantMean,
 *   controlStdDev, variantStdDev, absoluteDiff, lift, liftPercent, ciLower,
 *   ciUpper, standardError, degreesOfFreedom.
 */
function tTestContinuous(control, variant, confidence) {
  const n1 = control.length;
  const n2 = variant.length;
  const m1 = mean(control);
  const m2 = mean(variant);
  const v1 = variance(control);
  const v2 = variance(variant);

  const diff = m2 - m1;
  // Relative lift is undefined for a zero baseline; it is reported as 0 in that case.
  const lift = m1 !== 0 ? diff / Math.abs(m1) : 0;

  // Standard deviations derived from the variances already computed above.
  const controlStdDev = Math.sqrt(v1);
  const variantStdDev = Math.sqrt(v2);

  // Per-group contribution to the squared standard error of the difference.
  const varTerm1 = v1 / n1;
  const varTerm2 = v2 / n2;
  const se = Math.sqrt(varTerm1 + varTerm2);

  // Dividing by a zero or NaN standard error would only produce NaN/Infinity
  // downstream (serialised as null), so bail out explicitly.
  if (se === 0 || !Number.isFinite(se)) {
    return {
      tScore: 0,
      pValue: 1,
      significant: false,
      controlMean: round(m1, 4),
      variantMean: round(m2, 4),
      controlStdDev: round(controlStdDev, 4),
      variantStdDev: round(variantStdDev, 4),
      absoluteDiff: round(diff, 4),
      lift: round(lift, 4),
      liftPercent: round(lift * 100, 2),
      ciLower: 0,
      ciUpper: 0,
      standardError: 0,
      degreesOfFreedom: 0,
    };
  }

  const tScore = diff / se;

  // Welch-Satterthwaite degrees of freedom. A group that contributes no
  // variance is skipped: for a single observation its term would be 0 / 0.
  const dfTerm1 = varTerm1 === 0 ? 0 : Math.pow(varTerm1, 2) / (n1 - 1);
  const dfTerm2 = varTerm2 === 0 ? 0 : Math.pow(varTerm2, 2) / (n2 - 1);
  const df = Math.pow(varTerm1 + varTerm2, 2) / (dfTerm1 + dfTerm2);

  // Approximate p-value using normal distribution (valid for large samples)
  const pValue = 2 * (1 - normalCDF(Math.abs(tScore)));

  // Confidence interval (normal critical value, see NOTE above)
  const tCritical = normalInvCDF(1 - (1 - confidence) / 2);
  const ciLower = diff - tCritical * se;
  const ciUpper = diff + tCritical * se;

  return {
    tScore: round(tScore, 4),
    pValue: round(pValue, 4),
    // Decided on the unrounded p-value so rounding cannot flip the verdict.
    significant: pValue < 1 - confidence,
    controlMean: round(m1, 4),
    variantMean: round(m2, 4),
    controlStdDev: round(controlStdDev, 4),
    variantStdDev: round(variantStdDev, 4),
    absoluteDiff: round(diff, 4),
    lift: round(lift, 4),
    liftPercent: round(lift * 100, 2),
    ciLower: round(ciLower, 4),
    ciUpper: round(ciUpper, 4),
    standardError: round(se, 4),
    degreesOfFreedom: round(df, 1),
  };
}

/**
 * Heuristic sample-size adequacy check for a control/variant pair.
 *
 * Uses fixed thresholds rather than a power calculation: below 30 in either
 * group is flagged as unreliable; otherwise below 3000 (binary metrics) or
 * 100 (any other metric type) per group is flagged as possibly insufficient.
 * Independently, a size difference of more than 20% of the larger group is
 * flagged as imbalanced.
 *
 * @param {number} n1 - Number of control observations.
 * @param {number} n2 - Number of variant observations.
 * @param {string} metricType - "binary" selects the larger recommended minimum.
 * @param {number} [baselineRate=0.1] - Accepted but not used by the current checks.
 * @returns {{controlSize: number, variantSize: number, adequate: boolean, warnings: string[]}}
 *   The sizes, the collected warnings, and `adequate`, which is true only when there are none.
 */
function checkSampleSize(n1, n2, metricType, baselineRate = 0.1) {
  const recommendedMinimum = metricType === "binary" ? 3000 : 100;

  const isVerySmall = n1 < 30 || n2 < 30;
  const isBelowRecommended = n1 < recommendedMinimum || n2 < recommendedMinimum;
  const relativeImbalance = Math.abs(n1 - n2) / Math.max(n1, n2);

  const warnings = [];

  // The two size warnings are mutually exclusive; the stronger one wins.
  if (isVerySmall) {
    warnings.push(`Very small sample sizes (control: ${n1}, variant: ${n2}). Results are unreliable.`);
  } else if (isBelowRecommended) {
    warnings.push(
      `Sample sizes (control: ${n1}, variant: ${n2}) may be insufficient to detect small effects. Recommend ${recommendedMinimum}+ per variant for reliable results.`
    );
  }

  if (relativeImbalance > 0.2) {
    warnings.push(`Imbalanced sample sizes (${n1} vs ${n2}). Check for assignment issues.`);
  }

  const adequate = warnings.length === 0;
  return { controlSize: n1, variantSize: n2, adequate, warnings };
}

/**
 * Rounds a number to a fixed number of decimal places by scaling it,
 * applying Math.round, and scaling back.
 *
 * @param {number} num - Value to round.
 * @param {number} decimals - Number of decimal places to keep.
 * @returns {number} The rounded value.
 */
function round(num, decimals) {
  const scale = Math.pow(10, decimals);
  const scaledAndRounded = Math.round(num * scale);
  return scaledAndRounded / scale;
}

// Script entry point: load the experiment rows, pick the control variant,
// compare every other variant against it, write the full report as JSON to
// outputPath, and print a human-readable summary followed by a one-line
// machine-readable JSON status. Any failure is reported on stderr and the
// process exits with status 1.
try {
  // Reject unusable confidence levels up front: NaN or a value such as "95"
  // would otherwise flow silently into the critical-value computation.
  const confidence = parseFloat(confidenceLevel);
  if (!(confidence > 0 && confidence < 1)) {
    throw new Error(`Confidence level must be a number strictly between 0 and 1 (e.g. 0.95), got: ${confidenceLevel}`);
  }

  console.log(`Reading experiment data: ${inputPath}...`);
  const rawData = fs.readFileSync(inputPath, "utf-8");
  const data = JSON.parse(rawData);

  if (!Array.isArray(data) || data.length === 0) {
    throw new Error("Input data must be a non-empty array");
  }

  // Check required columns exist (only the first row is inspected)
  const sampleRow = data[0];
  if (sampleRow === null || typeof sampleRow !== "object") {
    throw new Error("Input rows must be objects keyed by column name");
  }
  if (!(variantColumn in sampleRow)) {
    throw new Error(`Variant column '${variantColumn}' not found in data. Available: ${Object.keys(sampleRow).join(", ")}`);
  }
  if (!(metricColumn in sampleRow)) {
    throw new Error(`Metric column '${metricColumn}' not found in data. Available: ${Object.keys(sampleRow).join(", ")}`);
  }

  // Identify variants (rows with a null/undefined variant are ignored)
  const variants = [...new Set(data.map((row) => row[variantColumn]).filter((v) => v != null))];
  console.log(`Found variants: ${variants.join(", ")}`);

  if (variants.length < 2) {
    throw new Error(`Need at least 2 variants, found: ${variants.length}`);
  }

  // Identify control (usually "control", "A", "0", or first alphabetically)
  const controlLabels = ["control", "ctrl", "a", "0", "baseline"];
  let controlVariant = variants.find((v) => controlLabels.includes(String(v).toLowerCase()));
  // Compare against undefined rather than using a truthiness test: a numeric 0
  // is a legitimate match for the "0" label but is falsy.
  if (controlVariant === undefined) {
    // sort() reorders in place, so treatments are then reported in sorted order too.
    variants.sort();
    controlVariant = variants[0];
  }

  const treatmentVariants = variants.filter((v) => v !== controlVariant);

  // Metric values for one variant. Missing or non-numeric values count as 0.
  const metricValuesFor = (variant) =>
    data.filter((row) => row[variantColumn] === variant).map((row) => Number(row[metricColumn]) || 0);

  const controlData = metricValuesFor(controlVariant);

  const comparisons = [];
  const sampleSizeChecks = [];

  // Compare each treatment to control
  for (const treatment of treatmentVariants) {
    const treatmentData = metricValuesFor(treatment);

    // Keep every treatment's check; a single shared slot would only retain the last one.
    const sampleSize = checkSampleSize(controlData.length, treatmentData.length, metricType);
    sampleSizeChecks.push({ treatment, sampleSize });

    let testResult;
    let statistic;
    if (metricType === "binary") {
      testResult = zTestProportions(controlData, treatmentData, confidence);
      testResult.testType = "z-test (proportions)";
      statistic = testResult.zScore;
    } else {
      testResult = tTestContinuous(controlData, treatmentData, confidence);
      testResult.testType = "t-test (continuous)";
      statistic = testResult.tScore;
    }

    testResult.variant = treatment;
    testResult.controlVariant = controlVariant;
    testResult.sampleSize = sampleSize;

    // Add interpretation
    if (testResult.significant) {
      // Direction comes from the sign of the test statistic. The relative lift
      // is reported as 0 when the control baseline is 0, so it cannot be used
      // to tell an increase from a decrease.
      const isIncrease = statistic > 0;
      const magnitude =
        testResult.liftPercent !== 0
          ? `${Math.abs(testResult.liftPercent)}%`
          : `${Math.abs(testResult.absoluteDiff)} in absolute terms`;

      if (isIncrease) {
        testResult.interpretation = `Variant "${treatment}" shows a statistically significant INCREASE of ${magnitude} vs control.`;
        testResult.recommendation = "SHIP: Results are statistically significant and positive.";
      } else {
        testResult.interpretation = `Variant "${treatment}" shows a statistically significant DECREASE of ${magnitude} vs control.`;
        testResult.recommendation = "DO NOT SHIP: Results are statistically significant but negative.";
      }
    } else {
      testResult.interpretation = `No statistically significant difference detected between "${treatment}" and control (p=${testResult.pValue}).`;
      testResult.recommendation = "INCONCLUSIVE: Consider extending the test or evaluating practical significance.";
    }

    comparisons.push(testResult);
  }

  // Overall sample-size summary across all treatments. With a single treatment
  // this is exactly that treatment's check; with several, warnings are
  // prefixed with the variant they refer to and variantSize is the smallest
  // treatment group.
  const labelWarnings = sampleSizeChecks.length > 1;
  const allWarnings = [];
  for (const { treatment, sampleSize } of sampleSizeChecks) {
    for (const warning of sampleSize.warnings) {
      allWarnings.push(labelWarnings ? `[${treatment}] ${warning}` : warning);
    }
  }

  const results = {
    summary: {
      totalRows: data.length,
      variantColumn,
      metricColumn,
      metricType,
      confidenceLevel: confidence,
      controlVariant,
      treatmentVariants,
    },
    sampleSize: {
      controlSize: controlData.length,
      variantSize: Math.min(...sampleSizeChecks.map((check) => check.sampleSize.variantSize)),
      adequate: allWarnings.length === 0,
      warnings: allWarnings,
    },
    comparisons,
  };

  // Ensure output directory exists
  const dir = path.dirname(outputPath);
  if (dir && dir !== ".") {
    fs.mkdirSync(dir, { recursive: true });
  }

  fs.writeFileSync(outputPath, JSON.stringify(results, null, 2));

  console.log(`\n✓ Statistical Analysis Complete`);
  console.log(`  Control: "${controlVariant}" (n=${controlData.length})`);

  for (const comp of results.comparisons) {
    console.log(`  vs "${comp.variant}": ${comp.significant ? "SIGNIFICANT" : "not significant"} (p=${comp.pValue})`);
    console.log(`    Lift: ${comp.liftPercent > 0 ? "+" : ""}${comp.liftPercent}%`);
  }

  if (results.sampleSize.warnings.length > 0) {
    console.log(`\n⚠️ Warnings:`);
    for (const w of results.sampleSize.warnings) {
      console.log(`  - ${w}`);
    }
  }

  console.log(`\n  Written to: ${outputPath}`);

  // Machine-readable status line for callers that parse stdout.
  console.log(
    JSON.stringify({
      success: true,
      outputPath,
      significant: results.comparisons.some((c) => c.significant),
      comparisons: results.comparisons.length,
    })
  );
} catch (error) {
  console.error("Error:", error.message);
  process.exit(1);
}