/**
 * Calculate the beta-hats that best fit case read counts given the panel of normals.
 *
 * @param normalsPseudoinverse the log-normalized or reduced-panel pseudoinverse from a panel of normals
 * @param input a {@code TxS} matrix where {@code T} is the number of targets and {@code S}
 *              the number of count groups (e.g. case samples).
 * @param epsilon a positive value used to derive the log2 masking threshold for low-coverage targets.
 * @return never {@code null}, an {@code NxS} matrix, where {@code N} is the number of samples in
 *         the panel and {@code S} the original number of count groups.
 */
@VisibleForTesting
public static RealMatrix calculateBetaHats(final RealMatrix normalsPseudoinverse,
                                           final RealMatrix input,
                                           final double epsilon) {
    Utils.nonNull(normalsPseudoinverse, "Normals inverse matrix cannot be null.");
    Utils.nonNull(input, "Input counts cannot be null.");
    Utils.validateArg(epsilon > 0, String.format("Invalid epsilon value, must be > 0: %f", epsilon));
    final double targetThreshold = (Math.log(epsilon) / Math.log(2)) + 1;

    // Copy the case samples so we can mask in place: targets with coverage below
    // the threshold are set to zero before projecting onto the panel.
    final RealMatrix maskedInput = input.copy();
    maskedInput.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value > targetThreshold ? value : 0;
        }
    });
    return normalsPseudoinverse.multiply(maskedInput);
}
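// For context, a minimal self-contained sketch (not the production path) of how a
// panel pseudoinverse might be formed and applied with Apache Commons Math; the
// matrices, dimensions, and values below are hypothetical.
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;

public class BetaHatSketch {
    public static void main(String[] args) {
        // Hypothetical 3-target x 2-normal panel of log-normalized counts.
        final RealMatrix panel = new Array2DRowRealMatrix(new double[][]{
                {0.1, -0.2}, {0.3, 0.0}, {-0.1, 0.2}});
        // The SVD solver's inverse is the Moore-Penrose pseudoinverse (here 2x3).
        final RealMatrix pinv = new SingularValueDecomposition(panel).getSolver().getInverse();
        // Hypothetical 3-target x 1-case-sample input; beta-hats are pinv * input (2x1).
        final RealMatrix caseCounts = new Array2DRowRealMatrix(new double[][]{{0.2}, {0.1}, {0.0}});
        final RealMatrix betaHats = pinv.multiply(caseCounts);
        System.out.println(betaHats);
    }
}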
/**
 * Final pre-panel normalization that consists of dividing each count by the median of
 * its column and taking the log base 2.
 * <p>
 * The normalization occurs in-place.
 * </p>
 *
 * @param readCounts the input counts to normalize.
 */
@VisibleForTesting
static void normalizeAndLogReadCounts(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();
    final double[] medians = IntStream.range(0, counts.getColumnDimension())
            .mapToDouble(col -> medianCalculator.evaluate(counts.getColumn(col)))
            .toArray();
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            // log2(x) = ln(x) / ln(2); clamp at EPSILON to avoid taking the log of zero.
            return Math.log(Math.max(EPSILON, value / medians[column])) * INV_LN_2;
        }
    });
    logger.info("Counts normalized by the column median and log2'd.");
}
/**
 * Calculates the median of column medians and subtracts it from all counts.
 *
 * @param readCounts the input counts to center.
 * @return the median of medians that has been subtracted from all counts.
 */
@VisibleForTesting
static double subtractMedianOfMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final Median medianCalculator = new Median();
    final double[] columnMedians = MatrixSummaryUtils.getColumnMedians(counts);
    final double medianOfMedians = medianCalculator.evaluate(columnMedians);
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value - medianOfMedians;
        }
    });
    logger.info(String.format("Counts centered around the median of medians %.2f", medianOfMedians));
    return medianOfMedians;
}
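// A tiny standalone illustration of the median-of-medians computation above,
// using Commons Math's Median on hypothetical column data.
import org.apache.commons.math3.stat.descriptive.rank.Median;

public class MedianOfMediansExample {
    public static void main(String[] args) {
        // Two columns of log2-normalized counts (hypothetical values).
        final double[][] columns = {{-0.5, 0.0, 0.5}, {0.5, 1.0, 1.5}};
        final Median median = new Median();
        final double[] columnMedians = new double[columns.length];
        for (int c = 0; c < columns.length; c++) {
            columnMedians[c] = median.evaluate(columns[c]);        // 0.0 and 1.0
        }
        final double medianOfMedians = median.evaluate(columnMedians); // 0.5
        System.out.println(medianOfMedians); // subtracted from every count to center the matrix
    }
}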
public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) {
    final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets);
    final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples);

    final Random random = new Random(13);
    final double[] gcContentByTarget = IntStream.range(0, numTargets)
            .mapToDouble(n -> 0.5 + 0.2 * random.nextGaussian())
            .map(x -> Math.min(x, 0.95)).map(x -> Math.max(x, 0.05)).toArray();
    final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget)
            .map(QUADRATIC_GC_BIAS_CURVE::apply).toArray();

    // model mainly GC bias with a small random amount of non-GC bias;
    // thus noise after GC correction should be nearly zero
    final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples);
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int target, final int column, final double value) {
            return gcBiasByTarget[target] * (1.0 + 0.01 * random.nextDouble());
        }
    });

    final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts);
    return new ImmutablePair<>(rcc, gcContentByTarget);
}
private static Pair<RealMatrix, double[]> simulateData(final int numSamples, final int numIntervals) {
    final Random random = new Random(RANDOM_SEED);
    final double[] intervalGCContent = IntStream.range(0, numIntervals)
            .mapToDouble(n -> 0.5 + 0.2 * random.nextGaussian())
            .map(x -> Math.min(x, 0.95)).map(x -> Math.max(x, 0.05))
            .toArray();
    final double[] intervalGCBias = Arrays.stream(intervalGCContent)
            .map(QUADRATIC_GC_BIAS_CURVE::apply)
            .toArray();

    //model GC bias along with a small random amount of uniform non-GC-bias noise;
    //remaining noise after GC-bias correction should arise only from the latter
    final RealMatrix readCounts = new Array2DRowRealMatrix(numSamples, numIntervals);
    readCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int sample, final int interval, final double value) {
            return (int) (MEAN_READ_DEPTH * intervalGCBias[interval] * (1. + NON_GC_BIAS_NOISE_LEVEL * random.nextDouble()));
        }
    });
    return new ImmutablePair<>(readCounts, intervalGCContent);
}
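// Both simulation helpers above reference QUADRATIC_GC_BIAS_CURVE, which is defined
// elsewhere in the source. A purely illustrative stand-in (the coefficients are an
// assumption, not the real curve) that peaks at 50% GC:
import java.util.function.Function;

public class GcBiasCurveSketch {
    // Hypothetical quadratic GC-bias curve: unit bias at 0.5 GC, falling off quadratically.
    static final Function<Double, Double> QUADRATIC_GC_BIAS_CURVE =
            gc -> 1.0 - 4.0 * (gc - 0.5) * (gc - 0.5);

    public static void main(String[] args) {
        System.out.println(QUADRATIC_GC_BIAS_CURVE.apply(0.5)); // 1.0 (no bias at the peak)
        System.out.println(QUADRATIC_GC_BIAS_CURVE.apply(0.3)); // 0.84
    }
}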
@Override
public ClassifierResult apply(Matrix data) {
    if (VERB > 0) System.out.println("ERSP apply");
    // Do the standard pre-processing
    data = preproc(data);
    // Linearly classify the data
    if (VERB > 1) System.out.println(TAG + "Classifying with linear classifier");
    Matrix fraw = applyLinearClassifier(data, 0);
    if (VERB > 1) System.out.println(TAG + "Results from the classifier (fraw): " + fraw.toString());
    Matrix f = new Matrix(fraw.copy());
    Matrix p = new Matrix(f.copy());
    // Map decision values to probabilities with the logistic function.
    p.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        public double visit(int row, int column, double value) {
            return 1. / (1. + Math.exp(-value));
        }
    });
    if (VERB > 1) System.out.println(TAG + "Results from the classifier (p): " + p.toString());
    return new ClassifierResult(f, fraw, p, data);
}
public ClassifierResult apply(Matrix data) {
    if (VERB > 1) System.out.println(TAG + " preproc");
    // Do the standard pre-processing
    data = preproc(data);
    // Linearly classify the data
    if (VERB > 1) System.out.println(TAG + "Classifying with linear classifier");
    Matrix fraw = applyLinearClassifier(data, 0);
    if (VERB > 1) System.out.println(TAG + "Results from the classifier (fraw): " + fraw.toString());
    Matrix f = new Matrix(fraw.copy());
    Matrix p = new Matrix(f.copy());
    // map to probabilities using the logistic operator
    p.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        public double visit(int row, int column, double value) {
            return 1. / (1. + Math.exp(-value));
        }
    });
    if (VERB > 1) System.out.println(TAG + "Results from the classifier (p): " + p.toString());
    return new ClassifierResult(f, fraw, p, data);
}
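// The visitor in both apply(...) methods maps raw linear-classifier outputs to
// probabilities with the logistic function; a quick standalone sanity check of that mapping:
public class LogisticCheck {
    static double logistic(double x) {
        return 1. / (1. + Math.exp(-x));
    }

    public static void main(String[] args) {
        System.out.println(logistic(0.0));  // 0.5: the decision boundary maps to even odds
        System.out.println(logistic(4.0));  // ~0.982: confident positive
        System.out.println(logistic(-4.0)); // ~0.018: confident negative
    }
}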
private RealMatrix createMatrixOfGaussianValues(int numRows, int numCols, final double mean, final double sigma) {
    final RealMatrix bigCounts = new Array2DRowRealMatrix(numRows, numCols);
    final RandomDataGenerator randomDataGenerator = new RandomDataGenerator();
    randomDataGenerator.reSeed(337337337);
    bigCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int row, int column, double value) {
            return randomDataGenerator.nextGaussian(mean, sigma);
        }
    });
    return bigCounts;
}
@Test
public void testPerfectFit() {
    double[] betaHat = regression.estimateRegressionParameters();
    TestUtils.assertEquals(betaHat,
            new double[]{11.0, 1.0 / 2.0, 2.0 / 3.0, 3.0 / 4.0, 4.0 / 5.0, 5.0 / 6.0},
            1e-14);
    double[] residuals = regression.estimateResiduals();
    TestUtils.assertEquals(residuals, new double[]{0d, 0d, 0d, 0d, 0d, 0d}, 1e-14);
    RealMatrix errors = new Array2DRowRealMatrix(regression.estimateRegressionParametersVariance(), false);
    final double[] s = {1.0, -1.0 / 2.0, -1.0 / 3.0, -1.0 / 4.0, -1.0 / 5.0, -1.0 / 6.0};
    RealMatrix referenceVariance = new Array2DRowRealMatrix(s.length, s.length);
    referenceVariance.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int row, int column, double value) {
            if (row == 0) {
                return s[column];
            }
            double x = s[row] * s[column];
            return (row == column) ? 2 * x : x;
        }
    });
    Assert.assertEquals(0.0,
            errors.subtract(referenceVariance).getNorm(),
            5.0e-16 * referenceVariance.getNorm());
    Assert.assertEquals(1, ((OLSMultipleLinearRegression) regression).calculateRSquared(), 1E-12);
}
/**
 * Impute zero counts to the median of non-zero values in the enclosing target row.
 *
 * <p>The imputation is done in-place, thus the input matrix will be modified as a result of this call.</p>
 *
 * @param readCounts the input and output read-count matrix.
 */
public static void imputeZeroCountsAsTargetMedians(final ReadCountCollection readCounts, final Logger logger) {
    final RealMatrix counts = readCounts.counts();
    final int targetCount = counts.getRowDimension();
    final Median medianCalculator = new Median();
    final int totalCounts = counts.getColumnDimension() * counts.getRowDimension();

    // Get the number of zeros contained in the counts.
    final long totalZeroCounts = IntStream.range(0, targetCount)
            .mapToLong(t -> DoubleStream.of(counts.getRow(t))
                    .filter(c -> c == 0.0).count()).sum();

    // Get the median of each row, not including any entries that are zero.
    final double[] medians = IntStream.range(0, targetCount)
            .mapToDouble(t -> medianCalculator.evaluate(
                    DoubleStream.of(counts.getRow(t))
                            .filter(c -> c != 0.0)
                            .toArray()
            )).toArray();

    // Change any zeros in the counts to the median for the row.
    counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value != 0 ? value : medians[row];
        }
    });

    if (totalZeroCounts > 0) {
        // Use floating-point division so small percentages are not truncated to zero.
        final double totalZeroCountsPercentage = 100.0 * totalZeroCounts / totalCounts;
        logger.info(String.format("Some 0.0 counts (%d out of %d, %.2f%%) were imputed to their enclosing target " +
                "non-zero median", totalZeroCounts, totalCounts, totalZeroCountsPercentage));
    } else {
        logger.info("No count is 0.0 thus no count needed to be imputed.");
    }
}
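// A minimal standalone version of the same row-median imputation on a toy matrix
// (hypothetical values), using only Commons Math:
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import java.util.stream.DoubleStream;

public class ImputeSketch {
    public static void main(String[] args) {
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][]{
                {4.0, 0.0, 6.0},    // zero imputed to median(4, 6) = 5
                {0.0, 2.0, 8.0}});  // zero imputed to median(2, 8) = 5
        final Median median = new Median();
        final double[] rowMedians = new double[counts.getRowDimension()];
        for (int r = 0; r < rowMedians.length; r++) {
            rowMedians[r] = median.evaluate(
                    DoubleStream.of(counts.getRow(r)).filter(c -> c != 0.0).toArray());
        }
        counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(int row, int column, double value) {
                return value != 0.0 ? value : rowMedians[row];
            }
        });
        System.out.println(counts);
    }
}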
/**
 * Standardize read counts (per-target).
 * Note: modification is done in-place.
 *
 * @param counts original read counts
 */
private void standardizeByTarget(final RealMatrix counts) {
    final double[] rowMeans = GATKProtectedMathUtils.rowMeans(counts);
    final double[] rowStdDev = GATKProtectedMathUtils.rowStdDevs(counts);
    counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return (value - rowMeans[row]) / rowStdDev[row];
        }
    });
}
/**
 * Standardize read counts (per-sample).
 * Note: modification is done in-place.
 *
 * @param counts original read counts
 */
private void standardizeBySample(final RealMatrix counts) {
    final double[] columnMeans = GATKProtectedMathUtils.columnMeans(counts);
    final double[] columnStdDev = GATKProtectedMathUtils.columnStdDevs(counts);
    counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int row, int column, double value) {
            return (value - columnMeans[column]) / columnStdDev[column];
        }
    });
}
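// Both standardization helpers compute a z-score per entry. A self-contained equivalent
// for the per-sample (column) case, substituting plain Commons Math statistics for the
// GATKProtectedMathUtils helpers (assumed to return per-column means and standard deviations):
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;

public class StandardizeSketch {
    public static void main(String[] args) {
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][]{
                {1.0, 10.0}, {2.0, 20.0}, {3.0, 30.0}});
        final Mean mean = new Mean();
        final StandardDeviation stdDev = new StandardDeviation();
        final int numColumns = counts.getColumnDimension();
        final double[] columnMeans = new double[numColumns];
        final double[] columnStdDevs = new double[numColumns];
        for (int c = 0; c < numColumns; c++) {
            columnMeans[c] = mean.evaluate(counts.getColumn(c));
            columnStdDevs[c] = stdDev.evaluate(counts.getColumn(c));
        }
        // z-score each entry against its column's mean and standard deviation
        counts.walkInColumnOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(int row, int column, double value) {
                return (value - columnMeans[column]) / columnStdDevs[column];
            }
        });
        System.out.println(counts); // each column becomes {-1, 0, 1}
    }
}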
public static RealMatrix getAsRealMatrix(final LikelihoodMatrix<Allele> matrix) {
    final RealMatrix result = new Array2DRowRealMatrix(matrix.numberOfAlleles(), matrix.numberOfReads());
    result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int row, int column, double value) {
            return matrix.get(row, column);
        }
    });
    return result;
}
/**
 * Target-factor normalizes a {@link RealMatrix} in-place given target factors.
 */
static void factorNormalize(final RealMatrix input, final double[] targetFactors) {
    Utils.nonNull(input, "Input matrix cannot be null.");
    Utils.nonNull(targetFactors, "Target factors cannot be null.");
    Utils.validateArg(targetFactors.length == input.getRowDimension(),
            "Number of target factors does not correspond to the number of rows.");
    // Divide all counts by the target factor for the row.
    input.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return value / targetFactors[row];
        }
    });
}
/**
 * SVD and pseudoinverse calculation.
 *
 * @param logNormalized the input counts for the SVD and reduction steps, fully normalized and already logged.
 * @param requestedNumberOfEigensamples user-requested number of eigensamples for the reduced panel.
 * @return never {@code null}.
 */
@VisibleForTesting
static ReductionResult calculateReducedPanelAndPInverses(final ReadCountCollection logNormalized,
                                                         final OptionalInt requestedNumberOfEigensamples,
                                                         final Logger logger,
                                                         final JavaSparkContext ctx) {
    if (ctx == null) {
        logger.warn("No Spark context provided, not going to use Spark...");
    }

    final RealMatrix logNormalizedCounts = logNormalized.counts();
    final int numberOfCountColumns = logNormalizedCounts.getColumnDimension();

    logger.info("Starting the SVD decomposition of the log-normalized counts ...");
    final long svdStartTime = System.currentTimeMillis();
    final SVD logNormalizedSVD = SVDFactory.createSVD(logNormalized.counts(), ctx);
    final long svdEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the SVD decomposition of the log-normalized counts. Elapsed time: %d seconds",
            (svdEndTime - svdStartTime) / 1000));

    final int numberOfEigensamples = determineNumberOfEigensamples(requestedNumberOfEigensamples, numberOfCountColumns, logNormalizedSVD, logger);
    logger.info(String.format("Including %d eigensamples in the reduced PoN", numberOfEigensamples));

    // Scale the first numberOfEigensamples left-singular vectors by their singular values
    // to obtain the reduced panel.
    final double[] singularValues = logNormalizedSVD.getSingularValues();
    final RealMatrix reducedCounts = logNormalizedSVD.getU()
            .getSubMatrix(0, logNormalizedCounts.getRowDimension() - 1, 0, numberOfEigensamples - 1).copy();
    reducedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(final int row, final int column, final double value) {
            return singularValues[column] * value;
        }
    });

    // The pseudoinverse comes nearly for free from having run the SVD decomposition.
    final RealMatrix logNormalizedPseudoInverse = logNormalizedSVD.getPinv();

    logger.info("Calculating the reduced PoN inverse matrix...");
    final long riStartTime = System.currentTimeMillis();
    final RealMatrix reducedCountsPseudoInverse = SVDFactory.createSVD(reducedCounts, ctx).getPinv();
    final long riEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished calculating the reduced PoN inverse matrix. Elapsed time: %d seconds",
            (riEndTime - riStartTime) / 1000));

    return new ReductionResult(logNormalizedPseudoInverse, reducedCounts, reducedCountsPseudoInverse,
            logNormalizedSVD.getSingularValues());
}
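// For reference, the same rank reduction (keep the first k left-singular vectors, each
// scaled by its singular value) in plain Commons Math; the dimensions, values, and k are hypothetical:
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;

public class ReducedPanelSketch {
    public static void main(String[] args) {
        // Hypothetical 4-target x 3-sample log-normalized panel.
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][]{
                {0.1, 0.2, 0.0}, {0.0, 0.1, 0.3}, {0.2, 0.0, 0.1}, {0.3, 0.1, 0.2}});
        final SingularValueDecomposition svd = new SingularValueDecomposition(counts);
        final int k = 2; // hypothetical number of eigensamples to keep

        // Reduced panel: first k columns of U, each scaled by its singular value.
        final double[] singularValues = svd.getSingularValues();
        final RealMatrix reduced = svd.getU()
                .getSubMatrix(0, counts.getRowDimension() - 1, 0, k - 1).copy();
        for (int c = 0; c < k; c++) {
            reduced.setColumnVector(c, reduced.getColumnVector(c).mapMultiply(singularValues[c]));
        }
        System.out.println(reduced);
    }
}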
/**
 * Preprocess (i.e., transform to fractional coverage, correct GC bias, filter, impute, and truncate)
 * and standardize read counts from a panel of normals.
 * All inputs are assumed to be valid.
 * The dimensions of {@code readCounts} should be samples x intervals.
 * To reduce memory footprint, {@code readCounts} is modified in place when possible.
 * Filtering is performed by using boolean arrays to keep track of intervals and samples
 * that have been filtered at any step and masking {@code readCounts} with them appropriately.
 * If {@code intervalGCContent} is null, GC-bias correction will not be performed.
 */
static PreprocessedStandardizedResult preprocessAndStandardizePanel(final RealMatrix readCounts,
                                                                    final double[] intervalGCContent,
                                                                    final double minimumIntervalMedianPercentile,
                                                                    final double maximumZerosInSamplePercentage,
                                                                    final double maximumZerosInIntervalPercentage,
                                                                    final double extremeSampleMedianPercentile,
                                                                    final boolean doImputeZeros,
                                                                    final double extremeOutlierTruncationPercentile) {
    //preprocess (transform to fractional coverage, correct GC bias, filter, impute, truncate) and return copy of submatrix
    logger.info("Preprocessing read counts...");
    final PreprocessedStandardizedResult preprocessedStandardizedResult = preprocessPanel(
            readCounts, intervalGCContent,
            minimumIntervalMedianPercentile, maximumZerosInSamplePercentage, maximumZerosInIntervalPercentage,
            extremeSampleMedianPercentile, doImputeZeros, extremeOutlierTruncationPercentile);
    logger.info("Panel read counts preprocessed.");

    //standardize in place
    logger.info("Standardizing read counts...");
    divideBySampleMedianAndTransformToLog2(preprocessedStandardizedResult.preprocessedStandardizedValues);
    logger.info("Subtracting median of sample medians...");
    final double[] sampleLog2Medians = MatrixSummaryUtils.getRowMedians(preprocessedStandardizedResult.preprocessedStandardizedValues);
    final double medianOfSampleMedians = new Median().evaluate(sampleLog2Medians);
    preprocessedStandardizedResult.preprocessedStandardizedValues
            .walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
                @Override
                public double visit(int sampleIndex, int intervalIndex, double value) {
                    return value - medianOfSampleMedians;
                }
            });
    logger.info("Panel read counts standardized.");

    return preprocessedStandardizedResult;
}
/**
 * Preprocess (i.e., transform to fractional coverage and correct GC bias)
 * and standardize read counts for samples when no panel of normals is available.
 * The original {@code readCounts} has dimensions 1 x intervals and is not modified.
 * If {@code intervalGCContent} is null, GC-bias correction will not be performed.
 */
public static RealMatrix preprocessAndStandardizeSample(final double[] readCounts,
                                                        final double[] intervalGCContent) {
    Utils.nonNull(readCounts);
    Utils.validateArg(intervalGCContent == null || readCounts.length == intervalGCContent.length,
            "Number of intervals for read counts must match those for GC-content annotations.");

    final RealMatrix result = new Array2DRowRealMatrix(new double[][]{readCounts});

    //preprocess (transform to fractional coverage, correct GC bias) copy in place
    logger.info("Preprocessing read counts...");
    transformToFractionalCoverage(result);
    performOptionalGCBiasCorrection(result, intervalGCContent);
    logger.info("Sample read counts preprocessed.");

    //standardize copy in place
    logger.info("Standardizing read counts...");
    divideBySampleMedianAndTransformToLog2(result);
    logger.info("Subtracting sample median...");
    final double[] sampleLog2Medians = MatrixSummaryUtils.getRowMedians(result);
    result.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int sampleIndex, int intervalIndex, double value) {
            return value - sampleLog2Medians[sampleIndex];
        }
    });
    logger.info("Sample read counts standardized.");

    return result;
}
private static void transformToFractionalCoverage(final RealMatrix matrix) {
    logger.info("Transforming read counts to fractional coverage...");
    final double[] sampleSums = MathUtils.rowSums(matrix);
    matrix.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int sampleIndex, int intervalIndex, double value) {
            return value / sampleSums[sampleIndex];
        }
    });
}
private static void divideBySampleMedianAndTransformToLog2(final RealMatrix matrix) {
    logger.info("Dividing by sample medians and transforming to log2 space...");
    final double[] sampleMedians = MatrixSummaryUtils.getRowMedians(matrix);
    matrix.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int sampleIndex, int intervalIndex, double value) {
            return safeLog2(value / sampleMedians[sampleIndex]);
        }
    });
}
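// safeLog2 is not shown in this section; presumably it guards against taking log2 of zero,
// along the lines of the EPSILON clamp used in normalizeAndLogReadCounts above. A sketch
// (the epsilon value is an assumption):
private static final double EPSILON = 1E-9;            // assumed floor; the real value may differ
private static final double INV_LN2 = 1.0 / Math.log(2.0);

private static double safeLog2(final double x) {
    // Clamp at EPSILON so zero coverage maps to a large negative value instead of -Infinity.
    return Math.log(Math.max(EPSILON, x)) * INV_LN2;
}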
private static RealMatrix createMatrixOfGaussianValues(final int numRows, final int numColumns, final double mean, final double sigma) {
    final RealMatrix largeMatrix = new Array2DRowRealMatrix(numRows, numColumns);
    final RandomDataGenerator randomDataGenerator = new RandomDataGenerator();
    randomDataGenerator.reSeed(337337337);
    largeMatrix.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
        @Override
        public double visit(int row, int column, double value) {
            return randomDataGenerator.nextGaussian(mean, sigma);
        }
    });
    return largeMatrix;
}