Source code for sctools.metrics.aggregator

"""
Sequence Metric Aggregators
===========================

.. currentmodule:: sctools.metrics

This module provides classes useful for aggregating metric information for individual cells or
genes. These classes consume BAM files that have been pre-sorted such that all sequencing reads
that correspond to the molecules of a cell (CellMetrics) or the molecules of a gene (GeneMetrics)
are yielded sequentially.

Classes
-------

.. autosummary::
   :toctree: generated/

   MetricAggregatorBase         Aggregator Base Class
   GeneMetrics                  Class to iteratively calculate metrics for a gene (by molecule)
   CellMetrics                  Class to iteratively calculate metrics for a cell (by molecule)

Notes
-----
This module can be rewritten with dataclass when python 3.7 stabilizes, see
https://www.python.org/dev/peps/pep-0557/


See Also
--------
sctools.metrics.gatherer
sctools.metrics.merge
sctools.metrics.writer

"""


from typing import Iterable, Tuple, Counter, List, Sequence

import numpy as np
import pysam

from sctools import consts
from sctools.stats import OnlineGaussianSufficientStatistic


[docs]class MetricAggregator: """Metric Aggregator Base Class The ``MetricAggregator`` class defines a set of metrics that can be extracted from an aligned bam file. It defines all the metrics that are general across genes and cells. This class is subclassed by ``GeneMetrics`` and ``CellMetrics``, which define data-specific metrics in the ``parse_extra_fields`` method. An instance of ``GeneMetrics`` or ``CellMetrics`` is instantiated for each gene or molecule in a bam file, respectively. Attributes ---------- n_reads : int The number of reads associated with this entity noise_reads : int, NotImplemented Number of reads that are categorized by 10x genomics cellranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides perfect_molecule_barcodes : int The number of reads with molecule barcodes that have no errors (cell barcode tag == raw barcode tag) reads_mapped_exonic : int The number of reads for this entity that are mapped to exons reads_mapped_intronic : int The number of reads for this entity that are mapped to introns reads_mapped_utr : int The number of reads for this entity that are mapped to 3' untranslated regions (UTRs) reads_mapped_uniquely : int The number of reads mapped to a single unambiguous location in the genome reads_mapped_multiple : int The number of reads mapped to multiple genomic positions with equal confidence # todo make sure equal confidence is accurate duplicate_reads : int The number of reads that are duplicates (see README.md for defition of a duplicate) spliced_reads : int The number of reads that overlap splicing junctions antisense_reads : int The number of reads that are mapped to the antisense strand instead of the transcribed strand molecule_barcode_fraction_bases_above_30_mean : float The average fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity molecule_barcode_fraction_bases_above_30_variance : float The variance in the fraction of bases in molecule barcodes that receive quality scores greater than 30 across the reads of this entity genomic_reads_fraction_bases_quality_above_30_mean : float The average fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity (included for 10x cell ranger count comparison) genomic_reads_fraction_bases_quality_above_30_variance : float The variance in the fraction of bases in the genomic read that receive quality scores greater than 30 across the reads of this entity (included for 10x cell ranger count comparison) genomic_read_quality_mean : float Average quality of Illumina base calls in the genomic reads corresponding to this entity genomic_read_quality_variance : float Variance in quality of Illumina base calls in the genomic reads corresponding to this entity n_molecules : float Number of molecules corresponding to this entity. See README.md for the definition of a Molecule n_fragments : float Number of fragments corresponding to this entity. See README.md for the definition of a Fragment reads_per_molecule : float The average number of reads associated with each molecule in this entity reads_per_fragment : float The average number of reads associated with each fragment in this entity fragments_per_molecule : float The average number of fragments associated with each molecule in this entity fragments_with_single_read_evidence : int The number of fragments associated with this entity that are observed by only one read molecules_with_single_read_evidence : int The number of molecules associated with this entity that are observed by only one read Methods ------- parse_extra_fields(tags, record), NotImplemented Abstract method that must be implemented by subclasses. Called by ``parse_molecule()`` to gather information for subclass-specific metrics parse_molecule(tags, record) Extract information from a set of sequencing reads that correspond to a molecule and store the data in the MetricAggregator class. finalize() Some metrics cannot be calculated until all the information for an entity has been aggregated, for example, the number of `fragments_per_molecule`. Finalize calculates all such higher-order metrics """ def __init__(self): # type definitions Chromosome: int Strand: bool # reverse = True, see pysam.AlignedSegment.is_reverse Position: int Fragment: Tuple[Chromosome, Position, Strand] # noqa: F821 # count information self.n_reads: int = 0 self.noise_reads: int = 0 # long polymers, N-sequences; NotImplemented self._fragment_histogram: Counter[Fragment] = Counter() # noqa: F821 self._molecule_histogram: Counter[str] = Counter() # molecule information self._molecule_barcode_fraction_bases_above_30 = ( OnlineGaussianSufficientStatistic() ) self.perfect_molecule_barcodes = 0 self._genomic_reads_fraction_bases_quality_above_30 = ( OnlineGaussianSufficientStatistic() ) self._genomic_read_quality = OnlineGaussianSufficientStatistic() # alignment location information self.reads_mapped_exonic = 0 self.reads_mapped_intronic = 0 self.reads_mapped_utr = 0 # todo implement this once we have a gene model # self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR # self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() # alignment uniqueness information self.reads_mapped_uniquely = 0 self.reads_mapped_multiple = 0 self.duplicate_reads = 0 # alignment splicing information self.spliced_reads = 0 self.antisense_reads = 0 self._plus_strand_reads = 0 # strand balance # todo implement property here # higher-order methods, filled in by finalize() when all data is extracted self.molecule_barcode_fraction_bases_above_30_mean: float = None self.molecule_barcode_fraction_bases_above_30_variance: float = None self.genomic_reads_fraction_bases_quality_above_30_mean: float = None self.genomic_reads_fraction_bases_quality_above_30_variance: float = None self.genomic_read_quality_mean: float = None self.genomic_read_quality_variance: float = None self.n_molecules: float = None self.n_fragments: float = None self.reads_per_molecule: float = None self.reads_per_fragment: float = None self.fragments_per_molecule: float = None self.fragments_with_single_read_evidence: int = None self.molecules_with_single_read_evidence: int = None @staticmethod def _quality_string_to_numeric(quality_sequence: Iterable[str]) -> List[int]: """Convert an HTSlib ASCII quality string to an integer representation. Parameters ---------- quality_sequence : Iterable[str] An iterable of Illumina base call qualities in ASCII encoding Returns ------- numeric_qualities : List[int] A list of Illumina base call qualities converted to integers """ return [ ord(c) - 33 for c in quality_sequence ] # todo look up if this is accurate @staticmethod def _quality_above_threshold( threshold: int, quality_sequence: Sequence[int] ) -> float: """Calculate the fraction of bases called with a quality above ``threshold``. Parameters ---------- threshold: int The quality threshold quality_sequence: Sequence[int] A sequence of Illumina base qualities Returns ------- fraction : float The fraction of bases in ``quality_sequence`` with quality greater than ``threshold`` """ return sum(1 for base in quality_sequence if base > threshold) / len( quality_sequence ) def _is_noise(self, record: pysam.AlignedSegment) -> bool: return NotImplemented # todo required because 10x measures this
[docs] def parse_molecule( self, tags: Sequence[str], records: Iterable[pysam.AlignedSegment] ) -> None: """Parse information from all records of a molecule. The parsed information is stored in the MetricAggregator in-place. Parameters ---------- tags : Sequence[str] all the tags that define this molecule. one of {[CB, GE, UB], [GE, CB, UB]} records : Iterable[pysam.AlignedSegment] the sam records associated with the molecule """ for record in records: # todo think about how I could use the duplicate tag to reduce computation; duplicates # should normally come in order in a sorted file # extract sub-class-specific information self.parse_extra_fields(tags=tags, record=record) self.n_reads += 1 # self.noise_reads += self.is_noise(record) # todo implement me # the tags passed to this function define a molecule, this increments the counter, # identifying a new molecule only if a new tag combination is observed self._molecule_histogram[tags] += 1 self._molecule_barcode_fraction_bases_above_30.update( self._quality_above_threshold( 30, self._quality_string_to_numeric( record.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY) ), ) ) # we should be tolerant and handle it if the pysam.AlignedSegment.get_tag # cannot retrieve the data by a tag since it's not a fatal error try: self.perfect_molecule_barcodes += record.get_tag( consts.RAW_MOLECULE_BARCODE_TAG_KEY ) == record.get_tag(consts.MOLECULE_BARCODE_TAG_KEY) except KeyError: # An error occurred while retrieving the data from the optional alighment section, which # indicates that the read did not have a corrected UMI sequence. In the future we would like to # keep track of these reads. pass self._genomic_reads_fraction_bases_quality_above_30.update( self._quality_above_threshold(30, record.query_alignment_qualities) ) mean_alignment_quality: float = np.mean(record.query_alignment_qualities) self._genomic_read_quality.update(mean_alignment_quality) # the remaining portions deal with aligned reads, so if the read is not mapped, we are # done with it if record.is_unmapped: continue # get components that define a unique sequence fragment and increment the histogram position: int = record.pos strand: bool = record.is_reverse reference: int = record.reference_id self._fragment_histogram[reference, position, strand, tags] += 1 alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) if alignment_location == consts.CODING_ALIGNMENT_LOCATION_TAG_VALUE: self.reads_mapped_exonic += 1 elif alignment_location == consts.INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE: self.reads_mapped_intronic += 1 elif alignment_location == consts.UTR_ALIGNMENT_LOCATION_TAG_VALUE: self.reads_mapped_utr += 1 # todo check if read maps outside window (needs gene model) # todo create distances from terminate side (needs gene model) # uniqueness number_mappings = record.get_tag(consts.NUMBER_OF_HITS_TAG_KEY) if number_mappings == 1: self.reads_mapped_uniquely += 1 else: self.reads_mapped_multiple += ( 1 # todo without multi-mapping, this number is zero! ) if record.is_duplicate: self.duplicate_reads += 1 # cigar N field (3) indicates a read is spliced if the value is non-zero cigar_stats, num_blocks = record.get_cigar_stats() if cigar_stats[3]: self.spliced_reads += 1 # todo figure out antisense and make this notation clearer; info likely in dropseqtools self._plus_strand_reads += not record.is_reverse
[docs] def parse_extra_fields( self, tags: Sequence[str], record: pysam.AlignedSegment ) -> None: """Defined by subclasses to extract class-specific information from molecules""" raise NotImplementedError
[docs] def finalize(self) -> None: """Calculate metrics that require information from all molecules of an entity ``finalize()`` replaces attributes in-place that were initialized by the constructor as ``None`` with a value calculated across all molecule data that has been aggregated. """ self.molecule_barcode_fraction_bases_above_30_mean: float = self._molecule_barcode_fraction_bases_above_30.mean self.molecule_barcode_fraction_bases_above_30_variance: float = self._molecule_barcode_fraction_bases_above_30.calculate_variance() self.genomic_reads_fraction_bases_quality_above_30_mean: float = self._genomic_reads_fraction_bases_quality_above_30.mean self.genomic_reads_fraction_bases_quality_above_30_variance: float = self._genomic_reads_fraction_bases_quality_above_30.calculate_variance() self.genomic_read_quality_mean: float = self._genomic_read_quality.mean self.genomic_read_quality_variance: float = self._genomic_read_quality.calculate_variance() self.n_molecules: int = len(self._molecule_histogram.keys()) self.n_fragments: int = len(self._fragment_histogram.keys()) try: self.reads_per_molecule: float = self.n_reads / self.n_molecules except ZeroDivisionError: self.reads_per_molecule: float = float("nan") try: self.reads_per_fragment: float = self.n_reads / self.n_fragments except ZeroDivisionError: self.reads_per_fragment: float = float("nan") try: self.fragments_per_molecule: float = self.n_fragments / self.n_molecules except ZeroDivisionError: self.fragments_per_molecule: float = float("nan") self.fragments_with_single_read_evidence: int = sum( 1 for v in self._fragment_histogram.values() if v == 1 ) self.molecules_with_single_read_evidence: int = sum( 1 for v in self._molecule_histogram.values() if v == 1 )
[docs]class CellMetrics(MetricAggregator): """Cell Metric Aggregator Aggregator that captures metric information about a cell by parsing all of the molecules in an experiment that were annotated with a specific cell barcode, as recorded in the ``CB`` tag. Attributes ---------- perfect_cell_barcodes : int The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) reads_mapped_intergenic : int The number of reads mapped to an intergenic region for this cell reads_mapped_too_many_loci : int The number of reads that were mapped to too many loci across the genome and as a consequence, are reported unmapped by the aligner cell_barcode_fraction_bases_above_30_variance : float The variance of the fraction of Illumina base calls for the cell barcode sequence that are greater than 30, across molecules cell_barcode_fraction_bases_above_30_mean : float The average fraction of Illumina base calls for the cell barcode sequence that are greater than 30, across molecules n_genes : int The number of genes detected by this cell genes_detected_multiple_observations : int The number of genes that are observed by more than one read in this cell n_mitochondrial_genes: int The number of mitochondrial genes detected by this cell n_mitochondrial_molecules: int The number of molecules from mitochondrial genes detected for this cell pct_mitochondrial_molecules: int The percentage of molecules from mitochondrial genes detected for this cell """ extra_docs = """ Examples -------- # todo implement me See Also -------- GeneMetrics """ __doc__ += MetricAggregator.__doc__ + extra_docs def __init__(self): super().__init__() # barcode quality data self._cell_barcode_fraction_bases_above_30 = OnlineGaussianSufficientStatistic() self.perfect_cell_barcodes = 0 # inv: fraction cells with errors # track non-transcriptomic reads self.reads_mapped_intergenic = 0 self.reads_unmapped = 0 self.reads_mapped_too_many_loci = 0 self._genes_histogram = Counter() # todo think about whether we can build molecule models that map to things that aren't genes # i.e. to integentic regions or intronic regions. This could be a part of multi-mapping # self.molecules_mapped_intergenic = 0 self.cell_barcode_fraction_bases_above_30_variance: float = None self.cell_barcode_fraction_bases_above_30_mean: float = None self.n_genes: int = None self.genes_detected_multiple_observations: int = None self.n_mitochondrial_genes: int = None self.n_mitochondrial_molecules: int = None self.pct_mitochondrial_molecules: float = None
[docs] def finalize(self, mitochondrial_genes=set()): super().finalize() self.cell_barcode_fraction_bases_above_30_mean: float = self._cell_barcode_fraction_bases_above_30.mean self.cell_barcode_fraction_bases_above_30_variance: float = self._cell_barcode_fraction_bases_above_30.calculate_variance() self.n_genes: int = len(self._genes_histogram.keys()) self.genes_detected_multiple_observations: int = sum( 1 for v in self._genes_histogram.values() if v > 1 ) self.n_mitochondrial_genes: int = sum( 1 for g in self._genes_histogram.keys() if g in mitochondrial_genes ) self.n_mitochondrial_molecules: int = sum( c for g, c in self._genes_histogram.items() if g in mitochondrial_genes ) if self.n_mitochondrial_molecules: tot_molecules = sum(self._genes_histogram.values()) self.pct_mitochondrial_molecules = ( self.n_mitochondrial_molecules / tot_molecules * 100.0 ) else: self.pct_mitochondrial_molecules = 0.00
[docs] def parse_extra_fields( self, tags: Sequence[str], record: pysam.AlignedSegment ) -> None: """Parses a record to extract gene-specific information Gene-specific metric data is stored in-place in the MetricAggregator Parameters ---------- tags : Sequence[str] The GE, UB and CB tags that define this molecule record : pysam.AlignedSegment SAM record to be parsed """ self._cell_barcode_fraction_bases_above_30.update( self._quality_above_threshold( 30, self._quality_string_to_numeric( record.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY) ), ) ) # Exclude reads that do not have a CB tag from the perfect_cell_barcodes count if record.has_tag(consts.CELL_BARCODE_TAG_KEY): raw_cell_barcode_tag = record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) cell_barcode_tag = record.get_tag(consts.CELL_BARCODE_TAG_KEY) self.perfect_cell_barcodes += raw_cell_barcode_tag == cell_barcode_tag try: alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) if alignment_location == consts.INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE: self.reads_mapped_intergenic += 1 except KeyError: self.reads_unmapped += 1 # todo track reads_mapped_too_many_loci after multi-alignment is done self._genes_histogram[tags[2]] += 1 # note that no gene == None
[docs]class GeneMetrics(MetricAggregator): """Gene Metric Aggregator Aggregator that captures metric information about a gene by parsing all of the molecules in an experiment that were annotated with a specific gene ID, as recorded in the ``GE`` tag. Attributes ---------- number_cells_detected_multiple : int The number of cells which observe more than one read of this gene number_cells_expressing : int The number of cells that detect this gene """ extra_docs = """ Examples -------- # todo implement me See Also -------- CellMetrics """ __doc__ += MetricAggregator.__doc__ + extra_docs def __init__(self): super().__init__() self._cells_histogram = Counter() # todo we don't tag exon right now. Not sure if we want to or not # self._exon_histogram = Counter() self.number_cells_detected_multiple: int = None self.number_cells_expressing: int = None
[docs] def finalize(self): super().finalize() self.number_cells_expressing: int = len(self._cells_histogram.keys()) self.number_cells_detected_multiple: int = sum( 1 for c in self._cells_histogram.values() if c > 1 )
[docs] def parse_extra_fields( self, tags: Sequence[str], record: pysam.AlignedSegment ) -> None: """Parses a record to extract cell-specific information Cell-specific metric data is stored in-place in the MetricAggregator Parameters ---------- tags : Sequence[str] The CB, UB and GE tags that define this molecule record : pysam.AlignedSegment SAM record to be parsed """ self._cells_histogram[tags[1]] += 1