"""
Sequence Metric Aggregators
===========================
.. currentmodule:: sctools.metrics
This module provides classes useful for aggregating metric information for individual cells or
genes. These classes consume BAM files that have been pre-sorted such that all sequencing reads
that correspond to the molecules of a cell (CellMetrics) or the molecules of a gene (GeneMetrics)
are yielded sequentially.
Classes
-------
.. autosummary::
:toctree: generated/
MetricAggregatorBase Aggregator Base Class
GeneMetrics Class to iteratively calculate metrics for a gene (by molecule)
CellMetrics Class to iteratively calculate metrics for a cell (by molecule)
Notes
-----
This module can be rewritten with dataclass when python 3.7 stabilizes, see
https://www.python.org/dev/peps/pep-0557/
See Also
--------
sctools.metrics.gatherer
sctools.metrics.merge
sctools.metrics.writer
"""
from typing import Iterable, Tuple, Counter, List, Sequence
import numpy as np
import pysam
from sctools import consts
from sctools.stats import OnlineGaussianSufficientStatistic
[docs]class MetricAggregator:
"""Metric Aggregator Base Class
The ``MetricAggregator`` class defines a set of metrics that can be extracted from an
aligned bam file. It defines all the metrics that are general across genes and cells. This
class is subclassed by ``GeneMetrics`` and ``CellMetrics``, which define data-specific metrics
in the ``parse_extra_fields`` method. An instance of ``GeneMetrics`` or ``CellMetrics`` is
instantiated for each gene or molecule in a bam file, respectively.
Attributes
----------
n_reads : int
The number of reads associated with this entity
noise_reads : int, NotImplemented
Number of reads that are categorized by 10x genomics cellranger as "noise". Refers to
long polymers, or reads with high numbers of N (ambiguous) nucleotides
perfect_molecule_barcodes : int
The number of reads with molecule barcodes that have no errors (cell barcode tag == raw barcode tag)
reads_mapped_exonic : int
The number of reads for this entity that are mapped to exons
reads_mapped_intronic : int
The number of reads for this entity that are mapped to introns
reads_mapped_utr : int
The number of reads for this entity that are mapped to 3' untranslated regions (UTRs)
reads_mapped_uniquely : int
The number of reads mapped to a single unambiguous location in the genome
reads_mapped_multiple : int
The number of reads mapped to multiple genomic positions with equal confidence
# todo make sure equal confidence is accurate
duplicate_reads : int
The number of reads that are duplicates (see README.md for defition of a duplicate)
spliced_reads : int
The number of reads that overlap splicing junctions
antisense_reads : int
The number of reads that are mapped to the antisense strand instead of the transcribed
strand
molecule_barcode_fraction_bases_above_30_mean : float
The average fraction of bases in molecule barcodes that receive quality scores greater than
30 across the reads of this entity
molecule_barcode_fraction_bases_above_30_variance : float
The variance in the fraction of bases in molecule barcodes that receive quality scores
greater than 30 across the reads of this entity
genomic_reads_fraction_bases_quality_above_30_mean : float
The average fraction of bases in the genomic read that receive quality scores greater than
30 across the reads of this entity (included for 10x cell ranger count comparison)
genomic_reads_fraction_bases_quality_above_30_variance : float
The variance in the fraction of bases in the genomic read that receive quality scores
greater than 30 across the reads of this entity (included for 10x cell ranger count
comparison)
genomic_read_quality_mean : float
Average quality of Illumina base calls in the genomic reads corresponding to this entity
genomic_read_quality_variance : float
Variance in quality of Illumina base calls in the genomic reads corresponding to this
entity
n_molecules : float
Number of molecules corresponding to this entity. See README.md for the definition of a
Molecule
n_fragments : float
Number of fragments corresponding to this entity. See README.md for the definition of a
Fragment
reads_per_molecule : float
The average number of reads associated with each molecule in this entity
reads_per_fragment : float
The average number of reads associated with each fragment in this entity
fragments_per_molecule : float
The average number of fragments associated with each molecule in this entity
fragments_with_single_read_evidence : int
The number of fragments associated with this entity that are observed by only one read
molecules_with_single_read_evidence : int
The number of molecules associated with this entity that are observed by only one read
Methods
-------
parse_extra_fields(tags, record), NotImplemented
Abstract method that must be implemented by subclasses. Called by ``parse_molecule()``
to gather information for subclass-specific metrics
parse_molecule(tags, record)
Extract information from a set of sequencing reads that correspond to a molecule and store
the data in the MetricAggregator class.
finalize()
Some metrics cannot be calculated until all the information for an entity has been
aggregated, for example, the number of `fragments_per_molecule`. Finalize calculates all
such higher-order metrics
"""
def __init__(self):
# type definitions
Chromosome: int
Strand: bool # reverse = True, see pysam.AlignedSegment.is_reverse
Position: int
Fragment: Tuple[Chromosome, Position, Strand] # noqa: F821
# count information
self.n_reads: int = 0
self.noise_reads: int = 0 # long polymers, N-sequences; NotImplemented
self._fragment_histogram: Counter[Fragment] = Counter() # noqa: F821
self._molecule_histogram: Counter[str] = Counter()
# molecule information
self._molecule_barcode_fraction_bases_above_30 = (
OnlineGaussianSufficientStatistic()
)
self.perfect_molecule_barcodes = 0
self._genomic_reads_fraction_bases_quality_above_30 = (
OnlineGaussianSufficientStatistic()
)
self._genomic_read_quality = OnlineGaussianSufficientStatistic()
# alignment location information
self.reads_mapped_exonic = 0
self.reads_mapped_intronic = 0
self.reads_mapped_utr = 0
# todo implement this once we have a gene model
# self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR
# self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic()
# alignment uniqueness information
self.reads_mapped_uniquely = 0
self.reads_mapped_multiple = 0
self.duplicate_reads = 0
# alignment splicing information
self.spliced_reads = 0
self.antisense_reads = 0
self._plus_strand_reads = 0 # strand balance # todo implement property here
# higher-order methods, filled in by finalize() when all data is extracted
self.molecule_barcode_fraction_bases_above_30_mean: float = None
self.molecule_barcode_fraction_bases_above_30_variance: float = None
self.genomic_reads_fraction_bases_quality_above_30_mean: float = None
self.genomic_reads_fraction_bases_quality_above_30_variance: float = None
self.genomic_read_quality_mean: float = None
self.genomic_read_quality_variance: float = None
self.n_molecules: float = None
self.n_fragments: float = None
self.reads_per_molecule: float = None
self.reads_per_fragment: float = None
self.fragments_per_molecule: float = None
self.fragments_with_single_read_evidence: int = None
self.molecules_with_single_read_evidence: int = None
@staticmethod
def _quality_string_to_numeric(quality_sequence: Iterable[str]) -> List[int]:
"""Convert an HTSlib ASCII quality string to an integer representation.
Parameters
----------
quality_sequence : Iterable[str]
An iterable of Illumina base call qualities in ASCII encoding
Returns
-------
numeric_qualities : List[int]
A list of Illumina base call qualities converted to integers
"""
return [
ord(c) - 33 for c in quality_sequence
] # todo look up if this is accurate
@staticmethod
def _quality_above_threshold(
threshold: int, quality_sequence: Sequence[int]
) -> float:
"""Calculate the fraction of bases called with a quality above ``threshold``.
Parameters
----------
threshold: int
The quality threshold
quality_sequence: Sequence[int]
A sequence of Illumina base qualities
Returns
-------
fraction : float
The fraction of bases in ``quality_sequence`` with quality greater than ``threshold``
"""
return sum(1 for base in quality_sequence if base > threshold) / len(
quality_sequence
)
def _is_noise(self, record: pysam.AlignedSegment) -> bool:
return NotImplemented # todo required because 10x measures this
[docs] def parse_molecule(
self, tags: Sequence[str], records: Iterable[pysam.AlignedSegment]
) -> None:
"""Parse information from all records of a molecule.
The parsed information is stored in the MetricAggregator in-place.
Parameters
----------
tags : Sequence[str]
all the tags that define this molecule. one of {[CB, GE, UB], [GE, CB, UB]}
records : Iterable[pysam.AlignedSegment]
the sam records associated with the molecule
"""
for record in records:
# todo think about how I could use the duplicate tag to reduce computation; duplicates
# should normally come in order in a sorted file
# extract sub-class-specific information
self.parse_extra_fields(tags=tags, record=record)
self.n_reads += 1
# self.noise_reads += self.is_noise(record) # todo implement me
# the tags passed to this function define a molecule, this increments the counter,
# identifying a new molecule only if a new tag combination is observed
self._molecule_histogram[tags] += 1
self._molecule_barcode_fraction_bases_above_30.update(
self._quality_above_threshold(
30,
self._quality_string_to_numeric(
record.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY)
),
)
)
# we should be tolerant and handle it if the pysam.AlignedSegment.get_tag
# cannot retrieve the data by a tag since it's not a fatal error
try:
self.perfect_molecule_barcodes += record.get_tag(
consts.RAW_MOLECULE_BARCODE_TAG_KEY
) == record.get_tag(consts.MOLECULE_BARCODE_TAG_KEY)
except KeyError:
# An error occurred while retrieving the data from the optional alighment section, which
# indicates that the read did not have a corrected UMI sequence. In the future we would like to
# keep track of these reads.
pass
self._genomic_reads_fraction_bases_quality_above_30.update(
self._quality_above_threshold(30, record.query_alignment_qualities)
)
mean_alignment_quality: float = np.mean(record.query_alignment_qualities)
self._genomic_read_quality.update(mean_alignment_quality)
# the remaining portions deal with aligned reads, so if the read is not mapped, we are
# done with it
if record.is_unmapped:
continue
# get components that define a unique sequence fragment and increment the histogram
position: int = record.pos
strand: bool = record.is_reverse
reference: int = record.reference_id
self._fragment_histogram[reference, position, strand, tags] += 1
alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY)
if alignment_location == consts.CODING_ALIGNMENT_LOCATION_TAG_VALUE:
self.reads_mapped_exonic += 1
elif alignment_location == consts.INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE:
self.reads_mapped_intronic += 1
elif alignment_location == consts.UTR_ALIGNMENT_LOCATION_TAG_VALUE:
self.reads_mapped_utr += 1
# todo check if read maps outside window (needs gene model)
# todo create distances from terminate side (needs gene model)
# uniqueness
number_mappings = record.get_tag(consts.NUMBER_OF_HITS_TAG_KEY)
if number_mappings == 1:
self.reads_mapped_uniquely += 1
else:
self.reads_mapped_multiple += (
1 # todo without multi-mapping, this number is zero!
)
if record.is_duplicate:
self.duplicate_reads += 1
# cigar N field (3) indicates a read is spliced if the value is non-zero
cigar_stats, num_blocks = record.get_cigar_stats()
if cigar_stats[3]:
self.spliced_reads += 1
# todo figure out antisense and make this notation clearer; info likely in dropseqtools
self._plus_strand_reads += not record.is_reverse
[docs] def finalize(self) -> None:
"""Calculate metrics that require information from all molecules of an entity
``finalize()`` replaces attributes in-place that were initialized by the constructor as
``None`` with a value calculated across all molecule data that has been aggregated.
"""
self.molecule_barcode_fraction_bases_above_30_mean: float = self._molecule_barcode_fraction_bases_above_30.mean
self.molecule_barcode_fraction_bases_above_30_variance: float = self._molecule_barcode_fraction_bases_above_30.calculate_variance()
self.genomic_reads_fraction_bases_quality_above_30_mean: float = self._genomic_reads_fraction_bases_quality_above_30.mean
self.genomic_reads_fraction_bases_quality_above_30_variance: float = self._genomic_reads_fraction_bases_quality_above_30.calculate_variance()
self.genomic_read_quality_mean: float = self._genomic_read_quality.mean
self.genomic_read_quality_variance: float = self._genomic_read_quality.calculate_variance()
self.n_molecules: int = len(self._molecule_histogram.keys())
self.n_fragments: int = len(self._fragment_histogram.keys())
try:
self.reads_per_molecule: float = self.n_reads / self.n_molecules
except ZeroDivisionError:
self.reads_per_molecule: float = float("nan")
try:
self.reads_per_fragment: float = self.n_reads / self.n_fragments
except ZeroDivisionError:
self.reads_per_fragment: float = float("nan")
try:
self.fragments_per_molecule: float = self.n_fragments / self.n_molecules
except ZeroDivisionError:
self.fragments_per_molecule: float = float("nan")
self.fragments_with_single_read_evidence: int = sum(
1 for v in self._fragment_histogram.values() if v == 1
)
self.molecules_with_single_read_evidence: int = sum(
1 for v in self._molecule_histogram.values() if v == 1
)
[docs]class CellMetrics(MetricAggregator):
"""Cell Metric Aggregator
Aggregator that captures metric information about a cell by parsing all of the molecules in
an experiment that were annotated with a specific cell barcode, as recorded in the ``CB`` tag.
Attributes
----------
perfect_cell_barcodes : int
The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``)
reads_mapped_intergenic : int
The number of reads mapped to an intergenic region for this cell
reads_mapped_too_many_loci : int
The number of reads that were mapped to too many loci across the genome and as a
consequence, are reported unmapped by the aligner
cell_barcode_fraction_bases_above_30_variance : float
The variance of the fraction of Illumina base calls for the cell barcode sequence that
are greater than 30, across molecules
cell_barcode_fraction_bases_above_30_mean : float
The average fraction of Illumina base calls for the cell barcode sequence that
are greater than 30, across molecules
n_genes : int
The number of genes detected by this cell
genes_detected_multiple_observations : int
The number of genes that are observed by more than one read in this cell
n_mitochondrial_genes: int
The number of mitochondrial genes detected by this cell
n_mitochondrial_molecules: int
The number of molecules from mitochondrial genes detected for this cell
pct_mitochondrial_molecules: int
The percentage of molecules from mitochondrial genes detected for this cell
"""
extra_docs = """
Examples
--------
# todo implement me
See Also
--------
GeneMetrics
"""
__doc__ += MetricAggregator.__doc__ + extra_docs
def __init__(self):
super().__init__()
# barcode quality data
self._cell_barcode_fraction_bases_above_30 = OnlineGaussianSufficientStatistic()
self.perfect_cell_barcodes = 0 # inv: fraction cells with errors
# track non-transcriptomic reads
self.reads_mapped_intergenic = 0
self.reads_unmapped = 0
self.reads_mapped_too_many_loci = 0
self._genes_histogram = Counter()
# todo think about whether we can build molecule models that map to things that aren't genes
# i.e. to integentic regions or intronic regions. This could be a part of multi-mapping
# self.molecules_mapped_intergenic = 0
self.cell_barcode_fraction_bases_above_30_variance: float = None
self.cell_barcode_fraction_bases_above_30_mean: float = None
self.n_genes: int = None
self.genes_detected_multiple_observations: int = None
self.n_mitochondrial_genes: int = None
self.n_mitochondrial_molecules: int = None
self.pct_mitochondrial_molecules: float = None
[docs] def finalize(self, mitochondrial_genes=set()):
super().finalize()
self.cell_barcode_fraction_bases_above_30_mean: float = self._cell_barcode_fraction_bases_above_30.mean
self.cell_barcode_fraction_bases_above_30_variance: float = self._cell_barcode_fraction_bases_above_30.calculate_variance()
self.n_genes: int = len(self._genes_histogram.keys())
self.genes_detected_multiple_observations: int = sum(
1 for v in self._genes_histogram.values() if v > 1
)
self.n_mitochondrial_genes: int = sum(
1 for g in self._genes_histogram.keys() if g in mitochondrial_genes
)
self.n_mitochondrial_molecules: int = sum(
c for g, c in self._genes_histogram.items() if g in mitochondrial_genes
)
if self.n_mitochondrial_molecules:
tot_molecules = sum(self._genes_histogram.values())
self.pct_mitochondrial_molecules = (
self.n_mitochondrial_molecules / tot_molecules * 100.0
)
else:
self.pct_mitochondrial_molecules = 0.00
[docs]class GeneMetrics(MetricAggregator):
"""Gene Metric Aggregator
Aggregator that captures metric information about a gene by parsing all of the molecules in
an experiment that were annotated with a specific gene ID, as recorded in the ``GE`` tag.
Attributes
----------
number_cells_detected_multiple : int
The number of cells which observe more than one read of this gene
number_cells_expressing : int
The number of cells that detect this gene
"""
extra_docs = """
Examples
--------
# todo implement me
See Also
--------
CellMetrics
"""
__doc__ += MetricAggregator.__doc__ + extra_docs
def __init__(self):
super().__init__()
self._cells_histogram = Counter()
# todo we don't tag exon right now. Not sure if we want to or not
# self._exon_histogram = Counter()
self.number_cells_detected_multiple: int = None
self.number_cells_expressing: int = None
[docs] def finalize(self):
super().finalize()
self.number_cells_expressing: int = len(self._cells_histogram.keys())
self.number_cells_detected_multiple: int = sum(
1 for c in self._cells_histogram.values() if c > 1
)