Source code for sctools.metrics.gatherer
"""
Sequence Metric Gatherers
=========================
..currentmodule:: sctools.metrics
This module defines classes to gather metrics across the cells or genes of an experiment and write
them to gzip-compressed csv files
Classes
-------
.. autosummary::
:toctree: generated/
MetricGatherer Gatherer Base Class
GatherCellMetrics Class to gather metrics on all cells in an experiment
GatherGeneMetrics Class to gather metrics on all genes in an experiment
See Also
--------
sctools.metrics.aggregator
sctools.metrics.merge
sctools.metrics.writer
"""
from contextlib import closing
import pysam
from typing import Set
from sctools.bam import iter_cell_barcodes, iter_genes, iter_molecule_barcodes
from sctools.metrics.aggregator import CellMetrics, GeneMetrics
from sctools.metrics.writer import MetricCSVWriter
[docs]class MetricGatherer:
"""Gathers Metrics from an experiment
Because molecules tend to have relatively small numbers of reads, the memory footprint of
this method is typically small (tens of megabytes).
Parameters
----------
bam_file : str
the bam file containing the reads that metrics should be calculated from. Can be a chunk
of cells or an entire experiment
output_stem : str
the file stem for the gzipped csv output
Methods
-------
extract_metrics
extracts metrics from ``bam_file`` and writes them to output_stem.csv.gz
"""
[docs] def __init__(
self,
bam_file: str,
output_stem: str,
mitochondrial_gene_ids: Set[str] = set(),
compress: bool = True,
):
self._bam_file = bam_file
self._output_stem = output_stem
self._compress = compress
self._mitochondrial_gene_ids = mitochondrial_gene_ids
@property
def bam_file(self) -> str:
"""the bam file that metrics are generated from"""
return self._bam_file
[docs] def extract_metrics(self, mode="rb") -> None:
"""extract metrics from the provided bam file and write the results to csv.
Parameters
----------
mode : {'r', 'rb'}, default 'rb'
the open mode for pysam.AlignmentFile. 'r' indicates the input is a sam file, and 'rb'
indicates a bam file.
"""
raise NotImplementedError
[docs]class GatherCellMetrics(MetricGatherer):
extra_docs = """
Notes
-----
``bam_file`` must be sorted by gene (``GE``), molecule (``UB``), and cell (``CB``), where gene
varies fastest.
Examples
--------
>>> from sctools.metrics.gatherer import GatherCellMetrics
>>> import os, tempfile
>>> # example data
>>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
>>> temp_dir = tempfile.mkdtemp()
>>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
>>> g.extract_metrics()
See Also
--------
GatherGeneMetrics
"""
__doc__ += extra_docs
[docs] def extract_metrics(self, mode: str = "rb") -> None:
"""Extract cell metrics from self.bam_file
Parameters
----------
mode : str, optional
Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').
"""
# open the files
with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
MetricCSVWriter(self._output_stem, self._compress)
) as cell_metrics_output:
# write the header
cell_metrics_output.write_header(vars(CellMetrics()))
# break up the bam file into sub-iterators over cell barcodes
for cell_iterator, cell_tag in iter_cell_barcodes(
bam_iterator=bam_iterator
):
metric_aggregator = CellMetrics()
# break up cell barcodes by molecule barcodes
for molecule_iterator, molecule_tag in iter_molecule_barcodes(
bam_iterator=cell_iterator
):
# break up molecule barcodes by gene ids
for gene_iterator, gene_tag in iter_genes(
bam_iterator=molecule_iterator
):
# process the data
metric_aggregator.parse_molecule(
tags=(cell_tag, molecule_tag, gene_tag),
records=gene_iterator,
)
# write a record for each cell
metric_aggregator.finalize(
mitochondrial_genes=self._mitochondrial_gene_ids
)
cell_metrics_output.write(cell_tag, vars(metric_aggregator))
[docs]class GatherGeneMetrics(MetricGatherer):
extra_docs = """
Notes
-----
``bam_file`` must be sorted by molecule (``UB``), cell (``CB``), and gene (``GE``), where
molecule varies fastest.
Examples
--------
>>> from sctools.metrics.gatherer import GatherCellMetrics
>>> import os, tempfile
>>> # example data
>>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
>>> temp_dir = tempfile.mkdtemp()
>>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
>>> g.extract_metrics()
See Also
--------
GatherGeneMetrics
"""
__doc__ += extra_docs
[docs] def extract_metrics(self, mode: str = "rb") -> None:
"""Extract gene metrics from self.bam_file
Parameters
----------
mode : str, optional
Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').
"""
# open the files
with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
MetricCSVWriter(self._output_stem, self._compress)
) as gene_metrics_output:
# write the header
gene_metrics_output.write_header(vars(GeneMetrics()))
# break up the bam file into sub-iterators over gene ids
for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
metric_aggregator = GeneMetrics()
# in case of multi-genes ignore as in the counting stage
if gene_tag and len(gene_tag.split(",")) > 1:
continue
# break up gene ids by cell barcodes
for cell_iterator, cell_tag in iter_cell_barcodes(
bam_iterator=gene_iterator
):
# break up cell barcodes by molecular barcodes
for molecule_iterator, molecule_tag in iter_molecule_barcodes(
bam_iterator=cell_iterator
):
# process the data
metric_aggregator.parse_molecule(
tags=(gene_tag, cell_tag, molecule_tag),
records=molecule_iterator,
)
# write a record for each gene id
metric_aggregator.finalize()
gene_metrics_output.write(gene_tag, vars(metric_aggregator))