Source code for sctools.platform
"""
Command Line Interface for SC Tools:
====================================
.. currentmodule:: sctools
This module defines the command line interface for SC Tools. Tools are separated into those that
are specific to particular chemistries (e.g. Smart-seq 2) or experimental platforms (e.g. 10x
Genomics v2) and those that are general across any sequencing experiment.
Currently, only general modules and those used for 10x v2 are implemented
Classes
-------
GenericPlatform Class containing all general command line utilities
TenXV2 Class containing 10x v2 specific command line utilities
"""
import argparse
from typing import Iterable, List, Dict, Set, Optional, Sequence
from itertools import chain
import pysam
from sctools import fastq, bam, metrics, count, consts, gtf, groups
[docs]class GenericPlatform:
"""Platform-agnostic command line functions available in SC Tools.
Platform-Agnostic Methods
-------------------------
tag_sort_bam():
sort a bam file by zero or more tags and then by queryname
verify_bam_sort():
verifies whether bam file is correctly sorted by given list of zero or more tags, then queryname
split_bam()
split a bam file into subfiles of equal size
calculate_gene_metrics()
calculate information about genes captured by a sequencing experiment
calculate_cell_metrics()
calculate information about cells captured by a sequencing experiment
merge_gene_metrics()
merge multiple gene metrics files into a single output
merge_cell_metrics()
merge multiple cell metrics files into a single output
bam_to_count()
construct a compressed sparse row count file from a tagged, aligned bam file
merge_count_matrices()
merge multiple csr-format count matrices into a single csr matrix
group_qc_outputs()
aggregate Picard, HISAT2 and RSME QC statisitics
"""
[docs] @classmethod
def tag_sort_bam(cls, args: Iterable = None) -> int:
"""Command line entrypoint for sorting a bam file by zero or more tags, followed by queryname.
Parameters
----------
args : Iterable[str], optional
arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
description = "Sorts bam by list of zero or more tags, followed by query name"
parser = argparse.ArgumentParser(description=description)
parser.add_argument("-i", "--input_bam", required=True, help="input bamfile")
parser.add_argument("-o", "--output_bam", required=True, help="output bamfile")
parser.add_argument(
"-t",
"--tags",
nargs="+",
action="append",
help="tag(s) to sort by, separated by space, e.g. -t CB GE UB",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
tags = cls.get_tags(args.tags)
with pysam.AlignmentFile(args.input_bam, "rb") as f:
header = f.header
records = f.fetch(until_eof=True)
sorted_records = bam.sort_by_tags_and_queryname(records, tags)
with pysam.AlignmentFile(args.output_bam, "wb", header=header) as f:
for record in sorted_records:
f.write(record)
return 0
[docs] @classmethod
def verify_bam_sort(cls, args: Iterable = None) -> int:
"""Command line entrypoint for verifying bam is properly sorted by zero or more tags, followed by queryname.
Parameters
----------
args : Iterable[str], optional
arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
description = "Verifies whether bam is sorted by the list of zero or more tags, followed by query name"
parser = argparse.ArgumentParser(description=description)
parser.add_argument("-i", "--input_bam", required=True, help="input bamfile")
parser.add_argument(
"-t",
"--tags",
nargs="+",
action="append",
help="tag(s) to use to verify sorting, separated by space, e.g. -t CB GE UB",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
tags = cls.get_tags(args.tags)
with pysam.AlignmentFile(args.input_bam, "rb") as f:
aligned_segments = f.fetch(until_eof=True)
sortable_records = (
bam.TagSortableRecord.from_aligned_segment(r, tags)
for r in aligned_segments
)
bam.verify_sort(sortable_records, tags)
print(
"{0} is correctly sorted by {1} and query name".format(args.input_bam, tags)
)
return 0
[docs] @classmethod
def get_tags(cls, raw_tags: Optional[Sequence[str]]) -> Iterable[str]:
if raw_tags is None:
raw_tags = []
# Flattens into single list when tags specified like -t A -t B -t C
return [t for t in chain.from_iterable(raw_tags)]
[docs] @classmethod
def split_bam(cls, args: Iterable = None) -> int:
"""Command line entrypoint for splitting a bamfile into subfiles of equal size.
prints filenames of chunks to stdout
Parameters
----------
args : Iterable[str], optional
arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-b", "--bamfile", nargs="+", required=True, help="input bamfile"
)
parser.add_argument(
"-p", "--output-prefix", required=True, help="prefix for output chunks"
)
parser.add_argument(
"-s",
"--subfile-size",
required=False,
default=1000,
type=float,
help="approximate size target for each subfile (in MB)",
)
parser.add_argument(
"--num-processes",
required=False,
default=None,
type=int,
help="Number of processes to parallelize over",
)
parser.add_argument(
"-t",
"--tags",
nargs="+",
help="tag(s) to split bamfile over. Tags are checked sequentially, "
"and tags after the first are only checked if the first tag is "
"not present.",
)
parser.set_defaults(raise_missing=True)
parser.add_argument(
"--drop-missing",
action="store_false",
help="drop records without tag specified by -t/--tag (default "
"behavior is to raise an exception",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
filenames = bam.split(
args.bamfile,
args.output_prefix,
args.tags,
approx_mb_per_split=args.subfile_size,
raise_missing=args.drop_missing,
num_processes=args.num_processes,
)
print(" ".join(filenames))
return 0
[docs] @classmethod
def calculate_gene_metrics(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for calculating gene metrics from a sorted bamfile.
Writes metrics to .csv
Parameters
----------
args : Iterable[str], optional
arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-i", "--input-bam", required=True, help="Input bam file name."
)
parser.add_argument(
"-o", "--output-filestem", required=True, help="Output file stem."
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
gene_metric_gatherer = metrics.gatherer.GatherGeneMetrics(
args.input_bam, args.output_filestem
)
gene_metric_gatherer.extract_metrics()
return 0
[docs] @classmethod
def calculate_cell_metrics(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for calculating cell metrics from a sorted bamfile.
Writes metrics to .csv
Parameters
----------
args : Iterable[str], optional
Arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-i", "--input-bam", required=True, help="Input bam file name."
)
parser.add_argument(
"-o", "--output-filestem", required=True, help="Output file stem."
)
parser.add_argument(
"-a",
"--gtf-annotation-file",
required=False,
default=None,
help="gtf annotation file that bam_file was aligned against",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
# load mitochondrial gene ids from the annotation file
mitochondrial_gene_ids: Set(str) = set()
if args.gtf_annotation_file:
mitochondrial_gene_ids = gtf.get_mitochondrial_gene_names(
args.gtf_annotation_file
)
cell_metric_gatherer = metrics.gatherer.GatherCellMetrics(
args.input_bam, args.output_filestem, mitochondrial_gene_ids
)
cell_metric_gatherer.extract_metrics()
return 0
[docs] @classmethod
def merge_gene_metrics(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for merging multiple gene metrics files.
Merges multiple metrics inputs into a single metrics file that matches the shape and
order of the generated count matrix.
Parameters
----------
args : Iterable[str], optional
Arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument("metric_files", nargs="+", help="Input metric files")
parser.add_argument(
"-o", "--output-filestem", required=True, help="Output file stem."
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
merge = metrics.merge.MergeGeneMetrics(args.metric_files, args.output_filestem)
merge.execute()
return 0
[docs] @classmethod
def merge_cell_metrics(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for merging multiple cell metrics files.
Merges multiple metrics inputs into a single metrics file that matches the shape and
order of the generated count matrix.
Parameters
----------
args : Iterable[str], optional
Arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument("metric_files", nargs="+", help="Input metric files")
parser.add_argument(
"-o", "--output-filestem", required=True, help="Output file stem."
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
merge = metrics.merge.MergeCellMetrics(args.metric_files, args.output_filestem)
merge.execute()
return 0
[docs] @classmethod
def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for constructing a count matrix from a tagged bam file.
Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule
barcode, and gene id.
Parameters
----------
args : Iterable[str], optional
Arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.set_defaults(
cell_barcode_tag=consts.CELL_BARCODE_TAG_KEY,
molecule_barcode_tag=consts.MOLECULE_BARCODE_TAG_KEY,
gene_name_tag=consts.GENE_NAME_TAG_KEY,
sn_rna_seq_mode=False,
)
parser.add_argument("-b", "--bam-file", help="input_bam_file", required=True)
parser.add_argument(
"-o", "--output-prefix", help="file stem for count matrix", required=True
)
parser.add_argument(
"-a",
"--gtf-annotation-file",
required=True,
help="gtf annotation file that bam_file was aligned against",
)
parser.add_argument(
"-c",
"--cell-barcode-tag",
help=f"tag that identifies the cell barcode (default = {consts.CELL_BARCODE_TAG_KEY})",
)
parser.add_argument(
"-m",
"--molecule-barcode-tag",
help=f"tag that identifies the molecule barcode (default = {consts.MOLECULE_BARCODE_TAG_KEY})",
)
parser.add_argument(
"-g",
"--gene-id-tag",
help=f"tag that identifies the gene name (default = {consts.GENE_NAME_TAG_KEY})",
)
parser.add_argument(
"-n",
"--sn-rna-seq-mode",
action="store_true",
help=f"snRNA Seq mode (default = False)",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
# assume bam file unless the file explicitly has a sam suffix
open_mode = "r" if args.bam_file.endswith(".sam") else "rb"
# load gene names from the annotation file
gene_name_to_index: Dict[str, int] = gtf.extract_gene_names(
args.gtf_annotation_file
)
# For snRNA-seq we need the extended gene information
if args.sn_rna_seq_mode:
gene_locations = gtf.extract_extended_gene_names(args.gtf_annotation_file)
else:
gene_locations = None
matrix = count.CountMatrix.from_sorted_tagged_bam(
bam_file=args.bam_file,
gene_name_to_index=gene_name_to_index,
chromosomes_gene_locations_extended=gene_locations,
cell_barcode_tag=args.cell_barcode_tag,
molecule_barcode_tag=args.molecule_barcode_tag,
gene_name_tag=args.gene_id_tag,
open_mode=open_mode,
)
matrix.save(args.output_prefix)
return 0
[docs] @classmethod
def merge_count_matrices(cls, args: Iterable[str] = None) -> int:
"""Command line entrypoint for constructing a count matrix from a tagged bam file.
Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule
barcode, and gene id.
Parameters
----------
args : Iterable[str], optional
Arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--input-prefixes",
nargs="+",
help="prefix for count matrices to be concatenated. e.g. test_counts "
"for test_counts.npz, test_counts_col_index.npy, and test_counts_"
"row_index.npy",
)
parser.add_argument(
"-o", "--output-stem", help="file stem for merged csr matrix", required=True
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
count_matrix = count.CountMatrix.merge_matrices(args.input_prefixes)
count_matrix.save(args.output_stem)
return 0
[docs] @classmethod
def group_qc_outputs(cls, args: Iterable[str] = None) -> int:
"""Commandline entrypoint for parsing picard metrics files, hisat2 and rsem statistics log files.
Parameters
----------
args:
file_names: array of files
output_name: prefix of output file name.
metrics_type: Picard, PicardTable, HISAT2, RSEM and Core.
Returns
----------
return: 0
return if the program completes successfully.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-f",
"--file_names",
dest="file_names",
nargs="+",
required=True,
help="a list of files to be parsed out.",
)
parser.add_argument(
"-o",
"--output_name",
dest="output_name",
required=True,
help="The output file name",
)
parser.add_argument(
"-t",
"--metrics_type",
dest="metrics_type",
choices=["Picard", "PicardTable", "Core", "HISAT2", "RSEM"],
required=True,
help="a list of string to represent metrics types,such Picard, PicardTable, HISAT2,RSEM, Core",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
if args.metrics_type == "Picard":
groups.write_aggregated_picard_metrics_by_row(
args.file_names, args.output_name
)
elif args.metrics_type == "PicardTable":
groups.write_aggregated_picard_metrics_by_table(
args.file_names, args.output_name
)
elif args.metrics_type == "Core":
groups.write_aggregated_qc_metrics(args.file_names, args.output_name)
elif args.metrics_type == "HISAT2":
groups.parse_hisat2_log(args.file_names, args.output_name)
elif args.metrics_type == "RSEM":
groups.parse_rsem_cnt(args.file_names, args.output_name)
return 0
[docs]class TenXV2(GenericPlatform):
"""Command Line Interface for 10x Genomics v2 RNA-sequencing programs
This class defines several methods that are created as CLI tools when sctools is installed
(see setup.py)
Attributes
----------
cell_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the cell barcode and the tags to
assign the sequence and quality of the cell barcode
molecule_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the molecule barcode and the tags
to assign the sequence and quality of the molecule barcode
sample_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the sample barcode and the tags
to assign the sequence and quality of the sample barcode
Methods
-------
attach_barcodes()
Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse
(r2) bam file
"""
# 10x contains three barcodes embedded within sequencing reads. The below objects define the
# start and end points of those barcodes relative to the start of the sequence, and the
# GA4GH standard tags that the extracted barcodes should be labeled with in the BAM file.
cell_barcode = fastq.EmbeddedBarcode(
start=0,
end=16,
quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
)
molecule_barcode = fastq.EmbeddedBarcode(
start=16,
end=26,
quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
)
sample_barcode = fastq.EmbeddedBarcode(
start=0,
end=8,
quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY,
)
@classmethod
def _tag_bamfile(
cls,
input_bamfile_name: str,
output_bamfile_name: str,
tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator],
) -> None:
"""Adds tags from fastq file(s) to a bam file.
Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from
`input_bamfile_name`, and writes the result to `output_bamfile_name`
Parameters
----------
input_bamfile_name : str
input bam
output_bamfile_name : str
output bam
tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator]
Iterable of generators that yield barcodes from fastq files
"""
bam_tagger = bam.Tagger(input_bamfile_name)
bam_tagger.tag(output_bamfile_name, tag_generators)
@classmethod
def _make_tag_generators(
cls, r1, i1=None, whitelist=None
) -> List[fastq.EmbeddedBarcodeGenerator]:
"""Create tag generators from fastq files.
Tag generators are iterators that run over fastq records, they extract and yield all of the
barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and
optionally, the sample barcode.
Parameters
----------
r1 : str
forward fastq file
i1 : str, optional
index fastq file
whitelist : str, optional
A file that contains a list of acceptable cell barcodes
Returns
-------
tag_generators, List[EmbeddedBarcodeGenerator]
EmbeddedBarcodeGenerators containing barcodes from 10x fastq records
"""
tag_generators = []
# generator for cell and molecule barcodes
if whitelist is not None:
tag_generators.append(
fastq.BarcodeGeneratorWithCorrectedCellBarcodes(
fastq_files=r1,
embedded_cell_barcode=cls.cell_barcode,
whitelist=whitelist,
other_embedded_barcodes=[cls.molecule_barcode],
)
)
else:
tag_generators.append(
fastq.EmbeddedBarcodeGenerator(
fastq_files=r1,
embedded_barcodes=[cls.cell_barcode, cls.molecule_barcode],
)
)
# generator for sample barcodes
if i1 is not None:
tag_generators.append(
fastq.EmbeddedBarcodeGenerator(
fastq_files=i1, embedded_barcodes=[cls.sample_barcode]
)
)
return tag_generators
[docs] @classmethod
def attach_barcodes(cls, args=None):
"""Command line entrypoint for attaching barcodes to a bamfile.
Parameters
----------
args : Iterable[str], optional
arguments list, for testing (see test/test_entrypoints.py for example). The default
value of None, when passed to `parser.parse_args` causes the parser to
read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--r1",
required=True,
help="read 1 fastq file for a 10x genomics v2 experiment",
)
parser.add_argument(
"--u2",
required=True,
help="unaligned bam containing cDNA fragments. Can be converted from fastq read 2"
"using picard FastqToSam",
)
parser.add_argument(
"--i1",
default=None,
help="(optional) i7 index fastq file for a 10x genomics experiment",
)
parser.add_argument(
"-o", "--output-bamfile", required=True, help="filename for tagged bam"
)
parser.add_argument(
"-w",
"--whitelist",
default=None,
help="optional cell barcode whitelist. If provided, corrected barcodes "
"will also be output when barcodes are observed within 1ED of a "
"whitelisted barcode",
)
if args is not None:
args = parser.parse_args(args)
else:
args = parser.parse_args()
tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist)
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)
return 0
[docs]class BarcodePlatform(GenericPlatform):
"""Command Line Interface for extracting and attaching barcodes with specified positions
generalizing TenXV2 attach barcodes
Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the
corresponding barcode's start position and and length are provided. The sample barcode is extracted
from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file
This class defines several methods that are created as CLI tools when sctools is installed
(see setup.py)
Attributes
----------
cell_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the cell barcode and the tags to
assign the sequence and quality of the cell barcode
molecule_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the molecule barcode and the tags
to assign the sequence and quality of the molecule barcode
sample_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the sample barcode and the tags
to assign the sequence and quality of the sample barcode
Methods
-------
attach_barcodes()
Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse
(r2) bam file
"""
cell_barcode = None
molecule_barcode = None
sample_barcode = None
@classmethod
def _validate_barcode_args(cls, args):
"""Validates that the barcode start position is greater than 0
Parameters
----------
args : object
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`
Returns
-------
args : object
return arguments list if valid
"""
# check that if a barcode start position is provided, its length is also (and vice versa)
cls._validate_barcode_length_and_position(
args.cell_barcode_start_pos, args.cell_barcode_length
)
cls._validate_barcode_length_and_position(
args.molecule_barcode_start_pos, args.molecule_barcode_length
)
cls._validate_barcode_length_and_position(
args.sample_barcode_start_pos, args.sample_barcode_length
)
# check that an index fastq is provided sample barcode length and position are given
if args.i1 is None and args.sample_barcode_length:
raise argparse.ArgumentError(
"An i7 index fastq file must be given to attach a sample barcode"
)
# check that cell and molecule barcodes don't overlap
if args.cell_barcode_length and args.molecule_barcode_length:
cls._validate_barcode_input(
args.molecule_barcode_start_pos,
args.cell_barcode_start_pos + args.cell_barcode_length,
)
return args
@classmethod
def _validate_barcode_length_and_position(
cls, barcode_start_position, barcode_length
):
"""Checks that either that both barcode length and position are given or that neither are given as arguments
Parameters
----------
barcode_start_position : int
the user defined start position (base pairs) of the barcode
barcode_length : int
the user defined length (base pairs) of the barcode
Returns
-------
given_value : int
return given value if valid
"""
barcode_start_pos_exists = bool(barcode_start_position) or (
barcode_start_position == 0
)
barcode_length_exists = bool(barcode_length)
# (XOR boolean logic)
if barcode_start_pos_exists != barcode_length_exists:
raise argparse.ArgumentError(
"Invalid position/length, both position and length must be provided by the user together"
)
@classmethod
def _validate_barcode_input(cls, given_value, min_value):
"""Validates that the barcode input is greater than a min value
Parameters
----------
given_value : int
the given value that must be greater than the min_value,
(barcode length or barcode starting position)
min_value : int
the min value that the given_value must be greater than
Returns
-------
given_value : int
return given value if valid
"""
if given_value < min_value:
raise argparse.ArgumentTypeError("Invalid barcode length/position")
return given_value
@classmethod
def _validate_barcode_start_pos(cls, given_value):
"""Validates that the barcode start position is greater than 0
Parameters
----------
given_value : Union[int, str]
the given start position of the barcode to validate
Returns
-------
given_value : int
returns the start position if it is valid
"""
return cls._validate_barcode_input(int(given_value), 0)
@classmethod
def _validate_barcode_length(cls, given_value):
"""Validates that the barcode length is greater than 1
Parameters
----------
given_value : Union[int, str]
the given length of the barcode to validate
Returns
-------
given_value : int
returns the length if it is valid
"""
return cls._validate_barcode_input(int(given_value), 1)
@classmethod
def _tag_bamfile(
cls,
input_bamfile_name: str,
output_bamfile_name: str,
tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator],
) -> None:
"""Adds tags from fastq file(s) to a bam file.
Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from
`input_bamfile_name`, and writes the result to `output_bamfile_name`
Parameters
----------
input_bamfile_name : str
input bam
output_bamfile_name : str
output bam
tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator]
Iterable of generators that yield barcodes from fastq files
"""
bam_tagger = bam.Tagger(input_bamfile_name)
bam_tagger.tag(output_bamfile_name, tag_generators)
@classmethod
def _make_tag_generators(
cls, r1, i1=None, whitelist=None
) -> List[fastq.EmbeddedBarcodeGenerator]:
"""Create tag generators from fastq files.
Tag generators are iterators that run over fastq records, they extract and yield all of the
barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode.
Parameters
----------
r1 : str
forward fastq file, where possibly the cell and/or molecule barcode is found
i1 : str, optional
index fastq file, where the sample barcode is found
whitelist : str, optional
A file that contains a list of acceptable cell barcodes
Returns
-------
tag_generators : List[EmbeddedBarcodeGenerator]
EmbeddedBarcodeGenerators containing barcodes from the given fastq
"""
tag_generators = []
barcode_args = {"fastq_files": r1}
if i1:
sample_barcode_args = dict(barcode_args)
sample_barcode_args["embedded_barcodes"] = [cls.sample_barcode]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**sample_barcode_args))
if whitelist:
barcode_args["whitelist"] = whitelist
if cls.cell_barcode:
barcode_args["embedded_cell_barcode"] = cls.cell_barcode
if cls.molecule_barcode:
barcode_args["other_embedded_barcodes"] = [cls.molecule_barcode]
tag_generators.append(
fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)
)
else:
# for all the barcodes that have a length and starting position specified
barcode_args["embedded_barcodes"] = [
barcode
for barcode in [cls.cell_barcode, cls.molecule_barcode]
if barcode
]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args))
return tag_generators
[docs] @classmethod
def attach_barcodes(cls, args=None):
"""Command line entrypoint for attaching barcodes to a bamfile.
Parameters
----------
args : Iterable[str], optional
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--r1",
required=True,
help="read 1 fastq file, where the cell and molecule barcode is found",
)
parser.add_argument(
"--u2",
required=True,
help="unaligned bam, can be converted from fastq read 2"
"using picard FastqToSam",
)
parser.add_argument(
"-o", "--output-bamfile", required=True, help="filename for tagged bam"
)
parser.add_argument(
"-w",
"--whitelist",
default=None,
help="optional cell barcode whitelist. If provided, corrected barcodes "
"will also be output when barcodes are observed within 1ED of a "
"whitelisted barcode",
)
parser.add_argument(
"--i1",
default=None,
help="(optional) i7 index fastq file, where the sample barcode is found",
)
parser.add_argument(
"--sample-barcode-start-position",
dest="sample_barcode_start_pos",
default=None,
help="the user defined start position (base pairs) of the sample barcode",
type=cls._validate_barcode_start_pos,
)
parser.add_argument(
"--sample-barcode-length",
dest="sample_barcode_length",
default=None,
help="the user defined length (base pairs) of the sample barcode",
type=cls._validate_barcode_length,
)
parser.add_argument(
"--cell-barcode-start-position",
dest="cell_barcode_start_pos",
default=None,
help="the user defined start position, in base pairs, of the cell barcode",
type=cls._validate_barcode_start_pos,
)
parser.add_argument(
"--cell-barcode-length",
dest="cell_barcode_length",
default=None,
help="the user defined length, in base pairs, of the cell barcode",
type=cls._validate_barcode_length,
)
parser.add_argument(
"--molecule-barcode-start-position",
dest="molecule_barcode_start_pos",
default=None,
help="the user defined start position, in base pairs, of the molecule barcode "
"(must be not overlap cell barcode if cell barcode is provided)",
type=cls._validate_barcode_start_pos,
)
parser.add_argument(
"--molecule-barcode-length",
dest="molecule_barcode_length",
default=None,
help="the user defined length, in base pairs, of the molecule barcode",
type=cls._validate_barcode_length,
)
# parse and validate the args
if args:
args = parser.parse_args(args)
else:
args = parser.parse_args()
cls._validate_barcode_args(args)
# if the length and there for the start pos have been given as args
# get the appropriate barcodes
if args.cell_barcode_length:
cls.cell_barcode = fastq.EmbeddedBarcode(
start=args.cell_barcode_start_pos,
end=args.cell_barcode_start_pos + args.cell_barcode_length,
quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
)
if args.molecule_barcode_length:
cls.molecule_barcode = fastq.EmbeddedBarcode(
start=args.molecule_barcode_start_pos,
end=args.molecule_barcode_start_pos + args.molecule_barcode_length,
quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
)
if args.sample_barcode_length:
cls.sample_barcode = fastq.EmbeddedBarcode(
start=args.sample_barcode_start_pos,
end=args.sample_barcode_start_pos + args.sample_barcode_length,
quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY,
)
# make the tags and attach the barcodes
tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist)
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)
return 0