Source code for sctools.encodings

"""
Compressed Barcode Encoding Methods
===================================

.. currentmodule:: sctools

This module defines several classes to encode DNA sequences in memory-efficient forms, using 2 bits
to encode bases of a 4-letter DNA alphabet (ACGT) or 3 bits to encode a 5-letter DNA alphabet
that includes the ambiguous call often included by Illumina base calling software (ACGTN). The
classes also contain several methods useful for efficient querying and manipulation of the encoded
sequence.

Classes
-------
Encoding                            Encoder base class
ThreeBit                            Three bit DNA encoder / decoder
TwoBit                              Two bit DNA encoder / decoder

"""

import random
from typing import Mapping, AnyStr, Set


[docs]class Encoding: """ Attributes ---------- encoding_map : TwoBitEncodingMap Class that mimics a Mapping[bytes, str] where bytes must be a single byte encoded character (encoder) decoding_map : Mapping[int, bytes] Dictionary that maps integers to bytes human-readable representations (decoder) bits_per_base : int number of bits used to encode each base Methods ------- encode(bytes_encoded: bytes) encode a DNA string in a compressed representation decode(integer_encoded: int) decode a compressed DNA string into a human readable bytes format gc_content(integer_encoded: int) calculate the GC content of an encoded DNA string hamming_distance(a: int, b: int) calculate the hamming distance between two encoded DNA strings """ encoding_map: Mapping[AnyStr, int] = NotImplemented decoding_map: Mapping[int, AnyStr] = NotImplemented bits_per_base: int = NotImplemented
[docs] @classmethod def encode(cls, bytes_encoded: bytes) -> int: """Encode a DNA bytes string. Parameters ---------- bytes_encoded : bytes bytes DNA string Returns ------- encoded : int Encoded DNA sequence """ raise NotImplementedError
[docs] def decode(self, integer_encoded: int) -> bytes: """Decode a DNA bytes string. Parameters ---------- integer_encoded : bytes Integer encoded DNA string Returns ------- decoded : bytes Bytes decoded DNA sequence """ raise NotImplementedError
[docs] def gc_content(self, integer_encoded: int) -> int: """Return the number of G or C nucleotides in `integer_encoded` Parameters ---------- integer_encoded : int Integer encoded DNA string Returns ------- gc_content, int number of bases in `integer_encoded` input that are G or C. """ raise NotImplementedError
[docs] @staticmethod def hamming_distance(a, b) -> int: """Calculate the hamming distance between two DNA sequences The hamming distance counts the number of bases that are not the same nucleotide Parameters ---------- a, b : int integer encoded Returns ------- d : int hamming distance between a and b """ raise NotImplementedError
[docs]class TwoBit(Encoding): """Encode a DNA sequence using a 2-bit encoding. Two-bit encoding uses 0 for an encoded nucleotide. As such, it cannot distinguish between the end of sequence and trailing A nucleotides, and thus decoding these strings requires knowledge of their length. Therefore, it is only appropriate for encoding fixed sequence lengths In addition, in order to encode in 2-bit, N-nucleotides must be randomized to one of A, C, G, and T. Parameters ---------- sequence_length : int number of nucleotides that are being encoded """ __doc__ += Encoding.__doc__ def __init__(self, sequence_length: int): self.sequence_length: int = sequence_length
[docs] class TwoBitEncodingMap: """Dict-like class that maps bytes to 2-bit integer representations Generates random nucleotides for ambiguous nucleotides e.g. N """ map_ = { ord("A"): 0, ord("C"): 1, ord("T"): 2, ord("G"): 3, ord("a"): 0, ord("c"): 1, ord("t"): 2, ord("g"): 3, } iupac_ambiguous: Set[int] = {ord(c) for c in "MRWSYKVHDBNmrwsykvhdbn"} def __getitem__(self, byte: int) -> int: try: return self.map_[byte] except KeyError: if byte not in self.iupac_ambiguous: raise KeyError(f"{chr(byte)} is not a valid IUPAC nucleotide code") return random.randint(0, 3)
encoding_map: TwoBitEncodingMap = TwoBitEncodingMap() decoding_map: Mapping[int, bytes] = {0: b"A", 1: b"C", 2: b"T", 3: b"G"} bits_per_base: int = 2
[docs] @classmethod def encode(cls, bytes_encoded: bytes) -> int: encoded = 0 for character in bytes_encoded: encoded <<= 2 encoded += cls.encoding_map[character] return encoded
[docs] def decode(self, integer_encoded: int) -> bytes: decoded = b"" for _ in range(self.sequence_length): decoded = self.decoding_map[integer_encoded & 3] + decoded integer_encoded >>= 2 return decoded
[docs] def gc_content(self, integer_encoded: int) -> int: i = 0 for _ in range(self.sequence_length): i += integer_encoded & 1 integer_encoded >>= 2 return i
[docs] @staticmethod def hamming_distance(a: int, b: int) -> int: difference = a ^ b d_hamming = 0 while difference: if difference & 3: d_hamming += 1 difference >>= 2 return d_hamming
[docs]class ThreeBit(Encoding): """Encode a DNA sequence using a 3-bit encoding. Since no bases are encoded as 0, an empty triplet is interpreted as the end of the encoded string; Three-bit encoding can be used to encode and decode strings without knowledge of their length. """ __doc__ += Encoding.__doc__ def __init__(self, *args, **kwargs): """ Notes ----- args and kwargs are not used, but allow ThreeBit to be initialized the same way as TwoBit, despite not requiring a sequence length parameter. """ pass
[docs] class ThreeBitEncodingMap: """Dict-like class that maps bytes to 3-bit integer representations All IUPAC ambiguous codes are treated as "N" """ # C: 1, A: 2, G: 3, T: 4, N: 6; # note, not using 0 map_ = { ord("C"): 1, ord("A"): 2, ord("G"): 3, ord("T"): 4, ord("N"): 6, ord("c"): 1, ord("a"): 2, ord("g"): 3, ord("t"): 4, ord("n"): 6, } def __getitem__(self, byte: int) -> int: try: return self.map_[byte] except KeyError: return 6 # any non-standard nucleotide gets "N"
encoding_map: ThreeBitEncodingMap = ThreeBitEncodingMap() decoding_map: Mapping[int, bytes] = {1: b"C", 2: b"A", 3: b"G", 4: b"T", 6: b"N"} bits_per_base: int = 3
[docs] @classmethod def encode(cls, bytes_encoded: bytes) -> int: encoded = 0 for character in bytes_encoded: encoded <<= 3 encoded += cls.encoding_map[character] return encoded
[docs] @classmethod def decode(cls, integer_encoded: int) -> bytes: decoded = b"" while integer_encoded: decoded = cls.decoding_map[integer_encoded & 7] + decoded integer_encoded >>= 3 return decoded
[docs] @classmethod def gc_content(cls, integer_encoded: int) -> int: i = 0 while integer_encoded: i += integer_encoded & 1 integer_encoded >>= 3 return i
[docs] @staticmethod def hamming_distance(a: int, b: int) -> int: difference = a ^ b d_hamming = 0 while difference: if difference & 7: d_hamming += 1 difference >>= 3 return d_hamming