Source code for bioseq._sequence

import re

from collections import Counter
from typing import Dict, List, Optional, Tuple, TypeVar, Union

from bioseq import config, algorithm
from bioseq.config import AlignmentConfig


[docs]class Sequence: info: str _seq: str _composition: Dict[str, Union[int, float]] _weight: float
[docs] def __init__(self, seq: str, info: str = ""): """Base class of all sequence type Args: seq(str): Sequence info(str): Some information string about the sequence """ self._seq = seq.upper() self.info = info self.reset_cache()
def reset_cache(self): """ Reset cached property related with sequence, called when sequence changed """ self._composition = {} self._weight = 0. @property def composition(self) -> Dict[str, Union[int, float]]: """Analysis the composition of sequence Returns: Dict: Each element's appearance times or percentage in sequence """ if not self._composition: counter = dict(Counter(self._seq)) self._composition = {key: counter[key] for key in sorted(counter)} return self._composition @property def length(self) -> int: """ The length of sequence """ return len(self._seq) @property def seq(self) -> str: """ Read-only property, sequence can only be modified by ``mutation()`` """ return self._seq @property def weight(self) -> float: """Calculate the sequence's Molar mass by below function .. math:: weight_{seq} = \\sum_i^{length}weight_i - 18 * (length - 1) Returns: weight(float): Molar mass with unit of Dalton """ if not self._weight: weight_table = config.MW[self.__class__.__name__] self._weight = sum( [weight_table[e] for e in self._seq]) - 18 * (self.length - 1) return self._weight
[docs] def align(self, subject: Union[str, "Sequence"], mode: int = 1) \ -> Tuple[str, str, float]: """Align two sequence. Use ``bioseq.config.AlignmentConfig`` to set the alignment score, including match(2), mismatch(-3), gap_open(-3), gap_extend(-3). number in brackets is default value Args: subject(str|Sequence): Sequence to align mode(int): 1: Use Needleman-Wunsch to global alignment\n 2: Use Smith-Waterman to partial alignment Returns: tuple: query(str): Self sequence after alignment\n subject(str): Subject sequence after alignment\n score(int): align score if choose return_score """ if isinstance(subject, self.__class__): subject = subject._seq elif not isinstance(subject, str): raise TypeError( f"Only str or {self.__class__.__name__} can be aligned to {self.__class__.__name__}") args = (self._seq, subject, AlignmentConfig.MATCH, AlignmentConfig.MISMATCH, AlignmentConfig.GAP_OPEN, AlignmentConfig.GAP_EXTEND) if mode == 1: return algorithm.NeedlemanWunsch(*args) elif mode == 2: return algorithm.SmithWaterman(*args) else: raise TypeError("\ Please choose alignment mode:\n\ 1-Global alignment by Needleman-Wunsch\n\ 2-Local alignment by Smith-Waterman")
[docs] def find(self, target: Union[str, "Sequence"]) -> List[int]: """Find the target sequence in this sequence and return the positions Returns: List[int]: All position of target in this sequence """ if isinstance(target, Sequence): target = target._seq elif not isinstance(target, str): raise TypeError("Target should be str or Sequence") return [i.start() for i in re.finditer(target, self._seq)]
[docs] def mutation(self, position: Union[str, int, List[int]], target: Union[str, "Sequence"]) -> str: """Change this sequence, ``self.seq`` can only be modified by this function Args: position(Union[str, int, List[int]): char, index(s) to be mutation target(Union[str, Sequence]): the target char of mutation Returns: str: Sequence after modified """ if isinstance(target, Sequence): target = target._seq elif not isinstance(target, str): raise TypeError("Target should be str or Sequence") if isinstance(position, str): self._seq = self._seq.replace(position, target) # Reset cached property related with sequence(weight, composition...) self.reset_cache() return self._seq elif isinstance(position, int): position = [position] elif not isinstance(position, List): raise TypeError("Position should be str, int or list of int") seq_list, length = list(self._seq), len(target) prev_end = 0 # previous end of mutation while position: pos = position.pop(0) if not isinstance(pos, int): raise TypeError("Position must be int") if pos < prev_end: print( f"WARNING: mutation <{pos}~{pos + length}> overlapped previous mutation <{prev_end - length}~{prev_end}>") prev_end = pos + length seq_list[pos: prev_end] = target self._seq = "".join(seq_list) # Reset cached property related with sequence(weight, composition...) self.reset_cache() return self._seq
[docs] def toDNA(self) -> "DNA": """ Convert to a DNA instance """ return DNA(self.seq, self.info)
[docs] def toRNA(self) -> "RNA": """ Convert to an RNA instance """ return RNA(self.seq, self.info)
[docs] def toPeptide(self) -> "Peptide": """ Convert to a Peptide instance """ return Peptide(self.seq, self.info)
[docs] def _print(self) -> str: """ Sequence's output format, if length more than 30, only show the first and end 30 chars """ if self._seq and len(self._seq) > 30: return f"{self._seq[:10]}...{self._seq[-10:]}" return self._seq
def __add__(self, s: Union[str, "Sequence"]) -> "Sequence": if isinstance(s, str): return self.__class__(self._seq + s) elif isinstance(s, self.__class__): return self.__class__(self._seq + s._seq) else: raise TypeError( f"Expect str or {self.__class__.__name__} add with {self.__class__.__name__}, not {s.__class__.__name__}") def __radd__(self, s: Union[str, "Sequence"]) -> "Sequence": if isinstance(s, str): return self.__class__(s + self._seq) elif isinstance(s, self.__class__): return self.__class__(s._seq + self._seq) else: raise TypeError( f"Expect str or {self.__class__.__name__} add with {self.__class__.__name__}, not {s.__class__.__name__}") def __eq__(self, o: object) -> bool: if isinstance(o, str): return self._seq == o elif isinstance(o, self.__class__): return self._seq == o._seq else: raise TypeError( f"Expect str or {self.__class__.__name__} compare to {self.__class__.__name__}, not {o.__class__.__name__}") def __getitem__(self, index: int) -> str: return self._seq[index] def __len__(self) -> int: return len(self._seq) def __str__(self) -> str: return self._print() def __repr__(self) -> str: return self._print()
[docs]class Peptide(Sequence): _pI: float _Hphob_list: List[float] def reset_cache(self): super().reset_cache() self._pI = 0. self._Hphob_list = [] @property def pI(self) -> float: """ Calculate the peptide's pI which is a pH make peptide's charge equal zero """ if not self._pI: min = 0. max = 14. pH = (min + max) / 2 charge = self.chargeInpH(pH) while abs(charge) > 1e-8: if charge > 0: min = pH else: max = pH pH = (min + max) / 2 charge = self.chargeInpH(pH) self._pI = round(pH, 3) return self._pI
[docs] def chargeInpH(self, pH: float) -> float: """ Calculate the charge amount of peptide at pH .. math:: pH = pK_a + \\lg{\\frac{[A^-]}{[HA]}}\\implies\\frac{[HA]}{[A^-]} = 10^{pK_a - pH} charge = \\frac{[A^-]}{[A]_{total}} = \\frac{[A^-]}{[A^-] + [HA]} = \\frac{1}{\\frac{[A^-] + [HA]}{[A^-]}} = \\frac{1}{1 + \\frac{[HA]}{[A-]}} for acidic residues: :math:`charge = 1 / (1 + 10 ^{pK_a - pH})` for basic residues: :math:`charge = 1 / (1 + 10^{pH - pK_a})` Args: pH(float): pH value Returns: float: charge in specific pH """ pos_charge = 1.0 / (10 ** (pH - config.PK["Nterm"]) + 1.0) for pos_aa, aa_pK in config.PK["pos_pK"].items(): pos_charge += self.composition.get(pos_aa, 0) / \ (10 ** (pH - aa_pK) + 1.0) neg_charge = 1.0 / (10 ** (config.PK["Cterm"] - pH) + 1.0) for neg_aa, aa_pK in config.PK["neg_pK"].items(): neg_charge += self.composition.get(neg_aa, 0) / \ (10 ** (aa_pK - pH) + 1.0) return pos_charge - neg_charge
[docs] def getHphob(self, window_size: int = 9, show_img: bool = False) -> List[float]: """ Calculate the Hydropathy Score.The lager the score, the higher the hydrophobicity. Each aa's score is the average score of all aa in window_size. So part of Amino Acid at begin and end don't have score Args: window_size(int): the number for calculate average hydropathy value show_img(book): whether to draw the result, require ``matplotlib`` Returns: List[float]: the result of peptide's Hydropathy Score """ if not self._Hphob_list: _Hphob = [config.HYDROPATHY[aa] for aa in self._seq] half_part = window_size // 2 if len(_Hphob) < half_part: print("WARNING: Windows size is to big to calculate") for i in range(half_part, len(_Hphob) - half_part): self._Hphob_list.append( round(sum(_Hphob[i - half_part: i + window_size - half_part]) / window_size, 3)) if show_img: try: import matplotlib.pyplot as plt except ImportError: print( "Can't import matplotlib.pyplot, please use 'pip install matplotlib' to install.") else: plt.title( f"Hydropathy Score for {self._seq[:4]}...{self._seq[-4:]}") plt.plot(range(window_size // 2 + 1, self.length - window_size // 2 + 1), self._Hphob_list, linewidth=0.8) plt.xlim((1, self.length + 1)) plt.grid(linestyle="--") plt.xlabel("Position") plt.ylabel("Score") plt.show() return self._Hphob_list
[docs] def _print(self) -> str: """ Peptide print starts with "N-" and then ends with "-C", means sequence is from N-terminal to C-terminal """ if self._seq: return f"N-{super()._print()}-C" return self.__class__.__name__ + "()"
T = TypeVar("T", "RNA", "DNA")
[docs]class RNA(Sequence): _GC: float orf: List[str] #: Can only visit after called `get_orf()` peptide: List[Peptide] #: Can only visit after called `transcript()` def reset_cache(self): self._GC, self.orf, self.peptide = 0., [], [] super().reset_cache() @property def complement(self: T) -> T: """ Return self complementary sequence """ complement_dict = config.NC_INFO[self.__class__.__name__ + "_COMPLEMENT"] complement_seq = "".join([complement_dict[bp] for bp in self._seq[::-1]]) return self.__class__(complement_seq) @property def GC(self) -> float: """ Calculate the GC percentage """ if not self._GC: self._GC = round( (self.composition["C"] + self.composition["G"]) / self.length, 4) return self._GC @property def reversed(self: T) -> T: """ Return self reversed sequence """ return self.__class__(self._seq[::-1])
[docs] def getOrf(self, topn: int = 1, replace: bool = False) -> List[str]: """ Find the Open Reading Frame in sequence and save in ``self.orf`` Args: topn(int): the num of orfs, sorted by length of each orf, default is 1 replace(bool): Replace origin sequence with the longest Orf Returns: List[str]: Orf found on self. """ # Traverse all Orf Frame i, step = 0, 1 starts: List[int] = [] end_points: List[Tuple[int, int]] = [] while i < self.length: if self._seq[i: i+3] in config.START_CODON: starts.append(i) step = 3 i += step if config.CODON_TABLE.get(self._seq[i: i+3]) == "*": end_points.extend([(s, i + 3) for s in starts]) starts = [] step = 1 end_points = sorted( end_points, key=lambda se: se[1] - se[0], reverse=True) self.orf = [self._seq[se[0]: se[1]] for se in end_points[:topn]] if replace: self._seq = self.orf[0] return self.orf
[docs] def transcript(self, topn: int = 1) -> List[Peptide]: """ Transcript the sequence to peptide, the result will save in ``self.peptide`` Args: topn(int): filter num of transcripts, sorted by length of each transcript, default is 1 Returns: List[Peptide]: List of transcript product """ orf = self.orf if self.orf else self.getOrf(topn=topn) self.peptide = [] for frame in orf: peptide = "".join([ config.CODON_TABLE.get( frame[i:i + 3], config.SYMBOL["printAlign"][1]) for i in range(0, len(frame), 3) ]) self.peptide.append(Peptide(peptide[:-1])) return self.peptide
[docs] def _print(self): """ Peptide print starts with " 5'- " and then ends with " -3' ", means sequence is from 5' to 3' """ if self._seq: return f"5'-{super()._print()}-3'" return self.__class__.__name__ + "()"
[docs]class DNA(RNA): _translate: Optional[RNA] def reset_cache(self): self._translate = None super().reset_cache()
[docs] def translate(self) -> RNA: """ Translate the sequence to RNA, which replace the T with U """ if not self._translate: self._translate = RNA(self._seq.replace("T", "U")) return self._translate
[docs] def getOrf(self, topn: int = 1, replace: bool = False) -> List[str]: """Return the open reading frame of mRNA which is translated from this sequence. Args: topn(int): the num of orfs, sorted by length of each orf, default is 1 replace(bool): Replace origin sequence with the longest Orf Returns: List[str]: Orf found on mRNA """ self.orf = self.translate().getOrf(topn, replace) return self.orf
[docs] def transcript(self, topn: int = 1) -> List[Peptide]: """Return the transcript product of mRNA which is translated from this sequence. Args: topn(int): filter num of transcripts, sorted by length of each transcript, default is 1 Returns: List[Peptide]: List of transcript product """ self.peptide = self.translate().transcript(topn) return self.peptide