.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "examples/gallery/sequence/profile/rbs_identification.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note :ref:`Go to the end ` to download the full example code .. rst-class:: sphx-glr-example-title .. _sphx_glr_examples_gallery_sequence_profile_rbs_identification.py: Identification of a binding site by sequence conservation ========================================================= In this example we identify the ribosomal binding site on mRNA, also called *Shine-Dalgarno* sequence, in *Escherichia coli*. In the beginning of the translation the 16S rRNA of the ribosome recognizes this purine-rich region on the mRNA, which typically lies a few bases upstream of the start codon. After binding the sequence, the ribosome starts scanning the mRNA in downstream direction to locate the start codon. .. GENERATED FROM PYTHON SOURCE LINES 14-139 .. image-sg:: /examples/gallery/sequence/profile/images/sphx_glr_rbs_identification_001.png :alt: rbs identification :srcset: /examples/gallery/sequence/profile/images/sphx_glr_rbs_identification_001.png :class: sphx-glr-single-img .. code-block:: Python # Code source: Patrick Kunzmann # License: BSD 3 clause import tempfile import matplotlib.pyplot as plt import matplotlib.ticker as ticker import numpy as np from matplotlib.patches import Patch import biotite import biotite.database.entrez as entrez import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb UTR_LENGTH = 20 ### Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb") ) # We are only interested in CDS features bl21_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) ### Extract sequences for 5' untranslated regions (UTRs) # In this case we define the untranslated region, as the sequence # up to UTR_LENGTH bases upstream from the start codon utrs = [] for cds in bl21_genome.annotation: # Expect a single location for the feature, # since no splicing can occur # Ignore special cases like ribosomal slippage sites, etc. # for simplicity if len(cds.locs) != 1: continue # Get the only location for this feature loc = list(cds.locs)[0] # Get the region behind or before the CDS, based on the strand the # CDS is on if loc.strand == seq.Location.Strand.FORWARD: utr_start = loc.first - UTR_LENGTH utr_stop = loc.first # Include the start codon (3 bases) in the UTRs for later # visualization utrs.append(bl21_genome[utr_start : utr_stop + 3].sequence) else: utr_start = loc.last + 1 utr_stop = loc.last + 1 + UTR_LENGTH utrs.append( bl21_genome[utr_start - 3 : utr_stop].sequence.reverse().complement() ) ### Create profile # Increase the counter for each base and position # while iterating over the sequences frequencies = np.zeros((UTR_LENGTH + 3, len(bl21_genome.sequence.alphabet)), dtype=int) for utr in utrs: frequencies[np.arange(len(utr)), utr.code] += 1 profile = seq.SequenceProfile( symbols=frequencies, gaps=np.zeros(len(frequencies)), alphabet=bl21_genome.sequence.alphabet, ) ### Visualize the profile # Spend extra effort for correct sequence postion labels def normalize_seq_pos(x): """ Normalize sequence position, so that the position of the upstream bases is negative. """ # Sequence positions are always integers x = int(x) x -= UTR_LENGTH # There is no '0' position if x <= 0: x -= 1 return x @ticker.FuncFormatter def sequence_loc_formatter(x, pos): x = normalize_seq_pos(x) return f"{x:+}" COLOR_SCHEME = [ biotite.colors["lightgreen"], # A biotite.colors["orange"], # C biotite.colors["dimgreen"], # G biotite.colors["brightorange"], # T ] fig, ax = plt.subplots(figsize=(8.0, 3.0)) graphics.plot_sequence_logo(ax, profile, COLOR_SCHEME) normalized_pos = np.array([normalize_seq_pos(x) for x in range(len(profile.symbols))]) tick_locs = np.where(np.isin(normalized_pos, [-15, -10, -5, -1, 1]))[0] ax.set_xticks(tick_locs) ax.xaxis.set_major_formatter(ticker.FuncFormatter(sequence_loc_formatter)) ax.set_xlabel("Residue position") ax.set_ylabel("Conservation (Bits)") ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) ax.legend( loc="upper left", handles=[ Patch(color=biotite.colors["green"], label="Purine"), Patch(color=biotite.colors["lightorange"], label="Pyrimidine"), ], ) fig.tight_layout() plt.show() .. _sphx_glr_download_examples_gallery_sequence_profile_rbs_identification.py: .. only:: html .. container:: sphx-glr-footer sphx-glr-footer-example .. container:: sphx-glr-download sphx-glr-download-jupyter :download:`Download Jupyter notebook: rbs_identification.ipynb ` .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: rbs_identification.py ` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery `_