Source code for pyani.tetra

# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2017-2019
# (c) University of Strathclyde 2019
# Author: Leighton Pritchard
#
# Contact:
# leighton.pritchard@strath.ac.uk
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# Cathedral Street,
# Glasgow,
# G1 1XQ
# Scotland,
# UK
#
# The MIT License
#
# Copyright (c) 2017-2019 The James Hutton Institute
# Copyright (c) 2019 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Code to implement the TETRA average nucleotide identity method.

Provides functions for calculation of TETRA as described in:

Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for the
prokaryotic species definition. Proc Natl Acad Sci USA 106: 19126-19131.
doi:10.1073/pnas.0906412106.

and

Teeling et al. (2004) Application of tetranucleotide frequencies for the
assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
doi:10.1111/j.1462-2920.2004.00624.x
"""

import collections
import math

from pathlib import Path
from typing import Dict, Iterable, Tuple

import pandas as pd  # type: ignore

from Bio import SeqIO  # type: ignore


# Calculate tetranucleotide Z-score for a set of input sequences
[docs]def calculate_tetra_zscores(infilenames: Iterable) -> Dict[str, Dict[str, float]]:
    """Return dictionary of TETRA Z-scores for each input file.

    :param infilenames:  iterable of paths to input sequence files
    """
    org_tetraz = {}  # type: Dict[str, Dict[str, float]]
    for filename in infilenames:
        org_tetraz[filename.stem] = calculate_tetra_zscore(filename)
    return org_tetraz


# Calculate tetranucleotide Z-score for a single sequence file
[docs]def calculate_tetra_zscore(filename: Path) -> Dict[str, float]:
    """Return TETRA Z-score for the sequence in the passed file.

    :param filename:  path to sequence file

    Calculates mono-, di-, tri- and tetranucleotide frequencies
    for each sequence, on each strand, and follows Teeling et al. (2004)
    in calculating a corresponding Z-score for each observed
    tetranucleotide frequency, dependent on the mono-, di- and tri-
    nucleotide frequencies for that input sequence.
    """
    # For the Teeling et al. method, the Z-scores require us to count
    # mono, di, tri and tetranucleotide sequences - these are stored
    # (in order) in the counts tuple
    counts = (
        collections.defaultdict(int),
        collections.defaultdict(int),
        collections.defaultdict(int),
        collections.defaultdict(int),
    )  # type: Tuple
    for rec in SeqIO.parse(filename, "fasta"):
        for seq in [str(rec.seq).upper(), str(rec.seq.reverse_complement()).upper()]:
            # The Teeling et al. algorithm requires us to consider
            # both strand orientations, so monocounts are easy
            for base in ("G", "C", "T", "A"):
                counts[0][base] += seq.count(base)
            # For di, tri and tetranucleotide counts, loop over the
            # sequence and its reverse complement, until near the end:
            for i in range(len(seq[:-4])):
                din, tri, tetra = seq[i : i + 2], seq[i : i + 3], seq[i : i + 4]
                counts[1][str(din)] += 1
                counts[2][str(tri)] += 1
                counts[3][str(tetra)] += 1
            # Then clean up the straggling bit at the end:
            counts[2][str(seq[-4:-1])] += 1
            counts[2][str(seq[-3:])] += 1
            counts[1][str(seq[-4:-2])] += 1
            counts[1][str(seq[-3:-1])] += 1
            counts[1][str(seq[-2:])] += 1
    # Following Teeling (2004), calculate expected frequencies for each
    # tetranucleotide; we ignore ambiguity symbols
    tetra_exp = {}
    for tet in [tetn for tetn in counts[3] if tetra_clean(tetn)]:
        tetra_exp[tet] = (
            1.0 * counts[2][tet[:3]] * counts[2][tet[1:]] / counts[1][tet[1:3]]
        )
    # Following Teeling (2004) we approximate the std dev and Z-score for each
    # tetranucleotide
    tetra_sd = {}
    tetra_z = {}
    for tet, exp in list(tetra_exp.items()):
        den = counts[1][tet[1:3]]
        tetra_sd[tet] = math.sqrt(
            exp * (den - counts[2][tet[:3]]) * (den - counts[2][tet[1:]]) / (den * den)
        )
        try:
            tetra_z[tet] = (counts[3][tet] - exp) / tetra_sd[tet]
        except ZeroDivisionError:
            # To record if we hit a zero in the estimation of variance
            # zeroes = [k for k, v in list(tetra_sd.items()) if v == 0]
            tetra_z[tet] = 1 / (counts[1][tet[1:3]] * counts[1][tet[1:3]])
    return tetra_z


# Returns true if the passed string contains only A, C, G or T
[docs]def tetra_clean(instr: str) -> bool:
    """Return True if string contains only unambiguous IUPAC nucleotide symbols.

    :param instr:  str, nucleotide sequence

    We are assuming that a low frequency of IUPAC ambiguity symbols doesn't
    affect our calculation.
    """
    if set(instr) - set("ACGT"):
        return False
    return True


# Calculate Pearson's correlation coefficient from the Z-scores for each
# tetranucleotide. If we're forcing rpy2, might as well use that, though...
[docs]def calculate_correlations(tetra_z: Dict[str, Dict[str, float]]) -> pd.DataFrame:
    """Return dataframe of Pearson correlation coefficients.

    :param tetra_z:  dict, Z-scores, keyed by sequence ID

    Calculates Pearson correlation coefficient from Z scores for each
    tetranucleotide. This is done longhand here, which is fast enough,
    but for robustness we might want to do something else... (TODO).

    Note that we report a correlation by this method, rather than a
    percentage identity.
    """
    orgs = sorted(tetra_z.keys())
    correlations = pd.DataFrame(index=orgs, columns=orgs, dtype=float).fillna(1.0)
    for idx, org1 in enumerate(orgs[:-1]):
        for org2 in orgs[idx + 1 :]:
            if not sorted(tetra_z[org1].keys()) == sorted(tetra_z[org2].keys()):
                raise AssertionError()
            tets = sorted(tetra_z[org1].keys())
            zscores = [
                [tetra_z[org1][t] for t in tets],
                [tetra_z[org2][t] for t in tets],
            ]
            zmeans = [sum(zscore) / len(zscore) for zscore in zscores]
            zdiffs = [
                [z - zmeans[0] for z in zscores[0]],
                [z - zmeans[1] for z in zscores[1]],
            ]
            diffprods = sum(
                [zdiffs[0][i] * zdiffs[1][i] for i in range(len(zdiffs[0]))]
            )
            zdiffs2 = [sum([z * z for z in zdiffs[0]]), sum([z * z for z in zdiffs[1]])]
            correlations[org1][org2] = diffprods / math.sqrt(zdiffs2[0] * zdiffs2[1])
            correlations[org2][org1] = correlations[org1][org2]
    return correlations