Source code for pyani.scripts.subcommands.subcmd_classify

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2017-2019
# (c) University of Strathclyde 2019-2020
# Author: Leighton Pritchard
#
# Contact:
# leighton.pritchard@strath.ac.uk
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# 161 Cathedral Street,
# Glasgow,
# G4 0RE
# Scotland,
# UK
#
# The MIT License
#
# Copyright (c) 2017-2019 The James Hutton Institute
# Copyright (c) 2019-2020 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Provides the classify subcommand for pyani."""

import logging

from argparse import Namespace
from typing import Generator, NamedTuple

import networkx as nx  # pylint: disable=E0401
import numpy as np

from tqdm import tqdm

from pyani import pyani_classify, pyani_orm
from pyani.pyani_tools import termcolor


[docs]class SubgraphData(NamedTuple): """Subgraph clique/classification output.""" interval: float # the trimming threshold for this subgraph graph: nx.Graph # the trimmed subgraph cliqueinfo: pyani_classify.Cliquesinfo
[docs]def subcmd_classify(args: Namespace) -> int: """Generate classifications for an analysis. :param args: Namespace, command-line arguments """ logger = logging.getLogger(__name__) # Tell the user what's going on logger.info( termcolor("Generating classification for ANI run: %s", "red"), args.run_id ) logger.info("\tWriting output to: %s", args.outdir) logger.info(termcolor("\tCoverage threshold: %s", "cyan"), args.cov_min) logger.info( termcolor("\tInitial minimum identity threshold: %s", "cyan"), args.id_min ) # Get results data for the specified run logger.info("Acquiring results for run: %s", args.run_id) logger.debug("Connecting to database: %s", args.dbpath) session = pyani_orm.get_session(args.dbpath) logger.debug("Retrieving results matrices") results = ( session.query(pyani_orm.Run).filter(pyani_orm.Run.run_id == args.run_id).first() ) result_label_dict = pyani_orm.get_matrix_labels_for_run(session, args.run_id) # Generate initial graph on basis of results logger.info("Constructing graph from results.") initgraph = pyani_classify.build_graph_from_results( results, result_label_dict, args.cov_min, args.id_min ) logger.debug( "Returned graph has %d nodes:\n\t%s", len(initgraph), "\n\t".join(n for n in initgraph), ) logger.debug( "Initial graph clique information:\n\t%s", pyani_classify.analyse_cliques(initgraph), ) # Obtain all subgraph splits, thresholding by identity subgraphs = trimmed_graph_sequence(initgraph, args) special_intervals = [_ for _ in subgraphs if _.cliqueinfo.all_k_complete] outstr = "\n\t".join([f"{_.interval}\t{_.cliqueinfo}" for _ in special_intervals]) logger.info( "%s intervals with special property:\n\t%s", len(special_intervals), outstr ) if args.show_all: outstr = "\n\t".join([f"{_.interval}\t{_.cliqueinfo}" for _ in subgraphs]) logger.debug("Subgraphs at all identity thresholds:\n\t%s", outstr) return 0
# Generate a list of graphs from lowest to highest pairwise identity threshold
[docs]def trimmed_graph_sequence( ingraph: nx.Graph, args: Namespace, attribute: str = "identity" ) -> Generator: """Return graphs trimmed from lowest to highest attribute value. :param ingraph: nx.Graph of genomes as nodes, having edges weighted by the property named in attribute :param args: Namespace, parsed command-line arguments :param attribute: str, name of the property by which the graph edges should be trimmed A generator which, starting from the initial graph, yields in sequence a series of graphs from which the edge(s) with the lowest threshold value attribute were removed. The generator returns a tuple of: (threshold, graph, analyse_cliques(graph)) This will be slow with moderate-large graphs """ graph = ingraph.copy() # Trim the graph now, removing edges at the minimum allowed identity and below graph, edgelist = pyani_classify.remove_low_weight_edges( graph, args.min_id, attribute ) #  There's no sense resolving to small intervals if there would be more # of these than there are edges to remove from the graph. If the # resolution leads to more breakpoints than there are edges, just trim # edge by edge. Otherwise work over a range of property value breaks. if len(edgelist) < 1 / args.resolution: breaks = [_[-1] for _ in edgelist] else: breaks = np.arange( args.min_id or edgelist[0][-1], args.max_id or 1, args.resolution ) # Remove edges at each breakpoint and yield the resulting subgraph for threshold in tqdm(breaks, disable=args.disable_tqdm): yield SubgraphData(threshold, graph, pyani_classify.analyse_cliques(graph)) while edgelist and edgelist[0][-1] <= threshold: edge = edgelist.pop(0) graph.remove_edge(edge[0], edge[1]) # Yield final edge/graph (identity) yield SubgraphData( 1, pyani_classify.remove_low_weight_edges(graph, 1, attribute)[0], pyani_classify.analyse_cliques(graph), )