Source code for pyani.scripts.subcommands.subcmd_download

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2016-2019
# (c) University of Strathclyde 2019-2020
# Author: Leighton Pritchard
#
# Contact:
# leighton.pritchard@strath.ac.uk
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# 161 Cathedral Street,
# Glasgow,
# G4 0RE
# Scotland,
# UK
#
# The MIT License
#
# Copyright (c) 2016-2019 The James Hutton Institute
# Copyright (c) 2019-2020 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Provides the download subcommand for pyani."""

import logging

from argparse import Namespace
from typing import Dict, List, NamedTuple, Optional, Tuple

from Bio import SeqIO

from pyani import download
from pyani.pyani_tools import termcolor
from pyani.scripts import make_outdir


[docs]class Skipped(NamedTuple): """Convenience struct for holding information about skipped genomes.""" taxon_id: str accession: str organism: str strain: str url: str dltype: str
[docs]def configure_entrez(args: Namespace) -> Optional[str]: """Configure Entrez email, return API key. :param args: Namespace, command-line arguments Returns None if no API key found """ logger = logging.getLogger(__name__) download.set_ncbi_email(args.email) logger.info("Setting Entrez email address: %s", args.email) return parse_api_key(args)
[docs]def dl_info_to_str(esummary, uid_class) -> str: """Return descriptive string for passed download data. :param esummary: :param uid_class: """ return "\n\t".join( [ f"Species Taxid: {esummary['SpeciesTaxid']}", f"TaxID: {esummary['Taxid']}", f"Accession: {esummary['AssemblyAccession']}", f"Name: {esummary['AssemblyName']}", f"Organism: {uid_class.organism}", f"Genus: {uid_class.genus}", f"Species: {uid_class.species}", f"Strain: {uid_class.strain}", ] )
[docs]def download_data( args: Namespace, api_key: Optional[str], asm_dict: Dict[str, List], ) -> Tuple[List, List, List]: """Download the accessions indicated in the passed dictionary. :param args: Namespace of command-line arguments :param api_key: str, API key for NCBI downloads :param asm_dict: dictionary of assembly UIDs to download, keyed by taxID Returns lists of information about downloaded genome classes and labels, and a list of skipped downloads (as Skipped objects). """ logger = logging.getLogger(__name__) classes, labels, skippedlist = [], [], [] for tid, uids in asm_dict.items(): logger.info(termcolor("Downloading contigs for Taxon ID %s", "blue"), uids) for uid in uids: # Obtain eSummary for each assembly UID logger.info( termcolor("Retrieving eSummary information for UID %s", "cyan"), uid ) try: esummary, filestem = download.get_ncbi_esummary( uid, args.retries, api_key ) except download.NCBIDownloadException: logger.warning( termcolor("Skipping download of record for UID %s", "red"), uid, exc_info=True, ) skippedlist.append( Skipped(tid, uid, "", "", None, "RefSeq",) ) # pylint: disable=no-member continue uid_class = download.get_ncbi_classification(esummary) logger.debug( "eSummary information (%s):\n\t%s", filestem, dl_info_to_str(esummary, uid_class), ) if args.dryrun: logger.warning( "(dry-run) skipping download of %s", esummary["AssemblyAccession"] ) continue # Download genome for UID, and extract compressed files dlstatus, skipped_genomes = download_genome( args, filestem, tid, uid, uid_class ) skippedlist.extend(skipped_genomes) if not dlstatus.skipped: extract_genomes(args, dlstatus, esummary) labeltxt, classtxt = hash_genomes(args, dlstatus, filestem, uid_class) classes.append(classtxt) labels.append(labeltxt) logger.info( "Label and class file entries\n\tLabel: %s\n\tClass: %s", labeltxt, classtxt, ) return classes, labels, skippedlist
[docs]def extract_genomes(args: Namespace, dlstatus: download.DLStatus, esummary) -> None: """Extract genome files in passed dlstatus. :param args: Namespace of command-line arguments :param dlstatus: :param esummary: """ logger = logging.getLogger(__name__) # Extract downloaded files ename = dlstatus.outfname.with_suffix("") # should strip only last suffix if ename.exists() and args.noclobber: logger.warning("Output file %s exists, not extracting", ename) else: logger.debug("Extracting archive %s to %s", dlstatus.outfname, ename) download.extract_contigs(dlstatus.outfname, ename) # Modify sequence ID header if Kraken option active if args.kraken: logger.warning("Modifying downloaded sequence for Kraken compatibility") seqdata = list(SeqIO.parse(ename, "fasta")) logger.debug("Modifying %s", ename) for seq in seqdata: seq.id = "|".join([seq.id, "kraken:taxid", esummary["SpeciesTaxid"]]) SeqIO.write(seqdata, ename, "fasta")
[docs]def hash_genomes( args: Namespace, dlstatus: download.DLStatus, filestem: str, uid_class ) -> Tuple[str, str]: """Hash genome files in passed dlstatus. :param args: Namespace of command-line arguments :param dlstatus: :param filestem: str, filestem for output :param uid_class: """ logger = logging.getLogger(__name__) # Create MD5 hash for the downloaded contigs ename = dlstatus.outfname.with_suffix("") # should strip only last suffix logger.debug("Creating local MD5 hash for %s", ename) hashfname = ename.with_suffix(".md5") datahash = download.create_hash(ename) logger.debug("Writing hash to %s", hashfname) with open(hashfname, "w") as hfh: hfh.write("\t".join([datahash, str(ename)]) + "\n") # Make label/class text labeltxt, classtxt = download.create_labels(uid_class, filestem, datahash) return labeltxt, classtxt
[docs]def download_genome(args: Namespace, filestem: str, tid: str, uid: str, uid_class): """Download single genome data to output directory. :param args: Namespace, command-line arguments :param filestem: str, output filestem :param tid: str, taxonID :param uid: str, assembly UID :param uid_class: """ logger = logging.getLogger(__name__) skippedlist = [] refseq_status, genbank_status = True, True # set False if skipped dlfiledata = download.DLFileData( filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all", "genomic.fna.gz" ) logger.info("Retrieving URLs for %s", filestem) # Try RefSeq first dlstatus = download.download_genome_and_hash( args.outdir, args.timeout, dlfiledata, dltype="RefSeq", disable_tqdm=args.disable_tqdm, ) # Pylint is confused by the content of dlstatus (a namedlist) if dlstatus.skipped: # pylint: disable=no-member skippedlist.append( Skipped( tid, uid, uid_class.organism, uid_class.strain, dlstatus.url, "RefSeq", # pylint: disable=no-member ) ) refseq_status = False # RefSeq fails, so try GenBank if refseq_status is False: logger.warning( termcolor("RefSeq failed. Trying GenBank alternative assembly", "magenta") ) # Try GenBank assembly dlstatus = download.download_genome_and_hash( args.outdir, args.timeout, dlfiledata, dltype="GenBank", disable_tqdm=args.disable_tqdm, ) # Pylint is confused by the content of dlstatus (a namedlist) if dlstatus.skipped: # pylint: disable=no-member skippedlist.append( Skipped( tid, uid, uid_class.organism, uid_class.strain, dlstatus.url, "GenBank", ) ) genbank_status = False logger.warning(termcolor("GenBank failed.", "magenta")) if genbank_status or refseq_status: # One of the downloads worked: report information logger.debug("Downloaded from URL: %s", dlstatus.url) logger.debug("Wrote assembly to: %s", dlstatus.outfname) logger.debug("Wrote MD5 hashes to: %s", dlstatus.outfhash) # Check hash for the download hashstatus = download.check_hash(dlstatus.outfname, dlstatus.outfhash) logger.debug("Local MD5 hash: %s", hashstatus.localhash) logger.debug("NCBI MD5 hash: %s", hashstatus.localhash) if hashstatus.passed: logger.info(termcolor("MD5 hash check passed", "green")) else: logger.warning("MD5 hash check failed. Please check and retry.") return dlstatus, skippedlist
[docs]def get_tax_asm_dict(args: Namespace) -> Dict[str, List]: """Return dictionary of assembly UIDs to download, keyed by taxID. :param args: Namespace of command-line arguments """ logger = logging.getLogger(__name__) taxon_ids = download.split_taxa(args.taxon) logger.info(termcolor("Taxon IDs received: %s", "blue"), taxon_ids) asm_dict = download.make_asm_dict(taxon_ids, args.retries) for tid, uids in asm_dict.items(): logger.debug( "Taxon ID summary\n\tQuery: %s\n\tasm count: %s\n\tUIDs: %s", tid, len(uids), uids, ) return asm_dict
[docs]def parse_api_key(args: Namespace) -> Optional[str]: """Returns NCBI API key if present, None otherwise. :param args: Namespace of command-line arguments Checks for key in args.api_keypath. """ logger = logging.getLogger(__name__) api_path = args.api_keypath.expanduser() if not api_path.is_file(): logger.warning("API path %s not a valid file. Not using API key.", api_path) api_key = None else: with api_path.open() as ifh: api_key = ifh.readline().strip() logger.info("API key recovered from %s", api_path) return api_key
[docs]def subcmd_download(args: Namespace) -> int: """Download assembled genomes in subtree of passed NCBI taxon ID. :param args: Namespace, command-line arguments """ # Create logger logger = logging.getLogger(__name__) logger.info(termcolor("Downloading genomes from NCBI", "red")) # Create output directory, respecting force/noclobber if args.dryrun: logger.warning( termcolor("Dry run only: will not overwrite or download", "cyan") ) else: make_outdir(args.outdir, args.force, args.noclobber) api_key = configure_entrez(args) # set up email/get API key asm_dict = get_tax_asm_dict(args) # dictionary of assembly UIDs for download # Download contigs and hashes for each assembly UID in the dictionary # Collect class and label information for each downloaded genome, plus a list # of skipped genome data classes, labels, skippedlist = download_data(args, api_key, asm_dict) # Write class and label files if not args.dryrun: classfname = args.outdir / args.classfname logger.info("Writing classes file to %s", classfname) if classfname.exists() and args.noclobber: logger.warning("Class file %s exists, not overwriting", classfname) else: with open(classfname, "w") as ofh: ofh.write("\n".join(classes) + "\n") labelfname = args.outdir / args.labelfname logger.info("Writing labels file to %s", labelfname) if labelfname.exists() and args.noclobber: logger.warning("Labels file %s exists, not overwriting", labelfname) else: with open(labelfname, "w") as ofh: ofh.write("\n".join(labels) + "\n") # Report skipped genome list if skippedlist: logger.warning( termcolor("%s genome downloads were skipped", "red"), len(skippedlist) ) for skipped in skippedlist: outstr = "\n\t".join( [ f"taxon id: {skipped.taxon_id}", f"accession: {skipped.accession}", f"URL: {skipped.url}", f"source: {skipped.dltype}", ] ) logger.warning("%s %s:\n\t%s", skipped.organism, skipped.strain, outstr) return 0