Source code for pyani.scripts.subcommands.subcmd_index

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2017-2019
# (c) University of Strathclyde 2019-2021
# Author: Leighton Pritchard
#
# Contact:
# leighton.pritchard@strath.ac.uk
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# Cathedral Street,
# Glasgow,
# G1 1XQ
# Scotland,
# UK
#
# The MIT License
#
# Copyright (c) 2017-2019 The James Hutton Institute
# Copyright (c) 2019-2021 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Provides the index subcommand for pyani."""

import logging
import os

from argparse import Namespace
from pathlib import Path

from Bio import SeqIO

from pyani import download, pyani_files


def _get_or_create_hash(fpath: Path, logger: logging.Logger) -> str:
    """Return the MD5 hash for a genome file, writing its .md5 file if needed.

    :param fpath:  Path, location of the genome FASTA file
    :param logger:  logging object

    The hash file sits alongside the genome file, with the genome file's
    extension replaced by .md5 (so <genome>.fna uses <genome>.md5). If that
    file already exists, the hash is read from the first whitespace-delimited
    field of its first line; otherwise the hash is computed with
    download.create_hash() and written as "<hash>\\t<path>".

    :raises IndexError:  if an existing hash file has no hash on its first line
    """
    hashfname = fpath.with_suffix(".md5")

    if hashfname.is_file():
        logger.info("%s already indexed (using existing hash)", fpath)
        with open(hashfname, "r") as ifh:
            fields = ifh.readline().split()
        if not fields:
            # IndexError (rather than ValueError) keeps the exception type
            # this condition has always produced, but names the culprit
            raise IndexError(f"Hash file {hashfname} has no hash on its first line")
        return fields[0]

    # Write an .md5 hash file
    datahash = download.create_hash(fpath)
    logger.info("Writing hash to %s", hashfname)
    with open(hashfname, "w") as hfh:
        hfh.write(f"{datahash}\t{fpath}\n")
    return datahash


def _get_first_label(fpath: Path) -> str:
    """Return the label taken from the first sequence header in a FASTA file.

    :param fpath:  Path, location of the genome FASTA file

    The label is the text following the first space in the first record's
    description; if the description contains no space, the whole description
    is returned.

    :raises IndexError:  if the file contains no FASTA records
    """
    with fpath.open("r") as sfh:
        # Only the first record is needed, so stop parsing after it rather
        # than loading every sequence in the genome into memory
        first_record = next(SeqIO.parse(sfh, "fasta"), None)
    if first_record is None:
        # IndexError keeps the exception type an empty file has always produced
        raise IndexError(f"No FASTA sequences found in {fpath}")
    return first_record.description.split(" ", 1)[-1]


def subcmd_index(args: Namespace) -> int:
    """Generate a file with the MD5 hash for each genome in an input directory.

    :param args:  Namespace, received command-line arguments; this function
        reads args.indir (a Path), args.classfname and args.labelfname

    Identify the genome files in the input directory, and generate a single
    MD5 for each so that <genome>.fna produces <genome>.md5

    Genome files (FASTA) are identified from the file extension.

    A classes file and a labels file are also written into the input
    directory, one tab-separated line (hash, file stem, label) per genome.
    Neither file is overwritten if it already exists.

    Returns 0 on completion.
    """
    logger = logging.getLogger(__name__)

    # Get list of FASTA files in the input directory
    logger.info("Scanning directory %s for FASTA files", args.indir)
    fpaths = pyani_files.get_fasta_paths(args.indir)
    logger.info(
        "Found FASTA files:\n%s", "\n".join([f"\t{fpath}" for fpath in fpaths])
    )

    # Lists of class/label information
    classes = []
    labels = []

    for fpath in fpaths:
        # Create MD5 hash for the file, if needed
        datahash = _get_or_create_hash(fpath, logger)

        # Parse the file and get the label/class information; classes and
        # labels start out identical and are meant to be edited by the user
        entry = "\t".join([datahash, fpath.stem, _get_first_label(fpath)])
        labels.append(entry)
        classes.append(entry)

    # Write class and label files, never clobbering files that already exist
    classfname = args.indir / args.classfname
    if classfname.exists():
        logger.warning("Class file %s exists, not overwriting", classfname)
    else:
        logger.info("Writing classes file to %s", classfname)
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.indir / args.labelfname
    if labelfname.exists():
        logger.warning("Labels file %s exists, not overwriting", labelfname)
    else:
        logger.info("Writing labels file to %s", labelfname)
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    return 0