#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2017-2019
# (c) University of Strathclyde 2019-2021
# Author: Leighton Pritchard
#
# Contact:
# leighton.pritchard@strath.ac.uk
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# Cathedral Street,
# Glasgow,
# G1 1XQ
# Scotland,
# UK
#
# The MIT License
#
# Copyright (c) 2017-2019 The James Hutton Institute
# Copyright (c) 2019-2021 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Provides the index subcommand for pyani."""
import logging
import os
from argparse import Namespace
from pathlib import Path
from Bio import SeqIO
from pyani import download, pyani_files
def subcmd_index(args: Namespace) -> int:
    """Index the genomes in an input directory with MD5 hashes.

    :param args:  Namespace, received command-line arguments. This function
        reads ``args.indir`` (input directory; joined to filenames with
        ``/``, so it is expected to be a ``pathlib.Path``),
        ``args.classfname`` and ``args.labelfname`` (output filenames,
        created inside ``args.indir``)
    :returns: 0 on completion
    :raises IndexError: if a genome file contains no FASTA records

    Identify the genome files in the input directory, and make sure each one
    has a companion hash file, named by appending ``.md5`` to the genome
    filename, so that <genome>.fna produces <genome>.fna.md5. An existing
    hash file is reused, not recalculated.

    The genome files are found by ``pyani_files.get_fasta_paths()``.

    A classes file and a labels file are then written to the input
    directory, one tab-separated line per genome: hash, filename stem, and
    a label taken from the description of the first sequence in the file.
    Existing classes/labels files are never overwritten.
    """
    logger = logging.getLogger(__name__)

    # Get list of FASTA files in the input directory
    logger.info("Scanning directory %s for FASTA files", args.indir)
    fpaths = pyani_files.get_fasta_paths(args.indir)
    logger.info(
        "Found FASTA files:\n%s", "\n".join(f"\t{fpath}" for fpath in fpaths)
    )

    # Lists of class/label information
    classes = []
    labels = []

    # Create MD5 hash for each file, if needed
    for fpath in fpaths:
        hashfname = fpath.with_name(f"{fpath.name}.md5")
        if hashfname.is_file():
            logger.info("%s already indexed (using existing hash)", fpath)
            # The hash is the first whitespace-delimited field on line one
            with open(hashfname, "r") as ifh:
                datahash = ifh.readline().split()[0]
        else:
            # Write an .md5 hash file
            datahash = download.create_hash(fpath)
            logger.info("Writing hash to %s", hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write(f"{datahash}\t{fpath}\n")

        # Parse the file and get the label/class information. Only the first
        # record is needed, so stop parsing as soon as it has been read
        # rather than loading every sequence in the genome into memory.
        with fpath.open("r") as sfh:
            first_record = next(SeqIO.parse(sfh, "fasta"), None)
        if first_record is None:
            raise IndexError(f"No FASTA sequences found in {fpath}")

        # Drop the leading token (up to the first space) of the description;
        # a description without spaces is used unchanged.
        label = first_record.description.split(" ", 1)[-1]

        # Classes and labels currently share the same default content
        entry = "\t".join([datahash, fpath.stem, label])
        labels.append(entry)
        classes.append(entry)

    # Write class and label files
    classfname = args.indir / args.classfname
    logger.info("Writing classes file to %s", classfname)
    if classfname.exists():
        logger.warning("Class file %s exists, not overwriting", classfname)
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.indir / args.labelfname
    logger.info("Writing labels file to %s", labelfname)
    if labelfname.exists():
        logger.warning("Labels file %s exists, not overwriting", labelfname)
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    return 0