Skip to content

Commit 8ee5f90

Browse files
Feature: print a FASTA of marker genes for each bin
Added the ability to print a FASTA file (amino acid) that includes the sequences of the genes that were identified as marker genes in each bin. Feature is implemented in the qa subprogram as a new output format (number 10). Each sequence is printed with an informative header that takes the form: >$bin_id $contig_name geneId=$gene_num;start=$start;end=$end;strand=$strand marker=$marker;mstart=$mstart;mend=$mend Where $ denotes a variable. Variables: - bin_id: FASTA bin the sequence comes from - contig_name: contig the sequence comes from - gene_num: gene number as assigned by Prodigal - start: gene start position on contig (nucleotide) - end: gene end position on contig (nucleotide) - strand: strand the gene is on, 1 or -1 - marker: name of the marker that was identified as a match to the gene - mstart: start position on gene of alignment with marker (amino acid) - mend: end position on gene of alignment with marker (amino acid)
1 parent dd24811 commit 8ee5f90

File tree

3 files changed

+67
-8
lines changed

3 files changed

+67
-8
lines changed

checkm/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ def qa(self, options):
393393
)
394394

395395
self.logger.info('')
396-
RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)
396+
RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)
397397
RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)
398398

399399
if options.file != '':

checkm/resultsParser.py

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from checkm.hmmer import HMMERParser
3636

3737
from checkm.util.pfam import PFAM
38-
38+
from checkm.util.seqUtils import readFasta, writeFasta
3939

4040
class ResultsParser():
4141
"""Parse output of Prodigal+HMMER run and derived statistics."""
@@ -234,10 +234,12 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
234234
header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
235235
elif outputFormat == 9:
236236
header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
237+
elif outputFormat == 10:
238+
header = None
237239

238240
return header
239241

240-
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
242+
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
241243
# redirect output
242244
oldStdOut = reassignStdOut(outFile)
243245

@@ -265,7 +267,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
265267

266268
seqsReported = 0
267269
for binId in sorted(self.results.keys()):
268-
seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
270+
seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)
269271

270272
if outputFormat in [6, 7] and seqsReported == 0:
271273
print('[No marker genes satisfied the reporting criteria.]')
@@ -622,7 +624,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
622624

623625
return summary
624626

625-
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
627+
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
628+
626629
"""Print out information about bin."""
627630
if outputFormat == 1:
628631
selectedMarkerSet = binMarkerSets.selectedMarkerSet()
@@ -789,13 +792,66 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
789792
rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
790793
print(rowStr)
791794

795+
# Hunter Cameron, May 29, 2015 - print a fasta of marker genes
796+
elif outputFormat == 10:
797+
# tabular of bin_id, marker, contig_id
798+
799+
# check for the analyze folder for later use
800+
if anaFolder is None:
801+
raise ValueError("AnaFolder must not be None for outputFormat 10")
802+
803+
### build a dict to link target_names with marker gene alignment information
804+
markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
805+
hitInfo = {}
806+
for marker, hit_list in self.markerHits.items():
807+
if marker not in markerGenes:
808+
continue
809+
810+
for hit in hit_list:
811+
name = hit.target_name
812+
hitInfo[name] = {
813+
"marker": marker,
814+
"ali_from": hit.ali_from,
815+
"ali_to": hit.ali_to
816+
}
817+
818+
819+
### Open genes.faa and print the ones that were found with some descriptive info in the header
820+
path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
821+
for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems():
822+
elems = header.split(" # ")
823+
gene_name = elems[0]
824+
if gene_name in hitInfo:
825+
826+
# remove the gene number from Prodigal to get the original contig name
827+
contig_name, gene_num = gene_name.rsplit("_", 1)
828+
829+
# parse some info about the gene from the header line
830+
gene_start = elems[1]
831+
gene_end = elems[2]
832+
gene_strand = elems[3]
833+
834+
gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
835+
gene_num, gene_start, gene_end, gene_strand, str(len(seq)))
836+
837+
marker_info = "marker={};mstart={};mend={}".format(
838+
hitInfo[gene_name]["marker"],
839+
hitInfo[gene_name]["ali_from"],
840+
hitInfo[gene_name]["ali_to"])
841+
842+
# new header will be the bin name, contig name, gene info, and marker info separated by spaces
843+
new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
844+
845+
print(new_header, seq, sep="\n")
846+
847+
792848
else:
793849
self.logger.error("Unknown output format: %d", outputFormat)
794850

795851
return 0
796852

797853
'''
798-
elif outputFormat == 9:
854+
elif outputFormat == 10:
799855
markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
800856
801857
markersInScaffold = {}

checkm/util/seqUtils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import logging
2525

2626

27-
def readFasta(fastaFile):
27+
def readFasta(fastaFile, trimHeader=True):
2828
'''Read sequences from FASTA file.'''
2929
try:
3030
if fastaFile.endswith('.gz'):
@@ -39,7 +39,10 @@ def readFasta(fastaFile):
3939
continue
4040

4141
if line[0] == '>':
42-
seqId = line[1:].split(None, 1)[0]
42+
if trimHeader:
43+
seqId = line[1:].split(None, 1)[0]
44+
else:
45+
seqId = line[1:].rstrip()
4346
seqs[seqId] = []
4447
else:
4548
seqs[seqId].append(line[0:-1])

0 commit comments

Comments
 (0)