Feature: print a FASTA of marker genes for each bin

hunter-cameron · hunter-cameron · commit 8ee5f90a8a1d · 2015-06-03T11:47:31.000-04:00
Added the ability to print a FASTA file (amino acid) that includes the sequences
of the genes that were identified as marker genes in each bin. Feature is
implemented in the qa subprogram as a new output format (number 10).

Each sequence is printed with an informative header that takes the form:

>$bin_id $contig_name geneId=$gene_num;start=$start;end=$end;strand=$strand;protlen=$protlen marker=$marker;mstart=$mstart;mend=$mend

Where $ denotes a variable.

	Variables:
	- bin_id: FASTA bin the sequence comes from
	- contig_name: contig the sequence comes from
	- gene_num: gene number as assigned by Prodigal
	- start: gene start position on contig (nucleotide)
	- end: gene end position on contig (nucleotide)
	- strand: strand the gene is on, 1 or -1
	- protlen: length of the printed protein sequence (amino acid)
	- marker: name of the marker that was identified as a match to the gene
	- mstart: start position on gene of alignment with marker (amino acid)
	- mend: end position on gene of alignment with marker (amino acid)
diff --git a/checkm/main.py b/checkm/main.py
@@ -393,7 +393,7 @@ def qa(self, options):
                           )
 
         self.logger.info('')
-        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)
+        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)
         RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)
 
         if options.file != '':
diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py
@@ -35,7 +35,7 @@
 from checkm.hmmer import HMMERParser
 
 from checkm.util.pfam import PFAM
-
+from checkm.util.seqUtils import readFasta, writeFasta
 
 class ResultsParser():
     """Parse output of Prodigal+HMMER run and derived statistics."""
@@ -234,10 +234,12 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
             header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
         elif outputFormat == 9:
             header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
+        elif outputFormat == 10:
+            header = None
 
         return header
 
-    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
+    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
         # redirect output
         oldStdOut = reassignStdOut(outFile)
 
@@ -265,7 +267,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
 
         seqsReported = 0
         for binId in sorted(self.results.keys()):
-            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
+            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)
 
         if outputFormat in [6, 7] and seqsReported == 0:
             print('[No marker genes satisfied the reporting criteria.]')
@@ -622,7 +624,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
 
         return summary
 
-    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
+    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
+
         """Print out information about bin."""
         if outputFormat == 1:
             selectedMarkerSet = binMarkerSets.selectedMarkerSet()
@@ -789,13 +792,66 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
                 print(rowStr)
 
+        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
+        elif outputFormat == 10:
+            # tabular of bin_id, marker, contig_id
+           
+            # check for the analyze folder for later use
+            if anaFolder is None:
+                raise ValueError("AnaFolder must not be None for outputFormat 10")
+
+            ### build a dict to link target_names with marker gene alignment information
+            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
+            hitInfo = {}
+            for marker, hit_list in self.markerHits.items():
+                if marker not in markerGenes:
+                    continue
+
+                for hit in hit_list:
+                    name = hit.target_name
+                    hitInfo[name] = {
+                            "marker": marker,
+                            "ali_from": hit.ali_from,
+                            "ali_to": hit.ali_to
+                            }
+
+            
+            ### Open genes.faa and print the ones that were found with some descriptive info in the header
+            path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
+            for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems():
+                elems = header.split(" # ")
+                gene_name = elems[0]
+                if gene_name in hitInfo:
+
+                    # remove the gene number from Prodigal to get the original contig name
+                    contig_name, gene_num = gene_name.rsplit("_", 1)
+
+                    # parse some info about the gene from the header line
+                    gene_start = elems[1]
+                    gene_end = elems[2]
+                    gene_strand = elems[3]
+
+                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
+                            gene_num, gene_start, gene_end, gene_strand, str(len(seq)))
+                    
+                    marker_info = "marker={};mstart={};mend={}".format(
+                            hitInfo[gene_name]["marker"], 
+                            hitInfo[gene_name]["ali_from"], 
+                            hitInfo[gene_name]["ali_to"])
+
+                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
+                    new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
+
+                    print(new_header, seq, sep="\n")
+            
+
         else:
             self.logger.error("Unknown output format: %d", outputFormat)
 
         return 0
 
         '''
-        elif outputFormat == 9:
+        elif outputFormat == 10:
             markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
 
             markersInScaffold = {}
diff --git a/checkm/util/seqUtils.py b/checkm/util/seqUtils.py
@@ -24,7 +24,7 @@
 import logging
 
 
-def readFasta(fastaFile):
+def readFasta(fastaFile, trimHeader=True):
     '''Read sequences from FASTA file.'''
     try:
         if fastaFile.endswith('.gz'):
@@ -39,7 +39,10 @@ def readFasta(fastaFile):
                 continue
 
             if line[0] == '>':
-                seqId = line[1:].split(None, 1)[0]
+                if trimHeader:
+                    seqId = line[1:].split(None, 1)[0]
+                else:
+                    seqId = line[1:].rstrip()
                 seqs[seqId] = []
             else:
                 seqs[seqId].append(line[0:-1])

Original file line number	Diff line number	Diff line change
`@@ -393,7 +393,7 @@ def qa(self, options):`
`393`	`393`	`)`
`394`	`394`
`395`	`395`	`self.logger.info('')`
`396`		`- RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)`
	`396`	`+ RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)`
`397`	`397`	`RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)`
`398`	`398`
`399`	`399`	`if options.file != '':`