|
35 | 35 | from checkm.hmmer import HMMERParser |
36 | 36 |
|
37 | 37 | from checkm.util.pfam import PFAM |
38 | | - |
| 38 | +from checkm.util.seqUtils import readFasta, writeFasta |
39 | 39 |
|
40 | 40 | class ResultsParser(): |
41 | 41 | """Parse output of Prodigal+HMMER run and derived statistics.""" |
@@ -234,10 +234,12 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None): |
234 | 234 | header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}'] |
235 | 235 | elif outputFormat == 9: |
236 | 236 | header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids'] |
| 237 | + elif outputFormat == 10: |
| 238 | + header = None |
237 | 239 |
|
238 | 240 | return header |
239 | 241 |
|
240 | | - def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile): |
| 242 | + def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder): |
241 | 243 | # redirect output |
242 | 244 | oldStdOut = reassignStdOut(outFile) |
243 | 245 |
|
@@ -265,7 +267,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke |
265 | 267 |
|
266 | 268 | seqsReported = 0 |
267 | 269 | for binId in sorted(self.results.keys()): |
268 | | - seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable) |
| 270 | + seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder) |
269 | 271 |
|
270 | 272 | if outputFormat in [6, 7] and seqsReported == 0: |
271 | 273 | print('[No marker genes satisfied the reporting criteria.]') |
@@ -622,7 +624,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): |
622 | 624 |
|
623 | 625 | return summary |
624 | 626 |
|
625 | | - def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None): |
| 627 | + def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None): |
| 628 | + |
626 | 629 | """Print out information about bin.""" |
627 | 630 | if outputFormat == 1: |
628 | 631 | selectedMarkerSet = binMarkerSets.selectedMarkerSet() |
@@ -789,13 +792,66 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov |
789 | 792 | rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to) |
790 | 793 | print(rowStr) |
791 | 794 |
|
| 795 | + # Hunter Cameron, May 29, 2015 - print a fasta of marker genes |
| 796 | + elif outputFormat == 10: |
| 797 | + # tabular of bin_id, marker, contig_id |
| 798 | + |
| 799 | + # check for the analyze folder for later use |
| 800 | + if anaFolder is None: |
| 801 | + raise ValueError("AnaFolder must not be None for outputFormat 10") |
| 802 | + |
| 803 | + ### build a dict to link target_names with marker gene alignment information |
| 804 | + markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() |
| 805 | + hitInfo = {} |
| 806 | + for marker, hit_list in self.markerHits.items(): |
| 807 | + if marker not in markerGenes: |
| 808 | + continue |
| 809 | + |
| 810 | + for hit in hit_list: |
| 811 | + name = hit.target_name |
| 812 | + hitInfo[name] = { |
| 813 | + "marker": marker, |
| 814 | + "ali_from": hit.ali_from, |
| 815 | + "ali_to": hit.ali_to |
| 816 | + } |
| 817 | + |
| 818 | + |
| 819 | + ### Open genes.faa and print the ones that were found with some descriptive info in the header |
| 820 | + path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"]) |
| 821 | + for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems(): |
| 822 | + elems = header.split(" # ") |
| 823 | + gene_name = elems[0] |
| 824 | + if gene_name in hitInfo: |
| 825 | + |
| 826 | + # remove the gene number from Prodigal to get the original contig name |
| 827 | + contig_name, gene_num = gene_name.rsplit("_", 1) |
| 828 | + |
| 829 | + # parse some info about the gene from the header line |
| 830 | + gene_start = elems[1] |
| 831 | + gene_end = elems[2] |
| 832 | + gene_strand = elems[3] |
| 833 | + |
| 834 | + gene_info = "geneId={};start={};end={};strand={};protlen={}".format( |
| 835 | + gene_num, gene_start, gene_end, gene_strand, str(len(seq))) |
| 836 | + |
| 837 | + marker_info = "marker={};mstart={};mend={}".format( |
| 838 | + hitInfo[gene_name]["marker"], |
| 839 | + hitInfo[gene_name]["ali_from"], |
| 840 | + hitInfo[gene_name]["ali_to"]) |
| 841 | + |
| 842 | + # new header will be the bin name, contig name, gene info, and marker info separated by spaces |
| 843 | + new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info]) |
| 844 | + |
| 845 | + print(new_header, seq, sep="\n") |
| 846 | + |
| 847 | + |
792 | 848 | else: |
793 | 849 | self.logger.error("Unknown output format: %d", outputFormat) |
794 | 850 |
|
795 | 851 | return 0 |
796 | 852 |
|
797 | 853 | ''' |
798 | | - elif outputFormat == 9: |
| 854 | + elif outputFormat == 10: |
799 | 855 | markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() |
800 | 856 |
|
801 | 857 | markersInScaffold = {} |
|
0 commit comments