Skip to content

Commit 1514015

Browse files
Rework pairaln to support different pairing modes. Add support for dummy sequences to result2msa
1 parent 12ba202 commit 1514015

File tree

5 files changed

+128
-19
lines changed

5 files changed

+128
-19
lines changed

src/alignment/MultipleAlignment.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,9 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
105105
unsigned int queryPos = result.qStartPos;
106106
unsigned int targetPos = result.dbStartPos;
107107
// HACK: score was 0 and sequence was rejected, so we fill in an empty gap sequence
108+
// Needed for pairaln with dummy sequences
108109
if(targetPos == UINT_MAX) {
109-
Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
110+
//Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
110111
// fill up with gaps
111112
for(size_t pos = 0; pos < centerSeqSize; pos++){
112113
edgeSeqMSA[pos] = '-';

src/commons/Parameters.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,9 @@ Parameters::Parameters():
270270
// aggregatetax
271271
PARAM_MAJORITY(PARAM_MAJORITY_ID, "--majority", "Majority threshold", "minimal fraction of agreement among taxonomically assigned sequences of a set", typeid(float), (void *) &majorityThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$"),
272272
PARAM_VOTE_MODE(PARAM_VOTE_MODE_ID, "--vote-mode", "Vote mode", "Mode of assigning weights to compute majority. 0: uniform, 1: minus log E-value, 2: score", typeid(int), (void *) &voteMode, "^[0-2]{1}$"),
273+
// pairaln
274+
PARAM_PAIRING_DUMMY_MODE(PARAM_PAIRING_DUMMY_MODE_ID, "--pairing-dummy-mode", "Include dummy pairing", "0: dont include, 1: include - an entry that will cause result2msa to write a gap only line", typeid(int), (void *) &pairdummymode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
275+
PARAM_PAIRING_MODE(PARAM_PAIRING_MODE_ID, "--pairing-mode", "Pairing mode", "0: pair maximal per species, 1: pair only if all chains are covered per species", typeid(int), (void *) &pairmode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
273276
// taxonomyreport
274277
PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"),
275278
// createtaxdb
@@ -1191,6 +1194,8 @@ Parameters::Parameters():
11911194
expand2profile.push_back(&PARAM_V);
11921195
11931196
pairaln.push_back(&PARAM_PRELOAD_MODE);
1197+
pairaln.push_back(&PARAM_PAIRING_DUMMY_MODE);
1198+
pairaln.push_back(&PARAM_PAIRING_MODE);
11941199
pairaln.push_back(&PARAM_COMPRESSED);
11951200
pairaln.push_back(&PARAM_THREADS);
11961201
pairaln.push_back(&PARAM_V);
@@ -2512,6 +2517,10 @@ void Parameters::setDefaults() {
25122517
majorityThr = 0.5;
25132518
voteMode = AGG_TAX_MINUS_LOG_EVAL;
25142519

2520+
// pairaln
2521+
pairdummymode = PAIRALN_DUMMY_MODE_OFF;
2522+
pairmode = PAIRALN_MODE_ALL_PER_SPECIES;
2523+
25152524
// taxonomyreport
25162525
reportMode = 0;
25172526

src/commons/Parameters.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,14 @@ class Parameters {
213213
static const int AGG_TAX_MINUS_LOG_EVAL = 1;
214214
static const int AGG_TAX_SCORE = 2;
215215

216+
// pairaln dummy mode
217+
static const int PAIRALN_DUMMY_MODE_OFF = 0;
218+
static const int PAIRALN_DUMMY_MODE_ON = 1;
219+
220+
// pairaln mode
221+
static const int PAIRALN_MODE_ALL_PER_SPECIES = 0;
222+
static const int PAIRALN_MODE_COVER_ALL_CHAINS = 1;
223+
216224
// taxonomy search strategy
217225
static const int TAXONOMY_SINGLE_SEARCH = 1;
218226
static const int TAXONOMY_2BLCA = 2;
@@ -644,6 +652,10 @@ class Parameters {
644652
float majorityThr;
645653
int voteMode;
646654

655+
// pairaln
656+
int pairdummymode;
657+
int pairmode;
658+
647659
// taxonomyreport
648660
int reportMode;
649661

@@ -983,6 +995,10 @@ class Parameters {
983995
PARAMETER(PARAM_MAJORITY)
984996
PARAMETER(PARAM_VOTE_MODE)
985997

998+
// pairaln
999+
PARAMETER(PARAM_PAIRING_DUMMY_MODE)
1000+
PARAMETER(PARAM_PAIRING_MODE)
1001+
9861002
// taxonomyreport
9871003
PARAMETER(PARAM_REPORT_MODE)
9881004

src/util/pairaln.cpp

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,30 @@ int pairaln(int argc, const char **argv, const Command& command) {
5757
std::vector<Matcher::result_t> result;
5858
result.reserve(100000);
5959
std::unordered_map<unsigned int, size_t> findPair;
60+
std::vector<unsigned int> taxonToPair;
6061
std::string output;
6162
output.reserve(100000);
63+
bool hasBacktrace = false;
6264
#pragma omp for schedule(dynamic, 1)
63-
for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) {
65+
for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) {
6466
char buffer[1024 + 32768 * 4];
6567
findPair.clear();
68+
taxonToPair.clear();
6669
progress.updateProgress();
6770

71+
unsigned int minResultDbKey = UINT_MAX;
6872
// find intersection between all proteins
6973
for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) {
7074
result.clear();
7175
size_t id = fileToIds[fileNumber][i];
7276
Matcher::readAlignmentResults(result, alnDbr.getData(id, thread_idx), true);
77+
7378
for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) {
79+
hasBacktrace = result[resIdx].backtrace.size() > 0;
7480
unsigned int taxon = mapping->lookup(result[resIdx].dbKey);
7581
// we don't want to introduce a new field, reuse existing unused field here
7682
result[resIdx].dbOrfStartPos = taxon;
83+
minResultDbKey = std::min(result[resIdx].dbKey, minResultDbKey);
7784
}
7885
std::stable_sort(result.begin(), result.end(), compareByTaxId);
7986
unsigned int prevTaxon = UINT_MAX;
@@ -92,6 +99,16 @@ int pairaln(int argc, const char **argv, const Command& command) {
9299
}
93100
}
94101

102+
// fill taxonToPair vector
103+
std::unordered_map<unsigned int, size_t>::iterator it;
104+
for (it = findPair.begin(); it != findPair.end(); ++it) {
105+
int thresholdToPair = (par.pairmode == Parameters::PAIRALN_MODE_ALL_PER_SPECIES) ? 1 : fileToIds[fileNumber].size() - 1;
106+
if (it->second > thresholdToPair) {
107+
taxonToPair.emplace_back(it->first);
108+
}
109+
}
110+
std::sort(taxonToPair.begin(), taxonToPair.end());
111+
95112
for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) {
96113
result.clear();
97114
output.clear();
@@ -103,21 +120,39 @@ int pairaln(int argc, const char **argv, const Command& command) {
103120
// we don't want to introduce a new field, reuse existing unused field here
104121
result[resIdx].dbOrfStartPos = taxon;
105122
}
123+
106124
// stable sort is required to assure that best hit is first per taxon
107125
std::stable_sort(result.begin(), result.end(), compareByTaxId);
108126
unsigned int prevTaxon = UINT_MAX;
109-
for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) {
110-
unsigned int taxon = result[resIdx].dbOrfStartPos;
111-
// found pair!
112-
if (taxon != prevTaxon
113-
&& findPair.find(taxon) != findPair.end()
114-
&& findPair[taxon] == fileToIds[fileNumber].size()) {
115-
bool hasBacktrace = result[resIdx].backtrace.size() > 0;
116-
size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false);
127+
// iterate over taxonToPair
128+
size_t resIdxStart = 0;
129+
for(size_t taxonInList : taxonToPair) {
130+
bool taxonFound = false;
131+
for (size_t resIdx = resIdxStart; resIdx < result.size(); ++resIdx) {
132+
unsigned int taxon = result[resIdx].dbOrfStartPos;
133+
// check if this taxon has enough information to pair
134+
if(taxonInList != taxon){
135+
continue;
136+
}
137+
bool bestTaxonHit = (taxon != prevTaxon);
138+
taxonFound = true;
139+
if(bestTaxonHit){
140+
size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false);
141+
output.append(buffer, len);
142+
resIdxStart = resIdx + 1;
143+
break;
144+
}
145+
prevTaxon = taxon;
146+
}
147+
if(taxonFound == false && par.pairdummymode == Parameters::PAIRALN_DUMMY_MODE_ON){ // par.addDummyPairedAlignment
148+
// write an Matcher::result_t with UINT_MAX as dbKey
149+
Matcher::result_t emptyResult(minResultDbKey, 1, 1, 0, 1, 0,
150+
0, UINT_MAX, 0, 0, UINT_MAX, 0, 0, "1M");
151+
size_t len = Matcher::resultToBuffer(buffer, emptyResult, hasBacktrace, false, false);
117152
output.append(buffer, len);
118153
}
119-
prevTaxon = taxon;
120154
}
155+
121156
resultWriter.writeData(output.c_str(), output.length(), alnDbr.getDbKey(id), thread_idx);
122157
}
123158
}

src/util/result2msa.cpp

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,9 +324,21 @@ int result2msa(int argc, const char **argv, const Command &command) {
324324
header = targetHeaderReader->getData(id, thread_idx);
325325
length = targetHeaderReader->getEntryLen(id) - 1;
326326
}
327+
bool isOnlyGap = true;
328+
for (size_t pos = 0; pos < res.centerLength; pos++) {
329+
char aa = res.msaSequence[i][pos];
330+
if (aa != MultipleAlignment::GAP) {
331+
isOnlyGap = false;
332+
break;
333+
}
334+
}
327335

328336
result.append(1, '>');
329-
result.append(header, length);
337+
if(isOnlyGap) {
338+
result.append("DUMMY\n");
339+
}else{
340+
result.append(header, length);
341+
}
330342
// need to allow insertion in the centerSequence
331343
for (size_t pos = 0; pos < res.centerLength; pos++) {
332344
char aa = res.msaSequence[i][pos];
@@ -353,13 +365,31 @@ int result2msa(int argc, const char **argv, const Command &command) {
353365
continue;
354366
}
355367

356-
char *header;
368+
const char *header;
369+
bool isOnlyGap = true;
370+
for (size_t pos = 0; pos < res.centerLength; pos++) {
371+
char aa = res.msaSequence[i][pos];
372+
if (aa != MultipleAlignment::GAP) {
373+
isOnlyGap = false;
374+
break;
375+
}
376+
}
377+
378+
357379
if (i == 0) {
358-
header = centerSequenceHeader;
380+
if(isOnlyGap) {
381+
header = "DUMMY";
382+
}else{
383+
header = centerSequenceHeader;
384+
}
359385
} else {
360-
unsigned int key = seqKeys[i - 1];
361-
size_t id = targetHeaderReader->getId(key);
362-
header = targetHeaderReader->getData(id, thread_idx);
386+
if(isOnlyGap) {
387+
header = "DUMMY";
388+
}else {
389+
unsigned int key = seqKeys[i - 1];
390+
size_t id = targetHeaderReader->getId(key);
391+
header = targetHeaderReader->getData(id, thread_idx);
392+
}
363393
}
364394
accession = Util::parseFastaHeader(header);
365395

@@ -386,12 +416,30 @@ int result2msa(int argc, const char **argv, const Command &command) {
386416
}
387417

388418
result.push_back('>');
419+
bool isOnlyGap = true;
420+
for (size_t pos = 0; pos < res.centerLength; pos++) {
421+
char aa = res.msaSequence[i][pos];
422+
if (aa != MultipleAlignment::GAP) {
423+
isOnlyGap = false;
424+
break;
425+
}
426+
}
427+
428+
389429
if (i == 0) {
390-
result.append(Util::parseFastaHeader(centerSequenceHeader));
430+
if(isOnlyGap){
431+
result.append("DUMMY");
432+
}else{
433+
result.append(Util::parseFastaHeader(centerSequenceHeader));
434+
}
391435
} else {
392436
unsigned int key = seqKeys[i - 1];
393437
size_t id = targetHeaderReader->getId(key);
394-
result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx)));
438+
if(isOnlyGap){
439+
result.append("DUMMY");
440+
}else {
441+
result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx)));
442+
}
395443
if (par.msaFormatMode == Parameters::FORMAT_MSA_A3M_ALN_INFO) {
396444
size_t len = Matcher::resultToBuffer(buffer, alnResults[i - 1], false);
397445
char* data = buffer;

0 commit comments

Comments
 (0)