Skip to content

Commit 85ce847

Browse files
committed
Expand can filter in each target cluster before expanding
1 parent ae4c7ab commit 85ce847

File tree

3 files changed

+61
-8
lines changed

3 files changed

+61
-8
lines changed

src/commons/Parameters.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ Parameters::Parameters():
104104
PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID, "--filter-hits", "Remove hits by seq. id. and coverage", "Filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT),
105105
PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by E-value (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
106106
// result2msa
107-
PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"),
107+
PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format\n6: A3M format w. alignment info", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"),
108108
PARAM_ALLOW_DELETION(PARAM_ALLOW_DELETION_ID, "--allow-deletion", "Allow deletions", "Allow deletions in a MSA", typeid(bool), (void *) &allowDeletion, ""),
109109
PARAM_SUMMARY_PREFIX(PARAM_SUMMARY_PREFIX_ID, "--summary-prefix", "Summary prefix", "Set the cluster summary prefix", typeid(std::string), (void *) &summaryPrefix, "", MMseqsParameter::COMMAND_EXPERT),
110110
PARAM_SKIP_QUERY(PARAM_SKIP_QUERY_ID, "--skip-query", "Skip query", "Skip the query sequence", typeid(bool), (void *) &skipQuery, "", MMseqsParameter::COMMAND_EXPERT),
@@ -272,6 +272,7 @@ Parameters::Parameters():
272272
PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"),
273273
// expandaln
274274
PARAM_EXPANSION_MODE(PARAM_EXPANSION_MODE_ID, "--expansion-mode", "Expansion mode", "Update score, E-value, and sequence identity by 0: input alignment 1: rescoring the inferred backtrace", typeid(int), (void *) &expansionMode, "^[0-2]{1}$"),
275+
PARAM_EXPAND_FILTER_CLUSTERS(PARAM_EXPAND_FILTER_CLUSTERS_ID, "--expand-filter-clusters", "Expand filter clusters", "Filter each target cluster during expansion 0: no filter 1: filter", typeid(int), (void *) &expandFilterClusters, "^[0-1]{1}$"),
275276
// taxonomy
276277
PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: single search LCA , 2/3: approximate 2bLCA, 4: top hit", typeid(int), (void *) &taxonomySearchMode, "^[1-4]{1}$"),
277278
PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment 2: output both", typeid(int), (void *) &taxonomyOutputMode, "^[0-2]{1}$"),
@@ -1084,6 +1085,13 @@ Parameters::Parameters():
10841085
expandaln.push_back(&PARAM_PC_MODE);
10851086
expandaln.push_back(&PARAM_PCA);
10861087
expandaln.push_back(&PARAM_PCB);
1088+
expandaln.push_back(&PARAM_EXPAND_FILTER_CLUSTERS);
1089+
expandaln.push_back(&PARAM_FILTER_MIN_ENABLE);
1090+
expandaln.push_back(&PARAM_FILTER_MAX_SEQ_ID);
1091+
expandaln.push_back(&PARAM_FILTER_QID);
1092+
expandaln.push_back(&PARAM_FILTER_QSC);
1093+
expandaln.push_back(&PARAM_FILTER_COV);
1094+
expandaln.push_back(&PARAM_FILTER_NDIFF);
10871095
expandaln.push_back(&PARAM_PRELOAD_MODE);
10881096
expandaln.push_back(&PARAM_COMPRESSED);
10891097
expandaln.push_back(&PARAM_THREADS);
@@ -1107,6 +1115,7 @@ Parameters::Parameters():
11071115
expand2profile.push_back(&PARAM_MASK_PROFILE);
11081116
expand2profile.push_back(&PARAM_WG);
11091117
expand2profile.push_back(&PARAM_ALLOW_DELETION);
1118+
expand2profile.push_back(&PARAM_EXPAND_FILTER_CLUSTERS);
11101119
expand2profile.push_back(&PARAM_FILTER_MSA);
11111120
expand2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
11121121
expand2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID);
@@ -2403,6 +2412,7 @@ void Parameters::setDefaults() {
24032412

24042413
// expandaln
24052414
expansionMode = EXPAND_TRANSFER_EVALUE;
2415+
expandFilterClusters = 0;
24062416

24072417
// taxonomy
24082418
taxonomySearchMode = Parameters::TAXONOMY_APPROX_2BLCA;

src/commons/Parameters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ class Parameters {
641641

642642
// exapandaln
643643
int expansionMode;
644+
int expandFilterClusters;
644645

645646
// taxonomy
646647
int taxonomySearchMode;
@@ -971,6 +972,7 @@ class Parameters {
971972

972973
// exapandaln
973974
PARAMETER(PARAM_EXPANSION_MODE)
975+
PARAMETER(PARAM_EXPAND_FILTER_CLUSTERS)
974976

975977
// taxonomy
976978
PARAMETER(PARAM_LCA_MODE)

src/util/expandaln.cpp

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
152152
BacktraceTranslator translator;
153153
SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
154154

155+
const bool filterBc = par.expandFilterClusters;
155156
EvalueComputation *evaluer = NULL;
156157
ProbabilityMatrix *probMatrix = NULL;
157158
if (returnAlnRes == false) {
@@ -169,24 +170,33 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
169170

170171
Sequence aSeq(par.maxSeqLen, aSeqDbType, &subMat, 0, false, par.compBiasCorrection);
171172
Sequence cSeq(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false);
173+
Sequence* bSeq = NULL;
174+
if (filterBc) {
175+
bSeq = new Sequence(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false);
176+
}
172177

173178
MultipleAlignment *aligner = NULL;
174179
MsaFilter *filter = NULL;
175180
PSSMCalculator *calculator = NULL;
176181
PSSMMasker *masker = NULL;
177182
std::vector<std::vector<unsigned char>> seqSet;
183+
std::vector<std::vector<unsigned char>> subSeqSet;
178184
std::string result;
179185

180-
if (returnAlnRes == false) {
186+
if (returnAlnRes == false || filterBc) {
181187
aligner = new MultipleAlignment(par.maxSeqLen, &subMat);
182-
if (par.filterMsa) {
188+
if (par.filterMsa || filterBc) {
183189
filter = new MsaFilter(par.maxSeqLen, 300, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid());
184190
}
191+
subSeqSet.reserve(300);
192+
}
193+
194+
if (returnAlnRes == false) {
185195
// TODO: is this right?
186196
calculator = new PSSMCalculator(&subMat, par.maxSeqLen, 300, par.pcmode, par.pca, par.pcb, par.gapOpen.values.aminoacid(), par.gapPseudoCount);
187197
masker = new PSSMMasker(par.maxSeqLen, *probMatrix, subMat);
188-
seqSet.reserve(300);
189198
result.reserve(par.maxSeqLen * Sequence::PROFILE_READIN_SIZE);
199+
seqSet.reserve(300);
190200
}
191201

192202
size_t compBufferSize = (par.maxSeqLen + 1) * sizeof(float);
@@ -249,6 +259,32 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
249259
// CompressedA3M::extractMatcherResults(key, resultsBc, resultBcReader.getData(bResId, thread_idx), resultBcReader.getEntryLen(bResId), *cReader, false);
250260
Matcher::readAlignmentResults(resultsBc, resultBcReader->getData(bResId, thread_idx), false);
251261

262+
if (filterBc) {
263+
for (size_t k = 0; k < resultsBc.size(); ++k) {
264+
Matcher::result_t &resultBc = resultsBc[k];
265+
if (resultBc.backtrace.size() == 0) {
266+
Debug(Debug::ERROR) << "Alignment must contain a backtrace\n";
267+
EXIT(EXIT_FAILURE);
268+
}
269+
if (k == 0) {
270+
unsigned int bSeqKey = resultAb.dbKey;
271+
size_t bSeqId = cReader->getId(bSeqKey);
272+
bSeq->mapSequence(bSeqId, bSeqKey, cReader->getData(bSeqId, thread_idx), cReader->getSeqLen(bSeqId));
273+
} else {
274+
unsigned int cSeqKey = resultBc.dbKey;
275+
size_t cSeqId = cReader->getId(cSeqKey);
276+
cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId));
277+
subSeqSet.emplace_back(cSeq.numSequence, cSeq.numSequence + cSeq.L);
278+
}
279+
}
280+
Matcher::result_t query = *(resultsBc.begin());
281+
resultsBc.erase(resultsBc.begin());
282+
MultipleAlignment::MSAResult res = aligner->computeMSA(bSeq, subSeqSet, resultsBc, true);
283+
filter->filter(res, resultsBc, (int)(par.covMSAThr * 100), qid_vec, par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff, par.filterMinEnable);
284+
resultsBc.insert(resultsBc.begin(), query);
285+
MultipleAlignment::deleteMSA(&res);
286+
subSeqSet.clear();
287+
}
252288
//std::stable_sort(resultsBc.begin(), resultsBc.end(), compareHitsByKeyScore);
253289

254290
for (size_t k = 0; k < resultsBc.size(); ++k) {
@@ -362,18 +398,23 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
362398
if (compositionBias != NULL) {
363399
free(compositionBias);
364400
}
365-
if (returnAlnRes == false) {
401+
if (aligner != NULL) {
366402
delete aligner;
367-
if (filter != NULL) {
368-
delete filter;
369-
}
403+
}
404+
if (filter != NULL) {
405+
delete filter;
406+
}
407+
if (returnAlnRes == false) {
370408
delete calculator;
371409
delete masker;
372410
}
373411
while(intervalBuffer.size()){
374412
delete intervalBuffer.top();
375413
intervalBuffer.pop();
376414
}
415+
if (bSeq != NULL) {
416+
delete bSeq;
417+
}
377418
}
378419

379420
writer.close(returnAlnRes == false);

0 commit comments

Comments
 (0)