Skip to content

Commit 9ae4458

Browse files
Add createclusearchdb, rm mkrepseqdb
1 parent 80f8b0b commit 9ae4458

File tree

7 files changed

+167
-143
lines changed

7 files changed

+167
-143
lines changed

src/CommandDeclarations.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ extern int mergedbs(int argc, const char **argv, const Command& command);
7676
extern int mergeresultsbyset(int argc, const char **argv, const Command &command);
7777
extern int msa2profile(int argc, const char **argv, const Command& command);
7878
extern int sequence2profile(int argc, const char **argv, const Command& command);
79-
extern int mkrepseqdb(int argc, const char **argv, const Command& command);
79+
extern int createclusearchdb(int argc, const char **argv, const Command& command);
8080
extern int msa2result(int argc, const char **argv, const Command& command);
8181
extern int multihitdb(int argc, const char **argv, const Command& command);
8282
extern int multihitsearch(int argc, const char **argv, const Command& command);

src/MMseqsBase.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,8 +1245,8 @@ std::vector<Command> baseCommands = {
12451245
"Martin Steinegger <[email protected]>",
12461246
"<i:sequenceDB> ",
12471247
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
1248-
{"mkrepseqdb", mkrepseqdb, &par.threadsandcompression, COMMAND_HIDDEN,
1249-
"Seperates a sequence DB into a representative and a non-representative DB",
1248+
{"createclusearchdb", createclusearchdb, &par.createclusearchdb, COMMAND_HIDDEN,
1249+
"Separates a sequence DB into a representative and a non-representative DB",
12501250
NULL,
12511251
"Martin Steinegger <[email protected]>",
12521252
"<i:sequenceDB> <i:resultDB> <o:sequenceDB>",

src/commons/Parameters.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,8 @@ Parameters::Parameters():
201201
PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
202202
PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
203203
PARAM_HEADER_SPLIT_MODE(PARAM_HEADER_SPLIT_MODE_ID, "--headers-split-mode", "Header split mode", "Header split mode: 0: split position, 1: original header", typeid(int), (void *) &headerSplitMode, "^[0-1]{1}$"),
204+
// createclusearchdb
205+
PARAM_DB_SUFFIX_LIST(PARAM_DB_SUFFIX_LIST_ID, "--db-suffix-list", "Database suffixes", "Suffixes for database to be split in rep/seq", typeid(std::string), (void *) &dbSuffixList, ""),
204206
// gff2db
205207
PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID, "--gff-type", "GFF type", "Comma separated list of feature types in the GFF file to select", typeid(std::string), (void *) &gffType, ""),
206208
// translatenucs
@@ -322,6 +324,12 @@ Parameters::Parameters():
322324
threadsandcompression.push_back(&PARAM_COMPRESSED);
323325
threadsandcompression.push_back(&PARAM_V);
324326
327+
// createclusearchdb
328+
createclusearchdb.push_back(&PARAM_THREADS);
329+
createclusearchdb.push_back(&PARAM_COMPRESSED);
330+
createclusearchdb.push_back(&PARAM_V);
331+
createclusearchdb.push_back(&PARAM_DB_SUFFIX_LIST);
332+
325333
// alignall
326334
alignall.push_back(&PARAM_SUB_MAT);
327335
alignall.push_back(&PARAM_ADD_BACKTRACE);

src/commons/Parameters.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,9 @@ class Parameters {
565565
// result2flat
566566
bool useHeader;
567567

568+
// createclusearchdb
569+
std::string dbSuffixList;
570+
568571
// gff2db
569572
std::string gffType;
570573

@@ -906,6 +909,9 @@ class Parameters {
906909
PARAMETER(PARAM_SEQUENCE_SPLIT_MODE)
907910
PARAMETER(PARAM_HEADER_SPLIT_MODE)
908911

912+
// createclusearchdb
913+
PARAMETER(PARAM_DB_SUFFIX_LIST)
914+
909915
// gff2db
910916
PARAMETER(PARAM_GFF_TYPE)
911917

@@ -1118,6 +1124,7 @@ class Parameters {
11181124
std::vector<MMseqsParameter*> summarizeresult;
11191125
std::vector<MMseqsParameter*> summarizetabs;
11201126
std::vector<MMseqsParameter*> extractdomains;
1127+
std::vector<MMseqsParameter*> createclusearchdb;
11211128
std::vector<MMseqsParameter*> extractalignedregion;
11221129
std::vector<MMseqsParameter*> convertkb;
11231130
std::vector<MMseqsParameter*> tsv2db;

src/util/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ set(util_source_files
3636
util/mergeclusters.cpp
3737
util/mergeresultsbyset.cpp
3838
util/mergedbs.cpp
39-
util/mkrepseqdb.cpp
39+
util/createclusterdb.cpp
4040
util/msa2profile.cpp
4141
util/msa2result.cpp
4242
util/nrtotaxmapping.cpp

src/util/createclusterdb.cpp

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#include "DBReader.h"
2+
#include "DBWriter.h"
3+
#include "Debug.h"
4+
#include "Util.h"
5+
#include "FastSort.h"
6+
#include "Parameters.h"
7+
8+
#ifdef OPENMP
9+
#include <omp.h>
10+
#endif
11+
12+
int createclusearchdb(int argc, const char **argv, const Command& command) {
13+
Parameters &par = Parameters::getInstance();
14+
par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN);
15+
DBReader<unsigned int> clusterReader(par.db2.c_str(), par.db2Index.c_str(), par.threads,
16+
DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
17+
clusterReader.open(DBReader<unsigned int>::NOSORT);
18+
std::vector<std::string> suffixes = Util::split(par.dbSuffixList, ",");
19+
suffixes.insert(suffixes.begin(), "");
20+
for(size_t prefix = 0; prefix < suffixes.size(); prefix++) {
21+
std::string db1 = par.db1 + suffixes[prefix];
22+
std::string db1Index = par.db1 + suffixes[prefix] + ".index";
23+
DBReader<unsigned int> reader(db1.c_str(), db1Index.c_str(), par.threads,
24+
DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
25+
reader.open(DBReader<unsigned int>::NOSORT);
26+
reader.readMmapedDataInMemory();
27+
28+
std::string repDbSeq = par.db3 + suffixes[prefix];
29+
std::string repDbSeqIdx = par.db3 + suffixes[prefix] + ".index";
30+
31+
DBWriter dbwRep(repDbSeq.c_str(), repDbSeqIdx.c_str(), static_cast<unsigned int>(par.threads), par.compressed,
32+
reader.getDbtype());
33+
dbwRep.open();
34+
std::string seqsDbSeq = par.db3 + "_seq" + suffixes[prefix];
35+
std::string seqsDbSeqIdx = par.db3 + "_seq" + suffixes[prefix] + ".index";
36+
DBWriter dbwClu(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), static_cast<unsigned int>(par.threads), par.compressed,
37+
reader.getDbtype());
38+
dbwClu.open();
39+
Debug::Progress progress(clusterReader.getSize());
40+
#pragma omp parallel
41+
{
42+
unsigned int thread_idx = 0;
43+
#ifdef OPENMP
44+
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
45+
#endif
46+
std::string resultBuffer;
47+
// write output file
48+
#pragma omp for schedule(dynamic, 1)
49+
for (size_t id = 0; id < clusterReader.getSize(); id++) {
50+
progress.updateProgress();
51+
char *data = clusterReader.getData(id, thread_idx);
52+
size_t repKey = clusterReader.getDbKey(id);
53+
size_t repDataId = reader.getId(repKey);
54+
size_t repEntryLen = reader.getEntryLen(repDataId);
55+
dbwRep.writeData(reader.getData(repDataId, thread_idx), repEntryLen - 1, repKey, thread_idx);
56+
while (*data != '\0') {
57+
// parse dbkey
58+
size_t dbKey = Util::fast_atoi<unsigned int>(data);
59+
if (dbKey == repKey) {
60+
data = Util::skipLine(data);
61+
continue;
62+
}
63+
size_t readerId = reader.getId(dbKey);
64+
dbwClu.writeData(reader.getData(readerId, thread_idx),
65+
reader.getEntryLen(readerId) - 1, dbKey, thread_idx);
66+
data = Util::skipLine(data);
67+
}
68+
resultBuffer.clear();
69+
}
70+
}
71+
dbwRep.close(true);
72+
dbwClu.close(true);
73+
reader.close();
74+
75+
// merge index
76+
DBReader<unsigned int> dbrRep(repDbSeq.c_str(), repDbSeqIdx.c_str(), par.threads,
77+
DBReader<unsigned int>::USE_INDEX);
78+
dbrRep.open(DBReader<unsigned int>::NOSORT);
79+
DBReader<unsigned int> dbrClu(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), par.threads,
80+
DBReader<unsigned int>::USE_INDEX);
81+
dbrClu.open(DBReader<unsigned int>::NOSORT);
82+
std::string seqsDbSeqIdxTmp = seqsDbSeqIdx + "_tmp";
83+
84+
FILE *sIndex = FileUtil::openAndDelete(seqsDbSeqIdxTmp.c_str(), "w");
85+
std::vector<DBReader<unsigned int>::Index> allIndex(dbrClu.getSize() + dbrRep.getSize());
86+
size_t dataSize = 0;
87+
for (size_t i = 0; i < dbrRep.getSize(); i++) {
88+
allIndex[i] = *dbrRep.getIndex(i);
89+
dataSize += allIndex[i].length;
90+
}
91+
for (size_t i = 0; i < dbrClu.getSize(); i++) {
92+
DBReader<unsigned int>::Index *index = dbrClu.getIndex(i);
93+
index->offset += dataSize;
94+
allIndex[dbrRep.getSize() + i] = *index;
95+
}
96+
SORT_PARALLEL(allIndex.begin(), allIndex.end(), DBReader<unsigned int>::Index::compareById);
97+
char buffer[1024];
98+
for (size_t i = 0; i < allIndex.size(); i++) {
99+
size_t len = DBWriter::indexToBuffer(buffer, allIndex[i].id, allIndex[i].offset, allIndex[i].length);
100+
size_t written = fwrite(buffer, sizeof(char), len, sIndex);
101+
if (written != len) {
102+
Debug(Debug::ERROR) << "Cannot write index file " << seqsDbSeqIdxTmp << "\n";
103+
EXIT(EXIT_FAILURE);
104+
}
105+
}
106+
if (fclose(sIndex) != 0) {
107+
Debug(Debug::ERROR) << "Cannot close index file " << seqsDbSeqIdxTmp << "\n";
108+
EXIT(EXIT_FAILURE);
109+
}
110+
FileUtil::move(seqsDbSeqIdxTmp.c_str(), seqsDbSeqIdx.c_str());
111+
FileUtil::symlinkAlias(repDbSeq, seqsDbSeq + ".0");
112+
FileUtil::move(seqsDbSeq.c_str(), (seqsDbSeq + ".1").c_str());
113+
}
114+
clusterReader.close();
115+
DBReader<unsigned int>::copyDb(par.db2, par.db3 + "_clu");
116+
117+
struct DBSuffix {
118+
DBFiles::Files flag;
119+
const char *suffix;
120+
};
121+
122+
const DBSuffix suffices[] = {
123+
{DBFiles::HEADER, "_h"},
124+
{DBFiles::HEADER_INDEX, "_h.index"},
125+
{DBFiles::HEADER_DBTYPE, "_h.dbtype"},
126+
{DBFiles::LOOKUP, ".lookup"},
127+
{DBFiles::SOURCE, ".source"},
128+
{DBFiles::TAX_MAPPING, "_mapping"},
129+
{DBFiles::TAX_NAMES, "_names.dmp"},
130+
{DBFiles::TAX_NODES, "_nodes.dmp"},
131+
{DBFiles::TAX_MERGED, "_merged.dmp"},
132+
{DBFiles::TAX_MERGED, "_taxonomy"},
133+
};
134+
135+
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
136+
std::string file = par.db1 + suffices[i].suffix;
137+
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
138+
DBReader<unsigned int>::copyDb(file, par.db3 + suffices[i].suffix);
139+
}
140+
}
141+
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
142+
std::string file = par.db3 + suffices[i].suffix;
143+
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
144+
DBReader<unsigned int>::aliasDb(file, par.db3 + "_seq" + suffices[i].suffix);
145+
}
146+
}
147+
return EXIT_SUCCESS;
148+
}

src/util/mkrepseqdb.cpp

Lines changed: 0 additions & 139 deletions
This file was deleted.

0 commit comments

Comments
 (0)