Skip to content

Commit d92efa3

Browse files
committed
Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (apache#12699)
1 parent a6d788e commit d92efa3

File tree

4 files changed

+131
-78
lines changed

4 files changed

+131
-78
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,7 @@
3030
import org.apache.lucene.util.automaton.Automaton;
3131
import org.apache.lucene.util.automaton.RunAutomaton;
3232
import org.apache.lucene.util.automaton.Transition;
33-
import org.apache.lucene.util.fst.ByteSequenceOutputs;
3433
import org.apache.lucene.util.fst.FST;
35-
import org.apache.lucene.util.fst.Outputs;
3634

3735
/**
3836
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
@@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
4644
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
4745

4846
final IndexInput in;
49-
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
5047

5148
IntersectTermsEnumFrame[] stack;
5249

@@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
6865

6966
private BytesRef savedStartTerm;
7067

68+
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
69+
new SegmentTermsEnum.OutputAccumulator();
70+
7171
// TODO: in some cases we can filter by length? eg
7272
// regexp foo*bar must be at least length 6 bytes
7373
public IntersectTermsEnum(
@@ -114,7 +114,6 @@ public IntersectTermsEnum(
114114
f.prefix = 0;
115115
f.setState(0);
116116
f.arc = arc;
117-
f.outputPrefix = arc.output();
118117
f.load(fr.rootCode);
119118

120119
// for assert:
@@ -184,22 +183,24 @@ private IntersectTermsEnumFrame pushFrame(int state) throws IOException {
184183
FST.Arc<BytesRef> arc = currentFrame.arc;
185184
int idx = currentFrame.prefix;
186185
assert currentFrame.suffix > 0;
187-
BytesRef output = currentFrame.outputPrefix;
186+
187+
outputAccumulator.reset();
188+
outputAccumulator.push(arc.output());
188189
while (idx < f.prefix) {
189190
final int target = term.bytes[idx] & 0xff;
190191
// TODO: we could be more efficient for the next()
191192
// case by using current arc as starting point,
192193
// passed to findTargetArc
193194
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
194195
assert arc != null;
195-
output = fstOutputs.add(output, arc.output());
196+
outputAccumulator.push(arc.output());
196197
idx++;
197198
}
198199

199200
f.arc = arc;
200-
f.outputPrefix = output;
201201
assert arc.isFinal();
202-
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
202+
outputAccumulator.push(arc.nextFinalOutput());
203+
f.load(outputAccumulator);
203204
return f;
204205
}
205206

lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
5555
int statsSingletonRunLength = 0;
5656
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
5757

58-
byte[] floorData = new byte[32];
5958
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
6059

6160
// Length of prefix shared by all terms in this block
@@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
9089

9190
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
9291

93-
// Cumulative output so far
94-
BytesRef outputPrefix;
95-
9692
int startBytePos;
9793
int suffix;
9894

@@ -120,7 +116,7 @@ void loadNextFloorBlock() throws IOException {
120116
}
121117
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
122118

123-
load(null);
119+
load((Long) null);
124120
}
125121

126122
public void setState(int state) {
@@ -142,12 +138,22 @@ public void setState(int state) {
142138
}
143139

144140
void load(BytesRef frameIndexData) throws IOException {
145-
if (frameIndexData != null) {
146-
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
147-
// Skip first long -- has redundant fp, hasTerms
148-
// flag, isFloor flag
149-
final long code = ite.fr.readVLongOutput(floorDataReader);
150-
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
141+
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
142+
load(ite.fr.readVLongOutput(floorDataReader));
143+
}
144+
145+
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
146+
outputAccumulator.prepareRead();
147+
long code = ite.fr.readVLongOutput(outputAccumulator);
148+
outputAccumulator.setFloorData(floorDataReader);
149+
load(code);
150+
}
151+
152+
void load(Long blockCode) throws IOException {
153+
if (blockCode != null) {
154+
// This block is the first one in a possible sequence of floor blocks corresponding to a
155+
// single seek point from the FST terms index
156+
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
151157
// Floor frame
152158
numFollowFloorBlocks = floorDataReader.readVInt();
153159
nextFloorLabel = floorDataReader.readByte() & 0xff;

0 commit comments

Comments
 (0)