Skip to content

Commit 64c314e

Browse files
mgaido91cloud-fan
authored andcommitted
[SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String
## What changes were proposed in this pull request? SPARK-10399 introduced a performance regression on the hash computation for UTF8String. The regression can be evaluated with the code attached in the JIRA. That code runs in about 120 us per method on my laptop (MacBook Pro 2.5 GHz Intel Core i7, RAM 16 GB 1600 MHz DDR3) while the code from branch 2.3 takes on the same machine about 45 us for me. After the PR, the code takes about 45 us on the master branch too. ## How was this patch tested? running the perf test from the JIRA Closes #22338 from mgaido91/SPARK-25317. Authored-by: Marco Gaido <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent d749d03 commit 64c314e

File tree

1 file changed

+14
-9
lines changed

1 file changed

+14
-9
lines changed

common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import com.google.common.primitives.Ints;
2121

22+
import org.apache.spark.unsafe.Platform;
2223
import org.apache.spark.unsafe.memory.MemoryBlock;
2324
import org.apache.spark.unsafe.types.UTF8String;
2425

@@ -59,7 +60,7 @@ public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) {
5960
// This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method.
6061
int lengthInBytes = Ints.checkedCast(base.size());
6162
assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)";
62-
int h1 = hashBytesByIntBlock(base, seed);
63+
int h1 = hashBytesByIntBlock(base, lengthInBytes, seed);
6364
return fmix(h1, lengthInBytes);
6465
}
6566

@@ -69,22 +70,27 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i
6970
}
7071

7172
public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
73+
return hashUnsafeBytesBlock(base, Ints.checkedCast(base.size()), seed);
74+
}
75+
76+
private static int hashUnsafeBytesBlock(MemoryBlock base, int lengthInBytes, int seed) {
7277
// This is not compatible with original and another implementations.
7378
// But remain it for backward compatibility for the components existing before 2.3.
74-
int lengthInBytes = Ints.checkedCast(base.size());
7579
assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
7680
int lengthAligned = lengthInBytes - lengthInBytes % 4;
77-
int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
81+
int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
82+
long offset = base.getBaseOffset();
83+
Object o = base.getBaseObject();
7884
for (int i = lengthAligned; i < lengthInBytes; i++) {
79-
int halfWord = base.getByte(i);
85+
int halfWord = Platform.getByte(o, offset + i);
8086
int k1 = mixK1(halfWord);
8187
h1 = mixH1(h1, k1);
8288
}
8389
return fmix(h1, lengthInBytes);
8490
}
8591

8692
public static int hashUTF8String(UTF8String str, int seed) {
87-
return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
93+
return hashUnsafeBytesBlock(str.getMemoryBlock(), str.numBytes(), seed);
8894
}
8995

9096
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
@@ -101,7 +107,7 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
101107
int lengthInBytes = Ints.checkedCast(base.size());
102108
assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
103109
int lengthAligned = lengthInBytes - lengthInBytes % 4;
104-
int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
110+
int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
105111
int k1 = 0;
106112
for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) {
107113
k1 ^= (base.getByte(i) & 0xFF) << shift;
@@ -110,11 +116,10 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
110116
return fmix(h1, lengthInBytes);
111117
}
112118

113-
private static int hashBytesByIntBlock(MemoryBlock base, int seed) {
114-
long lengthInBytes = base.size();
119+
private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) {
115120
assert (lengthInBytes % 4 == 0);
116121
int h1 = seed;
117-
for (long i = 0; i < lengthInBytes; i += 4) {
122+
for (int i = 0; i < lengthInBytes; i += 4) {
118123
int halfWord = base.getInt(i);
119124
int k1 = mixK1(halfWord);
120125
h1 = mixH1(h1, k1);

0 commit comments

Comments
 (0)