Skip to content

Commit b6af251

Browse files
committed
JIT: Re-enable parallel compilation
1 parent a9b8c8f commit b6af251

File tree

5 files changed

+85
-17
lines changed

5 files changed

+85
-17
lines changed

FEXCore/Source/Interface/Context/Context.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -292,9 +292,6 @@ class ContextImpl final : public FEXCore::Context::Context, CPU::CodeBufferManag
292292
fextl::unique_ptr<FEXCore::Core::DebugData> DebugData;
293293
uint64_t StartAddr;
294294
uint64_t Length;
295-
// Lock for further CodeBuffer and LookupCache operations.
296-
// If empty, compilation was skipped since another thread already compiled the block.
297-
std::unique_lock<ForkableUniqueMutex> CodeBufferLock;
298295
};
299296
[[nodiscard]]
300297
CompileCodeResult CompileCode(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst = 0);

FEXCore/Source/Interface/Core/CPUBackend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ namespace CPU {
9595
uint8_t* BlockEntry;
9696
// The total size of the codeblock from [BlockBegin, BlockBegin+Size).
9797
size_t Size;
98+
99+
// Lock for further CodeBuffer and LookupCache operations.
100+
// If empty, compilation was skipped since another thread already compiled the block.
101+
std::unique_lock<ForkableUniqueMutex> CodeBufferLock;
98102
};
99103

100104
// Header that can live at the start of a JIT block.

FEXCore/Source/Interface/Core/Core.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,6 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
757757
.DebugData = nullptr, // nullptr here ensures that code serialization doesn't occur on a cache read
758758
.StartAddr = 0, // Unused
759759
.Length = 0, // Unused
760-
.CodeBufferLock {} // Unused
761760
};
762761
}
763762
}
@@ -783,16 +782,16 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
783782

784783
// Attempt to get the CPU backend to compile this code
785784

786-
auto Lock = std::unique_lock {CodeBufferWriteMutex};
787785

788786
// Re-check if another thread raced us in compiling this block.
789787
// We could lock CodeBufferWriteMutex earlier to prevent this from happening,
790788
// but this would increase lock contention. Redundant frontend runs aren't
791789
// as expensive and are easily reverted.
790+
// TODO: Instead, consider having a work queue and checking that instead?
792791
if (MaxInst != 1) {
793792
if (auto Block = Thread->LookupCache->FindBlock(GuestRIP)) {
794793
Thread->OpDispatcher->DelayedDisownBuffer();
795-
return {.CompiledCode = reinterpret_cast<void*>(Block), .DebugData = nullptr, .StartAddr = 0, .Length = 0, .CodeBufferLock {}};
794+
return {.CompiledCode = { reinterpret_cast<uint8_t*>(Block), {{ GuestRIP, reinterpret_cast<uint8_t*>(Block) }} }, .DebugData = nullptr, .StartAddr = 0, .Length = 0 };
796795
}
797796
}
798797

@@ -801,6 +800,13 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
801800
// Release the IR
802801
Thread->OpDispatcher->DelayedDisownBuffer();
803802

803+
// Post-processing huge multiblocks can still take a lot of time, so check *again* if we raced another thread for compilation
804+
if (MaxInst != 1) {
805+
if (auto Block = Thread->LookupCache->FindBlock(GuestRIP)) {
806+
return {.CompiledCode = { reinterpret_cast<uint8_t*>(Block), {{ GuestRIP, reinterpret_cast<uint8_t*>(Block) }} }, .DebugData = nullptr, .StartAddr = 0, .Length = 0 };
807+
}
808+
}
809+
804810
return {
805811
// FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint
806812
// In the future with code caching getting wired up, we will pass the rest of the data forward.
@@ -880,8 +886,9 @@ uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_
880886
}));
881887
}
882888

883-
// Clear any relocations that might have been generated
884-
Thread->CPUBackend->ClearRelocations();
889+
// TODO: Do we still need the CodeBufferWriteLock? Is it sufficient to trade it for the LookupCache lock?
890+
auto lk2 = Thread->LookupCache->Shared->AcquireLock();
891+
CompiledCode.CodeBufferLock = {};
885892

886893
if (IRCaptureCache.PostCompileCode(Thread, CodePtr, GuestRIP, StartAddr, Length, {}, DebugData.get(), false)) {
887894
// Early exit
@@ -892,6 +899,9 @@ uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_
892899
// Pages containing this block are added via AddBlockExecutableRange before each page gets accessed in the frontend
893900
Thread->LookupCache->AddBlockMapping(GuestRIP, CodePtr);
894901

902+
// Clear any relocations that might have been generated
903+
Thread->CPUBackend->ClearRelocations();
904+
895905
return (uintptr_t)CodePtr;
896906
}
897907

FEXCore/Source/Interface/Core/JIT/JIT.cpp

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -709,16 +709,44 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
709709
// Fairly excessive buffer range to make sure we don't overflow
710710
uint32_t BufferRange = SSACount * 16;
711711

712-
LOGMAN_THROW_A_FMT(CurrentCodeBuffer->LookupCache.get() == ThreadState->LookupCache->Shared, "INVARIANT VIOLATED: SharedLookupCache "
713-
"doesn't match up!\n");
714-
if (auto Prev = CheckCodeBufferUpdate()) {
715-
ThreadState->LookupCache->ChangeGuestToHostMapping(*Prev, *CurrentCodeBuffer->LookupCache);
716-
}
712+
auto RefreshCodeBuffer = [this, BufferRange](bool Align) {
713+
LOGMAN_THROW_A_FMT(CurrentCodeBuffer->LookupCache.get() == ThreadState->LookupCache->Shared, "INVARIANT VIOLATED: SharedLookupCache "
714+
"doesn't match up!\n");
715+
if (auto Prev = CheckCodeBufferUpdate()) {
716+
ThreadState->LookupCache->ChangeGuestToHostMapping(*Prev, *CurrentCodeBuffer->LookupCache);
717+
}
718+
719+
SetBuffer(CurrentCodeBuffer->Ptr, CurrentCodeBuffer->Size);
720+
SetCursorOffset(Align ? AlignUp(CodeBuffers.LatestOffset, 16) : CodeBuffers.LatestOffset);
721+
if ((GetCursorOffset() + BufferRange) > (CurrentCodeBuffer->Size - Utils::FEX_PAGE_SIZE)) {
722+
CTX->ClearCodeCache(ThreadState);
723+
}
724+
725+
if (Align) {
726+
Align16B();
727+
}
728+
729+
CodeBuffers.LatestOffset = GetCursorOffset();
730+
};
717731

718-
SetBuffer(CurrentCodeBuffer->Ptr, CurrentCodeBuffer->Size);
719-
SetCursorOffset(CodeBuffers.LatestOffset);
720-
if ((GetCursorOffset() + BufferRange) > (CurrentCodeBuffer->Size - Utils::FEX_PAGE_SIZE)) {
721-
CTX->ClearCodeCache(ThreadState);
732+
static thread_local std::unique_ptr<CodeBuffer> TempCodeBuffer;
733+
{
734+
// FEXCORE_PROFILE_SCOPED("AcquireLock1");
735+
// CodeData.CodeBufferLock = std::unique_lock { CodeBuffers.CodeBufferWriteMutex, std::try_to_lock };
736+
CodeData.CodeBufferLock = {};
737+
}
738+
if (CodeData.CodeBufferLock) {
739+
RefreshCodeBuffer(false);
740+
} else {
741+
// Another thread is holding the mutex for compiling, so this thread will compile to a
742+
// temporary buffer instead. We'll still need to wait for the mutex later (to relocate to
743+
// the main CodeBuffer), but we can do useful work in the meantime.
744+
auto DesiredSize = AlignUp(BufferRange, Utils::FEX_PAGE_SIZE) + Utils::FEX_PAGE_SIZE /* Guard area */;
745+
if (!TempCodeBuffer || TempCodeBuffer->Size < DesiredSize) {
746+
// TODO: Don't use CodeBuffer, since that will also allocate a LookupCache...
747+
TempCodeBuffer = std::make_unique<CodeBuffer>(DesiredSize);
748+
}
749+
SetBuffer(TempCodeBuffer->Ptr, TempCodeBuffer->Size);
722750
}
723751

724752
CodeData.BlockBegin = GetCursorAddress<uint8_t*>();
@@ -892,6 +920,32 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size
892920

893921
JITBlockTail->Size = CodeData.Size;
894922

923+
if (!CodeData.CodeBufferLock) {
924+
// We failed locking this mutex before, so we compiled to TempCodeBuffer instead.
925+
// Migrate the compile output to the actual CodeBuffer.
926+
{
927+
FEXCORE_PROFILE_SCOPED("AcquireLock2");
928+
CodeData.CodeBufferLock = std::unique_lock { CodeBuffers.CodeBufferWriteMutex };
929+
}
930+
931+
const auto TempSize = GetCursorOffset();
932+
933+
// NOTE: 16-byte alignment for block linking records must be preserved here
934+
RefreshCodeBuffer(true);
935+
936+
// Adjust host addresses
937+
const auto Delta = GetCursorAddress<uint8_t*>() - CodeData.BlockBegin;
938+
CodeBegin += Delta;
939+
CodeData.BlockBegin += Delta;
940+
for (auto& EntryPoint : CodeData.EntryPoints) {
941+
EntryPoint.second += Delta;
942+
}
943+
944+
// Copy over CodeBuffer contents
945+
memcpy(GetCursorAddress<uint8_t*>(), TempCodeBuffer->Ptr, TempSize);
946+
SetCursorOffset(CodeBuffers.LatestOffset + TempSize);
947+
}
948+
895949
CodeBuffers.LatestOffset = GetCursorOffset();
896950

897951
ClearICache(CodeData.BlockBegin, CodeOnlySize);

FEXCore/include/FEXCore/Utils/SignalScopeGuards.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ class ForkableUniqueMutex final {
3535
[[maybe_unused]] const auto Result = pthread_mutex_lock(&Mutex);
3636
LOGMAN_THROW_A_FMT(Result == 0, "{} failed to lock with {}", __func__, Result);
3737
}
38+
bool try_lock() {
39+
return pthread_mutex_trylock(&Mutex) == 0;
40+
}
3841
void unlock() {
3942
[[maybe_unused]] const auto Result = pthread_mutex_unlock(&Mutex);
4043
LOGMAN_THROW_A_FMT(Result == 0, "{} failed to unlock with {}", __func__, Result);

0 commit comments

Comments
 (0)