fix(rpc): Read RPC responses as binary data in Python (semgrep/semgrep-proprietary#5117)

TikhonJelvis · yosefAlsuhaibani · commit 3ab3b130c9d3 · 2025-12-07T13:38:39.000-08:00
Change the Python RPC implementation to read from the sub-process's output stream in bytes rather than Unicode characters. All of the IO is now measured in bytes, with explicit encoding/decoding steps to convert to and from text. The RPC format consists of a length in bytes followed by that many bytes of UTF-8-encoded text. However, the current (pre-change) Python implementation reads data from the process *as text* (`text=True` when starting the process), so `io.read(n)` counts in Unicode characters rather than bytes. When the RPC output includes non-ASCII characters, the number of bytes written in the message header is larger than the number of Unicode characters in the stream (for example, `é` is a single character but two bytes in UTF-8). This has not been a problem so far because we only run a single RPC call per process. After the RPC call we close the stream and send an EOF, so `io.read(n)` will read the whole string even if it has fewer than `n` characters. However, this caused a problem when I implemented running multiple RPC calls through a single long-lived process, because `io.read(n)` would block indefinitely if the stream did not contain at least `n` characters. This change fixes that problem. Test plan: ran the existing tests, and reproduced the problem and verified the fix on top of #5066. Synced from Pro d507ac7668dcccb43c12dc732a615866d53dc12b.
diff --git a/cli/src/semgrep/rpc.py b/cli/src/semgrep/rpc.py
@@ -59,7 +59,7 @@
 
 
 # Read `size` bytes from `io`. Returns fewer bytes if we hit EOF.
-def _really_read(io: IO[str], size: int) -> str:
+def _really_read(io: IO[bytes], size: int) -> str:
     # Operate on bytes, not str.
     out: bytes = b""
     while len(out) < size:
@@ -73,23 +73,23 @@ def _really_read(io: IO[str], size: int) -> str:
         # clear to me (nmote) whether it is guaranteed to be present on the
         # streams provided by subprocess.Popen. So, to be on the safe side,
         # we'll just do this ourselves.
-        new: str = io.read(size)
+        new: bytes = io.read(size)
         # This happens if we hit EOF. In that case, repeatedly reading will lead
         # to an infinite loop.
         if len(new) == 0:
             logger.error(f"0 bytes read from RPC input stream")
             break
-        out = out + new.encode(ENCODING)
+        out = out + new
     # When we read the RPC call for file targeting, we could encounter files
     # with non-utf8 characters, in that case we replace them with <?>
     # i.e abc.txt -> ab<?>.txt
     return out.decode(ENCODING, errors="replace")
 
 
-def _read_packet(io: IO[str]) -> Optional[str]:
+def _read_packet(io: IO[bytes]) -> Optional[str]:
     # Unlike `read`, `readline` is guaranteed to return a full line unless there
     # is an EOF
-    size_str = io.readline().strip()
+    size_str = io.readline().decode(ENCODING).strip()
     if not size_str.isdigit():
         # Avoid horrific log spew if we somehow got a really long line
         truncated = size_str[:50]
@@ -99,12 +99,12 @@ def _read_packet(io: IO[str]) -> Optional[str]:
     return _really_read(io, size)
 
 
-def _write_packet(io: IO[str], packet: str) -> None:
+def _write_packet(io: IO[bytes], packet: str) -> None:
     # Size in bytes
     size: int = len(packet.encode(ENCODING))
     size_str = str(size) + "\n"
-    io.write(size_str)
-    io.write(packet)
+    io.write(size_str.encode(ENCODING))
+    io.write(packet.encode(ENCODING))
     io.flush()
 
 
@@ -156,8 +156,7 @@ def rpc_call(call: out.FunctionCall, cls: Type[T]) -> Optional[T]:
         cmd,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
-        text=True,
-        encoding=ENCODING,
+        text=False,
     ) as proc:
         try:
             # These need to be local variables because otherwise mypy doesn't