Skip to content

Commit 34873ec

Browse files
authored
Add a script to fuzz the parser (courtesy of pysource-codegen) (#11015)
1 parent d3cd61f commit 34873ec

3 files changed

Lines changed: 271 additions & 0 deletions

File tree

scripts/fuzz-parser/fuzz.py

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
"""
2+
Run the parser on randomly generated (but syntactically valid) Python source-code files.
3+
4+
To install all dependencies for this script into an environment using `uv`, run:
5+
uv pip install -r scripts/fuzz-parser/requirements.txt
6+
7+
Example invocations of the script:
8+
- Run the fuzzer using seeds 0, 1, 2, 78 and 93 to generate the code:
9+
`python scripts/fuzz-parser/fuzz.py 0-2 78 93`
10+
- Run the fuzzer concurrently using seeds in range 0-10 inclusive,
11+
but only reporting bugs that are new on your branch:
12+
`python scripts/fuzz-parser/fuzz.py 0-10 --only-new-bugs`
13+
- Run the fuzzer concurrently on 10,000 different Python source-code files,
14+
and only print a summary at the end:
15+
`python scripts/fuzz-parser/fuzz.py 1-10000 --quiet`
16+
17+
N.B. The script takes a few seconds to get started, as the script needs to compile
18+
your checked out version of ruff with `--release` as a first step before it
19+
can actually start fuzzing.
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import argparse
25+
import concurrent.futures
26+
import subprocess
27+
from dataclasses import KW_ONLY, dataclass
28+
from typing import NewType
29+
30+
from pysource_codegen import generate as generate_random_code
31+
from pysource_minimize import minimize as minimize_repro
32+
from termcolor import colored
33+
34+
# Source code that reproduces a bug; produced by `fuzz_code`, which tries
# to minimize the repro before wrapping it in this type.
MinimizedSourceCode = NewType("MinimizedSourceCode", str)
# Integer seed handed to the random code generator.
Seed = NewType("Seed", int)
36+
37+
38+
def run_ruff(executable_args: list[str], code: str) -> subprocess.CompletedProcess[str]:
    """Run a Ruff executable over `code`, checking only rule E999.

    `executable_args` is the command prefix used to launch Ruff
    (for example `["ruff"]` or `["cargo", "run", "--release", "--"]`).
    The source code is supplied to the process on its standard input, and
    stdout/stderr are captured as text on the returned `CompletedProcess`.
    """
    # The caching is disabled so that every invocation really re-checks the code.
    command = list(executable_args)
    command += ["check", "--select=E999", "--no-cache", "-"]
    return subprocess.run(command, input=code, text=True, capture_output=True)
45+
46+
47+
def contains_bug(code: str, *, only_new_bugs: bool = False) -> bool:
    """Report whether `code` makes the checked-out Ruff exit with an error.

    Ruff is built and run from the current checkout via `cargo run --release`.
    A non-zero exit status counts as a bug.

    When `only_new_bugs` is true, a failure on the checkout is only reported
    if the `ruff` executable found on the PATH handles the same code
    successfully, i.e. the bug exists on your branch but *not* in the
    installed release.
    """
    fails_on_branch = run_ruff(["cargo", "run", "--release", "--"], code).returncode != 0
    # The installed Ruff only needs to be consulted when the branch fails
    # *and* the caller asked us to filter out pre-existing bugs.
    if not (fails_on_branch and only_new_bugs):
        return fails_on_branch
    return run_ruff(["ruff"], code).returncode == 0
62+
63+
64+
@dataclass(slots=True)
65+
class FuzzResult:
66+
# The seed used to generate the random Python file.
67+
# The same seed always generates the same file.
68+
seed: Seed
69+
# If we found a bug, this will be the minimum Python code
70+
# required to trigger the bug. If not, it will be `None`.
71+
maybe_bug: MinimizedSourceCode | None
72+
73+
def print_description(self) -> None:
74+
"""Describe the results of fuzzing the parser with this seed."""
75+
if self.maybe_bug:
76+
print(colored(f"Ran fuzzer on seed {self.seed}", "red"))
77+
print(colored("The following code triggers a bug:", "red"))
78+
print()
79+
print(self.maybe_bug)
80+
print()
81+
else:
82+
print(colored(f"Ran fuzzer successfully on seed {self.seed}", "green"))
83+
84+
85+
def fuzz_code(seed: Seed, only_new_bugs: bool) -> FuzzResult:
    """Return a `FuzzResult` instance describing the fuzzing result from this seed.

    Random source code is generated from `seed` and checked with
    `contains_bug`. If it triggers a bug, the code is shrunk to a smaller
    reproduction before being stored on the result; otherwise the result's
    `maybe_bug` is `None`.

    `only_new_bugs` is forwarded to every `contains_bug` call, including the
    ones made while minimizing, so that the minimized repro is still a bug
    of the same kind as the one originally detected.
    """
    code = generate_random_code(seed)
    if not contains_bug(code, only_new_bugs=only_new_bugs):
        return FuzzResult(seed, None)

    def still_triggers_bug(candidate: str) -> bool:
        # The minimizer calls its checker with the candidate source only,
        # so bind `only_new_bugs` here rather than passing `contains_bug`
        # directly (which would silently fall back to `only_new_bugs=False`).
        return contains_bug(candidate, only_new_bugs=only_new_bugs)

    try:
        new_code = minimize_repro(code, still_triggers_bug)
    except ValueError:
        # `pysource_minimize.minimize()` sometimes raises `ValueError` internally.
        # Just ignore it if so, and use the original generated code;
        # minimizing the repro is a nice-to-have, but isn't crucial.
        new_code = code
    return FuzzResult(seed, MinimizedSourceCode(new_code))
98+
99+
100+
def run_fuzzer_concurrently(args: ResolvedCliArgs) -> list[FuzzResult]:
    """Fuzz every seed in `args.seeds` using a pool of worker processes.

    Results are handled in completion order. Each one is described on the
    terminal unless `args.quiet` is set, and those that found a bug are
    collected and returned.
    """
    seed_count = len(args.seeds)
    print(
        f"Concurrently running the fuzzer on "
        f"{seed_count} randomly generated source-code files..."
    )
    failures: list[FuzzResult] = []
    with concurrent.futures.ProcessPoolExecutor() as pool:
        pending = [
            pool.submit(fuzz_code, seed, args.only_new_bugs) for seed in args.seeds
        ]
        try:
            for finished in concurrent.futures.as_completed(pending):
                outcome = finished.result()
                if not args.quiet:
                    outcome.print_description()
                if outcome.maybe_bug:
                    failures.append(outcome)
        except KeyboardInterrupt:
            print("\nShutting down the ProcessPoolExecutor due to KeyboardInterrupt...")
            print("(This might take a few seconds)")
            # Drop the work that has not started yet before propagating Ctrl-C.
            pool.shutdown(cancel_futures=True)
            raise
    return failures
123+
124+
125+
def run_fuzzer_sequentially(args: ResolvedCliArgs) -> list[FuzzResult]:
    """Fuzz every seed in `args.seeds` one after another in this process.

    Each result is described on the terminal unless `args.quiet` is set.
    The results that found a bug are returned, in seed iteration order.
    """
    seed_count = len(args.seeds)
    print(
        f"Sequentially running the fuzzer on "
        f"{seed_count} randomly generated source-code files..."
    )
    failures: list[FuzzResult] = []
    for seed in args.seeds:
        outcome = fuzz_code(seed, only_new_bugs=args.only_new_bugs)
        if not args.quiet:
            outcome.print_description()
        if not outcome.maybe_bug:
            continue
        failures.append(outcome)
    return failures
138+
139+
140+
def main(args: ResolvedCliArgs) -> None:
    """Run the fuzzer over the requested seeds and print a summary.

    With `args.only_new_bugs` set, the version of the `ruff` executable on
    the PATH is looked up first and announced as the comparison baseline.
    Five seeds or fewer are fuzzed sequentially; anything more is fuzzed
    concurrently.
    """
    if args.only_new_bugs:
        version_output = subprocess.run(
            ["ruff", "--version"], text=True, capture_output=True, check=True
        ).stdout
        # The output is split on spaces and the second word is taken as the version.
        ruff_version = version_output.strip().split(" ")[1]
        print(
            f"As you have selected `--only-new-bugs`, "
            f"bugs will only be reported if they appear on your current branch "
            f"but do *not* appear in `ruff=={ruff_version}`"
        )

    run_fuzzer = (
        run_fuzzer_sequentially if len(args.seeds) <= 5 else run_fuzzer_concurrently
    )
    bugs = run_fuzzer(args)

    noun_phrase = "New bugs" if args.only_new_bugs else "Bugs"
    if not bugs:
        print(colored(f"No {noun_phrase.lower()} found!", "green"))
        return
    print(colored(f"{noun_phrase} found in the following seeds:", "red"))
    print(*sorted(bug.seed for bug in bugs))
164+
165+
166+
def parse_seed_argument(arg: str) -> int | range:
    """Convert one positional seed argument into a seed or a range of seeds.

    `arg` is either a single integer (`"7"`) or an inclusive range written
    as `START-END` (`"0-5"`, which yields `range(0, 6)`).

    Raises `argparse.ArgumentTypeError` if a range is malformed, if its end
    is not greater than its start, or if it spans more than 1_000_000_000
    seeds. A non-range argument that is not an integer raises `ValueError`
    from `int()`; argparse reports both exception types as usage errors.
    """
    if "-" not in arg:
        return int(arg)

    try:
        # Both a wrong number of `-`-separated parts and a non-integer part
        # surface as `ValueError` here.
        start, end = map(int, arg.split("-"))
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"Error when parsing seed argument {arg!r}: "
            f"expected a range in the format `START-END`, e.g. `0-5`"
        ) from None

    if end <= start:
        raise argparse.ArgumentTypeError(
            f"Error when parsing seed argument {arg!r}: "
            f"range end must be > range start"
        )

    seed_range = range(start, end + 1)
    range_too_long = (
        f"Error when parsing seed argument {arg!r}: "
        f"maximum allowed range length is 1_000_000_000"
    )
    try:
        if len(seed_range) > 1_000_000_000:
            raise argparse.ArgumentTypeError(range_too_long)
    except OverflowError:
        # `len()` itself overflows for ranges that are astronomically long.
        raise argparse.ArgumentTypeError(range_too_long) from None
    return seed_range
187+
188+
189+
@dataclass(slots=True)
class ResolvedCliArgs:
    """Command-line arguments after parsing, as built by `parse_args`."""

    # Seeds to fuzz; `parse_args` supplies them deduplicated and sorted.
    seeds: list[Seed]
    # Every field declared after this marker must be passed by keyword.
    _: KW_ONLY
    # Value of the `--only-new-bugs` flag.
    only_new_bugs: bool
    # Value of the `--quiet` flag.
    quiet: bool
195+
196+
197+
def parse_args() -> ResolvedCliArgs:
    """Parse the command line into a `ResolvedCliArgs` instance.

    Seed arguments may be single seeds or inclusive ranges; they are
    flattened into one deduplicated, sorted list of seeds.
    """
    cli = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawTextHelpFormatter
    )
    cli.add_argument(
        "seeds",
        type=parse_seed_argument,
        nargs="+",
        help="Either a single seed, or an inclusive range of seeds in the format `0-5`",
    )
    cli.add_argument(
        "--only-new-bugs",
        action="store_true",
        help=(
            "Only report bugs if they exist on the current branch, "
            "but *didn't* exist on the released version of Ruff "
            "installed into the Python environment we're running in"
        ),
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Print fewer things to the terminal while running the fuzzer",
    )
    namespace = cli.parse_args()

    # Each parsed item is either one seed or a range of seeds; treat a lone
    # seed as a one-element tuple so both shapes flatten the same way.
    unique_seeds: set[int] = {
        seed
        for item in namespace.seeds
        for seed in ((item,) if isinstance(item, int) else item)
    }
    return ResolvedCliArgs(
        sorted(map(Seed, unique_seeds)),
        only_new_bugs=namespace.only_new_bugs,
        quiet=namespace.quiet,
    )
235+
236+
237+
# Only parse the command line and run the fuzzer when executed as a script.
if __name__ == "__main__":
    args = parse_args()
    main(args)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
pysource-codegen
2+
pysource-minimize
3+
ruff
4+
termcolor
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# This file was autogenerated by uv via the following command:
2+
# uv pip compile scripts/fuzz-parser/requirements.in --output-file scripts/fuzz-parser/requirements.txt
3+
asttokens==2.4.1
4+
# via pysource-minimize
5+
astunparse==1.6.3
6+
# via pysource-minimize
7+
click==8.1.7
8+
# via pysource-minimize
9+
markdown-it-py==3.0.0
10+
# via rich
11+
mdurl==0.1.2
12+
# via markdown-it-py
13+
pygments==2.17.2
14+
# via rich
15+
pysource-codegen==0.5.1
16+
pysource-minimize==0.6.2
17+
rich==13.7.1
18+
# via pysource-minimize
19+
ruff==0.4.0
20+
six==1.16.0
21+
# via
22+
# asttokens
23+
# astunparse
24+
termcolor==2.4.0
25+
typing-extensions==4.11.0
26+
# via pysource-codegen
27+
wheel==0.43.0
28+
# via astunparse

0 commit comments

Comments
 (0)