Skip to content

Commit fbc6efa

Browse files
authored
Refactoring of PerfCounters infrastructure (#1559)
* Refactoring of PerfCounters infrastructure The main feature in this pull request is the removal of the static sharing of PerfCounters and instead creating them at the top `RunBenchmarks()` function where all benchmark runners are created. A single PerfCountersMeasurement object is created and then shared with all the new BenchmarkRunners objects, one per existing benchmark. Other features conflated here in this PR are: - Added BENCHMARK_DONT_OPTIMIZE macro in global scope - Removal of the `IsValid()` query, being replaced by checking the number of remaining counters after validity tests - Refactoring of all GTests to reflect the changes and new semantics - extra comments throughout the new code to clarify intent It was extremely hard to separate all those features in different PRs as requested since they are so interdependent on each other so I'm just pushing them altogether and asking for forgiveness. This PR comes replacing PRs 1555 and 1558 which have been closed. * Fixed whitespace issue with clang-format My clang-format insists in deleting this single white space on line 601 while Github's clang format breaks when it is added. I had to disable format-on-save to check-in this revert change. I'm using clang 14.0.6.
1 parent 9885aef commit fbc6efa

File tree

7 files changed

+260
-178
lines changed

7 files changed

+260
-178
lines changed

include/benchmark/benchmark.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,18 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
218218
#define BENCHMARK_UNUSED
219219
#endif
220220

221+
// Used to annotate functions, methods and classes so they
222+
// are not optimized by the compiler. Useful for tests
223+
// where you expect loops to stay in place churning cycles
224+
#if defined(__clang__)
225+
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
226+
#elif defined(__GNUC__) || defined(__GNUG__)
227+
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
228+
#else
229+
// MSVC & Intel do not have a no-optimize attribute, only line pragmas
230+
#define BENCHMARK_DONT_OPTIMIZE
231+
#endif
232+
221233
#if defined(__GNUC__) || defined(__clang__)
222234
#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
223235
#elif defined(_MSC_VER) && !defined(__clang__)

src/benchmark.cc

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,21 +348,44 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
348348

349349
size_t num_repetitions_total = 0;
350350

351+
// This perfcounters object needs to be created before the runners vector
352+
// below so it outlasts their lifetime.
353+
PerfCountersMeasurement perfcounters(
354+
StrSplit(FLAGS_benchmark_perf_counters, ','));
355+
356+
// Vector of benchmarks to run
351357
std::vector<internal::BenchmarkRunner> runners;
352358
runners.reserve(benchmarks.size());
359+
360+
// Count the number of benchmarks with threads to warn the user in case
361+
// performance counters are used.
362+
int benchmarks_with_threads = 0;
363+
364+
// Loop through all benchmarks
353365
for (const BenchmarkInstance& benchmark : benchmarks) {
354366
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
355367
if (benchmark.complexity() != oNone)
356368
reports_for_family = &per_family_reports[benchmark.family_index()];
357-
358-
runners.emplace_back(benchmark, reports_for_family);
369+
benchmarks_with_threads += (benchmark.threads() > 0);
370+
runners.emplace_back(benchmark, &perfcounters, reports_for_family);
359371
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
360372
num_repetitions_total += num_repeats_of_this_instance;
361373
if (reports_for_family)
362374
reports_for_family->num_runs_total += num_repeats_of_this_instance;
363375
}
364376
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
365377

378+
// The use of performance counters with threads would be unintuitive for
379+
// the average user so we need to warn them about this case
380+
if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
381+
GetErrorLogInstance()
382+
<< "***WARNING*** There are " << benchmarks_with_threads
383+
<< " benchmarks with threads and " << perfcounters.num_counters()
384+
<< " performance counters were requested. Beware counters will "
385+
"reflect the combined usage across all "
386+
"threads.\n";
387+
}
388+
366389
std::vector<size_t> repetition_indices;
367390
repetition_indices.reserve(num_repetitions_total);
368391
for (size_t runner_index = 0, num_runners = runners.size();

src/benchmark_runner.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
221221

222222
BenchmarkRunner::BenchmarkRunner(
223223
const benchmark::internal::BenchmarkInstance& b_,
224+
PerfCountersMeasurement* pcm_,
224225
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
225226
: b(b_),
226227
reports_for_family(reports_for_family_),
@@ -239,10 +240,7 @@ BenchmarkRunner::BenchmarkRunner(
239240
iters(has_explicit_iteration_count
240241
? ComputeIters(b_, parsed_benchtime_flag)
241242
: 1),
242-
perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
243-
perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
244-
? &perf_counters_measurement
245-
: nullptr) {
243+
perf_counters_measurement_ptr(pcm_) {
246244
run_results.display_report_aggregates_only =
247245
(FLAGS_benchmark_report_aggregates_only ||
248246
FLAGS_benchmark_display_aggregates_only);
@@ -255,7 +253,7 @@ BenchmarkRunner::BenchmarkRunner(
255253
run_results.file_report_aggregates_only =
256254
(b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
257255
BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
258-
perf_counters_measurement.IsValid())
256+
(perf_counters_measurement_ptr->num_counters() == 0))
259257
<< "Perf counters were requested but could not be set up.";
260258
}
261259
}

src/benchmark_runner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
5858
class BenchmarkRunner {
5959
public:
6060
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
61+
benchmark::internal::PerfCountersMeasurement* pmc_,
6162
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
6263

6364
int GetNumRepeats() const { return repeats; }
@@ -103,8 +104,7 @@ class BenchmarkRunner {
103104
// So only the first repetition has to find/calculate it,
104105
// the other repetitions will just use that precomputed iteration count.
105106

106-
PerfCountersMeasurement perf_counters_measurement;
107-
PerfCountersMeasurement* const perf_counters_measurement_ptr;
107+
PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
108108

109109
struct IterationResults {
110110
internal::ThreadManager::Result results;

src/perf_counters.cc

Lines changed: 88 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -71,80 +71,78 @@ bool PerfCounters::IsCounterSupported(const std::string& name) {
7171
return (ret == PFM_SUCCESS);
7272
}
7373

74-
// Validates all counter names passed, returning only the valid ones
75-
static std::vector<std::string> validateCounters(
74+
PerfCounters PerfCounters::Create(
7675
const std::vector<std::string>& counter_names) {
77-
// All valid names to be returned
76+
// Valid counters will populate these arrays but we start empty
7877
std::vector<std::string> valid_names;
78+
std::vector<int> counter_ids;
79+
std::vector<int> leader_ids;
80+
81+
// Resize to the maximum possible
82+
valid_names.reserve(counter_names.size());
83+
counter_ids.reserve(counter_names.size());
84+
85+
const int kCounterMode = PFM_PLM3; // user mode only
86+
87+
// Group leads will be assigned on demand. The idea is that once we cannot
88+
// create a counter descriptor, the reason is that this group has maxed out
89+
// so we set the group_id again to -1 and retry - giving the algorithm a
90+
// chance to create a new group leader to hold the next set of counters.
91+
int group_id = -1;
7992

80-
// Loop through all the given names
81-
int invalid_counter = 0;
82-
for (const std::string& name : counter_names) {
83-
// Check trivial empty
93+
// Loop through all performance counters
94+
for (size_t i = 0; i < counter_names.size(); ++i) {
95+
// we are about to push into the valid names vector
96+
// check if we did not reach the maximum
97+
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
98+
// Log a message if we maxed out and stop adding
99+
GetErrorLogInstance()
100+
<< counter_names.size() << " counters were requested. The maximum is "
101+
<< PerfCounterValues::kMaxCounters << " and " << valid_names.size()
102+
<< " were already added. All remaining counters will be ignored\n";
103+
// stop the loop and return what we have already
104+
break;
105+
}
106+
107+
// Check if this name is empty
108+
const auto& name = counter_names[i];
84109
if (name.empty()) {
85-
GetErrorLogInstance() << "A counter name was the empty string\n";
86-
invalid_counter++;
110+
GetErrorLogInstance()
111+
<< "A performance counter name was the empty string\n";
87112
continue;
88113
}
89-
if (PerfCounters::IsCounterSupported(name)) {
90-
// we are about to push into the valid names vector
91-
// check if we did not reach the maximum
92-
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
93-
GetErrorLogInstance()
94-
<< counter_names.size()
95-
<< " counters were requested. The maximum is "
96-
<< PerfCounterValues::kMaxCounters << " and "
97-
<< counter_names.size() - invalid_counter - valid_names.size()
98-
<< " will be ignored\n";
99-
// stop the loop and return what we have already
100-
break;
101-
}
102-
valid_names.push_back(name);
103-
} else {
104-
GetErrorLogInstance() << "Performance counter " << name
105-
<< " incorrect or not supported on this platform\n";
106-
invalid_counter++;
107-
}
108-
}
109-
// RVO should take care of this
110-
return valid_names;
111-
}
112-
113-
PerfCounters PerfCounters::Create(
114-
const std::vector<std::string>& counter_names) {
115-
std::vector<std::string> valid_names = validateCounters(counter_names);
116-
if (valid_names.empty()) {
117-
return NoCounters();
118-
}
119-
std::vector<int> counter_ids(valid_names.size());
120-
std::vector<int> leader_ids;
121114

122-
const int mode = PFM_PLM3; // user mode only
123-
int group_id = -1;
124-
for (size_t i = 0; i < valid_names.size(); ++i) {
115+
// Here first means first in group, ie the group leader
125116
const bool is_first = (group_id < 0);
117+
118+
// This struct will be populated by libpfm from the counter string
119+
// and then fed into the syscall perf_event_open
126120
struct perf_event_attr attr {};
127121
attr.size = sizeof(attr);
128-
const auto& name = valid_names[i];
122+
123+
// This is the input struct to libpfm.
129124
pfm_perf_encode_arg_t arg{};
130125
arg.attr = &attr;
131-
132-
const int pfm_get =
133-
pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
126+
const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
127+
PFM_OS_PERF_EVENT, &arg);
134128
if (pfm_get != PFM_SUCCESS) {
135-
GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
136-
return NoCounters();
129+
GetErrorLogInstance()
130+
<< "Unknown performance counter name: " << name << "\n";
131+
continue;
137132
}
138-
attr.disabled = is_first;
133+
134+
// We then proceed to populate the remaining fields in our attribute struct
139135
// Note: the man page for perf_event_create suggests inherit = true and
140136
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
141137
// case.
138+
attr.disabled = is_first;
142139
attr.inherit = true;
143140
attr.pinned = is_first;
144141
attr.exclude_kernel = true;
145142
attr.exclude_user = false;
146143
attr.exclude_hv = true;
147-
// Read all counters in one read.
144+
145+
// Read all counters in a group in one read.
148146
attr.read_format = PERF_FORMAT_GROUP;
149147

150148
int id = -1;
@@ -159,36 +157,64 @@ PerfCounters PerfCounters::Create(
159157
}
160158
}
161159
if (id < 0) {
162-
// We reached a limit perhaps?
160+
// If the file descriptor is negative we might have reached a limit
161+
// in the current group. Set the group_id to -1 and retry
163162
if (group_id >= 0) {
164163
// Create a new group
165164
group_id = -1;
166165
} else {
167-
// Give up, there is nothing else to try
166+
// At this point we have already retried to set a new group id and
167+
// failed. We then give up.
168168
break;
169169
}
170170
}
171171
}
172+
173+
// We failed to get a new file descriptor. We might have reached a hard
174+
// hardware limit that cannot be resolved even with group multiplexing
172175
if (id < 0) {
173-
GetErrorLogInstance()
174-
<< "Failed to get a file descriptor for " << name << "\n";
175-
return NoCounters();
176+
GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
177+
"for performance counter "
178+
<< name << ". Ignoring\n";
179+
180+
// We give up on this counter but try to keep going
181+
// as the others would be fine
182+
continue;
176183
}
177184
if (group_id < 0) {
178-
// This is a leader, store and assign it
185+
// This is a leader, store and assign it to the current file descriptor
179186
leader_ids.push_back(id);
180187
group_id = id;
181188
}
182-
counter_ids[i] = id;
189+
// This is a valid counter, add it to our descriptor's list
190+
counter_ids.push_back(id);
191+
valid_names.push_back(name);
183192
}
193+
194+
// Loop through all group leaders activating them
195+
// There is another option of starting ALL counters in a process but
196+
// that would be far reaching an intrusion. If the user is using PMCs
197+
// by themselves then this would have a side effect on them. It is
198+
// friendlier to loop through all groups individually.
184199
for (int lead : leader_ids) {
185200
if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
186-
GetErrorLogInstance() << "Failed to start counters\n";
201+
// This should never happen but if it does, we give up on the
202+
// entire batch as recovery would be a mess.
203+
GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
204+
"Claring out all counters.\n";
205+
206+
// Close all peformance counters
207+
for (int id : counter_ids) {
208+
::close(id);
209+
}
210+
211+
// Return an empty object so our internal state is still good and
212+
// the process can continue normally without impact
187213
return NoCounters();
188214
}
189215
}
190216

191-
return PerfCounters(valid_names, std::move(counter_ids),
217+
return PerfCounters(std::move(valid_names), std::move(counter_ids),
192218
std::move(leader_ids));
193219
}
194220

@@ -223,34 +249,10 @@ PerfCounters PerfCounters::Create(
223249
void PerfCounters::CloseCounters() const {}
224250
#endif // defined HAVE_LIBPFM
225251

226-
Mutex PerfCountersMeasurement::mutex_;
227-
int PerfCountersMeasurement::ref_count_ = 0;
228-
PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
229-
230-
// The validation in PerfCounter::Create will create less counters than passed
231-
// so it should be okay to initialize start_values_ and end_values_ with the
232-
// upper bound as passed
233252
PerfCountersMeasurement::PerfCountersMeasurement(
234253
const std::vector<std::string>& counter_names)
235254
: start_values_(counter_names.size()), end_values_(counter_names.size()) {
236-
MutexLock l(mutex_);
237-
if (ref_count_ == 0) {
238-
counters_ = PerfCounters::Create(counter_names);
239-
}
240-
// We chose to increment it even if `counters_` ends up invalid,
241-
// so that we don't keep trying to create, and also since the dtor
242-
// will decrement regardless of `counters_`'s validity
243-
++ref_count_;
244-
245-
BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
246-
}
247-
248-
PerfCountersMeasurement::~PerfCountersMeasurement() {
249-
MutexLock l(mutex_);
250-
--ref_count_;
251-
if (ref_count_ == 0) {
252-
counters_ = PerfCounters::NoCounters();
253-
}
255+
counters_ = PerfCounters::Create(counter_names);
254256
}
255257

256258
PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {

0 commit comments

Comments
 (0)