Timekeeping in the Linux Kernel
Stephen Boyd
Qualcomm Innovation Center, Inc.
1 / 40
In the beginning ...
2 / 40
there was a counter
0000ec544fef3c8a
3 / 40
Calculating the Passage of Time (in ns)
ccycles ccycles ccycles
= = ( ) seconds
1
f Hz f
f( )
seconds
ccycles ccycles
( ) seconds = ⋅ 1e9 = Tns
f f
4 / 40
Calculating the Passage of Time (in ns)
ccycles ccycles ccycles
= = ( ) seconds
1
f Hz f
f( )
seconds
ccycles ccycles
( ) seconds = ⋅ 1e9 = Tns
f f
Problems
Division is slow
Floating point math
Precision/overflow/underflow problems
5 / 40
Calculating the Passage of Time (in ns) Better
static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
return ((u64) cycles * mult) >> shift;
}
6 / 40
Calculating the Passage of Time (in ns) Better
static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
return ((u64) cycles * mult) >> shift;
}
Where do mult and shift come from?
clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
7 / 40
Read
struct Hardware
clocksource Counter
Abstract the Hardware!
struct clocksource {
cycle_t (*read)(struct clocksource *cs);
cycle_t mask;
u32 mult;
u32 shift;
...
};
clocksource_register_hz(struct clocksource *cs, u32 hz);
clocksource_register_khz(struct clocksource *cs, u32 khz);
Time diff:
struct clocksource *cs = &system_clocksource;
cycle_t start = cs->read(cs);
... /* do something for a while */
cycle_t end = cs->read(cs);
clocksource_cyc2ns(end - start, cs->mult, cs->shift);
8 / 40
POSIX Clocks
CLOCK_BOOTTIME
CLOCK_MONOTONIC
CLOCK_MONOTONIC_RAW
CLOCK_MONOTONIC_COARSE
CLOCK_REALTIME
CLOCK_REALTIME_COARSE
CLOCK_TAI
9 / 40
POSIX Clocks Comparison
CLOCK_BOOTTIME
CLOCK_MONOTONIC
CLOCK_REALTIME
10 / 40
Read Accumulate Track (RAT)
Best acronym ever
11 / 40
RAT in Action (Read)
struct tk_read_base {
struct clocksource *clock;
cycle_t (*read)(struct clocksource *cs);
cycle_t mask;
cycle_t cycle_last;
u32 mult;
u32 shift;
u64 xtime_nsec;
ktime_t base;
};
static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
cycle_t delta)
{
u64 nsec = delta * tkr->mult + tkr->xtime_nsec;
return nsec >> tkr->shift;
}
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t delta = (tkr->read(tkr->clock) - tkr->cycle_last) & tkr->mask;
return timekeeping_delta_to_ns(tkr, delta);
}
12 / 40
RAT in Action (Accumulate + Track)
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
u32 shift, unsigned int *clock_set)
{
u64 interval = tk->cycle_interval << shift;
tk->tkr_mono.cycle_last += interval;
tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
...
}
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
...
}
13 / 40
Juggling Clocks
struct timekeeper {
struct tk_read_base tkr_mono;
struct tk_read_base tkr_raw;
u64 xtime_sec;
unsigned long ktime_sec;
struct timespec64 wall_to_monotonic;
ktime_t offs_real;
ktime_t offs_boot;
ktime_t offs_tai;
s32 tai_offset;
struct timespec64 raw_time;
};
14 / 40
Handling Clock Drift
1 ¯¯
¯
⋅ 1e9 = 52.083 ns
19200000
Vs.
1
⋅ 1e9 = 52.083311ns
19200008
15 / 40
Handling Clock Drift
100000
⋅ 1e9 = 520833ns
19200000
Vs.
100000
⋅ 1e9 = 5208331ns
19200008
After 100k cycles we've lost 2 ns
16 / 40
Mult to the Rescue!
(100000 ⋅ 873813333) ≫ 24 = 5208333ns
Vs.
(100000 ⋅ 873813109) ≫ 24 = 5208331ns
Approach: Adjust mult to match actual clock frequency
17 / 40
Making Things Fast and E cient
static struct {
seqcount_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned;
static struct timekeeper shadow_timekeeper;
struct tk_fast {
seqcount_t seq;
struct tk_read_base base[2];
};
static struct tk_fast tk_fast_mono ____cacheline_aligned;
static struct tk_fast tk_fast_raw ____cacheline_aligned;
18 / 40
A Note About NMIs and Time
19 / 40
Where We Are
20 / 40
What if my system doesn't have a counter?
Insert #sadface here
Can't use NOHZ
Can't use hrtimers in "high resolution" mode
Relegated to the jiffies clocksource:
static cycle_t jiffies_read(struct clocksource *cs)
{
return (cycle_t) jiffies;
}
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
...
};
21 / 40
Let's talk about ji es
22 / 40
Let's talk about ji es
Ji y = 1 / CONFIG_HZ
23 / 40
Let's talk about ji es
Ji y = 1 / CONFIG_HZ
Updated during the "tick"
24 / 40
The tick?
25 / 40
The tick
Periodic event that updates
jiffies
process accounting
global load accounting
timekeeping
POSIX timers
RCU callbacks
hrtimers
irq_work
26 / 40
Implementing the tick in hardware
Timer Value: 4efa4655
Match Value: 4efa4666
27 / 40
Abstract the Hardware!
struct clock_event_device {
void (*event_handler)(struct clock_event_device *);
int (*set_next_event)(unsigned long evt,
struct clock_event_device *);
int (*set_next_ktime)(ktime_t expires,
struct clock_event_device *);
ktime_t next_event;
u64 max_delta_ns;
u64 min_delta_ns;
u32 mult;
u32 shift;
unsigned int features;
#define CLOCK_EVT_FEAT_PERIODIC 0x000001
#define CLOCK_EVT_FEAT_ONESHOT 0x000002
#define CLOCK_EVT_FEAT_KTIME 0x000004
int irq;
...
};
void clockevents_config_and_register(struct clock_event_device *dev,
u32 freq, unsigned long min_delta,
unsigned long max_delta)
28 / 40
Three event_handlers
struct clock_event_device {
void (*event_handler)(struct clock_event_device *);
int (*set_next_event)(unsigned long evt,
struct clock_event_device *);
int (*set_next_ktime)(ktime_t expires,
struct clock_event_device *);
ktime_t next_event;
u64 max_delta_ns;
...
}
Handler Usage
tick_handle_periodic() default
tick_nohz_handler() lowres mode
hrtimer_interrupt() highres mode
29 / 40
Ticks During Idle
tick_handle_periodic()
t1 t2 idle idle t3
tick
tick
tick
tick
time
30 / 40
Tick-less Idle (i.e. CONFIG_NOHZ_IDLE)
tick_handle_periodic()
t1 t2 idle idle t3
tick
tick
tick
tick
time
tick_nohz_handler()
t1 t2 idle t3
tick
tick
tick
time
31 / 40
High Resolution Mode
tick_nohz_handler()
t1 t2 idle t3
tick
tick
tick
time
hrtimer_interrupt()
t1 t 2 idle t3
tick
tick
tick
hrt
hrt
hrt
time
32 / 40
Tick Devices
enum tick_device_mode {
TICKDEV_MODE_PERIODIC,
TICKDEV_MODE_ONESHOT,
};
struct tick_device {
struct clock_event_device *evtdev;
enum tick_device_mode mode;
};
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
tick_device clockevent
Hardware
event
33 / 40
Running the Tick
struct tick_sched {
struct hrtimer sched_timer;
...
};
34 / 40
Running the Tick (Per-CPU)
struct tick_sched {
struct hrtimer sched_timer;
...
};
DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
35 / 40
Stopping the Tick
Not always as simple as
hrtimer_cancel(&ts->sched_timer)
Could be that we need to restart the timer so far in the future
hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED)
Needs to consider
timers
hrtimers
RCU callbacks
jiffie update responsibility
clocksource's max_idle_ns (timekeeping max deferment)
Details in tick_nohz_stop_sched_tick()
36 / 40
Tick Broadcast
For when your clockevents FAIL AT LIFE
i.e., they don't work during some CPU idle low power modes
Indicated by CLOCK_EVT_FEAT_C3_STOP flag
37 / 40
Timers
Operates with jiffies granularity
Requirements
jiffies increment
clockevent
softirq
38 / 40
HRTimers
Operates with ktime (nanoseconds) granularity
Requirements
timekeeping increment
clockevent
tick_device
39 / 40
Summary
What we covered What's di cult
clocksources Timekeeping has to handle NTP and drift
timekeeping Tick uses multiple abstraction layers
clockevents NOHZ gets complicated when starting/stopping the tick
jiffies Tick broadcast turns up NOHZ to 11
NOHZ
tick broadcast
timers
hrtimers
40 / 40