Linux时间管理之clocksource(1)

文章由LinuxBoy分享于2019-03-26 04:03:35热评（189）

Linux时间管理之clocksource(1)

前面提到了Linux下与时间相关的硬件：TSC、PIT、HPET、ACPI_PM。这些硬件以一定的频率产生时钟中断，来帮助我们计时。Linux为了管理这些硬件，抽象出了clocksource。

/*
 * struct clocksource - descriptor for one hardware time source (for
 * example "tsc", "hpet" or "acpi_pm") whose current cycle count is
 * obtained through the ->read() callback.
 */
struct clocksource {
    /*
     * Hotpath data, fits in a single cache line when the
     * clocksource itself is cacheline aligned.
     */
    /* Callback returning the current cycle count of this clocksource. */
    cycle_t (*read)(struct clocksource *cs);
    /*
     * Reference cycle value; read_tsc() never returns less than this.
     * NOTE(review): presumably the count sampled at the last timekeeping
     * update - confirm against the timekeeping core.
     */
    cycle_t cycle_last;
    /* Counter mask, e.g. CLOCKSOURCE_MASK(64) for the TSC. */
    cycle_t mask;
    /* Cycle-to-nanosecond conversion: ns = (cycles * mult) >> shift. */
    u32 mult;
    u32 shift;
    /* Filled in from clocksource_max_deferment(). */
    u64 max_idle_ns;
    /* Filled in from clocksource_max_adjustment(); headroom for mult. */
    u32 maxadj;
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
    /* Architecture-specific data (x86-64 stores .vclock_mode here). */
    struct arch_clocksource_data archdata;
#endif

    /* Human-readable identifier, e.g. "tsc". */
    const char *name;
    struct list_head list;
    /* Quality score of the clocksource; higher values are preferred. */
    int rating;
    /*
     * Optional callbacks.
     * NOTE(review): invoked by the framework on enable/disable and
     * suspend/resume, judging by the names - confirm against callers.
     */
    int (*enable)(struct clocksource *cs);
    void (*disable)(struct clocksource *cs);
    /* CLOCK_SOURCE_* flag bits, e.g. CLOCK_SOURCE_IS_CONTINUOUS. */
    unsigned long flags;
    void (*suspend)(struct clocksource *cs);
    void (*resume)(struct clocksource *cs);

    /* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
    /* Watchdog related data, used by the framework */
    struct list_head wd_list;
    cycle_t cs_last;
    cycle_t wd_last;
#endif
} ____cacheline_aligned;

这些参数当中，比较重要的是rating，shift，mult。其中rating在上一篇博文提到了：

1--99：不适合于用作实际的时钟源，只用于启动过程或用于测试；
100--199：基本可用，可用作真实的时钟源，但不推荐；
200--299：精度较好，可用作真实的时钟源；
300--399：很好，精确的时钟源；
400--499：理想的时钟源，如有可能就必须选择它作为时钟源；

我们基本在前面看到：

include/linux/acpi_pmtmr.h 
------------------------------------------ 
/* ACPI PM timer rate in ticks per second (3579545 Hz, about 3.58 MHz). */
#define PMTMR_TICKS_PER_SEC 3579545
  
drivers/clocksource/acpi_pm.c 
--------------------------------------------- 
/*
 * Clocksource descriptor for the ACPI PM timer.
 * Rating 200; shift is fixed at 22 while mult is left at 0 here and
 * filled in later.
 */
static struct clocksource clocksource_acpi_pm = {
          .name = "acpi_pm",
          .rating = 200,
          .read = acpi_pm_read,
          .mask = (cycle_t)ACPI_PM_MASK,
          .mult = 0, /*to be calculated*/
          .shift = 22,
          .flags = CLOCK_SOURCE_IS_CONTINUOUS,

 };
  
dmesg output 
------------------------ 
[ 0.664201] hpet0: 8 comparators, 64-bit 14.318180 MHz counter 
  
arch/x86/kernel/hpet.c 
-------------------------------- 
/*
 * Clocksource descriptor for the HPET.
 * Rating 250; mult and shift are not set in this initializer.
 */
static struct clocksource clocksource_hpet = {
    .name = "hpet",
    .rating = 250,
    .read = read_hpet,
    .mask = HPET_MASK,
    .flags = CLOCK_SOURCE_IS_CONTINUOUS,
    .resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
    /* Only set on 64-bit builds. */
    .archdata = { .vclock_mode = VCLOCK_HPET },
#endif
};
  
  
dmesg output: 
----------------------------- 
[ 0.004000] Detected 2127.727 MHz processor.  
  
  
arch/x86/kernel/tsc.c 
-------------------------------------- 
/*
 * Clocksource descriptor for the TSC.
 * Rating 300, the highest of the three clocksources listed here; the
 * counter is read through read_tsc() with a full 64-bit mask.
 */
static struct clocksource clocksource_tsc = {
    .name = "tsc",
    .rating = 300,
    .read = read_tsc,
    .resume = resume_tsc,
    .mask = CLOCKSOURCE_MASK(64),
    .flags = CLOCK_SOURCE_IS_CONTINUOUS |
                  CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
    /* Only set on 64-bit builds. */
    .archdata = { .vclock_mode = VCLOCK_TSC },
#endif
};

从上面可以看到，acpi_pm、hpet、tsc的rating分别是200、250、300，它们的rating基本上与各自的frequency相符：TSC以2127.727MHz的频率技压群雄，rating=300最高，因此被选为current_clocksource：

root@manu:~# cat /sys/devices/system/clocksource/clocksource0/available_clocksource  
tsc hpet acpi_pm  
root@manu:~# cat /sys/devices/system/clocksource/clocksource0/current_clocksource  
tsc

除此外，还有两个参数shift和mult，这两个参数是干啥的呢？

我们想一下，假如给你一个以一定频率输出中断的硬件，你如何计时？比如我有一个频率是1000Hz的硬件，当前时钟源计数值是3500，过了一段时间，我抬头看了下，时钟源计数值是5500，即过去了2000个cycles，于是我就知道过去了2000/1000 = 2 seconds。

times_elapse = cycles_interval / frequency

从上面的例子中，我抬头看了下当前计数值这个肯定是瞎掰了，实际上要想获取时钟源还是需要和硬件打交道的。在clocksource中有一个成员变量是read，这个就是一个时钟源注册的时候，提供的一个函数，如果你想获得我的当前计数值，请调用这个read 函数。以TSC时钟为例：

/*
 * Clocksource descriptor for the TSC.
 * The .read member points at read_tsc(), which is the function to call
 * to obtain the current TSC count.
 */
static struct clocksource clocksource_tsc = {
    .name = "tsc",
    .rating = 300,
    .read = read_tsc,
    .resume = resume_tsc,
    .mask = CLOCKSOURCE_MASK(64),
    .flags = CLOCK_SOURCE_IS_CONTINUOUS |
                  CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
    /* Only set on 64-bit builds. */
    .archdata = { .vclock_mode = VCLOCK_TSC },
#endif
};
  
/*--------- arch/x86/kernel/tsc.c -------------------*/ 
/*
 * Clocksource ->read() callback for the TSC.
 *
 * Fetches the current cycle count via get_cycles() and clamps it so the
 * returned value is never smaller than clocksource_tsc.cycle_last.
 * The @cs argument is not used; the global clocksource_tsc is consulted
 * directly.
 */
static cycle_t read_tsc(struct clocksource *cs)
{
    cycle_t now = (cycle_t)get_cycles();

    if (now < clocksource_tsc.cycle_last)
        return clocksource_tsc.cycle_last;

    return now;
}
  
/*------- arch/x86/include/asm/tsc.h----------------------*/ 
/*
 * Return the current TSC value, read through the rdtscll() macro.
 *
 * When the kernel is built without CONFIG_X86_TSC and the running CPU
 * reports no TSC (cpu_has_tsc is false), 0 is returned instead.
 */
static inline cycles_t get_cycles(void)
{
    unsigned long long ret = 0;

#ifndef CONFIG_X86_TSC
    /* TSC support is not guaranteed at build time: check at run time. */
    if (!cpu_has_tsc)
        return 0;
#endif
    rdtscll(ret);

    return ret;
}
  
/*------arch/x86/include/asm/msr.h-----------------*/ 
/* Assign the value returned by __native_read_tsc() to the lvalue @val. */
#define rdtscll(val)                        \
    ((val) = __native_read_tsc())
  
/*
 * Execute the rdtsc instruction and return its result as an
 * unsigned long long.
 *
 * DECLARE_ARGS, EAX_EDX_RET and EAX_EDX_VAL are helper macros defined
 * elsewhere. NOTE(review): presumably they declare the output operands
 * and combine the EAX (low) and EDX (high) halves - confirm in msr.h.
 */
static __always_inline unsigned long long __native_read_tsc(void)
{
    DECLARE_ARGS(val, low, high);

    asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

    return EAX_EDX_VAL(val, low, high);
}

根据这个脉络，我们知道，最终就是rdtsc这条指令来获取当前计数值cycles。

扯了半天read这个成员变量，可以回到shift和mult了。其实shift和mult是为了解决下面这个公式的：

times_elapse = cycles_interval / frequency

就像上面的公式，有频率就足以计时了，为啥还要弄出shift和mult？原因在于kernel里做除法不太方便，必须转化成乘法和移位。Kernel中有很多这种把除法转化成乘法的例子。那么公式变成了：

times_elapse = cycles_interval * mult >> shift

Kernel用乘法+移位来替换除法：根据cycles来计算过去了多少ns。

/**
 * clocksource_cyc2ns - converts clocksource cycles to nanoseconds
 * @cycles:    number of cycles to convert
 * @mult:      cycle to nanosecond multiplier
 * @shift:     cycle to nanosecond divisor (power of two)
 *
 * Multiplies @cycles by @mult as u64, shifts the product right by
 * @shift, and returns the result as s64. This replaces a division by
 * the clock frequency with a multiply and a shift.
 *
 * XXX - This could use some mult_lxl_ll() asm optimization
 */
static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
    u64 scaled = (u64) cycles * mult;

    return (s64)(scaled >> shift);
}

单纯从精度上讲，肯定是mult越大越好，但是计算过程可能溢出，所以mult也不能无限制的大，这个计算中有个magic number 600 ：

/**
 * __clocksource_updatefreq_scale - recompute the conversion parameters
 * @cs:     clocksource to update
 * @scale:  factor which, multiplied by @freq, gives the counter rate in Hz
 * @freq:   counter frequency divided by @scale
 *
 * Computes cs->mult and cs->shift for the given frequency, then derives
 * cs->maxadj and cs->max_idle_ns from them.
 */
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
    u64 sec;
    /*
     * Calc the maximum number of seconds which we can run before
     * wrapping around. For clocksources which have a mask > 32bit
     * we need to limit the max sleep time to have a good
     * conversion precision. 10 minutes is still a reasonable
     * amount. That results in a shift value of 24 for a
     * clocksource with mask >= 40bit and f >= 4GHz. That maps to
     * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
     * margin as we do in clocksource_max_deferment()
     */
    sec = (cs->mask - (cs->mask >> 3));
    /* do_div() divides its first argument in place: sec /= freq * scale. */
    do_div(sec, freq);
    do_div(sec, scale);
    if (!sec)
        sec = 1;
    else if (sec > 600 && cs->mask > UINT_MAX)
        sec = 600;

    clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
             NSEC_PER_SEC / scale, sec * scale);

    /*
     * For clocksources that have large mults, avoid overflow.
     * Since mult may be adjusted by NTP, add an extra safety margin:
     * mult and maxadj are u32, so halve mult (and drop shift by one to
     * keep the ratio) until mult +/- maxadj no longer wraps around.
     */
    cs->maxadj = clocksource_max_adjustment(cs);
    while ((cs->mult + cs->maxadj < cs->mult)
        || (cs->mult - cs->maxadj > cs->mult)) {
        cs->mult >>= 1;
        cs->shift--;
        cs->maxadj = clocksource_max_adjustment(cs);
    }

    cs->max_idle_ns = clocksource_max_deferment(cs);
}

这个600的意思是600秒，表示Timer两次读取当前计数值的间隔不会超过10分钟。主要考虑的是系统进入IDLE状态之后，时间信息不会被更新，10分钟内只要退出IDLE，clocksource还是可以成功地转换时间。当然了，最后的这个时间不一定就是10分钟，它由clocksource_max_deferment计算并将结果存储在max_idle_ns中。

推荐文章：

Linux时间管理之clocksource(1)