mirror of
https://github.com/torvalds/linux.git
synced 2026-04-27 19:12:29 -04:00
Using mul_u64_u64_shr() provides similar calculation as mulhdu() assembly function, but enables inlining by the compiler. The home-made assembly function had special handling for when one of the arguments is not a fully populated u64 but time functions use it to multiply timebase by a calculated scale which is constructed to have most significant bit set. On mpc8xx sched_clock() runs 3% faster. On mpc83xx it is 2%. As you can see below, sched_clock() is not much bigger than before: c000cf68 <sched_clock>: c000cf68: 7d 2d 42 a6 mftbu r9 c000cf6c: 7d 0c 42 a6 mftb r8 c000cf70: 7d 4d 42 a6 mftbu r10 c000cf74: 7c 09 50 40 cmplw r9,r10 c000cf78: 40 82 ff f0 bne c000cf68 <sched_clock> c000cf7c: 3d 40 c1 37 lis r10,-16073 c000cf80: 38 8a b3 30 addi r4,r10,-19664 c000cf84: 80 ea b3 30 lwz r7,-19664(r10) c000cf88: 80 64 00 14 lwz r3,20(r4) c000cf8c: 39 40 00 00 li r10,0 c000cf90: 80 a4 00 04 lwz r5,4(r4) c000cf94: 80 c4 00 10 lwz r6,16(r4) c000cf98: 7c 63 40 10 subfc r3,r3,r8 c000cf9c: 80 84 00 08 lwz r4,8(r4) c000cfa0: 7d 06 49 10 subfe r8,r6,r9 c000cfa4: 7c c7 19 d6 mullw r6,r7,r3 c000cfa8: 7d 25 18 16 mulhwu r9,r5,r3 c000cfac: 7c 08 29 d6 mullw r0,r8,r5 c000cfb0: 7c 67 18 16 mulhwu r3,r7,r3 c000cfb4: 7d 29 30 14 addc r9,r9,r6 c000cfb8: 7c a8 28 16 mulhwu r5,r8,r5 c000cfbc: 7c ca 51 14 adde r6,r10,r10 c000cfc0: 7d 67 41 d6 mullw r11,r7,r8 c000cfc4: 7d 29 00 14 addc r9,r9,r0 c000cfc8: 7c c6 01 94 addze r6,r6 c000cfcc: 7c 63 28 14 addc r3,r3,r5 c000cfd0: 7d 4a 51 14 adde r10,r10,r10 c000cfd4: 7c e7 40 16 mulhwu r7,r7,r8 c000cfd8: 7c 63 58 14 addc r3,r3,r11 c000cfdc: 7d 4a 01 94 addze r10,r10 c000cfe0: 7c 63 30 14 addc r3,r3,r6 c000cfe4: 7d 4a 39 14 adde r10,r10,r7 c000cfe8: 35 24 ff e0 addic. 
r9,r4,-32 c000cfec: 41 80 00 10 blt c000cffc <sched_clock+0x94> c000cff0: 7c 63 48 30 slw r3,r3,r9 c000cff4: 38 80 00 00 li r4,0 c000cff8: 4e 80 00 20 blr c000cffc: 21 04 00 1f subfic r8,r4,31 c000d000: 54 69 f8 7e srwi r9,r3,1 c000d004: 7d 4a 20 30 slw r10,r10,r4 c000d008: 7d 29 44 30 srw r9,r9,r8 c000d00c: 7c 64 20 30 slw r4,r3,r4 c000d010: 7d 23 53 78 or r3,r9,r10 c000d014: 4e 80 00 20 blr Before this change: c000d0bc <sched_clock>: c000d0bc: 94 21 ff f0 stwu r1,-16(r1) c000d0c0: 7c 08 02 a6 mflr r0 c000d0c4: 90 01 00 14 stw r0,20(r1) c000d0c8: 93 e1 00 0c stw r31,12(r1) c000d0cc: 7d 2d 42 a6 mftbu r9 c000d0d0: 7d 0c 42 a6 mftb r8 c000d0d4: 7d 4d 42 a6 mftbu r10 c000d0d8: 7c 09 50 40 cmplw r9,r10 c000d0dc: 40 82 ff f0 bne c000d0cc <sched_clock+0x10> c000d0e0: 3f e0 c1 37 lis r31,-16073 c000d0e4: 3b ff b3 30 addi r31,r31,-19664 c000d0e8: 80 9f 00 14 lwz r4,20(r31) c000d0ec: 80 7f 00 10 lwz r3,16(r31) c000d0f0: 7c 84 40 10 subfc r4,r4,r8 c000d0f4: 80 bf 00 00 lwz r5,0(r31) c000d0f8: 80 df 00 04 lwz r6,4(r31) c000d0fc: 7c 63 49 10 subfe r3,r3,r9 c000d100: 48 00 37 85 bl c0010884 <mulhdu> c000d104: 81 3f 00 08 lwz r9,8(r31) c000d108: 35 49 ff e0 addic. 
r10,r9,-32 c000d10c: 41 80 00 20 blt c000d12c <sched_clock+0x70> c000d110: 80 01 00 14 lwz r0,20(r1) c000d114: 7c 83 50 30 slw r3,r4,r10 c000d118: 83 e1 00 0c lwz r31,12(r1) c000d11c: 38 80 00 00 li r4,0 c000d120: 7c 08 03 a6 mtlr r0 c000d124: 38 21 00 10 addi r1,r1,16 c000d128: 4e 80 00 20 blr c000d12c: 80 01 00 14 lwz r0,20(r1) c000d130: 54 8a f8 7e srwi r10,r4,1 c000d134: 21 09 00 1f subfic r8,r9,31 c000d138: 83 e1 00 0c lwz r31,12(r1) c000d13c: 7c 63 48 30 slw r3,r3,r9 c000d140: 7d 4a 44 30 srw r10,r10,r8 c000d144: 7c 84 48 30 slw r4,r4,r9 c000d148: 7d 43 1b 78 or r3,r10,r3 c000d14c: 7c 08 03 a6 mtlr r0 c000d150: 38 21 00 10 addi r1,r1,16 c000d154: 4e 80 00 20 blr c0010884 <mulhdu>: c0010884: 2c 06 00 00 cmpwi r6,0 c0010888: 2c 83 00 00 cmpwi cr1,r3,0 c001088c: 7c 8a 23 78 mr r10,r4 c0010890: 7c 84 28 16 mulhwu r4,r4,r5 c0010894: 41 82 00 14 beq c00108a8 <mulhdu+0x24> c0010898: 7c 0a 30 16 mulhwu r0,r10,r6 c001089c: 7c ea 29 d6 mullw r7,r10,r5 c00108a0: 7c e0 38 14 addc r7,r0,r7 c00108a4: 7c 84 01 94 addze r4,r4 c00108a8: 4d 86 00 20 beqlr cr1 c00108ac: 7d 23 29 d6 mullw r9,r3,r5 c00108b0: 7d 43 28 16 mulhwu r10,r3,r5 c00108b4: 41 82 00 18 beq c00108cc <mulhdu+0x48> c00108b8: 7c 03 31 d6 mullw r0,r3,r6 c00108bc: 7d 03 30 16 mulhwu r8,r3,r6 c00108c0: 7c e0 38 14 addc r7,r0,r7 c00108c4: 7c 84 41 14 adde r4,r4,r8 c00108c8: 7d 4a 01 94 addze r10,r10 c00108cc: 7c 84 48 14 addc r4,r4,r9 c00108d0: 7c 6a 01 94 addze r3,r10 c00108d4: 4e 80 00 20 blr Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com> Link: https://patch.msgid.link/f29e473c193c87bdbd36b209dfdee99d2f0c60dc.1733566130.git.christophe.leroy@csgroup.eu
120 lines
2.9 KiB
C
120 lines
2.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* Common time prototypes and such for all ppc machines.
|
|
*
|
|
* Written by Cort Dougan (cort@cs.nmt.edu) to merge
|
|
* Paul Mackerras' version and mine for PReP and Pmac.
|
|
*/
|
|
|
|
#ifndef __POWERPC_TIME_H
|
|
#define __POWERPC_TIME_H
|
|
|
|
#ifdef __KERNEL__
|
|
#include <linux/types.h>
|
|
#include <linux/percpu.h>
|
|
|
|
#include <asm/processor.h>
|
|
#include <asm/cpu_has_feature.h>
|
|
#include <asm/vdso/timebase.h>
|
|
|
|
/* time.c */
|
|
extern u64 decrementer_max;
|
|
|
|
extern unsigned long tb_ticks_per_jiffy;
|
|
extern unsigned long tb_ticks_per_usec;
|
|
extern unsigned long tb_ticks_per_sec;
|
|
extern struct clock_event_device decrementer_clockevent;
|
|
extern u64 decrementer_max;
|
|
|
|
|
|
extern void generic_calibrate_decr(void);
|
|
|
|
/* Some sane defaults: 125 MHz timebase, 1GHz processor */
|
|
extern unsigned long ppc_proc_freq;
|
|
#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8)
|
|
extern unsigned long ppc_tb_freq;
|
|
#define DEFAULT_TB_FREQ 125000000UL
|
|
|
|
extern bool tb_invalid;
|
|
|
|
struct div_result {
|
|
u64 result_high;
|
|
u64 result_low;
|
|
};
|
|
|
|
static inline u64 get_vtb(void)
|
|
{
|
|
if (cpu_has_feature(CPU_FTR_ARCH_207S))
|
|
return mfspr(SPRN_VTB);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Accessor functions for the decrementer register.
|
|
* The 4xx doesn't even have a decrementer. I tried to use the
|
|
* generic timer interrupt code, which seems OK, with the 4xx PIT
|
|
* in auto-reload mode. The problem is PIT stops counting when it
|
|
* hits zero. If it would wrap, we could use it just like a decrementer.
|
|
*/
|
|
static inline u64 get_dec(void)
|
|
{
|
|
return mfspr(SPRN_DEC);
|
|
}
|
|
|
|
/*
|
|
* Note: Book E and 4xx processors differ from other PowerPC processors
|
|
* in when the decrementer generates its interrupt: on the 1 to 0
|
|
* transition for Book E/4xx, but on the 0 to -1 transition for others.
|
|
*/
|
|
static inline void set_dec(u64 val)
|
|
{
|
|
if (IS_ENABLED(CONFIG_BOOKE))
|
|
mtspr(SPRN_DEC, val);
|
|
else
|
|
mtspr(SPRN_DEC, val - 1);
|
|
}
|
|
|
|
/*
 * Return the difference between the current mftb() reading and @tstamp.
 *
 * The subtraction is done in unsigned long arithmetic, so it wraps
 * modulo the width of unsigned long.
 */
static inline unsigned long tb_ticks_since(unsigned long tstamp)
{
	unsigned long now = mftb();

	return now - tstamp;
}
|
|
|
|
/*
 * mulhwu(x, y): issue the PowerPC "mulhwu" (multiply high word unsigned)
 * instruction on x and y and yield its result as an unsigned int.
 */
#define mulhwu(x,y)							\
({									\
	unsigned int __mulhwu_hi;					\
	asm ("mulhwu %0,%1,%2"						\
	     : "=r" (__mulhwu_hi)					\
	     : "r" (x), "r" (y));					\
	__mulhwu_hi;							\
})
|
|
|
|
/*
 * mulhdu(x, y): upper half of an unsigned 64 x 64 bit multiplication.
 *
 * On 64-bit kernels this is the "mulhdu" instruction itself.  On 32-bit
 * kernels it is expressed as mul_u64_u64_shr(x, y, 64), which the compiler
 * is able to inline.
 * NOTE(review): mul_u64_u64_shr() is not declared by anything this header
 * includes directly; users presumably pull in <linux/math64.h> -- confirm.
 */
#ifdef CONFIG_PPC64
#define mulhdu(x,y)							\
({									\
	unsigned long __mulhdu_hi;					\
	asm ("mulhdu %0,%1,%2"						\
	     : "=r" (__mulhdu_hi)					\
	     : "r" (x), "r" (y));					\
	__mulhdu_hi;							\
})
#else
#define mulhdu(x, y)	mul_u64_u64_shr(x, y, 64)
#endif
|
|
|
|
extern void div128_by_32(u64 dividend_high, u64 dividend_low,
|
|
unsigned divisor, struct div_result *dr);
|
|
|
|
extern void secondary_cpu_time_init(void);
|
|
extern void __init time_init(void);
|
|
|
|
DECLARE_PER_CPU(u64, decrementers_next_tb);
|
|
|
|
static inline u64 timer_get_next_tb(void)
|
|
{
|
|
return __this_cpu_read(decrementers_next_tb);
|
|
}
|
|
|
|
/* Only declared when CONFIG_KVM_BOOK3S_HV_POSSIBLE is set. */
#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
void timer_rearm_host_dec(u64 now);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
|
|
|
|
/* Convert timebase ticks to nanoseconds */
|
|
unsigned long long tb_to_ns(unsigned long long tb_ticks);
|
|
|
|
void timer_broadcast_interrupt(void);
|
|
|
|
/* SPLPAR and VIRT_CPU_ACCOUNTING_NATIVE */
|
|
void pseries_accumulate_stolen_time(void);
|
|
u64 pseries_calculate_stolen_time(u64 stop_tb);
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* __POWERPC_TIME_H */
|