Looking at the ARM1 die, you see functional blocks such as 100 bytes of registers and a basic 32-bit ALU. On the M1 die, similar-sized functional blocks are a 12-megabyte cache and a complete 64-bit CPU core. It shows what Moore's law has done over 35 years.
一致性角度(Point of Coherency,PoC):一致性角度可以说是整个SoC的全局角度,这里包含CPU、DSP、GPU或者DMA等,意图是描述所有能够访问内存的单元。如果说系统是PoC一致的,那么就要保证整个系统中所有能访问内存的单元看到的内存内容都是一致的。如上图和下图所示,外部内存(通过总线访问,且总线上还挂着别的器件)和红色圈起来的区域之间的一致性就是PoC一致性。如果外设通过AMBA总线修改了内存中的数据,此时Core是无法感知到的,因此就会出现cache coherency的问题。
统一性角度(Point of Unification,PoU):统一性是站在core层级而言的,core能看见的是i-cache、d-cache、TLB、MMU这些。
// Example 1: maintenance operations by address that target different
// addresses (X0 vs. X1) are not guaranteed to execute in program order.
IC IVAU, X0 // Instruction Cache Invalidate by address to Point of Unification
DC CVAC, X0 // Data Cache Clean by address to Point of Coherency
IC IVAU, X1 // Might be out of order relative to the previous operations if
// x0 and x1 differ
// Example 2: an invalidate by address followed by an invalidate-all.
IC IVAU, X0 // I cache Invalidate by address to Point of Unification
IC IALLU // I cache Invalidate All to Point of Unification
// Operations execute in order
下面的代码展示了把整个数据cache或者统一cache clean到PoC的通用机制:
// Clean every data or unified cache level, by set and way, from level 1 up to
// the Level of Coherence (LoC) reported by CLIDR_EL1.
//
// Register usage inside the loops:
//   X0      = CLIDR_EL1 value
//   W3      = 2 x LoC (loop bound)
//   W10     = 2 x current cache level (also written to CSSELR_EL1)
//   X1      = CCSIDR_EL1 value for the selected level
//   W2      = log2(line length in bytes)
//   W9/W16  = current way number / per-iteration way decrement (aligned for DC)
//   W7/W17  = current set number / per-iteration set decrement (aligned for DC)
//   X11     = operand for DC CSW
// Clobbers X0-X5, X7-X11, X16, X17 and the condition flags.
MRS X0, CLIDR_EL1
AND W3, W0, #0x07000000 // Get 2 x Level of Coherence
LSR W3, W3, #23 // LoC field is at bits [26:24]; shifting by 23 leaves 2 x LoC
CBZ W3, Finished // LoC == 0: no cache level needs cleaning
MOV W10, #0 // W10 = 2 x cache level
MOV W8, #1 // W8 = constant 0b1
Loop1: ADD W2, W10, W10, LSR #1 // Calculate 3 x cache level
LSR W1, W0, W2 // extract 3-bit cache type for this level
AND W1, W1, #0x7
CMP W1, #2
B.LT Skip // No data or unified cache at this level
MSR CSSELR_EL1, X10 // Select this cache level
ISB // Synchronize change of CSSELR
MRS X1, CCSIDR_EL1 // Read CCSIDR
AND W2, W1, #7 // W2 = log2(linelen)-4
ADD W2, W2, #4 // W2 = log2(linelen)
UBFX W4, W1, #3, #10 // W4 = max way number, right aligned
CLZ W5, W4 /* W5 = 32-log2(ways), bit position of way in DC operand */
LSL W9, W4, W5 /* W9 = max way number, aligned to position in DC
operand */
LSL W16, W8, W5 // W16 = amount to decrement way number per iteration
Loop2: UBFX W7, W1, #13, #15 // W7 = max set number, right aligned
LSL W7, W7, W2 /* W7 = max set number, aligned to position in DC
operand */
LSL W17, W8, W2 // W17 = amount to decrement set number per iteration
Loop3: ORR W11, W10, W9 // W11 = combine way number and cache number...
ORR W11, W11, W7 // ... and set number for DC operand
DC CSW, X11 // Do data cache clean by set and way
SUBS W7, W7, W17 // Decrement set number
B.GE Loop3 // Repeat until the set number goes below zero
SUBS X9, X9, X16 // Decrement way number
B.GE Loop2 // Repeat until the way number goes below zero
Skip: ADD W10, W10, #2 // Increment 2 x cache level
CMP W3, W10 // Flags from this CMP are consumed by B.GT below
// NOTE(review): DSB is written without an option here; some assemblers
// require an explicit one (e.g. DSB SY) - confirm with the toolchain in use.
DSB /* Ensure completion of previous cache maintenance
operation */
B.GT Loop1 // Continue while 2 x LoC > 2 x cache level
Finished:
/* Coherency example for data and instruction accesses within the same Inner
Shareable domain. Enter this code with <Wt> containing a new 32-bit instruction,
to be held in Cacheable space at a location pointed to by Xn.
<Wt> and Xn are placeholders for whichever registers the caller uses. */
STR Wt, [Xn] // Write the new instruction through the data side
DC CVAU, Xn // Clean data cache by VA to point of unification (PoU)
DSB ISH // Ensure visibility of the data cleaned from cache
IC IVAU, Xn // Invalidate instruction cache by VA to PoU
DSB ISH // Ensure completion of the invalidations
ISB // Synchronize the fetched instruction stream
The following code cleans the data cache and then invalidates the instruction cache, by virtual address to the Point of Unification, for a region starting at the base address given in X0 with the length given in X1.
//
// Make a region of memory coherent between the data and instruction sides:
// clean the data cache to the PoU, then invalidate the instruction cache,
// one cache line at a time.
//
// X0 = base address
// X1 = length (we assume the length is not 0)
//
// Clobbers X1 (becomes the end address), X2, X3, X4 and the condition flags.
// X0 is preserved.
//
// Calculate end of the region
ADD x1, x1, x0 // Base Address + Length
//
// Clean the data cache by VA
//
MRS X2, CTR_EL0 // Read Cache Type Register
// Get the minimum data cache line
//
UBFX X4, X2, #16, #4 // Extract DminLine (log2 of the cache line, in words)
MOV X3, #4 // DminLine is the number of words (4 bytes each)
LSL X3, X3, X4 // X3 = data cache line size in bytes (4 << DminLine)
SUB X4, X3, #1 // get the mask for the cache line
BIC X4, X0, X4 // Align the base address of the region down to a line
clean_data_cache:
DC CVAU, X4 // Clean data cache line by VA to PoU
ADD X4, X4, X3 // Next cache line
CMP X4, X1 // Is X4 (current cache line) smaller than the end
// of the region
// NOTE(review): B.LT is a signed comparison of two addresses - confirm this
// is intended for the address ranges this routine is used with.
B.LT clean_data_cache // while (address < end_address)
DSB ISH // Ensure visibility of the data cleaned from cache
//
// Invalidate the instruction cache by VA
//
// Get the minimum instruction cache line (X2 contains ctr_el0)
AND X2, X2, #0xF // Extract IminLine (log2 of the cache line, in words)
MOV X3, #4 // IminLine is the number of words (4 bytes each)
LSL X3, X3, X2 // X3 = instruction cache line size in bytes (4 << IminLine)
SUB x4, x3, #1 // Get the mask for the cache line
BIC X4, X0, X4 // Align the base address of the region down to a line
clean_instruction_cache:
IC IVAU, X4 // Invalidate instruction cache line by VA to PoU
ADD X4, X4, X3 // Next cache line
CMP X4, X1 // Is X4 (current cache line) smaller than the end
// of the region
B.LT clean_instruction_cache // while (address < end_address)
DSB ISH // Ensure completion of the invalidations
ISB // Synchronize the fetched instruction stream
1.7 cache自举
1.7.1 相关寄存器
对于ARM处理器,需要知道cache的有关信息:
系统支持多少级cache?
cache line大小?
多少set,多少way?
CLIDR_EL1 (Cache Level ID Register): 这个寄存器用来表示cache的类型,以及系统最多支持多少级的cache。
CTR_EL0 (Cache Type Register): 记录cache的相关信息,比如cache line大小,cache策略等等。
CCSIDR_EL1 (Current Cache Size ID Register ) :软件需要它来查询每一级cache的相关信息。
#include "io.h"
#include "type.h"

#define ICACHE_POLICY_VPIPT	0
#define ICACHE_POLICY_VIPT	2
#define ICACHE_POLICY_PIPT	3

/*
 * Printable names for CTR_EL0.L1Ip, indexed by the 2-bit field value.
 * Every slot defaults to "RESERVED/UNKNOWN" and the known encodings are
 * then overridden (GNU range-designator extension).
 */
static const char *icache_policy_str[] = {
	[0 ... ICACHE_POLICY_PIPT]	= "RESERVED/UNKNOWN",
	[ICACHE_POLICY_VIPT]		= "VIPT",
	[ICACHE_POLICY_PIPT]		= "PIPT",
	[ICACHE_POLICY_VPIPT]		= "VPIPT"
};

/*
 * CTR_EL0, Cache Type Register
 * https://developer.arm.com/documentation/ddi0601/2022-03/
 * AArch64-Registers/CTR-EL0--Cache-Type-Register?lang=en
 *
 * - IminLine, bits [3:0]:
 *   Log2 of the number of words in the smallest cache line of
 *   all the instruction caches that are controlled by the PE.
 *
 * - L1Ip, bits [15:14]:
 *   Level 1 instruction cache policy. Indicates the indexing and
 *   tagging policy for the L1 instruction cache. Possible values
 *   of this field are:
 *   - 00: VMID aware Physical Index, Physical tag (VPIPT).
 *   - 01: ASID-tagged Virtual Index, Virtual Tag (AIVIVT).
 *         (ARMv8 only)
 *   - 10: Virtual Index, Physical Tag (VIPT).
 *   - 11: Physical Index, Physical Tag (PIPT).
 *
 * - DminLine, bits [19:16]:
 *   Log2 of the number of words in the smallest cache line of
 *   all the data caches and unified caches that are controlled
 *   by the PE.
 *
 * - ERG, bits [23:20]
 *   Exclusives reservation granule, and, if FEAT_TME is implemented,
 *   transactional reservation granule. Log2 of the number of words of
 *   the maximum size of the reservation granule for the Load-Exclusive
 *   and Store-Exclusive instructions, and, if FEAT_TME is implemented,
 *   for detecting transactional conflicts.
 *
 * - CWG, bits [27:24]
 *   Cache writeback granule. Log2 of the number of words of the
 *   maximum size of memory that can be overwritten as a result of
 *   the eviction of a cache entry that has had a memory location
 *   in it modified.
 *   - A value of 0b0000 indicates that this register does not provide
 *     Cache writeback granule information and either:
 *     - The architectural maximum of 512 words (2KB) must be assumed.
 *     - The Cache writeback granule can be determined from maximum
 *       cache line size encoded in the Cache Size ID Registers.
 *   - Values greater than 0b1001 are reserved.
 *
 * - IDC, bit [28]
 *   Data cache clean requirements for instruction to data coherence.
 *   The meaning of this bit is:
 *   - 0: Data cache clean to the Point of Unification is required
 *        for instruction to data coherence,
 *        unless CLIDR_EL1.LoC == 0b000 or
 *        (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
 *   - 1: Data cache clean to the Point of Unification is not
 *        required for instruction to data coherence.
 *
 * - DIC, bit [29]
 *   Instruction cache invalidation requirements for data to
 *   instruction coherence.
 *   - 0: Instruction cache invalidation to the Point of Unification
 *        is required for data to instruction coherence.
 *   - 1: Instruction cache invalidation to the Point of Unification
 *        is not required for data to instruction coherence.
 *
 * - TminLine, bits [37:32]
 *   Tag minimum Line. Log2 of the number of words covered by
 *   Allocation Tags in the smallest cache line of all caches
 *   which can contain Allocation tags that are controlled by
 *   the PE.
 *   - For an implementation with cache lines containing 64
 *     bytes of data and 4 Allocation Tags, this will be
 *     log2(64/4) = 4.
 *   - For an implementation with Allocations Tags in separate
 *     cache lines of 128 Allocation Tags per line, this will
 *     be log2(128*16/4) = 9.
 */
#define CTR_L1IP_SHIFT		14
#define CTR_L1IP_MASK		3
#define CTR_DMINLINE_SHIFT	16
#define CTR_IMINLINE_SHIFT	0
#define CTR_ERG_SHIFT		20
#define CTR_CWG_SHIFT		24
#define CTR_CWG_MASK		15
#define CTR_IDC_SHIFT		28
#define CTR_DIC_SHIFT		29

#define CTR_L1IP(ctr)		(((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)

/* Bytes to assume when CTR_EL0.CWG == 0 (architectural maximum, 512 words). */
#define CTR_CWG_MAX_BYTES	2048

/*
 * CSSELR_EL1, Cache Size Selection Register
 *
 * Selects the current Cache Size ID Register, CCSIDR_EL1,
 * by specifying the required cache level and the cache type
 * (either instruction or data cache).
 *
 * https://developer.arm.com/documentation/ddi0601/2022-03/
 * AArch64-Registers/CSSELR-EL1--Cache-Size-Selection-Register?lang=en
 *
 * - TnD, bit [4]
 *   - 0: Data, Instruction or Unified cache.
 *   - 1: Separate Allocation Tag cache.
 *
 * - Level, bits [3:1]
 *   Cache level of required cache.
 *   - 0b000 : Level 1 cache.
 *   - 0b001 : Level 2 cache.
 *   - 0b010 : Level 3 cache.
 *   - 0b011 : Level 4 cache.
 *   - 0b100 : Level 5 cache.
 *   - 0b101 : Level 6 cache.
 *   - 0b110 : Level 7 cache.
 *
 * - InD, bit [0]
 *   Instruction not Data bit.
 *   - 0 : Data or unified cache.
 *   - 1 : Instruction cache.
 */
#define CSSELR_IND_I		BIT(0)
#define CSSELR_LEVEL_SHIFT	1

/*
 * CCSIDR_EL1, Current Cache Size ID Register
 * Provides information about the architecture of
 * the currently selected cache.
 *
 * https://developer.arm.com/documentation/ddi0601/2022-03/
 * AArch64-Registers/CCSIDR-EL1--Current-Cache-Size-ID-Register?lang=en
 *
 * - NumSets, bits [27:13]
 *   (Number of sets in cache) - 1, therefore a value of
 *   0 indicates 1 set in the cache. The number of sets
 *   does not have to be a power of 2.
 *
 * - Associativity, bits [12:3]
 *   (Associativity of cache) - 1, therefore a value of
 *   0 indicates an associativity of 1. The associativity
 *   does not have to be a power of 2.
 *
 * - LineSize, bits [2:0]
 *   (Log2(Number of bytes in cache line)) - 4. For example:
 *   - For a line length of 16 bytes: Log2(16) = 4,
 *     LineSize entry = 0. This is the minimum line length.
 *   - For a line length of 32 bytes: Log2(32) = 5,
 *     LineSize entry = 1.
 */
#define CCSIDR_NUMSETS_SHIFT	13
/* NumSets is 15 bits wide, bits [27:13]. */
#define CCSIDR_NUMSETS_MASK	(0x7fffUL << CCSIDR_NUMSETS_SHIFT)
#define CCSIDR_ASS_SHIFT	3
#define CCSIDR_ASS_MASK		(0x3ffUL << CCSIDR_ASS_SHIFT)
#define CCSIDR_LINESIZE_MASK	(0x7UL)

/*
 * CLIDR_EL1, Cache Level ID Register
 *
 * Identifies the type of cache, or caches, that are implemented
 * at each level and can be managed using the architected cache
 * maintenance instructions that operate by set/way,
 * up to a maximum of seven levels.
 *
 * Also identifies the Level of Coherence (LoC) and Level of
 * Unification (LoU) for the cache hierarchy.
 *
 * https://developer.arm.com/documentation/ddi0601/2022-03/
 * AArch64-Registers/CLIDR-EL1--Cache-Level-ID-Register?lang=en
 *
 * - Ttype<n>, bits [2(n-1)+34:2(n-1)+33], for n = 7 to 1
 *   When FEAT_MTE2 is implemented:
 *   - 00 no tag cache
 *   - 01 separate allocation tag cache
 *   - 10 unified allocation tag and data cache
 *        allocation tags and data in unified lines.
 *   - 11 unified allocation tag and data cache
 *        allocation tags and data in separate lines.
 *
 * - ICB, bits [32:30]
 *   Inner cache boundary. This field indicates the boundary
 *   for caching Inner Cacheable memory regions.
 *   - 0b000 : Not disclosed by this mechanism.
 *   - 0b001 : L1 cache is the highest Inner Cacheable level.
 *   - ....  : Lx cache is the highest Inner Cacheable level.
 *   - 0b111 : L7 cache is the highest Inner Cacheable level.
 *
 * - LoUU, bits [29:27]
 *   Level of Unification Uniprocessor for the cache hierarchy.
 *
 * - LoC, bits [26:24]
 *   Level of Coherence for the cache hierarchy.
 *
 * - LoUIS, bits [23:21]
 *   Level of Unification Inner Shareable for the cache hierarchy.
 *
 * - Ctype<n>, bits [3(n-1)+2:3(n-1)], for n = 7 to 1
 *   Cache Type fields. Indicate the type of cache that is implemented
 *   and can be managed using the architected cache maintenance instructions
 *   that operate by set/way at each level, from Level 1 up to a maximum of
 *   seven levels of cache hierarchy. Possible values of each field are:
 *   - 0b000: No cache.
 *   - 0b001: Instruction cache only.
 *   - 0b010: Data cache only.
 *   - 0b011: Separate instruction and data caches.
 *   - 0b100: Unified cache.
 */
enum cache_type {
	CACHE_TYPE_NOCACHE = 0,
	CACHE_TYPE_INST = BIT(0),
	CACHE_TYPE_DATA = BIT(1),
	CACHE_TYPE_SEPARATE = CACHE_TYPE_INST | CACHE_TYPE_DATA,
	CACHE_TYPE_UNIFIED = BIT(2),
};

/* Printable names indexed by the Ctype<n> encodings 0b000..0b100. */
static const char *cache_type_string[] = {
	"nocache",
	"i-cache",
	"d-cache",
	"separate cache",
	"unifed cache"
};

#define MAX_CACHE_LEVEL		7

#define CLIDR_ICB_SHIFT		30
#define CLIDR_LOUU_SHIFT	27
#define CLIDR_LOC_SHIFT		24
#define CLIDR_LOUIS_SHIFT	21

#define CLIDR_ICB(clidr)	(((clidr) >> CLIDR_ICB_SHIFT) & 0x7)
#define CLIDR_LOUU(clidr)	(((clidr) >> CLIDR_LOUU_SHIFT) & 0x7)
#define CLIDR_LOC(clidr)	(((clidr) >> CLIDR_LOC_SHIFT) & 0x7)
#define CLIDR_LOUIS(clidr)	(((clidr) >> CLIDR_LOUIS_SHIFT) & 0x7)

/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */
#define CLIDR_CTYPE_SHIFT(level)	(3 * ((level) - 1))
#define CLIDR_CTYPE_MASK(level)		(7UL << CLIDR_CTYPE_SHIFT(level))
#define CLIDR_CTYPE(clidr, level)	\
	(((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))

/*
 * Read the raw CWG field from CTR_EL0.
 * The value is log2 of the granule size in words; 0 means "not provided".
 */
static inline unsigned int cache_type_cwg(void)
{
	return (read_sysreg(CTR_EL0) >> CTR_CWG_SHIFT) & CTR_CWG_MASK;
}

/*
 * Return the cache writeback granule in bytes (4 << CWG).
 *
 * When CWG reads as 0 the register does not provide the information, so the
 * architectural maximum of 512 words (2KB) is returned instead of 4 bytes.
 */
static inline unsigned int cache_line_size(void)
{
	u32 cwg = cache_type_cwg();

	if (cwg == 0)
		return CTR_CWG_MAX_BYTES;

	return 4U << cwg;
}

/*
 * Return the Ctype<level> field of CLIDR_EL1 for a cache level in
 * 1..MAX_CACHE_LEVEL. Levels outside that range report CACHE_TYPE_NOCACHE
 * (a level below 1 would otherwise produce a negative shift count).
 */
static inline enum cache_type get_cache_type(int level)
{
	unsigned long clidr_reg;

	if (level < 1 || level > MAX_CACHE_LEVEL)
		return CACHE_TYPE_NOCACHE;

	clidr_reg = read_sysreg(clidr_el1);
	return CLIDR_CTYPE(clidr_reg, level);
}

/*
 * Map a Ctype<n> value to a printable name. Encodings above 0b100 are
 * reserved and have no entry in cache_type_string[], so they are reported
 * as "reserved" rather than indexing past the end of the table.
 */
static const char *cache_type_name(unsigned long ctype)
{
	if (ctype >= sizeof(cache_type_string) / sizeof(cache_type_string[0]))
		return "reserved";

	return cache_type_string[ctype];
}

/*
 * Print the sets, ways, line size and total size of one cache.
 *
 * @level: cache level, 1-based (1..MAX_CACHE_LEVEL).
 * @ind:   CSSELR_EL1.InD value: 1 selects the instruction cache,
 *         0 selects the data or unified cache.
 *
 * We can cross-check the result against the raspi manual:
 * https://www.raspberrypi.org/documentation/hardware/raspberrypi/bcm2711/README.md
 *
 * Caches: 32 KB data + 48 KB instruction L1 cache per core. 1MB L2 cache.
 */
static void get_cache_set_way(unsigned int level, unsigned int ind)
{
	unsigned long sel;
	unsigned long val;
	unsigned long size_kb;
	unsigned int line_size, set, way;

	/* 1. Select the target cache by writing the CSSELR_EL1 register. */
	sel = ((unsigned long)(level - 1) << CSSELR_LEVEL_SHIFT) | ind;
	write_sysreg(sel, CSSELR_EL1);

	/*
	 * The selection must be synchronized before CCSIDR_EL1 is read,
	 * otherwise the read may still reflect the previously selected cache.
	 */
	__asm__ __volatile__("isb" : : : "memory");

	/*
	 * 2. Read CCSIDR_EL1.
	 * NOTE: the register has two layouts; this decodes the 32-bit one
	 * (NumSets [27:13], Associativity [12:3]), not the FEAT_CCIDX one.
	 */
	val = read_sysreg(CCSIDR_EL1);

	/* Both fields are stored as (value - 1). */
	set = (unsigned int)((val & CCSIDR_NUMSETS_MASK) >> CCSIDR_NUMSETS_SHIFT) + 1;
	way = (unsigned int)((val & CCSIDR_ASS_MASK) >> CCSIDR_ASS_SHIFT) + 1;
	line_size = 1U << ((val & CCSIDR_LINESIZE_MASK) + 4);

	/* Multiply in 64 bits: the byte count can exceed 32 bits. */
	size_kb = ((unsigned long)line_size * way * set) / 1024;

	printk(" %s: set %u way %u line_size %u size %uKB\n",
	       ind ? "i-cache" : "d/u cache",
	       set, way, line_size, (unsigned int)size_kb);
}

/*
 * Walk CLIDR_EL1 and print a description of every implemented cache level,
 * followed by the ICB/LoUU/LoC/LoUIS boundaries and the L1 I-cache policy.
 *
 * Returns the number of cache levels found (0..MAX_CACHE_LEVEL).
 */
int cache_self_test(void)
{
	int level;
	int levels = 0;
	unsigned long ctype;
	unsigned long reg_val;

	printk("parse cache info:\n");

	for (level = 1; level <= MAX_CACHE_LEVEL; level++) {
		/* get cache type */
		ctype = get_cache_type(level);

		/* The first level without a cache ends the hierarchy. */
		if (ctype == CACHE_TYPE_NOCACHE)
			break;

		levels = level;

		printk(" L%u: %s, cache line size %u\n",
		       (unsigned int)level, cache_type_name(ctype),
		       cache_line_size());

		/* Reserved encodings: geometry cannot be queried meaningfully. */
		if (ctype > CACHE_TYPE_UNIFIED)
			continue;

		/* Instruction side first, then data/unified side. */
		if (ctype & CACHE_TYPE_INST)
			get_cache_set_way(level, 1);
		if (ctype & (CACHE_TYPE_DATA | CACHE_TYPE_UNIFIED))
			get_cache_set_way(level, 0);
	}

	/*
	 * Get ICB, LOUU, LOC and LOUIS
	 * ICB: inner cache boundary.
	 * LOUU: Single core PoU cache boundary.
	 * LOC: PoC cache boundary.
	 * LOUIS: PoU for inner sharing cache boundary.
	 */
	reg_val = read_sysreg(clidr_el1);
	printk(" IBC:%u LOUU:%u LoC:%u LoUIS:%u\n",
	       (unsigned int)CLIDR_ICB(reg_val),
	       (unsigned int)CLIDR_LOUU(reg_val),
	       (unsigned int)CLIDR_LOC(reg_val),
	       (unsigned int)CLIDR_LOUIS(reg_val));

	reg_val = read_sysreg(ctr_el0);
	printk(" Detected %s I-cache\n", icache_policy_str[CTR_L1IP(reg_val)]);

	return levels;
}