/* Determine sizes of all data caches */

/// Cache size and behaviour
struct CacheInfo
{
    /// Size of the cache, in kilobytes, per CPU.
    /// For L1 unified (data + code) caches, this size is half the physical size.
    /// (we don't halve it for larger sizes, since normally
    /// data size >> code size for critical loops).
    uint size;
    /// Number of ways of associativity, eg:
    /// 1 = direct mapped
    /// 2 = 2-way set associative
    /// 3 = 3-way set associative
    /// ubyte.max = fully associative
    ubyte associativity;
    /// Number of bytes read into the cache when a cache miss occurs.
    uint lineSize;
}

/// The data caches. If there are fewer than 3 physical caches,
/// the remaining levels are set to uint.max/1024 kilobytes (== entire memory space)
CacheInfo[3] datacache;

/// Processor identification, filled in by getcacheinfoX86() from CPUID leaf 1.
uint stepping, model, family;

//TODO: Implement this function properly
/// Fills datacache[0] from hard-coded per-model tables for PowerPC.
/// The CPU type is currently fixed to PPC603 (see TODO below).
void getCacheInfoPPC()
{
    enum : int { PPC601, PPC603, PPC603E, PPC604, PPC604E, PPC620 }
    int cputype = PPC603; // TODO: Determine PPC CPU type
    // 601 has a 8KB combined data & code cache.
    uint[] sizes = [4, 8, 16, 16, 32, 8];
    ubyte[] ways = [8, 2, 4, 4, 4, 8];
    datacache[0].size = sizes[cputype];
    datacache[0].associativity = ways[cputype];
    datacache[0].lineSize = (cputype == PPC601 || cputype == PPC604) ? 64 : 32;
}

version(D_InlineAsm_X86)
{
    /// Highest supported standard / extended CPUID leaf, set by getcacheinfoX86().
    uint max_cpuid, max_extended_cpuid;

    // CPUID2: "cache and tlb information"
    /// Fills datacache from the one-byte descriptors returned by CPUID leaf 2.
    void getcacheinfoCPUID2()
    {
        // CPUID2 is a dog's breakfast. What was Intel thinking???
        // We are only interested in the data caches
        // We only use this for old Intel CPUs, so we can assume a single-core system.

        // Looks up one descriptor byte in the tables below and, if it names a
        // data cache, records its size, associativity and line size.
        void decipherCpuid2(ubyte x)
        {
            if (x == 0) return;
            ubyte[] ids = [
                0x0A, 0x0C, 0x2C, 0x60,
                // level 2 cache
                0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
                0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
                // level 3 cache
                0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D
            ];
            uint[] sizes = [
                8, 16, 32, 16,
                128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
                256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
                512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024
            ];
            ubyte[] ways = [
                2, 4, 8, 8,
                4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
                8, 8, 8, 8, 4, 8, 16, 24,
                4, 8, 8, 8, 4, 8, 12, 16, 12, 16
            ];
            // Index of the first L2 entry and of the first L3 entry in the tables.
            enum { FIRSTDATA2 = 4, FIRSTDATA3 = 24 }
            for (int i = 0; i < ids.length; ++i)
            {
                if (x != ids[i]) continue;
                // NOTE(review): the source text between "i" and "=0x49" on the
                // level line was lost in extraction. The level selection, the
                // size/associativity stores and the first terms of the line-size
                // test are reconstructed from the otherwise unused tables and
                // from Intel's descriptor list -- confirm against the original.
                int level = i < FIRSTDATA2 ? 0 : i < FIRSTDATA3 ? 1 : 2;
                datacache[level].size = sizes[i];
                datacache[level].associativity = ways[i];
                if (level == 2 || x == 0x2C || (x >= 0x49 && x <= 0x7F)
                    || x == 0x86 || x == 0x87)
                {
                    datacache[level].lineSize = 64;
                }
                else
                {
                    datacache[level].lineSize = 32;
                }
                break; // ids are unique, nothing more to match
            }
        }

        uint[4] a;
        bool firstTime = true;
        // Number of CPUID(2) invocations needed; refined after the first call.
        uint numinfos = 1;
        for (;;)
        {
            asm {
                mov EAX, 2;
                cpuid;
                mov a, EAX;
                mov a+4, EBX;
                mov a+8, ECX;
                mov a+12, EDX;
            }
            if (firstTime)
            {
                // lsb of a is how many times to loop.
                numinfos = a[0] & 0xFF;
                // and otherwise it should be ignored
                a[0] &= 0xFFFF_FF00;
                firstTime = false;
            }
            for (int c = 0; c < 4; ++c)
            {
                // high bit set == no info.
                if (a[c] & 0x8000_0000) continue;
                decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c] >> 8) & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c] >> 16) & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c] >> 24) & 0xFF));
            }
            // A reported count of 0 must not wrap around to uint.max iterations.
            if (numinfos <= 1) break;
            --numinfos;
        }
    }

    // CPUID4: "Deterministic cache parameters" leaf
    /// Fills datacache by enumerating the sub-leaves of CPUID leaf 4.
    void getcacheinfoCPUID4()
    {
        int cachenum = 0;
        for (;;)
        {
            uint a, b, number_of_sets;
            asm {
                mov EAX, 4;
                mov ECX, cachenum;
                cpuid;
                mov a, EAX;
                mov b, EBX;
                mov number_of_sets, ECX;
            }
            ++cachenum;
            uint cachetype = a & 0x1F;
            if (cachetype == 0) break; // no more caches
            if (cachetype != 1 && cachetype != 3) continue; // we only want data & unified caches
            ++number_of_sets;
            ubyte level = cast(ubyte)(((a >> 5) & 7) - 1);
            // ignore deep caches (and a level field of 0, which wraps to 255);
            // '>=' because datacache has valid indices 0 .. length-1 only.
            if (level >= datacache.length) continue;
            uint numthreads = ((a >> 14) & 0xFFF) + 1;
            uint numcores = ((a >> 26) & 0x3F) + 1;
            datacache[level].associativity =
                (a & 0x200) ? ubyte.max : cast(ubyte)((b >> 22) + 1);
            datacache[level].lineSize = (b & 0xFFF) + 1; // system coherency line size
            uint line_partitions = ((b >> 12) & 0x3FF) + 1;
            // Size = number of sets * associativity * cachelinesize * linepartitions
            // and must convert to Kb, also dividing by the number of cores.
            ulong sz = (datacache[level].associativity < ubyte.max)
                ? number_of_sets * datacache[level].associativity
                : number_of_sets;
            datacache[level].size = cast(uint)(
                (sz * datacache[level].lineSize * line_partitions) / (numcores * 1024));
            if (level == 0 && cachetype == 3)
            {
                // Halve the size for unified L1 caches
                datacache[level].size /= 2;
            }
        }
    }

    // CPUID8000_0005&6
    /// Fills all three datacache levels from the extended CPUID leaves
    /// 0x8000_0005 (L1) and 0x8000_0006 (L2/L3).
    void getAMDcacheinfo()
    {
        uint c5, c6, d6;
        asm {
            mov EAX, 0x8000_0005; // L1 cache
            cpuid;
            // EAX has L1_TLB_4M.
            // EBX has L1_TLB_4K
            // EDX has L1 instruction cache
            mov c5, ECX;
            mov EAX, 0x8000_0006; // L2/L3 cache
            cpuid;
            mov c6, ECX; // L2 cache info
            mov d6, EDX; // L3 cache info
        }

        ubyte numcores = 0; // raw CL value: core count minus one
        if (max_extended_cpuid >= 0x8000_0008)
        {
            asm {
                mov EAX, 0x8000_0008;
                cpuid;
                mov numcores, CL;
            }
        }
        // Widen before adding one so that CL == 0xFF cannot wrap to 0 and
        // cause a division by zero below.
        uint cores = numcores + 1u;

        datacache[0].size = (c5 >> 24) & 0xFF;
        datacache[0].associativity = cast(ubyte)((c5 >> 16) & 0xFF);
        datacache[0].lineSize = c5 & 0xFF;

        // Maps the 4-bit associativity code used for L2/L3 to a number of ways.
        ubyte[] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
        datacache[1].size = (c6 >> 16) & 0xFFFF;
        datacache[1].associativity = assocmap[(c6 >> 12) & 0xF];
        datacache[1].lineSize = c6 & 0xFF;

        // The L3 cache value is TOTAL, not per core.
        datacache[2].size = ((d6 >> 18) * 512) / cores; // could be up to 2 * this, -1.
        datacache[2].associativity = assocmap[(d6 >> 12) & 0xF];
        datacache[2].lineSize = d6 & 0xFF;
    }

    /// Identifies the CPU (vendor, family, model, stepping) and fills datacache
    /// using whichever CPUID-based method the processor reports as available.
    void getcacheinfoX86()
    {
        uint a, b, c, d;
        asm {
            mov EAX, 0;
            cpuid;
            mov max_cpuid, EAX;
            mov b, EBX;
            mov c, ECX;
            mov d, EDX;
            mov EAX, 0x8000_0000;
            cpuid;
            mov max_extended_cpuid, EAX;
        }
        bool isAMD = (b == 0x6874_7541 && d == 0x6974_6E65 && c == 0x444D_4163); //"AuthenticAMD"
        bool isIntel = (b == 0x756E_6547 && d == 0x4965_6E69 && c == 0x6c65_746E); //"GenuineIntel"
        asm {
            mov EAX, 1; // model, stepping
            cpuid;
            mov a, EAX;
            mov b, EBX;
            mov c, ECX;
            mov d, EDX;
        }

        stepping = a & 0xF;
        uint fbase = (a >> 8) & 0xF;
        uint mbase = (a >> 4) & 0xF;
        // '+' binds tighter than '&', so the mask must be parenthesised to apply
        // to the extended-family field only.
        family = (fbase == 0xF) ? fbase + ((a >> 20) & 0xFF) : fbase;
        model = ((fbase == 0xF) || (fbase == 6 && isIntel))
            ? mbase + ((a >> 12) & 0xF0)
            : mbase;

        // Intel docs specify that they return 0 for 0x8000_0005.
        // AMD docs do not specify the behaviour for 0004 and 0002.
        // For other manufacturers, the information is scant. Try all of
        // the Intel or AMD methods which are reported as possibly
        // available.
        if (!isIntel)
        {
            if (max_extended_cpuid >= 0x8000_0006)
            {
                getAMDcacheinfo();
            }
            else if (isAMD)
            {
                // early AMD. (How early?) Guess the cache size
                // BUG: these are not known to be the correct values.
                datacache[0].size = 16; // kilobytes, as CacheInfo.size requires
                datacache[0].associativity = 2;
                datacache[0].lineSize = 32;
            }
        }

        if (!isAMD && max_cpuid >= 4)
        {
            getcacheinfoCPUID4();
        }
        else if (!isAMD && max_cpuid >= 2)
        {
            getcacheinfoCPUID2();
        }
        else if (datacache[0].size == 0)
        {
            // Only guess when nothing above produced L1 data; otherwise the
            // values just read by getAMDcacheinfo() would be overwritten.
            // Pentium, PMMX, late model 486, or an obscure CPU
            // MMX support is CPUID.1:EDX bit 23 (d holds EDX from leaf 1).
            bool hasmmx = max_cpuid >= 1 && (d & 0x0080_0000) != 0;
            if (hasmmx)
            {
                // Pentium MMX. Also has 8kB code cache.
                datacache[0].size = 16;
                datacache[0].associativity = 4;
                datacache[0].lineSize = 32;
            }
            else
            {
                // Pentium 1. Also has 8kB code cache.
                // (BUG: could be 486)
                datacache[0].size = 8;
                datacache[0].associativity = 2;
                datacache[0].lineSize = 32;
            }
        }
    }

    // Return true if the cpuid instruction is supported
    /// Tries to flip bit 21 (0x0020_0000) of EFLAGS; the bit only changes on
    /// processors that implement CPUID.
    bool hasCPUID()
    {
        uint flags;
        asm {
            pushfd;
            pop EAX;
            mov flags, EAX;
            xor EAX, 0x0020_0000;
            push EAX;
            popfd;
            pushfd;
            pop EAX;
            xor flags, EAX;
        }
        return (flags & 0x0020_0000) != 0;
    }
}
else
{
    // inline asm X86 is not available
    /// Without inline assembly CPUID cannot be issued, so report it as absent.
    bool hasCPUID() { return false; }

    // Assume it's an X86.
    /// Fallback: stores the Pentium 1 guess for the L1 data cache.
    void getcacheinfoX86()
    {
        datacache[0].size = 8;
        datacache[0].associativity = 2;
        datacache[0].lineSize = 32;
    }
}

/// Populates datacache: queries the CPU when CPUID is available, then fills
/// every level that is still unknown with a default.
void getcacheinfo()
{
    if (hasCPUID())
    {
        getcacheinfoX86();
    }
    else
    {
        // it's a 386 or 486, or a Cyrix 6x86.
        //Probably still has an external cache.
    }
    if (datacache[0].size == 0)
    {
        // Guess same as Pentium 1.
        datacache[0].size = 8;
        datacache[0].associativity = 2;
        datacache[0].lineSize = 32;
    }
    for (int i = 1; i < datacache.length; ++i)
    {
        if (datacache[i].size == 0)
        {
            // Set all remaining levels of cache equal to full address space.
            datacache[i].size = uint.max/1024;
            datacache[i].associativity = 1;
            datacache[i].lineSize = datacache[i-1].lineSize;
        }
    }
}

import std.stdio : writefln;
import std.cpuid;

void main()
{
    // Pass the description as an argument, not as the format string, so a
    // '%' in it cannot be misread as a format specifier.
    writefln("%s", std.cpuid.toString());
    getcacheinfo();
    writefln("Family=%X Model=%X Stepping=%X", family, model, stepping);
    writefln("Data caches:");
    // NOTE(review): the remainder of main() lies beyond the visible source;
    // the truncated loop header below is reproduced verbatim.
    for (int i=0; i