有没有办法从C#调用RDTSC汇编指令?

我想为我的C#应用​​程序提供一个非常高分辨率的计时器。 我想访问RDTSC汇编指令。 有没有办法做到这一点?

编辑:我正在移植一些C ++代码并尝试保留与原始相同的function。 我可能会切换到更多.NET,但想要评估RDTSC指令,以便我可以将结果与原始结果进行比较。

以下是如何做到这一点:

 using System; using System.ComponentModel; using System.Diagnostics; using System.Runtime.InteropServices; public static class Rdtsc { [StructLayout(LayoutKind.Sequential)] private struct SystemInfo { public ushort wProcessorArchitecture; public ushort wReserved; public uint dwPageSize; public IntPtr lpMinimumApplicationAddress; public IntPtr lpMaximumApplicationAddress; public IntPtr dwActiveProcessorMask; public uint dwNumberOfProcessors; public uint dwProcessorType; public uint dwAllocationGranularity; public ushort wProcessorLevel; public ushort wProcessorRevision; } [DllImport("kernel32.dll", ExactSpelling = true)] private static extern void GetNativeSystemInfo(out SystemInfo lpSystemInfo); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] private static extern IntPtr VirtualAlloc(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, uint flProtect); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] [return: MarshalAs(UnmanagedType.Bool)] private static extern bool VirtualProtect(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, out uint lpflOldProtect); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] [return: MarshalAs(UnmanagedType.Bool)] private static extern bool VirtualFree(IntPtr lpAddress, IntPtr dwSize, uint dwFreeType); private const uint PAGE_READWRITE = 0x04; private const uint PAGE_EXECUTE = 0x10; private const uint MEM_COMMIT = 0x1000; private const uint MEM_RELEASE = 0x8000; [SuppressUnmanagedCodeSecurity] [UnmanagedFunctionPointer(CallingConvention.StdCall)] public delegate ulong TimestampDelegate(); public static readonly TimestampDelegate Timestamp; static Rdtsc() { SystemInfo systemInfo; GetNativeSystemInfo(out systemInfo); if (systemInfo.wProcessorArchitecture != 0 /* PROCESSOR_ARCHITECTURE_INTEL */ && systemInfo.wProcessorArchitecture != 9 /* PROCESSOR_ARCHITECTURE_AMD64 */) { // Fallback for ARM/IA64/... Timestamp = StopwatchGetTimestamp; return; } byte[] body; if (Environment.Is64BitProcess) { body = new byte[] { 0x0f, 0x31, // rdtsc 0x48, 0xc1, 0xe2, 0x20, // shl rdx,20h 0x48, 0x0b, 0xc2, // or rax,rdx 0xc3, // ret }; } else { body = new byte[] { 0x0f, 0x31, // rdtsc 0xc3, // ret }; } IntPtr buf = IntPtr.Zero; try { // We VirtualAlloc body.Length bytes, with R/W access // Note that from what I've read, MEM_RESERVE is useless // if the first parameter is IntPtr.Zero buf = VirtualAlloc(IntPtr.Zero, (IntPtr)body.Length, MEM_COMMIT, PAGE_READWRITE); if (buf == IntPtr.Zero) { throw new Win32Exception(); } // Copy our instructions in the buf Marshal.Copy(body, 0, buf, body.Length); // Change the access of the allocated memory from R/W to Execute uint oldProtection; bool result = VirtualProtect(buf, (IntPtr)body.Length, PAGE_EXECUTE, out oldProtection); if (!result) { throw new Win32Exception(); } // Create a delegate to the "function" Timestamp = (TimestampDelegate)Marshal.GetDelegateForFunctionPointer(buf, typeof(TimestampDelegate)); buf = IntPtr.Zero; } finally { // There was an error! if (buf != IntPtr.Zero) { // Free the allocated memory bool result = VirtualFree(buf, IntPtr.Zero, MEM_RELEASE); if (!result) { throw new Win32Exception(); } } } } // Fallback if rdtsc isn't available private static ulong StopwatchGetTimestamp() { return unchecked((ulong)Stopwatch.GetTimestamp()); } } 

一些说明:

  • 我已经包含了ARM处理器的后备( Stopwatch.GetTimestamp() )。
  • 没有使用CPUID来阻止RDTSC指令被移动( “rdtsc”之前的“cpuid” )。 我对集会不太满意,所以我不知道怎么做。 如果您想修改代码,请随意执行此操作并在“正确”的OpCodes上添加注释以供使用
  • 我正在使用RDTSC而不是 RDTSCP(同样的问题,汇编不是我的语言)
  • 非常慢……假设在没有内联的RDTSC的18-24个滴答中调用等效的Visual C ++代码,而在初始预热之后的C#版本在RDTSC的27-100个滴答中被调用。

Visual C ++比较代码:

 __declspec(noinline) uint64_t __stdcall Rdtsc(void) { return __rdtsc(); } 

Aaaaand现在…… 用rdtscp完全实现

 public static class Rdtsc { [StructLayout(LayoutKind.Sequential)] private struct SystemInfo { public ushort wProcessorArchitecture; public ushort wReserved; public uint dwPageSize; public IntPtr lpMinimumApplicationAddress; public IntPtr lpMaximumApplicationAddress; public IntPtr dwActiveProcessorMask; public uint dwNumberOfProcessors; public uint dwProcessorType; public uint dwAllocationGranularity; public ushort wProcessorLevel; public ushort wProcessorRevision; } [DllImport("kernel32.dll", ExactSpelling = true)] private static extern void GetNativeSystemInfo(out SystemInfo lpSystemInfo); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] private static extern IntPtr VirtualAlloc(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, uint flProtect); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] [return: MarshalAs(UnmanagedType.Bool)] private static extern bool VirtualProtect(IntPtr lpAddress, IntPtr dwSize, uint flAllocationType, out uint lpflOldProtect); [DllImport("kernel32.dll", ExactSpelling = true, SetLastError = true)] [return: MarshalAs(UnmanagedType.Bool)] private static extern bool VirtualFree(IntPtr lpAddress, IntPtr dwSize, uint dwFreeType); private const uint PAGE_READWRITE = 0x04; private const uint PAGE_EXECUTE = 0x10; private const uint PAGE_EXECUTE_READWRITE = 0x40; private const uint MEM_COMMIT = 0x1000; private const uint MEM_RELEASE = 0x8000; [SuppressUnmanagedCodeSecurity] [UnmanagedFunctionPointer(CallingConvention.StdCall)] public delegate ulong FuncUInt64(); ///  /// Uses rdtsc. On non-Intel uses Stopwatch.GetTimestamp. ///  public static readonly FuncUInt64 Timestamp; ///  /// Uses rdtscp if present. Otherwise uses cpuid + rdtsc. On /// non-Intel uses Stopwatch.GetTimestamp. ///  public static readonly FuncUInt64 TimestampP; public static readonly bool IsRdtscSupported; public static readonly bool IsRdtscPSupported; static Rdtsc() { SystemInfo systemInfo; GetNativeSystemInfo(out systemInfo); if (systemInfo.wProcessorArchitecture != 0 /* PROCESSOR_ARCHITECTURE_INTEL */ && systemInfo.wProcessorArchitecture != 9 /* PROCESSOR_ARCHITECTURE_AMD64 */) { // Fallback for ARM/IA64/... Timestamp = StopwatchGetTimestamp; TimestampP = StopwatchGetTimestamp; IsRdtscSupported = false; IsRdtscPSupported = false; return; } byte[] cpuid, rdtsc, rdtscp, rdtsccpuid; IsRdtscSupported = true; // Assembly generated with https://defuse.ca/online-x86-assembler.htm if (Environment.Is64BitProcess) { /* CPUID x64: push rbx; mov eax, 0x80000000; cpuid; mov ebx, 0x80000001; cmp eax, ebx; jb Error; mov eax, ebx; cpuid; mov eax, ecx; shl rax, 0x20; or rax, rdx jmp End; Error: xor rax, rax; End: pop rbx; ret; 0: 53 push rbx 1: b8 00 00 00 80 mov eax,0x80000000 6: 0f a2 cpuid 8: bb 01 00 00 80 mov ebx,0x80000001 d: 39 d8 cmp eax,ebx f: 72 0f jb 20  11: 89 d8 mov eax,ebx 13: 0f a2 cpuid 15: 89 c8 mov eax,ecx 17: 48 c1 e0 20 shl rax,0x20 1b: 48 09 d0 or rax,rdx 1e: eb 03 jmp 23  0000000000000020 : 20: 48 31 c0 xor rax,rax 0000000000000023 : 23: 5b pop rbx 24: c3 ret */ cpuid = new byte[] { 0x53, 0xB8, 0x00, 0x00, 0x00, 0x80, 0x0F, 0xA2, 0xBB, 0x01, 0x00, 0x00, 0x80, 0x39, 0xD8, 0x72, 0x16, 0x89, 0xD8, 0x48, 0xC7, 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0xA2, 0x89, 0xC8, 0x48, 0xC1, 0xE0, 0x20, 0x48, 0x09, 0xD0, 0xEB, 0x03, 0x48, 0x31, 0xC0, 0x5B, 0xC3 }; /* RDTSC x64: rdtsc; shl rdx, 0x20; or rax,rdx; ret; 0: 0f 31 rdtsc 2: 48 c1 e2 20 shl rdx,0x20 6: 48 09 d0 or rax,rdx 9: c3 ret */ rdtsc = new byte[] { 0x0F, 0x31, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0xC3 }; /* RDTSCP x64 rdtscp; shl rdx, 0x20; or rax, rdx; ret; 0: 0f 01 f9 rdtscp 3: 48 c1 e2 20 shl rdx,0x20 7: 48 09 d0 or rax,rdx a: c3 ret */ rdtscp = new byte[] { 0x0F, 0x01, 0xF9, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0xC3 }; /* RDTSC + CPUID x64 push rbx; xor eax, eax; cpuid; rdtsc; shl rdx, 0x20; or rax, rdx; pop rbx; ret; 0: 53 push rbx 1: 31 c0 xor eax,eax 3: 0f a2 cpuid 5: 0f 31 rdtsc 7: 48 c1 e2 20 shl rdx,0x20 b: 48 09 d0 or rax,rdx e: 5b pop rbx f: c3 ret */ rdtsccpuid = new byte[] { 0x53, 0x31, 0xC0, 0x0F, 0xA2, 0x0F, 0x31, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0x5B, 0xC3 }; } else { /* CPUID x86: push ebx; mov eax, 0x80000000; cpuid; mov ebx, 0x80000001; cmp eax, ebx; jb Error; mov eax, ebx; cpuid; mov eax, edx; mov edx, ecx; jmp End; Error: xor eax, eax; xor edx, edx; End: pop ebx; ret; 0: 53 push ebx 1: b8 00 00 00 80 mov eax,0x80000000 6: 0f a2 cpuid 8: bb 01 00 00 80 mov ebx,0x80000001 d: 39 d8 cmp eax,ebx f: 72 0a jb 1b  11: 89 d8 mov eax,ebx 13: 0f a2 cpuid 15: 89 d0 mov eax,edx 17: 89 ca mov edx,ecx 19: eb 04 jmp 1f  0000001b : 1b: 31 c0 xor eax,eax 1d: 31 d2 xor edx,edx 0000001f : 1f: 5b pop ebx 20: c3 ret */ cpuid = new byte[] { 0x53, 0xB8, 0x00, 0x00, 0x00, 0x80, 0x0F, 0xA2, 0xBB, 0x01, 0x00, 0x00, 0x80, 0x39, 0xD8, 0x72, 0x0A, 0x89, 0xD8, 0x0F, 0xA2, 0x89, 0xD0, 0x89, 0xCA, 0xEB, 0x04, 0x31, 0xC0, 0x31, 0xD2, 0x5B, 0xC3 }; /* RDTSC x86: rdtsc; ret; 0: 0f 31 rdtsc 2: c3 ret */ rdtsc = new byte[] { 0x0F, 0x31, 0xC3 }; /* RDTSCP x86 rdtscp; ret; 0: 0f 01 f9 rdtscp 3: c3 ret */ rdtscp = new byte[] { 0x0F, 0x01, 0xF9, 0xC3 }; /* RDTSC + CPUID x86 push ebx; xor eax,eax; cpuid; rdtsc; pop ebx; ret; 0: 53 push ebx 1: 31 c0 xor eax,eax 3: 0f a2 cpuid 5: 0f 31 rdtsc 7: 5b pop ebx 8: c3 ret */ rdtsccpuid = new byte[] { 0x53, 0x31, 0xC0, 0x0F, 0xA2, 0x0F, 0x31, 0x5B, 0xC3 }; } IntPtr buf = IntPtr.Zero; try { // We pad the functions to 64 bytes (the length of a cache // line on the Intel processors) int cpuidLength = (cpuid.Length & 63) != 0 ? (cpuid.Length | 63) + 1 : cpuid.Length; int rdtscLength = (rdtsc.Length & 63) != 0 ? (rdtsc.Length | 63) + 1 : rdtsc.Length; int rdtscpLength = (rdtscp.Length & 63) != 0 ? (rdtscp.Length | 63) + 1 : rdtscp.Length; int rdtsccpuidLength = (rdtsccpuid.Length & 63) != 0 ? (rdtsccpuid.Length | 63) + 1 : rdtsccpuid.Length; // We don't know which one of rdtscp or rdtsccpuid we will // use, so we calculate space for the biggest one. // Note that it is very unlikely that we will go over 4096 // bytes (the minimum size of memory allocated by // VirtualAlloc) int totalLength = cpuidLength + rdtscLength + Math.Max(rdtscpLength, rdtsccpuidLength); // We VirtualAlloc totalLength bytes, with R/W access // Note that from what I've read, MEM_RESERVE is useless // if the first parameter is IntPtr.Zero buf = VirtualAlloc(IntPtr.Zero, (IntPtr)totalLength, MEM_COMMIT, PAGE_EXECUTE_READWRITE); if (buf == IntPtr.Zero) { throw new Win32Exception(); } // Copy cpuid instructions in the buf Marshal.Copy(cpuid, 0, buf, cpuid.Length); for (int i = cpuid.Length; i < cpuidLength; i++) { Marshal.WriteByte(buf, i, 0x90); // nop } // Copy rdtsc instructions in the buf Marshal.Copy(rdtsc, 0, buf + cpuidLength, rdtsc.Length); for (int i = rdtsc.Length; i < rdtscLength; i++) { Marshal.WriteByte(buf, cpuidLength + i, 0x90); // nop } var cpuidFunc = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf, typeof(FuncUInt64)); // We use cpuid, EAX=0x80000001 to check for the rdtscp ulong supportedFeatures = cpuidFunc(); byte[] rdtscpSelected; int rdtscpSelectedLength; // Check the rdtscp flag if ((supportedFeatures & (1L << 27)) != 0) { // rdtscp supported rdtscpSelected = rdtscp; rdtscpSelectedLength = rdtscpLength; IsRdtscPSupported = true; } else { // rdtscp not supported. We use cpuid + rdtsc rdtscpSelected = rdtsccpuid; rdtscpSelectedLength = rdtsccpuidLength; IsRdtscPSupported = false; } // Copy rdtscp/rdtsccpuid instructions in the buf Marshal.Copy(rdtscpSelected, 0, buf + cpuidLength + rdtscLength, rdtscpSelected.Length); for (int i = rdtscpSelected.Length; i < rdtscpSelectedLength; i++) { Marshal.WriteByte(buf, cpuidLength + rdtscLength + i, 0x90); // nop } // Change the access of the allocated memory from R/W to Execute uint oldProtection; bool result = VirtualProtect(buf, (IntPtr)totalLength, PAGE_EXECUTE, out oldProtection); if (!result) { throw new Win32Exception(); } // Create a delegate to the "function" Timestamp = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf + cpuidLength, typeof(FuncUInt64)); TimestampP = (FuncUInt64)Marshal.GetDelegateForFunctionPointer(buf + cpuidLength + rdtscLength, typeof(FuncUInt64)); buf = IntPtr.Zero; } finally { // There was an error! if (buf != IntPtr.Zero) { // Free the allocated memory bool result = VirtualFree(buf, IntPtr.Zero, MEM_RELEASE); if (!result) { throw new Win32Exception(); } } } } // Fallback if rdtsc isn't available. We can't use directly // Stopwatch.GetTimestamp() because the return type is different. private static ulong StopwatchGetTimestamp() { return unchecked((ulong)Stopwatch.GetTimestamp()); } } 

它要长得多......有两种方法,

 ulong ts1 = Rdtsc.Timestamp(); ulong ts2 = Rdtsc.TimestampP(); 

第一个使用rdtsc ,而第二个使用rdtscprdtscp优于rdtsc因为它没有在管道中重新排序。 TimestampP方法使用cpuid + rdtsc对旧处理器进行回退,但回退速度相当慢。 对于这两种情况,使用Stopwatch.GetTimestamp()对非Intel / Amd处理器进行回退。 在内部,类使用cpuid指令来检查是否存在rdtscp指令。 有两个字段, IsRdtscSupportedIsRdtscPSupported ,它们告诉处理器是否支持rdtscrdtscp

没有P / invoke的简单版本的RDTSC。 此代码使用asm代码替换现有方法。 查看其他计时器@xanatos

 public static class Rdtsc { // private static readonly byte[] Rdtscp = { 0x0F, 0x01, 0xF9, 0xC3 }; //32 bit timer private static readonly byte[] Rdtscp = { 0x0F, 0x01, 0xF9, 0x48, 0xC1, 0xE2, 0x20, 0x48, 0x09, 0xD0, 0xC3 }; //64bit [MethodImpl(MethodImplOptions.NoInlining)] public static ulong TimestampP() { Stopwatch.GetTimestamp(); Stopwatch.GetTimestamp(); return 0; } public static unsafe void Init() { Func func = TimestampP; var reference = __makeref(func); var ptrReference = (IntPtr*)*(IntPtr*)&reference; var ptrFunc = (IntPtr*)*ptrReference; var ptrTimestampP = (byte*)*(ptrFunc + 0x4); foreach (var b in Rdtscp) *(ptrTimestampP++) = b; } } 

并获得周期数。 打这样的话

 public static void Main(string[] args) { CyclesPerSecond = GetCyclesPerSeond(); for (var i = 0; i < 3; i++) Console.WriteLine($"CyclesPerSecond: {GetCyclesPerSeond(),5:0} "); for (var i = 0; i < 10; i++) Console.WriteLine($"cycles: {ParseDecimal(),5:0.00} "); } private static ulong GetCyclesPerSeond() { Rdtsc.Init(); var sw = Stopwatch.StartNew(); var startms = Rdtsc.TimestampP(); do { } while (sw.ElapsedMilliseconds < 1000); var endms = Rdtsc.TimestampP(); return endms - startms; } private static double ParseDecimal() { var decStr = "-49823174320.9293800"; var mincycles = ulong.MaxValue; var testDuration = 10000000000UL; var testCycles = 0UL; var minIterations = 100; var pos = 0; var startTest = Rdtsc.TimestampP(); do { var start = Rdtsc.TimestampP(); for (var j = 0; j < minIterations; j++) decimal.Parse(decStr); var end = Rdtsc.TimestampP(); var cycles = end - start; if (cycles <= mincycles) mincycles = cycles; testCycles = Rdtsc.TimestampP() - startTest; } while (testCycles < testDuration); return mincycles / (double)minIterations; } 

添加到@xanatos的优秀答案,我需要以nanos报告延迟并成功使用此代码。 使用风险,Windows具体,似乎有破损。

 public static string GetRdtscTicksToNanos(ulong ticks) { double calcBaseClock = 1024d * (double)Stopwatch.Frequency; double rval = (double)ticks / calcBaseClock; return rval.ToString("N9"); } 

它似乎:

Stopwatch.Frequency = QueryPerformanceFrequency = RDTSC_RATE / 1024

请参阅: http : //aakinshin.net/blog/post/stopwatch/