If everything is working properly, it won't say anything. On my machine it rarely goes 15 seconds without a detection, averaging 10 to 30 per minute. It needs the system to be idle and clocks to be stable to be effective, and it will error at launch if it doesn't see conditions it can work with. The load it puts on the system looks like it will usually push clocks where they need to be, but I'm not 100% sure about that, especially on AMD CPUs. If you keep seeing that error, the solid fix is to set the minimum and maximum processor state to 99% in Windows' power settings.
My own testing is on Win10 Home 1809 and an R7 1700 at a fixed 4.0 GHz OC.
<technical ramblings below this point>
The situation it's looking for is one where a logical core gets a new thread scheduled on it while the other logical core of that physical core is already fully utilized and another physical core sits idle. AFAIK this should legitimately happen once in a while, because a thread that has only been descheduled very briefly should probably go back to the physical core it was last on for the sake of more relevant cache contents (unless security mitigations mess with that anyway?). If another thread has been scheduled on that physical core the whole time, though, the window in which that could be useful is brief. We're sampling at one-millisecond intervals, so we could catch an occasional false positive if Windows is doing something like that for the caches.
The associated workload just wakes and sleeps threads at random intervals, drawn from a pool two smaller than the detected logical core count.
The main loop busy-waits (really just running the PRNG that decides thread wake intervals) because sleeping at this kind of granularity sometimes, but only sometimes, completely fixes whatever the problem is. That's probably the bit I'll try to nail down next, because it could at least be a workaround; see the sketch after the code.
The weird requirement on clocks is because Windows makes it easy to get the idle cycle count per logical core (QueryIdleProcessorCycleTime), but not the active cycle count; AFAICT getting that would involve figuring a bunch of stuff out per process. Not having a handy way to get a real percentage, it just assumes the highest idle-cycles-per-microsecond rate it has seen corresponds to a completely idle core at full clock, confirms on launch that this gives sane results, and goes from there.
Code:
#define THRESHOLD_LOW 5 // CPU considered idle if < this value (idle is fuzzy)
#define THRESHOLD_EDGE 25 // going from idle to > this means a newly scheduled thrd
#define RAND_WAKE_IVAL (1024*1024*8) // inverse of the chance a worker will wake per PRNG run
#define RAND_SLEEP_IVAL (1024*1024*32) // inverse of the chance a worker will sleep per PRNG run
#define PRE_LOOPS 1024 // initial loops to spend verifying clocks/idle are good
#define PRE_IDLE_PCT_MIN 80 // minimum non-self idle in initial loops
#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <process.h> // for _beginthread
CONDITION_VARIABLE cond = CONDITION_VARIABLE_INIT;
SRWLOCK lock = SRWLOCK_INIT;
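// microseconds since an arbitrary epoch, via QueryPerformanceCounter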
uint64_t jtMicros(void)
{
static int64_t freq = 0;
LARGE_INTEGER ms_hack_tmp;
if(freq == 0) { // first run init
QueryPerformanceFrequency(&ms_hack_tmp);
freq = ms_hack_tmp.QuadPart;
}
QueryPerformanceCounter(&ms_hack_tmp);
return ms_hack_tmp.QuadPart * 1000000 / freq;
}
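// worker threads: block on the condition variable until main wakes one, then spin the
// xorshift PRNG until it rolls "sleep again" (roughly 1 in RAND_SLEEP_IVAL per step)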
void worker(void* param) // signature required by _beginthread
{
(void)param; // unused
uint64_t busy_data = 1;
while(1) {
AcquireSRWLockExclusive(&lock);
SleepConditionVariableSRW(&cond, &lock, INFINITE, 0);
ReleaseSRWLockExclusive(&lock);
do {
busy_data ^= busy_data >> 12;
busy_data ^= busy_data << 25;
busy_data ^= busy_data >> 27;
} while(busy_data & (RAND_SLEEP_IVAL - 1));
}
}
int main(int argc, char* argv[])
{
SYSTEM_INFO sys_info;
GetNativeSystemInfo(&sys_info);
int cpu_count = (int)sys_info.dwNumberOfProcessors;
ULONG buffer_len = sizeof(uint64_t) * cpu_count; // bytes: one 64-bit counter per logical processor
for(int i = 0; i < cpu_count-2; i++) _beginthread(worker, 0, NULL);
uint64_t* raw_new = calloc(cpu_count, sizeof(uint64_t)); // calloc so the first-pass copies read zeroes, not garbage
uint64_t* raw_old = calloc(cpu_count, sizeof(uint64_t));
uint64_t* per_us_new = calloc(cpu_count, sizeof(uint64_t));
uint64_t* per_us_old = calloc(cpu_count, sizeof(uint64_t));
uint64_t* pct_new = calloc(cpu_count, sizeof(uint64_t));
uint64_t* pct_old = calloc(cpu_count, sizeof(uint64_t));
bool* both_low = malloc(cpu_count * sizeof(bool));
bool* both_high = malloc(cpu_count * sizeof(bool));
bool* rising_edge = malloc(cpu_count * sizeof(bool));
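// highest idle-cycles-per-microsecond rate seen so far; treated as what a completely
// idle core at a stable clock would report (see the note about clocks above)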
uint64_t max_cycles_per_us = 1;
uint64_t epoch = jtMicros();
uint64_t time_raw = 0;
uint64_t time_delta = 0;
uint64_t busy_data = 1;
uint64_t pre_loops = PRE_LOOPS;
while(1) {
///////////////////////////////////////////////////////////////////////////////////////////////////
// main loop begin
///////////////////////////////////////////////////////////////////////////////////////////////////
memcpy(raw_old , raw_new , buffer_len);
memcpy(per_us_old, per_us_new, buffer_len);
memcpy(pct_old , pct_new , buffer_len);
time_delta = jtMicros() - time_raw;
time_raw = jtMicros();
QueryIdleProcessorCycleTime(&buffer_len, raw_new);
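// convert each core's idle-cycle delta into cycles/us, then flip it into a rough
// busy percentage (0 = looked idle all sample, 100 = no idle cycles at all)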
for(int i = 0; i < cpu_count; i++) {
per_us_new[i] = (raw_new[i] - raw_old[i]) / time_delta;
if(pre_loops != PRE_LOOPS) { // running this on the first cycle will break things
if(per_us_new[i] > max_cycles_per_us) max_cycles_per_us = per_us_new[i];
}
pct_new[i] = 100 * per_us_new[i] / max_cycles_per_us;
pct_new[i] = 100 - pct_new[i];
both_low[i] = pct_new[i] <= THRESHOLD_LOW && pct_old[i] <= THRESHOLD_LOW;
both_high[i] = pct_new[i] >= 100 && pct_old[i] >= 100 ;
rising_edge[i] = pct_new[i] >= THRESHOLD_EDGE && pct_old[i] <= THRESHOLD_LOW;
}
bool new_shared_core = false;
bool idle_core = false;
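// logical processors 2i and 2i+1 are assumed to be SMT siblings of one physical core;
// flag the case where one sibling stayed pegged, the other just picked up a new thread,
// and some other physical core sat idle across both samples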
for(int i = 0; i < cpu_count; i += 2) {
if(both_low[i] && both_low[i+1] ) idle_core = true;
if(both_high[i] && rising_edge[i+1]) new_shared_core = true;
if(both_high[i+1] && rising_edge[i] ) new_shared_core = true;
}
if(idle_core && new_shared_core && !pre_loops) {
printf("[ %10.3f ] suboptimal schedule\n", (double)(jtMicros() - epoch) / 1000000.0);
}
if(time_delta > 5000 && !pre_loops) {
printf("[ %10.3f ] sample delayed\n", (double)(jtMicros() - epoch) / 1000000.0);
}
// HACK - we busy-wait here because making the scheduler do stuff too often (with Sleep())
// seems to make the problem completely disappear (but only sometimes? Haven't figured
// out the logic to that yet)
// in this case we busy-wait by running the PRNG to figure out when to spawn threads
while(jtMicros() - time_raw < 1000) {
for(int i = 0; i < 131072; i++) {
busy_data ^= busy_data >> 12;
busy_data ^= busy_data << 25;
busy_data ^= busy_data >> 27;
if(!(busy_data & (RAND_WAKE_IVAL - 1)) && !pre_loops) WakeConditionVariable(&cond);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// main loop end
///////////////////////////////////////////////////////////////////////////////////////////////////
// first few loops are init
if(pre_loops) {
static uint64_t pre_idle_samples = 0;
for(int i = 0; i < cpu_count; i++) if(pct_new[i] <= THRESHOLD_LOW) pre_idle_samples++;
pre_loops--;
if(!pre_loops) {
uint64_t pre_idle_pct = 100 * pre_idle_samples / ((cpu_count-1) * PRE_LOOPS);
if(pre_idle_pct > PRE_IDLE_PCT_MIN) {
printf("[ %10.3f ] init success, %llu%% non-self idle\n",
(double)(jtMicros() - epoch) / 1000000.0, pre_idle_pct);
} else {
printf("[ %10.3f ] init failure, %llu%% non-self idle\n",
(double)(jtMicros() - epoch) / 1000000.0, pre_idle_pct);
printf("Either your system isn't idle enough, or cores are downclocking.\n");
printf("Press enter to exit.\n");
getchar();
exit(-1);
}
}
}
}
}
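For anyone wondering what the Sleep() variant mentioned above would look like: roughly the below, dropped in place of the busy-wait loop. This is just a sketch, not what the posted build does, and the wake odds are a placeholder (with only one PRNG step per millisecond, the original RAND_WAKE_IVAL would basically never fire).
Code:
// hypothetical Sleep()-paced version of the sampling delay - NOT the posted behaviour
while(jtMicros() - time_raw < 1000) {
Sleep(1); // hand the timeslice back instead of busy-waiting; this is what sometimes makes detections vanish
busy_data ^= busy_data >> 12;
busy_data ^= busy_data << 25;
busy_data ^= busy_data >> 27;
if(!(busy_data & 7) && !pre_loops) WakeConditionVariable(&cond); // placeholder ~1-in-8 odds per millisecond
}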
EDIT: fixed a bug and updated the source and executable (apparently I'm an idiot, but having done basically the same thing twice in a row now, at least I know which bit of my code quality I need to take more seriously). No idea how it worked so reliably for me before.