/***** includes *****/
#include "libbenchmark_porting_abstraction_layer_internal.h"





/****************************************************************************/
#if( defined _WIN32 && !defined KERNEL_MODE && NTDDI_VERSION >= NTDDI_WINXPSP3 && NTDDI_VERSION < NTDDI_WIN7 )

#ifdef LIBBENCHMARK_PAL_POPULATE_TOPOLOGY
#error More than one porting abstraction layer matches current platform in "libbenchmark_porting_abstraction_layer_populate_topology.c".
#endif

#define LIBBENCHMARK_PAL_POPULATE_TOPOLOGY

/***** private prototypes *****/
static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t bitmask );

/****************************************************************************/
/* Populates *ts with the machine topology reported by
   GetLogicalProcessorInformation() (Windows XP SP3 up to, not including,
   Windows 7; user-mode).

   ts : topology state to which the nodes are added
   ms : shared memory state from which every allocation is taken

   Returns 1 once the topology has been populated.  Returns 0 if the OS
   query yields no data, the buffer cannot be obtained, or the query fails
   (0 as "failure" follows the 1-on-success value used elsewhere in this
   file - confirm against callers).

   Logical processor numbers are zero-based bit positions in the processor
   mask, matching every other platform variant in this file.
*/
int libbenchmark_porting_abstraction_layer_populate_topology( struct libbenchmark_topology_state *ts, struct libshared_memory_state *ms )
{
  BOOL
    brv;

  DWORD
    slpi_length = 0,
    number_slpi,
    loop;

  // TRD : indexed by PROCESSOR_CACHE_TYPE; only the three types below are accepted
  enum libbenchmark_topology_node_cache_type
    processor_cache_type_to_libbenchmark_topology_node_cache_type[3] =
    {
      LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_UNIFIED,
      LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_INSTRUCTION,
      LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_DATA
    };

  lfds710_pal_uint_t
    logical_processor_number;

  struct libbenchmark_topology_node_state
    *tns;

  SYSTEM_LOGICAL_PROCESSOR_INFORMATION
    *slpi,
    *record;

  ULONG_PTR
    mask;

  LFDS710_PAL_ASSERT( ts != NULL );
  LFDS710_PAL_ASSERT( ms != NULL );

  /* obtain information from the OS
     the first call passes no buffer and is used only to learn the required length
  */

  (void) GetLogicalProcessorInformation( NULL, &slpi_length );

  if( slpi_length == 0 )
    return 0;

  slpi = libshared_memory_alloc_from_most_free_space_node( ms, slpi_length, sizeof(lfds710_pal_uint_t) );

  // NOTE(review): assumes the allocator signals failure with NULL - confirm
  if( slpi == NULL )
    return 0;

  brv = GetLogicalProcessorInformation( slpi, &slpi_length );

  if( brv == FALSE )
    return 0;

  number_slpi = slpi_length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);

  /* we loop twice over the topology information
     first pass  : form the system node from the LPs of every NUMA node record, and add it
     second pass : everything else
  */

  libbenchmark_misc_pal_helper_new_topology_node( &tns, ms );

  for( loop = 0 ; loop < number_slpi ; loop++ )
    if( slpi[loop].Relationship == RelationNumaNode )
      internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpi[loop].ProcessorMask );

  libbenchmark_misc_pal_helper_add_system_node_to_topology_tree( ts, tns );

  for( loop = 0 ; loop < number_slpi ; loop++ )
  {
    record = slpi + loop;

    switch( record->Relationship )
    {
      case RelationNumaNode:
        libbenchmark_misc_pal_helper_new_topology_node( &tns, ms );
        internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) record->ProcessorMask );
        libbenchmark_misc_pal_helper_add_numa_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) record->NumaNode.NodeNumber );

        /* add each LP of this NUMA node as an individual LP node
           the LP number is the bit position in the mask, not the bit value
        */
        logical_processor_number = 0;

        for( mask = record->ProcessorMask ; mask != 0 ; mask >>= 1 )
        {
          if( mask & 0x1 )
            libbenchmark_misc_pal_helper_add_logical_processor_node_to_topology_tree( ts, ms, logical_processor_number, LOWERED, 0 );

          logical_processor_number++;
        }
      break;

      case RelationProcessorPackage:
        libbenchmark_misc_pal_helper_new_topology_node( &tns, ms );
        internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) record->ProcessorMask );
        libbenchmark_misc_pal_helper_add_socket_node_to_topology_tree( ts, tns );
      break;

      case RelationProcessorCore:
        libbenchmark_misc_pal_helper_new_topology_node( &tns, ms );
        internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) record->ProcessorMask );
        libbenchmark_misc_pal_helper_add_physical_processor_node_to_topology_tree( ts, tns );
      break;

      case RelationCache:
        // other cache types (e.g. trace) have no entry in the lookup table and are skipped
        if( record->Cache.Type == CacheUnified or record->Cache.Type == CacheInstruction or record->Cache.Type == CacheData )
        {
          libbenchmark_misc_pal_helper_new_topology_node( &tns, ms );
          internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) record->ProcessorMask );
          libbenchmark_misc_pal_helper_add_cache_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) record->Cache.Level, processor_cache_type_to_libbenchmark_topology_node_cache_type[record->Cache.Type] );
        }
      break;

      default:
        // every other relationship type is ignored
      break;
    }
  }

  return 1;
}

/****************************************************************************/
/* Adds to *tns one logical processor for every set bit in bitmask.

   ms      : shared memory state used by the helper for allocation
   tns     : topology node receiving the logical processors
   bitmask : processor mask; may be any value, including 0 (adds nothing)

   Bit n of the mask is logical processor n (zero-based).
*/
static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t bitmask )
{
  lfds710_pal_uint_t
    logical_processor_number;

  LFDS710_PAL_ASSERT( ms != NULL );
  LFDS710_PAL_ASSERT( tns != NULL );
  // bitmask can be any value in its range

  for( logical_processor_number = 0 ; bitmask != 0 ; logical_processor_number++ )
  {
    if( bitmask & 0x1 )
      libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, logical_processor_number, LOWERED, 0 );

    bitmask >>= 1;
  }

  return;
}

#endif





/****************************************************************************/
#if( defined _WIN32 && !defined KERNEL_MODE && NTDDI_VERSION >= NTDDI_WIN7 )

#ifdef LIBBENCHMARK_PAL_POPULATE_TOPOLOGY
#error More than one porting abstraction layer matches current platform in "libbenchmark_porting_abstraction_layer_populate_topology.c".
#endif #define LIBBENCHMARK_PAL_POPULATE_TOPOLOGY static int numa_node_id_to_numa_node_id_compare_function( void const *new_key, void const *existing_key ); static void nna_cleanup( struct lfds710_btree_au_state *baus, struct lfds710_btree_au_element *baue ); static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t windows_processor_group_number, lfds710_pal_uint_t bitmask ); int libbenchmark_porting_abstraction_layer_populate_topology( struct libbenchmark_topology_state *ts, struct libshared_memory_state *ms ) { BOOL brv; DWORD offset = 0, slpie_length = 0, subloop; /* enum libbenchmark_topology_node_cache_type processor_cache_type_to_libbenchmark_topology_node_cache_type[3] = { LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_UNIFIED, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_INSTRUCTION, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_DATA }; */ int rv = 1; KAFFINITY bitmask; lfds710_pal_uint_t logical_processor_number; struct lfds710_btree_au_element *baue; struct lfds710_btree_au_state nna_tree_state; struct libbenchmark_topology_node_state *tns; SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *slpie, *slpie_buffer = NULL; LFDS710_PAL_ASSERT( ts != NULL ); LFDS710_PAL_ASSERT( ms != NULL ); // TRD : obtain information from the OS brv = GetLogicalProcessorInformationEx( RelationAll, slpie_buffer, &slpie_length ); slpie_buffer = libshared_memory_alloc_from_most_free_space_node( ms, slpie_length, sizeof(lfds710_pal_uint_t) ); brv = GetLogicalProcessorInformationEx( RelationAll, slpie_buffer, &slpie_length ); /* TRD : this API from MS is absolutely bloody appalling staggeringly and completely needlessly complex and inadequately documented I think I've found at least one design flaw and I'm inferring from the C structures a good deal of what's presumably going on where the docs just don't say (addendum - I've just found another huge fucking issue which has wasted two fucking days of my 
time the original non-Ex() API returns an actual C array, where the elements are structs, which contain a union, but in C the struct is sized to the max size of the union, so you can iterate over the array the NEW version, in the docs still says "array", but it actually returns a PACKED "array" (not an array, because you can't iterate over it) where the each element now has a Size member - you need to move your pointer by the number of bytes in Size - this is NOT in the docs, there is NO example code, and the ONLY WAY YOU CAN GUESS IS TO NOTICE THERE IS A SIZE MEMBER IN THE NEW STRUCT) (for example, just found a one-liner buried in the note on a particular structure returned for a particular node type; "If the PROCESSOR_RELATIONSHIP structure represents a processor core, the GroupCount member is always 1." this *implies* that a physical core is never split across groups this is a very important fact, if you're trying to work with this fucking API but it's not actually SPECIFICALLY STATED it's only implied - and so I do not feel confident in it and the appalling design and appallingly low quality of the docs in general hardly gives me confidence to just go ahead and believe in anything I find written - let alone something which is, offfhand, just implies, buried in some structure notes somewhere this is how it is all the way across this entire bloody API another example is that LPs are not actually returned by the API I'm *inferring* I can get the full list by taking the LP masks presented by the NUMA nodes it's *not* documented - i.e. it's not documented HOW TO GET THE LIST OF LOGICAL PROCESSORS IN THE SYSTEM fucking christ...!) 
I'm absolutely 100% certain my use of the API is not fully correct but I have no way to find out MS are bloody idiots - the "processor group" concept is absolutely and utterly crazy and it complicates *everything* by a power of 2 rather than simply iterating over the records provided, where just about any entity in the system (NUMA node, processor socket, etc) can have multiple records being returned, I have in fact to iterate over the whole record set, accumulating the multiple records, so I can FINALLY find out the full logical processor set for any given entity, so I can THEN, FINALLY, insert the entity into the toplogy tree i.e. for any given node, you have to fully iterate the list of records provided by the OS, to actually know you know all the LPs for that node there is no single-entity/single-record lookup or relationship MS -> you are bloody idiots; this is appalling, OBVIOUSLY appalling, and whoever designed it, and ESPECIALLY whoever APPROVED It, needs not only to be fired, but SHOT as ever with MS, something that takes a few minutes in Linux takes bloody hours with MS note due to aforementioned design flaw, it is not possible to collect cache information the problem is that if we have a cache which spans multiple processor groups, there will be mutiple records (or I presume there will be - I'm inferring), BUT, looking at the structures, it's not possible to know these are *the same cache* so, this mess; 1. RelationNumaNode - we need to loop over the full list of records to accumulate the full set of LPs for each NUMA node then we can add the record to tbe btree 2. RelationGroup - really REALLY don't care - with prejudice 3. RelationProcessorPackage - bizarrely, actually does the right thing (as far as it can be right in this sorry mess) and contains the full list of group IDs it belongs to, and the full list of LP IDs within each group so we can iterate once over the full set of records and insert this record type directly 4. 
RelationProcessorCore - same as RelationProcessorPackage 5. RelationCache - seems fubared; provide a single processor group and single mask of LPs, and so if a cache spans multiple processor groups, we'll get multiple records for it - problem is, we've no way of knowing *its the same cache* we get away with this with NUMA because each node has an ID the next best thing is going to be record the details of the cache from the structure (level, associativity, etc) and match based on that God I hate Microsoft */ // TRD : iterate once for system node libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); while( offset < slpie_length ) { slpie = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *) ( (char unsigned *) slpie_buffer + offset ); offset += slpie->Size; if( slpie->Relationship == RelationNumaNode ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) (slpie->NumaNode.GroupMask.Group), (lfds710_pal_uint_t) (slpie->NumaNode.GroupMask.Mask) ); } libbenchmark_misc_pal_helper_add_system_node_to_topology_tree( ts, tns ); // TRD : iterate again for everything else lfds710_btree_au_init_valid_on_current_logical_core( &nna_tree_state, numa_node_id_to_numa_node_id_compare_function, LFDS710_BTREE_AU_INSERT_RESULT_FAILURE_EXISTING_KEY, ts ); offset = 0; while( offset < slpie_length ) { slpie = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *) ( (char unsigned *) slpie_buffer + offset ); offset += slpie->Size; if( slpie->Relationship == RelationNumaNode ) { /* TRD : now for the first madness - accumulate the NUMA node records first, try to find this node in nna_tree_state if it's there, we use it - it not, we make it and add it, and use it once we've got a node to work with, we add the current list of LPs to that node */ rv = lfds710_btree_au_get_by_key( &nna_tree_state, NULL, (void *) &slpie->NumaNode.NodeNumber, &baue ); if( rv == 0 ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); baue = 
libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds710_btree_au_element), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES ); LFDS710_BTREE_AU_SET_KEY_IN_ELEMENT( *baue, (void *) &slpie->NumaNode.NodeNumber ); LFDS710_BTREE_AU_SET_VALUE_IN_ELEMENT( *baue, tns ); lfds710_btree_au_insert( &nna_tree_state, baue, NULL ); } // TRD : baue now points at the correct node tns = LFDS710_BTREE_AU_GET_VALUE_FROM_ELEMENT( *baue ); internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->NumaNode.GroupMask.Group, (lfds710_pal_uint_t) slpie->NumaNode.GroupMask.Mask ); // TRD : now all all LPs from this NUMA node to tree logical_processor_number = 0; bitmask = slpie->NumaNode.GroupMask.Mask; while( bitmask != 0 ) { if( bitmask & 0x1 ) libbenchmark_misc_pal_helper_add_logical_processor_node_to_topology_tree( ts, ms, logical_processor_number, RAISED, (slpie->NumaNode.GroupMask.Group) ); bitmask >>= 1; logical_processor_number++; } } if( slpie->Relationship == RelationGroup ) { // TRD : we don't care about this - actually, we do care, we really REALLY hate this } if( slpie->Relationship == RelationProcessorPackage ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( subloop = 0 ; subloop < slpie->Processor.GroupCount ; subloop++ ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Group, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Mask ); libbenchmark_misc_pal_helper_add_socket_node_to_topology_tree( ts, tns ); } if( slpie->Relationship == RelationProcessorCore ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( subloop = 0 ; subloop < slpie->Processor.GroupCount ; subloop++ ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Group, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Mask ); 
libbenchmark_misc_pal_helper_add_physical_processor_node_to_topology_tree( ts, tns ); } /* if( slpie->Relationship == RelationCache ) { if( slpie->Cache.Type == CacheUnified or slpie->Cache.Type == CacheInstruction or slpie->Cache.Type == CacheData ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->ProcessorMask ); libbenchmark_misc_pal_helper_add_cache_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) slpie->Cache.Level, processor_cache_type_to_libbenchmark_topology_node_cache_type[slpie->Cache.Type] ); } } */ } /* TRD : now finally insert the built-up NUMA and cache records we call cleanup() on the accumulator tree - it's safe to re-use the nodes as they're emitted to the cleanup function so we then throw them into the topology_state tree */ lfds710_btree_au_cleanup( &nna_tree_state, nna_cleanup ); return rv; } /****************************************************************************/ static int numa_node_id_to_numa_node_id_compare_function( void const *new_key, void const *existing_key ) { int cr = 0; DWORD numa_node_id_existing, numa_node_id_new; LFDS710_PAL_ASSERT( new_key != NULL ); LFDS710_PAL_ASSERT( existing_key != NULL ); numa_node_id_new = *(DWORD *) new_key; numa_node_id_existing = *(DWORD *) existing_key; if( numa_node_id_new < numa_node_id_existing ) cr = -1; if( numa_node_id_new > numa_node_id_existing ) cr = 1; return cr; } /****************************************************************************/ static void nna_cleanup( struct lfds710_btree_au_state *baus, struct lfds710_btree_au_element *baue ) { DWORD *numa_node_id; struct libbenchmark_topology_node_state *tns; struct libbenchmark_topology_state *ts; LFDS710_PAL_ASSERT( baus != NULL ); LFDS710_PAL_ASSERT( baue != NULL ); ts = LFDS710_BTREE_AU_GET_USER_STATE_FROM_STATE( *baus ); numa_node_id = LFDS710_BTREE_AU_GET_KEY_FROM_ELEMENT( *baue ); tns = 
LFDS710_BTREE_AU_GET_VALUE_FROM_ELEMENT( *baue ); libbenchmark_misc_pal_helper_add_numa_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) *numa_node_id ); return; } /****************************************************************************/ static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t windows_processor_group_number, lfds710_pal_uint_t bitmask ) { lfds710_pal_uint_t logical_processor_number = 0; LFDS710_PAL_ASSERT( ms != NULL ); LFDS710_PAL_ASSERT( tns != NULL ); // TRD : windows_processor_group_number can be any value in its range // TRD : bitmask can be any value in its range /* TRD : iterate over the bits in the bitmask each is a LP number add every LP to *tns */ while( bitmask != 0 ) { if( bitmask & 0x1 ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, logical_processor_number, RAISED, windows_processor_group_number ); bitmask >>= 1; logical_processor_number++; } return; } #endif /****************************************************************************/ #if( defined _WIN32 && defined KERNEL_MODE && NTDDI_VERSION >= NTDDI_WINXP && NTDDI_VERSION < NTDDI_WIN7 ) #ifdef LIBBENCHMARK_PAL_POPULATE_TOPOLOGY #error More than one porting abstraction layer matches current platform in "libbenchmark_porting_abstraction_layer_populate_topology.c". #endif #define LIBBENCHMARK_PAL_POPULATE_TOPOLOGY int libbenchmark_porting_abstraction_layer_populate_topology( struct libbenchmark_topology_state *ts, struct libshared_memory_state *ms ) { CCHAR loop; struct libbenchmark_topology_node_state *tns; LFDS710_PAL_ASSERT( ts != NULL ); LFDS710_PAL_ASSERT( ms != NULL ); /* TRD : prior to Windows 7 there is no way to enumerate CPU topology all that is available is a count of the number of logical cores, KeNumberProcessors this is in fact only available *up to Vista SP1*... 
Windows 7 provides full functionality to get topology, so it's not clear what should be done on Vista SP1... as such to get the topology actually right, the user has to hardcode it the best general solution seems to be to take the number of logical cores assumes they're all on one processor and there's one NUMA node */ // TRD : create the system node, populate and insert libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( loop = 0 ; loop < KeNumberProcessors ; loop++ ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, loop, LOWERED, 0 ); libbenchmark_misc_pal_helper_add_system_node_to_topology_tree( ts, tns ); // TRD : create the NUMA node, populate and insert libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( loop = 0 ; loop < KeNumberProcessors ; loop++ ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, loop, LOWERED, 0 ); libbenchmark_misc_pal_helper_add_numa_node_to_topology_tree( ts, tns, 0 ); // TRD : create the socket node, populate and insert libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( loop = 0 ; loop < KeNumberProcessors ; loop++ ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, loop, LOWERED, 0 ); libbenchmark_misc_pal_helper_add_socket_node_to_topology_tree( ts, tns ); // TRD : create the physical processor node, populate and insert libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( loop = 0 ; loop < KeNumberProcessors ; loop++ ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, loop, LOWERED, 0 ); libbenchmark_misc_pal_helper_add_physical_processor_node_to_topology_tree( ts, tns ); // TRD : create the logical processor nodes, populate and insert for( loop = 0 ; loop < KeNumberProcessors ; loop++ ) libbenchmark_misc_pal_helper_add_logical_processor_node_to_topology_tree( ts, ms, loop, LOWERED, 0 ); return 1; } #endif 
/****************************************************************************/ #if( defined _WIN32 && defined KERNEL_MODE && NTDDI_VERSION >= NTDDI_WIN7 ) #ifdef LIBBENCHMARK_PAL_POPULATE_TOPOLOGY #error More than one porting abstraction layer matches current platform in "libbenchmark_porting_abstraction_layer_populate_topology.c". #endif #define LIBBENCHMARK_PAL_POPULATE_TOPOLOGY static int numa_node_id_to_numa_node_id_compare_function( void const *new_key, void const *existing_key ); static void nna_cleanup( struct lfds710_btree_au_state *baus, struct lfds710_btree_au_element *baue ); static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t windows_processor_group_number, lfds710_pal_uint_t bitmask ); int libbenchmark_porting_abstraction_layer_populate_topology( struct libbenchmark_topology_state *ts, struct libshared_memory_state *ms ) { /* enum libbenchmark_topology_node_cache_type processor_cache_type_to_libbenchmark_topology_node_cache_type[3] = { LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_UNIFIED, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_INSTRUCTION, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_DATA }; */ int rv = 1; KAFFINITY bitmask; lfds710_pal_uint_t logical_processor_number; NTSTATUS brv; struct lfds710_btree_au_element *baue; struct lfds710_btree_au_state nna_tree_state; struct libbenchmark_topology_node_state *tns; SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *slpie, *slpie_buffer = NULL; ULONG offset = 0, slpie_length = 0, subloop; LFDS710_PAL_ASSERT( ts != NULL ); LFDS710_PAL_ASSERT( ms != NULL ); // TRD : obtain information from the OS brv = KeQueryLogicalProcessorRelationship( NULL, RelationAll, slpie_buffer, &slpie_length ); slpie_buffer = libshared_memory_alloc_from_most_free_space_node( ms, slpie_length, sizeof(lfds710_pal_uint_t) ); brv = KeQueryLogicalProcessorRelationship( NULL, RelationAll, slpie_buffer, &slpie_length ); /* TRD : this API from MS is 
absolutely bloody appalling staggeringly and completely needlessly complex and inadequately documented I think I've found at least one design flaw and I'm inferring from the C structures a good deal of what's presumably going on where the docs just don't say (addendum - I've just found another huge fucking issue which has wasted two fucking days of my time the original non-Ex() API returns an actual C array, where the elements are structs, which contain a union, but in C the struct is sized to the max size of the union, so you can iterate over the array the NEW version, in the docs still says "array", but it actually returns a PACKED "array" (not an array, because you can't iterate over it) where the each element now has a Size member - you need to move your pointer by the number of bytes in Size - this is NOT in the docs, there is NO example code, and the ONLY WAY YOU CAN GUESS IS TO NOTICE THERE IS A SIZE MEMBER IN THE NEW STRUCT) (for example, just found a one-liner buried in the note on a particular structure returned for a particular node type; "If the PROCESSOR_RELATIONSHIP structure represents a processor core, the GroupCount member is always 1." this *implies* that a physical core is never split across groups this is a very important fact, if you're trying to work with this fucking API but it's not actually SPECIFICALLY STATED it's only implied - and so I do not feel confident in it and the appalling design and appallingly low quality of the docs in general hardly gives me confidence to just go ahead and believe in anything I find written - let alone something which is, offfhand, just implies, buried in some structure notes somewhere this is how it is all the way across this entire bloody API another example is that LPs are not actually returned by the API I'm *inferring* I can get the full list by taking the LP masks presented by the NUMA nodes it's *not* documented - i.e. 
it's not documented HOW TO GET THE LIST OF LOGICAL PROCESSORS IN THE SYSTEM fucking christ...!) I'm absolutely 100% certain my use of the API is not fully correct but I have no way to find out MS are bloody idiots - the "processor group" concept is absolutely and utterly crazy and it complicates *everything* by a power of 2 rather than simply iterating over the records provided, where just about any entity in the system (NUMA node, processor socket, etc) can have multiple records being returned, I have in fact to iterate over the whole record set, accumulating the multiple records, so I can FINALLY find out the full logical processor set for any given entity, so I can THEN, FINALLY, insert the entity into the toplogy tree i.e. for any given node, you have to fully iterate the list of records provided by the OS, to actually know you know all the LPs for that node there is no single-entity/single-record lookup or relationship MS -> you are bloody idiots; this is appalling, OBVIOUSLY appalling, and whoever designed it, and ESPECIALLY whoever APPROVED It, needs not only to be fired, but SHOT as ever with MS, something that takes a few minutes in Linux takes bloody hours with MS note due to aforementioned design flaw, it is not possible to collect cache information the problem is that if we have a cache which spans multiple processor groups, there will be mutiple records (or I presume there will be - I'm inferring), BUT, looking at the structures, it's not possible to know these are *the same cache* so, this mess; 1. RelationNumaNode - we need to loop over the full list of records to accumulate the full set of LPs for each NUMA node then we can add the record to tbe btree 2. RelationGroup - really REALLY don't care - with prejudice 3. 
RelationProcessorPackage - bizarrely, actually does the right thing (as far as it can be right in this sorry mess) and contains the full list of group IDs it belongs to, and the full list of LP IDs within each group so we can iterate once over the full set of records and insert this record type directly 4. RelationProcessorCore - same as RelationProcessorPackage 5. RelationCache - seems fubared; provide a single processor group and single mask of LPs, and so if a cache spans multiple processor groups, we'll get multiple records for it - problem is, we've no way of knowing *its the same cache* we get away with this with NUMA because each node has an ID the next best thing is going to be record the details of the cache from the structure (level, associativity, etc) and match based on that God I hate Microsoft */ // TRD : iterate once for system node libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); while( offset < slpie_length ) { slpie = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *) ( (char unsigned *) slpie_buffer + offset ); offset += slpie->Size; if( slpie->Relationship == RelationNumaNode ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) (slpie->NumaNode.GroupMask.Group), (lfds710_pal_uint_t) (slpie->NumaNode.GroupMask.Mask) ); } libbenchmark_misc_pal_helper_add_system_node_to_topology_tree( ts, tns ); // TRD : iterate again for everything else lfds710_btree_au_init_valid_on_current_logical_core( &nna_tree_state, numa_node_id_to_numa_node_id_compare_function, LFDS710_BTREE_AU_INSERT_RESULT_FAILURE_EXISTING_KEY, ts ); offset = 0; while( offset < slpie_length ) { slpie = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *) ( (char unsigned *) slpie_buffer + offset ); offset += slpie->Size; if( slpie->Relationship == RelationNumaNode ) { /* TRD : now for the first madness - accumulate the NUMA node records first, try to find this node in nna_tree_state if it's there, we use it - it not, we make it and add it, and use it once 
we've got a node to work with, we add the current list of LPs to that node */ rv = lfds710_btree_au_get_by_key( &nna_tree_state, NULL, (void *) &slpie->NumaNode.NodeNumber, &baue ); if( rv == 0 ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); baue = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds710_btree_au_element), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES ); LFDS710_BTREE_AU_SET_KEY_IN_ELEMENT( *baue, (void *) &slpie->NumaNode.NodeNumber ); LFDS710_BTREE_AU_SET_VALUE_IN_ELEMENT( *baue, tns ); lfds710_btree_au_insert( &nna_tree_state, baue, NULL ); } // TRD : baue now points at the correct node tns = LFDS710_BTREE_AU_GET_VALUE_FROM_ELEMENT( *baue ); internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->NumaNode.GroupMask.Group, (lfds710_pal_uint_t) slpie->NumaNode.GroupMask.Mask ); // TRD : now all all LPs from this NUMA node to tree logical_processor_number = 0; bitmask = slpie->NumaNode.GroupMask.Mask; while( bitmask != 0 ) { if( bitmask & 0x1 ) libbenchmark_misc_pal_helper_add_logical_processor_node_to_topology_tree( ts, ms, logical_processor_number, RAISED, (slpie->NumaNode.GroupMask.Group) ); bitmask >>= 1; logical_processor_number++; } } if( slpie->Relationship == RelationGroup ) { // TRD : we don't care about this - actually, we do care, we really REALLY hate this } if( slpie->Relationship == RelationProcessorPackage ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( subloop = 0 ; subloop < slpie->Processor.GroupCount ; subloop++ ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Group, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Mask ); libbenchmark_misc_pal_helper_add_socket_node_to_topology_tree( ts, tns ); } if( slpie->Relationship == RelationProcessorCore ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); for( subloop = 0 ; subloop < slpie->Processor.GroupCount 
; subloop++ ) internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Group, (lfds710_pal_uint_t) slpie->Processor.GroupMask[subloop].Mask ); libbenchmark_misc_pal_helper_add_physical_processor_node_to_topology_tree( ts, tns ); } /* if( slpie->Relationship == RelationCache ) { if( slpie->Cache.Type == CacheUnified or slpie->Cache.Type == CacheInstruction or slpie->Cache.Type == CacheData ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_bitmask( ms, tns, (lfds710_pal_uint_t) slpie->ProcessorMask ); libbenchmark_misc_pal_helper_add_cache_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) slpie->Cache.Level, processor_cache_type_to_libbenchmark_topology_node_cache_type[slpie->Cache.Type] ); } } */ } /* TRD : now finally insert the built-up NUMA and cache records we call cleanup() on the accumulator tree - it's safe to re-use the nodes as they're emitted to the cleanup function so we then throw them into the topology_state tree */ lfds710_btree_au_cleanup( &nna_tree_state, nna_cleanup ); return rv; } /****************************************************************************/ static int numa_node_id_to_numa_node_id_compare_function( void const *new_key, void const *existing_key ) { int cr = 0; ULONG numa_node_id_existing, numa_node_id_new; LFDS710_PAL_ASSERT( new_key != NULL ); LFDS710_PAL_ASSERT( existing_key != NULL ); numa_node_id_new = *(ULONG *) new_key; numa_node_id_existing = *(ULONG *) existing_key; if( numa_node_id_new < numa_node_id_existing ) cr = -1; if( numa_node_id_new > numa_node_id_existing ) cr = 1; return cr; } /****************************************************************************/ static void nna_cleanup( struct lfds710_btree_au_state *baus, struct lfds710_btree_au_element *baue ) { ULONG *numa_node_id; struct libbenchmark_topology_node_state *tns; struct libbenchmark_topology_state *ts; LFDS710_PAL_ASSERT( baus 
!= NULL ); LFDS710_PAL_ASSERT( baue != NULL ); ts = LFDS710_BTREE_AU_GET_USER_STATE_FROM_STATE( *baus ); numa_node_id = LFDS710_BTREE_AU_GET_KEY_FROM_ELEMENT( *baue ); tns = LFDS710_BTREE_AU_GET_VALUE_FROM_ELEMENT( *baue ); libbenchmark_misc_pal_helper_add_numa_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) *numa_node_id ); return; } /****************************************************************************/ static void internal_populate_logical_processor_array_from_bitmask( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, lfds710_pal_uint_t windows_processor_group_number, lfds710_pal_uint_t bitmask ) { lfds710_pal_uint_t logical_processor_number = 0; LFDS710_PAL_ASSERT( ms != NULL ); LFDS710_PAL_ASSERT( tns != NULL ); // TRD : windows_processor_group_number can be any value in its range // TRD : bitmask can be any value in its range /* TRD : iterate over the bits in the bitmask each is a LP number add every LP to *tns */ while( bitmask != 0 ) { if( bitmask & 0x1 ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, logical_processor_number, RAISED, windows_processor_group_number ); bitmask >>= 1; logical_processor_number++; } return; } #endif /****************************************************************************/ #if( defined __linux__ && !defined KERNEL_MODE && defined __STDC__ && __STDC_HOSTED__ == 1 ) #ifdef LIBBENCHMARK_PAL_POPULATE_TOPOLOGY #error More than one porting abstraction layer matches current platform in "libbenchmark_porting_abstraction_layer_populate_topology.c". #endif #define LIBBENCHMARK_PAL_POPULATE_TOPOLOGY static void internal_populate_logical_processor_array_from_path_to_csv_hex( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, char *path_to_csv_hex ); static int internal_verify_paths( lfds710_pal_uint_t number_paths, ... 
); static void internal_read_string_from_path( char *path, char *string ); /****************************************************************************/ int libbenchmark_porting_abstraction_layer_populate_topology( struct libbenchmark_topology_state *ts, struct libshared_memory_state *ms ) { char numa_node_path[128], thread_siblings_path[128], core_siblings_path[128], cache_level_path[128], cache_type_path[128], shared_cpu_map_path[128], cache_level_string[16], cache_type_string[16]; int rv = 1, cache_type_string_to_type_enum_lookup[NUMBER_UPPERCASE_LETTERS_IN_LATIN_ALPHABET] = { -1, -1, -1, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_DATA, -1, -1, -1, -1, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_INSTRUCTION, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, LIBBENCHMARK_TOPOLOGY_NODE_CACHE_TYPE_UNIFIED, -1, -1, -1, -1 }; int long long unsigned level_temp; lfds710_pal_uint_t numa_node = 0, cpu_number = 0, index_number, level, type; struct libbenchmark_topology_iterate_state tis; struct libbenchmark_topology_node_state *tns, *tns_lp; LFDS710_PAL_ASSERT( ts != NULL ); LFDS710_PAL_ASSERT( ms != NULL ); sprintf( numa_node_path, "/sys/devices/system/node/node%llu/cpumap", (int long long unsigned) numa_node ); while( internal_verify_paths(1, numa_node_path) ) { libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_path_to_csv_hex( ms, tns, numa_node_path ); libbenchmark_misc_pal_helper_add_numa_node_to_topology_tree( ts, tns, (lfds710_pal_uint_t) numa_node ); sprintf( numa_node_path, "/sys/devices/system/node/node%llu/cpumap", (int long long unsigned) (++numa_node) ); } sprintf( thread_siblings_path, "/sys/devices/system/cpu/cpu%llu/topology/thread_siblings", (int long long unsigned) cpu_number ); sprintf( core_siblings_path, "/sys/devices/system/cpu/cpu%llu/topology/core_siblings", (int long long unsigned) cpu_number ); while( internal_verify_paths(2, core_siblings_path, thread_siblings_path) ) { 
libbenchmark_misc_pal_helper_add_logical_processor_node_to_topology_tree( ts, ms, cpu_number, LOWERED, 0 ); libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_path_to_csv_hex( ms, tns, thread_siblings_path ); libbenchmark_misc_pal_helper_add_physical_processor_node_to_topology_tree( ts, tns ); libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_path_to_csv_hex( ms, tns, core_siblings_path ); libbenchmark_misc_pal_helper_add_socket_node_to_topology_tree( ts, tns ); index_number = 0; sprintf( cache_level_path, "/sys/devices/system/cpu/cpu%llu/cache/index%llu/level", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); sprintf( cache_type_path, "/sys/devices/system/cpu/cpu%llu/cache/index%llu/type", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); sprintf( shared_cpu_map_path, "/sys/devices/system/cpu/cpu%llu/cache/index%llu/shared_cpu_map", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); while( internal_verify_paths(3, cache_level_path, cache_type_path, shared_cpu_map_path) ) { internal_read_string_from_path( cache_level_path, cache_level_string ); sscanf( cache_level_string, "%llx", &level_temp ); level = (lfds710_pal_uint_t) level_temp; internal_read_string_from_path( cache_type_path, cache_type_string ); type = cache_type_string_to_type_enum_lookup[(int)(*cache_type_string - 'A')]; libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); internal_populate_logical_processor_array_from_path_to_csv_hex( ms, tns, shared_cpu_map_path ); libbenchmark_misc_pal_helper_add_cache_node_to_topology_tree( ts, tns, level, type ); index_number++; sprintf( cache_level_path, "/sys/devices/system/cpu/cpu%llu/cache/index%llu/level", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); sprintf( cache_type_path, 
"/sys/devices/system/cpu/cpu%llu/cache/index%llu/type", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); sprintf( shared_cpu_map_path, "/sys/devices/system/cpu/cpu%llu/cache/index%llu/shared_cpu_map", (int long long unsigned) cpu_number, (int long long unsigned) index_number ); } cpu_number++; sprintf( thread_siblings_path, "/sys/devices/system/cpu/cpu%llu/topology/thread_siblings", (int long long unsigned) cpu_number ); sprintf( core_siblings_path, "/sys/devices/system/cpu/cpu%llu/topology/core_siblings", (int long long unsigned) cpu_number ); } // TRD : now make and populate the notional system node libbenchmark_misc_pal_helper_new_topology_node( &tns, ms ); libbenchmark_topology_iterate_init( &tis, LIBBENCHMARK_TOPOLOGY_NODE_TYPE_LOGICAL_PROCESSOR ); while( libbenchmark_topology_iterate(ts, &tis, &tns_lp) ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, LIBBENCHMARK_TOPOLOGY_NODE_GET_LOGICAL_PROCESSOR_NUMBER(*tns_lp), LOWERED, 0 ); libbenchmark_misc_pal_helper_add_system_node_to_topology_tree( ts, tns ); return rv; } /****************************************************************************/ void libbenchmark_porting_abstraction_layer_topology_node_cleanup( struct libbenchmark_topology_node_state *tns ) { LFDS710_PAL_ASSERT( tns != NULL ); lfds710_list_aso_cleanup( &tns->logical_processor_children, NULL ); return; } /****************************************************************************/ static void internal_populate_logical_processor_array_from_path_to_csv_hex( struct libshared_memory_state *ms, struct libbenchmark_topology_node_state *tns, char *path_to_csv_hex ) { char diskbuffer[BUFSIZ], string[1024]; FILE *diskfile; int loop; int unsigned logical_processor_foursome, logical_processor_number = 0, subloop; lfds710_pal_uint_t length = 0; LFDS710_PAL_ASSERT( ms != NULL ); LFDS710_PAL_ASSERT( tns != NULL ); LFDS710_PAL_ASSERT( path_to_csv_hex != NULL ); /* TRD : we're passed a format string and 
args, which comprise the path form up the string, open the file, read the string, parse the string the string consists of 32-bit bitmasks in hex separated by commas no leading or trailing commas */ diskfile = fopen( path_to_csv_hex, "r" ); setbuf( diskfile, diskbuffer ); fgets( string, 1024, diskfile ); fclose( diskfile ); while( string[length++] != '\0' ); length -= 2; for( loop = ((int)length)-1 ; loop > -1 ; loop-- ) { if( string[loop] == ',' ) continue; sscanf( &string[loop], "%1x", &logical_processor_foursome ); for( subloop = 0 ; subloop < 4 ; subloop++ ) if( ( (logical_processor_foursome >> subloop) & 0x1 ) == 0x1 ) libbenchmark_misc_pal_helper_add_logical_processor_to_topology_node( tns, ms, logical_processor_number + subloop, LOWERED, 0 ); logical_processor_number += 4; } return; } /****************************************************************************/ static int internal_verify_paths( lfds710_pal_uint_t number_paths, ... ) { FILE *diskfile; int rv = 1; lfds710_pal_uint_t count = 0; va_list va; // TRD : number_paths can be any value in its range va_start( va, number_paths ); while( rv == 1 and count++ < number_paths ) if( NULL == (diskfile = fopen(va_arg(va,char *), "r")) ) rv = 0; else fclose( diskfile ); va_end( va ); return rv; } /****************************************************************************/ static void internal_read_string_from_path( char *path, char *string ) { char diskbuffer[BUFSIZ]; FILE *diskfile; LFDS710_PAL_ASSERT( path != NULL ); LFDS710_PAL_ASSERT( string != NULL ); diskfile = fopen( path, "r" ); setbuf( diskfile, diskbuffer ); fscanf( diskfile, "%s", string ); fclose( diskfile ); return; } #endif /****************************************************************************/ #if( !defined LIBBENCHMARK_PAL_POPULATE_TOPOLOGY ) #error No matching porting abstraction layer in "libbenchmark_porting_abstraction_layer_populate_topology.c". #endif