2 #include "libbenchmark_benchmarks_freelist_internal.h"
// Per-thread benchmark state. The init function seeds its psts member (a PRNG state),
// the benchmark thread passes &psts to every pop/push, and the thread stores its final
// tally in the operation_count member, which cleanup then reports.
5 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state
10 struct lfds710_prng_st_state
// Benchmark-wide state. The init function stores it as the threadset's users_threadset_state,
// and cleanup reaches the freelist through its fs member.
14 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_overall_benchmark_state
16 struct lfds710_freelist_state
24 /****************************************************************************/
// Sets up the lock-free freelist push1/pop1 benchmark: seeds a master PRNG, allocates the
// overall state, initialises the threadset with the benchmark thread function, then (per
// numa_mode case) allocates the freelist state and elimination array, pushes the initial
// elements and gives every thread a per-thread state with its own PRNG seed.
//   ts                    - topology state, queried for NUMA node information
//   logical_processor_set - logical processors taking part; their count sizes the freelist
//   ms                    - shared memory state every allocation is drawn from
//   numa_mode             - selects the SMP, NUMA or NUMA-but-not-used case below
//   tsets                 - threadset state; receives the thread function and the user states
25 void libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_init( struct libbenchmark_topology_state *ts,
26 struct lfds710_list_aso_state *logical_processor_set,
27 struct libshared_memory_state *ms,
28 enum libbenchmark_topology_numa_mode numa_mode,
29 struct libbenchmark_threadset_state *tsets )
32 finished_flag = LOWERED;
35 ea_size_in_freelist_elements,
39 number_freelist_elements,
40 number_logical_processors,
41 number_logical_processors_in_numa_node,
43 number_freelist_element_pointers_per_atomic_isolation = LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES / sizeof(struct lfds710_freelist_element *),
44 largest_number_logical_processors_in_numa_node = 0,
46 smallest_power_of_two_larger_than_or_equal_to_number_logical_processors = 2,
47 temp_number_logical_processors;
49 struct lfds710_freelist_element * volatile
50 (*ea)[LFDS710_FREELIST_ELIMINATION_ARRAY_ELEMENT_SIZE_IN_FREELIST_ELEMENTS];
52 struct lfds710_list_asu_element
56 struct lfds710_prng_st_state
59 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_overall_benchmark_state
62 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state
65 struct lfds710_freelist_element
69 struct lfds710_freelist_state
72 struct libbenchmark_threadset_per_numa_state
76 struct libbenchmark_threadset_per_thread_state
79 struct libbenchmark_topology_node_state
82 LFDS710_PAL_ASSERT( ts != NULL );
83 LFDS710_PAL_ASSERT( logical_processor_set != NULL );
84 LFDS710_PAL_ASSERT( ms != NULL );
85 // TRD : numa_mode can be any value in its range
86 LFDS710_PAL_ASSERT( tsets != NULL );
// master PRNG: passed to every freelist push below and used to derive one seed per thread
88 lfds710_prng_st_init( &psts, LFDS710_PRNG_SEED );
90 obs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_overall_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
92 libbenchmark_threadset_init( tsets, ts, logical_processor_set, ms, libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_thread, NULL );
96 case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_SMP:
97 lfds710_list_aso_query( logical_processor_set, LFDS710_LIST_ASO_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void *) &number_logical_processors );
// the size starts at 2 and doubles once per shift needed to bring (count >> 2) down to zero
// NOTE(review): for counts such as 3 or 5..7 this gives a value below the count, despite
// the variable name - confirm whether that is intended (same computation in all three cases)
98 temp_number_logical_processors = number_logical_processors >> 2;
99 while( temp_number_logical_processors != 0 )
101 temp_number_logical_processors >>= 1;
102 smallest_power_of_two_larger_than_or_equal_to_number_logical_processors <<= 1;
104 fs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds710_freelist_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
105 ea = libshared_memory_alloc_from_unknown_node( ms, sizeof(struct lfds710_freelist_element *) * LFDS710_FREELIST_ELIMINATION_ARRAY_ELEMENT_SIZE_IN_FREELIST_ELEMENTS * smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
106 lfds710_freelist_init_valid_on_current_logical_core( fs, ea, smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, NULL );
108 // TRD : fill the elimination array and have one element per thread in the freelist proper
109 number_freelist_elements = (smallest_power_of_two_larger_than_or_equal_to_number_logical_processors * number_freelist_element_pointers_per_atomic_isolation) + number_logical_processors;
110 fe = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds710_freelist_element) * number_freelist_elements, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
111 for( loop = 0 ; loop < number_freelist_elements ; loop++ )
112 lfds710_freelist_push( fs, &fe[loop], &psts );
// give every thread its own per-thread state, with a PRNG seeded from the master PRNG
113 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
115 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
116 ptbs = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
117 LFDS710_PRNG_ST_GENERATE( psts, random_value );
118 LFDS710_PRNG_ST_MIXING_FUNCTION( random_value );
119 lfds710_prng_st_init( &ptbs->psts, random_value );
120 pts->users_per_thread_state = ptbs;
124 case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_NUMA:
125 /* TRD : init the freelist from the NUMA node with most processors from the current set
126 or, if equal threads, with lowest NUMA
127 iterate over the NUMA node list
128 for each NUMA node, allocate one freelist element per thread on that node
129 and push those elements onto the freelist
131 then loop over the threads, and give each one the freelist state as its user state
// for each NUMA node, count the threads of this set whose logical processor is on that node
134 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
136 pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
139 number_logical_processors_in_numa_node = 0;
141 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
143 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );
145 libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );
147 if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
148 number_logical_processors_in_numa_node++;
151 if( number_logical_processors_in_numa_node > largest_number_logical_processors_in_numa_node )
155 lfds710_list_aso_query( logical_processor_set, LFDS710_LIST_ASO_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void *) &number_logical_processors );
156 temp_number_logical_processors = number_logical_processors >> 2;
157 while( temp_number_logical_processors != 0 )
159 temp_number_logical_processors >>= 1;
160 smallest_power_of_two_larger_than_or_equal_to_number_logical_processors <<= 1;
// freelist state and elimination array both come from the node referred to by largest_pns
163 fs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds710_freelist_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
164 ea = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds710_freelist_element *) * LFDS710_FREELIST_ELIMINATION_ARRAY_ELEMENT_SIZE_IN_FREELIST_ELEMENTS * smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
165 lfds710_freelist_init_valid_on_current_logical_core( fs, ea, smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, NULL );
167 /* TRD : now figure out how many elements are needed from each NUMA node
169 then push them interleaved, round-robin, to the freelist
172 libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMBER_OF_NODE_TYPE, (void *) (lfds710_pal_uint_t) LIBBENCHMARK_TOPOLOGY_NODE_TYPE_NUMA, (void *) &number_numa_nodes );
174 fe_array_pointers = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(struct lfds710_freelist_element *) * number_numa_nodes, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
175 fe_array_count = libshared_memory_alloc_from_most_free_space_node( ms, sizeof(lfds710_pal_uint_t) * number_numa_nodes, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
176 for( loop = 0 ; loop < number_numa_nodes ; loop++ )
177 fe_array_count[loop] = 0;
179 // TRD : now query the freelist for the EL size
180 lfds710_freelist_query( fs, LFDS710_FREELIST_QUERY_GET_ELIMINATION_ARRAY_EXTRA_ELEMENTS_IN_FREELIST_ELEMENTS, NULL, (void *) &ea_size_in_freelist_elements );
182 // TRD : we need to divide that number of elements over the NUMA nodes in proportion to their number of LPs...
186 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
188 pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
190 /* TRD : for each NUMA node, figure out how many LPs in the current set are in that NUMA node
191 and allocate then the correct number of elements from this NUMA node (1 per LP, plus for the node the correct proportion of the EA layer)
195 number_logical_processors_in_numa_node = 0;
197 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
199 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );
201 libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );
203 if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
204 number_logical_processors_in_numa_node++;
207 // TRD : blind +1 to deal with rounding, it will skew results but only slightly
208 fe_array_count[index] = number_logical_processors_in_numa_node + (ea_size_in_freelist_elements * number_logical_processors_in_numa_node) / number_logical_processors + 1;
209 fe_array_pointers[index] = libshared_memory_alloc_from_specific_node( ms, pns->numa_node_id, sizeof(struct lfds710_freelist_element) * fe_array_count[index], LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
// push one element per node per pass until every per-node count has reached zero
213 while( finished_flag == LOWERED )
215 for( loop = 0 ; loop < index ; loop++ )
216 if( fe_array_count[loop] > 0 )
// pre-decrement: indices run from count-1 down to 0, all inside the per-node allocation
// (a post-decrement would start at index count, one past the end, and never push element 0)
217 lfds710_freelist_push( fs, &fe_array_pointers[loop][ --fe_array_count[loop] ], &psts );
219 finished_flag = RAISED;
221 for( loop = 0 ; loop < index ; loop++ )
222 if( fe_array_count[loop] > 0 )
223 finished_flag = LOWERED;
// per-thread states are allocated from the node referred to by largest_pns
228 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
230 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
231 ptbs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
232 LFDS710_PRNG_ST_GENERATE( psts, random_value );
233 LFDS710_PRNG_ST_MIXING_FUNCTION( random_value );
234 lfds710_prng_st_init( &ptbs->psts, random_value );
235 pts->users_per_thread_state = ptbs;
239 case LIBBENCHMARK_TOPOLOGY_NUMA_MODE_NUMA_BUT_NOT_USED:
240 /* TRD : freelist state in the NUMA node with most threads from the current set
241 or, if equal threads, with lowest NUMA
242 all elements alloced from that node as well
244 SO much easier to figure out allocs than with NUMA OMG
245 all of this code needs rewriting
246 and the NUMA-but-not-used stuff is interesting but I don't think it carries its own weight
249 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_numa_states,lasue) )
251 pns = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
254 number_logical_processors_in_numa_node = 0;
256 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue_lp) )
258 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue_lp );
260 libbenchmark_topology_query( ts, LIBBENCHMARK_TOPOLOGY_QUERY_GET_NUMA_NODE_FOR_LOGICAL_PROCESSOR, pts->tns_lp, &numa_node_for_lp );
262 if( LIBBENCHMARK_TOPOLOGY_NODE_GET_NUMA_ID(*numa_node_for_lp) == pns->numa_node_id )
263 number_logical_processors_in_numa_node++;
266 if( number_logical_processors_in_numa_node > largest_number_logical_processors_in_numa_node )
270 lfds710_list_aso_query( logical_processor_set, LFDS710_LIST_ASO_QUERY_GET_POTENTIALLY_INACCURATE_COUNT, NULL, (void *) &number_logical_processors );
271 temp_number_logical_processors = number_logical_processors >> 2;
272 while( temp_number_logical_processors != 0 )
274 temp_number_logical_processors >>= 1;
275 smallest_power_of_two_larger_than_or_equal_to_number_logical_processors <<= 1;
278 fs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds710_freelist_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
279 ea = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds710_freelist_element *) * LFDS710_FREELIST_ELIMINATION_ARRAY_ELEMENT_SIZE_IN_FREELIST_ELEMENTS * smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
280 lfds710_freelist_init_valid_on_current_logical_core( fs, ea, smallest_power_of_two_larger_than_or_equal_to_number_logical_processors, NULL );
282 // TRD : fill the elimination array and have one element per thread in the freelist proper
283 number_freelist_elements = (smallest_power_of_two_larger_than_or_equal_to_number_logical_processors * number_freelist_element_pointers_per_atomic_isolation) + number_logical_processors;
284 fe = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct lfds710_freelist_element) * number_freelist_elements, LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
285 for( loop = 0 ; loop < number_freelist_elements ; loop++ )
286 lfds710_freelist_push( fs, &fe[loop], &psts );
290 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
292 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
293 ptbs = libshared_memory_alloc_from_specific_node( ms, largest_pns->numa_node_id, sizeof(struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state), LFDS710_PAL_ATOMIC_ISOLATION_IN_BYTES );
294 LFDS710_PRNG_ST_GENERATE( psts, random_value );
295 LFDS710_PRNG_ST_MIXING_FUNCTION( random_value );
296 lfds710_prng_st_init( &ptbs->psts, random_value );
297 pts->users_per_thread_state = ptbs;
// publish the overall state so that cleanup can find the freelist again
303 tsets->users_threadset_state = obs;
312 /****************************************************************************/
// Benchmark thread body: once libbenchmark_threadset_thread_ready_and_wait returns, it
// repeatedly pops one element from the freelist and pushes it straight back until the
// end time (start time plus libbenchmark_globals_benchmark_duration_in_seconds worth of
// time units) has passed, then stores the operation count in the per-thread state.
//   libbenchmark_threadset_per_thread_state - this thread's struct libbenchmark_threadset_per_thread_state,
//                                             passed as void pointer; must not be NULL
// Returns LIBSHARED_PAL_THREAD_RETURN_CAST(RETURN_SUCCESS).
313 libshared_pal_thread_return_t LIBSHARED_PAL_THREAD_CALLING_CONVENTION libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_thread( void *libbenchmark_threadset_per_thread_state )
315 int long long unsigned
318 time_units_per_second;
324 struct lfds710_freelist_state
327 struct lfds710_freelist_element
330 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_overall_benchmark_state
333 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state
336 struct libbenchmark_threadset_per_thread_state
339 LFDS710_MISC_BARRIER_LOAD;
341 LFDS710_PAL_ASSERT( libbenchmark_threadset_per_thread_state != NULL );
343 pts = (struct libbenchmark_threadset_per_thread_state *) libbenchmark_threadset_per_thread_state;
345 ptbs = LIBBENCHMARK_THREADSET_PER_THREAD_STATE_GET_USERS_PER_THREAD_STATE( *pts );
346 obs = LIBBENCHMARK_THREADSET_PER_THREAD_STATE_GET_USERS_OVERALL_STATE( *pts );
349 LIBBENCHMARK_PAL_TIME_UNITS_PER_SECOND( &time_units_per_second );
351 libbenchmark_threadset_thread_ready_and_wait( pts );
// the argument is the address of current_time (restored from a mis-encoded "&curren" sequence)
353 LIBBENCHMARK_PAL_GET_HIGHRES_TIME( &current_time );
355 end_time = current_time + time_units_per_second * libbenchmark_globals_benchmark_duration_in_seconds;
357 while( current_time < end_time )
// each iteration is one pop followed by one push of the same element, using this thread's PRNG state
359 lfds710_freelist_pop( fs, &fe, &ptbs->psts );
360 lfds710_freelist_push( fs, fe, &ptbs->psts );
// the clock is re-read under the time_loop == TIME_LOOP_COUNT test rather than after every pop/push pair
363 if( time_loop++ == TIME_LOOP_COUNT )
366 LIBBENCHMARK_PAL_GET_HIGHRES_TIME( &current_time );
// hand the tally to the per-thread state, where cleanup reads it
370 ptbs->operation_count = operation_count;
372 LFDS710_MISC_BARRIER_STORE;
374 lfds710_misc_force_store();
376 return LIBSHARED_PAL_THREAD_RETURN_CAST(RETURN_SUCCESS);
383 /****************************************************************************/
// Reports every thread's operation count to the results state, then cleans up the freelist.
//   logical_processor_set - passed through to each result record; must not be NULL
//   numa_mode             - any value in its range is accepted
//   rs                    - results state that receives one result per thread; must not be NULL
//   tsets                 - threadset whose per-thread states and overall state are read; must not be NULL
384 void libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_cleanup( struct lfds710_list_aso_state *logical_processor_set,
385 enum libbenchmark_topology_numa_mode numa_mode,
386 struct libbenchmark_results_state *rs,
387 struct libbenchmark_threadset_state *tsets )
389 struct lfds710_list_asu_element
392 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_overall_benchmark_state
395 struct libbenchmark_benchmark_freelist_liblfds710_lockfree_push1_pop1_per_thread_benchmark_state
398 struct libbenchmark_threadset_per_thread_state
401 LFDS710_PAL_ASSERT( logical_processor_set != NULL );
402 // TRD : numa_mode can be any value in its range
403 LFDS710_PAL_ASSERT( rs != NULL );
404 LFDS710_PAL_ASSERT( tsets != NULL );
// one result per thread, tagged with that thread's logical processor number and Windows group number
406 while( LFDS710_LIST_ASU_GET_START_AND_THEN_NEXT(tsets->list_of_per_thread_states,lasue) )
408 pts = LFDS710_LIST_ASU_GET_VALUE_FROM_ELEMENT( *lasue );
410 ptbs = LIBBENCHMARK_THREADSET_PER_THREAD_STATE_GET_USERS_PER_THREAD_STATE( *pts );
412 libbenchmark_results_put_result( rs,
413 LIBBENCHMARK_DATASTRUCTURE_ID_FREELIST,
414 LIBBENCHMARK_BENCHMARK_ID_PUSH1_THEN_POP1,
415 LIBBENCHMARK_LOCK_ID_LIBLFDS710_LOCKFREE,
417 logical_processor_set,
418 LIBBENCHMARK_TOPOLOGY_NODE_GET_LOGICAL_PROCESSOR_NUMBER( *pts->tns_lp ),
419 LIBBENCHMARK_TOPOLOGY_NODE_GET_WINDOWS_GROUP_NUMBER( *pts->tns_lp ),
420 ptbs->operation_count );
// the overall state stored by the init function holds the freelist to clean up
423 obs = tsets->users_threadset_state;
425 lfds710_freelist_cleanup( obs->fs, NULL );