Skip to content

Commit 43d5c4d

Browse files
vadikp-intel authored and Jonathan Peyton committed
[OpenMP] add 4 custom APIs supporting MSVC OMP codegen
This check-in adds 4 APIs to support MSVC, specifically: * 3 APIs (__kmpc_sections_init, __kmpc_next_section, __kmpc_end_sections) to support the dynamic scheduling of OMP sections. * 1 API (__kmpc_copyprivate_light, a light-weight version of __kmpc_copyprivate) to support the OMP single copyprivate clause. Differential Revision: https://reviews.llvm.org/D128403
1 parent b97013f commit 43d5c4d

File tree

4 files changed

+284
-1
lines changed

4 files changed

+284
-1
lines changed

openmp/runtime/src/dllexports

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,13 @@ kmpc_set_disp_num_buffers 267
397397
__kmpc_end_scope 287
398398
%endif
399399

400+
%ifndef stub
401+
__kmpc_copyprivate_light 288
402+
__kmpc_sections_init 289
403+
__kmpc_next_section 290
404+
__kmpc_end_sections 291
405+
%endif
406+
400407
# User API entry points that have both lower- and upper- case versions for Fortran.
401408
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
402409
# User API entry points are entry points that start with 'kmp_' or 'omp_'.

openmp/runtime/src/kmp.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3890,6 +3890,11 @@ KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *,
38903890
KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
38913891
KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
38923892

3893+
KMP_EXPORT kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 global_tid);
3894+
KMP_EXPORT kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 global_tid,
3895+
kmp_int32 numberOfSections);
3896+
KMP_EXPORT void __kmpc_end_sections(ident_t *loc, kmp_int32 global_tid);
3897+
38933898
KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid,
38943899
kmp_int32 schedtype, kmp_int32 *plastiter,
38953900
kmp_int *plower, kmp_int *pupper,
@@ -3903,6 +3908,9 @@ KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
39033908
void (*cpy_func)(void *, void *),
39043909
kmp_int32 didit);
39053910

3911+
KMP_EXPORT void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid,
3912+
void *cpy_data);
3913+
39063914
extern void KMPC_SET_NUM_THREADS(int arg);
39073915
extern void KMPC_SET_DYNAMIC(int flag);
39083916
extern void KMPC_SET_NESTED(int flag);

openmp/runtime/src/kmp_csupport.cpp

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2224,6 +2224,61 @@ void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
22242224
}
22252225
}
22262226

2227+
/* --------------------------------------------------------------------------*/
/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_data pointer to the data to be saved/copied or 0
@return the saved pointer to the data

__kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate:
__kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so
coming from single), and returns that pointer in all calls (for single thread
it's not needed). This version doesn't do any actual data copying. Data copying
has to be done somewhere else, e.g. inline in the generated code. Due to this,
this function doesn't have any barrier at the end of the function, like
__kmpc_copyprivate does, so generated code needs barrier after copying of all
data was done.
*/
void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));

  KMP_MB(); /* Flush all pending memory write invalidates. */

  // Per-team slot shared by all threads; the thread that executed the
  // single region publishes its pointer here, everyone else reads it.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // Only the thread coming from the single region carries a non-NULL
  // cpy_data; it stores the pointer for the rest of the team.
  if (cpy_data)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // ToDo: Optimize the following barrier. It guarantees every thread sees
  // the pointer stored above before reading it back below.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  return *data_ptr;
}
2281+
22272282
/* -------------------------------------------------------------------------- */
22282283

22292284
#define INIT_LOCK __kmp_init_user_lock_with_checks
@@ -4348,7 +4403,7 @@ void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
43484403
void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
43494404
omp_allocator_handle_t free_allocator) {
43504405
return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4351-
free_allocator);
4406+
free_allocator);
43524407
}
43534408

43544409
void omp_free(void *ptr, omp_allocator_handle_t allocator) {

openmp/runtime/src/kmp_dispatch.cpp

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2285,6 +2285,219 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
22852285
return status;
22862286
}
22872287

2288+
/*!
@ingroup WORK_SHARING
@param loc source location information
@param gtid global thread number
@return Zero if the parallel region is not active and this thread should execute
all sections, non-zero otherwise.

Beginning of sections construct.
There are no implicit barriers in the "sections" calls, rather the compiler
should introduce an explicit barrier if it is required.

This implementation is based on __kmp_dispatch_init, using same constructs for
shared data (we can't have sections nested directly in omp for loop, there
should be a parallel region in between)
*/
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one shared data: which section is to execute next.
    // (in case parallel is not active, all sections will be executed on the
    // same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    // Sections need no ordered-iteration enforcement; install error stubs.
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    // Wait until the chosen buffer slot has been released by the previous
    // workshare that used it (see buffer_index bump in __kmpc_next_section).
    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}
2376+
2377+
/*!
2378+
@ingroup WORK_SHARING
2379+
@param loc source location information
2380+
@param global_tid global thread number
2381+
@param numberOfSections number of sections in the 'sections' construct
2382+
@return unsigned [from 0 to n) - number (id) of the section to execute next on
2383+
this thread. n (or any other number not in range) - nothing to execute on this
2384+
thread
2385+
*/
2386+
2387+
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2388+
kmp_int32 numberOfSections) {
2389+
2390+
KMP_TIME_PARTITIONED_BLOCK(OMP_sections);
2391+
2392+
kmp_info_t *th = __kmp_threads[gtid];
2393+
#ifdef KMP_DEBUG
2394+
kmp_team_t *team = th->th.th_team;
2395+
#endif
2396+
2397+
KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2398+
numberOfSections));
2399+
2400+
// For serialized case we should not call this function:
2401+
KMP_DEBUG_ASSERT(!team->t.t_serialized);
2402+
2403+
dispatch_shared_info_template<kmp_int32> volatile *sh;
2404+
2405+
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2406+
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2407+
2408+
KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2409+
sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2410+
th->th.th_dispatch->th_dispatch_sh_current);
2411+
KMP_DEBUG_ASSERT(sh);
2412+
2413+
kmp_int32 sectionIndex = 0;
2414+
bool moreSectionsToExecute = true;
2415+
2416+
// Find section to execute:
2417+
sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2418+
if (sectionIndex >= numberOfSections) {
2419+
moreSectionsToExecute = false;
2420+
}
2421+
2422+
// status == 0: no more sections to execute;
2423+
// OMPTODO: __kmpc_end_sections could be bypassed?
2424+
if (!moreSectionsToExecute) {
2425+
kmp_int32 num_done;
2426+
2427+
num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2428+
2429+
if (num_done == th->th.th_team_nproc - 1) {
2430+
/* NOTE: release this buffer to be reused */
2431+
2432+
KMP_MB(); /* Flush all pending memory write invalidates. */
2433+
2434+
sh->u.s.num_done = 0;
2435+
sh->u.s.iteration = 0;
2436+
2437+
KMP_MB(); /* Flush all pending memory write invalidates. */
2438+
2439+
sh->buffer_index += __kmp_dispatch_num_buffers;
2440+
KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2441+
sh->buffer_index));
2442+
2443+
KMP_MB(); /* Flush all pending memory write invalidates. */
2444+
2445+
} // if
2446+
2447+
th->th.th_dispatch->th_deo_fcn = NULL;
2448+
th->th.th_dispatch->th_dxo_fcn = NULL;
2449+
th->th.th_dispatch->th_dispatch_sh_current = NULL;
2450+
th->th.th_dispatch->th_dispatch_pr_current = NULL;
2451+
2452+
#if OMPT_SUPPORT && OMPT_OPTIONAL
2453+
if (ompt_enabled.ompt_callback_dispatch) {
2454+
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2455+
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2456+
ompt_data_t instance = ompt_data_none;
2457+
instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2458+
ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2459+
&(team_info->parallel_data), &(task_info->task_data),
2460+
ompt_dispatch_section, instance);
2461+
}
2462+
#endif
2463+
KMP_POP_PARTITIONED_TIMER();
2464+
}
2465+
2466+
return sectionIndex;
2467+
}
2468+
2469+
/*!
2470+
@ingroup WORK_SHARING
2471+
@param loc source location information
2472+
@param global_tid global thread number
2473+
2474+
End of "sections" construct.
2475+
Don't need to wait here: barrier is added separately when needed.
2476+
*/
2477+
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2478+
2479+
kmp_info_t *th = __kmp_threads[gtid];
2480+
int active = !th->th.th_team->t.t_serialized;
2481+
2482+
KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2483+
2484+
if (!active) {
2485+
// In active case call finalization is done in __kmpc_next_section
2486+
#if OMPT_SUPPORT && OMPT_OPTIONAL
2487+
if (ompt_enabled.ompt_callback_work) {
2488+
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2489+
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2490+
ompt_callbacks.ompt_callback(ompt_callback_work)(
2491+
ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2492+
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2493+
}
2494+
#endif
2495+
KMP_POP_PARTITIONED_TIMER();
2496+
}
2497+
2498+
KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2499+
}
2500+
22882501
template <typename T>
22892502
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
22902503
kmp_int32 *plastiter, T *plower, T *pupper,

0 commit comments

Comments
 (0)