Sound and Precise Analysis of Parallel Programs through Schedule Specialization
Jingyue Wu, Yang Tang, Gang Hu, Heming Cui, Junfeng Yang
Columbia University
Motivation
Analyzing parallel programs is difficult.
[Chart: precision vs. soundness (# of analyzed schedules / # of total schedules). Static analysis covers the total set of schedules but is imprecise; dynamic analysis is precise but covers only the few schedules it analyzes.]
Schedule Specialization
Precision: Analyze the program over a small set of schedules.
Soundness: Enforce these schedules at runtime.
[Chart: on the same precision/soundness axes, schedule specialization achieves both, because the analyzed schedules are exactly the enforced schedules.]
Enforcing Schedules Using Peregrine
Deterministic multithreading
– e.g., DMP (ASPLOS ’09), Kendo (ASPLOS ’09), CoreDet (ASPLOS ’10), Tern (OSDI ’10), Peregrine (SOSP ’11), DTHREADS (SOSP ’11)
– Performance overhead: e.g., Kendo: 16%, Tern & Peregrine: 39.1%
Peregrine
– Records schedules and reuses them on a wide range of inputs.
– Represents schedules explicitly.
Framework
Extract the control flow and data flow enforced by a set of schedules.
[Diagram: a C/C++ program with Pthreads and a schedule (a total order of synchronizations) feed into schedule specialization, which produces a specialized program plus extra def-use chains.]
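The framework's input schedule is just a total order of synchronization operations. As a rough illustration of what such an input could look like, here is a minimal C++ sketch; the SyncEvent type, its fields, and the recorded run (a main thread creating two workers that each acquire and release one lock) are assumptions for illustration, not Peregrine's actual schedule format.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// One synchronization in the total order: the issuing thread and the primitive it called.
struct SyncEvent {
  int thread;
  std::string op;   // e.g. "pthread_create", "pthread_mutex_lock", ...
};

int main() {
  // The total order recorded from one run with two worker threads.
  std::vector<SyncEvent> schedule = {
    {0, "pthread_create"}, {0, "pthread_create"},
    {1, "pthread_mutex_lock"}, {1, "pthread_mutex_unlock"},
    {2, "pthread_mutex_lock"}, {2, "pthread_mutex_unlock"},
    {0, "pthread_join"}, {0, "pthread_join"},
  };
  // Print the schedule, one synchronization per line, in enforced order.
  for (std::size_t i = 0; i < schedule.size(); ++i)
    std::printf("%2zu: thread %d  %s\n", i, schedule[i].thread, schedule[i].op.c_str());
}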
Outline
Example
Control-Flow Specialization
Data-Flow Specialization
Results
Conclusion
Running Example

int results[p_max];
int global_id = 0;

int main(int argc, char *argv[]) {
  int i;
  int p = atoi(argv[1]);
  for (i = 0; i < p; ++i)
    pthread_create(&child[i], 0, worker, 0);
  for (i = 0; i < p; ++i)
    pthread_join(child[i], 0);
  return 0;
}

void *worker(void *arg) {
  pthread_mutex_lock(&global_id_lock);
  int my_id = global_id++;
  pthread_mutex_unlock(&global_id_lock);
  results[my_id] = compute(my_id);
  return 0;
}

[Diagram: one schedule for this program: thread 0 creates threads 1 and 2; thread 1 executes lock/unlock, then thread 2 executes lock/unlock; thread 0 joins both.]
Race-free?
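The slide code omits its supporting declarations for brevity. For readers who want to compile and run it, here is one self-contained completion; the value of p_max, the child array, the global_id_lock initializer, the compute stub, and the default argument handling are assumptions added for illustration.

// Build with: g++ example.cpp -pthread
#include <pthread.h>
#include <cstdio>
#include <cstdlib>

enum { p_max = 16 };                 // assumed bound on the number of workers
static int results[p_max];
static int global_id = 0;
static pthread_t child[p_max];
static pthread_mutex_t global_id_lock = PTHREAD_MUTEX_INITIALIZER;

static int compute(int id) { return id * id; }   // stand-in for real work

static void *worker(void *arg) {
  (void)arg;
  pthread_mutex_lock(&global_id_lock);
  int my_id = global_id++;             // grab a unique id under the lock
  pthread_mutex_unlock(&global_id_lock);
  results[my_id] = compute(my_id);     // write to a per-id slot
  return 0;
}

int main(int argc, char *argv[]) {
  int p = (argc > 1) ? std::atoi(argv[1]) : 2;   // default to two workers
  if (p > p_max) p = p_max;
  for (int i = 0; i < p; ++i)
    pthread_create(&child[i], 0, worker, 0);
  for (int i = 0; i < p; ++i)
    pthread_join(child[i], 0);
  for (int i = 0; i < p; ++i)
    std::printf("results[%d] = %d\n", i, results[i]);
  return 0;
}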
Control-Flow Specialization

int main(int argc, char *argv[]) {
  int i;
  int p = atoi(argv[1]);
  for (i = 0; i < p; ++i)
    pthread_create(&child[i], 0, worker, 0);
  for (i = 0; i < p; ++i)
    pthread_join(child[i], 0);
  return 0;
}

[Diagram: control-flow graph of main(): atoi; i = 0; i < p; create; ++i (back to i < p); then i = 0; i < p; join; ++i (back to i < p); return.]
Control-Flow Specialization (cont.)
[Diagram: main() is walked against thread 0's portion of the schedule; the path so far: atoi, i = 0, i < p, create.]
Control-Flow Specialization (cont.)
[Diagram: the walk continues through a second loop iteration: ++i, i < p, create.]
Control-Flow Specialization (cont.)
[Diagram: the complete path of main() under the schedule: atoi, i = 0, i < p, create, ++i, i < p, create, ++i, i < p (false), i = 0, i < p, join, ++i, i < p, join, ++i, i < p (false), return.]
Control-Flow Specialized Program

int main(int argc, char *argv[]) {
  int i;
  int p = atoi(argv[1]);
  i = 0;  // i < p == true
  pthread_create(&child[i], 0, worker.clone1, 0);
  ++i;    // i < p == true
  pthread_create(&child[i], 0, worker.clone2, 0);
  ++i;    // i < p == false
  i = 0;  // i < p == true
  pthread_join(child[i], 0);
  ++i;    // i < p == true
  pthread_join(child[i], 0);
  ++i;    // i < p == false
  return 0;
}
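One way to see why the unrolling above is fully determined by the schedule: thread 0 performs exactly one pthread_create per loop iteration, so the number of create events it contributes to the schedule fixes p and therefore every "i < p" branch in main(). Below is a toy C++ sketch of that inference; the Event representation, the counting step, and the printed output are illustrative assumptions, not the paper's actual specializer.

#include <cstdio>
#include <string>
#include <vector>

struct Event { int thread; std::string op; };   // one synchronization in the total order

int main() {
  // Schedule recorded for the running example with two workers.
  std::vector<Event> schedule = {
    {0, "create"}, {0, "create"},
    {1, "lock"}, {1, "unlock"},
    {2, "lock"}, {2, "unlock"},
    {0, "join"}, {0, "join"},
  };

  // Thread 0 issues one pthread_create per iteration of its first loop,
  // so counting its create events recovers the loop trip count p.
  int p = 0;
  for (const Event &e : schedule)
    if (e.thread == 0 && e.op == "create") ++p;

  // With p known, both loops unroll into straight-line code,
  // each iteration calling a distinct clone of worker().
  for (int i = 0; i < p; ++i)
    std::printf("pthread_create(&child[%d], 0, worker.clone%d, 0);\n", i, i + 1);
  for (int i = 0; i < p; ++i)
    std::printf("pthread_join(child[%d], 0);\n", i);
}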
More Challenges on Control-Flow Specialization
– Ambiguity
  [Diagram: a caller and a callee with call and ret edges and synchronizations S1 and S2, illustrating the ambiguity.]
– A schedule has too many synchronizations.
Data-Flow Specialization

int global_id = 0;

void *worker.clone1(void *arg) {
  pthread_mutex_lock(&global_id_lock);
  int my_id = global_id++;
  pthread_mutex_unlock(&global_id_lock);
  results[my_id] = compute(my_id);
  return 0;
}

void *worker.clone2(void *arg) {
  pthread_mutex_lock(&global_id_lock);
  int my_id = global_id++;
  pthread_mutex_unlock(&global_id_lock);
  results[my_id] = compute(my_id);
  return 0;
}

[Diagram: the schedule annotated with each thread's accesses to global_id: thread 0 performs global_id = 0; threads 1 and 2 each perform my_id = global_id and global_id++ inside their lock/unlock regions.]
Data-Flow Specialization (cont.)
[Diagram: propagating values along the schedule: thread 1's critical section runs first, so its my_id resolves to 0 and global_id becomes 1; thread 2's accesses are not yet resolved.]
Data-Flow Specialization (cont.)
[Diagram: continuing the propagation: thread 2's my_id resolves to 1 and global_id becomes 2.]
Data-Flow Specialization

int global_id = 0;

void *worker.clone1(void *arg) {
  pthread_mutex_lock(&global_id_lock);
  global_id = 1;
  pthread_mutex_unlock(&global_id_lock);
  results[0] = compute(0);
  return 0;
}

void *worker.clone2(void *arg) {
  pthread_mutex_lock(&global_id_lock);
  global_id = 2;
  pthread_mutex_unlock(&global_id_lock);
  results[1] = compute(1);
  return 0;
}
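The constant folding above works because the schedule serializes the two critical sections, so each read of global_id has a single reaching definition. A minimal sketch of that reasoning, replaying the accesses in schedule order (purely illustrative: the real analysis works on def-use chains, not by executing the program):

#include <cstdio>

// Replay the accesses to global_id in the order the schedule enforces:
// thread 0's initialization, then thread 1's critical section, then thread 2's.
int main() {
  int global_id = 0;            // thread 0: initial definition
  int my_id_t1 = global_id++;   // thread 1: my_id folds to 0, global_id becomes 1
  int my_id_t2 = global_id++;   // thread 2: my_id folds to 1, global_id becomes 2
  std::printf("worker.clone1: my_id = %d, writes results[%d]\n", my_id_t1, my_id_t1);
  std::printf("worker.clone2: my_id = %d, writes results[%d]\n", my_id_t2, my_id_t2);
  std::printf("final global_id = %d\n", global_id);
}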
More Challenges on Data-Flow Specialization
Must/may alias analysis
– global_id
Reasoning about integers (see the sketch below)
– results[0] = compute(0)
– results[1] = compute(1)
Many def-use chains
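As one example of why reasoning about integers is needed: after specialization the clones write results[0] and results[1], and showing these writes cannot race reduces to showing the two constant indices differ. A trivial C++ sketch of that disjointness check (the array size is an assumption standing in for p_max):

#include <cassert>

int results[16];          // stand-in for the example's results[p_max]

int main() {
  int *a = &results[0];   // written by worker.clone1
  int *b = &results[1];   // written by worker.clone2
  assert(a != b);         // distinct constant indices, so the two writes cannot alias
  return 0;
}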
Evaluation
Applications
– Static race detector
– Alias analyzer
– Path slicer
Programs
– PBZip2 1.1.5
– aget 0.4.1
– 8 programs in SPLASH2
– 7 programs in PARSEC
Static Race Detector: # of False Positives

Program          Original   Specialized
aget                   72             0
PBZip2                125             0
fft                    96             0
blackscholes            3             0
swaptions             165             0
streamcluster           4             0
canneal                21             0
bodytrack               4             0
ferret                  6             0
raytrace              215             0
cholesky               31             7
radix                  53            14
water-spatial        2447          1799
lu-contig               1             8
barnes                370           369
water-nsquared        354           333
ocean                 331           292
Static Race Detector: Harmful Races Detected
– 4 in aget
– 2 in radix
– 1 in fft
Precision of Schedule-Aware Alias Analysis
[Chart: precision results of the schedule-aware alias analysis]
Conclusion and Future Work
Designed and implemented schedule specialization framework
– Analyzes the program over a small set of schedules
– Enforces these schedules at runtime
Built and evaluated three applications
– Easy to use
– Precise
Future work
– More applications
– Similar specialization ideas on sequential programs
Related Work
Program analysis for parallel programs
– Chord (PLDI ’06), RADAR (PLDI ’08), FastTrack (PLDI ’09)
Slicing
– Horgan (PLDI ’90), Bouncer (SOSP ’07), Jhala (PLDI ’05), Weiser (PhD thesis), Zhang (PLDI ’04)
Deterministic multithreading
– DMP (ASPLOS ’09), Kendo (ASPLOS ’09), CoreDet (ASPLOS ’10), Tern (OSDI ’10), Peregrine (SOSP ’11), DTHREADS (SOSP ’11)
Program specialization
– Consel (POPL ’93), Gluck (ISPL ’95), Jørgensen (POPL ’92), Nirkhe (POPL ’92), Reps (PDSPE ’96)
Backup Slides
Specialization Time
[Chart]
Handling Races
We do not assume data-race freedom. We could if our only goal were optimization.
Input Coverage
Use runtime verification for the inputs not covered.
A small set of schedules can cover a wide range of inputs.