% bib.bib -- BibTeX bibliography database
@inproceedings{dang2017advanced,
  author       = {Dang, Hoang-Vu and Seo, Sangmin and Amer, Abdelhalim and Balaji, Pavan},
  title        = {Advanced Thread Synchronization for Multithreaded {MPI} Implementations},
  booktitle    = {Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
  pages        = {314--324},
  year         = {2017},
  organization = {IEEE Press},
  mynote       = {improving MPI-thread waking},
}
@inproceedings{dang2016towards,
  author       = {Dang, Hoang-Vu and Snir, Marc and Gropp, William},
  title        = {Towards millions of communicating threads},
  booktitle    = {Proceedings of the 23rd European MPI Users' Group Meeting},
  pages        = {1--14},
  year         = {2016},
  organization = {ACM},
}
@inproceedings{balaji2008toward,
  author       = {Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title        = {Toward efficient support for multithreaded {MPI} communication},
  booktitle    = {European Parallel Virtual Machine/Message Passing Interface Users' Group Meeting},
  pages        = {120--129},
  year         = {2008},
  organization = {Springer},
  mynote       = {lock granularity},
}
@article{mpichrma,
  author  = {Dinan, James and Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title   = {An implementation and evaluation of the {MPI} 3.0 one-sided communication interface},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {28},
  number  = {17},
  pages   = {4385--4404},
  year    = {2016},
}
@article{MPI-hybrid-core,
  author  = {Barrett, Brian W and Brightwell, Ron and Grant, Ryan and Hammond, Simon D and Hemmert, K Scott},
  title   = {An evaluation of {MPI} message rate on hybrid-core processors},
  journal = {The International Journal of High Performance Computing Applications},
  volume  = {28},
  number  = {4},
  pages   = {415--424},
  year    = {2014},
  mynote  = {thin and fat cores},
}
@inproceedings{Si:2014,
  author    = {Si, Min and Pe{\~n}a, Antonio J. and Balaji, Pavan and Takagi, Masamichi and Ishikawa, Yutaka},
  title     = {{MT-MPI}: Multithreaded {MPI} for Many-core Environments},
  booktitle = {Proceedings of the 28th ACM International Conference on Supercomputing},
  series    = {ICS '14},
  year      = {2014},
  isbn      = {978-1-4503-2642-1},
  location  = {Munich, Germany},
  pages     = {125--134},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  mynote    = {share threads between MPI and OpenMP},
}
@inproceedings{kamal2010fg,
  author       = {Kamal, Humaira and Wagner, Alan},
  title        = {{FG-MPI}: Fine-grain {MPI} for multicore and clusters},
  booktitle    = {Parallel \& Distributed Processing, Workshops and Phd Forum (IPDPSW), 2010 IEEE International Symposium on},
  pages        = {1--8},
  year         = {2010},
  organization = {IEEE},
  mynote       = {MPI processes are fibers},
}
@misc{MPI4,
  author       = {{Message Passing Interface Forum}},
  title        = {{MPI} 4.0},
  howpublished = {\url{https://www.mpi-forum.org/mpi-40/}},
}
@misc{openmp5,
  author       = {{OpenMP Language Working Group}},
  title        = {{OpenMP} Technical Report 4: Version 5.0 Preview 1},
  year         = {2016},
  month        = nov,
  howpublished = {\url{https://www.openmp.org/wp-content/uploads/openmp-tr4.pdf}},
}
@incollection{threadissue,
  author    = {Gropp, William and Thakur, Rajeev},
  title     = {Issues in developing a thread-safe {MPI} implementation},
  booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface},
  pages     = {12--21},
  year      = {2006},
  publisher = {Springer},
}
@inproceedings{m5,
  author    = {Flajslik, Mario and Dinan, James and Underwood, Keith D.},
  title     = {Mitigating {MPI} Message Matching Misery},
  booktitle = {International Supercomputing Conference},
  year      = {2016},
  mynote    = {better support to wildcards},
}
@article{mpicon2,
  author    = {Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title     = {Fine-grained multithreading support for hybrid threaded {MPI} programming},
  journal   = {International Journal of High Performance Computing Applications},
  volume    = {24},
  number    = {1},
  pages     = {49--57},
  year      = {2010},
  publisher = {SAGE Publications},
  mynote    = {finer locks for mpich},
}
@inproceedings{mpiult,
  author       = {Lu, Huiwei and Seo, Sangmin and Balaji, Pavan},
  title        = {{MPI+ULT}: Overlapping Communication and Computation with User-Level Threads},
  booktitle    = {2015 IEEE 17th International Conference on High Performance Computing and Communications (HPCC)},
  pages        = {444--454},
  year         = {2015},
  organization = {IEEE},
  mynote       = {MPI + tasks},
}
@article{mpi-thread,
  author    = {Amer, Abdelhalim and Lu, Huiwei and Wei, Yanjie and Balaji, Pavan and Matsuoka, Satoshi},
  title     = {{MPI+threads}: Runtime contention and remedies},
  journal   = {ACM SIGPLAN Notices},
  volume    = {50},
  number    = {8},
  pages     = {239--248},
  year      = {2015},
  publisher = {ACM},
  mynote    = {multithreading optimizations in mpich},
}
@article{dang2017eliminating,
  author    = {Dang, Hoang-Vu and Snir, Marc and Gropp, William},
  title     = {Eliminating contention bottlenecks in multithreaded {MPI}},
  journal   = {Parallel Computing},
  volume    = {69},
  pages     = {1--23},
  year      = {2017},
  publisher = {Elsevier},
}
@article{carpen2017expected,
  author    = {Carpen-Amarie, Alexandra and Hunold, Sascha and Tr{\"a}ff, Jesper Larsson},
  title     = {On expected and observed communication performance with {MPI} derived datatypes},
  journal   = {Parallel Computing},
  volume    = {69},
  pages     = {98--117},
  year      = {2017},
  publisher = {Elsevier},
  mynote    = {derived datatypes often sucks},
}
@inproceedings{vaidyanathan2015improving,
  author       = {Vaidyanathan, Karthikeyan and Kalamkar, Dhiraj D and Pamnany, Kiran and Hammond, Jeff R and Balaji, Pavan and Das, Dipankar and Park, Jongsoo and Jo{\'o}, B{\'a}lint},
  title        = {Improving concurrency and asynchrony in multithreaded {MPI} applications using software offloading},
  booktitle    = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {30},
  year         = {2015},
  organization = {ACM},
  mynote       = {Use progress thread and show advantage},
}
@inproceedings{farmer2016mpi,
  author       = {Farmer, Shane and Skjellum, Anthony and Grant, Ryan E and Brightwell, Ron},
  title        = {{MPI} Performance Characterization on {Infiniband} with Fine-Grain Multithreaded Communication},
  booktitle    = {High Performance Computing and Communications; IEEE 14th International Conference on Smart City; IEEE 2nd International Conference on Data Science and Systems (HPCC/SmartCity/DSS), 2016 IEEE 18th International Conference on},
  pages        = {1102--1106},
  year         = {2016},
  organization = {IEEE},
  mynote       = {runs benchmarks -- show MT issues},
}
@inproceedings{dosanjh2016rma,
  author       = {Dosanjh, Matthew GF and Groves, Taylor and Grant, Ryan E and Brightwell, Ron and Bridges, Patrick G},
  title        = {{RMA-MT}: A benchmark suite for assessing {MPI} multi-threaded {RMA} performance},
  booktitle    = {Cluster, Cloud and Grid Computing (CCGrid), 2016 16th IEEE/ACM International Symposium on},
  pages        = {550--559},
  year         = {2016},
  organization = {IEEE},
  mynote       = {performance of multithreaded RMA sucks},
}
@techreport{amer2016locking,
  author      = {Amer, Abdelhalim and Lu, Huiwei and Wei, Yanjie and Hammond, Jeff and Matsuoka, Satoshi and Balaji, Pavan},
  title       = {Locking aspects in multithreaded {MPI} implementations},
  institution = {Argonne National Laboratory},
  number      = {P6005-0516},
  year        = {2016},
}
@inproceedings{akhmetova2017performance,
  author       = {Akhmetova, Dana and Iakymchuk, Roman and Ekeberg, Orjan and Laure, Erwin},
  title        = {Performance study of multithreaded {MPI} and {OpenMP} tasking in a large scientific code},
  booktitle    = {Parallel and Distributed Processing Symposium Workshops (IPDPSW), 2017 IEEE International},
  pages        = {756--765},
  year         = {2017},
  organization = {IEEE},
  mynote       = {shows good performance with thread multiple},
}
@inproceedings{denis2014pioman,
  author    = {Denis, Alexandre},
  title     = {pioman: a Generic Framework for Asynchronous Progression and Multithreaded Communications},
  booktitle = {IEEE International Conference on Cluster Computing (IEEE Cluster)},
  year      = {2014},
  mynote    = { share progress engine},
}
@inproceedings{si2015techniques,
  author       = {Si, Min and Balaji, Pavan and Ishikawa, Yutaka},
  title        = {Techniques for Enabling Highly Efficient Message Passing on Many-Core Architectures},
  booktitle    = {Cluster, Cloud and Grid Computing (CCGrid), 2015 15th IEEE/ACM International Symposium on},
  pages        = {697--700},
  year         = {2015},
  organization = {IEEE},
  mynote       = {MT-MPI and Casper -- brief overview},
}
@inproceedings{brightwellseastarqueue,
  author    = {R. Brightwell and K. Pedretti and K. Ferreira},
  title     = {Instrumentation and Analysis of {MPI} Queue Times on the {SeaStar} High-Performance Network},
  booktitle = {2008 Proceedings of 17th International Conference on Computer Communications and Networks},
  pages     = {1--7},
  year      = {2008},
  month     = aug,
  doi       = {10.1109/ICCCN.2008.ECP.116},
  issn      = {1095-2055},
  keywords  = {message passing;parallel processing;protocols;queueing theory;Cray XT series;MPI queue times instrumentation;SeaStar high-performance network;communication behavior;network protocol stack;network resource usage;next-generation networking hardware design;parallel applications;parallel processing machines;performance optimization;Application software;Hardware;Instruments;Next generation networking;Parallel processing;Performance analysis;Protocols;Queueing analysis;Random access memory;Storms},
  mynote    = {measured queue lengths},
}
@inproceedings{ferreira2017characterizing,
  author       = {Ferreira, Kurt B and Levy, Scott and Pedretti, Kevin and Grant, Ryan E},
  title        = {Characterizing {MPI} matching via trace-based simulation},
  booktitle    = {Proceedings of the 24th European MPI Users' Group Meeting},
  pages        = {8},
  year         = {2017},
  organization = {ACM},
  mynote       = {measurements via simulation of queue length},
}
@inproceedings{bayatpour2016adaptive,
  author       = {Bayatpour, M and Subramoni, Hari and Chakraborty, S and Panda, Dhabaleswar K},
  title        = {Adaptive and dynamic design for {MPI} tag matching},
  booktitle    = {Cluster Computing (CLUSTER), 2016 IEEE International Conference on},
  pages        = {1--10},
  year         = {2016},
  organization = {IEEE},
  mynote       = {dynamic binning},
}
@article{zounmevo2014fast,
  author    = {Zounmevo, Judicael A and Afsahi, Ahmad},
  title     = {A fast and resource-conscious {MPI} message queue mechanism for large-scale jobs},
  journal   = {Future Generation Computer Systems},
  volume    = {30},
  pages     = {265--290},
  year      = {2014},
  publisher = {Elsevier},
  mynote    = {a 2D queue structure},
}
@inproceedings{bridges2015preparing,
  author       = {Bridges, Patrick G and Dosanjh, Matthew GF and Grant, Ryan and Skjellum, Anthony and Farmer, Shane and Brightwell, Ron},
  title        = {Preparing for exascale: modeling {MPI} for many-core systems using fine-grain queues},
  booktitle    = {Proceedings of the 3rd Workshop on Exascale MPI},
  pages        = {5},
  year         = {2015},
  organization = {ACM},
  mynote       = {queueing model for multi-core MPI},
}
@inproceedings{raffenetti2017mpi,
  author       = {Raffenetti, Ken and Amer, Abdelhalim and Oden, Lena and Archer, Charles and Bland, Wesley and Fujita, Hajime and Guo, Yanfei and Janjusic, Tomislav and Durnov, Dmitry and Blocksome, Michael and others},
  title        = {Why is {MPI} so slow?: analyzing the fundamental limits in implementing {MPI-3.1}},
  booktitle    = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {62},
  year         = {2017},
  organization = {ACM},
  mynote       = {detailed analysis of instruction counts for various MPI functions},
}
@article{aji2016mpi,
  author    = {Aji, Ashwin M and Panwar, Lokendra S and Ji, Feng and Murthy, Karthik and Chabbi, Milind and Balaji, Pavan and Bisset, Keith R and Dinan, James and Feng, Wu-chun and Mellor-Crummey, John and others},
  title     = {{MPI-ACC}: Accelerator-aware {MPI} for scientific applications},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {27},
  number    = {5},
  pages     = {1401--1414},
  year      = {2016},
  publisher = {IEEE},
  mynote    = {code data environment in datatype; pipeline},
}
@article{wang2014gpu,
  author    = {Wang, Hao and Potluri, Sreeram and Bureddy, Devendar and Rosales, Carlos and Panda, Dhabaleswar K},
  title     = {{GPU}-aware {MPI} on {RDMA}-enabled clusters: Design, implementation and evaluation},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {25},
  number    = {10},
  pages     = {2595--2605},
  year      = {2014},
  publisher = {IEEE},
  mynote    = {deal with derived datatypes on GPU},
}
@article{oden2016analyzing,
  author    = {Oden, Lena and Klenk, Benjamin and Fr{\"o}ning, Holger},
  title     = {Analyzing {GPU}-controlled communication with dynamic parallelism in terms of performance and energy},
  journal   = {Parallel Computing},
  volume    = {57},
  pages     = {125--134},
  year      = {2016},
  publisher = {Elsevier},
  mynote    = {advantage of GPU communication},
}
@inproceedings{klenk2017overview,
  author       = {Klenk, Benjamin and Fr{\"o}ning, Holger},
  title        = {An overview of {MPI} characteristics of exascale proxy applications},
  booktitle    = {International Supercomputing Conference},
  pages        = {217--236},
  year         = {2017},
  organization = {Springer},
  mynote       = {queue length, wildcard},
}
@inproceedings{Denis2016mpioverlap,
  author    = {A. Denis and F. Trahay},
  title     = {{MPI} Overlap: Benchmark and Analysis},
  booktitle = {2016 45th International Conference on Parallel Processing (ICPP)},
  pages     = {258--267},
  year      = {2016},
  month     = aug,
  doi       = {10.1109/ICPP.2016.37},
  keywords  = {application program interfaces;message passing;parallel processing;MPI libraries;MPI overlap;hardware platforms;nonblocking MPI request;nonblocking point-to-point communications;sequential code;Benchmark testing;Hardware;Kernel;Libraries;Receivers;Time measurement;HPC;MPI;benchmark;overlap},
  mynote    = { benchmarks to measure comm/comp overlap },
}
@inproceedings{Klenk2017,
  author    = {B. Klenk and H. Fr{\"o}ning and H. Eberle and L. Dennison},
  title     = {Relaxations for High-Performance Message Passing on Massively Parallel {SIMT} Processors},
  booktitle = {2017 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages     = {855--865},
  year      = {2017},
  month     = may,
  doi       = {10.1109/IPDPS.2017.94},
  keywords  = {graphics processing units;message passing;multiprocessing systems;GPU;MPI;control flow switch;data transfer;general-purpose CPU;graphics processing unit;high-performance message passing;massively parallel SIMT processor;message passing interface;single instruction multiple thread architecture;traffic sinking;traffic sourcing;Computer architecture;Graphics processing units;Kernel;Message passing;Message systems;Semantics;Communication Models;GPU Computing;Heterogeneous systems;Message Passing},
  mynote    = { MPI on GPU},
}
@article{thakur2009test,
  author    = {Thakur, Rajeev and Gropp, William},
  title     = {Test suite for evaluating performance of multithreaded {MPI} communication},
  journal   = {Parallel Computing},
  volume    = {35},
  number    = {12},
  pages     = {608--617},
  year      = {2009},
  publisher = {Elsevier},
}
@inproceedings{goodell2010minimizing,
  author    = {D. Goodell and P. Balaji and D. Buntinas and G. Dozsa and W. Gropp and S. Kumar and de Supinski, B. R. and R. Thakur},
  title     = {Minimizing {MPI} Resource Contention in Multithreaded Multicore Environments},
  booktitle = {2010 IEEE International Conference on Cluster Computing},
  pages     = {1--8},
  year      = {2010},
  month     = sep,
  doi       = {10.1109/CLUSTER.2010.11},
  issn      = {1552-5244},
  keywords  = {message passing;minimisation;multi-threading;shared memory systems;MPI object management;MPI resource contention minimization;high-performance computing systems;hybrid programming model;memory leak prevention;multithreaded MPI communication;multithreaded messaging rate;multithreaded multicore environments;reference counting;shared memory;Benchmark testing;Instruction sets;Message systems;Resource management;Semantics;TV;Transient analysis},
  mynote    = {overhead of object allocation },
}
@inproceedings{si2015casper,
  author       = {Si, Min and Pe{\~n}a, Antonio J and Hammond, Jeff and Balaji, Pavan and Takagi, Masamichi and Ishikawa, Yutaka},
  title        = {{Casper}: An asynchronous progress model for {MPI} {RMA} on many-core architectures},
  booktitle    = {Parallel and Distributed Processing Symposium (IPDPS), 2015 IEEE International},
  pages        = {665--676},
  year         = {2015},
  organization = {IEEE},
}
@misc{openmpi,
  author       = {{OpenMPI Team}},
  title        = {{Open MPI}: Open Source High Performance Computing},
  howpublished = {\url{https://www.open-mpi.org/}},
}
@inproceedings{hoefler2008message,
  author       = {Hoefler, Torsten and Lumsdaine, Andrew},
  title        = {Message progression in parallel computing-to thread or not to thread?},
  booktitle    = {Cluster Computing, 2008 IEEE International Conference on},
  pages        = {213--222},
  year         = {2008},
  organization = {IEEE},
  mynote       = {progress thread that shares a core},
}
@misc{gpudirect,
  author       = {{NVIDIA}},
  title        = {{NVIDIA GPU Direct}},
  howpublished = {\url{https://developer.nvidia.com/gpudirect}},
}
@article{Shainer2011,
  author   = {Shainer, Gilad and Ayoub, Ali and Lui, Pak and Liu, Tong and Kagan, Michael and Trott, Christian R. and Scantlen, Greg and Crozier, Paul S.},
  title    = {The development of {Mellanox/NVIDIA} {GPUDirect} over {InfiniBand}---a new model for {GPU} to {GPU} communications},
  journal  = {Computer Science - Research and Development},
  year     = {2011},
  month    = jun,
  day      = {01},
  volume   = {26},
  number   = {3},
  pages    = {267--273},
  abstract = {The usage and adoption of General Purpose GPUs (GPGPU) in HPC systems is increasing due to the unparalleled performance advantage of the GPUs and the ability to fulfill the ever-increasing demands for floating points operations. While the GPU can offload many of the application parallel computations, the system architecture of a GPU-CPU-InfiniBand server does require the CPU to initiate and manage memory transfers between remote GPUs via the high speed InfiniBand network. In this paper we introduce for the first time a new innovative technology---GPUDirect that enables Tesla GPUs to transfer data via InfiniBand without the involvement of the CPU or buffer copies, hence dramatically reducing the GPU communication time and increasing overall system performance and efficiency. We also explore for the first time the performance benefits of GPUDirect using Amber and LAMMPS applications.},
  issn     = {1865-2042},
  doi      = {10.1007/s00450-011-0157-1},
  url      = {https://doi.org/10.1007/s00450-011-0157-1},
}
@misc{mpich,
  author       = {{MPICH Team}},
  title        = {{MPICH}},
  howpublished = {\url{https://www.mpich.org/}},
}
@inproceedings{datatype2012,
  author    = {Schneider, Timo and Gerstenberger, Robert and Hoefler, Torsten},
  editor    = {Tr{\"a}ff, Jesper Larsson and Benkner, Siegfried and Dongarra, Jack J.},
  title     = {Micro-applications for Communication Data Access Patterns and {MPI} Datatypes},
  booktitle = {Recent Advances in the Message Passing Interface},
  year      = {2012},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  pages     = {121--131},
  abstract  = {Data is often communicated from different locations in application memory and is commonly serialized (copied) to send buffers or from receive buffers. MPI datatypes are a way to avoid such intermediate copies and optimize communications, however, it is often unclear which implementation and optimization choices are most useful in practice. We extracted the send/recv-buffer access pattern of a representative set of scientific applications into micro-applications that isolate their data access patterns. We also observed that the buffer-access patterns in applications can be categorized into three different groups. Our micro-applications show that up to 90{\%} of the total communication time can be spent with local serialization and we found significant performance discrepancies between state-of-the-art MPI implementations. Our micro-applications aim to provide a standard benchmark for MPI datatype implementations to guide optimizations similarly to SPEC CPU and the Livermore loops do for compiler optimizations.},
  isbn      = {978-3-642-33518-1},
}
@inproceedings{Gysi:2016:DHS:3014904.3014974,
  author    = {Gysi, Tobias and B{\"a}r, Jeremia and Hoefler, Torsten},
  title     = {{dCUDA}: Hardware Supported Overlap of Computation and Communication},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  series    = {SC '16},
  year      = {2016},
  isbn      = {978-1-4673-8815-3},
  location  = {Salt Lake City, Utah},
  pages     = {52:1--52:12},
  articleno = {52},
  numpages  = {12},
  url       = {https://dl.acm.org/citation.cfm?id=3014904.3014974},
  acmid     = {3014974},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
  keywords  = {distributed memory, gpu, latency hiding, programming model, remote memory access},
}