bib.bib



@inproceedings{dang2017advanced,
  author       = {Dang, Hoang-Vu and Seo, Sangmin and Amer, Abdelhalim and Balaji, Pavan},
  title        = {Advanced Thread Synchronization for Multithreaded {MPI} Implementations},
  booktitle    = {Proceedings of the 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
  pages        = {314--324},
  year         = {2017},
  organization = {IEEE Press},
  mynote       = {improving MPI-thread waking},
}


@inproceedings{dang2016towards,
  author       = {Dang, Hoang-Vu and Snir, Marc and Gropp, William},
  title        = {Towards millions of communicating threads},
  booktitle    = {Proceedings of the 23rd European MPI Users' Group Meeting},
  pages        = {1--14},
  year         = {2016},
  organization = {ACM},
}


% booktitle uses a plain ASCII apostrophe (was a typographic U+2019 quote) so
% the entry stays safe for 8-bit BibTeX and matches the other EuroMPI entries.
@inproceedings{balaji2008toward,
  author       = {Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title        = {Toward efficient support for multithreaded {MPI} communication},
  booktitle    = {European Parallel Virtual Machine/Message Passing Interface Users' Group Meeting},
  pages        = {120--129},
  year         = {2008},
  organization = {Springer},
  mynote       = {lock granularity},
}


@article{mpichrma,
  author  = {Dinan, James and Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title   = {An implementation and evaluation of the {MPI} 3.0 one-sided communication interface},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {28},
  number  = {17},
  pages   = {4385--4404},
  year    = {2016},
}

% Page range written with an en dash (double hyphen), as in the rest of the file.
@article{MPI-hybrid-core,
  author  = {Brian W Barrett and Ron Brightwell and Ryan Grant and Simon D Hammond and K Scott Hemmert},
  title   = {An evaluation of {MPI} message rate on hybrid-core processors},
  journal = {The International Journal of High Performance Computing Applications},
  volume  = {28},
  number  = {4},
  pages   = {415--424},
  year    = {2014},
  mynote  = {thin and fat cores},
}


% The accent in the second author's surname is written as a BibTeX special
% character (brace group starting with a control sequence) so that sorting
% and label generation treat it as a single letter.
@inproceedings{Si:2014,
  author    = {Si, Min and Pe{\~n}a, Antonio J. and Balaji, Pavan and Takagi, Masamichi and Ishikawa, Yutaka},
  title     = {{MT-MPI}: Multithreaded {MPI} for Many-core Environments},
  booktitle = {Proceedings of the 28th ACM International Conference on Supercomputing},
  series    = {ICS '14},
  pages     = {125--134},
  numpages  = {10},
  year      = {2014},
  isbn      = {978-1-4503-2642-1},
  location  = {Munich, Germany},
  publisher = {ACM},
  address   = {New York, NY, USA},
  mynote    = {share threads between MPI and OpenMP},
}


@inproceedings{kamal2010fg,
  author       = {Kamal, Humaira and Wagner, Alan},
  title        = {{FG-MPI}: Fine-grain {MPI} for multicore and clusters},
  booktitle    = {Parallel \& Distributed Processing, Workshops and Phd Forum (IPDPSW), 2010 IEEE International Symposium on},
  pages        = {1--8},
  year         = {2010},
  organization = {IEEE},
  mynote       = {MPI processes are fibers},
}

% Corporate author: the inner braces keep the forum name as one indivisible
% surname. The URL is wrapped in \url like the other web references here.
@misc{MPI4,
  author       = {{Message Passing Interface Forum}},
  title        = {{MPI} 4.0},
  howpublished = {\url{https://www.mpi-forum.org/mpi-40/}},
}

% month uses the predefined three-letter macro so each style can format it.
@misc{openmp5,
  author       = {{OpenMP Language Working Group}},
  title        = {{OpenMP} Technical Report 4: Version 5.0 Preview 1},
  howpublished = {\url{https://www.openmp.org/wp-content/uploads/openmp-tr4.pdf}},
  month        = nov,
  year         = {2016},
}


@incollection{threadissue,
  author    = {Gropp, William and Thakur, Rajeev},
  title     = {Issues in developing a thread-safe {MPI} implementation},
  booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface},
  pages     = {12--21},
  year      = {2006},
  publisher = {Springer},
}

@inproceedings{m5,
  author    = {Mario Flajslik and James Dinan and Keith D. Underwood},
  title     = {Mitigating {MPI} Message Matching Misery},
  booktitle = {International Supercomputing Conference},
  year      = {2016},
  mynote    = {better support to wildcards},
}

@article{mpicon2,
  author    = {Balaji, Pavan and Buntinas, Darius and Goodell, David and Gropp, William and Thakur, Rajeev},
  title     = {Fine-grained multithreading support for hybrid threaded {MPI} programming},
  journal   = {International Journal of High Performance Computing Applications},
  volume    = {24},
  number    = {1},
  pages     = {49--57},
  year      = {2010},
  publisher = {SAGE Publications},
  mynote    = {finer locks for mpich},
}

% Fixed two typos: the missing space in "on High Performance" (booktitle)
% and the stray space inside the protected "MPI+ULT" token (title).
@inproceedings{mpiult,
  author       = {Lu, Huiwei and Seo, Sangmin and Balaji, Pavan},
  title        = {{MPI+ULT}: Overlapping Communication and Computation with User-Level Threads},
  booktitle    = {2015 IEEE 17th International Conference on High Performance Computing and Communications (HPCC)},
  pages        = {444--454},
  year         = {2015},
  organization = {IEEE},
  mynote       = {MPI + tasks},
}


@article{mpi-thread,
  author    = {Amer, Abdelhalim and Lu, Huiwei and Wei, Yanjie and Balaji, Pavan and Matsuoka, Satoshi},
  title     = {{MPI+threads}: Runtime contention and remedies},
  journal   = {ACM SIGPLAN Notices},
  volume    = {50},
  number    = {8},
  pages     = {239--248},
  year      = {2015},
  publisher = {ACM},
  mynote    = {multithreading optimizations in mpich},
}

@article{dang2017eliminating,
  author    = {Dang, Hoang-Vu and Snir, Marc and Gropp, William},
  title     = {Eliminating contention bottlenecks in multithreaded {MPI}},
  journal   = {Parallel Computing},
  volume    = {69},
  pages     = {1--23},
  year      = {2017},
  publisher = {Elsevier},
}


@article{carpen2017expected,
  author    = {Carpen-Amarie, Alexandra and Hunold, Sascha and Tr{\"a}ff, Jesper Larsson},
  title     = {On expected and observed communication performance with {MPI} derived datatypes},
  journal   = {Parallel Computing},
  volume    = {69},
  pages     = {98--117},
  year      = {2017},
  publisher = {Elsevier},
  mynote    = {derived datatypes often sucks},
}


% NOTE(review): pages holds a single number -- presumably the article number
% within the proceedings rather than a page range; confirm.
@inproceedings{vaidyanathan2015improving,
  author       = {Vaidyanathan, Karthikeyan and Kalamkar, Dhiraj D and Pamnany, Kiran
                  and Hammond, Jeff R and Balaji, Pavan and Das, Dipankar and Park, Jongsoo
                  and Jo{\'o}, B{\'a}lint},
  title        = {Improving concurrency and asynchrony in multithreaded {MPI} applications using software offloading},
  booktitle    = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {30},
  year         = {2015},
  organization = {ACM},
  mynote       = {Use progress thread and show advantage},
}

% "InfiniBand" is spelled with its internal capital, matching the protected
% spelling used in the Shainer2011 entry of this file.
@inproceedings{farmer2016mpi,
  author       = {Farmer, Shane and Skjellum, Anthony and Grant, Ryan E and Brightwell, Ron},
  title        = {{MPI} Performance Characterization on {InfiniBand} with Fine-Grain Multithreaded Communication},
  booktitle    = {High Performance Computing and Communications; IEEE 14th International Conference on Smart City; IEEE 2nd International Conference on Data Science and Systems (HPCC/SmartCity/DSS), 2016 IEEE 18th International Conference on},
  pages        = {1102--1106},
  year         = {2016},
  organization = {IEEE},
  mynote       = {runs benchmarks -- show MT issues},
}

@inproceedings{dosanjh2016rma,
  author       = {Dosanjh, Matthew GF and Groves, Taylor and Grant, Ryan E and Brightwell, Ron and Bridges, Patrick G},
  title        = {{RMA-MT}: A benchmark suite for assessing {MPI} multi-threaded {RMA} performance},
  booktitle    = {Cluster, Cloud and Grid Computing (CCGrid), 2016 16th IEEE/ACM International Symposium on},
  pages        = {550--559},
  year         = {2016},
  organization = {IEEE},
  mynote       = {performance of multithreaded RMA sucks},
}

% NOTE(review): the text below has no leading at-sign, so BibTeX treats it as
% inter-entry text and skips it; it looks like a deliberately disabled entry --
% confirm with the author before re-enabling. If it is re-enabled, the
% upper-cased given names and the unprotected acronym in the title need cleanup.
article{amer2016locking,
	title={Locking aspects in multithreaded MPI implementations},
	author={Amer, ABDELHALIM and Lu, HUIWEI and Wei, YANJIE and Hammond, JEFF 
	and Matsuoka, SATOSHI and Balaji, PAVAN},
	journal={Argonne National Lab., Tech. Rep. P6005-0516},
	year={2016}
}

@inproceedings{akhmetova2017performance,
  author       = {Akhmetova, Dana and Iakymchuk, Roman and Ekeberg, Orjan and Laure, Erwin},
  title        = {Performance study of multithreaded {MPI} and {OpenMP} tasking in a large scientific code},
  booktitle    = {Parallel and Distributed Processing Symposium Workshops (IPDPSW), 2017 IEEE International},
  pages        = {756--765},
  year         = {2017},
  organization = {IEEE},
  mynote       = {shows good performance with thread multiple},
}


@inproceedings{denis2014pioman,
  author    = {Denis, Alexandre},
  title     = {pioman: a Generic Framework for Asynchronous Progression and Multithreaded Communications},
  booktitle = {IEEE International Conference on Cluster Computing (IEEE Cluster)},
  year      = {2014},
  mynote    = { share progress engine},
}

% NOTE(review): the text below has no leading at-sign, so BibTeX treats it as
% inter-entry text and skips it; it looks like a deliberately disabled entry --
% confirm with the author before re-enabling.
inproceedings{si2015techniques,
mynote = {MT-MPI and Casper -- brief overview},
	title={Techniques for Enabling Highly Efficient Message Passing on 
	Many-Core Architectures},
	author={Si, Min and Balaji, Pavan and Ishikawa, Yutaka},
	booktitle={Cluster, Cloud and Grid Computing (CCGrid), 2015 15th IEEE/ACM 
	International Symposium on},
	pages={697--700},
	year={2015},
	organization={IEEE}
}

% Cleaned publisher export: en-dash page range, month macro, and the empty
% volume/number placeholders removed (empty fields only trigger warnings).
@inproceedings{brightwellseastarqueue,
  author    = {R. Brightwell and K. Pedretti and K. Ferreira},
  title     = {Instrumentation and Analysis of {MPI} Queue Times on the {SeaStar} High-Performance Network},
  booktitle = {2008 Proceedings of 17th International Conference on Computer Communications and Networks},
  pages     = {1--7},
  month     = aug,
  year      = {2008},
  doi       = {10.1109/ICCCN.2008.ECP.116},
  issn      = {1095-2055},
  keywords  = {message passing;parallel processing;protocols;queueing theory;Cray XT series;MPI queue times instrumentation;SeaStar high-performance network;communication behavior;network protocol stack;network resource usage;next-generation networking hardware design;parallel applications;parallel processing machines;performance optimization;Application software;Hardware;Instruments;Next generation networking;Parallel processing;Performance analysis;Protocols;Queueing analysis;Random access memory;Storms},
  mynote    = {measured queue lengths},
}

% NOTE(review): pages holds a single number -- presumably the article number
% within the proceedings rather than a page range; confirm.
@inproceedings{ferreira2017characterizing,
  author       = {Ferreira, Kurt B and Levy, Scott and Pedretti, Kevin and Grant, Ryan E},
  title        = {Characterizing {MPI} matching via trace-based simulation},
  booktitle    = {Proceedings of the 24th European MPI Users' Group Meeting},
  pages        = {8},
  year         = {2017},
  organization = {ACM},
  mynote       = {measurements via simulation of queue length},
}

% The acronym in the title is brace-protected so sentence-casing styles do
% not lowercase it.
@inproceedings{bayatpour2016adaptive,
  author       = {Bayatpour, M and Subramoni, Hari and Chakraborty, S and Panda, Dhabaleswar K},
  title        = {Adaptive and dynamic design for {MPI} tag matching},
  booktitle    = {Cluster Computing (CLUSTER), 2016 IEEE International Conference on},
  pages        = {1--10},
  year         = {2016},
  organization = {IEEE},
  mynote       = {dynamic binning},
}

% The acronym in the title is brace-protected so sentence-casing styles do
% not lowercase it.
@article{zounmevo2014fast,
  author    = {Zounmevo, Judicael A and Afsahi, Ahmad},
  title     = {A fast and resource-conscious {MPI} message queue mechanism for large-scale jobs},
  journal   = {Future Generation Computer Systems},
  volume    = {30},
  pages     = {265--290},
  year      = {2014},
  publisher = {Elsevier},
  mynote    = {a 2D queue structure},
}

% NOTE(review): the text below has no leading at-sign, so BibTeX treats it as
% inter-entry text and skips it; it looks like a deliberately disabled entry --
% confirm with the author before re-enabling. If it is re-enabled, the
% lower-cased acronym in the title needs to be restored and brace-protected.
inproceedings{bridges2015preparing,
mynote = {queueing model for multi-core MPI},
	title={Preparing for exascale: modeling mpi for many-core systems using 
	fine-grain queues},
	author={Bridges, Patrick G and Dosanjh, Matthew GF and Grant, Ryan and 
	Skjellum, Anthony and Farmer, Shane and Brightwell, Ron},
	booktitle={Proceedings of the 3rd Workshop on Exascale MPI},
	pages={5},
	year={2015},
	organization={ACM}
}

% Acronyms in the title are brace-protected so sentence-casing styles keep
% them upper-case. The literal "and others" lets the style print "et al.".
% NOTE(review): pages holds a single number -- presumably the article number
% within the proceedings rather than a page range; confirm.
@inproceedings{raffenetti2017mpi,
  author       = {Raffenetti, Ken and Amer, Abdelhalim and Oden, Lena and Archer, Charles
                  and Bland, Wesley and Fujita, Hajime and Guo, Yanfei and Janjusic, Tomislav
                  and Durnov, Dmitry and Blocksome, Michael and others},
  title        = {Why is {MPI} so slow?: analyzing the fundamental limits in implementing {MPI-3.1}},
  booktitle    = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {62},
  year         = {2017},
  organization = {ACM},
  mynote       = {detailed analysis of instruction counts for various MPI functions},
}

@article{aji2016mpi,
  author    = {Aji, Ashwin M and Panwar, Lokendra S and Ji, Feng and Murthy, Karthik
               and Chabbi, Milind and Balaji, Pavan and Bisset, Keith R and Dinan, James
               and Feng, Wu-chun and Mellor-Crummey, John and others},
  title     = {{MPI-ACC}: Accelerator-aware {MPI} for scientific applications},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {27},
  number    = {5},
  pages     = {1401--1414},
  year      = {2016},
  publisher = {IEEE},
  mynote    = {code data environment in datatype; pipeline},
}

@article{wang2014gpu,
  author    = {Wang, Hao and Potluri, Sreeram and Bureddy, Devendar and Rosales, Carlos and Panda, Dhabaleswar K},
  title     = {{GPU}-aware {MPI} on {RDMA}-enabled clusters: Design, implementation and evaluation},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {25},
  number    = {10},
  pages     = {2595--2605},
  year      = {2014},
  publisher = {IEEE},
  mynote    = {deal with derived datatypes on GPU},
}

@article{oden2016analyzing,
  author    = {Oden, Lena and Klenk, Benjamin and Fr{\"o}ning, Holger},
  title     = {Analyzing {GPU}-controlled communication with dynamic parallelism in terms of performance and energy},
  journal   = {Parallel Computing},
  volume    = {57},
  pages     = {125--134},
  year      = {2016},
  publisher = {Elsevier},
  mynote    = {advantage of GPU communication},
}

@inproceedings{klenk2017overview,
  author       = {Klenk, Benjamin and Fr{\"o}ning, Holger},
  title        = {An overview of {MPI} characteristics of exascale proxy applications},
  booktitle    = {International Supercomputing Conference},
  pages        = {217--236},
  year         = {2017},
  organization = {Springer},
  mynote       = {queue length, wildcard},
}

% Cleaned publisher export: en-dash page range, month macro, and the empty
% volume/number/ISSN placeholders removed (empty fields only trigger warnings).
@inproceedings{Denis2016mpioverlap,
  author    = {A. Denis and F. Trahay},
  title     = {{MPI} Overlap: Benchmark and Analysis},
  booktitle = {2016 45th International Conference on Parallel Processing (ICPP)},
  pages     = {258--267},
  month     = aug,
  year      = {2016},
  doi       = {10.1109/ICPP.2016.37},
  keywords  = {application program interfaces;message passing;parallel processing;MPI libraries;MPI overlap;hardware platforms;nonblocking MPI request;nonblocking point-to-point communications;sequential code;Benchmark testing;Hardware;Kernel;Libraries;Receivers;Time measurement;HPC;MPI;benchmark;overlap},
  mynote    = {benchmarks to measure comm/comp overlap},
}

% Cleaned publisher export: the second author's surname now uses the same
% escaped spelling as the other entries by this author in this file, the page
% range uses an en dash, month is a macro, and empty placeholders are removed.
@inproceedings{Klenk2017,
  author    = {B. Klenk and H. Fr{\"o}ning and H. Eberle and L. Dennison},
  title     = {Relaxations for High-Performance Message Passing on Massively Parallel {SIMT} Processors},
  booktitle = {2017 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages     = {855--865},
  month     = may,
  year      = {2017},
  doi       = {10.1109/IPDPS.2017.94},
  keywords  = {graphics processing units;message passing;multiprocessing systems;GPU;MPI;control flow switch;data transfer;general-purpose CPU;graphics processing unit;high-performance message passing;massively parallel SIMT processor;message passing interface;single instruction multiple thread architecture;traffic sinking;traffic sourcing;Computer architecture;Graphics processing units;Kernel;Message passing;Message systems;Semantics;Communication Models;GPU Computing;Heterogeneous systems;Message Passing},
  mynote    = {MPI on GPU},
}

@article{thakur2009test,
  author    = {Thakur, Rajeev and Gropp, William},
  title     = {Test suite for evaluating performance of multithreaded {MPI} communication},
  journal   = {Parallel Computing},
  volume    = {35},
  number    = {12},
  pages     = {608--617},
  year      = {2009},
  publisher = {Elsevier},
}


% Cleaned publisher export: en-dash page range, month macro, empty
% volume/number placeholders removed, and the garbled particle in the seventh
% author's name restored ("d." -> "de", parsed by BibTeX as the von part).
@inproceedings{goodell2010minimizing,
  author    = {D. Goodell and P. Balaji and D. Buntinas and G. Dozsa and W. Gropp
               and S. Kumar and B. R. de Supinski and R. Thakur},
  title     = {Minimizing {MPI} Resource Contention in Multithreaded Multicore Environments},
  booktitle = {2010 IEEE International Conference on Cluster Computing},
  pages     = {1--8},
  month     = sep,
  year      = {2010},
  doi       = {10.1109/CLUSTER.2010.11},
  issn      = {1552-5244},
  keywords  = {message passing;minimisation;multi-threading;shared memory systems;MPI object management;MPI resource contention minimization;high-performance computing systems;hybrid programming model;memory leak prevention;multithreaded MPI communication;multithreaded messaging rate;multithreaded multicore environments;reference counting;shared memory;Benchmark testing;Instruction sets;Message systems;Resource management;Semantics;TV;Transient analysis},
  mynote    = {overhead of object allocation},
}

% Proper nouns and acronyms in the title are brace-protected against
% sentence-casing; the second author's surname carries the same accent as in
% the Si:2014 entry of this file.
@inproceedings{si2015casper,
  author       = {Si, Min and Pe{\~n}a, Antonio J and Hammond, Jeff and Balaji, Pavan and Takagi, Masamichi and Ishikawa, Yutaka},
  title        = {{Casper}: An asynchronous progress model for {MPI} {RMA} on many-core architectures},
  booktitle    = {Parallel and Distributed Processing Symposium (IPDPS), 2015 IEEE International},
  pages        = {665--676},
  year         = {2015},
  organization = {IEEE},
}

@misc{openmpi,
  author       = {{OpenMPI Team}},
  title        = {{Open MPI}: Open Source High Performance Computing},
  howpublished = {\url{https://www.open-mpi.org/}},
}

% The title's dash is written as an em dash; a single hyphen would typeset
% "computing-to" as one hyphenated word.
@inproceedings{hoefler2008message,
  author       = {Hoefler, Torsten and Lumsdaine, Andrew},
  title        = {Message progression in parallel computing---to thread or not to thread?},
  booktitle    = {Cluster Computing, 2008 IEEE International Conference on},
  pages        = {213--222},
  year         = {2008},
  organization = {IEEE},
  mynote       = {progress thread that shares a core},
}
	
@misc{gpudirect,
  author       = {{NVIDIA}},
  title        = {{NVIDIA GPU Direct}},
  howpublished = {\url{https://developer.nvidia.com/gpudirect}},
}

	
% month uses the predefined macro so each style can format it; all other
% field values are unchanged from the publisher export.
@article{Shainer2011,
  author   = {Shainer, Gilad
              and Ayoub, Ali
              and Lui, Pak
              and Liu, Tong
              and Kagan, Michael
              and Trott, Christian R.
              and Scantlen, Greg
              and Crozier, Paul S.},
  title    = {The development of {Mellanox/NVIDIA} {GPUDirect} over {InfiniBand}---a new model for {GPU} to {GPU} communications},
  journal  = {Computer Science - Research and Development},
  year     = {2011},
  month    = jun,
  day      = {01},
  volume   = {26},
  number   = {3},
  pages    = {267--273},
  abstract = {The usage and adoption of General Purpose GPUs (GPGPU) in HPC systems
is increasing due to the unparalleled performance advantage of the GPUs and the
ability to fulfill the ever-increasing demands for floating points operations.
While the GPU can offload many of the application parallel computations, the
system architecture of a GPU-CPU-InfiniBand server does require the CPU to
initiate and manage memory transfers between remote GPUs via the high speed
InfiniBand network. In this paper we introduce for the first time a new
innovative technology---GPUDirect that enables Tesla GPUs to transfer data via
InfiniBand without the involvement of the CPU or buffer copies, hence
dramatically reducing the GPU communication time and increasing overall system
performance and efficiency. We also explore for the first time the performance
benefits of GPUDirect using Amber and LAMMPS applications.},
  issn     = {1865-2042},
  doi      = {10.1007/s00450-011-0157-1},
  url      = {https://doi.org/10.1007/s00450-011-0157-1},
}
	
	
% NOTE(review): disabled entry -- it has no leading at-sign, so BibTeX skips
% it as inter-entry text. Restore the at-sign to make it citable; confirm with
% the author that disabling was intended. The corporate author is
% double-braced so it is kept as one name once the entry is re-enabled.
misc{mpich,
  author       = {{MPICH Team}},
  title        = {{MPICH}},
  howpublished = {\url{https://www.mpich.org/}},
}

% The acronym in the title is brace-protected so sentence-casing styles do
% not lowercase it; all other field values are unchanged.
@inproceedings{datatype2012,
  author    = {Schneider, Timo and Gerstenberger, Robert and Hoefler, Torsten},
  editor    = {Tr{\"a}ff, Jesper Larsson and Benkner, Siegfried and Dongarra, Jack J.},
  title     = {Micro-applications for Communication Data Access Patterns and {MPI} Datatypes},
  booktitle = {Recent Advances in the Message Passing Interface},
  year      = {2012},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  pages     = {121--131},
  abstract  = {Data is often communicated from different locations in application memory and is commonly serialized (copied) to send buffers or from receive buffers. MPI datatypes are a way to avoid such intermediate copies and optimize communications, however, it is often unclear which implementation and optimization choices are most useful in practice. We extracted the send/recv-buffer access pattern of a representative set of scientific applications into micro-applications that isolate their data access patterns. We also observed that the buffer-access patterns in applications can be categorized into three different groups. Our micro-applications show that up to 90{\%} of the total communication time can be spent with local serialization and we found significant performance discrepancies between state-of-the-art MPI implementations. Our micro-applications aim to provide a standard benchmark for MPI datatype implementations to guide optimizations similarly to SPEC CPU and the Livermore loops do for compiler optimizations.},
  isbn      = {978-3-642-33518-1},
}

% The url points directly at the ACM Digital Library; the original went
% through a university library proxy host that only resolves for that
% institution's users. The accent is written as a BibTeX special character
% and the mixed-case system name in the title is brace-protected.
@inproceedings{Gysi:2016:DHS:3014904.3014974,
  author    = {Gysi, Tobias and B{\"a}r, Jeremia and Hoefler, Torsten},
  title     = {{dCUDA}: Hardware Supported Overlap of Computation and Communication},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  series    = {SC '16},
  year      = {2016},
  isbn      = {978-1-4673-8815-3},
  location  = {Salt Lake City, Utah},
  pages     = {52:1--52:12},
  articleno = {52},
  numpages  = {12},
  url       = {http://dl.acm.org/citation.cfm?id=3014904.3014974},
  acmid     = {3014974},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
  keywords  = {distributed memory, gpu, latency hiding, programming model, remote memory access},
}