@article{baevskiData2vecGeneralFramework2022,
title = {Data2vec: {{A General Framework}} for {{Self-supervised Learning}} in {{Speech}}, {{Vision}} and {{Language}}},
shorttitle = {Data2vec},
author = {Baevski, Alexei and Hsu, Wei-Ning and Xu, Qiantong and Babu, Arun and Gu, Jiatao and Auli, Michael},
year = {2022},
month = apr,
journal = {arXiv:2202.03555 [cs]},
eprint = {2202.03555},
eprinttype = {arxiv},
primaryclass = {cs},
abstract = {While the general idea of self-supervised learning is identical across modalities, the actual algorithms and objectives differ widely because they were developed with a single modality in mind. To get us closer to general self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech, NLP or computer vision. The core idea is to predict latent representations of the full input data based on a masked view of the input in a self-distillation setup using a standard Transformer architecture. Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which are local in nature, data2vec predicts contextualized latent representations that contain information from the entire input. Experiments on the major benchmarks of speech recognition, image classification, and natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches.},
archiveprefix = {arXiv},
keywords = {Computer Science - Machine Learning},
file = {/Users/siddhantray/Zotero/storage/EJVRNSTI/Baevski et al. - 2022 - data2vec A General Framework for Self-supervised .pdf;/Users/siddhantray/Zotero/storage/JTRHEY9V/2202.html}
}
@article{beltagyLongformerLongDocumentTransformer2020,
title = {Longformer: {{The Long-Document Transformer}}},
shorttitle = {Longformer},
author = {Beltagy, Iz and Peters, Matthew E. and Cohan, Arman},
year = {2020},
month = dec,
journal = {arXiv:2004.05150 [cs]},
eprint = {2004.05150},
eprinttype = {arxiv},
primaryclass = {cs},
abstract = {Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization dataset.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/siddhantray/Zotero/storage/UJPK2ACQ/Beltagy et al. - 2020 - Longformer The Long-Document Transformer.pdf;/Users/siddhantray/Zotero/storage/GJKBG4VZ/2004.html}
}
@article{brownLanguageModelsAre2020,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and {Herbert-Voss}, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
year = {2020},
month = jul,
journal = {arXiv:2005.14165 [cs]},
eprint = {2005.14165},
eprinttype = {arxiv},
primaryclass = {cs},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/siddhantray/Zotero/storage/DKX47QSK/Brown et al. - 2020 - Language Models are Few-Shot Learners.pdf;/Users/siddhantray/Zotero/storage/2Z8JUV56/2005.html}
}
@article{devlinBERTPretrainingDeep2019,
title = {{{BERT}}: {{Pre-training}} of {{Deep Bidirectional Transformers}} for {{Language Understanding}}},
shorttitle = {{{BERT}}},
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
year = {2019},
month = may,
journal = {arXiv:1810.04805 [cs]},
eprint = {1810.04805},
eprinttype = {arxiv},
primaryclass = {cs},
abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5\% (7.7\% point absolute improvement), MultiNLI accuracy to 86.7\% (4.6\% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language},
file = {/Users/siddhantray/Zotero/storage/2YT9M2VZ/Devlin et al. - 2019 - BERT Pre-training of Deep Bidirectional Transform.pdf;/Users/siddhantray/Zotero/storage/6H2S6WY2/1810.html}
}
@article{dosovitskiyImageWorth16x162021,
title = {An {{Image}} Is {{Worth}} 16x16 {{Words}}: {{Transformers}} for {{Image Recognition}} at {{Scale}}},
shorttitle = {An {{Image}} Is {{Worth}} 16x16 {{Words}}},
author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
year = {2021},
month = jun,
journal = {arXiv:2010.11929 [cs]},
eprint = {2010.11929},
eprinttype = {arxiv},
primaryclass = {cs},
abstract = {While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/siddhantray/Zotero/storage/6D65SQTJ/Dosovitskiy et al. - 2021 - An Image is Worth 16x16 Words Transformers for Im.pdf;/Users/siddhantray/Zotero/storage/832627XR/2010.html}
}
@book{goodfellowDeepLearning2016,
title = {Deep {{Learning}}},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
year = {2016},
month = nov,
publisher = {{MIT Press}},
abstract = {An introduction to a broad range of topics in deep learning, covering mathematical and conceptual background, deep learning techniques used in industry, and research perspectives.``Written by three experts in the field, Deep Learning is the only comprehensive book on the subject.''\textemdash Elon Musk, cochair of OpenAI; cofounder and CEO of Tesla and SpaceXDeep learning is a form of machine learning that enables computers to learn from experience and understand the world in terms of a hierarchy of concepts. Because the computer gathers knowledge from experience, there is no need for a human computer operator to formally specify all the knowledge that the computer needs. The hierarchy of concepts allows the computer to learn complicated concepts by building them out of simpler ones; a graph of these hierarchies would be many layers deep. This book introduces a broad range of topics in deep learning. The text offers mathematical and conceptual background, covering relevant concepts in linear algebra, probability theory and information theory, numerical computation, and machine learning. It describes deep learning techniques used by practitioners in industry, including deep feedforward networks, regularization, optimization algorithms, convolutional networks, sequence modeling, and practical methodology; and it surveys such applications as natural language processing, speech recognition, computer vision, online recommendation systems, bioinformatics, and videogames. Finally, the book offers research perspectives, covering such theoretical topics as linear factor models, autoencoders, representation learning, structured probabilistic models, Monte Carlo methods, the partition function, approximate inference, and deep generative models. Deep Learning can be used by undergraduate or graduate students planning careers in either industry or research, and by software engineers who want to begin using deep learning in their products or platforms. A website offers supplementary material for both readers and instructors.},
googlebooks = {Np9SDQAAQBAJ},
isbn = {978-0-262-03561-3},
langid = {english},
keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science}
}
@misc{heMaskedAutoencodersAre2021,
title = {Masked {{Autoencoders Are Scalable Vision Learners}}},
author = {He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
year = {2021},
month = dec,
number = {arXiv:2111.06377},
eprint = {2111.06377},
eprinttype = {arxiv},
primaryclass = {cs},
institution = {{arXiv}},
abstract = {This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask tokens. Second, we find that masking a high proportion of the input image, e.g., 75\%, yields a nontrivial and meaningful self-supervisory task. Coupling these two designs enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8\%) among methods that use only ImageNet-1K data. Transfer performance in downstream tasks outperforms supervised pre-training and shows promising scaling behavior.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {/Users/siddhantray/Zotero/storage/LVGPABTP/He et al. - 2021 - Masked Autoencoders Are Scalable Vision Learners.pdf;/Users/siddhantray/Zotero/storage/IQXS7TNN/2111.html}
}
@inproceedings{hePERTPayloadEncoding2020,
title = {{{PERT}}: {{Payload Encoding Representation}} from {{Transformer}} for {{Encrypted Traffic Classification}}},
shorttitle = {{{PERT}}},
booktitle = {2020 {{ITU Kaleidoscope}}: {{Industry-Driven Digital Transformation}} ({{ITU K}})},
author = {He, Hong Ye and Guo Yang, Zhi and Chen, Xiang Ning},
year = {2020},
month = dec,
pages = {1--8},
doi = {10.23919/ITUK50268.2020.9303204},
abstract = {Traffic identification becomes more important yet more challenging as related encryption techniques are rapidly developing nowadays. In difference to recent deep learning methods that apply image processing to solve such encrypted traffic problems, in this paper, we propose a method named Payload Encoding Representation from Transformer (PERT) to perform automatic traffic feature extraction using a state-of-the-art dynamic word embedding technique. Based on this, we further provide a traffic classification framework in which unlabeled traffic is utilized to pre-train an encoding network that learns the contextual distribution of traffic payload bytes. Then, the downward classification reuses the pre-trained network to obtain an enhanced classification result. By implementing experiments on a public encrypted traffic data set and our captured Android HTTPS traffic, we prove the proposed method can achieve an obvious better effectiveness than other compared baselines. To the best of our knowledge, this is the first time the encrypted traffic classification with the dynamic word embedding alone with its pre-training strategy has been addressed.},
keywords = {Cryptography,Deep learning,dynamic word embedding,encrypted traffic classification,Feature extraction,Image coding,natural language processing,Payloads,Task analysis,Telecommunication traffic,traffic identification},
file = {/Users/siddhantray/Zotero/storage/NJPKY52R/He et al. - 2020 - PERT Payload Encoding Representation from Transfo.pdf;/Users/siddhantray/Zotero/storage/7RXZ67EC/9303204.html}
}
@inproceedings{jayDeepReinforcementLearning2019,
title = {A {{Deep Reinforcement Learning Perspective}} on {{Internet Congestion Control}}},
booktitle = {Proceedings of the 36th {{International Conference}} on {{Machine Learning}}},
author = {Jay, Nathan and Rotman, Noga and Godfrey, Brighten and Schapira, Michael and Tamar, Aviv},
year = {2019},
month = may,
pages = {3050--3059},
publisher = {{PMLR}},
issn = {2640-3498},
abstract = {We present and investigate a novel and timely application domain for deep reinforcement learning (RL): Internet congestion control. Congestion control is the core networking task of modulating traffic sources' data-transmission rates to efficiently utilize network capacity, and is the subject of extensive attention in light of the advent of Internet services such as live video, virtual reality, Internet-of-Things, and more. We show that casting congestion control as RL enables training deep network policies that capture intricate patterns in data traffic and network conditions, and leverage this to outperform the state-of-the-art. We also highlight significant challenges facing real-world adoption of RL-based congestion control, including fairness, safety, and generalization, which are not trivial to address within conventional RL formalism. To facilitate further research and reproducibility of our results, we present a test suite for RL-guided congestion control based on the OpenAI Gym interface.},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/CPD88YJG/Jay et al. - 2019 - A Deep Reinforcement Learning Perspective on Inter.pdf}
}
@inproceedings{maoNeuralAdaptiveVideo2017,
title = {Neural {{Adaptive Video Streaming}} with {{Pensieve}}},
booktitle = {Proceedings of the {{Conference}} of the {{ACM Special Interest Group}} on {{Data Communication}}},
author = {Mao, Hongzi and Netravali, Ravi and Alizadeh, Mohammad},
year = {2017},
month = aug,
pages = {197--210},
publisher = {{ACM}},
address = {{Los Angeles CA USA}},
doi = {10.1145/3098822.3098843},
abstract = {Client-side video players employ adaptive bitrate (ABR) algorithms to optimize user quality of experience (QoE). Despite the abundance of recently proposed schemes, state-of-the-art ABR algorithms suffer from a key limitation: they use fixed control rules based on simplified or inaccurate models of the deployment environment. As a result, existing schemes inevitably fail to achieve optimal performance across a broad set of network conditions and QoE objectives. We propose Pensieve, a system that generates ABR algorithms using reinforcement learning (RL). Pensieve trains a neural network model that selects bitrates for future video chunks based on observations collected by client video players. Pensieve does not rely on pre-programmed models or assumptions about the environment. Instead, it learns to make ABR decisions solely through observations of the resulting performance of past decisions. As a result, Pensieve automatically learns ABR algorithms that adapt to a wide range of environments and QoE metrics. We compare Pensieve to state-of-theart ABR algorithms using trace-driven and real world experiments spanning a wide variety of network conditions, QoE metrics, and video properties. In all considered scenarios, Pensieve outperforms the best state-of-the-art scheme, with improvements in average QoE of 12\%\textendash 25\%. Pensieve also generalizes well, outperforming existing schemes even on networks for which it was not explicitly trained.},
isbn = {978-1-4503-4653-5},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/XCG5ZBUF/Mao et al. - 2017 - Neural Adaptive Video Streaming with Pensieve.pdf}
}
@misc{nickelPoincarEmbeddingsLearning2017,
  title = {Poincar\'e {{Embeddings}} for {{Learning Hierarchical Representations}}},
author = {Nickel, Maximilian and Kiela, Douwe},
year = {2017},
month = may,
number = {arXiv:1705.08039},
eprint = {1705.08039},
eprinttype = {arxiv},
primaryclass = {cs, stat},
institution = {{arXiv}},
  abstract = {Representation learning has become an invaluable approach for learning from symbolic data such as text and graphs. However, while complex symbolic datasets often exhibit a latent hierarchical structure, state-of-the-art methods typically learn embeddings in Euclidean vector spaces, which do not account for this property. For this purpose, we introduce a new approach for learning hierarchical representations of symbolic data by embedding them into hyperbolic space -- or more precisely into an n-dimensional Poincar\'e ball. Due to the underlying hyperbolic geometry, this allows us to learn parsimonious representations of symbolic data by simultaneously capturing hierarchy and similarity. We introduce an efficient algorithm to learn the embeddings based on Riemannian optimization and show experimentally that Poincar\'e embeddings outperform Euclidean embeddings significantly on data with latent hierarchies, both in terms of representation capacity and in terms of generalization ability.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/siddhantray/Zotero/storage/N33SFVNB/Nickel and Kiela - 2017 - Poincar'e Embeddings for Learning Hierarchical Re.pdf;/Users/siddhantray/Zotero/storage/ZZ2XINT2/1705.html}
}
@misc{sarkarNs3dumbelltopologysimulation2022,
title = {Ns3-Dumbell-Topology-Simulation},
author = {Sarkar, Pritam},
year = {2022},
month = mar,
abstract = {Analyze and compare TCP Reno, TCP Westwood, and TCP Fack performance using NS3 simulator},
copyright = {MIT},
keywords = {congestion-loss,dumbbell-topology,ns3-simulator,routers,tcp-reno,tcp-westwood,throughput,topology}
}
@misc{Spearmint2020,
title = {Spearmint},
year = {2020},
month = mar,
abstract = {Spearmint Bayesian optimization codebase},
howpublished = {Stanford Systems and Networking Research}
}
@inproceedings{vaswaniAttentionAllYou2017,
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
booktitle = {Advances in Neural Information Processing Systems},
editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
publisher = {Curran Associates, Inc.},
title = {Attention is All you Need},
url = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf},
volume = {30},
year = {2017}
}
@misc{wettigShouldYouMask,
doi = {10.48550/ARXIV.2202.08005},
url = {https://arxiv.org/abs/2202.08005},
author = {Wettig, Alexander and Gao, Tianyu and Zhong, Zexuan and Chen, Danqi},
keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Should You Mask 15\% in Masked Language Modeling?},
publisher = {arXiv},
month = {May},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{xiaAutomaticCurriculumGeneration2022,
title = {Automatic {{Curriculum Generation}} for {{Learning Adaptation}} in {{Networking}}},
author = {Xia, Zhengxu and Zhou, Yajie and Yan, Francis Y. and Jiang, Junchen},
year = {2022},
month = feb,
number = {arXiv:2202.05940},
eprint = {2202.05940},
eprinttype = {arxiv},
primaryclass = {cs},
institution = {{arXiv}},
abstract = {As deep reinforcement learning (RL) showcases its strengths in networking and systems, its pitfalls also come to the public's attention--when trained to handle a wide range of network workloads and previously unseen deployment environments, RL policies often manifest suboptimal performance and poor generalizability. To tackle these problems, we present Genet, a new training framework for learning better RL-based network adaptation algorithms. Genet is built on the concept of curriculum learning, which has proved effective against similar issues in other domains where RL is extensively employed. At a high level, curriculum learning gradually presents more difficult environments to the training, rather than choosing them randomly, so that the current RL model can make meaningful progress in training. However, applying curriculum learning in networking is challenging because it remains unknown how to measure the "difficulty" of a network environment. Instead of relying on handcrafted heuristics to determine the environment's difficulty level, our insight is to utilize traditional rule-based (non-RL) baselines: If the current RL model performs significantly worse in a network environment than the baselines, then the model's potential to improve when further trained in this environment is substantial. Therefore, Genet automatically searches for the environments where the current model falls significantly behind a traditional baseline scheme and iteratively promotes these environments as the training progresses. Through evaluating Genet on three use cases--adaptive video streaming, congestion control, and load balancing, we show that Genet produces RL policies which outperform both regularly trained RL policies and traditional baselines in each context, not only under synthetic workloads but also in real environments.},
archiveprefix = {arXiv},
keywords = {Computer Science - Networking and Internet Architecture},
file = {/Users/siddhantray/Zotero/storage/XRSJ8ZTI/Xia et al. - 2022 - Automatic Curriculum Generation for Learning Adapt.pdf;/Users/siddhantray/Zotero/storage/MEQ4UVFL/2202.html}
}
@article{yanLearningSituRandomized,
title = {Learning in Situ: A Randomized Experiment in Video Streaming},
author = {Yan, Francis Y and Hong, James and Ayers, Hudson and Zhang, Keyi and Zhu, Chenzhi and Levis, Philip and Fouladi, Sadjad and Winstein, Keith},
pages = {18},
abstract = {We describe the results of a randomized controlled trial of video-streaming algorithms for bitrate selection and network prediction. Over the last year, we have streamed 38.6 years of video to 63,508 users across the Internet. Sessions are randomized in blinded fashion among algorithms.},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/JFIJ86KI/Yan et al. - Learning in situ a randomized experiment in video.pdf}
}
@article{yanPantheonTrainingGround,
title = {Pantheon: The Training Ground for {{Internet}} Congestion-Control Research},
author = {Yan, Francis Y and Ma, Jestin and Hill, Greg D and Raghavan, Deepti and Wahby, Riad S and Levis, Philip and Winstein, Keith},
pages = {13},
abstract = {Internet transport algorithms are foundational to the performance of network applications. But a number of practical challenges make it difficult to evaluate new ideas and algorithms in a reproducible manner. We present the Pantheon, a system that addresses this by serving as a community ``training ground'' for research on Internet transport protocols and congestion control (https: //pantheon.stanford.edu). It allows network researchers to benefit from and contribute to a common set of benchmark algorithms, a shared evaluation platform, and a public archive of results.},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/2YYZCDPL/Yan et al. - Pantheon the training ground for Internet congest.pdf}
}
@inproceedings{zaheerDeepSets2018,
author = {Zaheer, Manzil and Kottur, Satwik and Ravanbhakhsh, Siamak and P\'{o}czos, Barnab\'{a}s and Salakhutdinov, Ruslan and Smola, Alexander J},
title = {Deep Sets},
year = {2017},
isbn = {9781510860964},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
abstract = {We study the problem of designing models for machine learning tasks defined on sets. In contrast to traditional approach of operating on fixed dimensional vectors, we consider objective functions defined on sets that are invariant to permutations. Such problems are widespread, ranging from estimation of population statistics [1], to anomaly detection in piezometer data of embankment dams [2], to cosmology [3, 4]. Our main theorem characterizes the permutation invariant functions and provides a family of functions to which any permutation invariant objective function must belong. This family of functions has a special structure which enables us to design a deep network architecture that can operate on sets and which can be deployed on a variety of scenarios including both unsupervised and supervised learning tasks. We also derive the necessary and sufficient conditions for permutation equivariance in deep models. We demonstrate the applicability of our method on population statistic estimation, point cloud classification, set expansion, and outlier detection.},
booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},
pages = {3394–3404},
numpages = {11},
location = {Long Beach, California, USA},
series = {NIPS'17}
}
@inproceedings{zhangMimicNetFastPerformance2021,
title = {{{MimicNet}}: Fast Performance Estimates for Data Center Networks with Machine Learning},
shorttitle = {{{MimicNet}}},
booktitle = {Proceedings of the 2021 {{ACM SIGCOMM}} 2021 {{Conference}}},
author = {Zhang, Qizhen and Ng, Kelvin K. W. and Kazer, Charles and Yan, Shen and Sedoc, Jo{\~a}o and Liu, Vincent},
year = {2021},
month = aug,
pages = {287--304},
publisher = {{ACM}},
address = {{Virtual Event USA}},
doi = {10.1145/3452296.3472926},
abstract = {At-scale evaluation of new data center network innovations is becoming increasingly intractable. This is true for testbeds, where few, if any, can afford a dedicated, full-scale replica of a data center. It is also true for simulations, which while originally designed for precisely this purpose, have struggled to cope with the size of today's networks.},
isbn = {978-1-4503-8383-7},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/EGUVK8RD/Zhang et al. - 2021 - MimicNet fast performance estimates for data cent.pdf}
}
@inproceedings{zhuNetworkPlanningDeep2021,
title = {Network Planning with Deep Reinforcement Learning},
booktitle = {Proceedings of the 2021 {{ACM SIGCOMM}} 2021 {{Conference}}},
author = {Zhu, Hang and Gupta, Varun and Ahuja, Satyajeet Singh and Tian, Yuandong and Zhang, Ying and Jin, Xin},
year = {2021},
month = aug,
pages = {258--271},
publisher = {{ACM}},
address = {{Virtual Event USA}},
doi = {10.1145/3452296.3472902},
abstract = {Network planning is critical to the performance, reliability and cost of web services. This problem is typically formulated as an Integer Linear Programming (ILP) problem. Today's practice relies on handtuned heuristics from human experts to address the scalability challenge of ILP solvers. In this paper, we propose NeuroPlan, a deep reinforcement learning (RL) approach to solve the network planning problem. This problem involves multi-step decision making and cost minimization, which can be naturally cast as a deep RL problem. We develop two important domain-specific techniques. First, we use a graph neural network (GNN) and a novel domain-specific node-link transformation for state encoding, in order to handle the dynamic nature of the evolving network topology during planning decision making. Second, we leverage a two-stage hybrid approach that first uses deep RL to prune the search space and then uses an ILP solver to find the optimal solution. This approach resembles today's practice, but avoids human experts with an RL agent in the first stage. Evaluation on real topologies and setups from large production networks demonstrates that NeuroPlan scales to large topologies beyond the capability of ILP solvers, and reduces the cost by up to 17\% compared to hand-tuned heuristics.},
isbn = {978-1-4503-8383-7},
langid = {english},
file = {/Users/siddhantray/Zotero/storage/96T28PR9/Zhu et al. - 2021 - Network planning with deep reinforcement learning.pdf}
}
@article{Robbins2007ASA,
  title={A Stochastic Approximation Method},
  author={Herbert Robbins and Sutton Monro},
  journal={Annals of Mathematical Statistics},
  year={1951},
  volume={22},
  pages={400-407}
}
@inproceedings{rnnattention,
title = "Neural machine translation by jointly learning to align and translate",
author = "Bahdanau, Dzmitry and Cho, Kyung Hyun and Bengio, Yoshua",
year = "2015",
booktitle = "3rd International Conference on Learning Representations, ICLR 2015",
}
@misc{trans,
title = {The Illustrated Transformer},
author = {Jay Alammar},
howpublished = {https://jalammar.github.io/illustrated-transformer/},
note = {Accessed: 2022-07-15}
}
@misc{shaw2018selfattention,
title={Self-Attention with Relative Position Representations},
author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
year={2018},
eprint={1803.02155},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{gehring2017convolutional,
author = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N.},
title = {Convolutional Sequence to Sequence Learning},
year = {2017},
publisher = {JMLR.org},
abstract = {The prevalent approach to sequence to sequence learning maps an input sequence to a variable length output sequence via recurrent neural networks. We introduce an architecture based entirely on convolutional neural networks. Compared to recurrent models, computations over all elements can be fully parallelized during training to better exploit the GPU hardware and optimization is easier since the number of non-linearities is fixed and independent of the input length. Our use of gated linear units eases gradient propagation and we equip each decoder layer with a separate attention module. We outperform the accuracy of the deep LSTM setup of Wu et al. (2016) on both WMT'14 English-German and WMT'14 English-French translation at an order of magnitude faster speed, both on GPU and CPU.*},
booktitle = {Proceedings of the 34th International Conference on Machine Learning - Volume 70},
pages = {1243–1252},
numpages = {10},
location = {Sydney, NSW, Australia},
series = {ICML'17}
}
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
@Inbook{ns3,
author="Riley, George F.
and Henderson, Thomas R.",
title="The ns-3 Network Simulator",
bookTitle="Modeling and Tools for Network Simulation",
year="2010",
publisher="Springer Berlin Heidelberg",
address="Berlin, Heidelberg",
pages="15--34",
abstract="As networks of computing devices grow larger and more complex, the need for highly accurate and scalable network simulation technologies becomes critical. Despite the emergence of large-scale testbeds for network research, simulation still plays a vital role in terms of scalability (both in size and in experimental speed), reproducibility, rapid prototyping, and education. With simulation based studies, the approach can be studied in detail at varying scales, with varying data applications, varying field conditions, and will result in reproducible and analyzable results.",
isbn="978-3-642-12331-3",
doi="10.1007/978-3-642-12331-3_2",
url="https://doi.org/10.1007/978-3-642-12331-3_2"
}
@article{generalizingdnn,
title={Exploring generalization in deep learning},
author={Neyshabur, Behnam and Bhojanapalli, Srinadh and McAllester, David and Srebro, Nati},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
@inproceedings{homa,
author = {Montazeri, Behnam and Li, Yilong and Alizadeh, Mohammad and Ousterhout, John},
title = {Homa: A Receiver-Driven Low-Latency Transport Protocol Using Network Priorities},
year = {2018},
isbn = {9781450355674},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3230543.3230564},
doi = {10.1145/3230543.3230564},
abstract = {Homa is a new transport protocol for datacenter networks. It provides exceptionally low latency, especially for workloads with a high volume of very short messages, and it also supports large messages and high network utilization. Homa uses in-network priority queues to ensure low latency for short messages; priority allocation is managed dynamically by each receiver and integrated with a receiver-driven flow control mechanism. Homa also uses controlled overcommitment of receiver downlinks to ensure efficient bandwidth utilization at high load. Our implementation of Homa delivers 99th percentile round-trip times less than 15 μs for short messages on a 10 Gbps network running at 80% load. These latencies are almost 100x lower than the best published measurements of an implementation. In simulations, Homa's latency is roughly equal to pFabric and significantly better than pHost, PIAS, and NDP for almost all message sizes and workloads. Homa can also sustain higher network loads than pFabric, pHost, or PIAS.},
booktitle = {Proceedings of the 2018 Conference of the ACM Special Interest Group on Data Communication},
pages = {221–235},
numpages = {15},
keywords = {data centers, network stacks, low latency, transport protocols},
location = {Budapest, Hungary},
series = {SIGCOMM '18}
}
@article{closemask,
author = {Wilson L. Taylor},
title = {Cloze Procedure: A New Tool for Measuring Readability},
journal = {Journalism Quarterly},
volume = {30},
number = {4},
pages = {415-433},
year = {1953},
doi = {10.1177/107769905303000401},
URL = {https://doi.org/10.1177/107769905303000401},
eprint = {https://doi.org/10.1177/107769905303000401},
abstract = { Here is the first comprehensive statement of a research method and its theory which were introduced briefly during a workshop at the 1953 AEJ convention. Included are findings from three pilot studies and two experiments in which “cloze procedure” results are compared with those of two readability formulas. }
}
@inproceedings{poolcv,
author={Babenko, Artem and Lempitsky, Victor},
booktitle={2015 IEEE International Conference on Computer Vision (ICCV)},
title={Aggregating Local Deep Features for Image Retrieval},
year={2015},
pages={1269-1277},
doi={10.1109/ICCV.2015.150}
}
@book{scaling,
author = {Grus, Joel},
title = {Data Science from Scratch: First Principles with Python},
year = {2015},
isbn = {149190142X},
publisher = {O'Reilly Media, Inc.},
edition = {1st},
abstract = {Data science libraries, frameworks, modules, and toolkits are great for doing data science, but they're also a good way to dive into the discipline without actually understanding data science. In this book, you'll learn how many of the most fundamental data science tools and algorithms work by implementing them from scratch. If you have an aptitude for mathematics and some programming skills, author Joel Grus will help you get comfortable with the math and statistics at the core of data science, and with hacking skills you need to get started as a data scientist. Today's messy glut of data holds answers to questions no one's even thought to ask. This book provides you with the know-how to dig those answers out.}
}
@book{arima,
author = {Box, George Edward Pelham and Jenkins, Gwilym},
title = {Time Series Analysis, Forecasting and Control},
year = {1990},
isbn = {0816211043},
publisher = {Holden-Day, Inc.},
address = {USA}
}
@book{arma,
title={Hypothesis Testing in Time Series Analysis},
author={Whittle, P.},
isbn={9780598919823},
lccn={52021616},
series={Statistics / Uppsala universitet},
url={https://books.google.ch/books?id=nE\_QAAAAMAAJ},
year={1951},
publisher={Almqvist \& Wiksells boktr.}
}
@book{ewma,
title={Smoothing, Forecasting and Prediction of Discrete Time Series},
author={Brown, R.G.},
isbn={9780486495927},
lccn={2004043896},
series={Dover Phoenix Editions},
url={https://books.google.ch/books?id=XXFNW\_QaJYgC},
year={2004},
publisher={Dover Publications}
}
@article{arimasuc,
ISSN = {01621459},
URL = {http://www.jstor.org/stable/2669408},
author = {Ruey S. Tsay},
journal = {Journal of the American Statistical Association},
number = {450},
pages = {638--643},
publisher = {[American Statistical Association, Taylor & Francis, Ltd.]},
title = {Time Series and Forecasting: Brief History and Future Research},
urldate = {2022-07-25},
volume = {95},
year = {2000}
}
@article{lstm,
author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen},
title = {Long Short-Term Memory},
year = {1997},
issue_date = {November 15, 1997},
publisher = {MIT Press},
address = {Cambridge, MA, USA},
volume = {9},
number = {8},
issn = {0899-7667},
url = {https://doi.org/10.1162/neco.1997.9.8.1735},
doi = {10.1162/neco.1997.9.8.1735},
abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O. 1. Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.},
journal = {Neural Comput.},
month = {nov},
pages = {1735–1780},
numpages = {46}
}
@inproceedings{bilstm,
title = "Modelling Radiological Language with Bidirectional Long Short-Term Memory Networks",
author = "Cornegruta, Savelie and
Bakewell, Robert and
Withey, Samuel and
Montana, Giovanni",
booktitle = "Proceedings of the Seventh International Workshop on Health Text Mining and Information Analysis",
month = nov,
year = "2016",
address = "Auxtin, TX",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W16-6103",
doi = "10.18653/v1/W16-6103",
pages = "17--27",
}
@inproceedings{puffer,
author = {Francis Y. Yan and Hudson Ayers and Chenzhi Zhu and Sadjad Fouladi and James Hong and Keyi Zhang and Philip Levis and Keith Winstein},
title = {Learning in situ: a randomized experiment in video streaming},
booktitle = {17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)},
year = {2020},
isbn = {978-1-939133-13-7},
address = {Santa Clara, CA},
pages = {495--511},
url = {https://www.usenix.org/conference/nsdi20/presentation/yan},
publisher = {USENIX Association},
month = feb,
}
@misc{caida,
title = {The CAIDA Anonymized Internet Traces Data Access},
key = {CAIDA},
howpublished = {\url{https://www.caida.org/catalog/datasets/passive_dataset_download/}}
}
@misc{mlab,
title = {Measurement Lab},
key = {m-lab},
howpublished = {\url{https://www.measurementlab.net/}}
}
@misc{crawdad,
title = {Crawdad},
key = {crawdad},
howpublished = {\url{https://crawdad.org/}}
}
@misc{rocketfuel,
title = {Rocketfuel: An ISP Topology Mapping Engine},
key = {Rocketfuel: UWash},
howpublished = {\url{https://research.cs.washington.edu/networking/rocketfuel/}}
}
@inproceedings{kazemianHeaderSpaceAnalysis,
author = {Peyman Kazemian and George Varghese and Nick McKeown},
title = {Header Space Analysis: Static Checking for Networks},
booktitle = {9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)},
year = {2012},
isbn = {978-931971-92-8},
address = {San Jose, CA},
pages = {113--126},
url = {https://www.usenix.org/conference/nsdi12/technical-sessions/presentation/kazemian},
publisher = {USENIX Association},
month = apr,
}
@misc{kd,
doi = {10.48550/ARXIV.1503.02531},
url = {https://arxiv.org/abs/1503.02531},
author = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Distilling the Knowledge in a Neural Network},
publisher = {arXiv},
year = {2015},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@book{kairouzAdvancesOpenProblems2021,
author={Kairouz, Peter and McMahan, H. Brendan and Avent, Brendan and Bellet, Aurélien and Bennis, Mehdi and Bhagoji, Arjun Nitin and Bonawitz, Kallista and Charles, Zachary and Cormode, Graham and Cummings, Rachel and D’Oliveira, Rafael G. L. and Eichner, Hubert and El Rouayheb, Salim and Evans, David and Gardner, Josh and Garrett, Zachary and Gascón, Adrià and Ghazi, Badih and Gibbons, Phillip B. and Gruteser, Marco and Harchaoui, Zaid and He, Chaoyang and He, Lie and Huo, Zhouyuan and Hutchinson, Ben and Hsu, Justin and Jaggi, Martin and Javidi, Tara and Joshi, Gauri and Khodak, Mikhail and Konecný, Jakub and Korolova, Aleksandra and Koushanfar, Farinaz and Koyejo, Sanmi and Lepoint, Tancrède and Liu, Yang and Mittal, Prateek and Mohri, Mehryar and Nock, Richard and Özgür, Ayfer and Pagh, Rasmus and Qi, Hang and Ramage, Daniel and Raskar, Ramesh and Raykova, Mariana and Song, Dawn and Song, Weikang and Stich, Sebastian U. and Sun, Ziteng and Theertha Suresh, Ananda and Tramèr, Florian and Vepakomma, Praneeth and Wang, Jianyu and Xiong, Li and Xu, Zheng and Yang, Qiang and Yu, Felix X. and Yu, Han and Zhao, Sen},
title={Advances and Open Problems in Federated Learning},
year={2021},
abstract={The term Federated Learning was coined as recently as 2016 to describe a machine learning setting where multiple entities collaborate in solving a machine learning problem, under the coordination of a central server or service provider. Each client’s raw data is stored locally and not exchanged or transferred; instead, focused updates intended for immediate aggregation are used to achieve the learning objective. Since then, the topic has gathered much interest across many different disciplines and the realization that solving many of these interdisciplinary problems likely requires not just machine learning but techniques from distributed optimization, cryptography, security, differential privacy, fairness, compressed sensing, systems, information theory, statistics, and more. This monograph has contributions from leading experts across the disciplines, who describe the latest state-of-the art from their perspective. These contributions have been carefully curated into a comprehensive treatment that enables the reader to understand the work that has been done and get pointers to where effort is required to solve many of the problems before Federated Learning can become a reality in practical systems. Researchers working in the area of distributed systems will find this monograph an enlightening read that may inspire them to work on the many challenging issues that are outlined. This monograph will get the reader up to speed quickly and easily on what is likely to become an increasingly important topic: Federated Learning.},
publisher={Now Publishers},
url={https://ieeexplore.ieee.org/document/9464278}
}
@incollection{pytorch,
title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
booktitle = {Advances in Neural Information Processing Systems 32},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {8024--8035},
year = {2019},
publisher = {Curran Associates, Inc.},
url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf}
}
@article{pytorchlit,
title={PyTorch Lightning},
author={William {Falcon et al.}},
journal={GitHub. Note: https://github.com/PyTorchLightning/pytorch-lightning},
volume={3},
year={2019}
}
@article{dropout,
author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
year = {2014},
issue_date = {January 2014},
publisher = {JMLR.org},
volume = {15},
number = {1},
issn = {1532-4435},
abstract = {Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different "thinned" networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets.},
journal = {J. Mach. Learn. Res.},
month = {jan},
pages = {1929–1958},
numpages = {30},
keywords = {regularization, model combination, deep learning, neural networks}
}
@inproceedings{weightdecay,
author = {Krogh, Anders and Hertz, John A.},
title = {A Simple Weight Decay Can Improve Generalization},
year = {1991},
isbn = {1558602224},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
abstract = {It has been observed in numerical simulations that a weight decay can improve generalization in a feed-forward neural network. This paper explains why. It is proven that a weight decay has two effects in a linear network. First, it suppresses any irrelevant components of the weight vector by choosing the smallest vector that solves the learning problem. Second, if the size is chosen right, a weight decay can suppress some of the effects of static noise on the targets, which improves generalization quite a lot. It is then shown how to extend these results to networks with hidden layers and non-linear units. Finally the theory is confirmed by some numerical simulations using the data from NetTalk.},
booktitle = {Proceedings of the 4th International Conference on Neural Information Processing Systems},
pages = {950–957},
numpages = {8},
location = {Denver, Colorado},
series = {NIPS'91}
}
@misc{layernorm,
doi = {10.48550/ARXIV.1607.06450},
url = {https://arxiv.org/abs/1607.06450},
author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Layer Normalization},
publisher = {arXiv},
year = {2016},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@inproceedings{adam,
author = {Diederik P. Kingma and
Jimmy Ba},
editor = {Yoshua Bengio and
Yann LeCun},
title = {Adam: {A} Method for Stochastic Optimization},
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
year = {2015},
url = {http://arxiv.org/abs/1412.6980},
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{huber,
author = {Peter J. Huber},
title = {{Robust Estimation of a Location Parameter}},
volume = {35},
journal = {The Annals of Mathematical Statistics},
number = {1},
publisher = {Institute of Mathematical Statistics},
pages = {73--101},
year = {1964},
doi = {10.1214/aoms/1177703732},
url = {https://doi.org/10.1214/aoms/1177703732}
}
@misc{newhope,
doi = {10.48550/ARXIV.2207.05843},
url = {https://arxiv.org/abs/2207.05843},
author = {Dietmüller, Alexander and Ray, Siddhant and Jacob, Romain and Vanbever, Laurent},
keywords = {Networking and Internet Architecture (cs.NI), Machine Learning (cs.LG), FOS: Computer and information sciences},
title = {A new hope for network model generalization},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution Share Alike 4.0 International}
}
@inproceedings{classic,
author = {Abbasloo, Soheil and Yen, Chen-Yu and Chao, H. Jonathan},
title = {Classic Meets Modern: A Pragmatic Learning-Based Congestion Control for the Internet},
year = {2020},
isbn = {9781450379557},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3387514.3405892},
doi = {10.1145/3387514.3405892},
abstract = {These days, taking the revolutionary approach of using clean-slate learning-based designs to completely replace the classic congestion control schemes for the Internet is gaining popularity. However, we argue that current clean-slate learning-based techniques bring practical issues and concerns such as overhead, convergence issues, and low performance over unseen network conditions to the table. To address these issues, we take a pragmatic and evolutionary approach combining classic congestion control strategies and advanced modern deep reinforcement learning (DRL) techniques and introduce a novel hybrid congestion control for the Internet named Orca. Through extensive experiments done over global testbeds on the Internet and various locally emulated network conditions, we demonstrate that Orca is adaptive and achieves consistent high performance in different network conditions, while it can significantly alleviate the issues and problems of its clean-slate learning-based counterparts.},
booktitle = {Proceedings of the Annual Conference of the ACM Special Interest Group on Data Communication on the Applications, Technologies, Architectures, and Protocols for Computer Communication},
pages = {632–647},
numpages = {16},
keywords = {TCP, Congestion Control, Deep Reinforcement Learning},
location = {Virtual Event, USA},
series = {SIGCOMM '20}
}
@article{dynamic,
author={Nie, Xiaohui and Zhao, Youjian and Li, Zhihan and Chen, Guo and Sui, Kaixin and Zhang, Jiyang and Ye, Zijie and Pei, Dan},
journal={IEEE Journal on Selected Areas in Communications},
title={Dynamic TCP Initial Windows and Congestion Control Schemes Through Reinforcement Learning},
year={2019},
volume={37},
number={6},
pages={1231-1247},
doi={10.1109/JSAC.2019.2904350}}
@article{10.1145/2534169.2486020,
author = {Winstein, Keith and Balakrishnan, Hari},
title = {TCP Ex Machina: Computer-Generated Congestion Control},
year = {2013},
issue_date = {October 2013},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {43},
number = {4},
issn = {0146-4833},
url = {https://doi.org/10.1145/2534169.2486020},
doi = {10.1145/2534169.2486020},
abstract = {This paper describes a new approach to end-to-end congestion control on a multi-user network. Rather than manually formulate each endpoint's reaction to congestion signals, as in traditional protocols, we developed a program called Remy that generates congestion-control algorithms to run at the endpoints. In this approach, the protocol designer specifies their prior knowledge or assumptions about the network and an objective that the algorithm will try to achieve, e.g., high throughput and low queueing delay. Remy then produces a distributed algorithm---the control rules for the independent endpoints---that tries to achieve this objective. In simulations with ns-2, Remy-generated algorithms outperformed human-designed end-to-end techniques, including TCP Cubic, Compound, and Vegas. In many cases, Remy's algorithms also outperformed methods that require intrusive in-network changes, including XCP and Cubic-over-sfqCoDel (stochastic fair queueing with CoDel for active queue management). Remy can generate algorithms both for networks where some parameters are known tightly a priori, e.g. datacenters, and for networks where prior knowledge is less precise, such as cellular networks. We characterize the sensitivity of the resulting performance to the specificity of the prior knowledge, and the consequences when real-world conditions contradict the assumptions supplied at design-time.},
journal = {SIGCOMM Comput. Commun. Rev.},
month = {aug},
pages = {123–134},
numpages = {12},
keywords = {computer-designed algorithms, congestion control}
}
@inproceedings{exmachina,
author = {Winstein, Keith and Balakrishnan, Hari},
title = {TCP Ex Machina: Computer-Generated Congestion Control},
year = {2013},
isbn = {9781450320566},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2486001.2486020},
doi = {10.1145/2486001.2486020},
abstract = {This paper describes a new approach to end-to-end congestion control on a multi-user network. Rather than manually formulate each endpoint's reaction to congestion signals, as in traditional protocols, we developed a program called Remy that generates congestion-control algorithms to run at the endpoints. In this approach, the protocol designer specifies their prior knowledge or assumptions about the network and an objective that the algorithm will try to achieve, e.g., high throughput and low queueing delay. Remy then produces a distributed algorithm---the control rules for the independent endpoints---that tries to achieve this objective. In simulations with ns-2, Remy-generated algorithms outperformed human-designed end-to-end techniques, including TCP Cubic, Compound, and Vegas. In many cases, Remy's algorithms also outperformed methods that require intrusive in-network changes, including XCP and Cubic-over-sfqCoDel (stochastic fair queueing with CoDel for active queue management). Remy can generate algorithms both for networks where some parameters are known tightly a priori, e.g. datacenters, and for networks where prior knowledge is less precise, such as cellular networks. We characterize the sensitivity of the resulting performance to the specificity of the prior knowledge, and the consequences when real-world conditions contradict the assumptions supplied at design-time.},
booktitle = {Proceedings of the ACM SIGCOMM 2013 Conference on SIGCOMM},
pages = {123–134},
numpages = {12},
keywords = {congestion control, computer-designed algorithms},
location = {Hong Kong, China},
series = {SIGCOMM '13}
}
@inproceedings{oboe,
author = {Akhtar, Zahaib and Nam, Yun Seong and Govindan, Ramesh and Rao, Sanjay and Chen, Jessica and Katz-Bassett, Ethan and Ribeiro, Bruno and Zhan, Jibin and Zhang, Hui},
title = {Oboe: Auto-Tuning Video ABR Algorithms to Network Conditions},
year = {2018},
isbn = {9781450355674},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3230543.3230558},
doi = {10.1145/3230543.3230558},
abstract = {Most content providers are interested in providing good video delivery QoE for all users, not just on average. State-of-the-art ABR algorithms like BOLA and MPC rely on parameters that are sensitive to network conditions, so may perform poorly for some users and/or videos. In this paper, we propose a technique called Oboe to auto-tune these parameters to different network conditions. Oboe pre-computes, for a given ABR algorithm, the best possible parameters for different network conditions, then dynamically adapts the parameters at run-time for the current network conditions. Using testbed experiments, we show that Oboe significantly improves BOLA, MPC, and a commercially deployed ABR. Oboe also betters a recently proposed reinforcement learning based ABR, Pensieve, by 24% on average on a composite QoE metric, in part because it is able to better specialize ABR behavior across different network states.},
booktitle = {Proceedings of the 2018 Conference of the ACM Special Interest Group on Data Communication},
pages = {44–58},
numpages = {15},
keywords = {video delivery, adaptive bitrate algorithms},
location = {Budapest, Hungary},
series = {SIGCOMM '18}
}
@inproceedings{auto,
author = {Chen, Li and Lingys, Justinas and Chen, Kai and Liu, Feng},
title = {AuTO: Scaling Deep Reinforcement Learning for Datacenter-Scale Automatic Traffic Optimization},
year = {2018},
isbn = {9781450355674},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3230543.3230551},
doi = {10.1145/3230543.3230551},
abstract = {Traffic optimizations (TO, e.g. flow scheduling, load balancing) in datacenters are difficult online decision-making problems. Previously, they are done with heuristics relying on operators' understanding of the workload and environment. Designing and implementing proper TO algorithms thus take at least weeks. Encouraged by recent successes in applying deep reinforcement learning (DRL) techniques to solve complex online control problems, we study if DRL can be used for automatic TO without human-intervention. However, our experiments show that the latency of current DRL systems cannot handle flow-level TO at the scale of current datacenters, because short flows (which constitute the majority of traffic) are usually gone before decisions can be made. Leveraging the long-tail distribution of datacenter traffic, we develop a two-level DRL system, AuTO, mimicking the Peripheral & Central Nervous Systems in animals, to solve the scalability problem. Peripheral Systems (PS) reside on end-hosts, collect flow information, and make TO decisions locally with minimal delay for short flows. PS's decisions are informed by a Central System (CS), where global traffic information is aggregated and processed. CS further makes individual TO decisions for long flows. With CS&PS, AuTO is an end-to-end automatic TO system that can collect network information, learn from past decisions, and perform actions to achieve operator-defined goals. We implement AuTO with popular machine learning frameworks and commodity servers, and deploy it on a 32-server testbed. Compared to existing approaches, AuTO reduces the TO turn-around time from weeks to ~100 milliseconds while achieving superior performance. For example, it demonstrates up to 48.14% reduction in average flow completion time (FCT) over existing solutions.},
booktitle = {Proceedings of the 2018 Conference of the ACM Special Interest Group on Data Communication},
pages = {191–205},
numpages = {15},
keywords = {reinforcement learning, datacenter networks, traffic optimization},
location = {Budapest, Hungary},
series = {SIGCOMM '18}
}
@inproceedings{learnroute,
author = {Valadarsky, Asaf and Schapira, Michael and Shahaf, Dafna and Tamar, Aviv},
title = {Learning to Route},
year = {2017},
isbn = {9781450355698},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3152434.3152441},
doi = {10.1145/3152434.3152441},
abstract = {Recently, much attention has been devoted to the question of whether/when traditional network protocol design, which relies on the application of algorithmic insights by human experts, can be replaced by a data-driven (i.e., machine learning) approach. We explore this question in the context of the arguably most fundamental networking task: routing. Can ideas and techniques from machine learning (ML) be leveraged to automatically generate "good" routing configurations? We focus on the classical setting of intradomain traffic engineering. We observe that this context poses significant challenges for data-driven protocol design. Our preliminary results regarding the power of data-driven routing suggest that applying ML (specifically, deep reinforcement learning) to this context yields high performance and is a promising direction for further research. We outline a research agenda for ML-guided routing.},
booktitle = {Proceedings of the 16th ACM Workshop on Hot Topics in Networks},
pages = {185–191},
numpages = {7},
location = {Palo Alto, CA, USA},
series = {HotNets-XVI}
}
@inproceedings{flow,
author = {Vojislav Đukić and Sangeetha Abdu Jyothi and Bojan Karlas and Muhsen Owaida and Ce Zhang and Ankit Singla},
title = {Is advance knowledge of flow sizes a plausible assumption?},
booktitle = {16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)},
year = {2019},
isbn = {978-1-931971-49-2},
address = {Boston, MA},
pages = {565--580},
url = {https://www.usenix.org/conference/nsdi19/presentation/dukic},
publisher = {USENIX Association},
month = feb,
}
@inproceedings{onlineflow,
author={Poupart, Pascal and Chen, Zhitang and Jaini, Priyank and Fung, Fred and Susanto, Hengky and Geng, Yanhui and Chen, Li and Chen, Kai and Jin, Hao},
booktitle={2016 IEEE 24th International Conference on Network Protocols (ICNP)},
title={Online flow size prediction for improved network routing},
year={2016},
volume={},
number={},
pages={1-6},
doi={10.1109/ICNP.2016.7785324}}
@inproceedings{oneproto,
author = {Suraj Jog and Zikun Liu and Antonio Franques and Vimuth Fernando and Sergi Abadal and Josep Torrellas and Haitham Hassanieh},
title = {One Protocol to Rule Them All: Wireless {Network-on-Chip} using Deep Reinforcement Learning},
booktitle = {18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)},
year = {2021},
isbn = {978-1-939133-21-2},
pages = {973--989},
url = {https://www.usenix.org/conference/nsdi21/presentation/jog},
publisher = {USENIX Association},
month = apr,
}
@article{heterowire,
author={Yu, Yiding and Wang, Taotao and Liew, Soung Chang},
journal={IEEE Journal on Selected Areas in Communications},
title={Deep-Reinforcement Learning Multiple Access for Heterogeneous Wireless Networks},
year={2019},
volume={37},
number={6},
pages={1277-1290},
doi={10.1109/JSAC.2019.2904329}}
@inproceedings{datadriven,
author = {Bartulovic, Mihovil and Jiang, Junchen and Balakrishnan, Sivaraman and Sekar, Vyas and Sinopoli, Bruno},
title = {Biases in Data-Driven Networking, and What to Do About Them},
year = {2017},
isbn = {9781450355698},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3152434.3152448},
doi = {10.1145/3152434.3152448},
abstract = {Recent efforts highlight the promise of data-driven approaches to optimize network decisions. Many such efforts use trace-driven evaluation; i.e., running offline analysis on network traces to estimate the potential benefits of different policies before running them in practice. Unfortunately, such frameworks can have fundamental pitfalls (e.g., skews due to previous policies that were used in the data collection phase and insufficient data for specific subpopulations) that could lead to misleading estimates and ultimately suboptimal decisions. In this paper, we shed light on such pitfalls and identify a promising roadmap to address these pitfalls by leveraging parallels in causal inference, namely the Doubly Robust estimator.},
booktitle = {Proceedings of the 16th ACM Workshop on Hot Topics in Networks},
pages = {192–198},
numpages = {7},
location = {Palo Alto, CA, USA},
series = {HotNets-XVI}
}
@inproceedings{blackbox,
author = {Silvery Fu and Saurabh Gupta and Radhika Mittal and Sylvia Ratnasamy},
title = {On the Use of {ML} for Blackbox System Performance Prediction},
booktitle = {18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)},
year = {2021},
isbn = {978-1-939133-21-2},
pages = {763--784},
url = {https://www.usenix.org/conference/nsdi21/presentation/fu},
publisher = {USENIX Association},
month = apr,
}
@misc{factor,
doi = {10.48550/ARXIV.1703.10722},
url = {https://arxiv.org/abs/1703.10722},
author = {Kuchaiev, Oleksii and Ginsburg, Boris},
keywords = {Computation and Language (cs.CL), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences},
title = {Factorization tricks for LSTM networks},
publisher = {arXiv},
year = {2017},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@misc{recentnlp,
doi = {10.48550/ARXIV.1904.01172},
url = {https://arxiv.org/abs/1904.01172},
author = {Storks, Shane and Gao, Qiaozi and Chai, Joyce Y.},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
title = {Recent Advances in Natural Language Inference: A Survey of Benchmarks, Resources, and Approaches},
publisher = {arXiv},
year = {2019},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{cvsurvey,
doi = {10.1109/tpami.2022.3152247},
url = {https://doi.org/10.1109%2Ftpami.2022.3152247},
year = 2022,
publisher = {Institute of Electrical and Electronics Engineers ({IEEE})},
pages = {1--1},
author = {Kai Han and Yunhe Wang and Hanting Chen and Xinghao Chen and Jianyuan Guo and Zhenhua Liu and Yehui Tang and An Xiao and Chunjing Xu and Yixing Xu and Zhaohui Yang and Yiman Zhang and Dacheng Tao},
title = {A Survey on Vision Transformer},
journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence}
}
@techreport{biases,
author = {Tom M. Mitchell},
title = {The Need for Biases in Learning Generalizations},
institution = {Carnegie Mellon University},
year = {1980}
}
@unpublished{transferng,
title = {Nuts and Bolts of Building Applications using Deep Learning},
author = {Andrew Ng},
year = {2016},
note = {NIPS},
url = {https://nips.cc/Conferences/2016/ScheduleMultitrack?event=6203},
}
@inproceedings{planning,
author = {Zhu, Hang and Gupta, Varun and Ahuja, Satyajeet Singh and Tian, Yuandong and Zhang, Ying and Jin, Xin},
title = {Network Planning with Deep Reinforcement Learning},
year = {2021},
isbn = {9781450383837},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3452296.3472902},
doi = {10.1145/3452296.3472902},
abstract = {Network planning is critical to the performance, reliability and cost of web services. This problem is typically formulated as an Integer Linear Programming (ILP) problem. Today's practice relies on hand-tuned heuristics from human experts to address the scalability challenge of ILP solvers. In this paper, we propose NeuroPlan, a deep reinforcement learning (RL) approach to solve the network planning problem. This problem involves multi-step decision making and cost minimization, which can be naturally cast as a deep RL problem. We develop two important domain-specific techniques. First, we use a graph neural network (GNN) and a novel domain-specific node-link transformation for state encoding, in order to handle the dynamic nature of the evolving network topology during planning decision making. Second, we leverage a two-stage hybrid approach that first uses deep RL to prune the search space and then uses an ILP solver to find the optimal solution. This approach resembles today's practice, but avoids human experts with an RL agent in the first stage. Evaluation on real topologies and setups from large production networks demonstrates that NeuroPlan scales to large topologies beyond the capability of ILP solvers, and reduces the cost by up to 17% compared to hand-tuned heuristics.},
booktitle = {Proceedings of the 2021 ACM SIGCOMM 2021 Conference},
pages = {258–271},
numpages = {14},
keywords = {reinforcement learning, graph neural network, network planning},