bigscience.bib

% arXiv preprint (dblp export). {BigScience} is braced as a whole word so that
% sentence-casing styles keep its internal capital.
@article{bigscience-akiki-23-bigscience-social-construction,
  author     = {Christopher Akiki and
                Giada Pistilli and
                Margot Mieskes and
                Matthias Gall{\'{e}} and
                Thomas Wolf and
                Suzana Ilic and
                Yacine Jernite},
  title      = {{BigScience}: {A} Case Study in the Social Construction of a Multilingual
                Large Language Model},
  journal    = {CoRR},
  volume     = {abs/2212.04960},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2212.04960},
  doi        = {10.48550/arXiv.2212.04960},
  eprinttype = {arXiv},
  eprint     = {2212.04960},
  timestamp  = {Mon, 02 Jan 2023 15:09:55 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2212-04960.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint (dblp export). {Arabic} is braced so that sentence-casing
% styles do not lowercase the proper noun.
@article{bigscience-alyafeai-2021-masader,
  author     = {Zaid Alyafeai and
                Maraim Masoud and
                Mustafa Ghaleb and
                Maged Saeed AlShaibani},
  title      = {Masader: Metadata Sourcing for {Arabic} Text and Speech Data Resources},
  journal    = {CoRR},
  volume     = {abs/2110.06744},
  year       = {2021},
  url        = {https://arxiv.org/abs/2110.06744},
  eprinttype = {arXiv},
  eprint     = {2110.06744},
  timestamp  = {Fri, 22 Oct 2021 13:33:09 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2110-06744.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint (dblp export). {Arabic} and {NLP} are braced so that
% sentence-casing styles keep their capitalisation.
@article{bigscience-alyafeai-2022-masader-plus,
  author     = {Yousef Altaher and
                Ali Fadel and
                Mazen Alotaibi and
                Mazen Alyazidi and
                Mishari Al{-}Mutairi and
                Mutlaq Aldhbuiub and
                Abdulrahman Mosaibah and
                Abdelrahman Rezk and
                Abdulrazzaq Alhendi and
                Mazen Abo Shal and
                Emad A. Alghamdi and
                Maged Saeed AlShaibani and
                Jezia Zakraoui and
                Wafaa Mohammed and
                Kamel Gaanoun and
                Khalid N. Elmadani and
                Mustafa Ghaleb and
                Nouamane Tazi and
                Raed Alharbi and
                Maraim Masoud and
                Zaid Alyafeai},
  title      = {Masader Plus: {A} New Interface for Exploring +500 {Arabic} {NLP} Datasets},
  journal    = {CoRR},
  volume     = {abs/2208.00932},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2208.00932},
  doi        = {10.48550/arXiv.2208.00932},
  eprinttype = {arXiv},
  eprint     = {2208.00932},
  timestamp  = {Wed, 10 Aug 2022 07:53:39 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2208-00932.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ACL 2022 system demonstration paper. The camel-case tool name is braced as a
% whole word rather than letter by letter.
@inproceedings{bigscience-bach-2022-promptsource,
  title     = {{PromptSource}: An Integrated Development Environment and Repository for Natural Language Prompts},
  author    = {Bach, Stephen and
               Sanh, Victor and
               Yong, Zheng Xin and
               Webson, Albert and
               Raffel, Colin and
               Nayak, Nihal V. and
               Sharma, Abheesht and
               Kim, Taewoon and
               Bari, M Saiful and
               Fevry, Thibault and
               Alyafeai, Zaid and
               Dey, Manan and
               Santilli, Andrea and
               Sun, Zhiqing and
               Ben-david, Srulik and
               Xu, Canwen and
               Chhablani, Gunjan and
               Wang, Han and
               Fries, Jason and
               Al-shaibani, Maged and
               Sharma, Shanya and
               Thakker, Urmish and
               Almubarak, Khalid and
               Tang, Xiangru and
               Radev, Dragomir and
               Jiang, Mike Tian-jian and
               Rush, Alexander},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-demo.9},
  doi       = {10.18653/v1/2022.acl-demo.9},
  pages     = {93--104}
}

% arXiv preprint (dblp export); authors given in "Last, First" form.
@article{bigscience-ding-23-toward-openness-beyond-open-access,
  title      = {Towards Openness Beyond Open Access: User Journeys through 3 Open {AI} Collaboratives},
  author     = {Ding, Jennifer and
                Akiki, Christopher and
                Jernite, Yacine and
                Steele, Anne Lee and
                Popo, Temi},
  journal    = {CoRR},
  volume     = {abs/2301.08488},
  year       = {2023},
  eprinttype = {arXiv},
  eprint     = {2301.08488},
  doi        = {10.48550/arXiv.2301.08488},
  url        = {https://doi.org/10.48550/arXiv.2301.08488},
  timestamp  = {Thu, 26 Jan 2023 15:26:31 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2301-08488.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% NeurIPS 2022 Datasets and Benchmarks paper (OpenReview export).
% {BigBio} is braced against sentence-casing; the compound surname
% {Villanova del Moral} is braced so it is not split into von/Last parts.
@inproceedings{bigscience-fries-2022-bigbio,
  title     = {{BigBio}: A Framework for Data-Centric Biomedical Natural Language Processing},
  author    = {Jason Alan Fries and Leon Weber and Natasha Seelam and Gabriel Altay and Debajyoti Datta and Samuele Garda and Myungsun Kang and Ruisi Su and Wojciech Kusa and Samuel Cahyawijaya and Fabio Barth and Simon Ott and Matthias Samwald and Stephen Bach and Stella Biderman and Mario S{\"a}nger and Bo Wang and Alison Callahan and Daniel Le{\'o}n Peri{\~n}{\'a}n and Th{\'e}o Gigant and Patrick Haller and Jenny Chim and Jose David Posada and John Michael Giorgi and Karthik Rangasai Sivaraman and Marc P{\`a}mies and Marianna Nezhurina and Robert Martin and Michael Cullan and Moritz Freidank and Nathan Dahlberg and Shubhanshu Mishra and Shamik Bose and Nicholas Michio Broad and Yanis Labrak and Shlok S Deshmukh and Sid Kiblawi and Ayush Singh and Minh Chien Vu and Trishala Neeraj and Jonas Golde and Albert {Villanova del Moral} and Benjamin Beilharz},
  booktitle = {Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
  year      = {2022},
  url       = {https://openreview.net/forum?id=8lQDn9zTQlW}
}

% Workshop paper (OpenReview export); authors given in "Last, First" form,
% one per line.
@inproceedings{bigscience-fries-2022-biomedical,
  author    = {Fries, Jason Alan and
               Seelam, Natasha and
               Altay, Gabriel and
               Weber, Leon and
               Kang, Myungsun and
               Datta, Debajyoti and
               Su, Ruisi and
               Garda, Samuele and
               Wang, Bo and
               Ott, Simon and
               Samwald, Matthias and
               Kusa, Wojciech},
  title     = {Dataset Debt in Biomedical Language Modeling},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=HRfzInfr8Z9}
}

% FAccT '22 paper (ACM export). The page range uses the ASCII double hyphen
% instead of a raw Unicode en dash, which classic BibTeX cannot handle reliably.
@inproceedings{bigscience-jernite-2022-governance,
  author    = {Jernite, Yacine and Nguyen, Huu and Biderman, Stella and Rogers, Anna and Masoud, Maraim and Danchev, Valentin and Tan, Samson and Luccioni, Alexandra Sasha and Subramani, Nishant and Johnson, Isaac and Dupont, Gerard and Dodge, Jesse and Lo, Kyle and Talat, Zeerak and Radev, Dragomir and Gokaslan, Aaron and Nikpoor, Somaieh and Henderson, Peter and Bommasani, Rishi and Mitchell, Margaret},
  title     = {Data Governance in the Age of Large-Scale Data-Driven Language Technology},
  year      = {2022},
  isbn      = {9781450393522},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3531146.3534637},
  doi       = {10.1145/3531146.3534637},
  abstract  = {The recent emergence and adoption of Machine Learning technology, and specifically of Large Language Models, has drawn attention to the need for systematic and transparent management of language data. This work proposes an approach to global language data governance that attempts to organize data management amongst stakeholders, values, and rights. Our proposal is informed by prior work on distributed governance that accounts for human values and grounded by an international research collaboration that brings together researchers and practitioners from 60 countries. The framework we present is a multi-party international governance structure focused on language data, and incorporating technical and organizational tools needed to support its work.},
  booktitle = {2022 ACM Conference on Fairness, Accountability, and Transparency},
  pages     = {2206--2222},
  numpages  = {17},
  keywords  = {technology governance, data rights, language data, datasets},
  location  = {Seoul, Republic of Korea},
  series    = {FAccT '22}
}

% NeurIPS 2022 Datasets and Benchmarks paper (OpenReview export).
% Case-sensitive title tokens are braced as whole words. Compound surnames are
% braced so "First Last" parsing keeps them intact; "van Strien" and "Ben Allal"
% follow the casing used for these authors elsewhere (NOTE(review): confirm
% "Ben Allal" and "Ortiz Suarez" against the published author list).
@inproceedings{bigscience-laurencon-roots:2022,
  title     = {The {BigScience} {ROOTS} Corpus: A {1.6TB} Composite Multilingual Dataset},
  author    = {Hugo Lauren{\c{c}}on and Lucile Saulnier and Thomas Wang and Christopher Akiki and Albert {Villanova del Moral} and Teven {Le Scao} and Leandro Von Werra and Chenghao Mou and Eduardo Gonz{\'a}lez Ponferrada and Huu Nguyen and J{\"o}rg Frohberg and Mario {\v{S}}a{\v{s}}ko and Quentin Lhoest and Angelina McMillan-Major and G{\'e}rard Dupont and Stella Biderman and Anna Rogers and Loubna {Ben Allal} and Francesco {De Toni} and Giada Pistilli and Olivier Nguyen and Somaieh Nikpoor and Maraim Masoud and Pierre Colombo and Javier de la Rosa and Paulo Villegas and Tristan Thrush and Shayne Longpre and Sebastian Nagel and Leon Weber and Manuel Romero Mu{\~n}oz and Jian Zhu and Daniel van Strien and Zaid Alyafeai and Khalid Almubarak and Vu Minh Chien and Itziar Gonzalez-Dios and Aitor Soroa and Kyle Lo and Manan Dey and Pedro {Ortiz Suarez} and Aaron Gokaslan and Shamik Bose and David Ifeoluwa Adelani and Long Phan and Hieu Tran and Ian Yu and Suhas Pai and Jenny Chim and Violette Lepercq and Suzana Ilic and Margaret Mitchell and Sasha Luccioni and Yacine Jernite},
  booktitle = {Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
  year      = {2022},
  url       = {https://openreview.net/forum?id=UoEw6KigkUn}
}

% arXiv preprint (dblp export, truncated author list).
% The truncation is expressed with BibTeX's "and others" token instead of a
% literal "et al.", which would be parsed as a person. Compound surnames are
% braced so "First Last" parsing keeps them intact, and {176B} is protected
% from sentence-casing.
@article{bigscience-le-scao-23-bloom,
  author     = {Teven {Le Scao} and
                Angela Fan and
                Christopher Akiki and
                Ellie Pavlick and
                Suzana Ilic and
                Daniel Hesslow and
                Roman Castagn{\'{e}} and
                Alexandra Sasha Luccioni and
                Fran{\c{c}}ois Yvon and
                Matthias Gall{\'{e}} and
                Jonathan Tow and
                Alexander M. Rush and
                Stella Biderman and
                Albert Webson and
                Pawan Sasanka Ammanamanchi and
                Thomas Wang and
                Beno{\^{\i}}t Sagot and
                Niklas Muennighoff and
                Albert {Villanova del Moral} and
                Olatunji Ruwase and
                Rachel Bawden and
                Stas Bekman and
                Angelina McMillan{-}Major and
                Iz Beltagy and
                Huu Nguyen and
                Lucile Saulnier and
                Samson Tan and
                Pedro {Ortiz Suarez} and
                Victor Sanh and
                Hugo Lauren{\c{c}}on and
                Yacine Jernite and
                Julien Launay and
                Margaret Mitchell and
                Colin Raffel and
                Aaron Gokaslan and
                Adi Simhi and
                Aitor Soroa and
                Alham Fikri Aji and
                Amit Alfassy and
                Anna Rogers and
                Ariel Kreisberg Nitzav and
                Canwen Xu and
                Chenghao Mou and
                Chris Emezue and
                Christopher Klamm and
                Colin Leong and
                Daniel van Strien and
                David Ifeoluwa Adelani and
                others},
  title      = {{BLOOM}: {A} {176B}-Parameter Open-Access Multilingual Language Model},
  journal    = {CoRR},
  volume     = {abs/2211.05100},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2211.05100},
  doi        = {10.48550/arXiv.2211.05100},
  eprinttype = {arXiv},
  eprint     = {2211.05100},
  timestamp  = {Wed, 07 Dec 2022 23:00:56 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2211-05100.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% arXiv preprint (DataCite export).
% Accented letters use LaTeX escapes like the rest of this file, the duplicated
% keyword is removed, {BigScience} is protected from sentence-casing, and the
% arXiv identifier is also exposed through eprint/eprinttype.
% NOTE(review): surname regrouped as "Ortiz Suarez" — confirm against the paper.
@misc{bigscience-mcmillan-major-2022-sourcing-catalogue,
  author     = {McMillan-Major, Angelina and Alyafeai, Zaid and Biderman, Stella and Chen, Kimbo and De Toni, Francesco and Dupont, G{\'e}rard and Elsahar, Hady and Emezue, Chris and Aji, Alham Fikri and Ili{\'c}, Suzana and Khamis, Nurulaqilla and Leong, Colin and Masoud, Maraim and Soroa, Aitor and Ortiz Suarez, Pedro and Talat, Zeerak and van Strien, Daniel and Jernite, Yacine},
  title      = {Documenting Geographically and Contextually Diverse Data Sources: The {BigScience} Catalogue of Language Data and Resources},
  publisher  = {arXiv},
  year       = {2022},
  doi        = {10.48550/arXiv.2201.10066},
  url        = {https://arxiv.org/abs/2201.10066},
  eprinttype = {arXiv},
  eprint     = {2201.10066},
  keywords   = {Computation and Language (cs.CL), Databases (cs.DB), FOS: Computer and information sciences},
  copyright  = {Creative Commons Attribution 4.0 International}
}

% arXiv preprint (DataCite export).
% Accented letters use LaTeX escapes like the rest of this file, the duplicated
% keyword is removed, {NLP} is protected from sentence-casing, and the arXiv
% identifier is also exposed through eprint/eprinttype.
@misc{bigscience-mielke-2021-tokenization-survey,
  author     = {Mielke, Sabrina J. and Alyafeai, Zaid and Salesky, Elizabeth and Raffel, Colin and Dey, Manan and Gall{\'e}, Matthias and Raja, Arun and Si, Chenglei and Lee, Wilson Y. and Sagot, Beno{\^{\i}}t and Tan, Samson},
  title      = {Between words and characters: A Brief History of Open-Vocabulary Modeling and Tokenization in {NLP}},
  publisher  = {arXiv},
  year       = {2021},
  doi        = {10.48550/arXiv.2112.10508},
  url        = {https://arxiv.org/abs/2112.10508},
  eprinttype = {arXiv},
  eprint     = {2112.10508},
  keywords   = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright  = {arXiv.org perpetual, non-exclusive license}
}

% arXiv preprint (dblp export). {LLMs} is braced so that sentence-casing
% styles do not lowercase the acronym.
@article{bigscience-piktus-23-roots-search-tool,
  author     = {Aleksandra Piktus and
                Christopher Akiki and
                Paulo Villegas and
                Hugo Lauren{\c{c}}on and
                G{\'{e}}rard Dupont and
                Alexandra Sasha Luccioni and
                Yacine Jernite and
                Anna Rogers},
  title      = {The {ROOTS} Search Tool: Data Transparency for {LLMs}},
  journal    = {CoRR},
  volume     = {abs/2302.14035},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2302.14035},
  doi        = {10.48550/arXiv.2302.14035},
  eprinttype = {arXiv},
  eprint     = {2302.14035},
  timestamp  = {Tue, 28 Feb 2023 14:02:05 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2302-14035.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

% ICLR 2022 paper (OpenReview export).
% The export's doubled surname "Sharma Sharma" is reduced to "Shanya Sharma",
% the spelling used for this author elsewhere in the file, and the compound
% surname {Le Scao} is braced so it is not split.
@inproceedings{bigscience-sanh-2022-multitask-t0,
  title     = {Multitask Prompted Training Enables Zero-Shot Task Generalization},
  author    = {Victor Sanh and Albert Webson and Colin Raffel and Stephen Bach and Lintang Sutawika and Zaid Alyafeai and Antoine Chaffin and Arnaud Stiegler and Arun Raja and Manan Dey and M Saiful Bari and Canwen Xu and Urmish Thakker and Shanya Sharma and Eliza Szczechla and Taewoon Kim and Gunjan Chhablani and Nihal Nayak and Debajyoti Datta and Jonathan Chang and Mike Tian-Jian Jiang and Han Wang and Matteo Manica and Sheng Shen and Zheng Xin Yong and Harshit Pandey and Rachel Bawden and Thomas Wang and Trishala Neeraj and Jos Rozen and Abheesht Sharma and Andrea Santilli and Thibault Fevry and Jason Alan Fries and Ryan Teehan and Teven {Le Scao} and Stella Biderman and Leo Gao and Thomas Wolf and Alexander M Rush},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
  url       = {https://openreview.net/forum?id=9Vrb9D0WI4}
}

% Workshop paper (OpenReview export). The compound surname {Le Scao} is braced
% so that "First Last" parsing does not treat "Le" as a given name.
@inproceedings{bigscience-scao-2022-million-gpu-hours,
  title     = {What Language Model to Train if You Have One Million {GPU} Hours?},
  author    = {Teven {Le Scao} and Thomas Wang and Daniel Hesslow and Lucile Saulnier and Stas Bekman and M Saiful Bari and Stella Biderman and Hady Elsahar and Jason Phang and Ofir Press and Colin Raffel and Victor Sanh and Sheng Shen and Lintang Sutawika and Jaesung Tae and Zheng Xin Yong and Julien Launay and Iz Beltagy},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rI7BL3fHIZq}
}

% Workshop paper (OpenReview export); authors given in "Last, First" form,
% one per line.
@inproceedings{bigscience-talat-2022-multilingual-bias-evaluation-challenges,
  author    = {Talat, Zeerak and
               N{\'e}v{\'e}ol, Aur{\'e}lie and
               Biderman, Stella and
               Clinciu, Miruna and
               Dey, Manan and
               Longpre, Shayne and
               Luccioni, Sasha and
               Masoud, Maraim and
               Mitchell, Margaret and
               Radev, Dragomir and
               Sharma, Shanya and
               Subramonian, Arjun and
               Tae, Jaesung and
               Tan, Samson and
               Tunuguntla, Deepak and
               van der Wal, Oskar},
  title     = {You reap what you sow: On the Challenges of Bias Evaluation Under Multilingual Settings},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rK-7NhfSIW5}
}

% Workshop paper (OpenReview export); authors given in "Last, First" form,
% one per line.
@inproceedings{bigscience-teehan2022-emergent-structures-dynamics,
  author    = {Teehan, Ryan and
               Clinciu, Miruna and
               Serikov, Oleg and
               Szczechla, Eliza and
               Seelam, Natasha and
               Mirkin, Shachar and
               Gokaslan, Aaron},
  title     = {Emergent Structures and Training Dynamics in Large Language Models},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=SbgL3zrIWc}
}

% Workshop paper (OpenReview export). {T0} is protected from sentence-casing.
% The compound surname {De Toni} is braced so it is not split, and "van Strien"
% uses the lowercase particle found for this author elsewhere in the file.
@inproceedings{bigscience-toni-2022-historical-zero-shot,
  title     = {Entities, Dates, and Languages: Zero-Shot on Historical Texts with {T0}},
  author    = {Francesco {De Toni} and Christopher Akiki and Javier de la Rosa and Cl{\'e}mentine Fourrier and Enrique Manjavacas and Stefan Schweter and Daniel van Strien},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=BRzIS3GrIbc}
}

% ICML 2022 paper (PMLR export).
% The month uses the standard three-letter macro instead of a free-text date
% range, and the surname is regrouped as "Le Scao" (the export had split it
% as "Scao, Teven Le").
@inproceedings{bigscience-wang-2022-architecture-pretraining-generalization,
  title     = {What Language Model Architecture and Pretraining Objective Works Best for Zero-Shot Generalization?},
  author    = {Wang, Thomas and Roberts, Adam and Hesslow, Daniel and Le Scao, Teven and Chung, Hyung Won and Beltagy, Iz and Launay, Julien and Raffel, Colin},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  pages     = {22964--22984},
  year      = {2022},
  editor    = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
  volume    = {162},
  series    = {Proceedings of Machine Learning Research},
  month     = jul,
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v162/wang22u/wang22u.pdf},
  url       = {https://proceedings.mlr.press/v162/wang22u.html}
}

% Workshop paper (OpenReview export). {BigScience} is braced as a whole word
% so that sentence-casing styles keep its internal capital.
@inproceedings{bigscience-yong-2022-adapting-bigscience-multilingual-unseen,
  title     = {Adapting {BigScience} Multilingual Model to Unseen Languages},
  author    = {Zheng Xin Yong and Vassilina Nikoulina},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rL7mI3GSIbq}
}