-
Notifications
You must be signed in to change notification settings - Fork 1
/
bigscience.bib
344 lines (325 loc) · 19 KB
/
bigscience.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
% arXiv preprint 2212.04960 (record exported from dblp).
% {BigScience} is braced so styles that sentence-case titles keep its inner capital.
@article{bigscience-akiki-23-bigscience-social-construction,
  author     = {Christopher Akiki and
                Giada Pistilli and
                Margot Mieskes and
                Matthias Gall{\'{e}} and
                Thomas Wolf and
                Suzana Ilic and
                Yacine Jernite},
  title      = {{BigScience}: {A} Case Study in the Social Construction of a Multilingual
                Large Language Model},
  journal    = {CoRR},
  volume     = {abs/2212.04960},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2212.04960},
  doi        = {10.48550/arXiv.2212.04960},
  eprinttype = {arXiv},
  eprint     = {2212.04960},
  timestamp  = {Mon, 02 Jan 2023 15:09:55 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2212-04960.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2110.06744 (record exported from dblp).
% {Arabic} is braced so sentence-casing styles do not lowercase it; the doi is
% the arXiv-issued DOI for the eprint id, matching the other arXiv records here.
@article{bigscience-alyafeai-2021-masader,
  author     = {Zaid Alyafeai and
                Maraim Masoud and
                Mustafa Ghaleb and
                Maged Saeed AlShaibani},
  title      = {Masader: Metadata Sourcing for {Arabic} Text and Speech Data Resources},
  journal    = {CoRR},
  volume     = {abs/2110.06744},
  year       = {2021},
  url        = {https://arxiv.org/abs/2110.06744},
  doi        = {10.48550/arXiv.2110.06744},
  eprinttype = {arXiv},
  eprint     = {2110.06744},
  timestamp  = {Fri, 22 Oct 2021 13:33:09 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2110-06744.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2208.00932 (record exported from dblp).
% {Arabic} is braced so sentence-casing styles do not lowercase it.
@article{bigscience-alyafeai-2022-masader-plus,
  author     = {Yousef Altaher and
                Ali Fadel and
                Mazen Alotaibi and
                Mazen Alyazidi and
                Mishari Al{-}Mutairi and
                Mutlaq Aldhbuiub and
                Abdulrahman Mosaibah and
                Abdelrahman Rezk and
                Abdulrazzaq Alhendi and
                Mazen Abo Shal and
                Emad A. Alghamdi and
                Maged Saeed AlShaibani and
                Jezia Zakraoui and
                Wafaa Mohammed and
                Kamel Gaanoun and
                Khalid N. Elmadani and
                Mustafa Ghaleb and
                Nouamane Tazi and
                Raed Alharbi and
                Maraim Masoud and
                Zaid Alyafeai},
  title      = {Masader Plus: {A} New Interface for Exploring +500 {Arabic} {NLP} Datasets},
  journal    = {CoRR},
  volume     = {abs/2208.00932},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2208.00932},
  doi        = {10.48550/arXiv.2208.00932},
  eprinttype = {arXiv},
  eprint     = {2208.00932},
  timestamp  = {Wed, 10 Aug 2022 07:53:39 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2208-00932.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% ACL 2022 system-demonstration paper.
% The product name is braced as a whole word rather than letter by letter, and
% the capital after the hyphen is restored in Ben-David and Tian-Jian (the
% latter matches the spelling used in the T0 entry of this file).
% NOTE(review): Al-shaibani is left as exported; confirm the preferred casing.
@inproceedings{bigscience-bach-2022-promptsource,
  author    = {Bach, Stephen and
               Sanh, Victor and
               Yong, Zheng Xin and
               Webson, Albert and
               Raffel, Colin and
               Nayak, Nihal V. and
               Sharma, Abheesht and
               Kim, Taewoon and
               Bari, M Saiful and
               Fevry, Thibault and
               Alyafeai, Zaid and
               Dey, Manan and
               Santilli, Andrea and
               Sun, Zhiqing and
               Ben-David, Srulik and
               Xu, Canwen and
               Chhablani, Gunjan and
               Wang, Han and
               Fries, Jason and
               Al-shaibani, Maged and
               Sharma, Shanya and
               Thakker, Urmish and
               Almubarak, Khalid and
               Tang, Xiangru and
               Radev, Dragomir and
               Jiang, Mike Tian-Jian and
               Rush, Alexander},
  title     = {{PromptSource}: An Integrated Development Environment and Repository for Natural Language Prompts},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-demo.9},
  doi       = {10.18653/v1/2022.acl-demo.9},
  pages     = {93--104}
}
% arXiv preprint 2301.08488 (record exported from dblp).
@article{bigscience-ding-23-toward-openness-beyond-open-access,
  title      = {Towards Openness Beyond Open Access: User Journeys through 3 Open {AI} Collaboratives},
  author     = {Jennifer Ding and Christopher Akiki and Yacine Jernite and Anne Lee Steele and Temi Popo},
  journal    = {CoRR},
  volume     = {abs/2301.08488},
  year       = {2023},
  eprinttype = {arXiv},
  eprint     = {2301.08488},
  doi        = {10.48550/arXiv.2301.08488},
  url        = {https://doi.org/10.48550/arXiv.2301.08488},
  timestamp  = {Thu, 26 Jan 2023 15:26:31 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2301-08488.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% NeurIPS 2022 Datasets and Benchmarks Track paper (OpenReview record).
% {BigBio} is braced so sentence-casing styles keep its inner capital.
% "Villanova del Moral" is braced as one surname; unbraced, BibTeX treats the
% lowercase "del" as a von particle and takes only "Moral" as the last name.
@inproceedings{bigscience-fries-2022-bigbio,
  author    = {Jason Alan Fries and
               Leon Weber and
               Natasha Seelam and
               Gabriel Altay and
               Debajyoti Datta and
               Samuele Garda and
               Myungsun Kang and
               Ruisi Su and
               Wojciech Kusa and
               Samuel Cahyawijaya and
               Fabio Barth and
               Simon Ott and
               Matthias Samwald and
               Stephen Bach and
               Stella Biderman and
               Mario S{\"a}nger and
               Bo Wang and
               Alison Callahan and
               Daniel Le{\'o}n Peri{\~n}{\'a}n and
               Th{\'e}o Gigant and
               Patrick Haller and
               Jenny Chim and
               Jose David Posada and
               John Michael Giorgi and
               Karthik Rangasai Sivaraman and
               Marc P{\`a}mies and
               Marianna Nezhurina and
               Robert Martin and
               Michael Cullan and
               Moritz Freidank and
               Nathan Dahlberg and
               Shubhanshu Mishra and
               Shamik Bose and
               Nicholas Michio Broad and
               Yanis Labrak and
               Shlok S Deshmukh and
               Sid Kiblawi and
               Ayush Singh and
               Minh Chien Vu and
               Trishala Neeraj and
               Jonas Golde and
               {Villanova del Moral}, Albert and
               Benjamin Beilharz},
  title     = {{BigBio}: A Framework for Data-Centric Biomedical Natural Language Processing},
  booktitle = {Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
  year      = {2022},
  url       = {https://openreview.net/forum?id=8lQDn9zTQlW}
}
% Workshop paper (OpenReview record).
@inproceedings{bigscience-fries-2022-biomedical,
  author    = {Jason Alan Fries and
               Natasha Seelam and
               Gabriel Altay and
               Leon Weber and
               Myungsun Kang and
               Debajyoti Datta and
               Ruisi Su and
               Samuele Garda and
               Bo Wang and
               Simon Ott and
               Matthias Samwald and
               Wojciech Kusa},
  title     = {Dataset Debt in Biomedical Language Modeling},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=HRfzInfr8Z9}
}
% FAccT '22 paper (ACM Digital Library export).
% The page range uses the ASCII double hyphen rather than a raw Unicode en dash.
@inproceedings{bigscience-jernite-2022-governance,
  author    = {Jernite, Yacine and
               Nguyen, Huu and
               Biderman, Stella and
               Rogers, Anna and
               Masoud, Maraim and
               Danchev, Valentin and
               Tan, Samson and
               Luccioni, Alexandra Sasha and
               Subramani, Nishant and
               Johnson, Isaac and
               Dupont, Gerard and
               Dodge, Jesse and
               Lo, Kyle and
               Talat, Zeerak and
               Radev, Dragomir and
               Gokaslan, Aaron and
               Nikpoor, Somaieh and
               Henderson, Peter and
               Bommasani, Rishi and
               Mitchell, Margaret},
  title     = {Data Governance in the Age of Large-Scale Data-Driven Language Technology},
  year      = {2022},
  isbn      = {9781450393522},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3531146.3534637},
  doi       = {10.1145/3531146.3534637},
  abstract  = {The recent emergence and adoption of Machine Learning technology, and specifically of Large Language Models, has drawn attention to the need for systematic and transparent management of language data. This work proposes an approach to global language data governance that attempts to organize data management amongst stakeholders, values, and rights. Our proposal is informed by prior work on distributed governance that accounts for human values and grounded by an international research collaboration that brings together researchers and practitioners from 60 countries. The framework we present is a multi-party international governance structure focused on language data, and incorporating technical and organizational tools needed to support its work.},
  booktitle = {2022 ACM Conference on Fairness, Accountability, and Transparency},
  pages     = {2206--2222},
  numpages  = {17},
  keywords  = {technology governance, data rights, language data, datasets},
  location  = {Seoul, Republic of Korea},
  series    = {FAccT '22}
}
% NeurIPS 2022 Datasets and Benchmarks Track paper (OpenReview record).
% {BigScience}, {ROOTS} and {1.6TB} are braced as whole words so sentence-casing
% styles keep their capitals.
% Compound and particle surnames are given in "Last, First" form (braced where a
% lowercase particle sits inside the surname) so BibTeX does not split them:
% Villanova del Moral, Le Scao, von Werra, Ben Allal, De Toni, van Strien,
% Ortiz Suarez.
@inproceedings{bigscience-laurencon-roots:2022,
  author    = {Hugo Lauren{\c{c}}on and
               Lucile Saulnier and
               Thomas Wang and
               Christopher Akiki and
               {Villanova del Moral}, Albert and
               Le Scao, Teven and
               von Werra, Leandro and
               Chenghao Mou and
               Eduardo Gonz{\'a}lez Ponferrada and
               Huu Nguyen and
               J{\"o}rg Frohberg and
               Mario {\v{S}}a{\v{s}}ko and
               Quentin Lhoest and
               Angelina McMillan-Major and
               G{\'e}rard Dupont and
               Stella Biderman and
               Anna Rogers and
               Ben Allal, Loubna and
               De Toni, Francesco and
               Giada Pistilli and
               Olivier Nguyen and
               Somaieh Nikpoor and
               Maraim Masoud and
               Pierre Colombo and
               Javier de la Rosa and
               Paulo Villegas and
               Tristan Thrush and
               Shayne Longpre and
               Sebastian Nagel and
               Leon Weber and
               Manuel Romero Mu{\~n}oz and
               Jian Zhu and
               van Strien, Daniel and
               Zaid Alyafeai and
               Khalid Almubarak and
               Vu Minh Chien and
               Itziar Gonzalez-Dios and
               Aitor Soroa and
               Kyle Lo and
               Manan Dey and
               Ortiz Suarez, Pedro and
               Aaron Gokaslan and
               Shamik Bose and
               David Ifeoluwa Adelani and
               Long Phan and
               Hieu Tran and
               Ian Yu and
               Suhas Pai and
               Jenny Chim and
               Violette Lepercq and
               Suzana Ilic and
               Margaret Mitchell and
               Sasha Luccioni and
               Yacine Jernite},
  title     = {The {BigScience} {ROOTS} Corpus: A {1.6TB} Composite Multilingual Dataset},
  booktitle = {Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
  year      = {2022},
  url       = {https://openreview.net/forum?id=UoEw6KigkUn}
}
% arXiv preprint 2211.05100 (record exported from dblp).
% The truncated author list ends with the BibTeX keyword "others" so the style
% prints its own et-al marker; a literal "et al." is parsed as a person's name.
% {BLOOM} and {176B} are braced against sentence-casing, and the compound
% surnames Le Scao, Villanova del Moral and Ortiz Suarez are written so BibTeX
% keeps them whole.
@article{bigscience-le-scao-23-bloom,
  author     = {Le Scao, Teven and
                Angela Fan and
                Christopher Akiki and
                Ellie Pavlick and
                Suzana Ilic and
                Daniel Hesslow and
                Roman Castagn{\'{e}} and
                Alexandra Sasha Luccioni and
                Fran{\c{c}}ois Yvon and
                Matthias Gall{\'{e}} and
                Jonathan Tow and
                Alexander M. Rush and
                Stella Biderman and
                Albert Webson and
                Pawan Sasanka Ammanamanchi and
                Thomas Wang and
                Beno{\^{\i}}t Sagot and
                Niklas Muennighoff and
                {Villanova del Moral}, Albert and
                Olatunji Ruwase and
                Rachel Bawden and
                Stas Bekman and
                Angelina McMillan{-}Major and
                Iz Beltagy and
                Huu Nguyen and
                Lucile Saulnier and
                Samson Tan and
                Ortiz Suarez, Pedro and
                Victor Sanh and
                Hugo Lauren{\c{c}}on and
                Yacine Jernite and
                Julien Launay and
                Margaret Mitchell and
                Colin Raffel and
                Aaron Gokaslan and
                Adi Simhi and
                Aitor Soroa and
                Alham Fikri Aji and
                Amit Alfassy and
                Anna Rogers and
                Ariel Kreisberg Nitzav and
                Canwen Xu and
                Chenghao Mou and
                Chris Emezue and
                Christopher Klamm and
                Colin Leong and
                Daniel van Strien and
                David Ifeoluwa Adelani and
                others},
  title      = {{BLOOM}: {A} {176B}-Parameter Open-Access Multilingual Language Model},
  journal    = {CoRR},
  volume     = {abs/2211.05100},
  year       = {2022},
  url        = {https://doi.org/10.48550/arXiv.2211.05100},
  doi        = {10.48550/arXiv.2211.05100},
  eprinttype = {arXiv},
  eprint     = {2211.05100},
  timestamp  = {Wed, 07 Dec 2022 23:00:56 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2211-05100.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2201.10066.
% Accented names use LaTeX escapes like the rest of this file, the surname
% "Ortiz Suarez" is kept whole, {BigScience} is braced against sentence-casing,
% the repeated FOS keyword is listed once, and eprint fields are added to match
% the other arXiv records.
@misc{bigscience-mcmillan-major-2022-sourcing-catalogue,
  author     = {McMillan-Major, Angelina and
                Alyafeai, Zaid and
                Biderman, Stella and
                Chen, Kimbo and
                De Toni, Francesco and
                Dupont, G{\'e}rard and
                Elsahar, Hady and
                Emezue, Chris and
                Aji, Alham Fikri and
                Ili{\'c}, Suzana and
                Khamis, Nurulaqilla and
                Leong, Colin and
                Masoud, Maraim and
                Soroa, Aitor and
                Ortiz Suarez, Pedro and
                Talat, Zeerak and
                van Strien, Daniel and
                Jernite, Yacine},
  title      = {Documenting Geographically and Contextually Diverse Data Sources: The {BigScience} Catalogue of Language Data and Resources},
  publisher  = {arXiv},
  year       = {2022},
  doi        = {10.48550/arXiv.2201.10066},
  url        = {https://arxiv.org/abs/2201.10066},
  eprinttype = {arXiv},
  eprint     = {2201.10066},
  keywords   = {Computation and Language (cs.CL), Databases (cs.DB), FOS: Computer and information sciences},
  copyright  = {Creative Commons Attribution 4.0 International}
}
% arXiv preprint 2112.10508.
% Accented names use LaTeX escapes like the rest of this file, {NLP} is braced
% against sentence-casing, the repeated FOS keyword is listed once, and eprint
% fields are added to match the other arXiv records.
@misc{bigscience-mielke-2021-tokenization-survey,
  author     = {Mielke, Sabrina J. and
                Alyafeai, Zaid and
                Salesky, Elizabeth and
                Raffel, Colin and
                Dey, Manan and
                Gall{\'e}, Matthias and
                Raja, Arun and
                Si, Chenglei and
                Lee, Wilson Y. and
                Sagot, Beno{\^{\i}}t and
                Tan, Samson},
  title      = {Between words and characters: A Brief History of Open-Vocabulary Modeling and Tokenization in {NLP}},
  publisher  = {arXiv},
  year       = {2021},
  doi        = {10.48550/arXiv.2112.10508},
  url        = {https://arxiv.org/abs/2112.10508},
  eprinttype = {arXiv},
  eprint     = {2112.10508},
  keywords   = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
  copyright  = {arXiv.org perpetual, non-exclusive license}
}
% arXiv preprint 2302.14035 (record exported from dblp).
% {LLMs} is braced so sentence-casing styles do not lowercase the acronym.
@article{bigscience-piktus-23-roots-search-tool,
  author     = {Aleksandra Piktus and
                Christopher Akiki and
                Paulo Villegas and
                Hugo Lauren{\c{c}}on and
                G{\'{e}}rard Dupont and
                Alexandra Sasha Luccioni and
                Yacine Jernite and
                Anna Rogers},
  title      = {The {ROOTS} Search Tool: Data Transparency for {LLMs}},
  journal    = {CoRR},
  volume     = {abs/2302.14035},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2302.14035},
  doi        = {10.48550/arXiv.2302.14035},
  eprinttype = {arXiv},
  eprint     = {2302.14035},
  timestamp  = {Tue, 28 Feb 2023 14:02:05 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2302-14035.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% ICLR 2022 paper (OpenReview record).
% The doubled surname in "Shanya Sharma Sharma" is reduced to the spelling the
% other entries in this file use, and "Le Scao" is written in "Last, First"
% form so BibTeX keeps the two-word surname whole.
@inproceedings{bigscience-sanh-2022-multitask-t0,
  author    = {Victor Sanh and
               Albert Webson and
               Colin Raffel and
               Stephen Bach and
               Lintang Sutawika and
               Zaid Alyafeai and
               Antoine Chaffin and
               Arnaud Stiegler and
               Arun Raja and
               Manan Dey and
               M Saiful Bari and
               Canwen Xu and
               Urmish Thakker and
               Shanya Sharma and
               Eliza Szczechla and
               Taewoon Kim and
               Gunjan Chhablani and
               Nihal Nayak and
               Debajyoti Datta and
               Jonathan Chang and
               Mike Tian-Jian Jiang and
               Han Wang and
               Matteo Manica and
               Sheng Shen and
               Zheng Xin Yong and
               Harshit Pandey and
               Rachel Bawden and
               Thomas Wang and
               Trishala Neeraj and
               Jos Rozen and
               Abheesht Sharma and
               Andrea Santilli and
               Thibault Fevry and
               Jason Alan Fries and
               Ryan Teehan and
               Le Scao, Teven and
               Stella Biderman and
               Leo Gao and
               Thomas Wolf and
               Alexander M Rush},
  title     = {Multitask Prompted Training Enables Zero-Shot Task Generalization},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
  url       = {https://openreview.net/forum?id=9Vrb9D0WI4}
}
% Workshop paper (OpenReview record).
% The first author is written in "Last, First" form so BibTeX keeps the
% two-word surname "Le Scao" whole instead of treating "Le" as a given name.
@inproceedings{bigscience-scao-2022-million-gpu-hours,
  author    = {Le Scao, Teven and
               Thomas Wang and
               Daniel Hesslow and
               Lucile Saulnier and
               Stas Bekman and
               M Saiful Bari and
               Stella Biderman and
               Hady Elsahar and
               Jason Phang and
               Ofir Press and
               Colin Raffel and
               Victor Sanh and
               Sheng Shen and
               Lintang Sutawika and
               Jaesung Tae and
               Zheng Xin Yong and
               Julien Launay and
               Iz Beltagy},
  title     = {What Language Model to Train if You Have One Million {GPU} Hours?},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rI7BL3fHIZq}
}
% Workshop paper (OpenReview record).
@inproceedings{bigscience-talat-2022-multilingual-bias-evaluation-challenges,
  author    = {Zeerak Talat and
               Aur{\'e}lie N{\'e}v{\'e}ol and
               Stella Biderman and
               Miruna Clinciu and
               Manan Dey and
               Shayne Longpre and
               Sasha Luccioni and
               Maraim Masoud and
               Margaret Mitchell and
               Dragomir Radev and
               Shanya Sharma and
               Arjun Subramonian and
               Jaesung Tae and
               Samson Tan and
               Deepak Tunuguntla and
               Oskar van der Wal},
  title     = {You reap what you sow: On the Challenges of Bias Evaluation Under Multilingual Settings},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rK-7NhfSIW5}
}
% Workshop paper (OpenReview record).
@inproceedings{bigscience-teehan2022-emergent-structures-dynamics,
  author    = {Ryan Teehan and
               Miruna Clinciu and
               Oleg Serikov and
               Eliza Szczechla and
               Natasha Seelam and
               Shachar Mirkin and
               Aaron Gokaslan},
  title     = {Emergent Structures and Training Dynamics in Large Language Models},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=SbgL3zrIWc}
}
% Workshop paper (OpenReview record).
% {T0} is braced so sentence-casing styles do not lowercase the model name.
% "De Toni" and "van Strien" are written in "Last, First" form, matching the
% catalogue entry in this file, so BibTeX does not file "De"/"Van" under the
% given names.
@inproceedings{bigscience-toni-2022-historical-zero-shot,
  author    = {De Toni, Francesco and
               Christopher Akiki and
               Javier de la Rosa and
               Cl{\'e}mentine Fourrier and
               Enrique Manjavacas and
               Stefan Schweter and
               van Strien, Daniel},
  title     = {Entities, Dates, and Languages: Zero-Shot on Historical Texts with {T0}},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=BRzIS3GrIbc}
}
% ICML 2022 paper (PMLR volume 162).
% month uses the standard jul macro; the 17--23 day range from the export is
% carried in eventdate, a field that styles which do not know it simply ignore.
% "Le Scao" is written as a two-word surname instead of "Scao, Teven Le".
@inproceedings{bigscience-wang-2022-architecture-pretraining-generalization,
  author    = {Wang, Thomas and
               Roberts, Adam and
               Hesslow, Daniel and
               Le Scao, Teven and
               Chung, Hyung Won and
               Beltagy, Iz and
               Launay, Julien and
               Raffel, Colin},
  title     = {What Language Model Architecture and Pretraining Objective Works Best for Zero-Shot Generalization?},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  pages     = {22964--22984},
  year      = {2022},
  editor    = {Chaudhuri, Kamalika and
               Jegelka, Stefanie and
               Song, Le and
               Szepesvari, Csaba and
               Niu, Gang and
               Sabato, Sivan},
  volume    = {162},
  series    = {Proceedings of Machine Learning Research},
  month     = jul,
  eventdate = {2022-07-17/2022-07-23},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v162/wang22u/wang22u.pdf},
  url       = {https://proceedings.mlr.press/v162/wang22u.html}
}
% Workshop paper (OpenReview record).
% {BigScience} is braced so sentence-casing styles keep its inner capital.
@inproceedings{bigscience-yong-2022-adapting-bigscience-multilingual-unseen,
  author    = {Zheng Xin Yong and
               Vassilina Nikoulina},
  title     = {Adapting {BigScience} Multilingual Model to Unseen Languages},
  booktitle = {Challenges {\&} Perspectives in Creating Large Language Models},
  year      = {2022},
  url       = {https://openreview.net/forum?id=rL7mI3GSIbq}
}