diff --git a/README.md b/README.md index ab1816dba5645bfe297c61116dcc58bd9dbc4c33..b5e073f8039ba7dc40966cd8f6fdfe1939d1b053 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ - [175b_samples.jsonl](175b_samples.jsonl) - Unconditional, unfiltered 2048 token samples from GPT-3 with p=.85, t=1.  **CONTENT WARNING:** GPT-3 was trained on arbitrary data from the web, so may contain offensive content and language. - [data](data) - Synthetic datasets for word scramble and arithmetic tasks described in the paper. +- [dataset_statistics](dataset_statistics) - Statistics for all languages included in the training dataset mix. - [overlap_frequency.md](overlap_frequency.md) - Samples of 13-gram overlaps between our training data and benchmarks, selected by frequency in the training set. diff --git a/dataset_statistics/README.md b/dataset_statistics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f94ebd3330994b3a9f28588902793e4b856e167 --- /dev/null +++ b/dataset_statistics/README.md @@ -0,0 +1,9 @@ +# Dataset Language Statistics + +We provide statistics about the relative and absolute prevalence of different languages in the dataset mix used during training of GPT-3. + +The concepts of "characters" and "words" can have different meanings in different languages, so any effort to count is imperfect, but our hope is that this provides helpful information to our readers nonetheless. To help support a wide variety of downstream analyses, we provide language-level summary counts broken down at the Unicode-character level, whitespace-delimited word level, and document level. 
+ +- [Languages by character count](languages_by_character_count.csv) +- [Languages by word count](languages_by_word_count.csv) +- [Languages by document count](languages_by_document_count.csv) diff --git a/dataset_statistics/languages_by_character_count.csv b/dataset_statistics/languages_by_character_count.csv new file mode 100644 index 0000000000000000000000000000000000000000..ea3ec7f63a89fdb6415b58359bac934b755615f8 --- /dev/null +++ b/dataset_statistics/languages_by_character_count.csv @@ -0,0 +1,119 @@ +language,number of characters,percentage of total characters +en,1051665177484,92.09864% +fr,20309400904,1.77858% +de,19136098380,1.67583% +es,9007559288,0.78883% +it,7322862470,0.64129% +pt,6203099243,0.54323% +nl,4049596619,0.35464% +ru,2562941612,0.22445% +pl,2108747016,0.18467% +ro,1893347238,0.16581% +ja,1839624833,0.16110% +fi,1833334362,0.16055% +zh,1828425488,0.16012% +da,1378441965,0.12072% +sv,1375230893,0.12043% +no,1308235397,0.11457% +hu,915478070,0.08017% +cs,895960781,0.07846% +tr,822088508,0.07199% +id,788762151,0.06908% +hr,648782481,0.05682% +el,391298494,0.03427% +vi,385194550,0.03373% +ar,347863128,0.03046% +sr,329047895,0.02882% +zh-Hant,307203079,0.02690% +ca,200204951,0.01753% +sk,178526882,0.01563% +sl,168541068,0.01476% +ko,166606790,0.01459% +et,155847809,0.01365% +th,132290925,0.01159% +uk,106674892,0.00934% +lv,95524752,0.00837% +lt,93295278,0.00817% +fa,89917051,0.00787% +ms,86819698,0.00760% +bs,82688510,0.00724% +iw,77574561,0.00679% +is,74815624,0.00655% +sq,53984529,0.00473% +tl,50598813,0.00443% +jw,44370802,0.00389% +gl,42190530,0.00369% +bg,36982817,0.00324% +hi,32973377,0.00289% +af,31596385,0.00277% +cy,18583807,0.00163% +az,17544167,0.00154% +ta,16969536,0.00149% +ga,16333324,0.00143% +mr,10908648,0.00096% +uz,10520330,0.00092% +ml,9846956,0.00086% +bn,9476525,0.00083% +rw,7769601,0.00068% +ceb,7637612,0.00067% +kn,7247885,0.00063% +eu,7240696,0.00063% +ka,6892199,0.00060% +my,6872846,0.00060% +ku,6141015,0.00054% 
+hy,5852081,0.00051% +te,5459303,0.00048% +kk,5261833,0.00046% +gd,4730222,0.00041% +mt,4526552,0.00040% +sw,3751233,0.00033% +be,3550142,0.00031% +ne,3520611,0.00031% +km,3339136,0.00029% +mk,3315574,0.00029% +ur,3304237,0.00029% +st,3288532,0.00029% +mg,3146474,0.00028% +ht,2499304,0.00022% +pa,2396522,0.00021% +si,2349140,0.00021% +lo,2070101,0.00018% +lg,1721003,0.00015% +gu,1673043,0.00015% +su,1369411,0.00012% +tg,1185110,0.00010% +yi,1134806,0.00010% +ny,1098953,0.00010% +hmn,892319,0.00008% +ky,686551,0.00006% +or,431618,0.00004% +dv,341091,0.00003% +xx-Goth,267028,0.00002% +xx-Runr,235117,0.00002% +iu,208741,0.00002% +xx-Qaai,182333,0.00002% +bh,151835,0.00001% +syr,94109,0.00001% +mn,46819,0.00000% +xx-Phnx,27086,0.00000% +xx-Tfng,24878,0.00000% +xx-Egyp,20079,0.00000% +xx-Nkoo,17216,0.00000% +xx-Cakm,14832,0.00000% +chr,12933,0.00000% +xx-Yiii,9717,0.00000% +xx-Java,7504,0.00000% +xx-Dsrt,7330,0.00000% +xx-Phag,6847,0.00000% +xx-Tavt,5331,0.00000% +xx-Copt,3960,0.00000% +xx-Glag,3811,0.00000% +xx-Olck,3736,0.00000% +xx-Sund,3095,0.00000% +xx-Shaw,2836,0.00000% +xx-Samr,1529,0.00000% +xx-Avst,941,0.00000% +xx-Bopo,894,0.00000% +xx-Ogam,558,0.00000% +xx-Linb,431,0.00000% +xx-Cham,244,0.00000% \ No newline at end of file diff --git a/dataset_statistics/languages_by_document_count.csv b/dataset_statistics/languages_by_document_count.csv new file mode 100644 index 0000000000000000000000000000000000000000..2764009355b7ceb139f9f2649c58f65b7c6e2da7 --- /dev/null +++ b/dataset_statistics/languages_by_document_count.csv @@ -0,0 +1,119 @@ +language,number of documents,percentage of total documents +en,235987420,93.68882% +de,3014597,1.19682% +fr,2568341,1.01965% +pt,1608428,0.63856% +it,1456350,0.57818% +es,1284045,0.50978% +nl,934788,0.37112% +pl,632959,0.25129% +ja,619582,0.24598% +da,396477,0.15740% +no,379239,0.15056% +ro,320256,0.12714% +fi,315228,0.12515% +zh,292976,0.11631% +ru,289121,0.11478% +cs,243802,0.09679% +sv,161516,0.06412% +hu,149584,0.05939% 
+zh-Hant,107588,0.04271% +id,104437,0.04146% +hr,100384,0.03985% +tr,91414,0.03629% +ca,80899,0.03212% +vi,69147,0.02745% +sl,66333,0.02633% +et,56643,0.02249% +sk,52826,0.02097% +ko,48852,0.01939% +el,44378,0.01762% +sr,41553,0.01650% +th,41301,0.01640% +ar,36275,0.01440% +ms,23184,0.00920% +sq,21796,0.00865% +bs,19932,0.00791% +fa,17863,0.00709% +lt,17756,0.00705% +lv,17698,0.00703% +gl,17226,0.00684% +uk,14418,0.00572% +tl,13657,0.00542% +af,11521,0.00457% +hi,11426,0.00454% +iw,10172,0.00404% +is,9140,0.00363% +bg,8024,0.00319% +ga,7518,0.00298% +cy,5047,0.00200% +ta,4252,0.00169% +ml,4138,0.00164% +rw,3759,0.00149% +mr,3460,0.00137% +az,3265,0.00130% +ka,2780,0.00110% +sw,2725,0.00108% +uz,2659,0.00106% +bn,2655,0.00105% +gd,2456,0.00098% +ku,2274,0.00090% +my,2147,0.00085% +eu,2020,0.00080% +te,1574,0.00062% +ny,1523,0.00060% +st,1402,0.00056% +mg,1386,0.00055% +km,1313,0.00052% +mk,1244,0.00049% +hy,1222,0.00049% +ur,1213,0.00048% +kn,1189,0.00047% +si,968,0.00038% +pa,938,0.00037% +ne,875,0.00035% +ht,809,0.00032% +mt,746,0.00030% +kk,682,0.00027% +ceb,648,0.00026% +jw,648,0.00026% +lg,613,0.00024% +lo,555,0.00022% +gu,545,0.00022% +be,542,0.00022% +su,480,0.00019% +yi,365,0.00014% +hmn,313,0.00012% +or,193,0.00008% +ky,140,0.00006% +dv,114,0.00005% +tg,94,0.00004% +bh,90,0.00004% +iu,58,0.00002% +syr,47,0.00002% +mn,11,0.00000% +xx-Goth,6,0.00000% +xx-Copt,4,0.00000% +xx-Glag,4,0.00000% +xx-Qaai,4,0.00000% +xx-Runr,3,0.00000% +xx-Yiii,3,0.00000% +chr,3,0.00000% +xx-Shaw,3,0.00000% +xx-Sund,2,0.00000% +xx-Olck,2,0.00000% +xx-Dsrt,2,0.00000% +xx-Nkoo,2,0.00000% +xx-Tfng,1,0.00000% +xx-Java,1,0.00000% +xx-Tavt,0,0.00000% +xx-Bopo,0,0.00000% +xx-Samr,0,0.00000% +xx-Avst,0,0.00000% +xx-Ogam,0,0.00000% +xx-Egyp,0,0.00000% +xx-Linb,0,0.00000% +xx-Cham,0,0.00000% +xx-Cakm,0,0.00000% +xx-Phnx,0,0.00000% +xx-Phag,0,0.00000% \ No newline at end of file diff --git a/dataset_statistics/languages_by_word_count.csv b/dataset_statistics/languages_by_word_count.csv new 
file mode 100644 index 0000000000000000000000000000000000000000..dceebb4335d31fcaf2fe3b23e6dd129dc67471f5 --- /dev/null +++ b/dataset_statistics/languages_by_word_count.csv @@ -0,0 +1,119 @@ +language,number of words,percentage of total words +en,181014683608,92.64708% +fr,3553061536,1.81853% +de,2870869396,1.46937% +es,1510070974,0.77289% +it,1187784217,0.60793% +pt,1025413869,0.52483% +nl,669055061,0.34244% +ru,368157074,0.18843% +ro,308182352,0.15773% +pl,303812362,0.15550% +fi,221644679,0.11344% +da,221551540,0.11339% +sv,220920577,0.11307% +ja,217047918,0.11109% +no,212193299,0.10860% +zh,193517396,0.09905% +cs,139918438,0.07161% +hu,127224375,0.06512% +id,116930321,0.05985% +tr,116141938,0.05944% +hr,101613675,0.05201% +vi,83077650,0.04252% +el,61607673,0.03153% +ar,60839973,0.03114% +sr,52875283,0.02706% +zh-Hant,38583893,0.01975% +ca,35126650,0.01798% +ko,33147663,0.01697% +sk,27957963,0.01431% +th,26806557,0.01372% +sl,26037337,0.01333% +et,20718080,0.01060% +fa,16731301,0.00856% +iw,15027640,0.00769% +uk,14905898,0.00763% +ms,13389340,0.00685% +lv,13290098,0.00680% +bs,13160941,0.00674% +lt,12921255,0.00661% +is,12792837,0.00655% +hi,9434632,0.00483% +sq,9253803,0.00474% +tl,8650331,0.00443% +gl,6947527,0.00356% +jw,6604056,0.00338% +bg,5919807,0.00303% +af,5461216,0.00280% +ta,5163171,0.00264% +mr,3660217,0.00187% +cy,3459671,0.00177% +ml,3227746,0.00165% +bn,3003033,0.00154% +ga,2878943,0.00147% +az,2496202,0.00128% +kn,1913389,0.00098% +my,1853421,0.00095% +te,1638366,0.00084% +uz,1458861,0.00075% +rw,1430208,0.00073% +ceb,1329456,0.00068% +ne,1120450,0.00057% +ku,1091032,0.00056% +eu,1048905,0.00054% +km,1041164,0.00053% +ka,924256,0.00047% +gd,841970,0.00043% +hy,840171,0.00043% +mt,748610,0.00038% +si,708343,0.00036% +pa,703086,0.00036% +ur,689768,0.00035% +kk,670231,0.00034% +sw,585858,0.00030% +st,538257,0.00028% +be,533405,0.00027% +mk,529413,0.00027% +mg,507043,0.00026% +gu,494798,0.00025% +lo,449476,0.00023% +ht,430911,0.00022% 
+lg,261217,0.00013% +yi,227609,0.00012% +tg,210167,0.00011% +su,208819,0.00011% +hmn,175972,0.00009% +ny,161994,0.00008% +or,131688,0.00007% +dv,112819,0.00006% +ky,91289,0.00005% +bh,48094,0.00002% +xx-Goth,48025,0.00002% +xx-Runr,37558,0.00002% +iu,31142,0.00002% +syr,21482,0.00001% +mn,7779,0.00000% +xx-Phnx,4343,0.00000% +xx-Qaai,4185,0.00000% +xx-Egyp,3395,0.00000% +xx-Nkoo,3338,0.00000% +xx-Tfng,3277,0.00000% +xx-Cakm,2608,0.00000% +xx-Yiii,2357,0.00000% +chr,2315,0.00000% +xx-Phag,1750,0.00000% +xx-Tavt,1622,0.00000% +xx-Dsrt,1504,0.00000% +xx-Java,1448,0.00000% +xx-Sund,780,0.00000% +xx-Copt,707,0.00000% +xx-Glag,673,0.00000% +xx-Olck,573,0.00000% +xx-Shaw,542,0.00000% +xx-Samr,313,0.00000% +xx-Avst,213,0.00000% +xx-Bopo,188,0.00000% +xx-Linb,156,0.00000% +xx-Ogam,84,0.00000% +xx-Cham,49,0.00000% \ No newline at end of file