dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:05:08Z |
dc.date.available | 2017-06-07T13:05:08Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2203 |
dc.description | A large web corpus (over 10 billion tokens) in 50+ languages, licensed under the Creative Commons license family, that has been extracted from CommonCrawl, the largest publicly available general Web crawl to date, with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | ben |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | guj |
dc.language.iso | heb |
dc.language.iso | hin |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kan |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | mal |
dc.language.iso | mar |
dc.language.iso | mkd |
dc.language.iso | nep |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pan |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | sqi |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tam |
dc.language.iso | tel |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | urd |
dc.language.iso | vie |
dc.language.iso | zho |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Creative Commons - Attribution 4.0 International (CC BY 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by/4.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (CC-BY part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 6619701998 |
files.count | 59 |
Files in this item
This item is
Creative Commons - Attribution 4.0 International (CC BY 4.0)
Publicly Available
and licensed under: Creative Commons - Attribution 4.0 International (CC BY 4.0)
- Name
- Lic_by_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Size
- 3.17 MB
- Format
- application/x-gzip
- MD5
- 111816dd96b40a70befa75fccde98452
- Name
- Lic_by_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 22.09 MB
- Format
- application/x-gzip
- MD5
- 2cc0821b1d076d9aa605dfa01199f2b1
- Name
- Lic_by_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Size
- 2.93 MB
- Format
- application/x-gzip
- MD5
- 6212665da332e9ac227edb72708290aa
- Name
- Lic_by_Lang_bn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 1.23 MB
- Format
- application/x-gzip
- MD5
- d410193b2a7c18ccec676e8d873baa60
- Name
- Lic_by_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 2.89 MB
- Format
- application/x-gzip
- MD5
- 1c71d569e6052f845394fae1da9bc54b
- Name
- Lic_by_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 1.89 MB
- Format
- application/x-gzip
- MD5
- 4d2794a456a157546baeea17f8fb46fe
- Name
- Lic_by_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 46.29 MB
- Format
- application/x-gzip
- MD5
- c61354e4f6d31b2a348e7a6d62bdfd4e
- Name
- Lic_by_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 20.64 MB
- Format
- application/x-gzip
- MD5
- 027a67b1c2803dd2dc928dbfce8706e0
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 953.68 MB
- Format
- application/x-gzip
- MD5
- d93272a97fe99183b10985cdd286010a
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00001.warc.gz
- Size
- 953.7 MB
- Format
- application/x-gzip
- MD5
- 5a6aca26d0accb7ec55b63daf2d51178
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00002.warc.gz
- Size
- 953.71 MB
- Format
- application/x-gzip
- MD5
- 82ea7b49f30616ef0e1e537a8760a4ef
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00003.warc.gz
- Size
- 953.71 MB
- Format
- application/x-gzip
- MD5
- 3dfe551080b1122806daea637eb85a39
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00004.warc.gz
- Size
- 953.71 MB
- Format
- application/x-gzip
- MD5
- 09aca35e9a163ccc553df68ea262260d
- Name
- Lic_by_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00005.warc.gz
- Size
- 196.67 MB
- Format
- application/x-gzip
- MD5
- 287777260ef4368908a3140b3ea95892
- Name
- Lic_by_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 379.26 MB
- Format
- application/x-gzip
- MD5
- 2719d16e7016ab0331a8139819fd3830
- Name
- Lic_by_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 3 MB
- Format
- application/x-gzip
- MD5
- a9a098410e1d7c537dfed6469c94fcb9
- Name
- Lic_by_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 5.02 MB
- Format
- application/x-gzip
- MD5
- 94b77326ad37a49b9a75abc5c46b5c23
- Name
- Lic_by_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 2.4 MB
- Format
- application/x-gzip
- MD5
- 85eb5a31cbf9c493f54b9b3d7218033c
- Name
- Lic_by_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 45.48 MB
- Format
- application/x-gzip
- MD5
- 9f726a63bf773211e696fe9f80595ea6
- Name
- Lic_by_Lang_gu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 13.11 KB
- Format
- application/x-gzip
- MD5
- 357d1181d80734f3c4dc533aa72acc4d
- Name
- Lic_by_Lang_he_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 627.55 KB
- Format
- application/x-gzip
- MD5
- 321577f81435e87f4f0b4248b555f146
- Name
- Lic_by_Lang_hi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 978.73 KB
- Format
- application/x-gzip
- MD5
- e68ae849b4dbede70f873834d13e9172
- Name
- Lic_by_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 20.72 MB
- Format
- application/x-gzip
- MD5
- 3da8a763126bb0be3ba9f8a150ea7e0e
- Name
- Lic_by_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 3.02 MB
- Format
- application/x-gzip
- MD5
- 8fab83723f2ff538ee72aebe2fc1f476
- Name
- Lic_by_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Size
- 424.23 MB
- Format
- application/x-gzip
- MD5
- 16ed0f576a1ee4e86b1120784709293a
- Name
- Lic_by_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 53.22 MB
- Format
- application/x-gzip
- MD5
- 6fbce93f927d37650c05e9592bfc1839
- Name
- Lic_by_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 1.71 MB
- Format
- application/x-gzip
- MD5
- 2493ee68787dc39f86146ab6c93ae9cb
- Name
- Lic_by_Lang_kn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 20.84 KB
- Format
- application/x-gzip
- MD5
- 19397cb6b6c9b00d2619ef6ae810c189
- Name
- Lic_by_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 4.1 MB
- Format
- application/x-gzip
- MD5
- 77517e1332882bb1d73202f4766a8ae5
- Name
- Lic_by_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 973.51 KB
- Format
- application/x-gzip
- MD5
- e76f1fa1397ce9f0b9765b7b341b3a9d
- Name
- Lic_by_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 618.12 KB
- Format
- application/x-gzip
- MD5
- bf5aa6e228c3132c4f77a8a92e21176c
- Name
- Lic_by_Lang_mk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 238.76 KB
- Format
- application/x-gzip
- MD5
- af488917d143cae315d6281b9be4898b
- Name
- Lic_by_Lang_ml_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 680.6 KB
- Format
- application/x-gzip
- MD5
- 8eb8f85ea3048d9dce829bc2c9a976a8
- Name
- Lic_by_Lang_mr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 449.68 KB
- Format
- application/x-gzip
- MD5
- 2dba1acef7484c0b110663169ae119ce
- Name
- Lic_by_Lang_ne_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 344.14 KB
- Format
- application/x-gzip
- MD5
- 3554778460b0db54c90be51a81afee85
- Name
- Lic_by_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 9.71 MB
- Format
- application/x-gzip
- MD5
- 62bae3143c2aea4ad247bcfe96004715
- Name
- Lic_by_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 3.16 MB
- Format
- application/x-gzip
- MD5
- c741ad2e83a3fa1427379f6c96f0589b
- Name
- Lic_by_Lang_pa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 12.71 KB
- Format
- application/x-gzip
- MD5
- 0f61077b74bd5ab04d2b0b321e1054cc
- Name
- Lic_by_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 7.32 MB
- Format
- application/x-gzip
- MD5
- a914dae4c8fbcad69b4b77b382bca032
- Name
- Lic_by_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 122.16 MB
- Format
- application/x-gzip
- MD5
- 4fcbe595e921463efc4c59390188076c
- Name
- Lic_by_Lang_ro_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 8.99 MB
- Format
- application/x-gzip
- MD5
- 9969403ac99b887f6ddcdd8f1a44e2c5
- Name
- Lic_by_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 26.83 MB
- Format
- application/x-gzip
- MD5
- e43b5b9d57a7f99522cb5f806b13cd4d
- Name
- Lic_by_Lang_sk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 914.81 KB
- Format
- application/x-gzip
- MD5
- 26962b86a4b39863ba56ce99515480de
- Name
- Lic_by_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 960.84 KB
- Format
- application/x-gzip
- MD5
- e83931d5afc4dc0f0f65158d231c35fe
- Name
- Lic_by_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 331.17 KB
- Format
- application/x-gzip
- MD5
- 044a9a533bb070fafc4875ab3906ed4d
- Name
- Lic_by_Lang_sq_NoBoilerplate_true_MinHtml_true-r-00020.seg-00000.warc.gz
- Size
- 2.46 MB
- Format
- application/x-gzip
- MD5
- 83d8025d1cf15ee02b696a2ae8e6e540
- Name
- Lic_by_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 6.56 MB
- Format
- application/x-gzip
- MD5
- 47c3b6de2c889d9bcb2021737d909b9c
- Name
- Lic_by_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 1.22 MB
- Format
- application/x-gzip
- MD5
- 839baebeeda119cec41683375eb5c17b
- Name
- Lic_by_Lang_ta_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 6.42 MB
- Format
- application/x-gzip
- MD5
- 6cbd88db733a7d253a0f57e627a542ae
- Name
- Lic_by_Lang_te_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 405.26 KB
- Format
- application/x-gzip
- MD5
- 3a5c412c4e239cad8ec568d8d8c37704
- Name
- Lic_by_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Size
- 14.19 MB
- Format
- application/x-gzip
- MD5
- c287743001edcb9fe9e73cf10caf7939
- Name
- Lic_by_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 2.78 MB
- Format
- application/x-gzip
- MD5
- f1d4af461d317d48c97af0ebf13b9268
- Name
- Lic_by_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 14.87 MB
- Format
- application/x-gzip
- MD5
- b58d3d0e76fad22c2b202a56a6e69bbb
- Name
- Lic_by_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 946.32 KB
- Format
- application/x-gzip
- MD5
- 56dd2ac265c1b55ec6703f07e9ff9310
- Name
- Lic_by_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 49.16 MB
- Format
- application/x-gzip
- MD5
- 84c05cfbbd0446087b21cd8e85d45e50
- Name
- Lic_by_Lang_ur_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 167.68 KB
- Format
- application/x-gzip
- MD5
- ffadc233ca24f76de62a7308d89ae31b
- Name
- Lic_by_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 16.98 MB
- Format
- application/x-gzip
- MD5
- 0a47e8b1a2966b7bb00a83bf57c29254
- Name
- Lic_by_Lang_zh-cn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 3.14 MB
- Format
- application/x-gzip
- MD5
- 09850798260c807f2410ca40382887f6
- Name
- Lic_by_Lang_zh-tw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 150.57 KB
- Format
- application/x-gzip
- MD5
- 1d9848c18323f2f2add9c28d9a743e7a