dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:06:37Z |
dc.date.available | 2017-06-07T13:06:37Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2204 |
dc.description | A large web corpus (over 10 billion tokens) in 50+ languages, licensed under the Creative Commons license family and extracted from CommonCrawl, the largest publicly available general Web crawl to date, with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | ben |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | heb |
dc.language.iso | hin |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kan |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | mal |
dc.language.iso | mkd |
dc.language.iso | nep |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pan |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | sqi |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tam |
dc.language.iso | tel |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | vie |
dc.language.iso | zho |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Creative Commons - Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-nc/4.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (CC BY-NC part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 1508239534 |
files.count | 52 |
Files in this item
This item is
Creative Commons - Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Publicly Available
and licensed under: Creative Commons - Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
- Name
- Lic_by-nc_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Size
- 13.55 KB
- Format
- application/x-gzip
- MD5
- 78d26cdef7bc599a49ca8eecba13f3a2
- Name
- Lic_by-nc_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 937.25 KB
- Format
- application/x-gzip
- MD5
- dfe8bac91ae9206fb3115bf44884599a
- Name
- Lic_by-nc_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Size
- 335.68 KB
- Format
- application/x-gzip
- MD5
- a7f9bc0c8d817dcabbb42cbd850e6721
- Name
- Lic_by-nc_Lang_bn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 12.78 KB
- Format
- application/x-gzip
- MD5
- c10e60c63adf09390b81187bcd6c43e1
- Name
- Lic_by-nc_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 1.89 MB
- Format
- application/x-gzip
- MD5
- 9f7eb479d129e660497bbd9c751a1456
- Name
- Lic_by-nc_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 514.73 KB
- Format
- application/x-gzip
- MD5
- a647c06d398be86546db53ab40c4fe5a
- Name
- Lic_by-nc_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 10.88 MB
- Format
- application/x-gzip
- MD5
- 9e911adb2ef0469ea99a996ac466767b
- Name
- Lic_by-nc_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 1.62 MB
- Format
- application/x-gzip
- MD5
- a6ecae8453cefbefbb5b95e386dcb65d
- Name
- Lic_by-nc_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 953.7 MB
- Format
- application/x-gzip
- MD5
- 5b33ad2dc94b990922acf90159018bc1
- Name
- Lic_by-nc_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00001.warc.gz
- Size
- 49.66 MB
- Format
- application/x-gzip
- MD5
- ec9c70eaa8925085dcd15ad667d34b9e
- Name
- Lic_by-nc_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Size
- 252.03 MB
- Format
- application/x-gzip
- MD5
- be91b4768d9ef7a0d0a57c6d056f110a
- Name
- Lic_by-nc_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 248.87 KB
- Format
- application/x-gzip
- MD5
- 06891f86e24b53ff3cd30be1ad32a86b
- Name
- Lic_by-nc_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 350.26 KB
- Format
- application/x-gzip
- MD5
- e8f875c273fa368de5cd835a4713183e
- Name
- Lic_by-nc_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 478.4 KB
- Format
- application/x-gzip
- MD5
- 93f6c41913ecef22169d807632028fa6
- Name
- Lic_by-nc_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 17.08 MB
- Format
- application/x-gzip
- MD5
- b66ccdaf21aa2adaa82f115061296b09
- Name
- Lic_by-nc_Lang_he_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 601.71 KB
- Format
- application/x-gzip
- MD5
- 5182a10e9cb434104e949e3e7f8eaf3e
- Name
- Lic_by-nc_Lang_hi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 129.82 KB
- Format
- application/x-gzip
- MD5
- 5b9bebc69ccc925485259f095d05bf65
- Name
- Lic_by-nc_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 1.51 MB
- Format
- application/x-gzip
- MD5
- 8162986d4fe4f977e30db78962e6c1ec
- Name
- Lic_by-nc_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 698.36 KB
- Format
- application/x-gzip
- MD5
- fc6cd4ffd704506cfa6e6148ffb31208
- Name
- Lic_by-nc_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Size
- 5.96 MB
- Format
- application/x-gzip
- MD5
- 18c3dbff2214a9fcf52972bdf3efcf0f
- Name
- Lic_by-nc_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 20.68 MB
- Format
- application/x-gzip
- MD5
- c73a0895c251e1536355d8155c07073c
- Name
- Lic_by-nc_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 521.07 KB
- Format
- application/x-gzip
- MD5
- 2d467dcf00fc6b73519c7ecbcb506aa7
- Name
- Lic_by-nc_Lang_kn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 44.81 KB
- Format
- application/x-gzip
- MD5
- 6e7d0fde85b75381107fa14d2cfbe696
- Name
- Lic_by-nc_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 2.03 MB
- Format
- application/x-gzip
- MD5
- 96425614df668737e3aeb0e5e50ab6c2
- Name
- Lic_by-nc_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 229.15 KB
- Format
- application/x-gzip
- MD5
- e7f35c1e01244b55dfe78c3c4e3e3f8c
- Name
- Lic_by-nc_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 46.39 KB
- Format
- application/x-gzip
- MD5
- 189559ee67c7bb83e0f065ec069aadc9
- Name
- Lic_by-nc_Lang_mk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 75.68 KB
- Format
- application/x-gzip
- MD5
- c75393224dbd8ddd4a238d4be5fb2a20
- Name
- Lic_by-nc_Lang_ml_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 15.98 KB
- Format
- application/x-gzip
- MD5
- 891c4ce630fb3f602f7cc8429205b516
- Name
- Lic_by-nc_Lang_ne_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 70.01 KB
- Format
- application/x-gzip
- MD5
- 65b12b8f0baf41d17bc2eb818e317072
- Name
- Lic_by-nc_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 2.73 MB
- Format
- application/x-gzip
- MD5
- adf300960b1963b2ae673ca4fc05d491
- Name
- Lic_by-nc_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 490.74 KB
- Format
- application/x-gzip
- MD5
- 79271fb1e6649f60f82d16569df54749
- Name
- Lic_by-nc_Lang_pa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 1.33 KB
- Format
- application/x-gzip
- MD5
- 47bea29e1fd9a83a8e0f741c3f687d6b
- Name
- Lic_by-nc_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 941.07 KB
- Format
- application/x-gzip
- MD5
- 3be442fb31376b1d2a8f70a969353780
- Name
- Lic_by-nc_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Size
- 91.79 MB
- Format
- application/x-gzip
- MD5
- 26146627cbcd72cc725871f88f45b82c
- Name
- Lic_by-nc_Lang_ro_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 905.45 KB
- Format
- application/x-gzip
- MD5
- d781ddb2afdb53f2c9b103beeb41860e
- Name
- Lic_by-nc_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Size
- 651.44 KB
- Format
- application/x-gzip
- MD5
- 8f5a5b8318ae43bd2dd8ccfabc26560c
- Name
- Lic_by-nc_Lang_sk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 498.81 KB
- Format
- application/x-gzip
- MD5
- 54eaf233a9bfabca8e706d29c0c6156a
- Name
- Lic_by-nc_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 306.11 KB
- Format
- application/x-gzip
- MD5
- 812cb4ac3abed2fbe1cedc933384c97a
- Name
- Lic_by-nc_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Size
- 47.82 KB
- Format
- application/x-gzip
- MD5
- a555f2ff3a0477f716ac38fddc0a4d7d
- Name
- Lic_by-nc_Lang_sq_NoBoilerplate_true_MinHtml_true-r-00020.seg-00000.warc.gz
- Size
- 1.66 MB
- Format
- application/x-gzip
- MD5
- e904baca51a0c44506afbb7791cae630
- Name
- Lic_by-nc_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Size
- 1.01 MB
- Format
- application/x-gzip
- MD5
- b8b2b4aac16764dabe65b37154f3a520
- Name
- Lic_by-nc_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 3.67 KB
- Format
- application/x-gzip
- MD5
- 852c4838a8a5de9e4a8507de6d592fb5
- Name
- Lic_by-nc_Lang_ta_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Size
- 6.09 KB
- Format
- application/x-gzip
- MD5
- c75370fd8dddfc231b8e86e945ec9a4d
- Name
- Lic_by-nc_Lang_te_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Size
- 10.67 KB
- Format
- application/x-gzip
- MD5
- 811dde70288d12ea19d2599a9d65a8f4
- Name
- Lic_by-nc_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Size
- 2.12 MB
- Format
- application/x-gzip
- MD5
- 1516a4e890710aac7c9495ece9eda61a
- Name
- Lic_by-nc_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Size
- 389.01 KB
- Format
- application/x-gzip
- MD5
- 309cf55349abc8a8a27bcfc79ed69d07
- Name
- Lic_by-nc_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Size
- 1.22 MB
- Format
- application/x-gzip
- MD5
- 6ab491029966b3d070b961a218508d5d
- Name
- Lic_by-nc_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Size
- 36.98 KB
- Format
- application/x-gzip
- MD5
- 1126dfbdf6852d2f223f7d4fd41ecc49
- Name
- Lic_by-nc_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 9.59 MB
- Format
- application/x-gzip
- MD5
- 32d33b917ed99d268396702b6cbe1349
- Name
- Lic_by-nc_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Size
- 1.5 MB
- Format
- application/x-gzip
- MD5
- bf85039f32163c2e58a403a1093ae627
- Name
- Lic_by-nc_Lang_zh-cn_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Size
- 130.08 KB
- Format
- application/x-gzip
- MD5
- 6499c8a03a02be39bcf153fd641975dc
- Name
- Lic_by-nc_Lang_zh-tw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Size
- 184.23 KB
- Format
- application/x-gzip
- MD5
- 3bcf993d01658cc0e72389612c77612e