dc.contributor.author | Gurevych, Iryna |
dc.contributor.author | Habernal, Ivan |
dc.contributor.author | Zayed, Omnia |
dc.date.accessioned | 2017-06-07T13:10:23Z |
dc.date.available | 2017-06-07T13:10:23Z |
dc.date.issued | 2016-04-14 |
dc.identifier.uri | http://hdl.handle.net/11372/LRT-2209 |
dc.description | A large web corpus (over 10 billion tokens) in 50+ languages, licensed under the Creative Commons license family and extracted from CommonCrawl, the largest publicly available general Web crawl to date, with about 2 billion crawled URLs. |
dc.language.iso | afr |
dc.language.iso | ara |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | ell |
dc.language.iso | eng |
dc.language.iso | est |
dc.language.iso | fas |
dc.language.iso | fin |
dc.language.iso | fra |
dc.language.iso | hrv |
dc.language.iso | hun |
dc.language.iso | ind |
dc.language.iso | ita |
dc.language.iso | jpn |
dc.language.iso | kor |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | nld |
dc.language.iso | nor |
dc.language.iso | pol |
dc.language.iso | por |
dc.language.iso | rus |
dc.language.iso | slv |
dc.language.iso | som |
dc.language.iso | spa |
dc.language.iso | swa |
dc.language.iso | swe |
dc.language.iso | tgl |
dc.language.iso | tha |
dc.language.iso | tur |
dc.language.iso | ukr |
dc.language.iso | und |
dc.language.iso | vie |
dc.publisher | Technische Universität Darmstadt |
dc.relation.isreferencedby | http://www.lrec-conf.org/proceedings/lrec2016/pdf/388_Paper.pdf |
dc.rights | Public Domain Mark (PD) |
dc.rights.uri | http://creativecommons.org/publicdomain/mark/1.0/ |
dc.source.uri | https://dkpro.github.io/dkpro-c4corpus/ |
dc.subject | CommonCrawl |
dc.subject | Creative Commons |
dc.subject | Web corpus |
dc.subject | Amazon Web Services |
dc.title | C4Corpus (publicdomain part) |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LRT + Open Submissions |
contact.person | Ivan Habernal habernal@ukp.informatik.tu-darmstadt.de Technische Universität Darmstadt |
sponsor | German Research Foundation (DFG) DIP DA 1600/1-1 Information Consolidation: A New Paradigm in Knowledge Search nationalFunds |
sponsor | Amazon Amazon Web Services in Education Grant Web Services in Education Grant Other |
size.info | 10000000000 tokens |
files.size | 89036585 |
files.count | 36 |
Soubory tohoto záznamu
Stáhnout všechny soubory záznamu (84.91 MB)
- Název
- Lic_publicdomain_Lang_af_NoBoilerplate_true_MinHtml_true-r-00009.seg-00000.warc.gz
- Velikost
- 851 bajtů
- Formát
- application/x-gzip
- MD5
- 5fa2ab45c8d2d3a3b45809ffaa9c09eb
- Název
- Lic_publicdomain_Lang_ar_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 62.62 KB
- Formát
- application/x-gzip
- MD5
- cafa5a853b4fffed9fb7c81ffaae80bf
- Název
- Lic_publicdomain_Lang_bg_NoBoilerplate_true_MinHtml_true-r-00010.seg-00000.warc.gz
- Velikost
- 37.94 KB
- Formát
- application/x-gzip
- MD5
- d291cd5deeff806db6dac27e7f31a816
- Název
- Lic_publicdomain_Lang_cs_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 2.21 KB
- Formát
- application/x-gzip
- MD5
- fed503373e97e581ed2dacb999185c26
- Název
- Lic_publicdomain_Lang_da_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 2.56 KB
- Formát
- application/x-gzip
- MD5
- 7a8f8b03417a91e543f821516056fc37
- Název
- Lic_publicdomain_Lang_de_NoBoilerplate_true_MinHtml_true-r-00008.seg-00000.warc.gz
- Velikost
- 208.56 KB
- Formát
- application/x-gzip
- MD5
- 84a5f4d980c3a7784fe5c44cb095977c
- Název
- Lic_publicdomain_Lang_el_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 10.11 KB
- Formát
- application/x-gzip
- MD5
- 46a1035ba227e9817309954432c921a6
- Název
- Lic_publicdomain_Lang_en_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 73.52 MB
- Formát
- application/x-gzip
- MD5
- a4ca095d945ec3338ad157ac794192b2
- Název
- Lic_publicdomain_Lang_es_NoBoilerplate_true_MinHtml_true-r-00022.seg-00000.warc.gz
- Velikost
- 4.91 MB
- Formát
- application/x-gzip
- MD5
- 900a30f1f159595c78c006bc9d75912d
- Název
- Lic_publicdomain_Lang_et_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 759 bajtů
- Formát
- application/x-gzip
- MD5
- 0d5a6aaefdae7cb922b87ba74efff946
- Název
- Lic_publicdomain_Lang_fa_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 29.18 KB
- Formát
- application/x-gzip
- MD5
- 3df1b8cae322307c85890d56e3b4feff
- Název
- Lic_publicdomain_Lang_fi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 190 KB
- Formát
- application/x-gzip
- MD5
- c5a4e5f17ea95cb668a80677868b8e72
- Název
- Lic_publicdomain_Lang_fr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 2.84 MB
- Formát
- application/x-gzip
- MD5
- d81c7ef00f15dab384215c3e0a06ebd0
- Název
- Lic_publicdomain_Lang_hr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 10.76 KB
- Formát
- application/x-gzip
- MD5
- 68509cea87e36e9241aead807363a163
- Název
- Lic_publicdomain_Lang_hu_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 398.67 KB
- Formát
- application/x-gzip
- MD5
- 1a7305b25d4013a6245febe67b36b8ac
- Název
- Lic_publicdomain_Lang_id_NoBoilerplate_true_MinHtml_true-r-00007.seg-00000.warc.gz
- Velikost
- 239.58 KB
- Formát
- application/x-gzip
- MD5
- 1487be32b9a1dab8d3d903ade1cf37b2
- Název
- Lic_publicdomain_Lang_it_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 831.26 KB
- Formát
- application/x-gzip
- MD5
- e23e233f2f82ef2c25874a11f7ecaa4f
- Název
- Lic_publicdomain_Lang_ja_NoBoilerplate_true_MinHtml_true-r-00004.seg-00000.warc.gz
- Velikost
- 11.47 KB
- Formát
- application/x-gzip
- MD5
- e3ad404aa65691c3bac4798fab27b50f
- Název
- Lic_publicdomain_Lang_ko_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 11.32 KB
- Formát
- application/x-gzip
- MD5
- 565d03dae2a0c4f1825a4483cf15c8a1
- Název
- Lic_publicdomain_Lang_lt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 1.58 KB
- Formát
- application/x-gzip
- MD5
- 74730fe8a76b422e5bb2c6b28834fe2f
- Název
- Lic_publicdomain_Lang_lv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 1.63 KB
- Formát
- application/x-gzip
- MD5
- 117ffcbc34f519f66a2d94a2f14031a5
- Název
- Lic_publicdomain_Lang_nl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 13.96 KB
- Formát
- application/x-gzip
- MD5
- 27cc4b3391289c54a3daddbdb72915b1
- Název
- Lic_publicdomain_Lang_no_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 84.05 KB
- Formát
- application/x-gzip
- MD5
- 6a292adfa78baee6d768739691128996
- Název
- Lic_publicdomain_Lang_pl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 102.69 KB
- Formát
- application/x-gzip
- MD5
- 0b093028be51e1628de18afcb45f1d4b
- Název
- Lic_publicdomain_Lang_pt_NoBoilerplate_true_MinHtml_true-r-00023.seg-00000.warc.gz
- Velikost
- 196.11 KB
- Formát
- application/x-gzip
- MD5
- 1e2bf067709585f6e08de56ec5c1f66a
- Název
- Lic_publicdomain_Lang_ru_NoBoilerplate_true_MinHtml_true-r-00024.seg-00000.warc.gz
- Velikost
- 85 KB
- Formát
- application/x-gzip
- MD5
- 366f92911df0414a36f29c99b6eaa28b
- Název
- Lic_publicdomain_Lang_sl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 1.56 KB
- Formát
- application/x-gzip
- MD5
- 2a20bf71fe6fd1b380f09d577cb07643
- Název
- Lic_publicdomain_Lang_so_NoBoilerplate_true_MinHtml_true-r-00018.seg-00000.warc.gz
- Velikost
- 2.3 KB
- Formát
- application/x-gzip
- MD5
- 65db2258fb6cefc459f1553e340b4a15
- Název
- Lic_publicdomain_Lang_sv_NoBoilerplate_true_MinHtml_true-r-00025.seg-00000.warc.gz
- Velikost
- 20.91 KB
- Formát
- application/x-gzip
- MD5
- 02b457c3b0c4e1fba9b7bd513083d205
- Název
- Lic_publicdomain_Lang_sw_NoBoilerplate_true_MinHtml_true-r-00026.seg-00000.warc.gz
- Velikost
- 5.85 KB
- Formát
- application/x-gzip
- MD5
- 8f2d3712cc1657b1b98f2da757c8ec8b
- Název
- Lic_publicdomain_Lang_th_NoBoilerplate_true_MinHtml_true-r-00011.seg-00000.warc.gz
- Velikost
- 102.62 KB
- Formát
- application/x-gzip
- MD5
- 00ce2bc416d8084fdff1e97c6ad24dab
- Název
- Lic_publicdomain_Lang_tl_NoBoilerplate_true_MinHtml_true-r-00015.seg-00000.warc.gz
- Velikost
- 45.09 KB
- Formát
- application/x-gzip
- MD5
- 277267fc1ebec66a88e2b94c7a3a36a5
- Název
- Lic_publicdomain_Lang_tr_NoBoilerplate_true_MinHtml_true-r-00021.seg-00000.warc.gz
- Velikost
- 29.59 KB
- Formát
- application/x-gzip
- MD5
- c9cbaf7f1f9c5b2782ceda2fdaae954b
- Název
- Lic_publicdomain_Lang_uk_NoBoilerplate_true_MinHtml_true-r-00014.seg-00000.warc.gz
- Velikost
- 1.95 KB
- Formát
- application/x-gzip
- MD5
- 4a75e3f4c722ca9b0c67639455392a30
- Název
- Lic_publicdomain_Lang_unknown_NoBoilerplate_true_MinHtml_true-r-00017.seg-00000.warc.gz
- Velikost
- 987.9 KB
- Formát
- application/x-gzip
- MD5
- f59469c5775f3eeca670d37936d32d8e
- Název
- Lic_publicdomain_Lang_vi_NoBoilerplate_true_MinHtml_true-r-00012.seg-00000.warc.gz
- Velikost
- 3.63 KB
- Formát
- application/x-gzip
- MD5
- 8a771141caecd61602369182342c9fc7