dc.contributor.author | Mareček, David |
dc.contributor.author | Yu, Zhiwei |
dc.contributor.author | Zeman, Daniel |
dc.contributor.author | Žabokrtský, Zdeněk |
dc.date.accessioned | 2016-06-27T12:27:25Z |
dc.date.available | 2016-06-27T12:27:25Z |
dc.date.issued | 2016-06-20 |
dc.identifier.uri | http://hdl.handle.net/11234/1-1743 |
dc.description | Texts in 107 languages from the W2C corpus (http://hdl.handle.net/11858/00-097C-0000-0022-6133-9), first 1,000,000 tokens per language, tagged by the delexicalized tagger described in Yu et al. (2016, LREC, Portorož, Slovenia). Changes in version 1.1: 1. Universal Dependencies tagset instead of the older and smaller Google Universal POS tagset. 2. SVM classifier trained on Universal Dependencies 1.2 instead of HamleDT 2.0. 3. Balto-Slavic languages, Germanic languages and Romance languages were tagged by a classifier trained only on the respective group of languages. Other languages were tagged by a classifier trained on all available languages. The "c7" combination from version 1.0 is no longer used. |
dc.language.iso | bel |
dc.language.iso | bos |
dc.language.iso | bul |
dc.language.iso | ces |
dc.language.iso | hbs |
dc.language.iso | hrv |
dc.language.iso | hsb |
dc.language.iso | mkd |
dc.language.iso | pol |
dc.language.iso | rus |
dc.language.iso | slk |
dc.language.iso | slv |
dc.language.iso | srp |
dc.language.iso | ukr |
dc.language.iso | lav |
dc.language.iso | lit |
dc.language.iso | afr |
dc.language.iso | dan |
dc.language.iso | deu |
dc.language.iso | eng |
dc.language.iso | fao |
dc.language.iso | fry |
dc.language.iso | gsw |
dc.language.iso | isl |
dc.language.iso | lim |
dc.language.iso | ltz |
dc.language.iso | nds |
dc.language.iso | nld |
dc.language.iso | nno |
dc.language.iso | nor |
dc.language.iso | sco |
dc.language.iso | swe |
dc.language.iso | yid |
dc.language.iso | arg |
dc.language.iso | ast |
dc.language.iso | cat |
dc.language.iso | fra |
dc.language.iso | glg |
dc.language.iso | hat |
dc.language.iso | ita |
dc.language.iso | lat |
dc.language.iso | lmo |
dc.language.iso | nap |
dc.language.iso | pms |
dc.language.iso | por |
dc.language.iso | ron |
dc.language.iso | spa |
dc.language.iso | vec |
dc.language.iso | wln |
dc.language.iso | bre |
dc.language.iso | cym |
dc.language.iso | gla |
dc.language.iso | gle |
dc.language.iso | ell |
dc.language.iso | hye |
dc.language.iso | sqi |
dc.language.iso | diq |
dc.language.iso | fas |
dc.language.iso | glk |
dc.language.iso | kur |
dc.language.iso | tgk |
dc.language.iso | ben |
dc.language.iso | bpy |
dc.language.iso | guj |
dc.language.iso | hif |
dc.language.iso | hin |
dc.language.iso | mar |
dc.language.iso | nep |
dc.language.iso | urd |
dc.language.iso | amh |
dc.language.iso | ara |
dc.language.iso | arz |
dc.language.iso | heb |
dc.language.iso | est |
dc.language.iso | fin |
dc.language.iso | hun |
dc.language.iso | eus |
dc.language.iso | kat |
dc.language.iso | chv |
dc.language.iso | aze |
dc.language.iso | tur |
dc.language.iso | uzb |
dc.language.iso | kaz |
dc.language.iso | tat |
dc.language.iso | sah |
dc.language.iso | kor |
dc.language.iso | mon |
dc.language.iso | tel |
dc.language.iso | kan |
dc.language.iso | mal |
dc.language.iso | tam |
dc.language.iso | new |
dc.language.iso | vie |
dc.language.iso | ind |
dc.language.iso | jav |
dc.language.iso | mlg |
dc.language.iso | mri |
dc.language.iso | msa |
dc.language.iso | pam |
dc.language.iso | sun |
dc.language.iso | tgl |
dc.language.iso | war |
dc.language.iso | swa |
dc.language.iso | epo |
dc.language.iso | ido |
dc.language.iso | ina |
dc.language.iso | vol |
dc.publisher | Charles University, Faculty of Mathematics and Physics, Institute of Formal and Applied Linguistics (UFAL) |
dc.relation.replaces | http://hdl.handle.net/11234/1-1662 |
dc.rights | Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) |
dc.rights.uri | http://creativecommons.org/licenses/by-sa/4.0/ |
dc.source.uri | http://ufal.mff.cuni.cz/deltacorpus |
dc.subject | part of speech |
dc.subject | tagging |
dc.subject | semi-supervised |
dc.subject | cross-language |
dc.title | Deltacorpus 1.1 |
dc.type | corpus |
metashare.ResourceInfo#ContentInfo.mediaType | text |
dc.rights.label | PUB |
has.files | yes |
branding | LINDAT / CLARIAH-CZ |
contact.person | Daniel Zeman zeman@ufal.mff.cuni.cz Charles University in Prague, ÚFAL |
sponsor | Grantová agentura České republiky GA15-10472S Morphologically and Syntactically Annotated Corpora of Many Languages nationalFunds |
size.info | 94307862 tokens |
files.size | 460267520 |
files.count | 1 |
Files in this item
This item is
Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
Publicly Available
and licensed under: Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
- Name
- deltacorpus-1.1.tar
- Size
- 438.95 MB
- Format
- application/x-tar
- Description
- Deltacorpus 1.1
- MD5
- 6420ab90b7edca2dfc1a7269c1c3cbf7
- deltacorpus-1.1
- LANGUAGES.txt (5 kB)
- README.txt (953 B)
- data
- tgk.txt.gz (4 MB)
- mal.txt.gz (5 MB)
- pam.txt.gz (4 MB)
- bos.txt.gz (4 MB)
- jav.txt.gz (4 MB)
- bel.txt.gz (4 MB)
- hrv.txt.gz (4 MB)
- ben.txt.gz (5 MB)
- slv.txt.gz (4 MB)
- aze.txt.gz (4 MB)
- spa.txt.gz (4 MB)
- fra.txt.gz (4 MB)
- ron.txt.gz (4 MB)
- hin.txt.gz (4 MB)
- hat.txt.gz (3 MB)
- war.txt.gz (2 MB)
- dan.txt.gz (4 MB)
- hbs.txt.gz (4 MB)
- pol.txt.gz (4 MB)
- kur.txt.gz (4 MB)
- hsb.txt.gz (201 kB)
- epo.txt.gz (4 MB)
- lat.txt.gz (4 MB)
- lav.txt.gz (4 MB)
- arz.txt.gz (4 MB)
- tam.txt.gz (5 MB)
- nds.txt.gz (3 MB)
- vie.txt.gz (3 MB)
- rus.txt.gz (4 MB)
- sqi.txt.gz (4 MB)
- ind.txt.gz (4 MB)
- swe.txt.gz (4 MB)
- nep.txt.gz (5 MB)
- vol.txt.gz (744 kB)
- arg.txt.gz (4 MB)
- bpy.txt.gz (5 MB)
- guj.txt.gz (4 MB)
- deu.txt.gz (4 MB)
- hif.txt.gz (4 MB)
- hye.txt.gz (4 MB)
- msa.txt.gz (4 MB)
- uzb.txt.gz (4 MB)
- wln.txt.gz (632 kB)
- fry.txt.gz (4 MB)
- yid.txt.gz (4 MB)
- sah.txt.gz (5 MB)
- kor.txt.gz (5 MB)
- diq.txt.gz (1 MB)
- isl.txt.gz (4 MB)
- swa.txt.gz (4 MB)
- eus.txt.gz (4 MB)
- cym.txt.gz (3 MB)
- vec.txt.gz (4 MB)
- cat.txt.gz (4 MB)
- amh.txt.gz (39 kB)
- urd.txt.gz (4 MB)
- nap.txt.gz (1 MB)
- tat.txt.gz (5 MB)
- kaz.txt.gz (5 MB)
- lmo.txt.gz (3 MB)
- gsw.txt.gz (4 MB)
- glk.txt.gz (2 MB)
- ara.txt.gz (4 MB)
- mon.txt.gz (4 MB)
- new.txt.gz (304 kB)
- eng.txt.gz (4 MB)
- sun.txt.gz (1 MB)
- pms.txt.gz (1 MB)
- sco.txt.gz (4 MB)
- tgl.txt.gz (4 MB)
- heb.txt.gz (4 MB)
- bul.txt.gz (4 MB)
- tel.txt.gz (5 MB)
- ita.txt.gz (4 MB)
- mri.txt.gz (4 MB)
- fas.txt.gz (4 MB)
- kat.txt.gz (5 MB)
- gle.txt.gz (4 MB)
- glg.txt.gz (4 MB)
- chv.txt.gz (70 kB)
- ukr.txt.gz (4 MB)
- hun.txt.gz (4 MB)
- lim.txt.gz (4 MB)
- fao.txt.gz (4 MB)
- ido.txt.gz (1 MB)
- ast.txt.gz (4 MB)
- afr.txt.gz (4 MB)
- gla.txt.gz (3 MB)
- ina.txt.gz (3 MB)
- mar.txt.gz (5 MB)
- mlg.txt.gz (3 MB)
- slk.txt.gz (4 MB)
- tur.txt.gz (4 MB)
- ltz.txt.gz (4 MB)
- kan.txt.gz (5 MB)
- ell.txt.gz (4 MB)
- ces.txt.gz (4 MB)
- bre.txt.gz (3 MB)
- nor.txt.gz (4 MB)
- fin.txt.gz (4 MB)
- por.txt.gz (4 MB)
- lit.txt.gz (4 MB)
- srp.txt.gz (4 MB)
- est.txt.gz (4 MB)
- nno.txt.gz (4 MB)
- mkd.txt.gz (4 MB)
- nld.txt.gz (4 MB)
- POS_TAGSET.txt (584 B)