မဝ်ဂျူ:headword/data

နူ ဝိက်ရှေန်နရဳ

Documentation for this module may be created at မဝ်ဂျူ:headword/data/doc

-- Table returned by this module. Each list stored on it is replaced by a lookup set
-- by the conversion loop further down in this file.
local data = {}

-- Part-of-speech names listed as "invariable".
-- NOTE(review): presumably read by the headword module to recognise terms that do not
-- inflect -- confirm against the caller.
data.invariable = {
	"cmavo", "cmene", "fu'ivla", "gismu",
	"Han tu", "hanzi", "hanja",
	"jyutping", "kanji", "lujvo",
	"phrasebook", "pinyin",
	"rafsi", "romaji",
}

-- Part-of-speech category names listed as lemmas. The names are a mix of localized
-- (Myanmar-script) names and English names that have not been translated.
-- Each name appears once; the list is turned into a lookup set by the conversion
-- loop further down in this file, so entry order carries no meaning.
-- NOTE(review): presumably used by the headword module to decide whether an entry is
-- categorized as a lemma -- confirm against the caller.
data.lemmas = {
	"အက္ခရ်ဂၠေံ",
	"ဝေါဟာဂၠေံ",
	"နာမဝိသေသန",
	"အာက်နဝ်မာဲနဝ်",
	"adpositions",
	"ကြိယာဝိသေသန",
	"affixes",
	"ambipositions",
	"လိက်ပရေၚ်",
	"circumfixes",
	"circumpositions",
	"classifiers",
	"cmavo",
	"cmavo clusters",
	"cmene",
	"combining forms",
	"သမ္ဗန္ဓ",
	"counters",
	"determiners",
	"diacritical marks",
	"equative adjectives",
	"fu'ivla",
	"gismu",
	"Han characters",
	"Han tu",
	"hanzi",
	"hanja",
	"ideophones",
	"idioms",
	"infixes",
	"interfixes",
	"initialisms",
	"interjections",
	"kanji",
	"letters",
	"ligatures",
	"lujvo",
	"morphemes",
	"non-constituents",
	"နာမ်",
	"ဂၞန်သင်္ချာ",
	"သင်္ကေတဂၞန်သင်္ချာ",
	"particles",
	"မအရေဝ်",
	"postpositions",
	"postpositional phrases",
	"predicatives",
	"prefixes",
	"ဝိဘတ်",
	"prepositional phrases",
	"preverbs",
	"pronominal adverbs",
	"သဗ္ဗနာမ်",
	"proverbs",
	"နာမ်မကိတ်ညဳ",
	"punctuation marks",
	"relatives",
	"တံရိုဟ်",
	"stems",
	"suffixes",
	"syllables",
	"symbols",
	"ကြိယာ",
}

-- Part-of-speech category names listed as non-lemma forms (for example participles,
-- gerunds and infinitives). The names are a mix of localized (Myanmar-script) names
-- and English names that have not been translated.
-- The list is turned into a lookup set by the conversion loop further down in this
-- file, so entry order carries no meaning.
-- NOTE(review): presumably used by the headword module to decide whether an entry is
-- categorized as a non-lemma form -- confirm against the caller.
data.nonlemmas = {
	"ဗီုပြင်လုပ်ကၠောန်စွံလဝ်နကဵုမစိုပ်တရဴ",
	"လုပ်ကၠောန်စွံလဝ်နကဵုမစိုပ်တရဴ",
	"adjectival participles",
	"ဗီုပြင်နာမဝိသေသန",
	"နာမဝိသေသနဗီုပြင်ဣတ္တိလိင်",
	"နာမဝိသေသနဗီုပြင်ကိုန်ဗဟုဝစ်",
	"ဗီုပြင်ကြိယာဝိသေသန",
	"adverbial participles",
	"agent participles",
	"article forms",
	"circumfix forms",
	"combined forms",
	"ဗီုပြင်ပတုပ်ရံင်နာမဝိသေသန",
	"နာမဝိသေသနပတုပ်ရံင်",
	"ဗီုပြင်ပတုပ်ရံင်ကြိယာဝိသေသန",
	"ကြိယာဝိသေသနပတုပ်ရံင်",
	"ပွမထညောံ",
	"converbs",
	"determiner comparative forms",
	"ဗီုပြင်ဖျေံလဝ်သန္နိဋ္ဌာန်",
	"determiner superlative forms",
	"diminutive nouns",
	"equative adjective forms",
	"equative adjectives",
	"future participles",
	"gerunds",
	"infinitive forms",
	"infinitives",
	"interjection forms",
	"jyutping",
	"kanji readings",
	"ခ္ဍံက်လိက်ဗၠေတ်",
	"negative participles",
	"nominal participles",
	"သဗ္ဗနာမ်ဝိဘတ်",
	"ဗီုပြင်ရုပ်နာမ်",
	"နာမ်ဗီုပြၚ်ၜါလ္ပာ်",
	"ဗီုပြင်နာမ်",
	"ဗီုပြင်နာမ်ပဝ်ကာယ်လ်",
	"နာမ်ဗီုပြင်ကိုန်ဗဟုဝစ်",
	"နာမ်ဗီုပြင်ပိုင်ပြဳ",
	"နာမ်ဗီုပြင်ကိုန်ဨကဝုစ်",
	"ဗီုပြင်ဂၞန်သင်္ချာ",
	"လုပ်ကၠောန်စွံလဝ်",
	"ဗီုပြင်လုပ်ကၠောန်စွံလဝ်",
	"ဗီုပြင်အမှိက်",
	"လုပ်ကၠောန်စွံလဝ်ဟွံတဝ်စၞေဟ်",
	"လုပ်ကၠောန်စွံလဝ်နကဵုအတိတ်ပြဟ်ပြေဟ်",
	"လုပ်ကၠောန်စွံလဝ်နကဵုအတိတ်",
	"ဗီုပြင်လုပ်ကၠောန်စွံလဝ်နကဵုအတိတ်",
	"လုပ်ကၠောန်စွံလဝ်ဗီုပြင်ဟွံတဝ်စၞေဟ်နူအတိတ်",
	"လုပ်ကၠောန်စွံလဝ်မက္ဍိုပ်ပေင်ပြဟ်ပြေဟ်",
	"လုပ်ကၠောန်စွံလဝ်ဗီုပြင်မက္ဍိုပ်ပေင်",
	"လုပ်ကၠောန်စွံလဝ်မက္ဍိုပ်ပေင်ဟွံတဝ်စၞေဟ်",
	"ဖေန်အိန်",
	"ကိုန်ဗဟုဝစ်",
	"ဗီုပြင်ပသ္ၚောဲထောံ",
	"ဗီုပြင်မုက်နာမ်",
	"ဝိဘတ်ပသ္ၚောဲထောံလဝ်",
	"ဗီုပြင်ဝိဘတ်",
	"ဝိဘတ်ဗီုပြင်သဗ္ဗနာမ်",
	"လုပ်ကၠောန်စွံလဝ်ပစ္စုပ္ပန်ပြဟ်ပြေဟ်",
	"လုပ်ကၠောန်စွံလဝ်ပစ္စုပ္ပန်",
	"လုပ်ကၠောန်စွံလဝ်ပစ္စုပ္ပန်ဗီုပြင်ဟွံတဝ်စၞေဟ်",
	"ဗီုပြင်သဗ္ဗနာမ်",
	"သဗ္ဗနာမ်ဗီုပြင်ပိုၚ်ပြဳ",
	"ဗီုပြင်နာမ်မကိတ်ညဳ",
	"နာမ်မကိတ်ညဳဗီုပြင်ကိုန်ဗဟုဝစ်",
	"rafsi",
	"ဗီုအက္ခရ်ရောမ",
	"ဗီုပြင်တံရိုဟ်",
	"singulatives",
	"ဗီုပြင်အဆက်လက္ကရဴ",
	"ဗီုပြင်သဒ္ဒာနာမဝိသေသန",
	"သဒ္ဒာနာမဝိသေသန",
	"ဗီုပြင်သဒ္ဒာကြိယာဝိသေသန",
	"သဒ္ဒာကြိယာဝိသေသန",
	"ဗီုပြင်ကြိယာ",
	"နာမ်ဝါစာ",
	"နာမ်အပြံင်အသၠာဲ",
}

-- These languages will not have "LANG multiword terms" categories added.
-- Entries are language codes, grouped by the reason the category is suppressed.
-- Each code appears once; the list is turned into a lookup set by the conversion
-- loop further down in this file, so entry order carries no meaning.
-- Commented-out codes are candidates that have not been confirmed ("not sure").
data.no_multiword_cat = {
	-------- Languages without spaces between words (sometimes spaces between phrases) --------
	"aho", -- Ahom
	"blt", -- Tai Dam
	"ja", -- Japanese
	"khb", -- Lü
	"km", -- Khmer
	"lo", -- Lao
	"mnw", -- Mon
	"my", -- Burmese
	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
	"nod", -- Northern Thai
	"ojp", -- Old Japanese
	"shn", -- Shan
	"sou", -- Southern Thai
	"tdd", -- Tai Nüa
	"th", -- Thai
	"tts", -- Isan
	"twh", -- Tai Dón
	"txg", -- Tangut
	"zh", -- Chinese (all varieties with Chinese characters)
	"zkt", -- Khitan

	-------- Languages with spaces between syllables --------
	"ahk", -- Akha
	"aou", -- A'ou
	"atb", -- Zaiwa
	"byk", -- Biao
	"cdy", -- Chadong
	--"duu", -- Drung; not sure
	--"hmx-pro", -- Proto-Hmong-Mien
	--"hnj", -- Green Hmong; not sure
	"huq", -- Tsat
	"ium", -- Iu Mien
	--"lis", -- Lisu; not sure
	"mtq", -- Muong
	--"mww", -- White Hmong; not sure
	"onb", -- Lingao
	--"sit-gkh", -- Gokhy; not sure
	--"swi", -- Sui; not sure
	"tbq-lol-pro", -- Proto-Loloish
	"tdh", -- Thulung
	"ukk", -- Muak Sa-aak
	"vi", -- Vietnamese
	"yig", -- Wusa Nasu
	"zng", -- Mang

	-------- Languages with ~ with surrounding spaces used to separate variants --------
	"mkh-ban-pro", -- Proto-Bahnaric
	"sit-pro", -- Proto-Sino-Tibetan

	-------- Other weirdnesses --------
	"mul", -- Translingual; gestures, Morse code, etc.
	"aot", -- Atong (India); bullet is a letter

	-------- All sign languages --------
	"ads",
	"aed",
	"aen",
	"afg",
	"ase",
	"asf",
	"asp",
	"asq",
	-- NOTE(review): "av" is the ISO 639-1 code for Avar, a spoken language; confirm that
	-- it belongs in the sign-language group (it may be a typo for another code).
	"av",
	"bfi",
	"bfk",
	"bog",
	"bqn",
	"bqy",
	"bvl",
	"bzs",
	"cds",
	"csc",
	"csd",
	"cse",
	"csf",
	"csg",
	"csl",
	"csn",
	"csq",
	"csr",
	"doq",
	"dse",
	"dsl",
	"ecs",
	"esl",
	"esn",
	"eso",
	"eth",
	"fcs",
	"fse",
	"fsl",
	"fss",
	"gds",
	"gse",
	"gsg",
	"gsm",
	"gss",
	"gus",
	"hab",
	"haf",
	"hds",
	"hks",
	"hos",
	"hps",
	"hsh",
	"hsl",
	"icl",
	"iks",
	"ils",
	"inl",
	"ins",
	"ise",
	"isg",
	"isr",
	"jcs",
	"jhs",
	"jls",
	"jos",
	"jsl",
	"jus",
	"kgi",
	"kvk",
	"lbs",
	"lls",
	"lsl",
	"lso",
	"lsp",
	"lst",
	"lsy",
	"lws",
	"mdl",
	"mfs",
	"mre",
	"msd",
	"msr",
	"mzc",
	"mzg",
	"mzy",
	"nbs",
	"ncs",
	"nsi",
	"nsl",
	"nsp",
	"nsr",
	"nzs",
	"okl",
	"pgz",
	"pks",
	"prl",
	"prz",
	"psc",
	"psd",
	"psg",
	"psl",
	"pso",
	"psp",
	"psr",
	"pys",
	"rms",
	"rsl",
	"rsm",
	"sdl",
	"sfb",
	"sfs",
	"sgg",
	"sgx",
	"slf",
	"sls",
	"sqk",
	"sqs",
	"ssp",
	"ssr",
	"svk",
	"swl",
	"syy",
	"tse",
	"tsm",
	"tsq",
	"tss",
	"tsy",
	"tza",
	"ugn",
	"ugy",
	"ukl",
	"uks",
	"vgt",
	"vsi",
	"vsl",
	"vsv",
	"xki",
	"xml",
	"xms",
	"ygs",
	"ysl",
	"zib",
	"zsl",
}

-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
-- Entries are language codes; the trailing comment on each line gives the reason.
-- The list is turned into a lookup set by the conversion loop further down in this file.
data.hyphen_not_multiword_sep = {
	"akk", -- Akkadian; hyphens between syllables
	"akl", -- Aklanon; hyphens for mid-word glottal stops
	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
	"ceb", -- Cebuano; hyphens for mid-word glottal stops
	"cnk", -- Khumi Chin; hyphens used in single words
	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
	"de", -- German; too many false positives
	"esx-esk-pro", -- hyphen used to separate morphemes
	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
	"ilo", -- Ilocano; hyphens for mid-word glottal stops
	"lcp", -- Western Lawa; dash as syllable joiner
	"lwl", -- Eastern Lawa; dash as syllable joiner
	"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
	"msb", -- Masbatenyo; too many false positives
	"tl", -- Tagalog; too many false positives
	"war", -- Waray-Waray; too many false positives
	"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}

-- Language codes for which "LANG masculine nouns" and similar categories are not added.
-- Both entries are languages without gender that use the gender field for other purposes.
data.no_gender_cat = { "ja", "th" }

-- Language codes listed under `notranslit`, in alphabetical order.
-- The list is turned into a lookup set by the conversion loop further down in this file.
-- NOTE(review): presumably the languages for which the headword module does not attempt
-- automatic transliteration -- confirm against the caller.
data.notranslit = {
	"ams", "az", "bbc", "bug", "cia", "cjm", "cmn", "hak",
	"ja", "kzg", "lad", "lzh", "ms", "mul", "mvi", "nan",
	"oj", "okn", "pi", "ro", "ryn", "rys", "ryu", "sh",
	"tgt", "th", "tkn", "tly", "txg", "und", "vi", "xug",
	"yoi", "yox", "yue", "za", "zh",
}

-- Script codes for which a script-tagged display title will be added.
-- Entries indented one extra level appear to be variants of the script code listed
-- just before them (e.g. the "*-Arab" codes under "Arab"); the indentation is purely
-- visual, and every entry is a sibling element of the same flat list.
-- The list is turned into a lookup set by the conversion loop further down in this file.
data.toBeTagged = {
	"Ahom",
	"Arab",
		"fa-Arab",
		"glk-Arab",
		"kk-Arab",
		"ks-Arab",
		"ku-Arab",
		"mzn-Arab",
		"ms-Arab",
		"ota-Arab",
		"pa-Arab",
		"ps-Arab",
		"sd-Arab",
		"tt-Arab",
		"ug-Arab",
		"ur-Arab",
	"Armi",
	"Armn",
	"Avst",
	"Bali",
	"Bamu",
	"Batk",
	"Beng",
		"as-Beng",
	"Bopo",
	"Brah",
	"Brai",
	"Bugi",
	"Buhd",
	"Cakm",
	"Cans",
	"Cari",
	"Cham",
	"Cher",
	"Copt",
	"Cprt",
	"Cyrl",
	"Cyrs",
	"Deva",
	"Dsrt",
	"Egyd",
	"Egyp",
	"Ethi",
	"Geok",
	"Geor",
	"Glag",
	"Goth",
	"Grek",
		"polytonic",
	"Gujr",
	"Guru",
	"Hang",
	"Hani",
	"Hano",
	"Hebr",
	"Hira",
	"Hluw",
	"Ital",
	"Java",
	"Kali",
	"Kana",
	"Khar",
	"Khmr",
	"Knda",
	"Kthi",
	"Lana",
	"Laoo",
	"Latn",
		"Latf",
		"Latg",
		"Latinx",
		"nv-Latn",
		"pjt-Latn",
	"Lepc",
	"Limb",
	"Linb",
	"Lisu",
	"Lyci",
	"Lydi",
	"Mand",
	"Mani",
	"Merc",
	"Mero",
	"Mlym",
	"Mong",
		"mnc-Mong",
		"sjo-Mong",
		"xwo-Mong",
	"Mtei",
	"Mymr",
	"Narb",
	"Nkoo",
	"Ogam",
	"Olck",
	"Orkh",
	"Orya",
	"Osma",
	"Palm",
	"Phag",
	"Phli",
	"Phlv",
	"Phnx",
	"Plrd",
	"Prti",
	"Rjng",
	"Runr",
	"Samr",
	"Sarb",
	"Saur",
	"Sgnw",
	"Shaw",
	"Shrd",
	"Sinh",
	"Sora",
	"Sund",
	"Sylo",
	"Syrc",
	"Tagb",
	"Tale",
	"Talu",
	"Taml",
	"Tang",
	"Tavt",
	"Telu",
	"Tfng",
	"Tglg",
	"Thaa",
	"Thai",
	"Tibt",
		"xzh-Tibt",
	"Ugar",
	"Vaii",
	"Xpeo",
	"Xsux",
	"Yiii",
	"Zmth",
	"Zsym",

	-- Codes that do not follow the four-letter script-code pattern used above.
	"IPAchar",
	"musical",
	"Ruminumerals",
}

-- Replace every list stored on `data` so far with a lookup set:
-- { "a", "b" } becomes { a = true, b = true }, so membership is a single index.
-- Only fields that already exist are reassigned while traversing, which pairs() permits.
-- Anything that is a map rather than a list must be defined after this loop.
for name, entries in pairs(data) do
	local lookup = {}
	for _, entry in ipairs(entries) do
		lookup[entry] = true
	end
	data[name] = lookup
end

-- Maps a part of speech to the part-of-speech name used in gender/number categories
-- such as "German masculine nouns" or "Russian imperfective verbs", which are generated
-- if the headword is of the appropriate gender/number.
-- This lives at the bottom of the module because it is a map, not a list, so it must
-- not go through the list-to-set conversion applied to the tables defined earlier.
-- NOTE(review): the "proper nouns" key is in English while the noun and verb keys are
-- in Myanmar script -- confirm that callers still look this entry up by its English name.
local gender_number_pos = {}
gender_number_pos["နာမ်"] = "နာမ်"
gender_number_pos["proper nouns"] = "နာမ်"
gender_number_pos["suffixes"] = "suffixes"
-- Verbs are included because impf and pf are valid "genders".
gender_number_pos["ကြိယာ"] = "ကြိယာ"
data.pos_for_gender_number_cat = gender_number_pos

return data