-- မဝ်ဂျူ:cau-cir-translit
--
-- Documentation for this module may be created at မဝ်ဂျူ:cau-cir-translit/doc
-- Table of public functions; returned at the bottom of the file.
local export = {}

-- Combining diacritics (given by code point), used by export.tr when it builds the
-- pattern that tidies doubled letters.
local uchar = mw.ustring.char
local GRAVE = uchar(0x300)
local ACUTE = uchar(0x301)
local CIRC = uchar(0x302)
local MACRON = uchar(0x304)
local BREVE = uchar(0x306)
local CARON = uchar(0x30C)
local DOTBELOW = uchar(0x323)

-- Single-character table: maps each Cyrillic letter (lowercase row, then uppercase row)
-- to its Latin transliteration. export.tr applies it one UTF-8 character at a time,
-- after the multigraph tables and the contextual "j"/"w" insertion have run, so it only
-- sees letters that no earlier rule consumed.
-- The hard sign maps to the empty string, the soft sign to "ʲ" and the palochka to "ʼ".
-- Every uppercase entry is the capitalised form of its lowercase counterpart; "Э" used to
-- map to "A" while "э" maps to "e", and now follows the lowercase entry.
local tt = {
	["а"] = "aa", ["б"] = "b", ["в"] = "v", ["г"] = "ɣ", ["д"] = "d", ["е"] = "ee", ["ё"] = "jaw", ["ж"] = "ž", ["з"] = "z", ["и"] = "ii", ["й"] = "j", ["к"] = "k", ["л"] = "l", ["м"] = "m", ["н"] = "n", ["о"] = "oo", ["п"] = "p", ["р"] = "r", ["с"] = "s", ["т"] = "t", ["у"] = "uu", ["ф"] = "f", ["х"] = "x", ["ц"] = "c", ["ч"] = "č", ["ш"] = "š", ["щ"] = "ś", ["ъ"] = "", ["ы"] = "ə", ["ь"] = "ʲ", ["э"] = "e", ["ю"] = "jəw", ["я"] = "jaa", ["ӏ"] = "ʼ",
	["А"] = "Aa", ["Б"] = "B", ["В"] = "V", ["Г"] = "Ɣ", ["Д"] = "D", ["Е"] = "Ee", ["Ё"] = "Jaw", ["Ж"] = "Ž", ["З"] = "Z", ["И"] = "Ii", ["Й"] = "J", ["К"] = "K", ["Л"] = "L", ["М"] = "M", ["Н"] = "N", ["О"] = "Oo", ["П"] = "P", ["Р"] = "R", ["С"] = "S", ["Т"] = "T", ["У"] = "Uu", ["Ф"] = "F", ["Х"] = "X", ["Ц"] = "C", ["Ч"] = "Č", ["Ш"] = "Š", ["Щ"] = "Ś", ["Ъ"] = "", ["Ы"] = "Ə", ["Ь"] = "ʲ", ["Э"] = "E", ["Ю"] = "Jəw", ["Я"] = "Jaa", ["Ӏ"] = "ʼ"
}

-- Two-letter sequences that transliterate as a unit, grouped by base consonant.
-- Each group lists the lowercase forms first and the capitalised forms second.
local digraphs = {
	-- г
	["го"] = "gʷo", ["гу"] = "gʷu", ["гъ"] = "ğ", ["гь"] = "gʲ",
	["Го"] = "Gʷo", ["Гу"] = "Gʷu", ["Гъ"] = "Ğ", ["Гь"] = "Gʲ",
	-- ж
	["жъ"] = "ẑ", ["жь"] = "ź",
	["Жъ"] = "Ẑ", ["Жь"] = "Ź",
	-- к
	["ко"] = "kʷo", ["ку"] = "kʷu", ["къ"] = "q", ["кӏ"] = "kʼ",
	["Ко"] = "Kʷo", ["Ку"] = "Kʷu", ["Къ"] = "Q", ["Кӏ"] = "Kʼ",
	-- л
	["лъ"] = "lˢ", ["ль"] = "lᶻ", ["лӏ"] = "lˢʼ",
	["Лъ"] = "Lˢ", ["Ль"] = "Lᶻ", ["Лӏ"] = "Lˢʼ",
	-- п, с, т, ф
	["пӏ"] = "pʼ", ["сӏ"] = "sʼ", ["тӏ"] = "tʼ", ["фӏ"] = "fʼ",
	["Пӏ"] = "Pʼ", ["Сӏ"] = "Sʼ", ["Тӏ"] = "Tʼ", ["Фӏ"] = "Fʼ",
	-- х
	["хо"] = "xʷo", ["ху"] = "xʷu", ["хъ"] = "χ", ["хь"] = "ḥ",
	["Хо"] = "Xʷo", ["Ху"] = "Xʷu", ["Хъ"] = "Χ", ["Хь"] = "Ḥ",
	-- ц
	["цо"] = "cʷo", ["цу"] = "cʷu", ["цӏ"] = "cʼ",
	["Цо"] = "Cʷo", ["Цу"] = "Cʷu", ["Цӏ"] = "Cʼ",
	-- ч
	["чъ"] = "ĉ", ["чӏ"] = "ĉʼ",
	["Чъ"] = "Ĉ", ["Чӏ"] = "Ĉʼ",
	-- ш, щ
	["шъ"] = "ŝ", ["шӏ"] = "šʼ", ["щӏ"] = "śʼ",
	["Шъ"] = "Ŝ", ["Шӏ"] = "Šʼ", ["Щӏ"] = "Śʼ",
}

-- Second-pass digraphs, applied by export.tr only after the contextual "j"/"w" insertion.
-- Keeping them apart from the first-pass digraphs avoids overlapping substitutions
-- (e.g. "лӏо") and lets labialized consonants be replaced before it is decided where
-- "j" goes relative to "е" and "и".
-- The keys mix scripts: the glide ("j"/"w") is Latin, the vowel after it is Cyrillic.
local digraphs2 = {
	-- Inserted Latin glide followed by the Cyrillic vowel.
	["jе"] = "je", ["Jе"] = "Je",
	["jи"] = "ji", ["Jи"] = "Ji",
	["wо"] = "wo", ["Wо"] = "Wo",
	["wу"] = "wu", ["Wу"] = "Wu",
	-- Palochka followed by a rounded vowel.
	["ӏо"] = "ʼʷo", ["Ӏо"] = "ʼʷo",
	["ӏу"] = "ʼʷu", ["Ӏу"] = "ʼʷu",
}

-- Three-letter sequences that transliterate as a unit (lowercase row, then capitalised
-- row). export.tr applies them after the tetragraphs and before the digraphs, so the
-- longer match wins.
-- The capitalised row previously repeated the lowercase key "тлӏ", which left "Тлӏ"
-- without an entry; it now has its own capitalised mapping. The palochka in the keys is
-- lowercase because export.tr lowercases every palochka before these tables are used.
local trigraphs = {
	["гъо"] = "ğʷo", ["гъу"] = "ğʷu", ["дзо"] = "dzʷo", ["дзу"] = "dzʷu", ["жъо"] = "žʷo", ["жъу"] = "žʷu", ["кхъ"] = "qχ", ["къо"] = "qʷo", ["къу"] = "qʷu", ["къӏ"] = "qʼ", ["кӏо"] = "kʷʼo", ["кӏу"] = "kʷʼu", ["пӏо"] = "pʷʼo", ["пӏу"] = "pʷʼu", ["тлӏ"] = "tˡʼ", ["тӏо"] = "tʷʼo", ["тӏу"] = "tʷʼu", ["хъо"] = "χʷo", ["хъу"] = "χʷu", ["чъо"] = "ćʷo", ["чъу"] = "ćʷu", ["шъо"] = "šʷo", ["шъу"] = "šʷu", ["шӏо"] = "šʷʼo", ["шӏу"] = "šʷʼu",
	["Гъо"] = "Ğʷo", ["Гъу"] = "Ğʷu", ["Дзо"] = "Dzʷo", ["Дзу"] = "Dzʷu", ["Жъо"] = "Žʷo", ["Жъу"] = "Žʷu", ["Кхъ"] = "Qχ", ["Къо"] = "Qʷo", ["Къу"] = "Qʷu", ["Къӏ"] = "Qʼ", ["Кӏо"] = "Kʷʼo", ["Кӏу"] = "Kʷʼu", ["Пӏо"] = "Pʷʼo", ["Пӏу"] = "Pʷʼu", ["Тлӏ"] = "Tˡʼ", ["Тӏо"] = "Tʷʼo", ["Тӏу"] = "Tʷʼu", ["Хъо"] = "Χʷo", ["Хъу"] = "Χʷu", ["Чъо"] = "Ćʷo", ["Чъу"] = "Ćʷu", ["Шъо"] = "Šʷo", ["Шъу"] = "Šʷu", ["Шӏо"] = "Šʷʼo", ["Шӏу"] = "Šʷʼu"
}

-- Four-letter sequences that transliterate as a unit. export.tr applies these first,
-- before any shorter multigraph table.
local tetragraphs = {
	["кхъо"] = "qχʷo",
	["Кхъо"] = "Qχʷo",
	["кхъу"] = "qχʷu",
	["Кхъу"] = "Qχʷu",
}

-- Runs the byte-oriented string.gsub over `str` once for each key/value pair of `map`.
-- pairs() visits the entries in no particular order.
local function replace_from(str, map)
	for cyrillic, latin in pairs(map) do
		str = string.gsub(str, cyrillic, latin)
	end
	return str
end

-- Inserts a glide in front of the Cyrillic vowels in `lower_set` / `upper_set` (both are
-- bracketed character classes): at the very start of the text, after a vowel, and after
-- whitespace or punctuation. An uppercase vowel receives the capital glide and is itself
-- lowercased. These substitutions break with string.gsub, so mw.ustring.gsub is required.
local function insert_glide(str, lower_set, upper_set, glide, glide_cap)
	local ugsub, ulower = mw.ustring.gsub, mw.ustring.lower
	local vowel_or_break = "[ouаеёиоуыэюяOUАЕЁИОУЫЭЮЯ%s%p]"

	str = ugsub(str, "^(" .. lower_set .. ")", glide .. "%1")
	str = ugsub(str, "^(" .. upper_set .. ")", function(vowel)
		return glide_cap .. ulower(vowel)
	end)
	str = ugsub(str, "(" .. vowel_or_break .. ")(" .. lower_set .. ")", "%1" .. glide .. "%2")
	str = ugsub(str, "([%s%p])(" .. upper_set .. ")", function(before, vowel)
		return before .. glide_cap .. ulower(vowel)
	end)
	return str
end

-- Transliterates Cyrillic text into Latin script.
--   text: the string to transliterate.
--   lang, sc: accepted but not read by this function.
-- Returns the transliteration, NFC-normalised.
function export.tr(text, lang, sc)
	local ugsub = mw.ustring.gsub
	-- Matches one UTF-8 encoded character when used with the byte-oriented string library.
	local utf8_char = "[%z\1-\127\194-\244][\128-\191]*"

	-- Fold the uppercase palochka and any stand-ins for it (Latin "I" or "l", Cyrillic "І")
	-- into the lowercase palochka, which is the form the multigraph tables use.
	text = ugsub(text, "[IlІӀ]", "ӏ")

	-- Longest multigraphs first, so a shorter entry never splits a longer one.
	text = replace_from(text, tetragraphs)
	text = replace_from(text, trigraphs)
	text = replace_from(text, digraphs)

	-- Contextual "j" before "е"/"и", then "w" before "о"/"у".
	text = mw.ustring.toNFC(text)
	text = insert_glide(text, "[еи]", "[ЕИ]", "j", "J")
	text = insert_glide(text, "[оу]", "[ОУ]", "w", "W")

	-- Second-pass digraphs (glide + vowel, palochka + rounded vowel), then every
	-- remaining character through the single-letter table.
	text = replace_from(text, digraphs2)
	text = string.gsub(text, utf8_char, tt)

	-- Drop the epenthetic vowel of a labialized consonant when a non-iotated,
	-- non-labialized vowel follows.
	text = ugsub(text, "u([aeiouə])", "%1")

	-- Move the apostrophe behind a palatalisation or labialisation mark, then decompose
	-- so that base letters and combining diacritics can be matched separately.
	text = ugsub(text, "ʼʲ", "ʲʼ")
	text = ugsub(text, "ʼʷ", "ʷʼ")
	text = mw.ustring.toNFD(text)

	-- When both halves of a doubled letter carry the same modifier letter and/or
	-- apostrophe, keep it on the second half only, for readability.
	local diacritic = "[" .. GRAVE .. ACUTE .. CIRC .. MACRON .. BREVE .. CARON .. DOTBELOW .. "]?"
	for letter in string.gmatch("abcdefghijklmnopqrstuvxzəɣχABCDEFGHIJKLMNOPQRSTUVXZƏƔΧ", utf8_char) do
		local second = mw.ustring.lower(letter)
		local doubled = letter .. "(" .. diacritic .. ")([ʲˡˢᶻ]?ʲ?ʼ?)" .. second .. "%1%2"
		local collapsed = letter .. "%1" .. second .. "%1%2"
		text = ugsub(text, doubled, collapsed)
	end

	-- Collapse j/ʲ and w/ʷ that ended up adjacent, then recompose.
	text = ugsub(text, "ʲ?([Jj])ʲ?", "%1")
	text = ugsub(text, "ʷ?([Ww])ʷ?", "%1")
	return mw.ustring.toNFC(text)
end

-- Hand the module table (which carries the `tr` function) back to whoever loads this module.
return export