-- မဝ်ဂျူ:za-pron

local export = {}

local m_str_utils = require("Module:string utilities")

local find = m_str_utils.find
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local lower = m_str_utils.lower
local match = m_str_utils.match
local reverse = m_str_utils.reverse
local upper = m_str_utils.upper

local lang = require("Module:languages").getByCode("za")

-- FIXME: needs rewrite [3 February 2020 (UTC)]
-- FIXME: 老壯文 seems to omit tone marks from new Mandarin borrowings ([[w:zh:新壮文#注解]])

-- https://en.wikipedia.org/wiki/Standard_Zhuang
-- https://baike.baidu.com/item/壮语/7703463
-- 在线学壮文 https://web.archive.org/web/0/http://www.gxmyw.com.cn/plus/list.php?tid=21
-- 基础壮文学习系列：壮文标点符号与书写规则 https://web.archive.org/web/0/http://www.gxmyw.com.cn/wsxzw/2013/1017/57.html

-- IPA values for syllable initials, keyed by their spelling.
-- The empty-string key holds the value used when a syllable is written
-- without any initial consonant.
-- Spellings have the shape [bmfvdnslghrcy]?[gbd]?[vy]?
local initialConv = {
	-- labial series
	b   = 'p',
	mb  = 'ɓ',
	m   = 'm',
	f   = 'f',
	v   = 'β',
	by  = 'pʲ',
	my  = 'mʲ',

	-- coronal series
	d   = 't',
	nd  = 'ɗ',
	n   = 'n',
	l   = 'l',
	s   = 'θ',

	-- palatal series
	ny  = 'ɲ',
	c   = 'ɕ',
	y   = 'j',

	-- velar series
	g   = 'k',
	ng  = 'ŋ',
	r   = 'ɣ',
	gy  = 'kʲ',
	ngv = 'ŋʷ',
	gv  = 'kʷ',

	-- glottal series (including the unwritten initial)
	[''] = 'ʔ',
	h   = 'h',
}

-- IPA values for vowel spellings (shape: [aeiouw][ieu]?[uw]?).
-- Every entry carries two fields:
--   alone  - value for a syllable without a final consonant
--   wfinal - value for a syllable that has a final consonant
-- `false` means the table provides no value for that position:
--   * no `wfinal` value: ai, ei, oi, ui, wi, au, aeu, eu, iu, ou, aw
--   * no `alone` value:  ie, oe, ue, we
local vowelConv = {
	-- plain vowels
	a   = { alone = 'a', wfinal = 'aː' },
	e   = { alone = 'e', wfinal = 'eː' },
	i   = { alone = 'i', wfinal = 'i' },
	o   = { alone = 'o', wfinal = 'oː' },
	u   = { alone = 'u', wfinal = 'u' },
	w   = { alone = 'ɯ', wfinal = 'ɯ' },

	-- spellings ending in "i"
	ai  = { alone = 'aːi', wfinal = false },
	ei  = { alone = 'ei', wfinal = false },
	oi  = { alone = 'oːi', wfinal = false },
	ui  = { alone = 'uːi', wfinal = false },
	wi  = { alone = 'ɯːi', wfinal = false },

	-- spellings ending in "e"
	ae  = { alone = 'ai', wfinal = 'a' },
	ie  = { alone = false, wfinal = 'iː' },
	oe  = { alone = false, wfinal = 'o' },
	ue  = { alone = false, wfinal = 'uː' },
	we  = { alone = false, wfinal = 'ɯː' },

	-- spellings ending in "u"
	au  = { alone = 'aːu', wfinal = false },
	aeu = { alone = 'au', wfinal = false },
	eu  = { alone = 'eːu', wfinal = false },
	iu  = { alone = 'iu', wfinal = false },
	ou  = { alone = 'ou', wfinal = false },

	-- spelling ending in "w"
	aw  = { alone = 'aɯ', wfinal = false },
}

-- IPA values for syllable-final consonants, keyed by their spelling
-- (shape: [mnpbtdkg]?g?). The empty string stands for "no final".
-- Each voiced/voiceless spelling pair (p/b, t/d, k/g) shares one IPA value.
local finalConv = {
	[''] = '',

	-- nasals
	m  = 'm',
	n  = 'n',
	ng = 'ŋ',

	-- stops
	p  = 'p',
	b  = 'p',
	t  = 't',
	d  = 't',
	k  = 'k',
	g  = 'k',
}

-- Tone number -> IPA tone letters.
-- The note on each line repeats the contour in digits and, where the original
-- listed one, the tone letter used in the spelling.
-- "7:" is the key selected for tone 7 when the IPA vowel contains "ː".
local toneConv = {
	["1"]  = "˨˦", -- contour 24
	["2"]  = "˧˩", -- contour 31, letter z
	["3"]  = "˥", -- contour 55, letter j
	["4"]  = "˦˨", -- contour 42, letter x
	["5"]  = "˧˥", -- contour 35, letter q
	["6"]  = "˧", -- contour 33, letter h

	["7"]  = "˥", -- contour 55
	["7:"] = "˧˥", -- contour 35
	["8"]  = "˧", -- contour 33
}

-- Tone letter (as written at the end of a syllable) -> tone number.
-- The empty string, i.e. no tone letter, corresponds to tone 1.
local toneConvToNumbers = {
	[''] = '1',
	z = '2',
	j = '3',
	x = '4',
	q = '5',
	h = '6',
}

-- Tone number -> tone letter of the spelling.
-- Tones 1, 7, "7:" and 8 map to the empty string (no letter is written).
local toneConvFromNumbers = {
	["1"]  = "",
	["2"]  = "z",
	["3"]  = "j",
	["4"]  = "x",
	["5"]  = "q",
	["6"]  = "h",

	-- tones 7 / 7: / 8
	["7"]  = "",
	["7:"] = "",
	["8"]  = "",
}

-- Consonant spellings that are written differently in the 1957 orthography.
-- Only the differing spellings are listed; a lookup that finds nothing is
-- expected to fall back to the unchanged spelling at the call site.
local consonantConv_1957 = {
	mb  = 'ƃ',
	nd  = 'ƌ',
	ng  = 'ŋ',
	ngv = 'ŋv',
}

-- Vowel spellings that are written with a dedicated letter in the 1957
-- orthography. Used as a gsub replacement table, so only listed spellings
-- are rewritten.
local vowelConv_1957 = {
	oe = 'ɵ',
	ae = 'ə',
	w  = 'ɯ',
}

-- Tone number -> tone letter of the 1957 orthography.
-- Tones 1, 7, "7:" and 8 map to the empty string (no letter is written).
local toneConv_1957 = {
	["1"]  = "",
	["2"]  = "ƨ",
	["3"]  = "з",
	["4"]  = "ч",
	["5"]  = "ƽ",
	["6"]  = "ƅ",

	-- tones 7 / 7: / 8
	["7"]  = "",
	["7:"] = "",
	["8"]  = "",
}

-- Prepares Zhuang text for export.convert.
-- For every run of ASCII letters in `text`, the tone letters z/j/x/q are
-- replaced by tone digits, syllable boundaries are located (marked with "|"
-- while working), a syllable-final "h" is turned into tone digit 6, and every
-- syllable that still ends in a letter receives an explicit tone digit
-- (7, 8 or 1). The "|" markers are removed again before returning.
-- Non-letter characters following a letter run are kept as they are; text in
-- front of the first letter run (apart from one apostrophe directly before
-- it) is not part of the result.
-- Returns the rewritten text as a single string.
local function fix(text)
	local output = {}

	-- Each chunk is: optional apostrophe, a run of ASCII letters, then the
	-- non-letters that follow the run.
	for word in gmatch(text, '\'?[A-Za-z]+[^A-Za-z]*') do
		-- Split the chunk into its three parts; from here on `word` (shadowing
		-- the loop variable) is the letter run only.
		local apostrophe, word, nonword = match(word, '(\'?)([A-Za-z]+)([^A-Za-z]*)')

		-- Tone letters -> tone digits via toneConvToNumbers.
		-- "h" is left out here because it can be either a tone letter or a consonant.
		word = gsub(word, '[zjxq]', toneConvToNumbers) -- excludes h which is ambiguously tone or consonant

		-- Syllable splitting.
		-- /CV-CV/...=<CVCV>...
		-- /CVC-V/...=<CVC'V>...
		-- Pattern matching is greedy starting from the beginning of the string,
		-- so the word is reversed first: the pattern then consumes the word from
		-- its end, reading each syllable as final, vowel run, initial.
		-- A "|" is appended after every match in the reversed string and one is
		-- prepended; after reversing back, a "|" therefore stands in front of
		-- every matched syllable and one stands at the end of the word.
		-- Only the vowel-run class and the last initial class list upper-case
		-- letters; the other classes are lower-case only.
		word = reverse(word)
		word = '|' .. gsub(word, '(g?[mnpbtdkg]?)([ieu]?[uw]?[aeiouwAEIUOUW]+)([vy]?[gbd]?[bmfvdnslghrcyBMFVDNSLGHRCY]?)', '%1%2%3|')
		-- "+" seems to be needed after "[aiueow]"
		-- correct: "daeuz"→"daeuz" wrong: "daeuz"→"da|euz"
		word = reverse(word)
		-- debug trace of the first splitting pass
		mw.log('za1＞' .. word)

		-- fix bad initial consonant: "|hya"→"h|ya", "|ngya"→"n|gya"
		-- When the two or three characters between a marker and a vowel are not
		-- a key of initialConv, the marker is moved one character to the right.
		-- Returning nothing from the callback leaves the match unchanged.
		word = gsub(word, '(|)([^aiueow])([^aiueow])([^aiueow]?)([aiueow])', function(x,a,b,c,d)
			if not initialConv[lower(a..b..c)] then
				return a..x..b..c..d
			end
		end)
		-- Re-check syllables that end in a final consonant directly before a marker.
		word = gsub(word, '([aiueow]+)([mnpbtdkg]g?)(|)', function(v,c,x)
			-- if there is a final consonant,
			if c ~= '' then
				-- and the vowel run is not of the form [aeiouw]e? (the only
				-- vowel spellings that appear before finals),
				if not match(v, '^[aeiouw]e?$') then
					-- insert an extra marker inside the vowel run so that only its
					-- last [aeiouw]e? part stays attached to the final consonant
					-- (done on the reversed string, then reversed back)
					return reverse(gsub(reverse(v..c..x), '(|)([^aiueow]+)(e?[aeiouw])', '%1%2%3|'))
				end
			end
		end)
		-- Special case: a marker in front of "gvu" is moved to between "g" and "vu".
		word = gsub(word, '|gvu', 'g|vu')
		-- debug trace after boundary corrections
		mw.log('za2＞' .. word)

		-- An "h" directly before a marker is read as the tone letter (tone 6).
		word = gsub(word, 'h|', '6|')
		-- Any syllable still ending in a letter has no tone digit yet:
		-- final p/t/k gets 7, final b/d/g (but not "ng") gets 8, everything else 1.
		word = gsub(word, '([A-Za-z]+)|', function(a)
			if match(a, '[ptk]$') then
				return a..'7|'
			elseif match(a, '[bdg]$') and not match(a, 'ng$') then
				return a..'8|'
			else
				return a..'1|'
			end
		end)
		-- debug trace after tone assignment
		mw.log('za3＞' .. word)

		-- Drop the working markers and re-attach the apostrophe and trailing non-letters.
		table.insert(output, apostrophe .. gsub(word, '|', '') .. nonword)
	end

	return table.concat(output)
end

-- Looks up `key` in the conversion table `tbl` and returns the entry.
-- Raises a descriptive error naming the syllable when the table has no entry,
-- instead of letting a nil value fail later in a concatenation.
local function lookup_or_error(tbl, key, what, syllable)
	local value = tbl[key]
	if value == nil then
		error('Zhuang syllable "' .. syllable .. '": unsupported ' .. what .. ' "' .. key .. '"')
	end
	return value
end

-- Converts Zhuang text into one of several representations.
--
-- Parameters:
--   text    - the Zhuang text, or a frame-like table; in the latter case
--             text, scheme and new_bor are read from args[1], args[2] and
--             args['new_bor'].
--   scheme  - 'IPA', 'old', 'hyphenation', 'tone_numbers' or 'raw_syllables'.
--   new_bor - when truthy, tone 1 is treated as tone 5 in the 'IPA' and
--             'tone_numbers' schemes.
--             NOTE(review): values read from a frame are passed through
--             unchanged, so any non-nil string counts as true - confirm that
--             callers rely on this.
--
-- Returns:
--   'IPA'           - "/…/" with the syllables separated by spaces
--   'old'           - the text in the 1957 orthography
--   'hyphenation'   - the syllables joined by "‧", with spaces removed
--   'tone_numbers'  - the text with tone digits in <sup> tags
--   'raw_syllables' - the table of syllable strings produced by fix()
--
-- Raises an error for an unknown scheme (once a syllable is processed) and
-- for syllables that cannot be parsed or converted.
function export.convert(text, scheme, new_bor)
	if type(text) == "table" then
		text, scheme, new_bor = text.args[1], text.args[2], text.args['new_bor']
	end
	local converted = {}

	-- fix() discards whatever precedes the first letter, so remember it here.
	local extra_pre = match(text, '^[^A-Za-z]*')

	text = fix(text)

	mw.log('za4＞' .. text)

	for syllable in gmatch(text, '[A-Za-z]+%d[^A-Za-z]*') do
		local initial, vowel, final, tone, extra = match(syllable, '^([BMFVDNSLGHRCYbmfvdnslghrcy]?[gbd]?[vy]?)([AEIOUWaeiouw][ieu]?[uw]?)([mnpbtdkg]?g?)(%d)([^A-Za-z]*)$')

		-- A chunk that does not have the initial-vowel-final-tone shape would
		-- otherwise fail below with "attempt to concatenate a nil value".
		if not initial then
			error('Cannot parse Zhuang syllable "' .. syllable .. '"')
		end

		local caps = false
		mw.log('za5＞' .. initial, vowel, final, tone, extra)

		-- Work on lower case; capitalisation is restored on the first
		-- character in the schemes that keep the spelling.
		if find(initial .. vowel .. final, '[A-Z]') then
			caps = true
			initial, vowel, final = lower(initial), lower(vowel), lower(final)
		end

		if scheme == 'IPA' then
			local vowel_entry = lookup_or_error(vowelConv, vowel, 'vowel', syllable)
			local ipa_vowel
			if final == '' then
				-- Vowels without an `alone` value fall back to their
				-- `wfinal` value, as before.
				ipa_vowel = vowel_entry.alone or vowel_entry.wfinal
			else
				ipa_vowel = vowel_entry.wfinal
			end
			if not ipa_vowel then
				error('Zhuang syllable "' .. syllable .. '": vowel "' .. vowel .. '" cannot be followed by final "' .. final .. '"')
			end

			local ipa_initial = lookup_or_error(initialConv, initial, 'initial', syllable)
			local ipa_final = lookup_or_error(finalConv, final, 'final', syllable)

			-- Tone 7 has a separate value when the vowel is long.
			if tone == '7' and find(ipa_vowel, 'ː') then
				tone = '7:'
			elseif new_bor and tone == '1' then
				tone = '5'
			end

			local ipa_tone = lookup_or_error(toneConv, tone, 'tone', syllable)

			table.insert(converted, ipa_initial .. ipa_vowel .. ipa_final .. ipa_tone)
		elseif scheme == 'old' then
			initial = consonantConv_1957[initial] or initial
			vowel = gsub(vowel, '[oa]e', vowelConv_1957)
			vowel = gsub(vowel, 'w', vowelConv_1957)
			final = consonantConv_1957[final] or final
			tone = lookup_or_error(toneConv_1957, tone, 'tone', syllable)

			-- Without a final, "ae" and "aw" are written "əi" and "əɯ".
			if vowel == 'ə' and final == '' then
				vowel = 'əi'
			elseif vowel == 'aɯ' and final == '' then
				vowel = 'əɯ'
			end

			local out = initial .. vowel .. final .. tone .. extra
			if caps then out = gsub(out, '^(.)', upper) end

			table.insert(converted, out)
		elseif scheme == 'hyphenation' then
			tone = lookup_or_error(toneConvFromNumbers, tone, 'tone', syllable)

			extra = gsub(extra, '\'', '')
			local out = initial .. vowel .. final .. tone .. extra
			if caps then out = gsub(out, '^(.)', upper) end

			table.insert(converted, out)
		elseif scheme == 'tone_numbers' then
			if new_bor and tone == '1' then
				tone = '5'
			end

			extra = gsub(extra, '\'', '')
			local out = initial .. vowel .. final .. '<sup>' .. tone .. '</sup>' .. extra
			if caps then out = gsub(out, '^(.)', upper) end

			table.insert(converted, out)
		elseif scheme == 'raw_syllables' then
			table.insert(converted, syllable)
		else
			error('Convert to what representation?')
		end
	end

	if scheme == 'IPA' then
		converted = '/' .. table.concat(converted, ' ') .. '/'
	elseif scheme == 'old' then
		converted = extra_pre .. table.concat(converted, '')
		-- Remove an apostrophe (literal or as the entity &#39;) that directly
		-- follows "6", "Ƅ" or "ƅ".
		converted = mw.ustring.gsub(converted, "([6Ƅƅ])'", "%1")
		converted = mw.ustring.gsub(converted, "([6Ƅƅ])&#39;", "%1")
	elseif scheme == 'hyphenation' then
		converted = gsub(extra_pre .. table.concat(converted, '‧'), ' ', '')
	elseif scheme == 'tone_numbers' then
		converted = extra_pre .. table.concat(converted, '')
	elseif scheme == 'raw_syllables' then
		-- (pass) the table of syllables is returned as is
	end

	return converted
end

-- Template entry point: renders the pronunciation lines for a Zhuang term.
-- Reads parameter 1 (the text; defaults to the current page title) and the
-- boolean parameter `new_bor` from the parent frame, then returns three lines
-- joined by "\n* ": the qualified IPA transcription, the tone-number form and
-- the hyphenation form.
function export.show(frame)
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = { },
		['new_bor'] = { type = "boolean" },
	})

	local new_bor = args['new_bor']
	-- Fall back to the page title when no text was supplied.
	local text = args[1] or mw.title.getCurrentTitle().text

	-- Line 1: accent qualifier followed by the formatted IPA.
	local qualifier = require("Module:accent qualifier").format_qualifiers(lang, {"ဗၞတ်ဇြုန်"})
	local ipa = require("Module:IPA").format_IPA_full {
		lang = lang,
		items = {{ pron = export.convert(text, "IPA", new_bor) }}
	}

	local lines = {
		qualifier .. " " .. ipa,
		-- Line 2: spelling with superscript tone numbers.
		'ဂၞန်ရမျာင်ကျာ: ' .. export.convert(text, 'tone_numbers', new_bor),
		-- Line 3: syllable-separated spelling (trailing space kept).
		'ရမျာင်တိုန်စှ်ေ: ' .. export.convert(text, 'hyphenation', new_bor) .. ' ',
	}

	return table.concat(lines, '\n* ')
end

-- Template entry point: reports whether the first argument is Zhuang written
-- in the plain Latin alphabet.
-- Returns 'y' when the text contains an ASCII letter and none of the special
-- letters of the 1957 orthography; returns '' otherwise, including when no
-- argument was given.
function export.is_latin(frame)
	local text = frame.args[1]
	-- A missing argument cannot be Latin text; without this guard `find`
	-- would be called with nil.
	if not text then
		return ''
	end

	-- Any letter specific to the 1957 orthography rules the text out.
	if find(text, '[ƂƃƋƌŊŋƏəƟɵƜɯƧƨЗзЧчƼƽƄƅ]') then
		return ''
	end

	if find(text, '[A-Za-z]') then
		return 'y'
	end

	return '' -- CJK is too much of a pain to detect
end

return export