-- မဝ်ဂျူ:fa-cls-translit
--
-- Documentation for this module may be created at မဝ်ဂျူ:fa-cls-translit/doc
-- Authors: Sameerhameedy

local U = mw.ustring.char
local gsub = mw.ustring.gsub
local export = {}

-- Code points and character classes shared by the substitution tables and export.tr.
-- Tanwin (nunation) marks; each is rewritten to a short vowel + ن by
-- before_diacritic_checking_subs.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
-- Short-vowel marks; `mapping` turns them into "a", "i" and "u" respectively.
local zabar = U(0x64E)
local zer = U(0x650)
local pesh = U(0x64F)
local tashdid = U(0x651) -- also called shadda
-- "No vowel" mark (also called sukun); `mapping` turns it into the empty string.
local jazm = "ْ"
local he = "ه"
-- Zero-width non-joiner; rewritten to "-" by before_diacritic_checking_subs.
local zwnj = U(0x200C)
-- Combining hamza above (U+0654); `mapping` turns it into "-yi".
local highhmz = U(0x654)
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- Extra consonant letters (the `mapping` table labels most of them "balti").
-- The trailing Latin "ǩ" is the placeholder that before_diacritic_checking_subs
-- substitutes for ک followed by `highhmz`.
local balticons = "ڃڇڑڗݜݨݩǩ"

-- Bodies of "[...]" pattern character classes.
-- `consonants`: letters always treated as consonants.
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئywة" .. balticons -- including semivowels
-- Long vowels of the Latin output; used by the final hiatus fixes in export.tr.
local vowels = "āēīōū"
-- Letters that export.tr resolves either to a consonant (y / w) or to a long vowel.
local semivowel = "یو"
-- NOTE(review): `hes` is not referenced anywhere in the visible part of this file.
local hes = "هح"
-- Combining marks that export.tr tests for next to و / ی when deciding whether
-- they are consonants (w / y).
local diacritics = "َُِّْٰ"
-- The short-vowel marks as a class body (name presumably stands for
-- zabar / zer / pesh -- TODO confirm the literal holds exactly those three).
local ZZP = "َُِ"
local alif_wasla = "ٱ"
-- Whitespace, apostrophe and double quote; with ZWNJ added these form the
-- "word gap" class used by the Arabic al- rules.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. zwnj .. "]"

--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi.

-- Per-character transliteration table. export.tr applies it as the last
-- character-by-character pass (gsub(text, ".", mapping)); a character with no
-- entry here is left unchanged by that pass.
-- By the time it runs, the context-sensitive rules in export.tr have already
-- rewritten most occurrences of و / ی / ا, so the entries for و and ی below act
-- as the defaults for whatever is left over.
local mapping = {
	["آ"] = "ā",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "h",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "z",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "s",
	["ض"] = "z",
	["ط"] = "t",
	["ظ"] = "z",
	["غ"] = "ğ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	["و"] = "ō",
	["ی"] = "ē",
	["۔"] = ".",

	["ه"] = "h",

	-- ain and the hamza carriers all become an apostrophe
	["ع"] = "'",
	["ء"] = "'",
	["ئ"] = "'",
	["ؤ"] = "'",
	["أ"] = "'",

	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	[highhmz] = "-yi",

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- alif_wasla
	[alif_wasla] = "", -- nothing

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimal separator
	["٬"] = ",", -- thousands separator

	-- regional characters (FOR VERY SPECIFIC USE CASES)
	["ټ"] = "ṭ",
	["ٹ"] = "ṭ",
	["ډ"] = "ḍ",
	["ڈ"] = "ḍ",
	-- balti
	-- nothing can be done about ژ because it conflicts with Persian
	["ڃ"] = "ž",
	["ڇ"] = "č̣",
	["ڑ"] = "ṛ",
	["ڗ"] = "dz",
	["ݜ"] = "ṣ",
	["ݨ"] = "ng",
	["ݩ"] = "ny",
	["ھ"] = "h",
	["ے"] = "e",
}

-- Bodies of "[...]" pattern classes: `punctuation` is wrapped in word-boundary
-- markers by export.tr, and both classes are stripped by has_diacritics_subs.
-- Pattern-magic characters in `punctuation` are pre-escaped with "%".
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".'!»٪؉۔`,/–—%{%}"
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in the substitution rules.
local ain = "ع"
local alif = "ا"
local malif = "آ" -- alif with madda
local hamza = "ء"
local ye = "ی"
local ye2 = "ئ" -- ye carrying a hamza
local vao = "و"
local dagger_alif = U(0x670)
local marbuta = U(0x629)
local te = "ت"
local ye3 = "ے" -- rewritten to `ye` by before_diacritic_checking_subs
local laam = "ل"
-- Complete "[...]" class: Latin long vowels, short-vowel marks, jazm, the
-- semivowel letters and alif madda.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .. "]"
-- Letters to which the l of Arabic al- assimilates (see the "l%-" rules in
-- before_diacritic_checking_subs).
local sun_letters = "تثدذرزسشصضطظلن"

-- Ordered list of { pattern, replacement } pairs. export.tr applies them one
-- after another with mw.ustring.gsub before it checks whether the text is fully
-- vocalized. Order matters: later rules consume intermediate forms (such as
-- the Latin "l-" marker) produced by earlier ones.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	{ U(0x06E5), "و" },
	{ U(0x06E6), "ی" },
	{ "ہ", he }, -- get rid of balti he (allows balti to transliterate)
	{ "ک" .. highhmz, "ǩ" },
	-- put tashdid before the vowel mark so later rules see a single order
	{ "([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1" },
	{ alif .. fatHataan, zabar .. "ن" },
	{ fatHataan .. alif, zabar .. "ن" },
	{ jazm .. ye .. dagger_alif, jazm .. ye .. zabar .. alif },
	{ zabar .. "[" .. ye .. vao .. "]" .. dagger_alif, zabar .. alif },
	{ ye .. dagger_alif, zabar .. alif }, -- the first letter is U+06CC
	{ ye3, ye },
	{ "[أإ]", ye2 },
	-- kashida
	{ "^" .. "ـ" .. zabar .. alif , "ـ" .. malif },
	{ "^" .. "ـ" .. "([" .. ZZP .. "])" , "ـ" .. alif .. "%1" },
	{ zabar .. dagger_alif, zabar .. alif },
	{ dagger_alif, zabar .. alif },
	{ fatHataan, zabar .. "ن" }, -- fatḥatan
	{ Dammataan, pesh .. "ن" }, -- ḍammatan
	{ kasrataan, zer .. "ن" }, -- kasratan

	-- allah ligatures and arabic al
	-- (the Latin "l-" written here is a temporary marker resolved by the "l%-" rules below)
	{ alif_wasla .. laam , "l-" },
	{ alif_wasla, "" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. ZZP .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
	{ marbuta .. "([" .. ZZP .. "])" .. alif .. laam , te .. "%1-" .. laam .. "%-" },
	-- sun-letter assimilation: "l-" + geminated sun letter becomes that letter doubled across the hyphen
	{ "l%-" .. "([" .. sun_letters .. "])" .. tashdid, "%1" .. jazm .. "-%1" },
	{ "l%-" .. laam .. tashdid, laam .. laam },
	{ "l%-" .. laam, laam .. laam },
	{ "l%-", laam .. "-" },
	-- te marbuta: becomes ت when it carries a vowel mark or jazm, otherwise ه
	{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-" },
	{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1" },
	{ marbuta, he },
	{
		"(["
			.. consonants2
			.. "]["
			.. ZZP
			.. "])("
			.. space_like_class
			.. ")"
			.. alif
			.. laam
			.. "(["
			.. jazm
			.. laam
			.. "])",
		"%1%2" .. laam .. "%3",
	},
	{ laam .. laam .. tashdid, laam .. tashdid },
	-- use jazm/sukoon to prevent this conversion
	{ "(خ)" .. vao .. zabar .. alif, "%1" .. zabar .. alif },
	{ "(خ)" .. vao .. zabar, "%1" .. pesh },
	{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])", "%1" .. ye .. "%2" },
	-- izāfa
	{ zwnj, "-" },
	{ jazm .. alif, jazm .. "-" .. alif }, -- vowel killing, invisible ZWNJ
	{ zabar .. jazm, "-" }, -- vowel killing, invisible ZWNJ
}

-- Ordered list of { pattern, replacement } pairs used only by has_diacritics.
-- Each rule deletes material that counts as accounted for (punctuation,
-- consonant + vowel-mark pairs, word-final consonants, ...). If the string is
-- empty once every rule has run, has_diacritics returns true.
-- The rules run in sequence on the output of the previous rule, so their order
-- is significant.
local has_diacritics_subs = {
	-- this ensures allah ligatures and al- work
	{ "l%-", "" },
	{ "[" .. sun_letters .. "]" .. jazm .. "%-" , "" },
	{ "[" .. consonants2 .. "]" .. "([" .. ZZP .. "])" .. space_like_class .. alif .. laam , "" },
	-- remove punctuation and tashdid
	{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .. "]", "" },
	-- a consonant at the end of the text, before a word gap, or before a hyphen needs no vowel mark
	{ "[" .. consonants .. "]$", "" },
	{ "[" .. consonants .. "](" .. space_like_class .. ")", "%1" },
	{ "[" .. consonants .. "]%-", "-" },
	-- these are required for arabic al- to work
	{ "[" .. consonants2 .. "]" .. "([" .. zer .. pesh .. "])" .. alif .. laam, laam },
	{ "[" .. consonants2 .. "]([" .. zer .. pesh .. "])%-" .. alif .. laam, laam },
	-- remove CV pairs
	-- consonants paired to alif
	{ "[" .. consonants2 .. "]" .. jazm, "" },
	{ "[" .. consonants2 .. "]" .. jazm .. malif, "" },
	{ "[" .. consonants2 .. "]" .. zabar .. alif, "" },
	-- consonants paired to a semivowel
	{
		"[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])",
		"%1%2",
	},
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", "" },
	{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ malif, "" }, -- counts as a CV pair
	{ jazm .. alif .. "[" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", "" },
	-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
	{ "[" .. numbers .. "ٱ" .. "آ" .. "]", "" },
	{ "%s", "" },
	{ "%-", "" },
	-- leftover semivowel letters and vowel-class characters are also accepted
	{ "[" .. semivowel .. "]", "" },
	{ "(" .. vowel .. ")", "" },
}

--- Decide whether `text` carries enough vowel marks to be transliterated.
-- Directional marks (LRM / RLM) are stripped first; finding any is reported to
-- the "fa-translit/lrm or rlm" tracking page. Every rule of
-- has_diacritics_subs is then applied in order.
-- @param text string already processed by before_diacritic_checking_subs
-- @return true when nothing is left after all rules have run, false otherwise
local function has_diacritics(text)
	local remainder, marks_removed = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_removed > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		-- each rule is a { pattern, replacement } pair
		remainder = gsub(remainder, rule[1], rule[2])
	end
	return remainder == ""
end

--- Transliterate vocalized Perso-Arabic text into Latin script.
--
-- The text first goes through before_diacritic_checking_subs. Unless
-- transliteration is forced, text that has_diacritics rejects is not
-- transliterated. Otherwise words are delimited with "#" markers, the
-- context-sensitive rules for tashdid, alif, و / ی, izafa and final ه are
-- applied in a fixed order, and every remaining character is converted
-- through `mapping`.
--
-- @param text string to transliterate, or a frame-like table whose `args`
--   supply text, lang, sc, omit_i3raab and force_translit as args[1]..args[5]
--   (empty strings count as absent)
-- @param lang language code; accepted for interface compatibility, not read here
-- @param sc script code; accepted for interface compatibility, not read here
-- @return the transliteration in NFC form, or nil when the text lacks
--   diacritics and transliteration was not forced
function export.tr(text, lang, sc)
	-- These must be locals: as globals, a force_translit set by one table-style
	-- call would stay set for every later call in the same Lua state and
	-- silently bypass the diacritics check below.
	-- omit_i3raab is accepted for argument-position compatibility but is unused.
	local omit_i3raab, force_translit
	if type(text) == "table" then
		local function f(x)
			return (x ~= "") and x or nil
		end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end

	--define the "end" of a word
	-- a literal "#" in the input is protected first, since "#" becomes the boundary marker
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	text = gsub(text, "\n", "#" .. "\n" .. "#")
	text = gsub(text, "([" .. punctuation .. "])", "#" .. "%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- "#" markers now mark the beginning and end of a word
	--character reformatting and exceptions
	text = gsub(text, highhmz, "#" .. highhmz .. "#")
	--this ensures "and" is transliterated as a short vowel
	text = gsub(text, "#" .. vao .. "#", "#u#")
	text = gsub(text, "#" .. vao .. jazm .. malif, "#w-" .. malif )
	-- prevent izafa from converting until later

	-- Tashdeed
	text = gsub(text, "([" .. consonants .. "])" .. tashdid, "%1%1")
	text = gsub(text, "([" .. consonants .. "])" .. tashdid .. "([" .. ZZP .. "])", "%1%1%2")
	text = gsub(text, "([" .. consonants .. "])" .. "([" .. ZZP .. "])" .. tashdid, "%1%1%2")
	text = gsub(text, ye .. "([" .. ZZP .. "])" .. tashdid, "yy%1")
	text = gsub(text, vao .. "([" .. ZZP .. "])" .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. "([" .. ZZP .. "])", "yy%1")
	text = gsub(text, vao .. tashdid .. "([" .. ZZP .. "])", "ww%1")

	-- distinguish initial alif from vowel alif
	text = gsub(text, "([" .. consonants2 .. "])" .. zabar .. alif, "%1ā")
	text = gsub(text, "([" .. consonants2 .. "])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā") -- invisible ZWNJ
	text = gsub(text, "([" .. consonants2 .. "])" .. malif, "%1'ā")
	text = gsub(text, alif .. ye, "ē")
	text = gsub(text, alif .. vao, "ō")
	text = gsub(text, alif .. zer .. ye, "ī")
	text = gsub(text, alif .. pesh .. vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")

	-- convert semi vowels
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao .. "ā", "wā")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, ye .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ē%1%2")
	text = gsub(text, vao .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ō%1%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")
	text = gsub(text, "([" .. consonants .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. consonants .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")

	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh .. vao, "ū")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer .. ye, "ī")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, "(" .. vowel .. ")" .. ye, "%1y")

	--Alif with short vowel
	text = gsub(text, alif .. "([" .. ZZP .. "])", "%1")

	-- final changes
	-- izafa
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, "([^" .. consonants .. "])" .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, "([" .. consonants2 .. "])" .. zer .. "#", "%1-i#")
	-- NOTE(review): this pattern captures the literal three-character sequence
	-- double quote, apostrophe, double quote; a character class matching either
	-- quote may have been intended -- confirm before changing.
	text = gsub(text, '("\'")' .. "##" .. zer .. "#", "%1-i#")
	-- do not count zer as izafa before silent alif
	text = gsub(text, "%-i" .. "##" .. "(" .. space_like_class .. ")" .. "##" .. "([" .. sun_letters .. "]" .. jazm .. "#%-#" .. ")", "i%1%2")
	text = gsub(text, "%-i" .. "#%-#" .. "([" .. sun_letters .. "]" .. "#%-#" .. ")", "i-%1")
	-- he deletion
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#", "%1#")
	text = gsub(text, "#" .. ain , "#")

	-- get rid of the "#" markers (not needed any more) and restore literal "#"
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	-- byte-level removal is safe here: the UTF-8 encodings of LRM / RLM contain no pattern-magic characters
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	text = mw.ustring.gsub(text, ".", mapping)

	-- alif
	-- Final corrections
	text = mw.ustring.gsub(text, "āa", "ā")
	text = mw.ustring.gsub(text, "aaa", "ā")
	text = mw.ustring.gsub(text, "āā", "ā")
	text = mw.ustring.gsub(text, "aa", "ā")
	-- break up ī / ū followed by another long vowel with a glide
	text = mw.ustring.gsub(text, "ī" .. "([" .. vowels .. "])", "iy%1")
	text = mw.ustring.gsub(text, "ū" .. "([" .. vowels .. "])", "uw%1")

	text = mw.ustring.toNFC(text)

	return text
end

-- Hand the module table (which carries `tr`) back to require().
return export