မဝ်ဂျူ:fa-cls-translit
မံက်ပြာကတ်
Documentation for this module may be created at မဝ်ဂျူ:fa-cls-translit/doc
-- Authors: Sameerhameedy
-- Shorthands for the Unicode-aware string helpers used throughout this file.
local U = mw.ustring.char
local gsub = mw.ustring.gsub
-- Module table; returned at the end of the file.
local export = {}
-- Tanwin (nunation) marks. They are rewritten to short vowel + "ن" by
-- before_diacritic_checking_subs.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
-- Short vowel marks; `mapping` turns them into "a", "i" and "u" respectively.
local zabar = U(0x64E)
local zer = U(0x650)
local pesh = U(0x64F)
local tashdid = U(0x651) -- also called shadda
-- Mapped to the empty string in `mapping` (sukun: no vowel).
local jazm = "ْ"
local he = "ه"
local zwnj = U(0x200C) -- zero-width non-joiner
local highhmz = U(0x654) -- combining hamza above; mapped to "-yi"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- Extra consonant letters appended to both consonant classes below
-- (the `mapping` table labels most of them "balti").
local balticons = "ڃڇڑڗݜݨݩǩ"
-- The strings below are spliced into "[...]" character classes, so each one is
-- a set of characters rather than a word.
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئywة" .. balticons -- including semivowels
local vowels = "āēīōū"
local semivowel = "یو"
-- NOTE(review): `hes` is not referenced anywhere in the visible source.
local hes = "هح"
local diacritics = "َُِّْٰ"
-- presumably the three short vowel marks (zabar, zer, pesh) — confirm against the literal
local ZZP = "َُِ"
local alif_wasla = "ٱ"
-- Characters treated like a word separator: whitespace, apostrophe, double quote
-- and (in the class form) ZWNJ.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. zwnj .. "]"
--- Per-character lookup table passed to mw.ustring.gsub(text, ".", mapping)
--- as the last conversion step of export.tr: every character that has a key
--- here is replaced by its value; characters without a key are left unchanged.
--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi.
local mapping = {
["آ"] = "ā",
["ب"] = "b",
["پ"] = "p",
["ت"] = "t",
["ث"] = "s",
["ج"] = "j",
["چ"] = "č",
["ح"] = "h",
["خ"] = "x",
["د"] = "d",
["ذ"] = "z",
["ر"] = "r",
["ز"] = "z",
["ژ"] = "ž",
["س"] = "s",
["ش"] = "š",
["ص"] = "s",
["ض"] = "z",
["ط"] = "t",
["ظ"] = "z",
["غ"] = "ğ",
["ف"] = "f",
["ق"] = "q",
["ک"] = "k",
["گ"] = "g",
["ل"] = "l",
["م"] = "m",
["ن"] = "n",
["و"] = "ō",
["ی"] = "ē",
["۔"] = ".",
["ه"] = "h",
["ع"] = "'",
["ء"] = "'",
["ئ"] = "'",
["ؤ"] = "'",
["أ"] = "'",
-- diacritics
[zabar] = "a",
[zer] = "i",
[pesh] = "u",
[jazm] = "", -- also sukun - no vowel
[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
[highhmz] = "-yi",
-- ligatures
["ﻻ"] = "lā",
["ﷲ"] = "allāh",
-- kashida
["ـ"] = "-", -- kashida, no sound
-- alif_wasla
[alif_wasla] = "", -- nothing
-- numerals
["۱"] = "1",
["۲"] = "2",
["۳"] = "3",
["۴"] = "4",
["۵"] = "5",
["۶"] = "6",
["۷"] = "7",
["۸"] = "8",
["۹"] = "9",
["۰"] = "0",
-- punctuation (leave on separate lines)
["؟"] = "?", -- question mark
["،"] = ",", -- comma
["؛"] = ";", -- semicolon
["«"] = "“", -- quotation mark
["»"] = "”", -- quotation mark
["٪"] = "%", -- percent
["؉"] = "‰", -- per mille
["٫"] = ".", -- decimal separator
["٬"] = ",", -- thousands separator
-- regional characters (FOR VERY SPECIFIC USE CASES)
["ټ"] = "ṭ",
["ٹ"] = "ṭ",
["ډ"] = "ḍ",
["ڈ"] = "ḍ",
-- balti
-- can't do anything about ژ because it conflicts with Persian
["ڃ"] = "ž",
["ڇ"] = "č̣",
["ڑ"] = "ṛ",
["ڗ"] = "dz",
["ݜ"] = "ṣ",
["ݨ"] = "ng",
["ݩ"] = "ny",
["ھ"] = "h",
["ے"] = "e",
}
-- Body of a character class (magic characters are already %-escaped). It is
-- used to strip punctuation in has_diacritics_subs and to fence punctuation
-- with "#" word-boundary markers in export.tr.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".'!»٪؉۔`,/–—%{%}"
-- Digits that `mapping` converts to 1-9 and 0; also used inside character classes.
local numbers = "۱۲۳۴۵۶۷۸۹۰"
-- Individual letters referenced by name in the substitution rules.
local ain = "ع"
local alif = "ا"
local malif = "آ" -- alif with madda; mapped to "ā"
-- NOTE(review): `hamza` is not referenced anywhere in the visible source.
local hamza = "ء"
local ye = "ی"
local ye2 = "ئ"
local vao = "و"
local dagger_alif = U(0x670) -- superscript (dagger) alif
local marbuta = U(0x629) -- ta marbuta (ة)
local te = "ت"
local ye3 = "ے" -- rewritten to `ye` by before_diacritic_checking_subs
local laam = "ل"
-- Complete character class (brackets included) of everything counted as a
-- vowel: Latin long vowels, short vowel marks, jazm, semivowels and alif madda.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .. "]"
-- Letters that trigger the rule turning "l-" + letter + tashdid into a doubled
-- letter (see before_diacritic_checking_subs).
local sun_letters = "تثدذرزسشصضطظلن"
-- Ordered list of { pattern, replacement } pairs. export.tr applies them one
-- after another with mw.ustring.gsub before it checks for diacritics, so each
-- rule sees the output of the rules above it; do not reorder casually.
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
-- U+06E5 / U+06E6 are rewritten to the ordinary letters و / ی
{ U(0x06E5), "و" },
{ U(0x06E6), "ی" },
{ "ہ", he }, -- get rid of balti he (allows balti to transliterate)
-- kaf followed by hamza above becomes the single placeholder letter ǩ
{ "ک" .. highhmz, "ǩ" },
-- normalize mark order: put tashdid before the vowel mark it was typed after
{ "([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1" },
-- alif with fathatan (either order) becomes zabar + ن
{ alif .. fatHataan, zabar .. "ن" },
{ fatHataan .. alif, zabar .. "ن" },
-- dagger alif after ye (or after zabar + ye/vao) is rewritten as zabar + alif
{ jazm .. ye .. dagger_alif, jazm .. ye .. zabar .. alif },
{ zabar .. "[" .. ye .. vao .. "]" .. dagger_alif, zabar .. alif },
{ ye .. dagger_alif, zabar .. alif }, -- the first letter is U+06CC
{ ye3, ye },
{ "[أإ]", ye2 },
-- kashida at the very start of the text
{ "^" .. "ـ" .. zabar .. alif , "ـ" .. malif },
{ "^" .. "ـ" .. "([" .. ZZP .. "])" , "ـ" .. alif .. "%1" },
-- any remaining dagger alif becomes zabar + alif
{ zabar .. dagger_alif, zabar .. alif },
{ dagger_alif, zabar .. alif },
{ fatHataan, zabar .. "ن" }, -- fatḥatan
{ Dammataan, pesh .. "ن" }, -- ḍammatan
{ kasrataan, zer .. "ن" }, -- kasratan
-- allah ligatures and arabic al
-- The Latin "l-" produced here is a temporary marker for the Arabic article;
-- the "l%-" rules further down consume it.
{ alif_wasla .. laam , "l-" },
{ alif_wasla, "" },
-- alif + laam directly after a pesh/zer-vowelled syllable (optionally with a
-- following vao/ye): drop the alif and hyphenate around the l
{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
-- same, but with a space-like character between the vowelled syllable and the
-- article: the separator is kept and only "l-" is inserted
{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. ZZP .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
-- ta marbuta + short vowel before alif-laam is written out as te
{ marbuta .. "([" .. ZZP .. "])" .. alif .. laam , te .. "%1-" .. laam .. "%-" },
-- "l-" before a sun letter carrying tashdid: the l is replaced by the doubled letter
{ "l%-" .. "([" .. sun_letters .. "])" .. tashdid, "%1" .. jazm .. "-%1" },
{ "l%-" .. laam .. tashdid, laam .. laam },
{ "l%-" .. laam, laam .. laam },
-- any other "l-" marker becomes a real laam followed by a hyphen
{ "l%-", laam .. "-" },
-- remaining ta marbuta: te when vowelled (or marked with jazm), otherwise he
{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-" },
{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1" },
{ marbuta, he },
-- vowelled consonant + separator + alif-laam followed by jazm or laam:
-- drop the alif, keep the separator and the laam
{
"(["
.. consonants2
.. "]["
.. ZZP
.. "])("
.. space_like_class
.. ")"
.. alif
.. laam
.. "(["
.. jazm
.. laam
.. "])",
"%1%2" .. laam .. "%3",
},
{ laam .. laam .. tashdid, laam .. tashdid },
-- vao after خ is dropped in the three contexts below;
-- use jazm/sukoon to prevent this conversion
{ "(خ)" .. vao .. zabar .. alif, "%1" .. zabar .. alif },
{ "(خ)" .. vao .. zabar, "%1" .. pesh },
{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])", "%1" .. ye .. "%2" },
-- izāfa
{ zwnj, "-" },
{ jazm .. alif, jazm .. "-" .. alif }, -- vowel killing, invisible ZWNJ
{ zabar .. jazm, "-" }, -- vowel killing, invisible ZWNJ
}
-- Ordered list of { pattern, replacement } pairs used only by has_diacritics.
-- Each rule deletes (or shrinks) a stretch of text that is considered
-- accounted for; has_diacritics reports true when nothing is left at the end.
-- Rules run in sequence on each other's output, so order matters.
local has_diacritics_subs = {
-- these ensure allah ligatures and al- work
{ "l%-", "" },
{ "[" .. sun_letters .. "]" .. jazm .. "%-" , "" },
{ "[" .. consonants2 .. "]" .. "([" .. ZZP .. "])" .. space_like_class .. alif .. laam , "" },
-- remove punctuation and tashdid
{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .. "]", "" },
-- a consonant at the end of the text, before a separator or before a hyphen
-- is dropped without requiring a vowel mark
{ "[" .. consonants .. "]$", "" },
{ "[" .. consonants .. "](" .. space_like_class .. ")", "%1" },
{ "[" .. consonants .. "]%-", "-" },
-- these are required for arabic al- to work
{ "[" .. consonants2 .. "]" .. "([" .. zer .. pesh .. "])" .. alif .. laam, laam },
{ "[" .. consonants2 .. "]([" .. zer .. pesh .. "])%-" .. alif .. laam, laam },
-- remove CV pairs
-- consonants paired to alif
{ "[" .. consonants2 .. "]" .. jazm, "" },
{ "[" .. consonants2 .. "]" .. jazm .. malif, "" },
{ "[" .. consonants2 .. "]" .. zabar .. alif, "" },
-- consonants paired to a semivowel
{
"[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])",
"%1%2",
},
{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", "" },
{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
{ malif, "" }, -- counts as a CV pair
{ jazm .. alif .. "[" .. ZZP .. "]", "" },
{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "]", "" },
{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", "" },
-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
{ "[" .. numbers .. "ٱ" .. "آ" .. "]", "" },
-- finally drop whitespace, hyphens, leftover semivowels and anything in the
-- `vowel` class
{ "%s", "" },
{ "%-", "" },
{ "[" .. semivowel .. "]", "" },
{ "(" .. vowel .. ")", "" },
}
--- Decide whether `text` is vocalized well enough to transliterate.
-- Directional marks (LRM/RLM) are stripped first; if any were present the
-- occurrence is recorded through Module:debug's tracker. The rules in
-- has_diacritics_subs are then applied in order, each one deleting material it
-- accounts for.
-- @param text string, already run through before_diacritic_checking_subs by the caller
-- @return true when nothing is left after all rules have run, false otherwise
local function has_diacritics(text)
	local remainder, marks_found = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_found > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		-- rule[1] is the pattern, rule[2] the replacement
		remainder = gsub(remainder, rule[1], rule[2])
	end
	return remainder == ""
end
--- Transliterate vocalized Perso-Arabic text into Latin script.
--
-- Can be called with a plain string, or with a table that has an `args` field
-- (presumably a Scribunto frame), in which case args[1]..args[5] supply
-- text, lang, sc, omit_i3raab and force_translit; empty strings count as absent.
--
-- @param text string to transliterate, or a table with an `args` field
-- @param lang language code; accepted but not read by this function
-- @param sc script code; accepted but not read by this function
-- @return the NFC-normalized transliteration, or nil when the text is judged
--         to lack diacritics and force_translit was not supplied
function export.tr(text, lang, sc)
	-- These two must be locals. Left undeclared they become globals, which
	-- (a) lets force_translit from one call leak into later calls in the same
	-- Lua state and (b) raises an error when strict global checking is active.
	-- omit_i3raab is accepted for positional compatibility but is not read here.
	local omit_i3raab, force_translit
	if type(text) == "table" then
		-- treat empty arguments as missing
		local function f(x)
			return (x ~= "") and x or nil
		end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end
	-- normalization pass; rules are order-dependent
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end
	-- refuse to guess at unvocalized text unless explicitly forced
	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end
	--define the "end" of a word
	-- literal "#" is protected first because "#" is used as the boundary marker
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	-- NOTE(review): the "%s" pass above already replaced every newline, so this
	-- pattern appears unable to match; kept as-is to preserve behavior.
	text = gsub(text, "\n", "#" .. "\n" .. "#")
	text = gsub(text, "([" .. punctuation .. "])", "#" .. "%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- hashtags now mark the beginning and end of a word
	--character reformatting and exceptions
	text = gsub(text, highhmz, "#" .. highhmz .. "#")
	--this ensures "and" is transliterated as a short vowel
	text = gsub(text, "#" .. vao .. "#", "#u#")
	text = gsub(text, "#" .. vao .. jazm .. malif, "#w-" .. malif )
	-- prevent izafa from converting until later
	-- Tashdeed: a consonant carrying tashdid is written twice
	text = gsub(text, "([" .. consonants .. "])" .. tashdid, "%1%1")
	text = gsub(text, "([" .. consonants .. "])" .. tashdid .. "([" .. ZZP .. "])", "%1%1%2")
	text = gsub(text, "([" .. consonants .. "])" .. "([" .. ZZP .. "])" .. tashdid, "%1%1%2")
	text = gsub(text, ye .. "([" .. ZZP .. "])" .. tashdid, "yy%1")
	text = gsub(text, vao .. "([" .. ZZP .. "])" .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. "([" .. ZZP .. "])", "yy%1")
	text = gsub(text, vao .. tashdid .. "([" .. ZZP .. "])", "ww%1")
	-- distinguish initial alif from vowel alif
	text = gsub(text, "([" .. consonants2 .. "])" .. zabar .. alif, "%1ā")
	text = gsub(text, "([" .. consonants2 .. "])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā") -- invisible ZWNJ
	text = gsub(text, "([" .. consonants2 .. "])" .. malif, "%1'ā")
	text = gsub(text, alif .. ye, "ē")
	text = gsub(text, alif .. vao, "ō")
	text = gsub(text, alif .. zer .. ye, "ī")
	text = gsub(text, alif .. pesh .. vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")
	-- convert semi vowels
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao .. "ā", "wā")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, ye .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ē%1%2")
	text = gsub(text, vao .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ō%1%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")
	text = gsub(text, "([" .. consonants .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. consonants .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")
	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh .. vao, "ū")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer .. ye, "ī")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, "(" .. vowel .. ")" .. ye, "%1y")
	--Alif with short vowel
	text = gsub(text, alif .. "([" .. ZZP .. "])", "%1")
	-- final changes
	-- izafa: a word-final zer (just before a "#" boundary) is hyphenated
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, "([^" .. consonants .. "])" .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, "([" .. consonants2 .. "])" .. zer .. "#", "%1-i#")
	text = gsub(text, '("\'")' .. "##" .. zer .. "#", "%1-i#")
	-- do not count zer as izafa before silent alif
	text = gsub(text, "%-i" .. "##" .. "(" .. space_like_class .. ")" .. "##" .. "([" .. sun_letters .. "]" .. jazm .. "#%-#" .. ")", "i%1%2")
	text = gsub(text, "%-i" .. "#%-#" .. "([" .. sun_letters .. "]" .. "#%-#" .. ")", "i-%1")
	-- he deletion: word-final he after a short vowel is dropped
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#", "%1#")
	-- word-initial ain is dropped
	text = gsub(text, "#" .. ain , "#")
	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	-- restore literal "#" characters protected at the start
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	text = mw.ustring.gsub(text, ".", mapping)
	-- alif
	-- Final corrections: collapse doubled "a" sequences into a single long vowel
	text = mw.ustring.gsub(text, "āa", "ā")
	text = mw.ustring.gsub(text, "aaa", "ā")
	text = mw.ustring.gsub(text, "āā", "ā")
	text = mw.ustring.gsub(text, "aa", "ā")
	-- break up a long ī/ū directly followed by another long vowel
	text = mw.ustring.gsub(text, "ī" .. "([" .. vowels .. "])", "iy%1")
	text = mw.ustring.gsub(text, "ū" .. "([" .. vowels .. "])", "uw%1")
	text = mw.ustring.toNFC(text)
	return text
end
-- Hand the module table back to the loader; `tr` is the only field this file
-- adds to it.
return export