မဝ်ဂျူ:IPA/X-SAMPA
မံက်ပြာကတ်
This module contains functions related to X-SAMPA. It was split from Module:IPA to save memory, because X-SAMPA is not used directly in entries but only when saving a page (as {{x2i}}, {{x2ipa}}, and {{x2ipachar}} are always substituted).
-- Helper taken from the shared string utilities module; applied below to
-- arguments that arrive through a frame.
local decode_entities = require("Module:string utilities").decode_entities
-- Table of functions returned at the end of this module.
local export = {}
-- X-SAMPA data table loaded with mw.loadData. It is keyed by X-SAMPA symbol;
-- each entry holds the IPA symbol at index 1 and may carry the fields
-- `with_descender`, `has_descender` and `is_diacritic`.
local m_XSAMPA = mw.loadData('Module:IPA/data/X-SAMPA')
-- IPA -> X-SAMPA lookup table, filled on demand by
-- Populate_IPA_XSAMPA_LookupTables.
local i2x_lookup = {}
--[[
Fill `i2x_lookup` (IPA symbol -> X-SAMPA symbol) from the X-SAMPA data table
the first time it is needed, then return it. Both the plain IPA symbol
(`data[1]`) and, when present, its `with_descender` variant map back to the
same X-SAMPA symbol.

NOTE(review): if two X-SAMPA symbols share an IPA symbol, the one that wins
depends on pairs() traversal order, which is unspecified -- confirm against
the data module.
]]
local function Populate_IPA_XSAMPA_LookupTables()
	-- i2x_lookup only has string keys, so the length operator `#` always
	-- yields 0 for it and cannot tell "empty" from "already populated".
	-- next() is the reliable emptiness test for a hash table.
	if next(i2x_lookup) == nil then
		for XSAMPA_symbol, data in pairs(m_XSAMPA) do
			local IPA_symbol = data[1]
			i2x_lookup[IPA_symbol] = XSAMPA_symbol

			local with_descender = data.with_descender
			if with_descender then
				i2x_lookup[with_descender] = XSAMPA_symbol
			end
		end
	end

	return i2x_lookup
end
--[[
Convert IPA text to X-SAMPA.

`text` is either a string or a table (a frame) whose `args[1]` holds the IPA
text. For a frame, the literal sequences "{{=}}" and "{{!}}" in the argument
are turned into "=" and "|", the argument is passed through decode_entities,
and the converted result is passed through Module:string/nowiki before being
returned. For a plain string the converted text is returned as is.
]]
function export.IPA_to_XSAMPA(text)
	Populate_IPA_XSAMPA_LookupTables()

	-- A table argument means we were invoked with a frame.
	local from_frame = type(text) == 'table'
	if from_frame then
		local argument = text.args[1]
		argument = argument:gsub('{{=}}', '=')
		argument = argument:gsub('{{!}}', '|')
		text = decode_entities(argument)
	end

	-- Replace a doubled length mark with a single ':' before the
	-- character-by-character lookup.
	local collapsed = text:gsub('ːː', ':')
	-- Map each character through the IPA -> X-SAMPA table; characters with no
	-- entry are left unchanged.
	local converted = mw.ustring.gsub(collapsed, '.', i2x_lookup)

	if from_frame then
		converted = require("Module:string/nowiki")(converted)
	end
	return converted
end
--[[
Convert X-SAMPA text to IPA.

`text` is either a string or a table (a frame) whose `args[1]` holds the
X-SAMPA text; a frame argument is first passed through decode_entities.

Input of the form <...> is emitted wrapped in the angle brackets U+27E8 and
U+27E9. HTML tags, character entity references and numeric character
references are carried through unconverted. At each position the longest
X-SAMPA symbol (up to 4 bytes) found in the data table is used; an ASCII
byte with no entry is copied as is, and non-ASCII bytes are always copied.

Returns the IPA string.
]]
function export.XSAMPA_to_IPA(text)
	local data = m_XSAMPA

	if type(text) == 'table' then -- a frame, extract args
		text = text.args[1]
		text = decode_entities(text) -- XXX
	end
	-- NOTE(review): unlike export.IPA_to_XSAMPA, the frame path here has never
	-- passed its result through Module:string/nowiki (the boolean flag that
	-- used to be set here was shadowed and never read). Left unchanged so the
	-- output stays the same -- confirm whether escaping was intended.

	-- Simpler function adapted from [[w:Module:Sandbox/Erutuon/X-SAMPA]]
	local output, characteristics = {}, {}

	local angle_bracket
	if text:sub(1, 1) == "<" and text:sub(-1) == ">" then
		table.insert(output, "⟨")
		angle_bracket = "⟩"
		text = text:sub(2, -2)
	end

	local escaped = {}          -- placeholder character -> original substring
	local placeholder_for = {}  -- original substring -> placeholder character
	local emoticon = 0x1F600 - 1

	-- Replace every match of `pattern` in `str` with a placeholder character
	-- taken from U+1F600 onwards. Each distinct substring gets its own
	-- placeholder, so different tags or entities are restored correctly
	-- instead of all turning into the last one matched.
	local function protect(str, pattern)
		return (str:gsub(
			pattern,
			function(match)
				local placeholder = placeholder_for[match]
				if not placeholder then
					emoticon = emoticon + 1
					placeholder = mw.ustring.char(emoticon)
					placeholder_for[match] = placeholder
					escaped[placeholder] = match
				end
				return placeholder
			end))
	end

	--[[
	Replace
		-- HTML tags
		-- character entity references
		-- numeric character references (decimal and hexadecimal)
	with placeholder characters, so they are not read as X-SAMPA.
	--]]
	text = protect(text, '<[^>]+>')
	text = protect(text, '&%a+;')
	text = protect(text, '&#%d+;')
	text = protect(text, '&#x%x+;')

	while #text > 0 do
		-- skip non-ASCII bytes (that is, multi-byte characters)
		text = text:gsub(
			'^[\128-\255]+',
			function (nonASCII)
				table.insert(output, nonASCII)
				return ""
			end)
		if #text == 0 then
			break
		end

		-- Try the longest candidate first.
		for i = 4, 1, -1 do
			local potential_XSAMPA = text:sub(1, i)
			local result = data[potential_XSAMPA]
			local IPA, has_descender, is_diacritic

			if result then
				IPA = result[1]
				has_descender = result.has_descender
				is_diacritic = result.is_diacritic

				local with_descender = result.with_descender
				if with_descender then
					-- Go backwards through the transcription, skipping any
					-- diacritics. The walk stops at index 0 so that a symbol
					-- at the very start (or preceded only by diacritics) does
					-- not index a nil entry.
					local k = #characteristics
					while k > 0 and characteristics[k].is_diacritic do
						k = k - 1
					end

					--[[ Look at the first non-diacritic symbol before the
						current symbol. If it has a descender, use the
						descender form of the current symbol. ]]
					local previous = characteristics[k]
					if previous and previous.has_descender then
						IPA = with_descender
					end
				end
			elseif i == 1 then
				-- No entry even for a single byte: copy it unchanged.
				IPA = potential_XSAMPA
			end

			if IPA then
				text = text:sub(i + 1)
				table.insert(output, IPA)
				table.insert(characteristics, { has_descender = has_descender, is_diacritic = is_diacritic } )
				break
			end
		end
	end

	if angle_bracket then
		table.insert(output, angle_bracket)
	end

	-- Put the protected substrings back. Multi-byte characters that are not
	-- placeholders have no entry in `escaped` and are left as they are.
	local converted = table.concat(output)
	converted = converted:gsub("[\194-\244][\128-\191]+", escaped)

	return converted
end
return export