
Module:headword: Difference between revisions

From Wiktionary
local export = {}


-- Named constants for all modules used, to make it easier to swap out sandbox versions.
local m_data = mw.loadData("Module:headword/data")
local debug_track_module = "Module:debug/track"
local gender_and_number_module = "Module:gender and number"
local headword_data_module = "Module:headword/data"
local headword_page_module = "Module:headword/page"
local links_module = "Module:links"
local palindromes_module = "Module:palindromes"
local qualifier_module = "Module:qualifier"
local scripts_module = "Module:scripts"
local scripts_data_module = "Module:scripts/data"
local script_utilities_module = "Module:script utilities"
local script_utilities_data_module = "Module:script utilities/data"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local utilities_module = "Module:utilities"

local m_str_utils = require(string_utilities_module)

local concat = table.concat
local encode_entities = m_str_utils.encode_entities
local insert = table.insert
local ipairs = ipairs
local pairs = pairs
local pattern_escape = m_str_utils.pattern_escape
local rgmatch = mw.ustring.gmatch
local rsubn = mw.ustring.gsub
local rfind = mw.ustring.find
local ulen = m_str_utils.len
local rmatch = mw.ustring.match
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

local m_data = mw.loadData(headword_data_module)
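-- `m_data` ([[Module:headword/data]]) supplies the lemma/non-lemma POS lists, the set of scripts whose page
-- titles get tagged, per-language exceptions (e.g. no_multiword_links, no_gender_cat) and precomputed
-- information about the current page.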


local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged


-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false


-- Version of rsubn() that discards all but the first return value.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end


-- Add a tracking category to track entries with certain (unusually undesirable) properties. `track_id` is an identifier
-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the
-- page text to [[Wiktionary:Tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property
-- by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]].
--
-- If `lang` (a language object) is given, an additional tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is
-- linked to where CODE is the language code of `lang`, and you can find all entries in the combination of `track_id`
-- and `lang` by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. This makes it possible to
-- isolate only the entries with a specific tracking property that are in a given language. Note that if `lang`
-- references an etymology-only language, both that language's code and its full parent's code are tracked.
local function track(track_id, lang)
	local tracking_page = "headword/" .. track_id
	local m_debug_track = require(debug_track_module)
	if lang and lang:hasType("etymology-only") then
		m_debug_track{tracking_page, tracking_page .. "/" .. lang:getCode(),
			tracking_page .. "/" .. lang:getFullCode()}
	elseif lang then
		m_debug_track{tracking_page, tracking_page .. "/" .. lang:getCode()}
	else
		m_debug_track(tracking_page)
	end
	return true
end




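-- Return true if `text` contains at least one character of the script with code `script_code` (e.g. "Hira",
-- "Kana" or "Hang"); used below when deciding whether to script-tag the page's display title.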
local function text_in_script(text, script_code)
	local sc = require(scripts_module).getByCode(script_code)
	if not sc then
		error("Internal error: Bad script code " .. script_code)
	end

	local characters = sc.characters

	local out
	if characters then
		text = rsub(text, "%W", "")
		out = rfind(text, "[" .. characters .. "]")
	end

	if out then
		return true
	else
		return false
	end
end

local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
	Used to exclude characters from the regex above. ]]
local wordPunc = "-־׳״'.·*’་•:᠊"
local notWordPunc = "[^" .. wordPunc .. "]+"


-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
-- customized separator: `part` is the object specifying the term, which should optionally contain:
-- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
-- * right qualifiers in `qq`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
--   (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
local function format_term_with_qualifiers_and_refs(part, formatted, j)
	local left_qualifiers, right_qualifiers
	local reftext

	left_qualifiers = part.q and #part.q > 0 and part.q
	if left_qualifiers then
		left_qualifiers = require(qualifier_module).format_qualifier(left_qualifiers) .. " "
	end

	right_qualifiers = part.qq and #part.qq > 0 and part.qq
	if right_qualifiers then
		right_qualifiers = " " .. require(qualifier_module).format_qualifier(right_qualifiers)
	end

	if part.refs and #part.refs > 0 then
		local refs = {}
		for _, ref in ipairs(part.refs) do
			if type(ref) ~= "table" then
				ref = {text = ref}
			end
			local refargs
			if ref.name or ref.group then
				refargs = {name = ref.name, group = ref.group}
			end
			insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
		end
		reftext = concat(refs)
	end

	local separator = part.separator or j > 1 and " <i>หรือ</i> " -- use "" to request no separator

	if left_qualifiers then
		formatted = left_qualifiers .. formatted
	end
	if reftext then
		formatted = formatted .. reftext
	end
	if right_qualifiers then
		formatted = formatted .. right_qualifiers
	end
	if separator then
		formatted = separator .. formatted
	end

	return formatted
end


--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
	for possibleWordBreak in rgmatch(head, spacingPunctuation) do
		if rmatch(possibleWordBreak, notWordPunc) then
			return true
		end
	end

	return false
end


--[==[Add links to a multiword head.]==]
function export.add_multiword_links(head, default)
	local function workaround_to_exclude_chars(s)
		return rsub(s, notWordPunc, "\2%1\1")
	end

	head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
	if default then
		head = head
			:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
			:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
	end

	-- Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
	head = encode_entities(head, "[]", true, true)

	--[=[
	use this when workaround is no longer needed:

	head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

	Remove any empty links, which could have been created above
	at the beginning or end of the string.
	]=]
	return (head
		:gsub("\1\2", "")
		:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
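-- For example, export.add_multiword_links("water under the bridge") returns
-- "[[water]] [[under]] [[the]] [[bridge]]". Characters in `wordPunc` do not split words, so
-- export.head_is_multiword("apple-tree") is false and hyphenated terms are left unlinked.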




local function non_categorizable(full_raw_pagename)
	return full_raw_pagename:find("^ภาคผนวก:อากัปกิริยา/")
end


-- Format a headword with transliterations.
local function format_headword(data)
	local m_scriptutils = require(script_utilities_module)

	-- Are there non-empty transliterations?
	local has_translits = false
	local has_manual_translits = false

	------ Format the headwords. ------

	local head_parts = {}
	local unique_head_parts = {}

	local has_multiple_heads = #data.heads > 1

	for j, head in ipairs(data.heads) do
		if head.tr or head.ts then
			has_translits = true
		end
		if head.tr and head.tr_manual or head.ts then
			has_manual_translits = true
		end

		local formatted

		-- Apply processing to the headword, for formatting links and such.
		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Imag" then
			formatted = require(links_module).language_link{term = head.term, lang = data.lang}
		else
			formatted = data.lang:makeDisplayText(head.term, head.sc, true)
		end

		local function tag_text_and_add_quals_and_refs(head, formatted, j)
			-- Add language and script wrapper.
			formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)

			-- Add qualifiers, references and separator.
			return format_term_with_qualifiers_and_refs(head, formatted, j)
		end

		local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
		insert(head_parts, head_part)

		-- If multiple heads, try to determine whether all heads display the same. To do this we need to effectively
		-- rerun the text tagging and addition of qualifiers and references, using 1 for all indices.
		if has_multiple_heads then
			local unique_head_part
			if j == 1 then
				unique_head_part = head_part
			else
				unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
			end
			unique_head_parts[unique_head_part] = true
		end
	end

	local set_size = 0
	if has_multiple_heads then
		for _ in pairs(unique_head_parts) do
			set_size = set_size + 1
		end
	end
	if set_size == 1 then
		head_parts = head_parts[1]
	else
		head_parts = concat(head_parts)
	end

	if has_manual_translits then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit/LANGCODE]]
		track("has-manual-translit", data.lang)
	end

	------ Format the transliterations and transcriptions. ------

	local translits_formatted

	if has_translits then
		local translit_parts = {}
		for _, head in ipairs(data.heads) do
			if head.tr or head.ts then
				local this_parts = {}
				if head.tr then
					insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
					if head.ts then
						insert(this_parts, " ")
					end
				end
				if head.ts then
					insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
				end
				insert(translit_parts, concat(this_parts))
			end
		end

		translits_formatted = " (" .. concat(translit_parts, " <i>หรือ</i> ") .. ")"

		local langname = data.lang:getCanonicalName()
		local transliteration_page = mw.title.new("การถอดอักษรภาษา" .. langname, "วิกิพจนานุกรม")
		local saw_translit_page = false

		if transliteration_page and transliteration_page.exists then
			translits_formatted = " [[วิกิพจนานุกรม:การถอดอักษรภาษา" .. langname .. "|•]]" .. translits_formatted
			saw_translit_page = true
		end
		-- If data.lang is an etymology-only language and we didn't find a transliteration page for it, fall back to
		-- the full parent.
		if not saw_translit_page and data.lang:hasType("etymology-only") then
			langname = data.lang:getFullName()
			transliteration_page = mw.title.new("การถอดอักษรภาษา" .. langname, "วิกิพจนานุกรม")

			if transliteration_page and transliteration_page.exists then
				translits_formatted = " [[วิกิพจนานุกรม:การถอดอักษรภาษา" .. langname .. "|•]]" .. translits_formatted
			end
		end
	else
		translits_formatted = ""
	end

	------ Paste heads and transliterations/transcriptions. ------

	local lemma_gloss
	if data.gloss then
		lemma_gloss = ' <span class="ib-content qualifier-content">' .. data.gloss .. '</span>'
	else
		lemma_gloss = ""
	end

	return head_parts .. translits_formatted .. lemma_gloss
end




local function format_genders(data)
	local retval = ""
	if data.genders and #data.genders > 0 then
		if data.gloss then
			retval = ","
		end
		local pos_for_cat
		if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] and
			not m_data.no_gender_cat[data.lang:getFullCode()] then
			local pos_category = data.pos_category:gsub("^reconstructed ", "")
			pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
		end
		local text, cats = require(gender_and_number_module).format_genders(data.genders, data.lang, pos_for_cat)
		for _, cat in ipairs(cats) do
			insert(data.categories, cat)
		end
		retval = retval .. "&nbsp;" .. text
	end
	return retval
end
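-- For instance, data.genders = {"m"} renders as something like "&nbsp;m" after the headword (the exact display
-- comes from [[Module:gender and number]]), and any categories that module returns are appended to
-- data.categories.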




local function format_inflection_parts(data, parts)
	local any_part_translit = false

	for j, part in ipairs(parts) do
		if type(part) ~= "table" then
			part = {term = part}
		end

		local partaccel = part.accel
		local face = part.face or "bold"
		if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
			error("The face `" .. face .. "` " .. (
				mw.loadData(script_utilities_data_module).faces[face] and
				"should not be used for non-headword terms on the headword line." or
				"is invalid."
			))
		end

		-- Here the final part 'or data.nolinkinfl' allows 'nolinkinfl=true' to be put directly
		-- into the 'data' table to disable inflection links for the entire headword,
		-- when inflected forms aren't entry-worthy, e.g. in Vulgar Latin.
		local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or (
			data.nolink and track("nolink") or data.nolinkinfl)

		local formatted
		if part.label then
			-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
			formatted = "<i>" .. part.label .. "</i>"
		else
			-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
			-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
			-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
			-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
			-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
			-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
			-- to be enabled in languages with more complex scripts (e.g. Arabic).
			local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
			if tr ~= "-" then
				any_part_translit = true
			end
			formatted = require(links_module).full_link(
				{
					term = not nolinkinfl and part.term or nil,
					alt = part.alt or (nolinkinfl and part.term or nil),
					lang = part.lang or data.lang,
					sc = part.sc or parts.sc or nil,
					id = part.id,
					genders = part.genders,
					tr = tr,
					ts = part.transcription,
					accel = partaccel or parts.accel,
				},
				face
			)
		end

		parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
	end

	local parts_output

	if #parts > 0 then
		parts_output = (parts.label and " " or "") .. concat(parts)
	elseif parts.request then
		parts_output = " <small>[please provide]</small>"
		insert(data.categories, "Requests for inflections in " .. data.lang:getFullName() .. " entries")
	else
		parts_output = ""
	end

	local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
	return parts_label .. parts_output, any_part_translit
end
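-- Each element of `data.inflections` is a `parts` table as consumed by format_inflection_parts() above: the
-- numbered entries are terms (plain strings or objects with `term`, `alt`, `q`, `genders`, etc.), plus optional
-- `label`, `accel`, `sc`, `request` and `enable_auto_translit` fields; e.g. (hypothetical)
-- { label = "plural", "cats" }.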



-- Format the inflections following the headword.
local function format_inflections(data)
	local any_part_translit = false
	if data.inflections and #data.inflections > 0 then
		-- Format each inflection individually.
		for key, infl in ipairs(data.inflections) do
			local this_any_part_translit
			data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
			if this_any_part_translit then
				any_part_translit = true
			end
		end

		local concat_result = concat(data.inflections, ", ")
		return " (" .. concat_result .. ")"
	else
		return ""
	end
end


--[==[
Return "คำหลัก" (lemma) if the given POS is a lemma, "รูปผัน" (non-lemma form) if a non-lemma form, or nil
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to
pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'.

If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
based on whether it begins with "รูป" ("form"); otherwise, return nil.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	-- Is it a lemma category?
	if isLemma[plpos] then
		return "คำหลัก"
	end
	local plpos_no_recon = plpos:gsub("^reconstructed ", "")
	if isLemma[plpos_no_recon] then
		return "คำหลัก"
	end
	-- Is it a nonlemma category?
	if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
		return "รูปผัน"
	end
	local plpos_no_mut = plpos:gsub("^mutated ", "")
	if isLemma[plpos_no_mut] or isNonLemma[plpos_no_mut] then
		return "รูปผัน"
	elseif best_guess then
		return plpos:find("^รูป") and "รูปผัน" or "คำหลัก"
	else
		return nil
	end
end

-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- zero-length array if unspecified. Check to make sure all keys are numeric (other than "maxindex", which is set by
-- [[Module:parameters]] for list parameters), all values are strings, and unless `allow_blank_string` is given,
-- no blank (zero-length) strings are present.
local function init_and_find_maximum_index(data, element, allow_blank_string)
	local maxind = 0
	if not data[element] then
		data[element] = {}
	end
	local typ = type(data[element])
	if typ ~= "table" then
		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end
	for k, v in pairs(data[element]) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
			end
			if k > maxind then
				maxind = k
			end
			if v then
				if type(v) ~= "string" then
					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
		end
	end
	return maxind
end

--[==[
Add the page to various maintenance categories for the language and the
whole page. These are placed in the headword somewhat arbitrarily, but
mainly because headword templates are mandatory for entries (meaning that
in theory it provides full coverage).

This is provided as an external entry point so that modules which transclude
information from other entries (such as {{tl|ja-see}}) can take advantage
of this feature as well, because they are used in place of a conventional
headword template.]==]
do
	-- Handle any manual sortkeys that have been specified in raw categories
	-- by tracking if they are the same or different from the automatically-
	-- generated sortkey, so that we can track them in maintenance
	-- categories.
	local function handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
		sortkey = sortkey or lang:makeSortKey(page.pagename)
		-- If there are raw categories with no sortkey, then they will be
		-- sorted based on the default MediaWiki sortkey, so we check against
		-- that.
		if tbl == true then
			if page.raw_defaultsort ~= sortkey then
				insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
			end
			return
		end
		local redundant, different
		for k in pairs(tbl) do
			if k == sortkey then
				redundant = true
			else
				different = true
			end
		end
		if redundant then
			insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
		end
		if different then
			insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
		end
		return sortkey
	end

	function export.maintenance_cats(page, lang, lang_cats, page_cats)
		for _, cat in ipairs(page.cats) do
			insert(page_cats, cat)
		end
		lang = lang:getFull() -- since we are just generating categories
		local canonical = lang:getCanonicalName()
		local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
		if tbl then
			sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with topic categories using raw markup")
		end
		tbl = page.wikitext_langname_cat[canonical]
		if tbl then
			handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with language name categories using raw markup")
		end
		if require(utilities_module).get_current_L2() ~= "ภาษา" .. canonical then
			insert(lang_cats, canonical .. " entries with incorrect language header")
		end
	end
end
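-- A minimal sketch of a Lua caller of the entry point below (hypothetical values; in practice it is invoked
-- through {{head}} and language-specific headword templates):
--
--   local m_headword = require("Module:headword")
--   return m_headword.full_headword{
--       lang = require("Module:languages").getByCode("th"),
--       pos_category = "nouns",
--       heads = {"ตัวอย่าง"},
--   }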



--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
function export.full_headword(data)
	local remove_links = require(links_module).remove_links
	local format_categories = require(utilities_module).format_categories

	-- Prevent data from being destructively modified.
	local data = require(table_module).shallowcopy(data)

	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------

	if data.getCanonicalName then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
	end

	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
	end

	if data.id and type(data.id) ~= "string" then
		error("The id in the data table should be a string.")
	end

	------------ 2. Initialize pagename etc. ------------

	local langcode = data.lang:getCode()
	local full_langcode = data.lang:getFullCode()
	local langname = data.lang:getCanonicalName()
	local full_langname = "ภาษา" .. data.lang:getFullName()

	local raw_pagename, page = data.pagename
	if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
		page = require(headword_page_module).process_page(raw_pagename)
	else
		page = m_data.page
	end

	-- Check the namespace against the language type.
	if page.namespace == "" then
		if data.lang:hasType("reconstructed") then
			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
		elseif data.lang:hasType("appendix-constructed") then
			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
		end
	end

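	-- New-style `data.heads` is an array of head objects, e.g. (hypothetical)
	-- {{term = "colour", q = {"UK"}}, {term = "color", q = {"US"}}}. Old-style callers instead pass parallel
	-- `heads`/`translits`/`transcriptions` arrays, which are converted into head objects below.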
	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------

	if type(data.heads) == "table" and type(data.heads[1]) == "table" then
		-- new-style
		if data.translits or data.transcriptions then
			error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
		end
	else
		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
		local maxind = math.max(
			init_and_find_maximum_index(data, "heads"),
			init_and_find_maximum_index(data, "translits", true),
			init_and_find_maximum_index(data, "transcriptions", true)
		)
		for i = 1, maxind do
			data.heads[i] = {
				term = data.heads[i],
				tr = data.translits[i],
				ts = data.transcriptions[i],
			}
		end
	end

	-- Make sure there's at least one head.
	if not data.heads[1] then
		data.heads[1] = {}
	end

	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

	init_and_find_maximum_index(data, "categories")
	init_and_find_maximum_index(data, "whole_page_categories")
	local pos_category_already_present = false
	if #data.categories > 0 then
		local escaped_langname = pattern_escape(full_langname)
		local matches_lang_pattern = "^" .. escaped_langname .. " "
		for _, cat in ipairs(data.categories) do
			-- Does the category begin with the language name? If not, tag it with a tracking category.
			if not cat:find(matches_lang_pattern) then
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]]
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]]
				track("no lang category", data.lang)
			end
		end

		-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
		-- throw an error below.
		if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
			data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
			-- Optimization to avoid inserting category already present.
			pos_category_already_present = true
		end
	end

	if not data.pos_category then
		error("`data.pos_category` not specified and could not be inferred from the categories given in "
			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
			.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
		)
	end

	-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
	if not pos_category_already_present and not data.noposcat then
		local pos_category = data.pos_category .. full_langname
		-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
		if pos_category ~= "Translingual Han characters" then
			insert(data.categories, 1, pos_category)
		end
	end

	-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
	-- add an appropriate category.
	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
	if not postype then
		-- We don't know what this category is, so tag it with a tracking category.
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANGCODE]]
		track("unrecognized pos", data.lang)
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS/LANGCODE]]
		track("unrecognized pos/pos/" .. data.pos_category, data.lang)
	elseif not data.noposcat then
		insert(data.categories, 1, postype .. full_langname)
	end

	------------ 5. Create a default headword, and add links to multiword page names. ------------

	-- Determine if term is reconstructed
	local is_reconstructed = page.namespace == "การสืบสร้าง" or data.lang:hasType("reconstructed")

	-- Create a default headword based on the pagename, which is determined in
	-- advance by the data module so that it only needs to be done once.
	local default_head = page.pagename

	-- Add links to multi-word page names when appropriate
	if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
		and not is_reconstructed and export.head_is_multiword(default_head) then
		default_head = export.add_multiword_links(default_head, true)
	end

	if is_reconstructed then
		default_head = "*" .. default_head
	end

	------------ 6. Fill in missing values in `data.heads`. ------------

	-- True if any script among the headword scripts has spaces in it.
	local any_script_has_spaces = false
	-- True if any term has a redundant head= param.
	local has_redundant_head_param = false

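	-- The loop below fills in each head object's `term`, `sc`, `tr`, `tr_manual`, `tr_fail` and `ts` fields,
	-- which format_headword() relies on when rendering the headword line.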
	for _, head in ipairs(data.heads) do

		------ 6a. If missing head, replace with default head.
		if not head.term then
			head.term = default_head
		elseif head.term == default_head then
			has_redundant_head_param = true
		end

		------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
		------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
		local auto_sc = data.lang:findBestScript(head.term)
		if (
			auto_sc:getCode() == "None" and
			require(scripts_module).findBestScriptWithoutLang(head.term):getCode() ~= "None"
		) then
			insert(data.categories, full_langname .. " terms in nonstandard scripts")
		end
		if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
			head.sc = auto_sc
		else
			if not head.sc then -- Overall script code given.
				head.sc = data.sc
			end
			--[[
			-- Track uses of sc parameter.
			if head.sc:getCode() == auto_sc:getCode() then
				insert(data.categories, full_langname .. " terms with redundant script codes")
			else
				insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
			end
			--]]
		end

		-- If using a discouraged character sequence, add to maintenance category.
		if head.sc:hasNormalizationFixes() == true then
			local composed_head = toNFC(head.term)
			if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
				insert(data.whole_page_categories, "Pages using discouraged character sequences")
			end
		end

		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

		------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
		------ (provided automatic translit is available, e.g. not in Persian or Hebrew).

		head.tr_manual = nil

		if head.tr == "-" then
			head.tr = nil
		elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
			head.tr_manual = not not head.tr

			local text = head.term
			if not data.lang:link_tr() then
				text = remove_links(text)
			end

			local automated_tr, tr_categories
			automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)

			if automated_tr or head.tr_fail then
				local manual_tr = head.tr

				if manual_tr then
					if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
						insert(data.categories, full_langname .. " terms with redundant transliterations")
					elseif not head.tr_fail then
						insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
					end
				end

				if not manual_tr then
					head.tr = automated_tr
					for _, category in ipairs(tr_categories) do
						insert(data.categories, category)
					end
				end
			end

			-- There is still no transliteration?
			-- Add the entry to a cleanup category.
			if not head.tr then
				head.tr = "<small>transliteration needed</small>"
				-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
				-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
				insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
			else
				-- Otherwise, trim it.
				head.tr = mw.text.trim(head.tr)
			end
		end

		-- Link to the transliteration entry for languages that require this.
		if head.tr and data.lang:link_tr() then
			head.tr = require(links_module).full_link {
				term = head.tr,
				lang = data.lang,
				sc = require(scripts_module).getByCode("Latn"),
				tr = "-"
			}
		end
	end

	------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
	-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
	-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
	-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
	local display_title
	-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
	-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
	-- about the best we can do (alternatively we could potentially do script detection on the pagename).
	local dt_script = data.heads[1].sc
	local dt_script_code = dt_script:getCode()
	local page_non_ascii = page.namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
	local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^ชื่อไม่รองรับ/", "")
	if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
		display_title = 'ชื่อไม่รองรับ/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
	elseif page_non_ascii and toBeTagged[dt_script_code]
		or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
		or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
		display_title = '<span class="' .. dt_script_code .. '">' .. page.full_raw_pagename .. '</span>'
	-- Keep Han entries region-neutral in the display title.
	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
		display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
	elseif page.namespace == "การสืบสร้าง" then
		local matched
		display_title, matched = rsubn(
			page.full_raw_pagename,
			"^(การสืบสร้าง:[^/]+/)(.+)$",
			function(before, term)
				return before ..
					require(script_utilities_module).tag_text(
						term,
						data.lang,
						dt_script
					)
			end
		)
		if matched == 0 then
			display_title = nil
		end
	end

	if display_title then
		local frame = mw.getCurrentFrame()
		frame:callParserFunction(
			"DISPLAYTITLE",
			display_title
		)
	end

	------------ 8. Insert additional categories. ------------

	if data.force_cat_output then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
		track("force cat output")
	end

	if has_redundant_head_param then
		if not data.no_redundant_head_cat then
			insert(data.categories, full_langname .. " terms with redundant head parameter")
		end
	end

	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
	if not data.nomultiwordcat and any_script_has_spaces and postype == "คำหลัก" and
		not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
		-- Check for spaces or hyphens, but exclude prefixes and suffixes.
		-- Use the pagename, not the head= value, because the latter may have extra
		-- junk in it, e.g. superscripted text that throws off the algorithm.
		local checkpattern = ".[%s%-፡]."
		if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
			-- Exclude hyphens if the data module states that they should for this language
			checkpattern = ".[%s፡]."
		end
		if rmatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
			insert(data.categories, full_langname .. " multiword terms")
		end
	end

	if data.sccat then
		for _, head in ipairs(data.heads) do
			insert(data.categories, data.pos_category .. full_langname .. "ใน" ..
				head.sc:getDisplayForm())
		end
	end

	-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
	if page.namespace ~= "Reconstruction" then
		-- Map from languages to a string containing the characters to ignore when considering whether a term has
		-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
		-- values.
		local characters_to_ignore = {
			["aaq"] = "α", -- Penobscot
			["acy"] = "δθ", -- Cypriot Arabic
			["anc"] = "γ", -- Ngas
			["aou"] = "χ", -- A'ou
			["awg"] = "β", -- Anguthimri
			["bhp"] = "β", -- Bima
			["byk"] = "θ", -- Biao
			["cdy"] = "θ", -- Chadong
			["clm"] = "χ", -- Klallam
			["col"] = "χ", -- Colombia-Wenatchi
			["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
			["ets"] = "θ", -- Yekhee
			["gmw-gts"] = "χ", -- Gottscheerish
			["hur"] = "θ", -- Halkomelem
			["izh"] = "ь", -- Ingrian
			["kic"] = "θ", -- Kickapoo
			["lil"] = "χ", -- Lillooet
			["mhz"] = "β", -- Mor (Austronesian)
			["neg"] = "ӡ", -- Negidal (normally in Cyrillic)
			["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
			["pox"] = "χ", -- Polabian
			["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
			["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
			["sjw"] = "θ", -- Shawnee
			["squ"] = "χ", -- Squamish
			["str"] = "χθ", -- Saanich; uses two Greek letters
			["twa"] = "χ", -- Twana
			["yha"] = "θ", -- Baha
			["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
			-- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
			["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
			-- the correct characters are being chosen in the entry names
			["zng"] = "θ", -- Mang
		}
		-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
		-- scripts whose `character_category` is false as well as Zmth (mathematical notation symbols), which has a
		-- category of "Mathematical notation symbols". When counting scripts, we need to elide language-specific
		-- variants because e.g. Beng and as-Beng have slightly different characters but we don't want to consider them
		-- two different scripts (e.g. [[এৰ]] has two characters which are detected respectively as Beng and as-Beng).
		local seen_scripts = {}
		local num_seen_scripts = 0
		local num_loops = 0
		local canon_pagename = page.pagename
		local ch_to_ignore = characters_to_ignore[full_langcode]
		if ch_to_ignore then
			canon_pagename = rsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
		end
		local script_data = mw.loadData(scripts_data_module)
		while true do
			if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
				break
			end
			-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
			num_loops = num_loops + 1
			local pagename_script = require(scripts_module).findBestScriptWithoutLang(canon_pagename, "None only as last resort")
			local script_chars = pagename_script.characters
			if not script_chars then
				-- we are stuck; this happens with None
				break
			end
			local script_code = pagename_script:getCode()
			local replaced
			canon_pagename, replaced = rsubn(canon_pagename, "[" .. script_chars .. "]", "")
			if replaced and script_code ~= "Zmth" and script_data[script_code] and
				script_data[script_code].character_category ~= false then
				script_code = script_code:gsub("^.-%-", "")
				if not seen_scripts[script_code] then
					seen_scripts[script_code] = true
					num_seen_scripts = num_seen_scripts + 1
				end
			end
		end

		if num_seen_scripts > 1 then
			insert(data.categories, full_langname .. " terms written in multiple scripts")
		end
	end
	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for
	-- characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats:
	-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
	-- diacritic(s) + character). Each can have any number of diacritics.
	local standard = data.lang:getStandardCharacters()
	if standard and not non_categorizable(page.full_raw_pagename) then
		local function char_category(char)
			local specials = {
				["#"] = "number sign",
				["("] = "parentheses",
				[")"] = "parentheses",
				["<"] = "angle brackets",
				[">"] = "angle brackets",
				["["] = "square brackets",
				["]"] = "square brackets",
				["_"] = "underscore",
				["{"] = "braces",
				["|"] = "vertical line",
				["}"] = "braces",
				["ß"] = "ẞ",
				["\205\133"] = "", -- this is UTF-8 for U+0345 ( ͅ)
				["\239\191\189"] = "replacement character",
			}
			char = toNFD(char)
				:gsub(".[\128-\191]*", function(m)
					local new_m = specials[m]
					new_m = new_m or m:uupper()
					return new_m
				end)
			return toNFC(char)
		end
		if full_langcode ~= "hi" and full_langcode ~= "lo" then
			local standard_chars_scripts = {}
			for _, head in ipairs(data.heads) do
				standard_chars_scripts[head.sc:getCode()] = true
			end
			-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
			for code in pairs(standard_chars_scripts) do
				local sc_standard = data.lang:getStandardCharacters(code)
				if sc_standard then
					if page.pagename_len > 1 then
						local explode_standard = {}
						local function explode(char)
							explode_standard[char] = true
							return ""
						end
						local sc_standard = rsub(sc_standard, page.comb_chars.combined_double, explode)
						sc_standard = rsub(sc_standard, page.comb_chars.combined_single, explode)
							:gsub(".[\128-\191]*", explode)
						local num_cat_inserted
						for char in pairs(page.explode_pagename) do
							if not explode_standard[char] then
								if char:find("[0-9]") then
									if not num_cat_inserted then
										insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วยตัวเลข")
										num_cat_inserted = true
									end
								elseif rfind(char, page.emoji_pattern) then
									insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วยอีโมจิ")
								else
									local upper = char_category(char)
									if not explode_standard[upper] then
										char = upper
									end
									insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย " .. char)
								end
							end
						end
					end
					-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
					sc_standard = toNFD(sc_standard)
					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
						if not rmatch(sc_standard, diacritic) then
							insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย ◌" .. diacritic)
						end
					end
					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
						if not rmatch(sc_standard, diacritic) then
							insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย ◌" .. diacritic .. "◌")
						end
					end
				end
			end
		-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be
		-- converted to the new format (because there are a lot of them).
		elseif ulen(page.pagename) ~= 1 then
			for character in rgmatch(page.pagename, "([^" .. standard .. "])") do
				local upper = char_category(character)
				if not rmatch(upper, "[" .. standard .. "]") then
					character = upper
				end
				insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย " .. character)
			end
		end
	end
if data.heads[1].sc:isSystem("alphabet") then
-- Categorise for palindromes
local pagename, i = page.pagename:ulower(), 2
if mw.title.getCurrentTitle().nsText ~= "ဗီုပြင်သိုင်တၟိ"
while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do
and require('Module:palindromes').is_palindrome(
i = i + 1
mw.title.getCurrentTitle().subpageText, data.lang, data.sc
insert(data.categories, "ศัพท์" .. full_langname .. "ที่มีตัวอักษรเดิมติดกัน " .. i .. " ตัว")
) then
end
table.insert(data.categories, "ဝေါဟာဖါ်လေန်ဒရုန်" .. data.lang:getCanonicalName() .. "ဂမၠိုင်")
end
end
-- Categorise for palindromes
return
if not data.nopalindromecat and page.namespace ~= "การสืบสร้าง" and ulen(page.pagename) > 2
show_headword_line(data) ..
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
require("Module:utilities").format_categories(
-- multiple scripts?
data.categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc
and require(palindromes_module).is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
) ..
insert(data.categories, "พาลินโดรม" .. full_langname)
require("Module:utilities").format_categories(
end
tracking_categories, data.lang, data.sort_key, nil, data.force_cat_output, data.sc

)
if page.namespace == "" and not data.lang:hasType("reconstructed") then
local m_links = require(links_module)
for _, head in ipairs(data.heads) do
if page.full_raw_pagename ~= m_links.get_link_page(remove_links(head.term), data.lang, head.sc) then
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]]
track("pagename spelling mismatch", data.lang)
break
end
end
end

-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)

------------ 9. Format and return headwords, genders, inflections and categories. ------------

-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- so make sure we do it before evaluating `data.categories`.
local text = '<span class="headword-line">' ..
format_headword(data) ..
format_genders(data) ..
format_inflections(data) .. '</span>'

-- Language-specific categories.
local cats = format_categories(
data.categories, data.lang, data.sort_key, page.encoded_pagename,
data.force_cat_output or test_force_categories, data.heads[1].sc
)
-- Language-agnostic categories.
local whole_page_cats = format_categories(
data.whole_page_categories, nil, "-"
)
return text .. cats .. whole_page_cats
end
end



Revision as of 00:41, 13 June 2024

Documentation for this module may be created at မဝ်ဂျူ:headword/doc

local export = {}

-- Named constants for all modules used, to make it easier to swap out sandbox versions.
local debug_track_module = "Module:debug/track"
local gender_and_number_module = "Module:gender and number"
local headword_data_module = "Module:headword/data"
local headword_page_module = "Module:headword/page"
local links_module = "Module:links"
local palindromes_module = "Module:palindromes"
local qualifier_module = "Module:qualifier"
local scripts_module = "Module:scripts"
local scripts_data_module = "Module:scripts/data"
local script_utilities_module = "Module:script utilities"
local script_utilities_data_module = "Module:script utilities/data"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local utilities_module = "Module:utilities"

local m_str_utils = require(string_utilities_module)

local concat = table.concat
local encode_entities = m_str_utils.encode_entities
local insert = table.insert
local ipairs = ipairs
local pairs = pairs
local pattern_escape = m_str_utils.pattern_escape
local rgmatch = mw.ustring.gmatch
local rsubn = mw.ustring.gsub
local rfind = mw.ustring.find
local ulen = m_str_utils.len
local rmatch = mw.ustring.match
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

local m_data = mw.loadData(headword_data_module)

local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged

-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false

-- Version of rsubn() that discards all but the first return value.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
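
-- e.g. rsub("a-b", "%-", "+") --> "a+b" (the match count that gsub also returns is dropped)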

-- Add a tracking category to track entries with certain (unusually undesirable) properties. `track_id` is an identifier
-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the
-- page text to [[Wiktionary:Tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property
-- by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]].
--
-- If `lang` (a language object) is given, an additional tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is
-- linked to where CODE is the language code of `lang`, and you can find all entries in the combination of `track_id`
-- and `lang` by visiting [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. This makes it possible to
-- isolate only the entries with a specific tracking property that are in a given language. Note that if `lang`
-- references an etymology-only language, both that language's code and its full parent's code are tracked.
local function track(track_id, lang)
	local tracking_page = "headword/" .. track_id
	local m_debug_track = require(debug_track_module)
	if lang and lang:hasType("etymology-only") then
		m_debug_track{tracking_page, tracking_page .. "/" .. lang:getCode(),
			tracking_page .. "/" .. lang:getFullCode()}
	elseif lang then
		m_debug_track{tracking_page, tracking_page .. "/" .. lang:getCode()}
	else
		m_debug_track(tracking_page)
	end
	return true
end
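
-- Illustrative sketch (comment only; the tracking id and language code are
-- invented for the example):
--   local lang = require("Module:languages").getByCode("th")
--   track("some-property", lang)
--   -- links [[Wiktionary:Tracking/headword/some-property]] and
--   -- [[Wiktionary:Tracking/headword/some-property/th]] into the page text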


local function text_in_script(text, script_code)
	local sc = require(scripts_module).getByCode(script_code)
	if not sc then
		error("Internal error: Bad script code " .. script_code)
	end
	local characters = sc.characters

	local out
	if characters then
		text = rsub(text, "%W", "")
		out = rfind(text, "[" .. characters .. "]")
	end

	return out ~= nil
end
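
-- Usage sketch (comment only), assuming the script data for "Hira" lists the
-- hiragana characters:
--   text_in_script("こんにちは", "Hira") --> true
--   text_in_script("hello", "Hira")     --> false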


local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
	 Used to exclude characters from the regex above. ]]
local wordPunc = "-־׳״'.·*’་•:᠊"
local notWordPunc = "([^" .. wordPunc .. "]+)" -- the capture is needed: add_multiword_links() refers back to it with %1


-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
-- customized separator: `part` is the object specifying the term, which should optionally contain:
-- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
-- * right qualifiers in `qq`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
--   (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>หรือ</i> " ("or") if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
local function format_term_with_qualifiers_and_refs(part, formatted, j)
	local left_qualifiers, right_qualifiers
	local reftext

	left_qualifiers = part.q and #part.q > 0 and part.q
	if left_qualifiers then
		left_qualifiers = require(qualifier_module).format_qualifier(left_qualifiers) .. " "
	end

	right_qualifiers = part.qq and #part.qq > 0 and part.qq
	if right_qualifiers then
		right_qualifiers = " " .. require(qualifier_module).format_qualifier(right_qualifiers)
	end
	if part.refs and #part.refs > 0 then
		local refs = {}
		for _, ref in ipairs(part.refs) do
			if type(ref) ~= "table" then
				ref = {text = ref}
			end
			local refargs
			if ref.name or ref.group then
				refargs = {name = ref.name, group = ref.group}
			end
			insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
		end
		reftext = concat(refs)
	end

	local separator = part.separator or j > 1 and " <i>หรือ</i> " -- use "" to request no separator

	if left_qualifiers then
		formatted = left_qualifiers .. formatted
	end
	if reftext then
		formatted = formatted .. reftext
	end
	if right_qualifiers then
		formatted = formatted .. right_qualifiers
	end
	if separator then
		formatted = separator .. formatted
	end

	return formatted
end
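
-- Illustrative sketch of the `part` object this function consumes (comment
-- only; all values are invented):
--   local part = {
--       q = {"obsolete"},                                -- left qualifiers
--       qq = {"rare"},                                   -- right qualifiers
--       refs = {{text = "Some reference.", name = "r1"}},
--   }
--   -- format_term_with_qualifiers_and_refs(part, formatted, 2) then returns the
--   -- separator " <i>หรือ</i> ", the formatted left qualifiers, `formatted`, the
--   -- <ref> tags and the right qualifiers, concatenated in that order.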


--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
	for possibleWordBreak in rgmatch(head, spacingPunctuation) do
		if rmatch(possibleWordBreak, notWordPunc) then
			return true
		end
	end

	return false
end
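
-- Usage sketch (comment only): spaces count as word breaks, while characters in
-- `wordPunc` such as the hyphen do not.
--   export.head_is_multiword("water buffalo") --> true
--   export.head_is_multiword("fifty-five")    --> false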


--[==[Add links to a multiword head.]==]
function export.add_multiword_links(head, default)
	local function workaround_to_exclude_chars(s)
		return rsub(s, notWordPunc, "\2%1\1")
	end

	head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
	if default then
		head = head
			:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
			:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
	end

	--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
	head = encode_entities(head, "[]", true, true)

	--[=[
	use this when workaround is no longer needed:

	head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

	Remove any empty links, which could have been created above
	at the beginning or end of the string.
	]=]
	return (head
		:gsub("\1\2", "")
		:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
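
-- Usage sketch (comment only), tracing the \1/\2 placeholders through the
-- substitutions above:
--   export.add_multiword_links("water buffalo")
--   --> "[[water]] [[buffalo]]"
-- With `default` set, any ":" or "#" inside a prospective link target is
-- backslash-escaped so it cannot be misread as a namespace or section separator.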


local function non_categorizable(full_raw_pagename)
	return full_raw_pagename:find("^ภาคผนวก:อากัปกิริยา/")
end


-- Format a headword with transliterations.
local function format_headword(data)
	local m_scriptutils = require(script_utilities_module)

	-- Are there non-empty transliterations?
	local has_translits = false
	local has_manual_translits = false

	------ Format the headwords. ------

	local head_parts = {}
	local unique_head_parts = {}

	local has_multiple_heads = #data.heads > 1

	for j, head in ipairs(data.heads) do
		if head.tr or head.ts then
			has_translits = true
		end
		if head.tr and head.tr_manual or head.ts then
			has_manual_translits = true
		end

		local formatted

		-- Apply processing to the headword, for formatting links and such.
		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Imag" then
			formatted = require(links_module).language_link{term = head.term, lang = data.lang}
		else
			formatted = data.lang:makeDisplayText(head.term, head.sc, true)
		end

		local function tag_text_and_add_quals_and_refs(head, formatted, j)
			-- Add language and script wrapper.
			formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)

			-- Add qualifiers, references and separator.
			return format_term_with_qualifiers_and_refs(head, formatted, j)
		end

		local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
		insert(head_parts, head_part)

		-- If multiple heads, try to determine whether all heads display the same. To do this we need to effectively
		-- rerun the text tagging and addition of qualifiers and references, using 1 for all indices.
		if has_multiple_heads then
			local unique_head_part
			if j == 1 then
				unique_head_part = head_part
			else
				unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
			end
			unique_head_parts[unique_head_part] = true
		end
	end

	local set_size = 0
	if has_multiple_heads then
		for _ in pairs(unique_head_parts) do
			set_size = set_size + 1
		end
	end
	if set_size == 1 then
		head_parts = head_parts[1]
	else
		head_parts = concat(head_parts)
	end

	if has_manual_translits then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit/LANGCODE]]
		track("has-manual-translit", data.lang)
	end

	------ Format the transliterations and transcriptions. ------

	local translits_formatted

	if has_translits then
		local translit_parts = {}
		for _, head in ipairs(data.heads) do
			if head.tr or head.ts then
				local this_parts = {}
				if head.tr then
					insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
					if head.ts then
						insert(this_parts, " ")
					end
				end
				if head.ts then
					insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
				end
				insert(translit_parts, concat(this_parts))
			end
		end

		translits_formatted = " (" .. concat(translit_parts, " <i>หรือ</i> ") .. ")"

		local langname = data.lang:getCanonicalName()
		local transliteration_page = mw.title.new("การถอดอักษรภาษา" .. langname, "วิกิพจนานุกรม")
		local saw_translit_page = false

		if transliteration_page and transliteration_page.exists then
			translits_formatted = " [[วิกิพจนานุกรม:การถอดอักษรภาษา" .. langname .. "|•]]" .. translits_formatted
			saw_translit_page = true
		end
		-- If data.lang is an etymology-only language and we didn't find a transliteration page for it, fall back to the
		-- full parent.
		if not saw_translit_page and data.lang:hasType("etymology-only") then
			langname = data.lang:getFullName()
			transliteration_page = mw.title.new("การถอดอักษรภาษา" .. langname, "วิกิพจนานุกรม")

			if transliteration_page and transliteration_page.exists then
				translits_formatted = " [[วิกิพจนานุกรม:การถอดอักษรภาษา" .. langname .. "|•]]" .. translits_formatted
			end
		end
	else
		translits_formatted = ""
	end

	------ Paste heads and transliterations/transcriptions. ------

	local lemma_gloss
	if data.gloss then
		lemma_gloss = ' <span class="ib-content qualifier-content">' .. data.gloss .. '</span>'
	else
		lemma_gloss = ""
	end

	return head_parts .. translits_formatted .. lemma_gloss
end


local function format_genders(data)
	local retval = ""
	if data.genders and #data.genders > 0 then
		if data.gloss then
			retval = ","
		end
		local pos_for_cat
		if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] and
			not m_data.no_gender_cat[data.lang:getFullCode()] then
			local pos_category = data.pos_category:gsub("^reconstructed ", "")
			pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
		end
		local text, cats = require(gender_and_number_module).format_genders(data.genders, data.lang, pos_for_cat)
		for _, cat in ipairs(cats) do
			insert(data.categories, cat)
		end
		retval = retval .. "&nbsp;" .. text
	end
	return retval
end


local function format_inflection_parts(data, parts)
	local any_part_translit = false

	for j, part in ipairs(parts) do
		if type(part) ~= "table" then
			part = {term = part}
		end

		local partaccel = part.accel
		local face = part.face or "bold"
		if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
			error("The face `" .. face .. "` " .. (
				mw.loadData(script_utilities_data_module).faces[face] and
				"should not be used for non-headword terms on the headword line." or
				"is invalid."
			))
		end

		-- Here the final part 'or data.nolinkinfl' allows to have 'nolinkinfl=true'
		-- right into the 'data' table to disable inflection links of the entire headword
		-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
		local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or (
			data.nolink and track("nolink") or data.nolinkinfl)

		local formatted
		if part.label then
			-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
			formatted = "<i>" .. part.label .. "</i>"
		else
			-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
			-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
			-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
			-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
			-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
			-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
			-- to be enabled in languages with more complex scripts (e.g. Arabic).
			local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
			if tr ~= "-" then
				any_part_translit = true
			end
			formatted = require(links_module).full_link(
				{
					term = not nolinkinfl and part.term or nil,
					alt = part.alt or (nolinkinfl and part.term or nil),
					lang = part.lang or data.lang,
					sc = part.sc or parts.sc or nil,
					id = part.id,
					genders = part.genders,
					tr = tr,
					ts = part.transcription,
					accel = partaccel or parts.accel,
				},
				face
			)
		end

		parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
	end

	local parts_output

	if #parts > 0 then
		parts_output = (parts.label and " " or "") .. concat(parts)
	elseif parts.request then
		parts_output = " <small>[please provide]</small>"
		insert(data.categories, "Requests for inflections in " .. data.lang:getFullName() .. " entries")
	else
		parts_output = ""
	end

	local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
	return parts_label .. parts_output, any_part_translit
end
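
-- Illustrative sketch of one `parts` table (comment only; the words are
-- invented). Each array entry is one term of a single inflection, and `label`
-- names that inflection:
--   local parts = {label = "plural", "water buffaloes", {term = "water buffalos", q = {"rare"}}}
--   -- format_inflection_parts(data, parts) renders "<i>plural</i>" followed by
--   -- both terms as bold links, separated by " <i>หรือ</i> ".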


-- Format the inflections following the headword.
local function format_inflections(data)
	local any_part_translit = false
	if data.inflections and #data.inflections > 0 then
		-- Format each inflection individually.
		for key, infl in ipairs(data.inflections) do
			local this_any_part_translit
			data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
			if this_any_part_translit then
				any_part_translit = true
			end
		end

		local concat_result = concat(data.inflections, ", ")
		return " (" .. concat_result .. ")"
	else
		return ""
	end
end

--[==[
-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
-- If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to
-- pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'.
--
-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
-- based on whether it ends in " forms"; otherwise, return nil.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	-- Is it a lemma category?
	if isLemma[plpos] then
		return "คำหลัก"
	end
	local plpos_no_recon = plpos:gsub("^reconstructed ", "")
	if isLemma[plpos_no_recon] then
		return "คำหลัก"
	end
	-- Is it a nonlemma category?
	if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
		return "รูปผัน"
	end
	local plpos_no_mut = plpos:gsub("^mutated ", "")
	if isLemma[plpos_no_mut] or isNonLemma[plpos_no_mut] then
		return "รูปผัน"
	elseif best_guess then
		return plpos:find("^รูป") and "รูปผัน" or "คำหลัก"
	else
		return nil
	end
end
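
-- Usage sketch (comment only), assuming the data module lists "nouns" among the
-- lemmas and "noun forms" among the non-lemmas:
--   export.pos_lemma_or_nonlemma("nouns")      --> "คำหลัก" (lemma)
--   export.pos_lemma_or_nonlemma("noun forms") --> "รูปผัน" (non-lemma form)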


-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- zero-length array if unspecified. Check to make sure all keys are numeric (other than "maxindex", which is set by
-- [[Module:parameters]] for list parameters), all values are strings, and unless `allow_blank_string` is given,
-- no blank (zero-length) strings are present.
local function init_and_find_maximum_index(data, element, allow_blank_string)
	local maxind = 0
	if not data[element] then
		data[element] = {}
	end
	local typ = type(data[element])
	if typ ~= "table" then
		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end
	for k, v in pairs(data[element]) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
			end
			if k > maxind then
				maxind = k
			end
			if v then
				if type(v) ~= "string" then
					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
		end
	end
	return maxind
end
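
-- Usage sketch (comment only; the input is invented). Gaps in the array are
-- tolerated:
--   local data = {translits = {[1] = "a", [3] = "b"}}
--   init_and_find_maximum_index(data, "translits", true) --> 3
-- A missing `data[element]` is initialized to an empty table first.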

--[==[
-- Add the page to various maintenance categories for the language and the
-- whole page. These are placed in the headword somewhat arbitrarily, but
-- mainly because headword templates are mandatory for entries (meaning that
-- in theory it provides full coverage).
-- 
-- This is provided as an external entry point so that modules which transclude
-- information from other entries (such as {{tl|ja-see}}) can take advantage
-- of this feature as well, because they are used in place of a conventional
-- headword template.]==]
do
	-- Handle any manual sortkeys that have been specified in raw categories
	-- by tracking if they are the same or different from the automatically-
	-- generated sortkey, so that we can track them in maintenance
	-- categories.
	local function handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
		sortkey = sortkey or lang:makeSortKey(page.pagename)
		-- If there are raw categories with no sortkey, then they will be
		-- sorted based on the default MediaWiki sortkey, so we check against
		-- that.
		if tbl == true then
			if page.raw_defaultsort ~= sortkey then
				insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
			end
			return
		end
		local redundant, different
		for k in pairs(tbl) do
			if k == sortkey then
				redundant = true
			else
				different = true
			end
		end
		if redundant then
			insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
		end
		if different then
			insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
		end
		return sortkey
	end
	
	function export.maintenance_cats(page, lang, lang_cats, page_cats)
		for _, cat in ipairs(page.cats) do
			insert(page_cats, cat)
		end
		lang = lang:getFull() -- since we are just generating categories
		local canonical = lang:getCanonicalName()
		local tbl, sortkey = page.wikitext_topic_cat[lang:getCode()]
		if tbl then
			sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with topic categories using raw markup")
		end
		tbl = page.wikitext_langname_cat[canonical]
		if tbl then
			handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with language name categories using raw markup")
		end
		if require(utilities_module).get_current_L2() ~= "ภาษา" .. canonical then
			insert(lang_cats, canonical .. " entries with incorrect language header")
		end
	end
end
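
-- Usage sketch (comment only) for a transcluding module used in place of a
-- headword template; `page` and `lang` are assumed to come from
-- [[Module:headword/page]] and [[Module:languages]] respectively:
--   local lang_cats, page_cats = {}, {}
--   export.maintenance_cats(page, lang, lang_cats, page_cats)
--   -- lang_cats now holds the language-specific maintenance categories and
--   -- page_cats the language-agnostic whole-page ones.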


--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
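-- A minimal, illustrative invocation (comment only; the values are invented and
-- real callers are headword templates such as {{head}}):
--   local lang = require("Module:languages").getByCode("en")
--   return export.full_headword{
--       lang = lang,
--       pos_category = "nouns",
--       categories = {},
--       heads = {"water buffalo"},
--       inflections = {{label = "plural", "water buffaloes"}},
--   }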
function export.full_headword(data)
	local remove_links = require(links_module).remove_links
	local format_categories = require(utilities_module).format_categories

	-- Prevent data from being destructively modified.
	local data = require(table_module).shallowcopy(data)

	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------

	if data.getCanonicalName then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
	end

	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
		error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
	end

	if data.id and type(data.id) ~= "string" then
		error("The id in the data table should be a string.")
	end

	------------ 2. Initialize pagename etc. ------------

	local langcode = data.lang:getCode()
	local full_langcode = data.lang:getFullCode()
	local langname = data.lang:getCanonicalName()
	local full_langname = "ภาษา" .. data.lang:getFullName()

	local raw_pagename, page = data.pagename
	if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
		page = require(headword_page_module).process_page(raw_pagename)
	else
		page = m_data.page
	end

	-- Check the namespace against the language type.
	if page.namespace == "" then
		if data.lang:hasType("reconstructed") then
			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
		elseif data.lang:hasType("appendix-constructed") then
			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
		end
	end

	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------

	if type(data.heads) == "table" and type(data.heads[1]) == "table" then
		-- new-style
		if data.translits or data.transcriptions then
			error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
		end
	else
		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
		local maxind = math.max(
			init_and_find_maximum_index(data, "heads"),
			init_and_find_maximum_index(data, "translits", true),
			init_and_find_maximum_index(data, "transcriptions", true)
		)
		for i = 1, maxind do
			data.heads[i] = {
				term = data.heads[i],
				tr = data.translits[i],
				ts = data.transcriptions[i],
			}
		end
	end

	-- Make sure there's at least one head.
	if not data.heads[1] then
		data.heads[1] = {}
	end

	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

	init_and_find_maximum_index(data, "categories")
	init_and_find_maximum_index(data, "whole_page_categories")
	local pos_category_already_present = false
	if #data.categories > 0 then
		local escaped_langname = pattern_escape(full_langname)
		local matches_lang_pattern = "^" .. escaped_langname .. " "
		for _, cat in ipairs(data.categories) do
			-- Does the category begin with the language name? If not, tag it with a tracking category.
			if not cat:find(matches_lang_pattern) then
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]]
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]]
				track("no lang category", data.lang)
			end
		end

		-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
		-- throw an error below.
		if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
			data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
			-- Optimization to avoid inserting category already present.
			pos_category_already_present = true
		end
	end

	if not data.pos_category then
		error("`data.pos_category` not specified and could not be inferred from the categories given in "
			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
			.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
			)
	end

	-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
	if not pos_category_already_present and not data.noposcat then
		local pos_category = data.pos_category .. full_langname
		-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
		if pos_category ~= "Translingual Han characters" then
			insert(data.categories, 1, pos_category)
		end
	end

	-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
	-- add an appropriate category.
	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
	if not postype then
		-- We don't know what this category is, so tag it with a tracking category.
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANGCODE]]
		track("unrecognized pos", data.lang)
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS/LANGCODE]]
		track("unrecognized pos/pos/" .. data.pos_category, data.lang)
	elseif not data.noposcat then
		insert(data.categories, 1, postype .. full_langname)
	end

	------------ 5. Create a default headword, and add links to multiword page names. ------------

	-- Determine if term is reconstructed
	local is_reconstructed = page.namespace == "การสืบสร้าง" or data.lang:hasType("reconstructed")

	-- Create a default headword based on the pagename, which is determined in
	-- advance by the data module so that it only needs to be done once.
	local default_head = page.pagename

	-- Add links to multi-word page names when appropriate
	if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
		and	not is_reconstructed and export.head_is_multiword(default_head) then
		default_head = export.add_multiword_links(default_head, true)
	end

	if is_reconstructed then
		default_head = "*" .. default_head
	end

	------------ 6. Fill in missing values in `data.heads`. ------------

	-- True if any script among the headword scripts has spaces in it.
	local any_script_has_spaces = false
	-- True if any term has a redundant head= param.
	local has_redundant_head_param = false

	for _, head in ipairs(data.heads) do

		------ 6a. If missing head, replace with default head.
		if not head.term then
			head.term = default_head
		elseif head.term == default_head then
			has_redundant_head_param = true
		end

		------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
		------     otherwise fall back to the overall script if given. If neither given, autodetect the script.
		
		local auto_sc = data.lang:findBestScript(head.term)
		if (
			auto_sc:getCode() == "None" and
			require(scripts_module).findBestScriptWithoutLang(head.term):getCode() ~= "None"
		) then
			insert(data.categories, full_langname .. " terms in nonstandard scripts")
		end
		if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
			head.sc = auto_sc
		else
			if not head.sc then -- Overall script code given.
				head.sc = data.sc
			end
--[[
			-- Track uses of sc parameter.
			if head.sc:getCode() == auto_sc:getCode() then
				insert(data.categories, full_langname .. " terms with redundant script codes")
			else
				insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
			end
--]]
		end

		-- If using a discouraged character sequence, add to maintenance category.
		if head.sc:hasNormalizationFixes() == true then
			local composed_head = toNFC(head.term)
			if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
				insert(data.whole_page_categories, "Pages using discouraged character sequences")
			end
		end

		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

		------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
		------     (provided automatic translit is available, e.g. not in Persian or Hebrew).

		-- Make transliterations
		head.tr_manual = nil

		-- Try to generate a transliteration if necessary
		if head.tr == "-" then
			head.tr = nil
		elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
			head.tr_manual = not not head.tr

			local text = head.term
			if not data.lang:link_tr() then
				text = remove_links(text)
			end

			local automated_tr, tr_categories
			automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)

			if automated_tr or head.tr_fail then
				local manual_tr = head.tr

				if manual_tr then
					if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
						insert(data.categories, full_langname .. " terms with redundant transliterations")
					elseif not head.tr_fail then
						insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
					end
				end

				if not manual_tr then
					head.tr = automated_tr
					for _, category in ipairs(tr_categories) do
						insert(data.categories, category)
					end
				end
			end

			-- There is still no transliteration?
			-- Add the entry to a cleanup category.
			if not head.tr then
				head.tr = "<small>transliteration needed</small>"
				-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
				-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
				insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
			else
				-- Otherwise, trim it.
				head.tr = mw.text.trim(head.tr)
			end
		end

		-- Link to the transliteration entry for languages that require this.
		if head.tr and data.lang:link_tr() then
			head.tr = require(links_module).full_link {
				term = head.tr,
				lang = data.lang,
				sc = require(scripts_module).getByCode("Latn"),
				tr = "-"
			}
		end
	end

	------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
	-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
	-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
	-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
	local display_title
	-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
	-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
	-- about the best we can do (alternatively we could potentially do script detection on the pagename).
	local dt_script = data.heads[1].sc
	local dt_script_code = dt_script:getCode()
	local page_non_ascii = page.namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
	local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^ชื่อไม่รองรับ/", "")
	if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
		display_title = 'ชื่อไม่รองรับ/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
	elseif page_non_ascii and toBeTagged[dt_script_code]
		or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
		or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
		display_title = '<span class="' .. dt_script_code .. '">' .. page.full_raw_pagename .. '</span>'
	-- Keep Han entries region-neutral in the display title.
	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
		display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
	elseif page.namespace == "การสืบสร้าง" then
		local matched
		display_title, matched = rsubn(
			page.full_raw_pagename,
			"^(การสืบสร้าง:[^/]+/)(.+)$",
			function(before, term)
				return before ..
					require(script_utilities_module).tag_text(
						term,
						data.lang,
						dt_script
					)
			end
		)
		if matched == 0 then
			display_title = nil
		end
	end

	if display_title then
		mw.getCurrentFrame():callParserFunction(
			"DISPLAYTITLE",
			display_title
		)
	end

	------------ 8. Insert additional categories. ------------

	if data.force_cat_output then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
		track("force cat output")
	end

	if has_redundant_head_param then
		if not data.no_redundant_head_cat then
			insert(data.categories, full_langname .. " terms with redundant head parameter")
		end
	end

	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
	if not data.nomultiwordcat and any_script_has_spaces and postype == "คำหลัก" and
		not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
		-- Check for spaces or hyphens, but exclude prefixes and suffixes.
		-- Use the pagename, not the head= value, because the latter may have extra
		-- junk in it, e.g. superscripted text that throws off the algorithm.
		local checkpattern = ".[%s%-፡]."
		if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
			-- Exclude hyphens if the data module states that they should for this language
			checkpattern = ".[%s፡]."
		end
		if rmatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
			insert(data.categories, full_langname .. " multiword terms")
		end
	end

	if data.sccat then
		for _, head in ipairs(data.heads) do
			insert(data.categories, data.pos_category .. full_langname .. "ใน" ..
				head.sc:getDisplayForm())
		end
	end

	-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
	if page.namespace ~= "Reconstruction" then
		-- Map from languages to a string containing the characters to ignore when considering whether a term has
		-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
		-- values.
		local characters_to_ignore = {
			["aaq"] = "α", -- Penobscot
			["acy"] = "δθ", -- Cypriot Arabic
			["anc"] = "γ", -- Ngas
			["aou"] = "χ", -- A'ou
			["awg"] = "β", -- Anguthimri
			["bhp"] = "β", -- Bima
			["byk"] = "θ", -- Biao
			["cdy"] = "θ", -- Chadong
			["clm"] = "χ", -- Klallam
			["col"] = "χ", -- Colombia-Wenatchi
			["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
			["ets"] = "θ", -- Yekhee
			["gmw-gts"] = "χ", -- Gottscheerish
			["hur"] = "θ", -- Halkomelem
			["izh"] = "ь", -- Ingrian
			["kic"] = "θ", -- Kickapoo
			["lil"] = "χ", -- Lillooet
			["mhz"] = "β", -- Mor (Austronesian)
			["neg"]=  "ӡ", -- Negidal (normally in Cyrillic)
			["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
			["pox"] = "χ", -- Polabian
			["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
			["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
			["sjw"] = "θ", -- Shawnee
			["squ"] = "χ", -- Squamish
			["str"] = "χθ", -- Saanich; uses two Greek letters
			["twa"] = "χ", -- Twana
			["yha"] = "θ", -- Baha
			["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
						   -- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
			["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
								 -- the currect characters are being chosen in the entry names
			["zng"] = "θ", -- Mang
		}
		-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
		-- scripts whose `character_category` is false as well as Zmth (mathematical notation symbols), which has a
		-- category of "Mathematical notation symbols". When counting scripts, we need to elide language-specific
		-- variants because e.g. Beng and as-Beng have slightly different characters but we don't want to consider them
		-- two different scripts (e.g. [[এৰ]] has two characters which are detected respectively as Beng and as-Beng).
		local seen_scripts = {}
		local num_seen_scripts = 0
		local num_loops = 0
		local canon_pagename = page.pagename
		local ch_to_ignore = characters_to_ignore[full_langcode]
		if ch_to_ignore then
			canon_pagename = rsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
		end
		local script_data = mw.loadData(scripts_data_module)
		while true do
			if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
				break
			end
			-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
			num_loops = num_loops + 1
			local pagename_script = require(scripts_module).findBestScriptWithoutLang(canon_pagename, "None only as last resort")
			local script_chars = pagename_script.characters
			if not script_chars then
				-- we are stuck; this happens with None
				break
			end
			local script_code = pagename_script:getCode()
			local replaced
			canon_pagename, replaced = rsubn(canon_pagename, "[" .. script_chars .. "]", "")
			if replaced and script_code ~= "Zmth" and script_data[script_code] and
				script_data[script_code].character_category ~= false then
				script_code = script_code:gsub("^.-%-", "")
				if not seen_scripts[script_code] then
					seen_scripts[script_code] = true
					num_seen_scripts = num_seen_scripts + 1
				end
			end
		end

		if num_seen_scripts > 1 then
			insert(data.categories, full_langname .. " terms written in multiple scripts")
		end
	end
	
	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for
	-- characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats:
	-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
	-- diacritic(s) + character). Each can have any number of diacritics.
	local standard = data.lang:getStandardCharacters()
	if standard and not non_categorizable(page.full_raw_pagename) then
		local function char_category(char)
			local specials = {
				["#"] = "number sign",
				["("] = "parentheses",
				[")"] = "parentheses",
				["<"] = "angle brackets",
				[">"] = "angle brackets",
				["["] = "square brackets",
				["]"] = "square brackets",
				["_"] = "underscore",
				["{"] = "braces",
				["|"] = "vertical line",
				["}"] = "braces",
				["ß"] = "ẞ",
				["\205\133"] = "", -- this is UTF-8 for U+0345 ( ͅ)
				["\239\191\189"] = "replacement character",
			}
			char = toNFD(char)
				:gsub(".[\128-\191]*", function(m)
					local new_m = specials[m]
					new_m = new_m or m:uupper()
					return new_m
				end)
			return toNFC(char)
		end
		if full_langcode ~= "hi" and full_langcode ~= "lo" then
			local standard_chars_scripts = {}
			for _, head in ipairs(data.heads) do
				standard_chars_scripts[head.sc:getCode()] = true
			end
			-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
			for code in pairs(standard_chars_scripts) do
				local sc_standard = data.lang:getStandardCharacters(code)
				if sc_standard then
					if page.pagename_len > 1 then
						local explode_standard = {}
						local function explode(char)
							explode_standard[char] = true
							return ""
						end
						-- Deliberately shadows the outer `sc_standard`, which must stay intact for
						-- the diacritic checks after this block.
						local sc_standard = rsub(sc_standard, page.comb_chars.combined_double, explode)
						sc_standard = rsub(sc_standard, page.comb_chars.combined_single, explode)
							:gsub(".[\128-\191]*", explode)
						local num_cat_inserted
						for char in pairs(page.explode_pagename) do
							if not explode_standard[char] then
								if char:find("[0-9]") then
									if not num_cat_inserted then
										insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วยตัวเลข")
										num_cat_inserted = true
									end
								elseif rfind(char, page.emoji_pattern) then
									insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วยอีโมจิ")
								else
									local upper = char_category(char)
									if not explode_standard[upper] then
										char = upper
									end
									insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย " .. char)
								end
							end
						end
					end
					-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
					sc_standard = toNFD(sc_standard)
					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
						if not rmatch(sc_standard, diacritic) then
							insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย ◌" .. diacritic)
						end
					end
					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
						if not rmatch(sc_standard, diacritic) then
							insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย ◌" .. diacritic .. "◌")
						end
					end
				end
			end
		-- Hindi and Lao are handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
		elseif ulen(page.pagename) ~= 1 then
			for character in rgmatch(page.pagename, "([^" .. standard .. "])") do
				local upper = char_category(character)
				if not rmatch(upper, "[" .. standard .. "]") then
					character = upper
				end
				insert(data.categories, "ศัพท์" .. full_langname .. "ที่สะกดด้วย " .. character)
			end
		end
	end
	
	if data.heads[1].sc:isSystem("alphabet") then
		local pagename, i = page.pagename:ulower(), 2
		while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do
			i = i + 1
			insert(data.categories, "ศัพท์" .. full_langname .. "ที่มีตัวอักษรเดิมติดกัน " .. i .. " ตัว")
		end
	end
	
	-- Categorise for palindromes
	if not data.nopalindromecat and page.namespace ~= "การสืบสร้าง" and ulen(page.pagename) > 2
		-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
		-- multiple scripts?
		and require(palindromes_module).is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
		insert(data.categories, "พาลินโดรม" .. full_langname)
	end

	if page.namespace == "" and not data.lang:hasType("reconstructed") then
		local m_links = require(links_module)
		for _, head in ipairs(data.heads) do
			if page.full_raw_pagename ~= m_links.get_link_page(remove_links(head.term), data.lang, head.sc) then
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
				-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]]
				track("pagename spelling mismatch", data.lang)
				break
			end
		end
	end

	-- Add to various maintenance categories.
	export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)

	------------ 9. Format and return headwords, genders, inflections and categories. ------------

	-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
	-- so make sure we do it before evaluating `data.categories`.
	local text = '<span class="headword-line">' ..
		format_headword(data) ..
		format_genders(data) ..
		format_inflections(data) .. '</span>'

	-- Language-specific categories.
	local cats = format_categories(
		data.categories, data.lang, data.sort_key, page.encoded_pagename,
		data.force_cat_output or test_force_categories, data.heads[1].sc
	)
	-- Language-agnostic categories.
	local whole_page_cats = format_categories(
		data.whole_page_categories, nil, "-"
	)
	return text .. cats .. whole_page_cats
end

return export