Module:ar-utilities
Jump to navigation
Jump to search
- The following documentation is located at Module:ar-utilities/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module does several things, including generating text and categories for {{ar-root}}.
local m_links = require("Module:links")
local m_utilities = require("Module:utilities")
local ar_translit = require("Module:ar-translit")
local m_headword = require("Module:headword")
local export = {}
local lang = require("Module:languages").getByCode("ar")
local sc = require("Module:scripts").getByCode("Arab")
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local u = require("Module:string/char")
-- Character class matching any single Arabic consonant letter (bare hamza ء,
-- wāw and yāʾ included). hamzaError() uses it to vet the letters of a root.
local consonants = "[بتثجحخدذرزسشصضطظعغقفلكمنهويء]"
-- "If Not Empty": turn the empty string into nil; every other value is
-- handed back unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end
-- Version of rsubn() that discards all but the first return value, i.e. only
-- the substituted string is returned and the match count is dropped.
--   term: string to operate on
--   foo:  pattern
--   bar:  replacement (string, table or function, as accepted by rsubn)
-- Declared `local` so the helper no longer leaks into the global environment;
-- every use in this module comes after this definition.
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end
-- Synthesize a minimal frame object so that exported functions meant to be
-- called from templates can also be called from the debug console.
--   parargs: table returned as `args` of the parent frame
--   args:    table exposed as `args` of the frame itself
-- Returns a table with an `args` field and a `getParent()` function.
-- Declared `local` so the helper no longer leaks into the global environment;
-- its only caller in this module comes after this definition.
local function debug_frame(parargs, args)
	return {
		args = args,
		getParent = function()
			return {args = parargs}
		end,
	}
end
-- Exported entry point: hands back whatever Module:utilities' catfix()
-- returns for the Arabic language and script objects.
export.catfix = function()
	return m_utilities.catfix(lang, sc)
end
--------------------------- hamza processing ------------------------------
-- hamza variants
local HAMZA = u(0x0621) -- hamza on the line (stand-alone hamza) = ء
local HAMZA_ON_ALIF = u(0x0623) -- hamza above ʾalif = أ
local HAMZA_ON_WAW = u(0x0624) -- hamza above wāw = ؤ
local HAMZA_UNDER_ALIF = u(0x0625) -- hamza below ʾalif = إ
local HAMZA_ON_YA = u(0x0626) -- hamza above yāʾ = ئ
-- Character class matching any one of the five hamza forms above
local HAMZA_ANY = "[" .. HAMZA .. HAMZA_ON_ALIF .. HAMZA_UNDER_ALIF .. HAMZA_ON_WAW .. HAMZA_ON_YA .. "]"
-- Stand-in character for a hamza whose written form has not been chosen yet;
-- export.process_hamza() replaces it using the hamza_subs rules.
local HAMZA_PH = u(0xFFF0) -- hamza placeholder
-- diacritics
local A = u(0x064E) -- fatḥa
local AN = u(0x064B) -- fatḥatān (fatḥa tanwīn)
local U = u(0x064F) -- ḍamma
local UN = u(0x064C) -- ḍammatān (ḍamma tanwīn)
local I = u(0x0650) -- kasra
local IN = u(0x064D) -- kasratān (kasra tanwīn)
local SK = u(0x0652) -- sukūn = no vowel
local SH = u(0x0651) -- šadda = gemination of consonants
local DAGGER_ALIF = u(0x0670) -- superscript (dagger) ʾalif
-- Character class matching any single diacritic listed above except šadda
local DIACRITIC_ANY_BUT_SH = "[" .. A .. I .. U .. AN .. IN .. UN .. SK .. DAGGER_ALIF .. "]"
-- Pattern matching short vowels
local AIU = "[" .. A .. I .. U .. "]"
-- Pattern fragment matching the diacritics that may sit on a consonant: an
-- optional šadda followed by exactly one non-šadda diacritic. Users append
-- "?" to make that trailing diacritic optional as well.
local DIACRITIC = SH .. "?" .. DIACRITIC_ANY_BUT_SH
-- various letters and signs
local ALIF = u(0x0627) -- ʾalif = ا
local AMAQ = u(0x0649) -- ʾalif maqṣūra = ى
local AMAD = u(0x0622) -- ʾalif madda = آ
local WAW = u(0x0648) -- wāw = و
local YA = u(0x064A) -- yā = ي
-- Move every shadda in front of the diacritic that precedes it and return the
-- resulting string.
--
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- detection process inconvenient, so undo it. (For example, the code in
-- remove_in would fail to detect the -in in مُتَرَبٍّ because the shadda
-- would come after the -in.)
--
-- Declared `local` so the helper no longer leaks into the global environment;
-- its callers in this module all come after this definition.
local function reorder_shadda(text)
	text = rsub(text, "(" .. DIACRITIC_ANY_BUT_SH .. ")" .. SH, SH .. "%1")
	return text
end
-- Ordered list of {pattern, replacement} pairs that turn the hamza placeholder
-- (HAMZA_PH) into a concrete hamza form. export.process_hamza() applies the
-- entries one after another with rsub(), so each rule only sees placeholders
-- that the rules before it left untouched. A replacement is either a string
-- (with %1 back-references) or a function receiving the pattern's captures.
-- In the function replacements the captured placeholder `ham` is simply
-- overwritten with the hamza form that was chosen.
local hamza_subs = {
--------------------------- handle initial hamza --------------------------
-- put initial hamza on a seat according to following vowel.
-- "initial" means at the start of the string or right after a space.
{"^" .. HAMZA_PH .. "([" .. I .. YA .. "])", HAMZA_UNDER_ALIF .. "%1"},
{" " .. HAMZA_PH .. "([" .. I .. YA .. "])", " " .. HAMZA_UNDER_ALIF .. "%1"},
{"^" .. HAMZA_PH, HAMZA_ON_ALIF}, -- if no vowel, assume a
{" " .. HAMZA_PH, " " .. HAMZA_ON_ALIF}, -- if no vowel, assume a
----------------------------- handle final hamza --------------------------
-- "final" hamza may be followed by a short vowel or tanwīn sequence
-- use a previous short vowel to get the seat
-- (kasra -> hamza-on-yāʾ, ḍamma -> hamza-on-wāw, otherwise hamza-on-alif)
{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "?)$",
function(v, ham, diacrit)
ham = v == I and HAMZA_ON_YA or v == U and HAMZA_ON_WAW or HAMZA_ON_ALIF
return v .. ham .. diacrit
end
},
-- same rule for a hamza that ends a word followed by a space; the space is
-- part of the third capture, so it is carried through in `diacrit`
{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "? )",
function(v, ham, diacrit)
ham = v == I and HAMZA_ON_YA or v == U and HAMZA_ON_WAW or HAMZA_ON_ALIF
return v .. ham .. diacrit
end
},
-- else hamza is on the line
-- (only anchored at the end of the string; there is no space variant here)
{HAMZA_PH .. "(" .. DIACRITIC .. "?)$", HAMZA .. "%1"},
---------------------------- handle medial hamza --------------------------
-- if long vowel or diphthong precedes, we need to ignore it.
-- The seat then follows the next character (v2); failing that, a preceding
-- yāʾ gives hamza-on-yāʾ, and anything else gives hamza on the line.
{"([" .. AMAD .. ALIF .. WAW .. YA .. "]" .. SK .. "?)(" .. HAMZA_PH .. ")(" .. SH .. "?)([^ ])",
function(prec, ham, shad, v2)
ham = (v2 == I or v2 == YA) and HAMZA_ON_YA or
(v2 == U or v2 == WAW) and HAMZA_ON_WAW or
rfind(prec, YA) and HAMZA_ON_YA or
HAMZA
return prec .. ham ..shad .. v2
end
},
-- otherwise, seat of medial hamza relates to vowels on one or both sides.
-- v1 is the single character before the placeholder; v2 is the character
-- after it, optionally preceded by fatḥatān.
{"([^ ])(" .. HAMZA_PH .. ")(" .. SH .. "?)(" .. AN .. "?[^ ])",
function(v1, ham, shad, v2)
ham = (v1 == I or v2 == I or v2 == YA) and HAMZA_ON_YA or
(v1 == U or v2 == U or v2 == WAW) and HAMZA_ON_WAW or
-- special exception for the accusative ending, in words like
-- جُزْءًا (juzʾan). By the rules of Thackston pp. 281-282 a
-- hamza-on-alif should appear, but that would result in
-- two alifs in a row, which is generally forbidden.
-- According to Haywood/Nahmad pp. 114-115, after sukūn before
-- the accusative ending (including when a pronominal suffix
-- follows) hamza is written on yāʾ if the previous letter
-- is connecting, else on the line. The only examples they
-- give involve preceding non-connecting z (جُزْءًا juzʾan and
-- (جُزْءَهُ juzʾahu) and preceding diphthongs, with the only
-- connecting letter being yāʾ, where we have hamza-on-yāʾ
-- anyway by the preceding regexp. Haywood/Nahmad's rule seems
-- too complicated, and since it conflicts with Thackston,
-- we only implement the case where otherwise two alifs would
-- appear with the indefinite accusative ending.
v2 == AN .. ALIF and HAMZA or
HAMZA_ON_ALIF
return v1 .. ham .. shad .. v2
end
},
--------------------------- handle alif madda -----------------------------
-- hamza-on-alif (+ optional fatḥa) + alif collapses into alif madda
{HAMZA_ON_ALIF .. A .. "?" .. ALIF, AMAD},
----------------------- catch any remaining hamzas ------------------------
{HAMZA_PH, HAMZA}
}
-- Resolve every hamza placeholder (HAMZA_PH) in `term` to a concrete hamza
-- form by running the hamza_subs rules in order, then return a list of
-- spellings. The list has one entry unless the result contains
-- hamza-on-wāw + ḍamma + wāw after wāw, yāʾ or ʾalif, in which case
-- alternative spellings are produced (see the comments below).
function export.process_hamza(term)
	-- convert HAMZA_PH into appropriate hamza seat
	for _, rule in ipairs(hamza_subs) do
		local pattern, replacement = rule[1], rule[2]
		term = rsub(term, pattern, replacement)
	end

	-- the problematic sequence and the two respellings of it
	local on_waw = "ؤُو"
	local on_ya = "ئُو"
	local on_line = "ءُو"

	-- sequence of hamza-on-wāw + wāw is problematic and leads to a preferred
	-- alternative with some other type of hamza, as well as the original
	-- sequence; sequence of wāw + hamza-on-wāw + wāw is especially problematic
	-- and leads to two different alternatives with the original sequence not
	-- one of them
	local after_waw = WAW .. on_waw
	if rfind(term, after_waw) then
		return {rsub(term, after_waw, WAW .. on_ya), rsub(term, after_waw, WAW .. on_line)}
	end

	local after_ya = YA .. on_waw
	if rfind(term, after_ya) then
		return {rsub(term, after_ya, YA .. on_ya), term}
	end

	local after_alif = ALIF .. on_waw
	if rfind(term, after_alif) then
		-- Here John Mace "Arabic Verbs" is inconsistent. In past-tense parts,
		-- the preferred alternative has hamza on the line, whereas in
		-- non-past parts the preferred alternative has hamza-on-yāʾ even
		-- though the sequence of vowels is identical. It's too complicated to
		-- propagate information about tense through to here so pick one.
		return {rsub(term, after_alif, ALIF .. on_ya), term}
	end

	-- no alternative spelling in sequence of U/A + hamza-on-wāw + U + wāw;
	-- sequence of I + hamza-on-wāw + U + wāw does not occur (has
	-- hamza-on-yāʾ instead)
	return {term}
end
----------------------------------- misc junk ---------------------------------
-- NOTE(review): LRM is not referenced by name anywhere else in the visible
-- source; confirm whether it is still needed.
local LRM = u(0x200E) -- left-to-right mark
-- Format an Arabic term as a link through Module:links' full_link().
--   term:  term to link to (may be nil when only `alt` is to be displayed)
--   tr:    transliteration
--   gloss: gloss, or nil
--   face:  passed to full_link() as its second argument
--   alt:   alternative display text, or nil
-- An empty term or the placeholder "—" is returned unchanged instead of being
-- linked. (The guard used to test an undefined global `word` rather than the
-- `term` parameter, so it could never fire.)
local function link(term, tr, gloss, face, alt)
	if term == "" or term == "—" then
		return term
	end
	return m_links.full_link( { term = term, alt = alt, lang = lang, tr = tr, sc = sc, gloss = gloss }, face )
end
-- Render a list of gender codes with Module:gender and number, preceded by a
-- single space. Yields "" when `genders` is nil/false or has no entries.
-- The `sc` argument is accepted but not used.
local function format_genders(lang, sc, genders)
	if not genders or #genders <= 0 then
		return ""
	end
	local gender_module = require("Module:gender and number")
	return " " .. gender_module.format_list(genders, lang)
end
-- English ordinals used to name the position of a root letter in error
-- messages. Only positions 1-5 have a name.
local ordinal = { "first", "second", "third", "fourth", "fifth" }

-- Check every entry of rootTable against the `consonants` class.
--   rootTable: table of root letters, keyed by position
--   output:    list that receives a preview-only warning for each entry that
--              is a seated hamza (أ إ ؤ ئ); such entries are also tracked
-- Raises an error for any other entry that contains no consonant.
local function hamzaError(rootTable, output)
	for i, letter in pairs(rootTable) do
		if not rfind(letter, consonants) then
			if rfind(letter, "[أإؤئ]") then
				table.insert(output, '<span class="previewonly" style="font-size: small;">[Seated hamzas, such as "' .. letter .. '", are not allowed in the names of roots. Use bare hamza, "‏<span lang="ar">ء</span>‎".]</span>')
				require("Module:debug").track("ar-root/hamza-error") -- [[Special:WhatLinksHere/Wiktionary:Tracking/ar-root/hamza-error]]
			else
				-- ordinal[] stops at "fifth"; without a fallback a bad letter
				-- at position 6+ would die on a nil concatenation instead of
				-- producing this message.
				local position = ordinal[i] or ("#" .. i)
				error(letter .. ", the " .. position .. " letter in the category name, is not a consonant.")
			end
		end
	end
end
-- Ensure that every entry of the root is a single character.
--   rootTable:   sequence of root letters
--   joined_root: the space-joined root, used only in the error message
-- Raises an error (attributed to the caller) if rootTable is not a table, and
-- an error naming the offending entry if one is longer than one character.
local function validateRoot(rootTable, joined_root)
	if type(rootTable) ~= "table" then
		error("rootTable is not a table", 2)
	end
	for i, letter in ipairs(rootTable) do
		if mw.ustring.len(letter) > 1 then
			-- ordinal[] stops at "fifth"; without a fallback an over-long
			-- entry at position 6+ would die on a nil concatenation instead
			-- of producing this message.
			local position = ordinal[i] or ("#" .. i)
			error('"' .. letter .. '", the ' .. position .. ' letter in the root "' .. joined_root .. '" should be a single letter.')
		end
	end
end
-- Entry point for {{ar-root}}. Reads the template's arguments from the parent
-- frame and returns wikitext.
--
-- The root letters come from, in order of preference: parameters 1-4 (one
-- letter each), parameter 1 split on spaces, or the full page title split on
-- spaces. In the Template namespace with no parameter 1, the demo root
-- ك ت ب is used.
--
-- When the page title equals the space-joined root, a headword line is
-- produced and the page is categorised under "Arabic roots"; otherwise a link
-- to the root is produced together with the category
-- "Arabic terms belonging to the root <root>".
--
-- Flags: plain= returns just the joined root, nocat= just the text,
-- notext= just the categories; nolink= shows the root without linking to it.
function export.ar_root(frame)
	local output = {}
	local categories = {}

	local title = mw.title.getCurrentTitle()
	local fulltitle = title.fullText
	local pagename = title.text
	local namespace = title.nsText

	local params = {
		[1] = {},
		[2] = {},
		[3] = {},
		[4] = {},
		["nocat"] = { type = "boolean", default = false },
		["plain"] = { type = "boolean", default = false },
		["t"] = {},
		["gloss"] = { alias_of = "t" },
		["face"] = { default = "term" },
		["notext"] = { type = "boolean", default = false },
		["nolink"] = { type = "boolean", default = false },
		-- Sort key for the "Arabic roots" category. It is read below but
		-- was never declared here, so it could not be supplied.
		["sort"] = {},
	}

	local args = require("Module:parameters").process(frame:getParent().args, params)

	local rootLetters = {}
	if not args[1] and namespace == "Template" then
		rootLetters = { "ك", "ت", "ب" }
	elseif args[1] and args[2] then
		rootLetters = { args[1], args[2], args[3], args[4] }
	elseif args[1] then
		rootLetters = rsplit(args[1], " ")
	else
		rootLetters = rsplit(fulltitle, " ")
	end

	hamzaError(rootLetters, output)

	local joined_root = table.concat(rootLetters, " ")
	validateRoot(rootLetters, joined_root)

	local joined_tr = ar_translit.tr(table.concat(rootLetters, "-"), lang, sc, nil, nil, "force") or "-"

	if fulltitle == joined_root then
		table.insert(output, m_headword.full_headword({lang = lang, sc = sc, pos_category = "roots", categories = {}, heads = { joined_root }, translits = { joined_tr }}) )
		table.insert(categories, "[[Category:Arabic roots|" .. (ine(args["sort"]) or joined_root) .. "]]")
		if args[1] then
			require("Module:debug").track("ar-root") -- [[Special:WhatLinksHere/Wiktionary:Tracking/ar-root]]
		end
	else
		if sc:countCharacters(pagename) < mw.ustring.len(pagename) - 2 then
			require("Module:debug").track("ar-root/title-not-ar") -- [[Special:WhatLinksHere/Wiktionary:Tracking/ar-root/title-not-ar]]
			if not args["nocat"] then
				require("Module:debug").track("ar-root/title-not-ar/cat") -- [[Special:WhatLinksHere/Wiktionary:Tracking/ar-root/title-not-ar/cat]]
			end
		end

		-- "gloss" is declared as an alias of "t", so the value has to be
		-- read from args["t"]; args["gloss"] itself is never populated.
		local gloss = ine(args["t"])

		local link_text
		if args["nolink"] then
			link_text = link(nil, joined_tr, gloss, args["face"], joined_root)
		else
			link_text = link(joined_root, joined_tr, gloss, args["face"] )
		end

		table.insert(output, link_text)
		table.insert(categories, m_utilities.format_categories( { "Arabic terms belonging to the root " .. joined_root }, lang) )
	end

	if args["plain"] then
		return joined_root
	elseif args["nocat"] then
		return table.concat(output)
	elseif args["notext"] then
		return table.concat(categories)
	else
		return table.concat(output) .. table.concat(categories)
	end
end
-- Entry point for a boxed variant of the root display. Reads the template's
-- arguments from the parent frame and returns wikitext.
--
-- Root letters are determined exactly as in export.ar_root(): parameters 1-4
-- (one letter each), parameter 1 split on spaces, or the full page title
-- split on spaces, with ك ت ب as the demo root in the Template namespace.
--
-- When the page title equals the space-joined root, a headword line plus the
-- "Arabic roots" category is returned. Otherwise the link to the root is
-- wrapped in a right-floating "Root" table and followed by the category
-- "Arabic terms belonging to the root <root>"; nocat= or plain= return the
-- bare link instead.
function export.ar_rootbox(frame)
	local output = {}
	local categories = {}

	local title = mw.title.getCurrentTitle()
	local fulltitle = title.fullText
	local namespace = title.nsText

	local params = {
		[1] = {},
		-- Parameters 2-4 are read below but were never declared, which left
		-- the one-letter-per-parameter branch unreachable.
		[2] = {},
		[3] = {},
		[4] = {},
		["nocat"] = {type = "boolean"},
		["plain"] = {type = "boolean"},
		["t"] = {},
		["gloss"] = {alias_of = "t"},
		["notext"] = {type = "boolean"},
		["nolink"] = {type = "boolean"},
		["sort"] = {},
		["face"] = {}
	}

	local args = require("Module:parameters").process(frame:getParent().args, params)

	local rootLetters = {}
	if not args[1] and namespace == "Template" then
		rootLetters = { "ك", "ت", "ب" }
	elseif args[1] and args[2] then
		rootLetters = { args[1], args[2], args[3], args[4] }
	elseif args[1] then
		rootLetters = rsplit(args[1], " ")
	else
		rootLetters = rsplit(fulltitle, " ")
	end

	hamzaError(rootLetters, output)

	local joined_root = table.concat(rootLetters, " ")
	validateRoot(rootLetters, joined_root)

	local joined_tr = ar_translit.tr(table.concat(rootLetters, "-"), lang, sc, nil, nil, "force") or "-"

	if fulltitle == joined_root then
		table.insert(output, m_headword.full_headword(
			{
				lang = lang,
				sc = sc,
				pos_category = "roots",
				categories = {},
				heads = {joined_root}
			}))
		table.insert(categories, "[[Category:Arabic roots|" .. (args["sort"] or joined_root) .. "]]")

		if args["nocat"] then
			return table.concat(output)
		end
		return table.concat(output) .. table.concat(categories)
	end

	-- "gloss" is declared as an alias of "t", so the value has to be read
	-- from args["t"]; args["gloss"] itself is never populated.
	local gloss = args["t"]

	local link_text
	if args["nolink"] then
		link_text = link(nil, joined_tr, gloss, args["face"], joined_root)
	else
		link_text = link(joined_root, joined_tr, gloss, args["face"])
	end

	table.insert(output, link_text)
	table.insert(categories, m_utilities.format_categories(
		{"Arabic terms belonging to the root " .. joined_root},
		lang))

	-- nocat= and plain= both return the bare link without box or categories
	if args["nocat"] or args["plain"] then
		return table.concat(output)
	end
	return "<table class=\"wikitable\" style=\"float: right; clear: right; text-align: center;\"><tr><th>[[w:Semitic root|Root]]</th></tr><tr><td>" .. link_text .. "</td></tr></table>" .. table.concat(categories)
end
-- Convenience wrapper for the debug console: builds a synthetic frame from
-- the given parent-frame arguments (`parargs`) and own arguments (`args`)
-- and passes it to export.ar_root().
export.ar_root2 = function(parargs, args)
	local frame = debug_frame(parargs, args)
	return export.ar_root(frame)
end
-- Strip a trailing kasratān (-in) from the lemma given as the frame's first
-- argument and return the result. Used in {{ar-adj-in}} so that a full lemma
-- can be specified rather than requiring the user to truncate the -in
-- ending. Raises "Lemma required." when no lemma is passed.
-- FIXME: Move ar-adj-in into Lua.
function export.remove_in(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- put any shadda before the ending so the -in is really string-final
	local normalized = reorder_shadda(lemma)
	return rsub(normalized, IN .. "$", "")
end
-- Strip a trailing fatḥatān + ʾalif maqṣūra (-an) from the lemma given as
-- the frame's first argument and return the result. Used in {{ar-adj-an}} so
-- that a full lemma can be specified rather than requiring the user to
-- truncate the -an ending. Raises "Lemma required." when no lemma is passed.
-- FIXME: Move ar-adj-an into Lua.
function export.remove_an(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- put any shadda before the ending so the -an is really string-final
	local normalized = reorder_shadda(lemma)
	return rsub(normalized, AN .. AMAQ .. "$", "")
end
-- Compare two words and find the alternation pattern (vowel changes, prefixes, suffixes etc.)
-- Still a WIP, doesn't work correctly yet.
-- Currently a stub: both arguments are ignored and nil is always returned.
function export.find_pattern(word1, word2)
return nil
end
-- Produce an etymology snippet linking to a section of
-- [[Appendix:Arabic nominals]] and categorising the entry.
--
-- frame.args[1] (required) selects an entry of the `data` table below; the
-- remaining options are read from the parent frame's arguments:
--   nocat=        suppress the categories
--   lc= / nocap=  do not capitalise the first letter of the link text
--   notext=       suppress the link text
-- Returns the link text (if any) followed by the formatted categories.
-- Raises an error if the key is not present in `data`.
function export.etymology(frame)
	local text, categories = {}, {}

	local frame_params = {
		[1] = { required = true },
	}
	local frame_args = require("Module:parameters").process(frame.args, frame_params)
	local key = frame_args[1]

	local data = {
		["color adjective"] = {
			anchor = "Color or defect adjectives",
			text = "color adjective",
			categories = { "color/defect adjectives" },
		},
		["defect adjective"] = {
			anchor = "Color or defect adjectives",
			text = "defect adjective",
			categories = { "color/defect adjectives" },
		},
	}

	-- The type must be the string "boolean". The bare identifier `boolean`
	-- used previously is an undefined global (nil), which left these flags
	-- without a declared type.
	local params = {
		[1] = {},
		["nocat"] = { type = "boolean", default = false },
		["lc"] = { type = "boolean", default = false },
		["nocap"] = { alias_of = "lc" },
		["notext"] = { type = "boolean", default = false },
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)

	local entry = key and data[key]
	if not entry then
		error('The anchor "' .. tostring(key) .. '" is not found in the list of anchors.')
	end

	-- Report the lookup key in both messages; previously the second message
	-- was built after `anchor` had already been overwritten.
	local anchor = entry.anchor or error('The data table does not include an anchor for "' .. key .. '".')
	local linkText = entry.text or error('The data table does not include link text for "' .. key .. '".')

	if not args.lc then
		-- parentheses drop rsubn()'s second return value (the match count)
		linkText = (rsubn(linkText, "^%a", function(a) return mw.ustring.upper(a) end))
	end

	if not args.notext then
		table.insert(text, "[[Appendix:Arabic nominals#" .. anchor .. "|" .. linkText .. "]]")
	end

	if not args.nocat then
		table.insert(categories, m_utilities.format_categories(entry.categories, lang) )
	end

	return table.concat(text) .. table.concat(categories)
end
return export