Module:zh-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module transliterates Chinese-language text. It is also used to transliterate Eastern Min, Jin, Mandarin, Southern Pinghua, Gan, Xiang, Middle Chinese, Literary Chinese, Northern Min, Teochew, Old Chinese, Wu, Cantonese, Sichuanese, and Taishanese. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:zh-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local m_str_utils = require("Module:string utilities")

local find_templates = require("Module:template parser").find_templates
local get_section = require("Module:pages").get_section
local gsub = string.gsub
local insert = table.insert
local safe_require = require("Module:utilities").safe_require
local split = m_str_utils.split
local toNFD = mw.ustring.toNFD
local trim = m_str_utils.trim
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local usub = m_str_utils.sub
local uupper = m_str_utils.upper

-- Cache for the MT table of Module:zh/data/cmn-tag; filled in by export.tr the first time it is needed.
local tag

-- Maps language codes to the abbreviations that are compared against {{zh-pron}} parameter names.
local lect_code = mw.loadData("Module:zh/data/lect codes").langcode_to_abbr

-- Table of functions returned by this module.
local export = {}

--- Records that `lang` needs a manual transliteration and signals failure.
-- @param lang     language code appended to the tracking key
-- @param request  accepted but not used
-- @return nil, always
local function fail(lang, request)
	local track = require("Module:debug/track")
	track("zh-translit/needs manual translit/" .. lang)
	return nil
end

--- Fetches the "Chinese" level-2 section of a page's wikitext.
-- @param title  title of the page to read
-- @return whatever get_section yields for that section, or false when the
--         title is invalid or the page has no content
local function get_content(title)
	local title_obj = mw.title.new(title)
	if not title_obj then
		return false
	end
	-- getContent() yields nil for a page that does not exist; bail out here
	-- rather than handing nil to get_section.
	local wikitext = title_obj:getContent()
	if not wikitext then
		return false
	end
	return get_section(wikitext, "Chinese", 2)
end

--- Match function equivalent to the regex ",(?! )": locates the next comma
-- that is not immediately followed by a space.
-- @param str    string to search
-- @param start  position at which to begin searching
-- @return the comma's position twice (start and end of the match), or
--         nothing when no such comma exists
local function split_on_comma_without_space(str, start)
	local pos = start
	while true do
		local comma = str:find(",", pos)
		if not comma then
			return
		end
		pos = comma + 1
		-- Accept the comma unless the very next character is a space.
		if str:sub(pos, pos) ~= " " then
			return comma, comma
		end
	end
end

--- Reconciles the readings given in one {{zh-pron}} parameter with the
-- transliteration found so far.
-- @param readings  raw parameter value
-- @param lang      language code
-- @param tr        transliteration found so far, or nil if none yet
-- @return the agreed transliteration, nil if the value supplied none, or
--         false when the readings conflict
local function handle_readings(readings, lang, tr)
	if lang == "ltc" or lang == "och" then
		-- For these two codes the value is kept verbatim; any difference
		-- from an earlier value counts as a conflict.
		if tr and readings ~= tr then
			return false
		end
		return readings
	elseif (
		lang == "cmn" or
		lang == "csp" or
		lang == "wuu" or
		lang == "yue" or
		lang == "zhx-tai"
	) then
		-- Items are separated by commas that are not followed by a space.
		readings = split(readings, split_on_comma_without_space, true)
	else
		readings = split(readings, "/", true, true)
	end
	-- Remember what the caller passed in, because tr is overwritten below.
	local tr_orig = tr
	for _, reading in ipairs(readings) do
		reading = trim(reading)
		if not reading:find("=") then
			-- A plain reading: adopt it if nothing is known yet or if it
			-- matches the current value (ignoring case and "^" markers).
			if (
				not tr or
				tr == reading or
				gsub(ulower(tr), "%^", "") == reading
			) then
				tr = reading
			elseif ulower(reading) ~= tr then
				return false
			end
		elseif lang == "cmn" and reading == "cap=y" and tr then
			-- The tr check avoids concatenating nil when "cap=y" comes
			-- before any reading.
			-- Re-apply the "^" capitalisation marker, but only when that
			-- does not contradict the transliteration passed in.
			local tr_cap = "^" .. tr
			if not tr_orig or tr_orig == tr_cap then
				tr = tr_cap
			end
		end
	end
	return tr
end

--- Walks the templates found in `content`, collecting readings from
-- {{zh-pron}} and queueing {{zh-see}} targets.
-- @param content  wikitext to scan
-- @param lang     language code, looked up in lect_code to find the parameter name
-- @param see      list that receives {{zh-see}} targets not yet marked in `seen`
-- @param seen     set of targets that must not be queued
-- @param tr       transliteration found so far, or nil
-- @return the transliteration after this scan, nil if still none, or false
--         as soon as handle_readings reports a conflict
local function iterate_content(content, lang, see, seen, tr)
	for template in find_templates(content) do
		local template_name = template:get_name()
		if template_name == "zh-see" then
			-- Queue the target named by the first argument unless it is
			-- already marked as seen.
			local target = trim(template:get_arguments()[1])
			if not seen[target] then
				insert(see, target)
			end
		elseif template_name == "zh-pron" then
			for param, value in pairs(template:get_arguments()) do
				if #value > 0 and type(param) == "string" and param == lect_code[lang] then
					tr = handle_readings(value, lang, tr)
					break
				end
			end
			-- false marks conflicting readings: stop scanning immediately.
			if tr == false then
				return false
			end
		end
	end
	return tr
end

--- Transliterates `text` by looking up readings on the entry page titled `text`.
-- The page's "Chinese" section is scanned for {{zh-pron}} readings; if it
-- supplies none, the pages named by {{zh-see}} are scanned as well. The
-- reading found is then formatted according to `lang`.
-- @param text  string to transliterate, also used as the page title; nil or "" is returned unchanged
-- @param lang  language code; "zh" and "lzh" are treated as "cmn", and a code absent from
--              lect_code is replaced by the result of getFullCode() from Module:languages
-- @param sc    script code; not referenced by this function
-- @return the transliteration followed by one space, or nil (via fail) when no
--         consistent reading could be determined
function export.tr(text, lang, sc)
	if (not text) or text == "" then
		return text
	end
	
	if lang == "zh" or lang == "lzh" then
		lang = "cmn"
	end
	
	if not lect_code[lang] then
		lang = require("Module:languages").getByCode(lang, nil, true):getFullCode()
	end
	
	local content = get_content(text)
	if not content then
		return fail(lang)
	end
	
	-- `see` is the queue of {{zh-see}} targets still to visit; `seen` holds the titles
	-- that must not be queued (initially only the page itself).
	local see = {}
	local seen = {
		[text] = true
	}
	local tr = iterate_content(content, lang, see, seen)
	
	-- Only when the page itself gave no reading at all (nil, not false) are the queued
	-- targets consulted. The whole queue is walked even after a reading is found, so
	-- that a conflicting reading on a later page is still detected.
	if tr == nil then
		local i, title = 1
		-- #see is re-evaluated on every pass because iterate_content may append to the queue.
		while i <= #see do
			title = see[i]
			content = get_content(title)
			if content then
				tr = iterate_content(content, lang, see, seen, tr)
				if tr == false then
					return fail(lang)
				end
				seen[title] = true
			end
			i = i + 1
		end
	end
	
	-- Covers both "nothing found" (nil) and "conflicting readings" (false).
	if not tr then
		return fail(lang)
	end
	
	if lang == "cmn" then
		tr = tr:gsub("#", "")
		-- A byte in \194-\244 means the reading contains non-ASCII characters, some of
		-- which may need replacing by a romanisation.
		if tr:match("[\194-\244]") then
			tag = tag or mw.loadData("Module:zh/data/cmn-tag").MT
			-- Visit each character (one byte plus any following bytes in \128-\191).
			-- Returning nothing from the callback leaves the character as it is.
			tr = tr:gsub(".[\128-\191]*", function(m)
				if m == "一" then
					return "yī"
				elseif m == "不" then
					return "bù"
				else
					-- Use the first entry listed for this character in the tag table, if any.
					m = tag[m] and tag[m][1]
					if m then
						return toNFD(m):gsub("^[aeiou]", "\1%0") -- temporarily use \1 for apostrophes, as it's not in %p
					end
				end
			end)
			tr = ugsub(tr, "%f[^%z%s%p](^?)\1", "%1") -- remove any initial apostrophes inserted by the previous function
				:gsub("\1", "'")
		end
		-- "^" marks capitalisation: drop it and uppercase what follows (an optional
		-- apostrophe plus one character).
		tr = ugsub(tr, "%^('?.)", uupper)
	elseif lang == "csp" or lang == "yue" or lang == "zhx-tai" then
		-- Wrap each run that starts with a digit and continues with digits, "*" or "-" in <sup>.
		tr = tr:gsub("%d[%d%*%-]*%f[^%d%*]", "<sup>%0</sup>")
	elseif lang == "hak" then
		-- TODO (until then the reading is returned as found)
	elseif lang == "ltc" or lang == "och" then
		if tr == "n" then
			return fail(lang)
		end
		-- The reading is a list with one item per character of `text`, separated by
		-- "," for ltc and ";" for och.
		local index = tr and split(tr, lang == "ltc" and "," or ";", true, true) or {}
		for i = 1, ulen(text) do
			-- Per-character data lives in Module:zh/data/ltc-pron/<char> or
			-- Module:zh/data/och-pron-ZS/<char>.
			local module_type = lang .. "-pron"
			if lang == "och" then
				module_type = module_type .. "-ZS"
			end
			
			local data_module = safe_require("Module:zh/data/" .. module_type .. "/" .. usub(text, i, i))
			
			-- Give up if there is no data for the character, or if it has several entries
			-- and the list item does not pick one (missing or "y").
			if not data_module or (((not index[i]) or index[i] == "y") and #data_module > 1) then
				return fail(lang)
			end
			
			-- "y" selects the first entry; any other item is converted to a number.
			if index[i] == "y" then
				index[i] = 1
			elseif index[i] then
				index[i] = tonumber(index[i])
			end
			
			-- Fall back to the first entry when the item is unusable or selects nothing.
			index[i] = index[i] and data_module[index[i]] or data_module[1]
			
			if lang == "ltc" then
				-- Build the output from the "Zhengzhang" conversion tables, with the tone
				-- (if not empty) in superscript.
				local data = mw.loadData("Module:ltc-pron/data")
				local initial, final, tone = require("Module:ltc-pron").infer_categories(index[i])
				tone = tone ~= "" and ("<sup>" .. tone .. "</sup>") or tone
				index[i] = data.initialConv["Zhengzhang"][initial] .. data.finalConv["Zhengzhang"][final] .. tone
			else
				-- For och the sixth field of the chosen entry is used.
				index[i] = index[i][6]
			end
		end
		tr = table.concat(index, " ")
		if lang == "och" then
			tr = "*" .. tr
		end
	elseif lang == "nan" then
		-- TODO (until then the reading is returned as found)
	elseif lang == "nan-tws" then
		tr = require("Module:nan-pron").pengim_display(tr)
	elseif lang == "wuu" then
		local w_pron = require("Module:wuu-pron")
		if tr:match(';') then
			--TODO
			return fail(lang)
		elseif tr:match(':') then
			-- The first three bytes are dropped before formatting.
			tr = w_pron.wugniu_format(tr:sub(4))
		else
			tr = w_pron.wugniu_format(w_pron.wikt_to_wugniu(tr))
		end
	elseif lang == "zhx-sic" then
		-- Insert a space between a digit or hyphen and a following letter, then wrap
		-- digit runs in <sup> as in the csp/yue/zhx-tai branch.
		tr = ugsub(tr, "([%d-])(%a)", "%1 %2")
			:gsub("%d[%d%*%-]*%f[^%d%*]", "<sup>%0</sup>")
	else
		-- Every other language is handled by the rom function of Module:<lang>-pron.
		tr = require("Module:" .. lang .. "-pron").rom(tr)
	end
	
	-- End with a space so that concurrent parts of running text that need to be transliterated separately (e.g. due to links) are still properly separated.
	return tr .. " "
end

return export