Module:lo-translit

From Linguifex
Jump to navigation Jump to search

This module will transliterate Lao language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:lo-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Module table: the public functions are attached to it and it is returned
-- at the end of the file.
local export = {}

-- Local aliases for the mw.ustring library functions.
-- The code below uses `len` and `sub` to walk Lao text one character at a
-- time, and `match`/`gmatch`/`gsub` with patterns containing Lao characters.
-- NOTE(review): `find` is not referenced in the visible part of this module.
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local gsub = mw.ustring.gsub
local sub = mw.ustring.sub

-- Mapping of initial consonants.
-- Keys are either a single Lao consonant or a multi-character initial
-- (ຫ followed by another consonant, or a consonant followed by ຣ/ລ); values
-- are the Latin transliteration used in syllable-initial position.
-- Besides being read by tr(), this table is consulted by split_syll() to
-- decide whether a consonant may be appended to the initial collected so far:
-- the concatenation must be a key of this table.
local initial_conv = {
	-- Single consonants.
	['ກ'] = 'k', ['ຂ'] = 'kh', ['ຄ'] = 'kh', ['ງ'] = 'ng',
	['ຈ'] = 'ch', ['ສ'] = 's', ['ຊ'] = 's', ['ຍ'] = 'ny',
	['ດ'] = 'd', ['ຕ'] = 't', ['ຖ'] = 'th', ['ທ'] = 'th', ['ນ'] = 'n',
	['ບ'] = 'b', ['ປ'] = 'p', ['ຜ'] = 'ph', ['ຝ'] = 'f', ['ພ'] = 'ph', ['ຟ'] = 'f', ['ມ'] = 'm',
	['ຢ'] = 'y', ['ຣ'] = 'r', ['ລ'] = 'l', ['ວ'] = 'w',
	['ຫ'] = 'h', ['ອ'] = 'ʼ', ['ຮ'] = 'h',

	-- ຫ + consonant combinations (ຫ itself is not transliterated here), plus
	-- the single-character forms ໜ and ໝ and the ຫ + ຼ combination.
	['ຫງ'] = 'ng',
	['ຫຍ'] = 'ny',
	['ຫນ'] = 'n', ['ໜ'] = 'n',
	['ຫມ'] = 'm', ['ໝ'] = 'm',
	['ຫຣ'] = 'r',
	['ຫລ'] = 'l', ['ຫຼ'] = 'l',
	['ຫວ'] = 'w',

	-- Consonant + ຣ / ລ clusters.
	['ກຣ'] = 'kr', ['ກລ'] = 'kl',
	['ຂຣ'] = 'khr', ['ຄຣ'] = 'khr', ['ຂລ'] = 'khl', ['ຄລ'] = 'khl',
	['ປຣ'] = 'pr', ['ປລ'] = 'pl',
	['ພຣ'] = 'phr', ['ຟຣ'] = 'fr', ['ພລ'] = 'phl', ['ຟລ'] = 'fl',
	['ດຣ'] = 'dr', ['ຕຣ'] = 'tr'
}

-- Mapping of glides.
-- Looked up in tr() with the concatenated `glide` component of a syllable;
-- anything not listed here transliterates to the empty string there.
-- Note that tr() moves ຼ into the initial when the initial is ຫ, so that
-- combination is handled by initial_conv instead of this table.
local glide_conv = {
	['ຼ'] = 'r'
}

-- Mapping of vowel combinations.
-- Keys are the vowel characters of one syllable concatenated in the order in
-- which they occur in the text, with the consonants in between left out (so
-- a prefix vowel such as ເ is immediately followed by any later vowel signs).
-- The letters ວ, ຍ, ອ and ຽ may be part of a key.
-- tr() looks up the concatenated `vowel` component here and falls back to the
-- empty string for combinations that are not listed. split_syll() also uses
-- the table to test whether adding an ambiguous letter to the vowel collected
-- so far still yields a known combination.
local vowel_conv = {
	['ະ'] = 'a', ['ັ'] = 'a',
	['ິ'] = 'i',
	['ຶ'] = 'ư', ['ຸ'] = 'u', ['ຸຍ'] = 'ui',
	['ເະ'] = 'e', ['ເັ'] = 'e',
	['ແະ'] = 'æ', ['ແັ'] = 'æ',
	['ໂະ'] = 'o', ['ົ'] = 'o',
	['ເາະ'] = 'ǫ', ['ັອ'] = 'ǫ',
	['ເິ'] = 'œ',
	['ເັຍ'] = 'ia', ['ັຽ'] = 'ia',
	['ເຶອ'] = 'ưa',
	['ົວະ'] = 'ua', ['ັວ'] = 'ua', ['ວັ'] = 'ua',
	['ໄ'] = 'ai', ['ໃ'] = 'ai', ['ັຍ'] = 'ai',
	['ເົາ'] = 'ao',
	['ົາວ'] = 'uau',
	['ຳ'] = 'am', ['ໍາ'] = 'am',
	['ວຳ'] = 'uam',

	['າ'] = 'ā',
	['າວ'] = 'āo',
	['ີ'] = 'ī',
	['ື'] = 'ư̄',
	['ູ'] = 'ū',
	['ເ'] = 'ē',
	['ແ'] = 'ǣ',
	['ໂ'] = 'ō',
	['ໂຍ'] = 'ōi', ['ໂຽ'] = 'ōi',
	['ໍ'] = 'ǭ', ['ອ'] = 'ǭ',
	['ອຍ'] = 'ǭi', ['ອຽ'] = 'ǭi',
	['ເີ'] = 'œ̄',
	['ເີຽ'] = 'œ̄i', ['ເີຍ'] = 'œ̄i',
	['ເຍ'] = 'īa', ['ເັຽ'] = 'īa', ['ຽ'] = 'īa',
	['ເືອ'] = 'ư̄a', ['ເືອຍ'] = 'ư̄ai',
	['ົວ'] = 'ūa', ['ວ'] = 'ūa',
	['ວຍ'] = 'uāi', ['ວຽ'] = 'uāi',
	['າຍ'] = 'āi', ['າຽ'] = 'āi',
	['ວາ'] = 'uā',
	['ວາຍ'] = 'uāi', ['ວາຽ'] = 'uāi',
	['ແວ'] = 'ǣu', -- ແ_ວ can be either ǣu or uǣ; ǣu is the more common one. tr() switches to uǣ for certain initials when a coda is present.
	['ີວ'] = 'īu', ['ິວ'] = 'iu',
	['ຽວ'] = 'iāu',
	['ວີວ'] = 'uīu',
}

-- Mapping of coda consonants.
-- Values are the transliteration used in syllable-final position, which for
-- several letters differs from the one in initial_conv (e.g. ດ is 'd' as an
-- initial but 't' here, and ຣ / ລ become 'n').
-- tr() looks up the concatenated `coda` component of a syllable; the
-- empty-string key covers syllables without a coda.
local coda_conv = {
	['ກ'] = 'k', ['ຂ'] = 'k', ['ຄ'] = 'k',
	['ງ'] = 'ng',
	['ຈ'] = 't', ['ຊ'] = 't',
	['ດ'] = 't', ['ຕ'] = 't', ['ຖ'] = 't', ['ທ'] = 't',
	['ສ'] = 's',
	['ນ'] = 'n',
	['ບ'] = 'p', ['ປ'] = 'p', ['ພ'] = 'p', ['ຟ'] = 'p',
	['ມ'] = 'm',
	['ຢ'] = 'y',
	['ຣ'] = 'n', ['ລ'] = 'n',
	['ວ'] = 'w',
	[''] = '',
}

-- Special symbols.
-- tr() maps every character of a syllable through this table and appends the
-- results after the transliterated syllable.
--   ຯ, ໆ  (typed 'iter_symbol' in char_type): rendered as 〃; tr() removes the
--          〃 again when it repeats a non-empty syllable.
--   ໌     (typed 'canc_symbol'): contributes nothing itself; tr() marks up the
--          end of the syllable instead.
--   ໐-໙   Lao digits, mapped to the ASCII digits 0-9.
local sp_symbols = {
	['ຯ'] = '〃', ['ໆ'] = '〃',
	['໌'] = '',
	['໐'] = '0', ['໑'] = '1', ['໒'] = '2', ['໓'] = '3', ['໔'] = '4',
	['໕'] = '5', ['໖'] = '6', ['໗'] = '7', ['໘'] = '8', ['໙'] = '9'
}

-- List of character types.
-- Maps each handled Lao character to the category that split_syll() branches on:
--   'coda'        consonant that may either start or end a syllable
--   'cons'        consonant that can only start a syllable
--   'ambig'       ຍ, ວ, ອ: may act as a consonant or as part of a vowel
--   'pref_vowel'  vowel written before the initial; always starts a new syllable
--   'suf_vowel',
--   'vowel_let'   vowel signs/letters that belong to the syllable being built
--   'glide'       ຼ, which follows the initial consonant
--   'tone'        tone marks
--   'iter_symbol' ຯ and ໆ
--   'canc_symbol' ໌
--   'number'      Lao digits
-- Characters that are not listed yield nil when looked up.
local char_type = {
	['ກ'] = 'coda', ['ຂ'] = 'coda', ['ຄ'] = 'coda', ['ງ'] = 'coda',
	['ຈ'] = 'coda', ['ຊ'] = 'coda', ['ຍ'] = 'ambig',
	['ດ'] = 'coda', ['ຕ'] = 'coda', ['ຖ'] = 'coda', ['ທ'] = 'coda', ['ນ'] = 'coda',
	['ບ'] = 'coda', ['ປ'] = 'coda', ['ຜ'] = 'cons', ['ຝ'] = 'cons', ['ພ'] = 'coda', ['ຟ'] = 'coda', ['ມ'] = 'coda',
	['ຢ'] = 'coda', ['ຣ'] = 'coda', ['ລ'] = 'coda', ['ວ'] = 'ambig',
	['ສ'] = 'coda', ['ຫ'] = 'cons', ['ອ'] = 'ambig', ['ຮ'] = 'cons',
	['ໜ'] = 'cons', ['ໝ'] = 'cons',
	['ຯ'] = 'iter_symbol',
	['ະ'] = 'vowel_let', ['ັ'] = 'suf_vowel', ['າ'] = 'vowel_let', ['ຳ'] = 'suf_vowel',
	['ິ'] = 'suf_vowel', ['ີ'] = 'suf_vowel', ['ຶ'] = 'suf_vowel', ['ື'] = 'suf_vowel',
	['ຸ'] = 'suf_vowel', ['ູ'] = 'suf_vowel', ['ົ'] = 'suf_vowel',
	['ຼ'] = 'glide',
	['ຽ'] = 'vowel_let',
	['ເ'] = 'pref_vowel', ['ແ'] = 'pref_vowel',
	['ໂ'] = 'pref_vowel', ['ໃ'] = 'pref_vowel', ['ໄ'] = 'pref_vowel',
	['ໆ'] = 'iter_symbol',
	['່'] = 'tone', ['້'] = 'tone', ['໊'] = 'tone', ['໋'] = 'tone',
	['໌'] = 'canc_symbol', ['ໍ'] = 'suf_vowel',
	['໐'] = 'number', ['໑'] = 'number', ['໒'] = 'number', ['໓'] = 'number', ['໔'] = 'number',
	['໕'] = 'number', ['໖'] = 'number', ['໗'] = 'number', ['໘'] = 'number', ['໙'] = 'number'
}

-- List of consonant classes.
-- Maps each single consonant to 'high', 'mid' or 'low'.
-- NOTE(review): nothing in the visible part of this module reads this table;
-- confirm whether it is still needed before relying on or removing it.
local cons_class = {
	['ກ'] = 'mid', ['ຂ'] = 'high', ['ຄ'] = 'low', ['ງ'] = 'low',
	['ຈ'] = 'mid', ['ສ'] = 'high', ['ຊ'] = 'low', ['ຍ'] = 'low',
	['ດ'] = 'mid', ['ຕ'] = 'mid', ['ຖ'] = 'high', ['ທ'] = 'low', ['ນ'] = 'low',
	['ບ'] = 'mid', ['ປ'] = 'mid', ['ຜ'] = 'high', ['ຝ'] = 'high', ['ພ'] = 'low', ['ຟ'] = 'low', ['ມ'] = 'low',
	['ຢ'] = 'mid', ['ຣ'] = 'low', ['ລ'] = 'low', ['ວ'] = 'low',
	['ຫ'] = 'high', ['ອ'] = 'mid', ['ຮ'] = 'low'
}

-- Build a blank syllable record.
-- Every component starts out as its own empty list:
--   curr (all characters of the syllable), initial, glide, vowel, tone, coda
--   and sp (special symbols).
local function reset_syllable()
	local blank = {}
	for _, part in ipairs({ 'curr', 'initial', 'glide', 'vowel', 'tone', 'coda', 'sp' }) do
		blank[part] = {}
	end
	return blank
end

-- Append a snapshot of the syllable being built to `syllables` and hand back
-- a fresh, empty syllable record for the caller to continue with.
-- The snapshot is a new table that refers to the same component lists as
-- `curr_syll`.
local function store_and_reset(syllables, curr_syll)
	local snapshot = {}
	for _, part in ipairs({ 'curr', 'initial', 'glide', 'vowel', 'tone', 'coda', 'sp' }) do
		snapshot[part] = curr_syll[part]
	end
	syllables[#syllables + 1] = snapshot
	return reset_syllable()
end

-- Append `ch` to component `part` of `syll` and to its full-text list `curr`.
local function add_char(syll, part, ch)
	table.insert(syll[part], ch)
	table.insert(syll.curr, ch)
end

-- True for character types that always attach to the syllable being built.
local function attaches_to_previous(ctype)
	return ctype == 'glide' or ctype == 'suf_vowel' or ctype == 'vowel_let' or ctype == 'tone'
end

-- True when consonant `ch` may extend the initial collected so far: no coda
-- has been seen, initial + ch is a known initial, and the only vowel seen so
-- far (if any) is a prefix vowel.
local function extends_initial(syll, ch, vowel_full)
	return #syll.coda == 0
		and initial_conv[table.concat(syll.initial) .. ch] ~= nil
		and (#syll.vowel == 0 or char_type[vowel_full] == 'pref_vowel')
end

-- Split the entry into individual syllables.
--
-- Only runs of characters matching [ກ-ໝ] are examined; everything else in
-- `text` separates runs, and a syllable never spans two runs.
--
-- @param text   string to split
-- @param debug  when truthy, return a single string with the syllables'
--               characters joined by '-' instead of the syllable records
-- @return       a list of syllable records (fields curr, initial, glide,
--               vowel, tone, coda, sp, each a list of characters), or the
--               debug string described above
--
-- NOTE: digits are split off from a preceding syllable, but letters that
-- directly follow a digit are still added to the digit's syllable.
function export.split_syll(text, debug)
	-- Store the split syllables.
	local syllables = {}
	local curr_syll = reset_syllable()

	-- Iterate through Lao characters.
	for lao_text in gmatch(text, '[ກ-ໝ]+') do
		local c, c_types = {}, {}

		-- Classify each character in the run.
		for i = 1, len(lao_text) do
			c[i] = sub(lao_text, i, i)
			c_types[i] = char_type[c[i]]
		end

		-- Parse the run by identifying each character's type.
		for i = 1, #c do
			local ch, prev_ch, after_ch = c[i], c[i - 1], c[i + 2]
			local type_curr, type_prev = c_types[i], c_types[i - 1]
			local type_next, type_after = c_types[i + 1], c_types[i + 2]
			local curr_vowel_full = table.concat(curr_syll.vowel)

			-- Prefix vowels are always the start of a new syllable.
			if type_curr == 'pref_vowel' then
				if #curr_syll.curr ~= 0 then
					curr_syll = store_and_reset(syllables, curr_syll)
				end
				add_char(curr_syll, 'vowel', ch)

			-- Glide consonants always follow the initial consonant.
			elseif type_curr == 'glide' then
				add_char(curr_syll, 'glide', ch)

			-- Suffix vowels and vowel letters are always part of the same syllable.
			elseif type_curr == 'suf_vowel' or type_curr == 'vowel_let' then
				add_char(curr_syll, 'vowel', ch)

			-- Same with tone marks.
			elseif type_curr == 'tone' then
				add_char(curr_syll, 'tone', ch)

			-- Some consonants can end a syllable.
			elseif type_curr == 'coda' then
				-- Look-ahead cases in which this consonant must start a new
				-- syllable even though it could close the current one.
				-- Plain comparisons are used instead of match() so that
				-- running off the end of the run (nil) is simply "no".
				local ambig_then_consonant = type_next == 'ambig'
					and (type_after == 'coda' or type_after == 'cons')
				local ambig_then_vowel = type_next == 'ambig'
					and type_prev ~= 'tone' and type_prev ~= 'suf_vowel' and prev_ch ~= 'ອ'
					and (after_ch == 'ຍ' or after_ch == 'າ')

				if extends_initial(curr_syll, ch, curr_vowel_full) then
					add_char(curr_syll, 'initial', ch)
				elseif #curr_syll.coda == 0 and #curr_syll.initial ~= 0
					and not attaches_to_previous(type_next)
					and not ambig_then_consonant
					and not ambig_then_vowel then
					add_char(curr_syll, 'coda', ch)
				else
					curr_syll = store_and_reset(syllables, curr_syll)
					add_char(curr_syll, 'initial', ch)
				end

			-- However, some consonants can only start a syllable.
			elseif type_curr == 'cons' then
				if not extends_initial(curr_syll, ch, curr_vowel_full) then
					curr_syll = store_and_reset(syllables, curr_syll)
				end
				add_char(curr_syll, 'initial', ch)

			-- Ambiguous characters can both start or end a syllable.
			elseif type_curr == 'ambig' then
				if #curr_syll.curr > 0 and ch == 'ອ' and type_next == 'suf_vowel' then
					curr_syll = store_and_reset(syllables, curr_syll)
					add_char(curr_syll, 'initial', ch)
				elseif #curr_syll.initial == 0 or char_type[curr_vowel_full] == 'pref_vowel' then
					add_char(curr_syll, 'initial', ch)
				elseif ch == 'ຍ' and prev_ch == 'າ' then -- quick hack (FIXME)
					add_char(curr_syll, 'vowel', ch)
				elseif ch == 'ຍ' and prev_ch ~= 'ຫ' and #curr_vowel_full == 0 then
					curr_syll = store_and_reset(syllables, curr_syll)
					add_char(curr_syll, 'initial', ch)
				elseif #curr_syll.initial ~= 0 and (#curr_vowel_full == 0
					or vowel_conv[curr_vowel_full .. ch] and not attaches_to_previous(type_next)) then
					add_char(curr_syll, 'vowel', ch)
				else
					curr_syll = store_and_reset(syllables, curr_syll)
					add_char(curr_syll, 'initial', ch)
				end

			-- Iteration and cancel symbols should be treated as part of the same syllable.
			-- (Both comparisons are needed: a bare string operand is always truthy.)
			elseif type_curr == 'iter_symbol' or type_curr == 'canc_symbol' then
				add_char(curr_syll, 'sp', ch)

			-- However, numbers should be treated in their own syllable.
			elseif type_curr == 'number' then
				-- Compare lengths, not the tables themselves, so that
				-- consecutive digits stay together in one syllable.
				if #curr_syll.initial ~= 0 or #curr_syll.glide ~= 0 or #curr_syll.vowel ~= 0
					or #curr_syll.tone ~= 0 or #curr_syll.coda ~= 0 then
					curr_syll = store_and_reset(syllables, curr_syll)
				end
				add_char(curr_syll, 'sp', ch)

			-- Characters without an entry in char_type are kept with the
			-- current syllable as special symbols, as before.
			else
				add_char(curr_syll, 'sp', ch)
			end
		end

		-- The end of a run always closes the syllable being built.
		if #curr_syll.curr ~= 0 then
			curr_syll = store_and_reset(syllables, curr_syll)
		end
	end

	-- Return full syllable information unless debug mode is requested.
	if not debug then
		return syllables
	end

	-- For debug mode, return concatenated `curr` values.
	local debug_syllables = {}
	for _, syll in ipairs(syllables) do
		table.insert(debug_syllables, table.concat(syll.curr))
	end
	return table.concat(debug_syllables, '-')
end

-- Transliterate Lao text.
--
-- The text is split into syllables with export.split_syll, each syllable's
-- components are looked up in the conversion tables, and the results are
-- joined with single spaces.
--
-- @param text  string to transliterate
-- @param lang  language code; accepted but not used by this implementation
-- @param sc    script code; accepted but not used by this implementation
-- @return      the transliteration as a string
function export.tr(text, lang, sc)
	local join = table.concat
	local output = {}

	for _, syll in ipairs(export.split_syll(text, false)) do
		-- ຫ followed by the glide sign ຼ is a single initial (ຫຼ), not initial + glide.
		if join(syll.initial) == 'ຫ' and join(syll.glide) == 'ຼ' then
			syll.initial, syll.glide = {'ຫ', 'ຼ'}, {}
		end

		-- A syllable with an initial but no written vowel is read as if it had ະ.
		if join(syll.initial) ~= '' and join(syll.vowel) == '' then
			syll.vowel = {'ະ'}
		end

		-- An ambiguous letter that ended up last in a multi-letter initial
		-- really belongs to the vowel; ວ is checked first, then ຍ.
		for _, letter in ipairs({'ວ', 'ຍ'}) do
			local last = #syll.initial
			if last > 1 and syll.initial[last] == letter then
				table.remove(syll.initial)
				table.insert(syll.vowel, letter)
			end
		end

		-- Conversely, ຍ at the front of a multi-part vowel belongs to the
		-- initial when that initial starts with ຫ.
		if #syll.vowel > 1 and syll.vowel[1] == 'ຍ' and syll.initial[1] == 'ຫ' then
			table.remove(syll.vowel, 1)
			table.insert(syll.initial, 'ຍ')
		end

		-- The components no longer change from here on, so join them once.
		local initial_text = join(syll.initial)
		local vowel_text = join(syll.vowel)
		local coda_text = join(syll.coda)
		local sp_text = join(syll.sp)

		-- Look each component up; unknown combinations become empty strings.
		local initial = initial_conv[initial_text] or ''
		local glide = glide_conv[join(syll.glide)] or ''
		local vowel = vowel_conv[vowel_text] or ''
		local coda = coda_conv[coda_text] or ''

		-- Special symbols are converted character by character.
		local sp_parts = {}
		for ch in gmatch(join(syll.curr), ".") do
			sp_parts[#sp_parts + 1] = sp_symbols[ch] or ''
		end
		local sp = join(sp_parts)

		-- Context-dependent vowel readings; a later rule overrides an earlier one.
		-- ແ_ວ is uǣ with certain initial consonants (ກຂຄງຈສຊຖທລອຮ) plus a coda.
		if match(initial_text, '[ກຂຄງຈສຊຖທລອຮ]') and match(vowel_text, 'ແວ') and coda ~= '' then
			vowel = 'uǣ'
		end
		-- _ວຍ is ūai when the initial consonant is ຫ.
		if match(initial_text, 'ຫ') and match(vowel_text, 'ວຍ') then
			vowel = 'ūai'
		end
		-- _ວຽ is uīa when the coda is ນ.
		if match(coda_text, 'ນ') and match(vowel_text, 'ວຽ') then
			vowel = 'uīa'
		end

		local letters = initial .. glide .. vowel .. coda
		local romanized = letters .. sp

		-- A cancel symbol (໌) strikes out the last character of the syllable.
		if match(sp_text, '໌') then
			romanized = gsub(romanized, '.$', '<small><del>%0</del></small>')
		end

		if match(sp_text, '[ຯໆ]') and letters ~= '' then
			-- An iteration symbol on a non-empty syllable: drop the 〃
			-- placeholder and emit the syllable twice, the repeat small and
			-- underlined.
			romanized = gsub(romanized, '〃', '')
			table.insert(output, romanized)
			table.insert(output, '<small><u>' .. romanized .. '</u></small>')
		else
			-- Otherwise the syllable is emitted once.
			table.insert(output, romanized)
		end
	end

	-- Return the transliteration as a concatenated string.
	return join(output, ' ')
end

return export