<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Alt-common</id>
	<title>Module:lt-common - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Alt-common"/>
	<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:lt-common&amp;action=history"/>
	<updated>2026-06-25T18:39:23Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.6</generator>
	<entry>
		<id>https://linguifex.com/w/index.php?title=Module:lt-common&amp;diff=495343&amp;oldid=prev</id>
		<title>Sware: 1 revision imported</title>
		<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:lt-common&amp;diff=495343&amp;oldid=prev"/>
		<updated>2026-04-21T12:00:56Z</updated>

		<summary type="html">&lt;p&gt;1 revision imported&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;1&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;1&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 12:00, 21 April 2026&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-notice&quot; lang=&quot;en&quot;&gt;&lt;div class=&quot;mw-diff-empty&quot;&gt;(No difference)&lt;/div&gt;
&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;</summary>
		<author><name>Sware</name></author>
	</entry>
	<entry>
		<id>https://linguifex.com/w/index.php?title=Module:lt-common&amp;diff=495342&amp;oldid=prev</id>
		<title>wikt&gt;TongcyDai: Add input normalization and stem processing functions; restructure module for shared use; full backward compatibility maintained</title>
		<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:lt-common&amp;diff=495342&amp;oldid=prev"/>
		<updated>2026-04-17T06:53:55Z</updated>

		<summary type="html">&lt;p&gt;Add input normalization and stem processing functions; restructure module for shared use; full backward compatibility maintained&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;local export = {}&lt;br /&gt;
&lt;br /&gt;
local m_str_utils = require(&amp;quot;Module:string utilities&amp;quot;)&lt;br /&gt;
local u = m_str_utils.char&lt;br /&gt;
local ugsub = m_str_utils.gsub&lt;br /&gt;
local ulower = m_str_utils.lower&lt;br /&gt;
local uupper = m_str_utils.upper&lt;br /&gt;
local ufind = m_str_utils.find&lt;br /&gt;
local ulen = m_str_utils.len&lt;br /&gt;
local ucodepoint = m_str_utils.codepoint&lt;br /&gt;
&lt;br /&gt;
-- Keep native Unicode normalization functions (no replacement available)&lt;br /&gt;
local toNFC = mw.ustring.toNFC&lt;br /&gt;
local toNFD = mw.ustring.toNFD&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Unicode constants&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Combining diacritics, each built from its code point.&lt;br /&gt;
local GRAVE, ACUTE, TILDE = u(0x0300), u(0x0301), u(0x0303)  -- grave, acute, tilde&lt;br /&gt;
local MACRON, DOTABOVE = u(0x0304), u(0x0307)  -- macron, dot above&lt;br /&gt;
local CARON, OGONEK = u(0x030C), u(0x0328)  -- caron, ogonek&lt;br /&gt;
&lt;br /&gt;
-- Character class matching a single grave, acute or tilde.&lt;br /&gt;
local ANY_ACCENT = &amp;quot;[&amp;quot; .. GRAVE .. ACUTE .. TILDE .. &amp;quot;]&amp;quot;&lt;br /&gt;
&lt;br /&gt;
-- Legacy aliases for backward compatibility: lowercase names for the same values.&lt;br /&gt;
local grave, acute, tilde = GRAVE, ACUTE, TILDE&lt;br /&gt;
local macron, dotabove = MACRON, DOTABOVE&lt;br /&gt;
local caron, ogonek = CARON, OGONEK&lt;br /&gt;
local accents = ANY_ACCENT&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Internal helper functions&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Maps each dotless letter to its ordinary dotted counterpart.&lt;br /&gt;
local dotless_to_dotted = {&lt;br /&gt;
	[&amp;quot;ı&amp;quot;] = &amp;quot;i&amp;quot;,&lt;br /&gt;
	[&amp;quot;ȷ&amp;quot;] = &amp;quot;j&amp;quot;,&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
-- Returns `base` followed by `below`, with `base` swapped for its dotted&lt;br /&gt;
-- counterpart when it is one of the dotless letters in dotless_to_dotted.&lt;br /&gt;
local function char_to_dotted_form(base, below)&lt;br /&gt;
	local dotted = dotless_to_dotted[base]&lt;br /&gt;
	if dotted then&lt;br /&gt;
		return dotted .. below&lt;br /&gt;
	end&lt;br /&gt;
	return base .. below&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Deletes a combining dot above that follows i, ı, j or ȷ (an ogonek may sit in&lt;br /&gt;
-- between and is kept); a dotless letter in such a sequence is replaced by its&lt;br /&gt;
-- dotted form. On entry, text must be in NFD form.&lt;br /&gt;
-- Whatever ugsub returns is passed straight back to the caller.&lt;br /&gt;
local function normalize_dotted_chars(text)&lt;br /&gt;
	local letter_with_dot = &amp;quot;([iıjȷ])(&amp;quot; .. ogonek .. &amp;quot;?)&amp;quot; .. dotabove&lt;br /&gt;
	return ugsub(text, letter_with_dot, char_to_dotted_form)&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Replacement callback for a letter (plus optional ogonek) that precedes an accent.&lt;br /&gt;
-- A dotted i or j gets a combining dot above appended after `below`. Any other&lt;br /&gt;
-- base goes through char_to_dotted_form, which turns a dotless letter into the&lt;br /&gt;
-- dotted one so that it normalizes properly (not expected to occur, but handled&lt;br /&gt;
-- just in case).&lt;br /&gt;
local function char_to_accent_form(base, below)&lt;br /&gt;
	local is_dotted = base == &amp;quot;i&amp;quot; or base == &amp;quot;j&amp;quot;&lt;br /&gt;
	if not is_dotted then&lt;br /&gt;
		return char_to_dotted_form(base, below)&lt;br /&gt;
	end&lt;br /&gt;
	return base .. below .. dotabove&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Decomposes `text` to NFD, deletes every run of grave/acute/tilde marks, then&lt;br /&gt;
-- normalizes dotless characters and dot-above diacritics.&lt;br /&gt;
-- The result is returned in NFD form; callers recompose it themselves.&lt;br /&gt;
local function stripped_text_form(text)&lt;br /&gt;
	local decomposed = toNFD(text)&lt;br /&gt;
	local unaccented = ugsub(decomposed, accents .. &amp;quot;+&amp;quot;, &amp;quot;&amp;quot;)&lt;br /&gt;
	return normalize_dotted_chars(unaccented)&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Input validation&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Reject Private Use Area characters (U+E000–U+F8FF).&lt;br /&gt;
-- Does nothing for a nil or false argument. Otherwise every character position is&lt;br /&gt;
-- checked in order and an error is raised for the first code point in that range;&lt;br /&gt;
-- nothing is returned when none is found.&lt;br /&gt;
function export.reject_pua(s)&lt;br /&gt;
	if not s then return end&lt;br /&gt;
	local message = &amp;quot;lt-common: private use area character U+%04X detected in \&amp;quot;%s\&amp;quot;. &amp;quot; ..&lt;br /&gt;
		&amp;quot;Please use a standard Unicode character instead.&amp;quot;&lt;br /&gt;
	for position = 1, ulen(s) do&lt;br /&gt;
		local cp = ucodepoint(s, position)&lt;br /&gt;
		local in_pua = cp &amp;gt;= 0xE000 and cp &amp;lt;= 0xF8FF&lt;br /&gt;
		if in_pua then&lt;br /&gt;
			error(string.format(message, cp, s))&lt;br /&gt;
		end&lt;br /&gt;
	end&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Input normalization&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Detect nonstandard encoding patterns in the input.&lt;br /&gt;
-- Returns two booleans: dotless_flag (the input contains ı or ȷ) and&lt;br /&gt;
-- precomp_i_flag (the input contains precomposed í, ì or ĩ).&lt;br /&gt;
-- Both are false when `s` is nil or false.&lt;br /&gt;
function export.detect_nonstandard(s)&lt;br /&gt;
	if not s then return false, false end&lt;br /&gt;
	-- Dotless ı/ȷ are searched for in the NFD form of the input.&lt;br /&gt;
	local dotless_flag = ufind(toNFD(s), &amp;quot;[ıȷ]&amp;quot;) ~= nil&lt;br /&gt;
	-- Precomposed í/ì/ĩ have to be searched for in the input as given: NFD splits&lt;br /&gt;
	-- each of them into the letter i plus a combining accent, so they can never&lt;br /&gt;
	-- occur in the NFD form. The class is spelled with code points (U+00ED, U+00EC,&lt;br /&gt;
	-- U+0129) so that it does not depend on how this source text is normalized.&lt;br /&gt;
	local precomposed_i = &amp;quot;[&amp;quot; .. u(0x00ED) .. u(0x00EC) .. u(0x0129) .. &amp;quot;]&amp;quot;&lt;br /&gt;
	local precomp_i_flag = ufind(s, precomposed_i) ~= nil&lt;br /&gt;
	return dotless_flag, precomp_i_flag&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Bring input into clean canonical NFC form.&lt;br /&gt;
-- Works on the NFD form: a combining dot above directly after i/ı/j/ȷ (optionally&lt;br /&gt;
-- with an ogonek in between) is dropped, every dotless ı/ȷ becomes i/j, and the&lt;br /&gt;
-- result is recomposed to NFC. A nil or false argument is returned unchanged.&lt;br /&gt;
function export.canonicalize_input(s)&lt;br /&gt;
	if not s then return s end&lt;br /&gt;
&lt;br /&gt;
	-- Stray dot-above removal (with or without ogonek) is the job of the shared helper.&lt;br /&gt;
	local text = normalize_dotted_chars(toNFD(s))&lt;br /&gt;
&lt;br /&gt;
	-- Dotless letters that were not followed by a dot above are still present here.&lt;br /&gt;
	text = ugsub(text, &amp;quot;ı&amp;quot;, &amp;quot;i&amp;quot;)&lt;br /&gt;
	text = ugsub(text, &amp;quot;ȷ&amp;quot;, &amp;quot;j&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
	return toNFC(text)&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Partial NFD conversion (stem_ac representation)&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Decomposed sequences recomposed by to_stem_ac, in the order they are applied:&lt;br /&gt;
-- ogonek vowels, macron vowel, dot-above e, caron consonants.&lt;br /&gt;
local stem_ac_recompositions = {&lt;br /&gt;
	{ &amp;quot;a&amp;quot; .. OGONEK, &amp;quot;ą&amp;quot; },&lt;br /&gt;
	{ &amp;quot;e&amp;quot; .. OGONEK, &amp;quot;ę&amp;quot; },&lt;br /&gt;
	{ &amp;quot;i&amp;quot; .. OGONEK, &amp;quot;į&amp;quot; },&lt;br /&gt;
	{ &amp;quot;u&amp;quot; .. OGONEK, &amp;quot;ų&amp;quot; },&lt;br /&gt;
	{ &amp;quot;u&amp;quot; .. MACRON, &amp;quot;ū&amp;quot; },&lt;br /&gt;
	{ &amp;quot;e&amp;quot; .. DOTABOVE, &amp;quot;ė&amp;quot; },&lt;br /&gt;
	{ &amp;quot;c&amp;quot; .. CARON, &amp;quot;č&amp;quot; },&lt;br /&gt;
	{ &amp;quot;s&amp;quot; .. CARON, &amp;quot;š&amp;quot; },&lt;br /&gt;
	{ &amp;quot;z&amp;quot; .. CARON, &amp;quot;ž&amp;quot; },&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
-- Convert canonical NFC to partial NFD (stem_ac).&lt;br /&gt;
-- Applies full NFD, then recomposes the non-accent diacritics listed in&lt;br /&gt;
-- stem_ac_recompositions, so that grave/acute/tilde stay as combining characters.&lt;br /&gt;
-- Only the lowercase sequences in that table are recomposed; any other base&lt;br /&gt;
-- letter keeps its combining marks. A nil or false argument is returned unchanged.&lt;br /&gt;
function export.to_stem_ac(s)&lt;br /&gt;
	if not s then return s end&lt;br /&gt;
	local text = toNFD(s)&lt;br /&gt;
	for _, pair in ipairs(stem_ac_recompositions) do&lt;br /&gt;
		text = ugsub(text, pair[1], pair[2])&lt;br /&gt;
	end&lt;br /&gt;
	return text&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Accent manipulation&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Remove every grave, acute and tilde combining mark from partial NFD text.&lt;br /&gt;
-- A nil or false argument is returned unchanged; otherwise the results of ugsub&lt;br /&gt;
-- are passed straight back to the caller.&lt;br /&gt;
function export.to_stem_bare(stem_ac)&lt;br /&gt;
	if stem_ac then&lt;br /&gt;
		return ugsub(stem_ac, ANY_ACCENT, &amp;quot;&amp;quot;)&lt;br /&gt;
	end&lt;br /&gt;
	return stem_ac&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Check if partial NFD text contains any accent marks (grave, acute or tilde).&lt;br /&gt;
-- Returns true or false. A nil or false argument yields false rather than being&lt;br /&gt;
-- handed to ufind, matching the other exported functions, which all accept a&lt;br /&gt;
-- missing value.&lt;br /&gt;
function export.has_accent(stem_ac)&lt;br /&gt;
	if not stem_ac then return false end&lt;br /&gt;
	return ufind(stem_ac, ANY_ACCENT) ~= nil&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Complete input pipeline&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Run raw user input through the complete normalization pipeline.&lt;br /&gt;
-- Order of steps: reject_pua (may raise), detect_nonstandard on the raw text,&lt;br /&gt;
-- canonicalize_input, to_stem_ac, to_stem_bare.&lt;br /&gt;
-- Returns: stem_bare, stem_ac, dotless_flag, precomp_flag&lt;br /&gt;
-- A nil or false `raw` short-circuits to raw, raw, false, false.&lt;br /&gt;
function export.process_input(raw)&lt;br /&gt;
	if raw then&lt;br /&gt;
		export.reject_pua(raw)&lt;br /&gt;
		local has_dotless, has_precomposed = export.detect_nonstandard(raw)&lt;br /&gt;
		local canonical = export.canonicalize_input(raw)&lt;br /&gt;
		local accented = export.to_stem_ac(canonical)&lt;br /&gt;
		local bare = export.to_stem_bare(accented)&lt;br /&gt;
		return bare, accented, has_dotless, has_precomposed&lt;br /&gt;
	end&lt;br /&gt;
	return raw, raw, false, false&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
-- Display and text processing&lt;br /&gt;
-- =============================================================================&lt;br /&gt;
&lt;br /&gt;
-- Produce the display form of `text`; `lang` and `sc` are accepted but not used.&lt;br /&gt;
-- Steps: decompose to NFD, normalize dotless characters and dot-above diacritics&lt;br /&gt;
-- (accents are retained), insert a combining dot above between i or j (plus an&lt;br /&gt;
-- optional ogonek) and an accent that follows, then recompose to NFC.&lt;br /&gt;
-- A nil or false `text` is returned unchanged.&lt;br /&gt;
function export.makeDisplayText(text, lang, sc)&lt;br /&gt;
	if not text then return text end&lt;br /&gt;
	-- The frontier %f makes the match end right where an accent begins.&lt;br /&gt;
	local letter_before_accent = &amp;quot;([iıjȷ])(&amp;quot; .. ogonek .. &amp;quot;?)%f&amp;quot; .. accents&lt;br /&gt;
	local decomposed = normalize_dotted_chars(toNFD(text))&lt;br /&gt;
	local dotted = ugsub(decomposed, letter_before_accent, char_to_accent_form)&lt;br /&gt;
	return toNFC(dotted)&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Called from [[Module:languages]] since [[Module:lt-common]] is set as the stripDiacritics handler in&lt;br /&gt;
-- [[Module:languages/data/2]].&lt;br /&gt;
-- Returns the NFC form of stripped_text_form(text), i.e. the text without&lt;br /&gt;
-- grave/acute/tilde marks and with dot-above sequences normalized. `lang` and `sc`&lt;br /&gt;
-- are accepted but not used. A nil or false `text` is returned unchanged.&lt;br /&gt;
function export.stripDiacritics(text, lang, sc)&lt;br /&gt;
	if text then&lt;br /&gt;
		return toNFC(stripped_text_form(text))&lt;br /&gt;
	end&lt;br /&gt;
	return text&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
-- Per-character replacements used when building a sort key: each listed combining&lt;br /&gt;
-- mark becomes a Private Use Area code point, and y becomes i followed by one.&lt;br /&gt;
local sortkey_substitutes = {&lt;br /&gt;
	[ogonek] = u(0xF000),&lt;br /&gt;
	[caron] = u(0xF001),&lt;br /&gt;
	[macron] = u(0xF002),&lt;br /&gt;
	[dotabove] = u(0xF003),&lt;br /&gt;
	[&amp;quot;y&amp;quot;] = &amp;quot;i&amp;quot; .. u(0xF004),&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
-- Build the sort key for `text`; `lang` and `sc` are accepted but not used.&lt;br /&gt;
-- The text is lowercased and reduced to the stripped-text form, its remaining&lt;br /&gt;
-- diacritics are converted to Private Use Area characters (intended to make them&lt;br /&gt;
-- sort after all other characters), and the result is uppercased and recomposed&lt;br /&gt;
-- to NFC. A nil or false `text` is returned unchanged.&lt;br /&gt;
function export.makeSortKey(text, lang, sc)&lt;br /&gt;
	if not text then return text end&lt;br /&gt;
	local stripped = stripped_text_form(ulower(text))&lt;br /&gt;
	-- Byte-level gsub: one byte plus any bytes in \128-\191 that follow it is looked&lt;br /&gt;
	-- up in sortkey_substitutes; matches absent from the table are left as they are.&lt;br /&gt;
	local keyed = stripped:gsub(&amp;quot;.[\128-\191]*&amp;quot;, sortkey_substitutes)&lt;br /&gt;
	return toNFC(uupper(keyed))&lt;br /&gt;
end&lt;br /&gt;
&lt;br /&gt;
return export&lt;/div&gt;</summary>
		<author><name>wikt&gt;TongcyDai</name></author>
	</entry>
</feed>