Sware: 1 revision imported

2026-04-21T12:00:56Z

1 revision imported

← Older revision	Revision as of 12:00, 21 April 2026
(No difference)

wikt>TongcyDai: Add input normalization and stem processing functions; restructure module for shared use; full backward compatibility maintained

2026-04-17T06:53:55Z

Add input normalization and stem processing functions; restructure module for shared use; full backward compatibility maintained

New page

local export = {}

local m_str_utils = require("Module:string utilities")
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local uupper = m_str_utils.upper
local ufind = m_str_utils.find
local ulen = m_str_utils.len
local ucodepoint = m_str_utils.codepoint

-- Keep native Unicode normalization functions (no replacement available)
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

-- =============================================================================
-- Unicode constants
-- =============================================================================

-- Combining marks treated as accent marks by this module:
-- grave (U+0300), acute (U+0301) and tilde (U+0303).
local GRAVE, ACUTE, TILDE = u(0x0300), u(0x0301), u(0x0303)

-- Remaining combining diacritics handled by this module:
-- macron (U+0304), dot above (U+0307), caron (U+030C) and ogonek (U+0328).
local MACRON, DOTABOVE = u(0x0304), u(0x0307)
local CARON, OGONEK = u(0x030C), u(0x0328)

-- Pattern character class matching exactly one accent mark.
local ANY_ACCENT = table.concat({ "[", GRAVE, ACUTE, TILDE, "]" })

-- Lowercase names kept so that code written against the older naming
-- scheme continues to work; each one is simply another name for the
-- corresponding uppercase constant.
local grave, acute, tilde = GRAVE, ACUTE, TILDE
local macron, dotabove = MACRON, DOTABOVE
local caron, ogonek = CARON, OGONEK
local accents = ANY_ACCENT

-- =============================================================================
-- Internal helper functions
-- =============================================================================

-- Maps each dotless letter to its ordinary dotted counterpart.
local dotless_to_dotted = { ["ı"] = "i", ["ȷ"] = "j" }

-- Return `base` (converted to its dotted form when it is a dotless letter)
-- followed by `below`.
-- @param base   the base letter
-- @param below  text to append after the base (e.g. a combining mark, or "")
local function char_to_dotted_form(base, below)
	local dotted = dotless_to_dotted[base]
	if dotted == nil then
		-- Not a dotless letter: keep the base unchanged.
		dotted = base
	end
	return dotted .. below
end

-- Drop a combining dot above that follows i/ı/j/ȷ (optionally with an ogonek
-- in between) and turn dotless bases in those sequences into dotted ones.
-- The caller must pass text that is already in NFD form.
-- Returns whatever ugsub returns for the substitution.
local function normalize_dotted_chars(text)
	local dotted_sequence = "([iıjȷ])(" .. ogonek .. "?)" .. dotabove
	return ugsub(text, dotted_sequence, char_to_dotted_form)
end

-- Replacement callback for an i/j-like base that is followed by an accent.
-- @param base   the matched base letter
-- @param below  the matched ogonek, or "" when there is none
local function char_to_accent_form(base, below)
	if base ~= "i" and base ~= "j" then
		-- A dotless base next to an accent is not expected, but if one shows up
		-- it is converted to the dotted letter so that it normalizes properly.
		return char_to_dotted_form(base, below)
	end
	-- Ordinary i/j: place a combining dot above after the base (and ogonek).
	return base .. below .. dotabove
end

-- Decompose `text`, delete every run of accent marks, then normalize
-- dotless letters and dot-above sequences. The result is left decomposed.
local function stripped_text_form(text)
	local decomposed = toNFD(text)
	local unaccented = ugsub(decomposed, accents .. "+", "")
	return normalize_dotted_chars(unaccented)
end

-- =============================================================================
-- Input validation
-- =============================================================================

-- Guard against Private Use Area characters (U+E000–U+F8FF) in `s`.
-- Does nothing for a nil/false argument or for text without such characters;
-- otherwise raises an error naming the first offending code point.
function export.reject_pua(s)
	if s then
		local pua_first, pua_last = 0xE000, 0xF8FF
		local message = "lt-common: private use area character U+%04X detected in \"%s\". " ..
			"Please use a standard Unicode character instead."
		local length = ulen(s)
		for position = 1, length do
			local code = ucodepoint(s, position)
			if pua_first <= code and code <= pua_last then
				error(string.format(message, code, s))
			end
		end
	end
end

-- =============================================================================
-- Input normalization
-- =============================================================================

-- Detect nonstandard encoding patterns in the raw input.
-- @param s  raw input text (may be nil)
-- @return dotless_flag    true if s contains a dotless ı or ȷ
-- @return precomp_i_flag  true if s contains a precomposed í, ì or ĩ
-- Both flags are false when s is nil/false.
function export.detect_nonstandard(s)
	if not s then return false, false end

	-- ı and ȷ have no canonical decomposition, so they can be looked for
	-- directly in the input.
	local dotless_flag = ufind(s, "[ıȷ]") ~= nil

	-- The precomposed letters must be searched for in the input as given:
	-- after NFD they are always split into "i" plus a combining accent and
	-- could never be found. The class is built from code points (U+00ED í,
	-- U+00EC ì, U+0129 ĩ) so that it does not depend on how this source
	-- page itself happens to be normalized.
	local precomposed_i = "[" .. u(0x00ED) .. u(0x00EC) .. u(0x0129) .. "]"
	local precomp_i_flag = ufind(s, precomposed_i) ~= nil

	return dotless_flag, precomp_i_flag
end

-- Bring input into canonical NFC form.
-- A combining dot above directly after i/ı/j/ȷ (with an optional ogonek in
-- between) is removed, and every dotless ı/ȷ becomes a plain i/j.
-- A nil/false argument is returned unchanged.
function export.canonicalize_input(s)
	if s then
		-- Work on the decomposed form so the dot above is a separate character.
		local dotted_sequence = "([iıjȷ]" .. OGONEK .. "?)" .. DOTABOVE
		local without_dots = ugsub(toNFD(s), dotted_sequence, "%1")

		-- All dotless letters, including those that just lost their dot above,
		-- are mapped to the standard letters here.
		local standard_i = ugsub(without_dots, "ı", "i")
		local standard_ij = ugsub(standard_i, "ȷ", "j")

		s = toNFC(standard_ij)
	end
	return s
end

-- =============================================================================
-- Partial NFD conversion (stem_ac representation)
-- =============================================================================

-- Base letter + combining diacritic sequences that are recomposed into a
-- single precomposed letter, as { decomposed, precomposed } pairs.
-- Uppercase letters are listed alongside lowercase ones so that capitalized
-- input is left with no stray combining ogonek/macron/dot above/caron.
local stem_ac_recompositions = {
	-- ogonek vowels
	{ "a" .. OGONEK, "ą" }, { "A" .. OGONEK, "Ą" },
	{ "e" .. OGONEK, "ę" }, { "E" .. OGONEK, "Ę" },
	{ "i" .. OGONEK, "į" }, { "I" .. OGONEK, "Į" },
	{ "u" .. OGONEK, "ų" }, { "U" .. OGONEK, "Ų" },
	-- macron vowel
	{ "u" .. MACRON, "ū" }, { "U" .. MACRON, "Ū" },
	-- dot-above e
	{ "e" .. DOTABOVE, "ė" }, { "E" .. DOTABOVE, "Ė" },
	-- caron consonants
	{ "c" .. CARON, "č" }, { "C" .. CARON, "Č" },
	{ "s" .. CARON, "š" }, { "S" .. CARON, "Š" },
	{ "z" .. CARON, "ž" }, { "Z" .. CARON, "Ž" },
}

-- Convert canonical NFC to partial NFD (stem_ac).
-- Applies full NFD, then recomposes the non-accent diacritics listed in
-- stem_ac_recompositions, so that for those letters only grave/acute/tilde
-- remain as combining characters.
-- @param s  text to convert (a nil/false argument is returned unchanged)
-- @return the partially decomposed text
function export.to_stem_ac(s)
	if not s then return s end
	s = toNFD(s)

	for _, pair in ipairs(stem_ac_recompositions) do
		-- Assigning to a single variable discards gsub's substitution count.
		s = ugsub(s, pair[1], pair[2])
	end

	return s
end

-- =============================================================================
-- Accent manipulation
-- =============================================================================

-- Strip all accent marks (grave/acute/tilde) from partial NFD text.
-- @param stem_ac  text whose accents are combining characters
--                 (a nil/false argument is returned unchanged)
-- @return the text with every accent mark removed, as a single value
function export.to_stem_bare(stem_ac)
	if not stem_ac then return stem_ac end
	-- The parentheses keep only the resulting string. Without them gsub's
	-- substitution count would be returned as a second value and leak into
	-- argument lists or table constructors at the call site.
	return (ugsub(stem_ac, ANY_ACCENT, ""))
end

-- Check if partial NFD text contains any accent marks (grave/acute/tilde).
-- @param stem_ac  text whose accents are combining characters (may be nil)
-- @return true if at least one accent mark is present, false otherwise
function export.has_accent(stem_ac)
	-- Missing input has no accents; this mirrors the nil handling of the
	-- other exported functions instead of raising inside ufind.
	if not stem_ac then return false end
	return ufind(stem_ac, ANY_ACCENT) ~= nil
end

-- =============================================================================
-- Complete input pipeline
-- =============================================================================

-- Run raw user input through every normalization stage in order:
-- PUA rejection, nonstandard-encoding detection, canonicalization,
-- conversion to partial NFD, and accent stripping.
-- Returns: stem_bare, stem_ac, dotless_flag, precomp_flag
-- For a nil/false argument the result is raw, raw, false, false.
function export.process_input(raw)
	if raw then
		-- Raises an error when the input contains private use characters.
		export.reject_pua(raw)

		-- The flags describe the input as typed, so they are computed before
		-- canonicalization rewrites it.
		local has_dotless, has_precomposed = export.detect_nonstandard(raw)

		local accented = export.to_stem_ac(export.canonicalize_input(raw))
		local bare = export.to_stem_bare(accented)
		return bare, accented, has_dotless, has_precomposed
	end
	return raw, raw, false, false
end

-- =============================================================================
-- Display and text processing
-- =============================================================================

-- Produce the display form of `text`, returned in NFC.
-- Dotless letters and stray dots above are normalized (accents are kept),
-- and a combining dot above is inserted between i/j and a following accent.
-- `lang` and `sc` are accepted but not used here.
-- A nil/false argument is returned unchanged.
function export.makeDisplayText(text, lang, sc)
	if text then
		local decomposed = normalize_dotted_chars(toNFD(text))
		-- i/j-like base, optional ogonek, then a position immediately before
		-- an accent mark (the frontier pattern consumes nothing).
		local before_accent = "([iıjȷ])(" .. ogonek .. "?)%f" .. accents
		local dotted = ugsub(decomposed, before_accent, char_to_accent_form)
		text = toNFC(dotted)
	end
	return text
end

-- Handler invoked by [[Module:languages]]: [[Module:lt-common]] is registered as the
-- stripDiacritics handler in [[Module:languages/data/2]].
-- Returns `text` in NFC with accent marks removed and dotted letters normalized.
-- `lang` and `sc` are accepted but not used here. A nil/false argument is
-- returned unchanged.
function export.stripDiacritics(text, lang, sc)
	if text then
		text = toNFC(stripped_text_form(text))
	end
	return text
end

-- Per-character replacements applied while building a sort key.
-- Each combining diacritic becomes a Private Use Area character, and "y"
-- becomes "i" followed by a Private Use Area character.
local sortkey_substitutes = {
	["y"] = "i" .. u(0xF004),
	[dotabove] = u(0xF003),
	[macron] = u(0xF002),
	[caron] = u(0xF001),
	[ogonek] = u(0xF000),
}

-- Build the sort key for `text`.
-- The text is lowercased and reduced to its stripped-text form, diacritics
-- are swapped for Private Use Area characters so they sort after all other
-- characters, and the result is uppercased and returned in NFC.
-- `lang` and `sc` are accepted but not used here. A nil/false argument is
-- returned unchanged.
function export.makeSortKey(text, lang, sc)
	if text then
		local stripped = stripped_text_form(ulower(text))
		-- Byte-level pattern: one lead byte plus any UTF-8 continuation bytes,
		-- i.e. one whole character per match, looked up in the table.
		local substituted = stripped:gsub(".[\128-\191]*", sortkey_substitutes)
		text = toNFC(uupper(substituted))
	end
	return text
end

return export

Module:lt-common - Revision history

Sware: 1 revision imported

wikt>TongcyDai: Add input normalization and stem processing functions; restructure module for shared use; full backward compatibility maintained